/*
 * intel_pstate.c: Native P state management for Intel processors
 *
 * (C) Copyright 2012 Intel Corporation
 * Author: Dirk Brandewie <dirk.j.brandewie@intel.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; version 2
 * of the License.
 */

#include <linux/kernel.h>
#include <linux/kernel_stat.h>
#include <linux/module.h>
#include <linux/ktime.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/cpu.h>
#include <linux/cpufreq.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/debugfs.h>
#include <linux/acpi.h>
#include <trace/events/power.h>

#include <asm/div64.h>
#include <asm/msr.h>
#include <asm/cpu_device_id.h>
#include <asm/cpufeature.h>

#define BYT_RATIOS		0x66a
#define BYT_VIDS		0x66b
#define BYT_TURBO_RATIOS	0x66c
#define BYT_TURBO_VIDS		0x66d

#define FRAC_BITS 8
#define int_tofp(X) ((int64_t)(X) << FRAC_BITS)
#define fp_toint(X) ((X) >> FRAC_BITS)
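
/*
 * Fractional quantities below use a signed fixed-point format with FRAC_BITS
 * (8) fractional bits, so int_tofp(1) == 256.  For example,
 * div_fp(int_tofp(75), int_tofp(100)) yields 0.75 in fixed point (192), and
 * mul_fp() of two such values returns a result in the same format.
 */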

static inline int32_t mul_fp(int32_t x, int32_t y)
{
	return ((int64_t)x * (int64_t)y) >> FRAC_BITS;
}

static inline int32_t div_fp(s64 x, s64 y)
{
	return div64_s64((int64_t)x << FRAC_BITS, y);
}

static inline int ceiling_fp(int32_t x)
{
	int mask, ret;

	ret = fp_toint(x);
	mask = (1 << FRAC_BITS) - 1;
	if (x & mask)
		ret += 1;
	return ret;
}

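/*
 * One performance sample: @aperf and @mperf hold the APERF/MPERF counter
 * deltas accumulated since the previous sample, @core_pct_busy is their
 * ratio as a fixed-point percentage (actual vs. maximum non-turbo
 * performance), @freq is the resulting effective frequency in kHz and
 * @time is the timestamp of the sample.
 */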
struct sample {
	int32_t core_pct_busy;
	u64 aperf;
	u64 mperf;
	int freq;
	ktime_t time;
};

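/*
 * P-state bookkeeping for one CPU: the current, minimum, maximum (non-turbo)
 * and turbo P-state ratios, plus @scaling, the factor that converts a ratio
 * into a frequency in kHz.
 */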
struct pstate_data {
	int	current_pstate;
	int	min_pstate;
	int	max_pstate;
	int	scaling;
	int	turbo_pstate;
};

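/*
 * VID (voltage ID) values used by byt_set_pstate(), which programs the
 * voltage together with the P-state on the Atom parts: @min, @max and @turbo
 * are the VIDs matching the corresponding P-states, and @ratio is the
 * fixed-point slope used to interpolate a VID for intermediate P-states.
 */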
struct vid_data {
	int min;
	int max;
	int turbo;
	int32_t ratio;
};

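/*
 * State of the PID controller used to pick the next P-state: @setpoint and
 * @deadband are in percent, while the gains, @integral and @last_err are
 * kept in fixed point.
 */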
struct _pid {
	int setpoint;
	int32_t integral;
	int32_t p_gain;
	int32_t i_gain;
	int32_t d_gain;
	int deadband;
	int32_t last_err;
};

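/*
 * Per-CPU driver state: the sampling timer, P-state and VID data, the PID
 * controller, the previous APERF/MPERF readings and the most recent sample.
 */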
struct cpudata {
	int cpu;

	struct timer_list timer;

	struct pstate_data pstate;
	struct vid_data vid;
	struct _pid pid;

	ktime_t last_sample_time;
	u64	prev_aperf;
	u64	prev_mperf;
	struct sample sample;
};

static struct cpudata **all_cpu_data;

struct pstate_adjust_policy {
	int sample_rate_ms;
	int deadband;
	int setpoint;
	int p_gain_pct;
	int d_gain_pct;
	int i_gain_pct;
};

struct pstate_funcs {
	int (*get_max)(void);
	int (*get_min)(void);
	int (*get_turbo)(void);
	int (*get_scaling)(void);
	void (*set)(struct cpudata*, int pstate);
	void (*get_vid)(struct cpudata *);
};

struct cpu_defaults {
	struct pstate_adjust_policy pid_policy;
	struct pstate_funcs funcs;
};

static struct pstate_adjust_policy pid_params;
static struct pstate_funcs pstate_funcs;
static int hwp_active;

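/*
 * Global performance limits.  Percentages coming from the cpufreq policy
 * (*_policy_pct) and from sysfs (*_sysfs_pct) are combined into the
 * effective min_perf_pct/max_perf_pct, which are also cached as fixed-point
 * fractions in min_perf/max_perf.  no_turbo is the user's request to avoid
 * turbo P-states; turbo_disabled is set when turbo is unavailable (disabled
 * by the BIOS or not supported by the part).
 */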
struct perf_limits {
	int no_turbo;
	int turbo_disabled;
	int max_perf_pct;
	int min_perf_pct;
	int32_t max_perf;
	int32_t min_perf;
	int max_policy_pct;
	int max_sysfs_pct;
	int min_policy_pct;
	int min_sysfs_pct;
};

static struct perf_limits limits = {
	.no_turbo = 0,
	.turbo_disabled = 0,
	.max_perf_pct = 100,
	.max_perf = int_tofp(1),
	.min_perf_pct = 0,
	.min_perf = 0,
	.max_policy_pct = 100,
	.max_sysfs_pct = 100,
	.min_policy_pct = 0,
	.min_sysfs_pct = 0,
};

static inline void pid_reset(struct _pid *pid, int setpoint, int busy,
			     int deadband, int integral)
{
	pid->setpoint = setpoint;
	pid->deadband  = deadband;
	pid->integral  = int_tofp(integral);
	pid->last_err  = int_tofp(setpoint) - int_tofp(busy);
}

static inline void pid_p_gain_set(struct _pid *pid, int percent)
{
	pid->p_gain = div_fp(int_tofp(percent), int_tofp(100));
}

static inline void pid_i_gain_set(struct _pid *pid, int percent)
{
	pid->i_gain = div_fp(int_tofp(percent), int_tofp(100));
}

static inline void pid_d_gain_set(struct _pid *pid, int percent)
{
	pid->d_gain = div_fp(int_tofp(percent), int_tofp(100));
}

static signed int pid_calc(struct _pid *pid, int32_t busy)
{
	signed int result;
	int32_t pterm, dterm, fp_error;
	int32_t integral_limit;

	fp_error = int_tofp(pid->setpoint) - busy;

	if (abs(fp_error) <= int_tofp(pid->deadband))
		return 0;

	pterm = mul_fp(pid->p_gain, fp_error);

	pid->integral += fp_error;

	/*
	 * We limit the integral here so that it will never
	 * get higher than 30.  This prevents it from becoming
	 * too large an input over long periods of time and allows
	 * it to get factored out sooner.
	 *
	 * The value of 30 was chosen through experimentation.
	 */
	integral_limit = int_tofp(30);
	if (pid->integral > integral_limit)
		pid->integral = integral_limit;
	if (pid->integral < -integral_limit)
		pid->integral = -integral_limit;

	dterm = mul_fp(pid->d_gain, fp_error - pid->last_err);
	pid->last_err = fp_error;

	result = pterm + mul_fp(pid->integral, pid->i_gain) + dterm;
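	/* Add 0.5 in fixed point so the truncation in fp_toint() rounds to nearest. */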
	result = result + (1 << (FRAC_BITS-1));
	return (signed int)fp_toint(result);
}

static inline void intel_pstate_busy_pid_reset(struct cpudata *cpu)
{
	pid_p_gain_set(&cpu->pid, pid_params.p_gain_pct);
	pid_d_gain_set(&cpu->pid, pid_params.d_gain_pct);
	pid_i_gain_set(&cpu->pid, pid_params.i_gain_pct);

	pid_reset(&cpu->pid, pid_params.setpoint, 100, pid_params.deadband, 0);
}

static inline void intel_pstate_reset_all_pid(void)
{
	unsigned int cpu;

	for_each_online_cpu(cpu) {
		if (all_cpu_data[cpu])
			intel_pstate_busy_pid_reset(all_cpu_data[cpu]);
	}
}

static inline void update_turbo_state(void)
{
	u64 misc_en;
	struct cpudata *cpu;

	cpu = all_cpu_data[0];
	rdmsrl(MSR_IA32_MISC_ENABLE, misc_en);
	limits.turbo_disabled =
		(misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE ||
		 cpu->pstate.max_pstate == cpu->pstate.turbo_pstate);
}

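/*
 * HWP expresses performance as abstract 0-255 values in MSR_HWP_REQUEST, so
 * the user-visible percentage limits are mapped onto that scale below.
 */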
#define PCT_TO_HWP(x) ((x) * 255 / 100)
static void intel_pstate_hwp_set(void)
{
	int min, max, cpu;
	u64 value, freq;

	get_online_cpus();

	for_each_online_cpu(cpu) {
		rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value);
		min = PCT_TO_HWP(limits.min_perf_pct);
		value &= ~HWP_MIN_PERF(~0L);
		value |= HWP_MIN_PERF(min);

		max = PCT_TO_HWP(limits.max_perf_pct);
		if (limits.no_turbo) {
			rdmsrl_on_cpu(cpu, MSR_HWP_CAPABILITIES, &freq);
			max = HWP_GUARANTEED_PERF(freq);
		}

		value &= ~HWP_MAX_PERF(~0L);
		value |= HWP_MAX_PERF(max);
		wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value);
	}

	put_online_cpus();
}

/************************** debugfs begin ************************/
static int pid_param_set(void *data, u64 val)
{
	*(u32 *)data = val;
	intel_pstate_reset_all_pid();
	return 0;
}

static int pid_param_get(void *data, u64 *val)
{
	*val = *(u32 *)data;
	return 0;
}
DEFINE_SIMPLE_ATTRIBUTE(fops_pid_param, pid_param_get, pid_param_set, "%llu\n");

struct pid_param {
	char *name;
	void *value;
};

static struct pid_param pid_files[] = {
	{"sample_rate_ms", &pid_params.sample_rate_ms},
	{"d_gain_pct", &pid_params.d_gain_pct},
	{"i_gain_pct", &pid_params.i_gain_pct},
	{"deadband", &pid_params.deadband},
	{"setpoint", &pid_params.setpoint},
	{"p_gain_pct", &pid_params.p_gain_pct},
	{NULL, NULL}
};

static void __init intel_pstate_debug_expose_params(void)
{
	struct dentry *debugfs_parent;
	int i = 0;

	if (hwp_active)
		return;
	debugfs_parent = debugfs_create_dir("pstate_snb", NULL);
	if (IS_ERR_OR_NULL(debugfs_parent))
		return;
	while (pid_files[i].name) {
		debugfs_create_file(pid_files[i].name, 0660,
				    debugfs_parent, pid_files[i].value,
				    &fops_pid_param);
		i++;
	}
}

/************************** debugfs end ************************/

/************************** sysfs begin ************************/
#define show_one(file_name, object)					\
	static ssize_t show_##file_name					\
	(struct kobject *kobj, struct attribute *attr, char *buf)	\
	{								\
		return sprintf(buf, "%u\n", limits.object);		\
	}

static ssize_t show_turbo_pct(struct kobject *kobj,
				struct attribute *attr, char *buf)
{
	struct cpudata *cpu;
	int total, no_turbo, turbo_pct;
	uint32_t turbo_fp;

	cpu = all_cpu_data[0];

	total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1;
	no_turbo = cpu->pstate.max_pstate - cpu->pstate.min_pstate + 1;
	turbo_fp = div_fp(int_tofp(no_turbo), int_tofp(total));
	turbo_pct = 100 - fp_toint(mul_fp(turbo_fp, int_tofp(100)));
	return sprintf(buf, "%u\n", turbo_pct);
}

static ssize_t show_num_pstates(struct kobject *kobj,
				struct attribute *attr, char *buf)
{
	struct cpudata *cpu;
	int total;

	cpu = all_cpu_data[0];
	total = cpu->pstate.turbo_pstate - cpu->pstate.min_pstate + 1;
	return sprintf(buf, "%u\n", total);
}

static ssize_t show_no_turbo(struct kobject *kobj,
			     struct attribute *attr, char *buf)
{
	ssize_t ret;

	update_turbo_state();
	if (limits.turbo_disabled)
		ret = sprintf(buf, "%u\n", limits.turbo_disabled);
	else
		ret = sprintf(buf, "%u\n", limits.no_turbo);

	return ret;
}

static ssize_t store_no_turbo(struct kobject *a, struct attribute *b,
			      const char *buf, size_t count)
{
	unsigned int input;
	int ret;

	ret = sscanf(buf, "%u", &input);
	if (ret != 1)
		return -EINVAL;

	update_turbo_state();
	if (limits.turbo_disabled) {
		pr_warn("Turbo disabled by BIOS or unavailable on processor\n");
		return -EPERM;
	}

	limits.no_turbo = clamp_t(int, input, 0, 1);

	if (hwp_active)
		intel_pstate_hwp_set();

	return count;
}

static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b,
				  const char *buf, size_t count)
{
	unsigned int input;
	int ret;

	ret = sscanf(buf, "%u", &input);
	if (ret != 1)
		return -EINVAL;

	limits.max_sysfs_pct = clamp_t(int, input, 0, 100);
	limits.max_perf_pct = min(limits.max_policy_pct, limits.max_sysfs_pct);
	limits.max_perf = div_fp(int_tofp(limits.max_perf_pct), int_tofp(100));

	if (hwp_active)
		intel_pstate_hwp_set();
	return count;
}

static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b,
				  const char *buf, size_t count)
{
	unsigned int input;
	int ret;

	ret = sscanf(buf, "%u", &input);
	if (ret != 1)
		return -EINVAL;

	limits.min_sysfs_pct = clamp_t(int, input, 0, 100);
	limits.min_perf_pct = max(limits.min_policy_pct, limits.min_sysfs_pct);
	limits.min_perf = div_fp(int_tofp(limits.min_perf_pct), int_tofp(100));

	if (hwp_active)
		intel_pstate_hwp_set();
	return count;
}

show_one(max_perf_pct, max_perf_pct);
show_one(min_perf_pct, min_perf_pct);

define_one_global_rw(no_turbo);
define_one_global_rw(max_perf_pct);
define_one_global_rw(min_perf_pct);
define_one_global_ro(turbo_pct);
define_one_global_ro(num_pstates);

static struct attribute *intel_pstate_attributes[] = {
	&no_turbo.attr,
	&max_perf_pct.attr,
	&min_perf_pct.attr,
	&turbo_pct.attr,
	&num_pstates.attr,
	NULL
};

static struct attribute_group intel_pstate_attr_group = {
	.attrs = intel_pstate_attributes,
};

static void __init intel_pstate_sysfs_expose_params(void)
{
	struct kobject *intel_pstate_kobject;
	int rc;

	intel_pstate_kobject = kobject_create_and_add("intel_pstate",
						&cpu_subsys.dev_root->kobj);
	BUG_ON(!intel_pstate_kobject);
	rc = sysfs_create_group(intel_pstate_kobject, &intel_pstate_attr_group);
	BUG_ON(rc);
}
/************************** sysfs end ************************/

static void intel_pstate_hwp_enable(void)
{
	hwp_active++;
	pr_info("intel_pstate HWP enabled\n");

	wrmsrl(MSR_PM_ENABLE, 0x1);
}

static int byt_get_min_pstate(void)
{
	u64 value;

	rdmsrl(BYT_RATIOS, value);
	return (value >> 8) & 0x7F;
}

static int byt_get_max_pstate(void)
{
	u64 value;

	rdmsrl(BYT_RATIOS, value);
	return (value >> 16) & 0x7F;
}

static int byt_get_turbo_pstate(void)
{
	u64 value;

	rdmsrl(BYT_TURBO_RATIOS, value);
	return value & 0x7F;
}

static void byt_set_pstate(struct cpudata *cpudata, int pstate)
{
	u64 val;
	int32_t vid_fp;
	u32 vid;

	val = pstate << 8;
	if (limits.no_turbo && !limits.turbo_disabled)
		val |= (u64)1 << 32;

	vid_fp = cpudata->vid.min + mul_fp(
		int_tofp(pstate - cpudata->pstate.min_pstate),
		cpudata->vid.ratio);

	vid_fp = clamp_t(int32_t, vid_fp, cpudata->vid.min, cpudata->vid.max);
	vid = ceiling_fp(vid_fp);

	if (pstate > cpudata->pstate.max_pstate)
		vid = cpudata->vid.turbo;

	val |= vid;

	wrmsrl_on_cpu(cpudata->cpu, MSR_IA32_PERF_CTL, val);
}

#define BYT_BCLK_FREQS 5
static int byt_freq_table[BYT_BCLK_FREQS] = {833, 1000, 1333, 1167, 800};

static int byt_get_scaling(void)
{
	u64 value;
	int i;

	rdmsrl(MSR_FSB_FREQ, value);
	i = value & 0x3;

	BUG_ON(i >= BYT_BCLK_FREQS);

	return byt_freq_table[i] * 100;
}

static void byt_get_vid(struct cpudata *cpudata)
{
	u64 value;

	rdmsrl(BYT_VIDS, value);
	cpudata->vid.min = int_tofp((value >> 8) & 0x7f);
	cpudata->vid.max = int_tofp((value >> 16) & 0x7f);
	cpudata->vid.ratio = div_fp(
		cpudata->vid.max - cpudata->vid.min,
		int_tofp(cpudata->pstate.max_pstate -
			cpudata->pstate.min_pstate));

	rdmsrl(BYT_TURBO_VIDS, value);
	cpudata->vid.turbo = value & 0x7f;
}

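/*
 * On core parts, MSR_PLATFORM_INFO bits 15:8 hold the maximum non-turbo
 * ratio and bits 47:40 hold the maximum efficiency (minimum) ratio.
 */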
static int core_get_min_pstate(void)
{
	u64 value;

	rdmsrl(MSR_PLATFORM_INFO, value);
	return (value >> 40) & 0xFF;
}

static int core_get_max_pstate(void)
{
	u64 value;

	rdmsrl(MSR_PLATFORM_INFO, value);
	return (value >> 8) & 0xFF;
}

static int core_get_turbo_pstate(void)
{
	u64 value;
	int nont, ret;

	rdmsrl(MSR_NHM_TURBO_RATIO_LIMIT, value);
	nont = core_get_max_pstate();
	ret = value & 255;
	if (ret <= nont)
		ret = nont;
	return ret;
}

static inline int core_get_scaling(void)
{
	return 100000;
}

static void core_set_pstate(struct cpudata *cpudata, int pstate)
{
	u64 val;

	val = pstate << 8;
	if (limits.no_turbo && !limits.turbo_disabled)
		val |= (u64)1 << 32;

	wrmsrl_on_cpu(cpudata->cpu, MSR_IA32_PERF_CTL, val);
}

static int knl_get_turbo_pstate(void)
{
	u64 value;
	int nont, ret;

	rdmsrl(MSR_NHM_TURBO_RATIO_LIMIT, value);
	nont = core_get_max_pstate();
	ret = (value >> 8) & 0xFF;
	if (ret <= nont)
		ret = nont;
	return ret;
}

static struct cpu_defaults core_params = {
	.pid_policy = {
		.sample_rate_ms = 10,
		.deadband = 0,
		.setpoint = 97,
		.p_gain_pct = 20,
		.d_gain_pct = 0,
		.i_gain_pct = 0,
	},
	.funcs = {
		.get_max = core_get_max_pstate,
		.get_min = core_get_min_pstate,
		.get_turbo = core_get_turbo_pstate,
		.get_scaling = core_get_scaling,
		.set = core_set_pstate,
	},
};

static struct cpu_defaults byt_params = {
	.pid_policy = {
		.sample_rate_ms = 10,
		.deadband = 0,
		.setpoint = 60,
		.p_gain_pct = 14,
		.d_gain_pct = 0,
		.i_gain_pct = 4,
	},
	.funcs = {
		.get_max = byt_get_max_pstate,
		.get_min = byt_get_min_pstate,
		.get_turbo = byt_get_turbo_pstate,
		.set = byt_set_pstate,
		.get_scaling = byt_get_scaling,
		.get_vid = byt_get_vid,
	},
};

static struct cpu_defaults knl_params = {
	.pid_policy = {
		.sample_rate_ms = 10,
		.deadband = 0,
		.setpoint = 97,
		.p_gain_pct = 20,
		.d_gain_pct = 0,
		.i_gain_pct = 0,
	},
	.funcs = {
		.get_max = core_get_max_pstate,
		.get_min = core_get_min_pstate,
		.get_turbo = knl_get_turbo_pstate,
		.get_scaling = core_get_scaling,
		.set = core_set_pstate,
	},
};

static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max)
{
	int max_perf = cpu->pstate.turbo_pstate;
	int max_perf_adj;
	int min_perf;

	if (limits.no_turbo || limits.turbo_disabled)
		max_perf = cpu->pstate.max_pstate;

	/*
	 * performance can be limited by user through sysfs, by cpufreq
	 * policy, or by cpu specific default values determined through
	 * experimentation.
	 */
	max_perf_adj = fp_toint(mul_fp(int_tofp(max_perf), limits.max_perf));
	*max = clamp_t(int, max_perf_adj,
			cpu->pstate.min_pstate, cpu->pstate.turbo_pstate);

	min_perf = fp_toint(mul_fp(int_tofp(max_perf), limits.min_perf));
	*min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf);
}

static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate)
{
	int max_perf, min_perf;

	update_turbo_state();

	intel_pstate_get_min_max(cpu, &min_perf, &max_perf);

	pstate = clamp_t(int, pstate, min_perf, max_perf);

	if (pstate == cpu->pstate.current_pstate)
		return;

	trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu);

	cpu->pstate.current_pstate = pstate;

	pstate_funcs.set(cpu, pstate);
}

static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
{
	cpu->pstate.min_pstate = pstate_funcs.get_min();
	cpu->pstate.max_pstate = pstate_funcs.get_max();
	cpu->pstate.turbo_pstate = pstate_funcs.get_turbo();
	cpu->pstate.scaling = pstate_funcs.get_scaling();

	if (pstate_funcs.get_vid)
		pstate_funcs.get_vid(cpu);
	intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate);
}

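/*
 * The APERF/MPERF delta ratio gives the average performance over the sample
 * interval relative to the maximum non-turbo performance, so core_pct_busy
 * below is that ratio as a fixed-point percentage and freq is the
 * corresponding effective frequency in kHz.
 */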
static inline void intel_pstate_calc_busy(struct cpudata *cpu)
{
	struct sample *sample = &cpu->sample;
	int64_t core_pct;

	core_pct = int_tofp(sample->aperf) * int_tofp(100);
	core_pct = div64_u64(core_pct, int_tofp(sample->mperf));

	sample->freq = fp_toint(
		mul_fp(int_tofp(
			cpu->pstate.max_pstate * cpu->pstate.scaling / 100),
			core_pct));

	sample->core_pct_busy = (int32_t)core_pct;
}

static inline void intel_pstate_sample(struct cpudata *cpu)
{
	u64 aperf, mperf;
	unsigned long flags;

	local_irq_save(flags);
	rdmsrl(MSR_IA32_APERF, aperf);
	rdmsrl(MSR_IA32_MPERF, mperf);
	if (cpu->prev_mperf == mperf) {
		local_irq_restore(flags);
		return;
	}

	local_irq_restore(flags);

	cpu->last_sample_time = cpu->sample.time;
	cpu->sample.time = ktime_get();
	cpu->sample.aperf = aperf;
	cpu->sample.mperf = mperf;
	cpu->sample.aperf -= cpu->prev_aperf;
	cpu->sample.mperf -= cpu->prev_mperf;

	intel_pstate_calc_busy(cpu);

	cpu->prev_aperf = aperf;
	cpu->prev_mperf = mperf;
}

static inline void intel_hwp_set_sample_time(struct cpudata *cpu)
{
	int delay;

	delay = msecs_to_jiffies(50);
	mod_timer_pinned(&cpu->timer, jiffies + delay);
}

static inline void intel_pstate_set_sample_time(struct cpudata *cpu)
{
	int delay;

	delay = msecs_to_jiffies(pid_params.sample_rate_ms);
	mod_timer_pinned(&cpu->timer, jiffies + delay);
}

static inline int32_t intel_pstate_get_scaled_busy(struct cpudata *cpu)
{
	int32_t core_busy, max_pstate, current_pstate, sample_ratio;
	s64 duration_us;
	u32 sample_time;

	/*
	 * core_busy is the ratio of actual performance to max.
	 * max_pstate is the max non turbo pstate available.
	 * current_pstate was the pstate that was requested during
	 * the last sample period.
	 *
	 * We normalize core_busy, which was our actual percent
	 * performance, to what we requested during the last sample
	 * period. The result will be a percentage of busy at a
	 * specified pstate.
	 */
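	/*
	 * For example, if the core was 50% busy (APERF/MPERF == 0.5) while
	 * running at half of max_pstate, the scaled busy value is 100%,
	 * i.e. the core was fully loaded at the P state we asked for.
	 */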
	core_busy = cpu->sample.core_pct_busy;
	max_pstate = int_tofp(cpu->pstate.max_pstate);
	current_pstate = int_tofp(cpu->pstate.current_pstate);
	core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate));

	/*
	 * Since we have a deferred timer, it will not fire unless
	 * we are in C0.  So, determine if the actual elapsed time
	 * is significantly greater (3x) than our sample interval.  If it
	 * is, then we were idle for a long enough period of time
	 * to adjust our busyness.
	 */
	sample_time = pid_params.sample_rate_ms * USEC_PER_MSEC;
	duration_us = ktime_us_delta(cpu->sample.time,
				     cpu->last_sample_time);
	if (duration_us > sample_time * 3) {
		sample_ratio = div_fp(int_tofp(sample_time),
				      int_tofp(duration_us));
		core_busy = mul_fp(core_busy, sample_ratio);
	}

	return core_busy;
}

static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
{
	int32_t busy_scaled;
	struct _pid *pid;
	signed int ctl;

	pid = &cpu->pid;
	busy_scaled = intel_pstate_get_scaled_busy(cpu);

	ctl = pid_calc(pid, busy_scaled);

	/* Negative values of ctl increase the pstate and vice versa */
	intel_pstate_set_pstate(cpu, cpu->pstate.current_pstate - ctl);
}

static void intel_hwp_timer_func(unsigned long __data)
{
	struct cpudata *cpu = (struct cpudata *) __data;

	intel_pstate_sample(cpu);
	intel_hwp_set_sample_time(cpu);
}

static void intel_pstate_timer_func(unsigned long __data)
{
	struct cpudata *cpu = (struct cpudata *) __data;
	struct sample *sample;

	intel_pstate_sample(cpu);

	sample = &cpu->sample;

	intel_pstate_adjust_busy_pstate(cpu);

	trace_pstate_sample(fp_toint(sample->core_pct_busy),
			fp_toint(intel_pstate_get_scaled_busy(cpu)),
			cpu->pstate.current_pstate,
			sample->mperf,
			sample->aperf,
			sample->freq);

	intel_pstate_set_sample_time(cpu);
}

#define ICPU(model, policy) \
	{ X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF,\
			(unsigned long)&policy }

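/*
 * CPUID model numbers of the supported processors, each paired with the
 * tuning parameters to use: core_params for the Core and Xeon models,
 * byt_params for the Atom-class models and knl_params for Knights Landing.
 */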
static const struct x86_cpu_id intel_pstate_cpu_ids[] = {
	ICPU(0x2a, core_params),
	ICPU(0x2d, core_params),
	ICPU(0x37, byt_params),
	ICPU(0x3a, core_params),
	ICPU(0x3c, core_params),
	ICPU(0x3d, core_params),
	ICPU(0x3e, core_params),
	ICPU(0x3f, core_params),
	ICPU(0x45, core_params),
	ICPU(0x46, core_params),
	ICPU(0x47, core_params),
	ICPU(0x4c, byt_params),
	ICPU(0x4e, core_params),
	ICPU(0x4f, core_params),
	ICPU(0x56, core_params),
	ICPU(0x57, knl_params),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids);

static const struct x86_cpu_id intel_pstate_cpu_oob_ids[] = {
	ICPU(0x56, core_params),
	{}
};

static int intel_pstate_init_cpu(unsigned int cpunum)
{
	struct cpudata *cpu;

	if (!all_cpu_data[cpunum])
		all_cpu_data[cpunum] = kzalloc(sizeof(struct cpudata),
					       GFP_KERNEL);
	if (!all_cpu_data[cpunum])
		return -ENOMEM;

	cpu = all_cpu_data[cpunum];

	cpu->cpu = cpunum;
	intel_pstate_get_cpu_pstates(cpu);

	init_timer_deferrable(&cpu->timer);
	cpu->timer.data = (unsigned long)cpu;
	cpu->timer.expires = jiffies + HZ/100;

	if (!hwp_active)
		cpu->timer.function = intel_pstate_timer_func;
	else
		cpu->timer.function = intel_hwp_timer_func;

	intel_pstate_busy_pid_reset(cpu);
	intel_pstate_sample(cpu);

	add_timer_on(&cpu->timer, cpunum);

	pr_debug("Intel pstate controlling: cpu %d\n", cpunum);

	return 0;
}

static unsigned int intel_pstate_get(unsigned int cpu_num)
{
	struct sample *sample;
	struct cpudata *cpu;

	cpu = all_cpu_data[cpu_num];
	if (!cpu)
		return 0;
	sample = &cpu->sample;
	return sample->freq;
}

static int intel_pstate_set_policy(struct cpufreq_policy *policy)
{
	if (!policy->cpuinfo.max_freq)
		return -ENODEV;

	if (policy->policy == CPUFREQ_POLICY_PERFORMANCE &&
	    policy->max >= policy->cpuinfo.max_freq) {
		limits.min_policy_pct = 100;
		limits.min_perf_pct = 100;
		limits.min_perf = int_tofp(1);
		limits.max_policy_pct = 100;
		limits.max_perf_pct = 100;
		limits.max_perf = int_tofp(1);
		limits.no_turbo = 0;
		return 0;
	}

	limits.min_policy_pct = (policy->min * 100) / policy->cpuinfo.max_freq;
	limits.min_policy_pct = clamp_t(int, limits.min_policy_pct, 0, 100);
	limits.min_perf_pct = max(limits.min_policy_pct, limits.min_sysfs_pct);
	limits.min_perf = div_fp(int_tofp(limits.min_perf_pct), int_tofp(100));

	limits.max_policy_pct = (policy->max * 100) / policy->cpuinfo.max_freq;
	limits.max_policy_pct = clamp_t(int, limits.max_policy_pct, 0, 100);
	limits.max_perf_pct = min(limits.max_policy_pct, limits.max_sysfs_pct);
	limits.max_perf = div_fp(int_tofp(limits.max_perf_pct), int_tofp(100));

	if (hwp_active)
		intel_pstate_hwp_set();

	return 0;
}

static int intel_pstate_verify_policy(struct cpufreq_policy *policy)
{
	cpufreq_verify_within_cpu_limits(policy);

	if (policy->policy != CPUFREQ_POLICY_POWERSAVE &&
	    policy->policy != CPUFREQ_POLICY_PERFORMANCE)
		return -EINVAL;

	return 0;
}

static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
{
	int cpu_num = policy->cpu;
	struct cpudata *cpu = all_cpu_data[cpu_num];

	pr_info("intel_pstate CPU %d exiting\n", cpu_num);

	del_timer_sync(&all_cpu_data[cpu_num]->timer);
	if (hwp_active)
		return;

	intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate);
}

static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
{
	struct cpudata *cpu;
	int rc;

	rc = intel_pstate_init_cpu(policy->cpu);
	if (rc)
		return rc;

	cpu = all_cpu_data[policy->cpu];

	if (limits.min_perf_pct == 100 && limits.max_perf_pct == 100)
		policy->policy = CPUFREQ_POLICY_PERFORMANCE;
	else
		policy->policy = CPUFREQ_POLICY_POWERSAVE;

	policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling;
	policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling;

	/* cpuinfo and default policy values */
	policy->cpuinfo.min_freq = cpu->pstate.min_pstate * cpu->pstate.scaling;
	update_turbo_state();
	policy->cpuinfo.max_freq = limits.turbo_disabled ?
			cpu->pstate.max_pstate : cpu->pstate.turbo_pstate;
	policy->cpuinfo.max_freq *= cpu->pstate.scaling;

	policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
	cpumask_set_cpu(policy->cpu, policy->cpus);

	return 0;
}

static struct cpufreq_driver intel_pstate_driver = {
	.flags		= CPUFREQ_CONST_LOOPS,
	.verify		= intel_pstate_verify_policy,
	.setpolicy	= intel_pstate_set_policy,
	.get		= intel_pstate_get,
	.init		= intel_pstate_cpu_init,
	.stop_cpu	= intel_pstate_stop_cpu,
	.name		= "intel_pstate",
};

static int __initdata no_load;
static int __initdata no_hwp;
static int __initdata hwp_only;
static unsigned int force_load;

static int intel_pstate_msrs_not_valid(void)
{
	if (!pstate_funcs.get_max() ||
	    !pstate_funcs.get_min() ||
	    !pstate_funcs.get_turbo())
		return -ENODEV;

	return 0;
}

static void copy_pid_params(struct pstate_adjust_policy *policy)
{
	pid_params.sample_rate_ms = policy->sample_rate_ms;
	pid_params.p_gain_pct = policy->p_gain_pct;
	pid_params.i_gain_pct = policy->i_gain_pct;
	pid_params.d_gain_pct = policy->d_gain_pct;
	pid_params.deadband = policy->deadband;
	pid_params.setpoint = policy->setpoint;
}

static void copy_cpu_funcs(struct pstate_funcs *funcs)
{
	pstate_funcs.get_max   = funcs->get_max;
	pstate_funcs.get_min   = funcs->get_min;
	pstate_funcs.get_turbo = funcs->get_turbo;
	pstate_funcs.get_scaling = funcs->get_scaling;
	pstate_funcs.set       = funcs->set;
	pstate_funcs.get_vid   = funcs->get_vid;
}

#if IS_ENABLED(CONFIG_ACPI)
#include <acpi/processor.h>

static bool intel_pstate_no_acpi_pss(void)
{
	int i;

	for_each_possible_cpu(i) {
		acpi_status status;
		union acpi_object *pss;
		struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
		struct acpi_processor *pr = per_cpu(processors, i);

		if (!pr)
			continue;

		status = acpi_evaluate_object(pr->handle, "_PSS", NULL, &buffer);
		if (ACPI_FAILURE(status))
			continue;

		pss = buffer.pointer;
		if (pss && pss->type == ACPI_TYPE_PACKAGE) {
			kfree(pss);
			return false;
		}

		kfree(pss);
	}

	return true;
}

static bool intel_pstate_has_acpi_ppc(void)
{
	int i;

	for_each_possible_cpu(i) {
		struct acpi_processor *pr = per_cpu(processors, i);

		if (!pr)
			continue;
		if (acpi_has_method(pr->handle, "_PPC"))
			return true;
	}
	return false;
}

enum {
	PSS,
	PPC,
};

struct hw_vendor_info {
	u16  valid;
	char oem_id[ACPI_OEM_ID_SIZE];
	char oem_table_id[ACPI_OEM_TABLE_ID_SIZE];
	int  oem_pwr_table;
};

/* Hardware vendors whose platforms provide their own power management modes */
static struct hw_vendor_info vendor_info[] = {
	{1, "HP    ", "ProLiant", PSS},
	{1, "ORACLE", "X4-2    ", PPC},
	{1, "ORACLE", "X4-2L   ", PPC},
	{1, "ORACLE", "X4-2B   ", PPC},
	{1, "ORACLE", "X3-2    ", PPC},
	{1, "ORACLE", "X3-2L   ", PPC},
	{1, "ORACLE", "X3-2B   ", PPC},
	{1, "ORACLE", "X4470M2 ", PPC},
	{1, "ORACLE", "X4270M3 ", PPC},
	{1, "ORACLE", "X4270M2 ", PPC},
	{1, "ORACLE", "X4170M2 ", PPC},
	{0, "", ""},
};

static bool intel_pstate_platform_pwr_mgmt_exists(void)
{
	struct acpi_table_header hdr;
	struct hw_vendor_info *v_info;
	const struct x86_cpu_id *id;
	u64 misc_pwr;

	id = x86_match_cpu(intel_pstate_cpu_oob_ids);
	if (id) {
		rdmsrl(MSR_MISC_PWR_MGMT, misc_pwr);
		if (misc_pwr & (1 << 8))
			return true;
	}

	if (acpi_disabled ||
	    ACPI_FAILURE(acpi_get_table_header(ACPI_SIG_FADT, 0, &hdr)))
		return false;

	for (v_info = vendor_info; v_info->valid; v_info++) {
		if (!strncmp(hdr.oem_id, v_info->oem_id, ACPI_OEM_ID_SIZE) &&
			!strncmp(hdr.oem_table_id, v_info->oem_table_id,
						ACPI_OEM_TABLE_ID_SIZE))
			switch (v_info->oem_pwr_table) {
			case PSS:
				return intel_pstate_no_acpi_pss();
			case PPC:
				return intel_pstate_has_acpi_ppc() &&
					(!force_load);
			}
	}

	return false;
}
#else /* CONFIG_ACPI not enabled */
static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; }
static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
#endif /* CONFIG_ACPI */

static int __init intel_pstate_init(void)
{
	int cpu, rc = 0;
	const struct x86_cpu_id *id;
	struct cpu_defaults *cpu_def;

	if (no_load)
		return -ENODEV;

	id = x86_match_cpu(intel_pstate_cpu_ids);
	if (!id)
		return -ENODEV;

	/*
	 * The Intel pstate driver will be ignored if the platform
	 * firmware has its own power management modes.
	 */
	if (intel_pstate_platform_pwr_mgmt_exists())
		return -ENODEV;

	cpu_def = (struct cpu_defaults *)id->driver_data;

	copy_pid_params(&cpu_def->pid_policy);
	copy_cpu_funcs(&cpu_def->funcs);

	if (intel_pstate_msrs_not_valid())
		return -ENODEV;

	pr_info("Intel P-state driver initializing.\n");

	all_cpu_data = vzalloc(sizeof(void *) * num_possible_cpus());
	if (!all_cpu_data)
		return -ENOMEM;

	if (static_cpu_has_safe(X86_FEATURE_HWP) && !no_hwp)
		intel_pstate_hwp_enable();

	if (!hwp_active && hwp_only)
		goto out;

	rc = cpufreq_register_driver(&intel_pstate_driver);
	if (rc)
		goto out;

	intel_pstate_debug_expose_params();
	intel_pstate_sysfs_expose_params();

	return rc;
out:
	get_online_cpus();
	for_each_online_cpu(cpu) {
		if (all_cpu_data[cpu]) {
			del_timer_sync(&all_cpu_data[cpu]->timer);
			kfree(all_cpu_data[cpu]);
		}
	}

	put_online_cpus();
	vfree(all_cpu_data);
	return -ENODEV;
}
device_initcall(intel_pstate_init);

static int __init intel_pstate_setup(char *str)
{
	if (!str)
		return -EINVAL;

	if (!strcmp(str, "disable"))
		no_load = 1;
	if (!strcmp(str, "no_hwp"))
		no_hwp = 1;
	if (!strcmp(str, "force"))
		force_load = 1;
	if (!strcmp(str, "hwp_only"))
		hwp_only = 1;
	return 0;
}
early_param("intel_pstate", intel_pstate_setup);

MODULE_AUTHOR("Dirk Brandewie <dirk.j.brandewie@intel.com>");
MODULE_DESCRIPTION("'intel_pstate' - P state driver for Intel Core processors");
MODULE_LICENSE("GPL");