1/*
2 *  drivers/cpufreq/cpufreq_ondemand.c
3 *
4 *  Copyright (C)  2001 Russell King
5 *            (C)  2003 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>.
6 *                      Jun Nakajima <jun.nakajima@intel.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14
15#include <linux/cpu.h>
16#include <linux/percpu-defs.h>
17#include <linux/slab.h>
18#include <linux/tick.h>
19#include "cpufreq_governor.h"
20
21/* On-demand governor macros */
22#define DEF_FREQUENCY_UP_THRESHOLD		(80)
23#define DEF_SAMPLING_DOWN_FACTOR		(1)
24#define MAX_SAMPLING_DOWN_FACTOR		(100000)
25#define MICRO_FREQUENCY_UP_THRESHOLD		(95)
26#define MICRO_FREQUENCY_MIN_SAMPLE_RATE		(10000)
27#define MIN_FREQUENCY_UP_THRESHOLD		(11)
28#define MAX_FREQUENCY_UP_THRESHOLD		(100)
29
30static DEFINE_PER_CPU(struct od_cpu_dbs_info_s, od_cpu_dbs_info);
31
32static struct od_ops od_ops;
33
34#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
35static struct cpufreq_governor cpufreq_gov_ondemand;
36#endif
37
38static unsigned int default_powersave_bias;
39
40static void ondemand_powersave_bias_init_cpu(int cpu)
41{
42	struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
43
44	dbs_info->freq_table = cpufreq_frequency_get_table(cpu);
45	dbs_info->freq_lo = 0;
46}
47
48/*
49 * Not all CPUs want IO time to be accounted as busy; this depends on how
50 * efficient idling at a higher frequency/voltage is.
51 * Pavel Machek says this is not so for various generations of AMD and old
52 * Intel systems.
53 * Mike Chan (android.com) claims this is also not true for ARM.
54 * Because of this, whitelist specific known (series) of CPUs by default, and
55 * leave all others up to the user.
56 */
static int should_io_be_busy(void)
{
	int io_busy = 0;

#if defined(CONFIG_X86)
	/* Intel family 6, model 15 (Core 2) and newer idle efficiently. */
	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
	    boot_cpu_data.x86 == 6 &&
	    boot_cpu_data.x86_model >= 15)
		io_busy = 1;
#endif
	/* Everything else is left for the user to opt in. */
	return io_busy;
}
70
71/*
72 * Find right freq to be set now with powersave_bias on.
73 * Returns the freq_hi to be used right now and will set freq_hi_jiffies,
74 * freq_lo, and freq_lo_jiffies in percpu area for averaging freqs.
75 */
76static unsigned int generic_powersave_bias_target(struct cpufreq_policy *policy,
77		unsigned int freq_next, unsigned int relation)
78{
79	unsigned int freq_req, freq_reduc, freq_avg;
80	unsigned int freq_hi, freq_lo;
81	unsigned int index = 0;
82	unsigned int jiffies_total, jiffies_hi, jiffies_lo;
83	struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info,
84						   policy->cpu);
85	struct dbs_data *dbs_data = policy->governor_data;
86	struct od_dbs_tuners *od_tuners = dbs_data->tuners;
87
88	if (!dbs_info->freq_table) {
89		dbs_info->freq_lo = 0;
90		dbs_info->freq_lo_jiffies = 0;
91		return freq_next;
92	}
93
94	cpufreq_frequency_table_target(policy, dbs_info->freq_table, freq_next,
95			relation, &index);
96	freq_req = dbs_info->freq_table[index].frequency;
97	freq_reduc = freq_req * od_tuners->powersave_bias / 1000;
98	freq_avg = freq_req - freq_reduc;
99
100	/* Find freq bounds for freq_avg in freq_table */
101	index = 0;
102	cpufreq_frequency_table_target(policy, dbs_info->freq_table, freq_avg,
103			CPUFREQ_RELATION_H, &index);
104	freq_lo = dbs_info->freq_table[index].frequency;
105	index = 0;
106	cpufreq_frequency_table_target(policy, dbs_info->freq_table, freq_avg,
107			CPUFREQ_RELATION_L, &index);
108	freq_hi = dbs_info->freq_table[index].frequency;
109
110	/* Find out how long we have to be in hi and lo freqs */
111	if (freq_hi == freq_lo) {
112		dbs_info->freq_lo = 0;
113		dbs_info->freq_lo_jiffies = 0;
114		return freq_lo;
115	}
116	jiffies_total = usecs_to_jiffies(od_tuners->sampling_rate);
117	jiffies_hi = (freq_avg - freq_lo) * jiffies_total;
118	jiffies_hi += ((freq_hi - freq_lo) / 2);
119	jiffies_hi /= (freq_hi - freq_lo);
120	jiffies_lo = jiffies_total - jiffies_hi;
121	dbs_info->freq_lo = freq_lo;
122	dbs_info->freq_lo_jiffies = jiffies_lo;
123	dbs_info->freq_hi_jiffies = jiffies_hi;
124	return freq_hi;
125}
126
127static void ondemand_powersave_bias_init(void)
128{
129	int i;
130	for_each_online_cpu(i) {
131		ondemand_powersave_bias_init_cpu(i);
132	}
133}
134
135static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq)
136{
137	struct dbs_data *dbs_data = policy->governor_data;
138	struct od_dbs_tuners *od_tuners = dbs_data->tuners;
139
140	if (od_tuners->powersave_bias)
141		freq = od_ops.powersave_bias_target(policy, freq,
142				CPUFREQ_RELATION_H);
143	else if (policy->cur == policy->max)
144		return;
145
146	__cpufreq_driver_target(policy, freq, od_tuners->powersave_bias ?
147			CPUFREQ_RELATION_L : CPUFREQ_RELATION_H);
148}
149
/*
 * Every sampling_rate, we check whether the load exceeds up_threshold (80% by
 * default, i.e. idle time below 20%; 95% when idle micro-accounting is
 * available). If it does, we try to increase the frequency. Otherwise, we
 * adjust the frequency proportionally to the load.
 */
155static void od_check_cpu(int cpu, unsigned int load)
156{
157	struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
158	struct cpufreq_policy *policy = dbs_info->cdbs.cur_policy;
159	struct dbs_data *dbs_data = policy->governor_data;
160	struct od_dbs_tuners *od_tuners = dbs_data->tuners;
161
162	dbs_info->freq_lo = 0;
163
164	/* Check for frequency increase */
165	if (load > od_tuners->up_threshold) {
166		/* If switching to max speed, apply sampling_down_factor */
167		if (policy->cur < policy->max)
168			dbs_info->rate_mult =
169				od_tuners->sampling_down_factor;
170		dbs_freq_increase(policy, policy->max);
171	} else {
172		/* Calculate the next frequency proportional to load */
173		unsigned int freq_next, min_f, max_f;
174
175		min_f = policy->cpuinfo.min_freq;
176		max_f = policy->cpuinfo.max_freq;
177		freq_next = min_f + load * (max_f - min_f) / 100;
178
179		/* No longer fully busy, reset rate_mult */
180		dbs_info->rate_mult = 1;
181
182		if (!od_tuners->powersave_bias) {
183			__cpufreq_driver_target(policy, freq_next,
184					CPUFREQ_RELATION_C);
185			return;
186		}
187
188		freq_next = od_ops.powersave_bias_target(policy, freq_next,
189					CPUFREQ_RELATION_L);
190		__cpufreq_driver_target(policy, freq_next, CPUFREQ_RELATION_C);
191	}
192}
193
194static void od_dbs_timer(struct work_struct *work)
195{
196	struct od_cpu_dbs_info_s *dbs_info =
197		container_of(work, struct od_cpu_dbs_info_s, cdbs.work.work);
198	unsigned int cpu = dbs_info->cdbs.cur_policy->cpu;
199	struct od_cpu_dbs_info_s *core_dbs_info = &per_cpu(od_cpu_dbs_info,
200			cpu);
201	struct dbs_data *dbs_data = dbs_info->cdbs.cur_policy->governor_data;
202	struct od_dbs_tuners *od_tuners = dbs_data->tuners;
203	int delay = 0, sample_type = core_dbs_info->sample_type;
204	bool modify_all = true;
205
206	mutex_lock(&core_dbs_info->cdbs.timer_mutex);
207	if (!need_load_eval(&core_dbs_info->cdbs, od_tuners->sampling_rate)) {
208		modify_all = false;
209		goto max_delay;
210	}
211
212	/* Common NORMAL_SAMPLE setup */
213	core_dbs_info->sample_type = OD_NORMAL_SAMPLE;
214	if (sample_type == OD_SUB_SAMPLE) {
215		delay = core_dbs_info->freq_lo_jiffies;
216		__cpufreq_driver_target(core_dbs_info->cdbs.cur_policy,
217				core_dbs_info->freq_lo, CPUFREQ_RELATION_H);
218	} else {
219		dbs_check_cpu(dbs_data, cpu);
220		if (core_dbs_info->freq_lo) {
221			/* Setup timer for SUB_SAMPLE */
222			core_dbs_info->sample_type = OD_SUB_SAMPLE;
223			delay = core_dbs_info->freq_hi_jiffies;
224		}
225	}
226
227max_delay:
228	if (!delay)
229		delay = delay_for_sampling_rate(od_tuners->sampling_rate
230				* core_dbs_info->rate_mult);
231
232	gov_queue_work(dbs_data, dbs_info->cdbs.cur_policy, delay, modify_all);
233	mutex_unlock(&core_dbs_info->cdbs.timer_mutex);
234}
235
236/************************** sysfs interface ************************/
237static struct common_dbs_data od_dbs_cdata;
238
239/**
240 * update_sampling_rate - update sampling rate effective immediately if needed.
241 * @new_rate: new sampling rate
242 *
243 * If new rate is smaller than the old, simply updating
244 * dbs_tuners_int.sampling_rate might not be appropriate. For example, if the
245 * original sampling_rate was 1 second and the requested new sampling rate is 10
246 * ms because the user needs immediate reaction from ondemand governor, but not
247 * sure if higher frequency will be required or not, then, the governor may
248 * change the sampling rate too late; up to 1 second later. Thus, if we are
249 * reducing the sampling rate, we need to make the new value effective
250 * immediately.
251 */
252static void update_sampling_rate(struct dbs_data *dbs_data,
253		unsigned int new_rate)
254{
255	struct od_dbs_tuners *od_tuners = dbs_data->tuners;
256	int cpu;
257
258	od_tuners->sampling_rate = new_rate = max(new_rate,
259			dbs_data->min_sampling_rate);
260
261	for_each_online_cpu(cpu) {
262		struct cpufreq_policy *policy;
263		struct od_cpu_dbs_info_s *dbs_info;
264		unsigned long next_sampling, appointed_at;
265
266		policy = cpufreq_cpu_get(cpu);
267		if (!policy)
268			continue;
269		if (policy->governor != &cpufreq_gov_ondemand) {
270			cpufreq_cpu_put(policy);
271			continue;
272		}
273		dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
274		cpufreq_cpu_put(policy);
275
276		mutex_lock(&dbs_info->cdbs.timer_mutex);
277
278		if (!delayed_work_pending(&dbs_info->cdbs.work)) {
279			mutex_unlock(&dbs_info->cdbs.timer_mutex);
280			continue;
281		}
282
283		next_sampling = jiffies + usecs_to_jiffies(new_rate);
284		appointed_at = dbs_info->cdbs.work.timer.expires;
285
286		if (time_before(next_sampling, appointed_at)) {
287
288			mutex_unlock(&dbs_info->cdbs.timer_mutex);
289			cancel_delayed_work_sync(&dbs_info->cdbs.work);
290			mutex_lock(&dbs_info->cdbs.timer_mutex);
291
292			gov_queue_work(dbs_data, dbs_info->cdbs.cur_policy,
293					usecs_to_jiffies(new_rate), true);
294
295		}
296		mutex_unlock(&dbs_info->cdbs.timer_mutex);
297	}
298}
299
300static ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf,
301		size_t count)
302{
303	unsigned int input;
304	int ret;
305	ret = sscanf(buf, "%u", &input);
306	if (ret != 1)
307		return -EINVAL;
308
309	update_sampling_rate(dbs_data, input);
310	return count;
311}
312
313static ssize_t store_io_is_busy(struct dbs_data *dbs_data, const char *buf,
314		size_t count)
315{
316	struct od_dbs_tuners *od_tuners = dbs_data->tuners;
317	unsigned int input;
318	int ret;
319	unsigned int j;
320
321	ret = sscanf(buf, "%u", &input);
322	if (ret != 1)
323		return -EINVAL;
324	od_tuners->io_is_busy = !!input;
325
326	/* we need to re-evaluate prev_cpu_idle */
327	for_each_online_cpu(j) {
328		struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info,
329									j);
330		dbs_info->cdbs.prev_cpu_idle = get_cpu_idle_time(j,
331			&dbs_info->cdbs.prev_cpu_wall, od_tuners->io_is_busy);
332	}
333	return count;
334}
335
336static ssize_t store_up_threshold(struct dbs_data *dbs_data, const char *buf,
337		size_t count)
338{
339	struct od_dbs_tuners *od_tuners = dbs_data->tuners;
340	unsigned int input;
341	int ret;
342	ret = sscanf(buf, "%u", &input);
343
344	if (ret != 1 || input > MAX_FREQUENCY_UP_THRESHOLD ||
345			input < MIN_FREQUENCY_UP_THRESHOLD) {
346		return -EINVAL;
347	}
348
349	od_tuners->up_threshold = input;
350	return count;
351}
352
353static ssize_t store_sampling_down_factor(struct dbs_data *dbs_data,
354		const char *buf, size_t count)
355{
356	struct od_dbs_tuners *od_tuners = dbs_data->tuners;
357	unsigned int input, j;
358	int ret;
359	ret = sscanf(buf, "%u", &input);
360
361	if (ret != 1 || input > MAX_SAMPLING_DOWN_FACTOR || input < 1)
362		return -EINVAL;
363	od_tuners->sampling_down_factor = input;
364
365	/* Reset down sampling multiplier in case it was active */
366	for_each_online_cpu(j) {
367		struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info,
368				j);
369		dbs_info->rate_mult = 1;
370	}
371	return count;
372}
373
374static ssize_t store_ignore_nice_load(struct dbs_data *dbs_data,
375		const char *buf, size_t count)
376{
377	struct od_dbs_tuners *od_tuners = dbs_data->tuners;
378	unsigned int input;
379	int ret;
380
381	unsigned int j;
382
383	ret = sscanf(buf, "%u", &input);
384	if (ret != 1)
385		return -EINVAL;
386
387	if (input > 1)
388		input = 1;
389
390	if (input == od_tuners->ignore_nice_load) { /* nothing to do */
391		return count;
392	}
393	od_tuners->ignore_nice_load = input;
394
395	/* we need to re-evaluate prev_cpu_idle */
396	for_each_online_cpu(j) {
397		struct od_cpu_dbs_info_s *dbs_info;
398		dbs_info = &per_cpu(od_cpu_dbs_info, j);
399		dbs_info->cdbs.prev_cpu_idle = get_cpu_idle_time(j,
400			&dbs_info->cdbs.prev_cpu_wall, od_tuners->io_is_busy);
401		if (od_tuners->ignore_nice_load)
402			dbs_info->cdbs.prev_cpu_nice =
403				kcpustat_cpu(j).cpustat[CPUTIME_NICE];
404
405	}
406	return count;
407}
408
409static ssize_t store_powersave_bias(struct dbs_data *dbs_data, const char *buf,
410		size_t count)
411{
412	struct od_dbs_tuners *od_tuners = dbs_data->tuners;
413	unsigned int input;
414	int ret;
415	ret = sscanf(buf, "%u", &input);
416
417	if (ret != 1)
418		return -EINVAL;
419
420	if (input > 1000)
421		input = 1000;
422
423	od_tuners->powersave_bias = input;
424	ondemand_powersave_bias_init();
425	return count;
426}
427
/*
 * Accessors and attribute objects, grouped per tunable. Each read-write
 * tunable gets its show/store pair followed by its attribute objects;
 * sampling_rate_min is read-only.
 */
show_store_one(od, sampling_rate);
gov_sys_pol_attr_rw(sampling_rate);

show_store_one(od, io_is_busy);
gov_sys_pol_attr_rw(io_is_busy);

show_store_one(od, up_threshold);
gov_sys_pol_attr_rw(up_threshold);

show_store_one(od, sampling_down_factor);
gov_sys_pol_attr_rw(sampling_down_factor);

show_store_one(od, ignore_nice_load);
gov_sys_pol_attr_rw(ignore_nice_load);

show_store_one(od, powersave_bias);
gov_sys_pol_attr_rw(powersave_bias);

declare_show_sampling_rate_min(od);
gov_sys_pol_attr_ro(sampling_rate_min);
443
444static struct attribute *dbs_attributes_gov_sys[] = {
445	&sampling_rate_min_gov_sys.attr,
446	&sampling_rate_gov_sys.attr,
447	&up_threshold_gov_sys.attr,
448	&sampling_down_factor_gov_sys.attr,
449	&ignore_nice_load_gov_sys.attr,
450	&powersave_bias_gov_sys.attr,
451	&io_is_busy_gov_sys.attr,
452	NULL
453};
454
455static struct attribute_group od_attr_group_gov_sys = {
456	.attrs = dbs_attributes_gov_sys,
457	.name = "ondemand",
458};
459
460static struct attribute *dbs_attributes_gov_pol[] = {
461	&sampling_rate_min_gov_pol.attr,
462	&sampling_rate_gov_pol.attr,
463	&up_threshold_gov_pol.attr,
464	&sampling_down_factor_gov_pol.attr,
465	&ignore_nice_load_gov_pol.attr,
466	&powersave_bias_gov_pol.attr,
467	&io_is_busy_gov_pol.attr,
468	NULL
469};
470
471static struct attribute_group od_attr_group_gov_pol = {
472	.attrs = dbs_attributes_gov_pol,
473	.name = "ondemand",
474};
475
476/************************** sysfs end ************************/
477
478static int od_init(struct dbs_data *dbs_data)
479{
480	struct od_dbs_tuners *tuners;
481	u64 idle_time;
482	int cpu;
483
484	tuners = kzalloc(sizeof(*tuners), GFP_KERNEL);
485	if (!tuners) {
486		pr_err("%s: kzalloc failed\n", __func__);
487		return -ENOMEM;
488	}
489
490	cpu = get_cpu();
491	idle_time = get_cpu_idle_time_us(cpu, NULL);
492	put_cpu();
493	if (idle_time != -1ULL) {
494		/* Idle micro accounting is supported. Use finer thresholds */
495		tuners->up_threshold = MICRO_FREQUENCY_UP_THRESHOLD;
496		/*
497		 * In nohz/micro accounting case we set the minimum frequency
498		 * not depending on HZ, but fixed (very low). The deferred
499		 * timer might skip some samples if idle/sleeping as needed.
500		*/
501		dbs_data->min_sampling_rate = MICRO_FREQUENCY_MIN_SAMPLE_RATE;
502	} else {
503		tuners->up_threshold = DEF_FREQUENCY_UP_THRESHOLD;
504
505		/* For correct statistics, we need 10 ticks for each measure */
506		dbs_data->min_sampling_rate = MIN_SAMPLING_RATE_RATIO *
507			jiffies_to_usecs(10);
508	}
509
510	tuners->sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR;
511	tuners->ignore_nice_load = 0;
512	tuners->powersave_bias = default_powersave_bias;
513	tuners->io_is_busy = should_io_be_busy();
514
515	dbs_data->tuners = tuners;
516	mutex_init(&dbs_data->mutex);
517	return 0;
518}
519
520static void od_exit(struct dbs_data *dbs_data)
521{
522	kfree(dbs_data->tuners);
523}
524
525define_get_cpu_dbs_routines(od_cpu_dbs_info);
526
527static struct od_ops od_ops = {
528	.powersave_bias_init_cpu = ondemand_powersave_bias_init_cpu,
529	.powersave_bias_target = generic_powersave_bias_target,
530	.freq_increase = dbs_freq_increase,
531};
532
533static struct common_dbs_data od_dbs_cdata = {
534	.governor = GOV_ONDEMAND,
535	.attr_group_gov_sys = &od_attr_group_gov_sys,
536	.attr_group_gov_pol = &od_attr_group_gov_pol,
537	.get_cpu_cdbs = get_cpu_cdbs,
538	.get_cpu_dbs_info_s = get_cpu_dbs_info_s,
539	.gov_dbs_timer = od_dbs_timer,
540	.gov_check_cpu = od_check_cpu,
541	.gov_ops = &od_ops,
542	.init = od_init,
543	.exit = od_exit,
544};
545
546static void od_set_powersave_bias(unsigned int powersave_bias)
547{
548	struct cpufreq_policy *policy;
549	struct dbs_data *dbs_data;
550	struct od_dbs_tuners *od_tuners;
551	unsigned int cpu;
552	cpumask_t done;
553
554	default_powersave_bias = powersave_bias;
555	cpumask_clear(&done);
556
557	get_online_cpus();
558	for_each_online_cpu(cpu) {
559		if (cpumask_test_cpu(cpu, &done))
560			continue;
561
562		policy = per_cpu(od_cpu_dbs_info, cpu).cdbs.cur_policy;
563		if (!policy)
564			continue;
565
566		cpumask_or(&done, &done, policy->cpus);
567
568		if (policy->governor != &cpufreq_gov_ondemand)
569			continue;
570
571		dbs_data = policy->governor_data;
572		od_tuners = dbs_data->tuners;
573		od_tuners->powersave_bias = default_powersave_bias;
574	}
575	put_online_cpus();
576}
577
578void od_register_powersave_bias_handler(unsigned int (*f)
579		(struct cpufreq_policy *, unsigned int, unsigned int),
580		unsigned int powersave_bias)
581{
582	od_ops.powersave_bias_target = f;
583	od_set_powersave_bias(powersave_bias);
584}
585EXPORT_SYMBOL_GPL(od_register_powersave_bias_handler);
586
587void od_unregister_powersave_bias_handler(void)
588{
589	od_ops.powersave_bias_target = generic_powersave_bias_target;
590	od_set_powersave_bias(0);
591}
592EXPORT_SYMBOL_GPL(od_unregister_powersave_bias_handler);
593
594static int od_cpufreq_governor_dbs(struct cpufreq_policy *policy,
595		unsigned int event)
596{
597	return cpufreq_governor_dbs(policy, &od_dbs_cdata, event);
598}
599
600#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
601static
602#endif
603struct cpufreq_governor cpufreq_gov_ondemand = {
604	.name			= "ondemand",
605	.governor		= od_cpufreq_governor_dbs,
606	.max_transition_latency	= TRANSITION_LATENCY_LIMIT,
607	.owner			= THIS_MODULE,
608};
609
610static int __init cpufreq_gov_dbs_init(void)
611{
612	return cpufreq_register_governor(&cpufreq_gov_ondemand);
613}
614
615static void __exit cpufreq_gov_dbs_exit(void)
616{
617	cpufreq_unregister_governor(&cpufreq_gov_ondemand);
618}
619
620MODULE_AUTHOR("Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>");
621MODULE_AUTHOR("Alexey Starikovskiy <alexey.y.starikovskiy@intel.com>");
622MODULE_DESCRIPTION("'cpufreq_ondemand' - A dynamic cpufreq governor for "
623	"Low Latency Frequency Transition capable processors");
624MODULE_LICENSE("GPL");
625
626#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
627fs_initcall(cpufreq_gov_dbs_init);
628#else
629module_init(cpufreq_gov_dbs_init);
630#endif
631module_exit(cpufreq_gov_dbs_exit);
632