This source file includes following definitions.
- arch_asym_cpu_priority
- update_load_add
- update_load_sub
- update_load_set
- get_update_sysctl_factor
- update_sysctl
- sched_init_granularity
- __update_inv_weight
- __calc_delta
- task_of
- task_cfs_rq
- cfs_rq_of
- group_cfs_rq
- cfs_rq_tg_path
- list_add_leaf_cfs_rq
- list_del_leaf_cfs_rq
- assert_list_leaf_cfs_rq
- is_same_group
- parent_entity
- find_matching_se
- task_of
- task_cfs_rq
- cfs_rq_of
- group_cfs_rq
- cfs_rq_tg_path
- list_add_leaf_cfs_rq
- list_del_leaf_cfs_rq
- assert_list_leaf_cfs_rq
- parent_entity
- find_matching_se
- max_vruntime
- min_vruntime
- entity_before
- update_min_vruntime
- __enqueue_entity
- __dequeue_entity
- __pick_first_entity
- __pick_next_entity
- __pick_last_entity
- sched_proc_update_handler
- calc_delta_fair
- __sched_period
- sched_slice
- sched_vslice
- init_entity_runnable_average
- post_init_entity_util_avg
- init_entity_runnable_average
- post_init_entity_util_avg
- update_tg_load_avg
- update_curr
- update_curr_fair
- update_stats_wait_start
- update_stats_wait_end
- update_stats_enqueue_sleeper
- update_stats_enqueue
- update_stats_dequeue
- update_stats_curr_start
- deref_task_numa_group
- deref_curr_numa_group
- task_nr_scan_windows
- task_scan_min
- task_scan_start
- task_scan_max
- account_numa_enqueue
- account_numa_dequeue
- task_numa_group_id
- task_faults_idx
- task_faults
- group_faults
- group_faults_cpu
- group_faults_priv
- group_faults_shared
- numa_is_active_node
- score_nearby_nodes
- task_weight
- group_weight
- should_numa_migrate_memory
- update_numa_stats
- task_numa_assign
- load_too_imbalanced
- task_numa_compare
- task_numa_find_cpu
- task_numa_migrate
- numa_migrate_preferred
- numa_group_count_active_nodes
- update_task_scan_period
- numa_get_avg_runtime
- preferred_group_nid
- task_numa_placement
- get_numa_group
- put_numa_group
- task_numa_group
- task_numa_free
- task_numa_fault
- reset_ptenuma_scan
- task_numa_work
- init_numa_balancing
- task_tick_numa
- update_scan_period
- task_tick_numa
- account_numa_enqueue
- account_numa_dequeue
- update_scan_period
- account_entity_enqueue
- account_entity_dequeue
- enqueue_runnable_load_avg
- dequeue_runnable_load_avg
- enqueue_load_avg
- dequeue_load_avg
- enqueue_runnable_load_avg
- dequeue_runnable_load_avg
- enqueue_load_avg
- dequeue_load_avg
- reweight_entity
- reweight_task
- calc_group_shares
- calc_group_runnable
- update_cfs_group
- update_cfs_group
- cfs_rq_util_change
- update_tg_load_avg
- set_task_rq_fair
- update_tg_cfs_util
- update_tg_cfs_runnable
- add_tg_cfs_propagate
- propagate_entity_load_avg
- skip_blocked_update
- update_tg_load_avg
- propagate_entity_load_avg
- add_tg_cfs_propagate
- update_cfs_rq_load_avg
- attach_entity_load_avg
- detach_entity_load_avg
- update_load_avg
- cfs_rq_last_update_time
- cfs_rq_last_update_time
- sync_entity_load_avg
- remove_entity_load_avg
- cfs_rq_runnable_load_avg
- cfs_rq_load_avg
- task_util
- _task_util_est
- task_util_est
- util_est_enqueue
- within_margin
- util_est_dequeue
- task_fits_capacity
- update_misfit_status
- update_load_avg
- remove_entity_load_avg
- attach_entity_load_avg
- detach_entity_load_avg
- idle_balance
- util_est_enqueue
- util_est_dequeue
- update_misfit_status
- check_spread
- place_entity
- check_schedstat_required
- enqueue_entity
- __clear_buddies_last
- __clear_buddies_next
- __clear_buddies_skip
- clear_buddies
- dequeue_entity
- check_preempt_tick
- set_next_entity
- pick_next_entity
- put_prev_entity
- entity_tick
- cfs_bandwidth_used
- cfs_bandwidth_usage_inc
- cfs_bandwidth_usage_dec
- cfs_bandwidth_used
- cfs_bandwidth_usage_inc
- cfs_bandwidth_usage_dec
- default_cfs_period
- sched_cfs_bandwidth_slice
- __refill_cfs_bandwidth_runtime
- tg_cfs_bandwidth
- assign_cfs_rq_runtime
- __account_cfs_rq_runtime
- account_cfs_rq_runtime
- cfs_rq_throttled
- throttled_hierarchy
- throttled_lb_pair
- tg_unthrottle_up
- tg_throttle_down
- throttle_cfs_rq
- unthrottle_cfs_rq
- distribute_cfs_runtime
- do_sched_cfs_period_timer
- runtime_refresh_within
- start_cfs_slack_bandwidth
- __return_cfs_rq_runtime
- return_cfs_rq_runtime
- do_sched_cfs_slack_timer
- check_enqueue_throttle
- sync_throttle
- check_cfs_rq_runtime
- sched_cfs_slack_timer
- sched_cfs_period_timer
- init_cfs_bandwidth
- init_cfs_rq_runtime
- start_cfs_bandwidth
- destroy_cfs_bandwidth
- update_runtime_enabled
- unthrottle_offline_cfs_rqs
- cfs_bandwidth_used
- account_cfs_rq_runtime
- check_cfs_rq_runtime
- check_enqueue_throttle
- sync_throttle
- return_cfs_rq_runtime
- cfs_rq_throttled
- throttled_hierarchy
- throttled_lb_pair
- init_cfs_bandwidth
- init_cfs_rq_runtime
- tg_cfs_bandwidth
- destroy_cfs_bandwidth
- update_runtime_enabled
- unthrottle_offline_cfs_rqs
- hrtick_start_fair
- hrtick_update
- hrtick_start_fair
- hrtick_update
- cpu_overutilized
- update_overutilized_status
- update_overutilized_status
- enqueue_task_fair
- dequeue_task_fair
- sched_idle_cpu
- cpu_runnable_load
- capacity_of
- cpu_avg_load_per_task
- record_wakee
- wake_wide
- wake_affine_idle
- wake_affine_weight
- wake_affine
- capacity_spare_without
- find_idlest_group
- find_idlest_group_cpu
- find_idlest_cpu
- set_idle_cores
- test_idle_cores
- __update_idle_core
- select_idle_core
- select_idle_smt
- select_idle_core
- select_idle_smt
- select_idle_cpu
- select_idle_sibling
- cpu_util
- cpu_util_without
- wake_cap
- cpu_util_next
- compute_energy
- find_energy_efficient_cpu
- select_task_rq_fair
- migrate_task_rq_fair
- task_dead_fair
- balance_fair
- wakeup_gran
- wakeup_preempt_entity
- set_last_buddy
- set_next_buddy
- set_skip_buddy
- check_preempt_wakeup
- pick_next_task_fair
- put_prev_task_fair
- yield_task_fair
- yield_to_task_fair
- task_hot
- migrate_degrades_locality
- migrate_degrades_locality
- can_migrate_task
- detach_task
- detach_one_task
- detach_tasks
- attach_task
- attach_one_task
- attach_tasks
- cfs_rq_has_blocked
- others_have_blocked
- update_blocked_load_status
- cfs_rq_has_blocked
- others_have_blocked
- update_blocked_load_status
- __update_blocked_others
- cfs_rq_is_decayed
- __update_blocked_fair
- update_cfs_rq_h_load
- task_h_load
- __update_blocked_fair
- task_h_load
- update_blocked_averages
- init_sd_lb_stats
- scale_rt_capacity
- update_cpu_capacity
- update_group_capacity
- check_cpu_capacity
- check_misfit_status
- sg_imbalanced
- group_has_capacity
- group_is_overloaded
- group_smaller_min_cpu_capacity
- group_smaller_max_cpu_capacity
- group_classify
- update_nohz_stats
- update_sg_lb_stats
- update_sd_pick_busiest
- fbq_classify_group
- fbq_classify_rq
- fbq_classify_group
- fbq_classify_rq
- update_sd_lb_stats
- check_asym_packing
- fix_small_imbalance
- calculate_imbalance
- find_busiest_group
- find_busiest_queue
- asym_active_balance
- voluntary_active_balance
- need_active_balance
- should_we_balance
- load_balance
- get_sd_balance_interval
- update_next_balance
- active_load_balance_cpu_stop
- update_max_interval
- rebalance_domains
- on_null_domain
- find_new_ilb
- kick_ilb
- nohz_balancer_kick
- set_cpu_sd_state_busy
- nohz_balance_exit_idle
- set_cpu_sd_state_idle
- nohz_balance_enter_idle
- _nohz_idle_balance
- nohz_idle_balance
- nohz_newidle_balance
- nohz_balancer_kick
- nohz_idle_balance
- nohz_newidle_balance
- newidle_balance
- run_rebalance_domains
- trigger_load_balance
- rq_online_fair
- rq_offline_fair
- task_tick_fair
- task_fork_fair
- prio_changed_fair
- vruntime_normalized
- propagate_entity_cfs_rq
- propagate_entity_cfs_rq
- detach_entity_cfs_rq
- attach_entity_cfs_rq
- detach_task_cfs_rq
- attach_task_cfs_rq
- switched_from_fair
- switched_to_fair
- set_next_task_fair
- init_cfs_rq
- task_set_group_fair
- task_move_group_fair
- task_change_group_fair
- free_fair_sched_group
- alloc_fair_sched_group
- online_fair_sched_group
- unregister_fair_sched_group
- init_tg_cfs_entry
- sched_group_set_shares
- free_fair_sched_group
- alloc_fair_sched_group
- online_fair_sched_group
- unregister_fair_sched_group
- get_rr_interval_fair
- print_cfs_stats
- show_numa_stats
- init_sched_fair_class
- sched_trace_cfs_rq_avg
- sched_trace_cfs_rq_path
- sched_trace_cfs_rq_cpu
- sched_trace_rq_avg_rt
- sched_trace_rq_avg_dl
- sched_trace_rq_avg_irq
- sched_trace_rq_cpu
- sched_trace_rd_span
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 #include "sched.h"
24
25 #include <trace/events/sched.h>
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40 unsigned int sysctl_sched_latency = 6000000ULL;
41 static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
42
43
44
45
46
47
48
49
50
51
52
53
54 enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
55
56
57
58
59
60
61 unsigned int sysctl_sched_min_granularity = 750000ULL;
62 static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
63
64
65
66
67 static unsigned int sched_nr_latency = 8;
68
69
70
71
72
73 unsigned int sysctl_sched_child_runs_first __read_mostly;
74
75
76
77
78
79
80
81
82
83
84 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
85 static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
86
87 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
88
89 #ifdef CONFIG_SMP
90
91
92
93 int __weak arch_asym_cpu_priority(int cpu)
94 {
95 return -cpu;
96 }
97
98
99
100
101
102
103 #define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
104
105 #endif
106
107 #ifdef CONFIG_CFS_BANDWIDTH
108
109
110
111
112
113
114
115
116
117
118 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
119 #endif
120
121 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
122 {
123 lw->weight += inc;
124 lw->inv_weight = 0;
125 }
126
127 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
128 {
129 lw->weight -= dec;
130 lw->inv_weight = 0;
131 }
132
133 static inline void update_load_set(struct load_weight *lw, unsigned long w)
134 {
135 lw->weight = w;
136 lw->inv_weight = 0;
137 }
138
139
140
141
142
143
144
145
146
147
148 static unsigned int get_update_sysctl_factor(void)
149 {
150 unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
151 unsigned int factor;
152
153 switch (sysctl_sched_tunable_scaling) {
154 case SCHED_TUNABLESCALING_NONE:
155 factor = 1;
156 break;
157 case SCHED_TUNABLESCALING_LINEAR:
158 factor = cpus;
159 break;
160 case SCHED_TUNABLESCALING_LOG:
161 default:
162 factor = 1 + ilog2(cpus);
163 break;
164 }
165
166 return factor;
167 }
168
169 static void update_sysctl(void)
170 {
171 unsigned int factor = get_update_sysctl_factor();
172
173 #define SET_SYSCTL(name) \
174 (sysctl_##name = (factor) * normalized_sysctl_##name)
175 SET_SYSCTL(sched_min_granularity);
176 SET_SYSCTL(sched_latency);
177 SET_SYSCTL(sched_wakeup_granularity);
178 #undef SET_SYSCTL
179 }
180
181 void sched_init_granularity(void)
182 {
183 update_sysctl();
184 }
185
186 #define WMULT_CONST (~0U)
187 #define WMULT_SHIFT 32
188
189 static void __update_inv_weight(struct load_weight *lw)
190 {
191 unsigned long w;
192
193 if (likely(lw->inv_weight))
194 return;
195
196 w = scale_load_down(lw->weight);
197
198 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
199 lw->inv_weight = 1;
200 else if (unlikely(!w))
201 lw->inv_weight = WMULT_CONST;
202 else
203 lw->inv_weight = WMULT_CONST / w;
204 }
205
206
207
208
209
210
211
212
213
214
215
216
217
218 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
219 {
220 u64 fact = scale_load_down(weight);
221 int shift = WMULT_SHIFT;
222
223 __update_inv_weight(lw);
224
225 if (unlikely(fact >> 32)) {
226 while (fact >> 32) {
227 fact >>= 1;
228 shift--;
229 }
230 }
231
232
233 fact = (u64)(u32)fact * lw->inv_weight;
234
235 while (fact >> 32) {
236 fact >>= 1;
237 shift--;
238 }
239
240 return mul_u64_u32_shr(delta_exec, fact, shift);
241 }
242
243
244 const struct sched_class fair_sched_class;
245
246
247
248
249
250 #ifdef CONFIG_FAIR_GROUP_SCHED
251 static inline struct task_struct *task_of(struct sched_entity *se)
252 {
253 SCHED_WARN_ON(!entity_is_task(se));
254 return container_of(se, struct task_struct, se);
255 }
256
257
258 #define for_each_sched_entity(se) \
259 for (; se; se = se->parent)
260
261 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
262 {
263 return p->se.cfs_rq;
264 }
265
266
267 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
268 {
269 return se->cfs_rq;
270 }
271
272
273 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
274 {
275 return grp->my_q;
276 }
277
278 static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
279 {
280 if (!path)
281 return;
282
283 if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
284 autogroup_path(cfs_rq->tg, path, len);
285 else if (cfs_rq && cfs_rq->tg->css.cgroup)
286 cgroup_path(cfs_rq->tg->css.cgroup, path, len);
287 else
288 strlcpy(path, "(null)", len);
289 }
290
291 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
292 {
293 struct rq *rq = rq_of(cfs_rq);
294 int cpu = cpu_of(rq);
295
296 if (cfs_rq->on_list)
297 return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
298
299 cfs_rq->on_list = 1;
300
301
302
303
304
305
306
307
308
309
310 if (cfs_rq->tg->parent &&
311 cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
312
313
314
315
316
317
318 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
319 &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
320
321
322
323
324
325 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
326 return true;
327 }
328
329 if (!cfs_rq->tg->parent) {
330
331
332
333
334 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
335 &rq->leaf_cfs_rq_list);
336
337
338
339
340 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
341 return true;
342 }
343
344
345
346
347
348
349
350 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
351
352
353
354
355 rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
356 return false;
357 }
358
359 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
360 {
361 if (cfs_rq->on_list) {
362 struct rq *rq = rq_of(cfs_rq);
363
364
365
366
367
368
369
370
371 if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
372 rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
373
374 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
375 cfs_rq->on_list = 0;
376 }
377 }
378
379 static inline void assert_list_leaf_cfs_rq(struct rq *rq)
380 {
381 SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
382 }
383
384
385 #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
386 list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
387 leaf_cfs_rq_list)
388
389
390 static inline struct cfs_rq *
391 is_same_group(struct sched_entity *se, struct sched_entity *pse)
392 {
393 if (se->cfs_rq == pse->cfs_rq)
394 return se->cfs_rq;
395
396 return NULL;
397 }
398
399 static inline struct sched_entity *parent_entity(struct sched_entity *se)
400 {
401 return se->parent;
402 }
403
404 static void
405 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
406 {
407 int se_depth, pse_depth;
408
409
410
411
412
413
414
415
416
417 se_depth = (*se)->depth;
418 pse_depth = (*pse)->depth;
419
420 while (se_depth > pse_depth) {
421 se_depth--;
422 *se = parent_entity(*se);
423 }
424
425 while (pse_depth > se_depth) {
426 pse_depth--;
427 *pse = parent_entity(*pse);
428 }
429
430 while (!is_same_group(*se, *pse)) {
431 *se = parent_entity(*se);
432 *pse = parent_entity(*pse);
433 }
434 }
435
436 #else
437
438 static inline struct task_struct *task_of(struct sched_entity *se)
439 {
440 return container_of(se, struct task_struct, se);
441 }
442
443 #define for_each_sched_entity(se) \
444 for (; se; se = NULL)
445
446 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
447 {
448 return &task_rq(p)->cfs;
449 }
450
451 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
452 {
453 struct task_struct *p = task_of(se);
454 struct rq *rq = task_rq(p);
455
456 return &rq->cfs;
457 }
458
459
460 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
461 {
462 return NULL;
463 }
464
465 static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
466 {
467 if (path)
468 strlcpy(path, "(null)", len);
469 }
470
471 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
472 {
473 return true;
474 }
475
476 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
477 {
478 }
479
480 static inline void assert_list_leaf_cfs_rq(struct rq *rq)
481 {
482 }
483
484 #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
485 for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
486
487 static inline struct sched_entity *parent_entity(struct sched_entity *se)
488 {
489 return NULL;
490 }
491
492 static inline void
493 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
494 {
495 }
496
497 #endif
498
499 static __always_inline
500 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
501
502
503
504
505
506 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
507 {
508 s64 delta = (s64)(vruntime - max_vruntime);
509 if (delta > 0)
510 max_vruntime = vruntime;
511
512 return max_vruntime;
513 }
514
515 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
516 {
517 s64 delta = (s64)(vruntime - min_vruntime);
518 if (delta < 0)
519 min_vruntime = vruntime;
520
521 return min_vruntime;
522 }
523
524 static inline int entity_before(struct sched_entity *a,
525 struct sched_entity *b)
526 {
527 return (s64)(a->vruntime - b->vruntime) < 0;
528 }
529
530 static void update_min_vruntime(struct cfs_rq *cfs_rq)
531 {
532 struct sched_entity *curr = cfs_rq->curr;
533 struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
534
535 u64 vruntime = cfs_rq->min_vruntime;
536
537 if (curr) {
538 if (curr->on_rq)
539 vruntime = curr->vruntime;
540 else
541 curr = NULL;
542 }
543
544 if (leftmost) {
545 struct sched_entity *se;
546 se = rb_entry(leftmost, struct sched_entity, run_node);
547
548 if (!curr)
549 vruntime = se->vruntime;
550 else
551 vruntime = min_vruntime(vruntime, se->vruntime);
552 }
553
554
555 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
556 #ifndef CONFIG_64BIT
557 smp_wmb();
558 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
559 #endif
560 }
561
562
563
564
565 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
566 {
567 struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
568 struct rb_node *parent = NULL;
569 struct sched_entity *entry;
570 bool leftmost = true;
571
572
573
574
575 while (*link) {
576 parent = *link;
577 entry = rb_entry(parent, struct sched_entity, run_node);
578
579
580
581
582 if (entity_before(se, entry)) {
583 link = &parent->rb_left;
584 } else {
585 link = &parent->rb_right;
586 leftmost = false;
587 }
588 }
589
590 rb_link_node(&se->run_node, parent, link);
591 rb_insert_color_cached(&se->run_node,
592 &cfs_rq->tasks_timeline, leftmost);
593 }
594
595 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
596 {
597 rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
598 }
599
600 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
601 {
602 struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
603
604 if (!left)
605 return NULL;
606
607 return rb_entry(left, struct sched_entity, run_node);
608 }
609
610 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
611 {
612 struct rb_node *next = rb_next(&se->run_node);
613
614 if (!next)
615 return NULL;
616
617 return rb_entry(next, struct sched_entity, run_node);
618 }
619
620 #ifdef CONFIG_SCHED_DEBUG
621 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
622 {
623 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
624
625 if (!last)
626 return NULL;
627
628 return rb_entry(last, struct sched_entity, run_node);
629 }
630
631
632
633
634
635 int sched_proc_update_handler(struct ctl_table *table, int write,
636 void __user *buffer, size_t *lenp,
637 loff_t *ppos)
638 {
639 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
640 unsigned int factor = get_update_sysctl_factor();
641
642 if (ret || !write)
643 return ret;
644
645 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
646 sysctl_sched_min_granularity);
647
648 #define WRT_SYSCTL(name) \
649 (normalized_sysctl_##name = sysctl_##name / (factor))
650 WRT_SYSCTL(sched_min_granularity);
651 WRT_SYSCTL(sched_latency);
652 WRT_SYSCTL(sched_wakeup_granularity);
653 #undef WRT_SYSCTL
654
655 return 0;
656 }
657 #endif
658
659
660
661
662 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
663 {
664 if (unlikely(se->load.weight != NICE_0_LOAD))
665 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
666
667 return delta;
668 }
669
670
671
672
673
674
675
676
677
678 static u64 __sched_period(unsigned long nr_running)
679 {
680 if (unlikely(nr_running > sched_nr_latency))
681 return nr_running * sysctl_sched_min_granularity;
682 else
683 return sysctl_sched_latency;
684 }
685
686
687
688
689
690
691
692 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
693 {
694 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
695
696 for_each_sched_entity(se) {
697 struct load_weight *load;
698 struct load_weight lw;
699
700 cfs_rq = cfs_rq_of(se);
701 load = &cfs_rq->load;
702
703 if (unlikely(!se->on_rq)) {
704 lw = cfs_rq->load;
705
706 update_load_add(&lw, se->load.weight);
707 load = &lw;
708 }
709 slice = __calc_delta(slice, se->load.weight, load);
710 }
711 return slice;
712 }
713
714
715
716
717
718
719 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
720 {
721 return calc_delta_fair(sched_slice(cfs_rq, se), se);
722 }
723
724 #include "pelt.h"
725 #ifdef CONFIG_SMP
726
727 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
728 static unsigned long task_h_load(struct task_struct *p);
729 static unsigned long capacity_of(int cpu);
730
731
732 void init_entity_runnable_average(struct sched_entity *se)
733 {
734 struct sched_avg *sa = &se->avg;
735
736 memset(sa, 0, sizeof(*sa));
737
738
739
740
741
742
743
744 if (entity_is_task(se))
745 sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
746
747 se->runnable_weight = se->load.weight;
748
749
750 }
751
752 static void attach_entity_cfs_rq(struct sched_entity *se);
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780 void post_init_entity_util_avg(struct task_struct *p)
781 {
782 struct sched_entity *se = &p->se;
783 struct cfs_rq *cfs_rq = cfs_rq_of(se);
784 struct sched_avg *sa = &se->avg;
785 long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
786 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
787
788 if (cap > 0) {
789 if (cfs_rq->avg.util_avg != 0) {
790 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
791 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
792
793 if (sa->util_avg > cap)
794 sa->util_avg = cap;
795 } else {
796 sa->util_avg = cap;
797 }
798 }
799
800 if (p->sched_class != &fair_sched_class) {
801
802
803
804
805
806
807
808
809
810
811 se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
812 return;
813 }
814
815 attach_entity_cfs_rq(se);
816 }
817
818 #else
819 void init_entity_runnable_average(struct sched_entity *se)
820 {
821 }
822 void post_init_entity_util_avg(struct task_struct *p)
823 {
824 }
825 static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
826 {
827 }
828 #endif
829
830
831
832
833 static void update_curr(struct cfs_rq *cfs_rq)
834 {
835 struct sched_entity *curr = cfs_rq->curr;
836 u64 now = rq_clock_task(rq_of(cfs_rq));
837 u64 delta_exec;
838
839 if (unlikely(!curr))
840 return;
841
842 delta_exec = now - curr->exec_start;
843 if (unlikely((s64)delta_exec <= 0))
844 return;
845
846 curr->exec_start = now;
847
848 schedstat_set(curr->statistics.exec_max,
849 max(delta_exec, curr->statistics.exec_max));
850
851 curr->sum_exec_runtime += delta_exec;
852 schedstat_add(cfs_rq->exec_clock, delta_exec);
853
854 curr->vruntime += calc_delta_fair(delta_exec, curr);
855 update_min_vruntime(cfs_rq);
856
857 if (entity_is_task(curr)) {
858 struct task_struct *curtask = task_of(curr);
859
860 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
861 cgroup_account_cputime(curtask, delta_exec);
862 account_group_exec_runtime(curtask, delta_exec);
863 }
864
865 account_cfs_rq_runtime(cfs_rq, delta_exec);
866 }
867
868 static void update_curr_fair(struct rq *rq)
869 {
870 update_curr(cfs_rq_of(&rq->curr->se));
871 }
872
873 static inline void
874 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
875 {
876 u64 wait_start, prev_wait_start;
877
878 if (!schedstat_enabled())
879 return;
880
881 wait_start = rq_clock(rq_of(cfs_rq));
882 prev_wait_start = schedstat_val(se->statistics.wait_start);
883
884 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
885 likely(wait_start > prev_wait_start))
886 wait_start -= prev_wait_start;
887
888 __schedstat_set(se->statistics.wait_start, wait_start);
889 }
890
891 static inline void
892 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
893 {
894 struct task_struct *p;
895 u64 delta;
896
897 if (!schedstat_enabled())
898 return;
899
900 delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
901
902 if (entity_is_task(se)) {
903 p = task_of(se);
904 if (task_on_rq_migrating(p)) {
905
906
907
908
909
910 __schedstat_set(se->statistics.wait_start, delta);
911 return;
912 }
913 trace_sched_stat_wait(p, delta);
914 }
915
916 __schedstat_set(se->statistics.wait_max,
917 max(schedstat_val(se->statistics.wait_max), delta));
918 __schedstat_inc(se->statistics.wait_count);
919 __schedstat_add(se->statistics.wait_sum, delta);
920 __schedstat_set(se->statistics.wait_start, 0);
921 }
922
923 static inline void
924 update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
925 {
926 struct task_struct *tsk = NULL;
927 u64 sleep_start, block_start;
928
929 if (!schedstat_enabled())
930 return;
931
932 sleep_start = schedstat_val(se->statistics.sleep_start);
933 block_start = schedstat_val(se->statistics.block_start);
934
935 if (entity_is_task(se))
936 tsk = task_of(se);
937
938 if (sleep_start) {
939 u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
940
941 if ((s64)delta < 0)
942 delta = 0;
943
944 if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
945 __schedstat_set(se->statistics.sleep_max, delta);
946
947 __schedstat_set(se->statistics.sleep_start, 0);
948 __schedstat_add(se->statistics.sum_sleep_runtime, delta);
949
950 if (tsk) {
951 account_scheduler_latency(tsk, delta >> 10, 1);
952 trace_sched_stat_sleep(tsk, delta);
953 }
954 }
955 if (block_start) {
956 u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
957
958 if ((s64)delta < 0)
959 delta = 0;
960
961 if (unlikely(delta > schedstat_val(se->statistics.block_max)))
962 __schedstat_set(se->statistics.block_max, delta);
963
964 __schedstat_set(se->statistics.block_start, 0);
965 __schedstat_add(se->statistics.sum_sleep_runtime, delta);
966
967 if (tsk) {
968 if (tsk->in_iowait) {
969 __schedstat_add(se->statistics.iowait_sum, delta);
970 __schedstat_inc(se->statistics.iowait_count);
971 trace_sched_stat_iowait(tsk, delta);
972 }
973
974 trace_sched_stat_blocked(tsk, delta);
975
976
977
978
979
980
981 if (unlikely(prof_on == SLEEP_PROFILING)) {
982 profile_hits(SLEEP_PROFILING,
983 (void *)get_wchan(tsk),
984 delta >> 20);
985 }
986 account_scheduler_latency(tsk, delta >> 10, 0);
987 }
988 }
989 }
990
991
992
993
994 static inline void
995 update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
996 {
997 if (!schedstat_enabled())
998 return;
999
1000
1001
1002
1003
1004 if (se != cfs_rq->curr)
1005 update_stats_wait_start(cfs_rq, se);
1006
1007 if (flags & ENQUEUE_WAKEUP)
1008 update_stats_enqueue_sleeper(cfs_rq, se);
1009 }
1010
1011 static inline void
1012 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1013 {
1014
1015 if (!schedstat_enabled())
1016 return;
1017
1018
1019
1020
1021
1022 if (se != cfs_rq->curr)
1023 update_stats_wait_end(cfs_rq, se);
1024
1025 if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
1026 struct task_struct *tsk = task_of(se);
1027
1028 if (tsk->state & TASK_INTERRUPTIBLE)
1029 __schedstat_set(se->statistics.sleep_start,
1030 rq_clock(rq_of(cfs_rq)));
1031 if (tsk->state & TASK_UNINTERRUPTIBLE)
1032 __schedstat_set(se->statistics.block_start,
1033 rq_clock(rq_of(cfs_rq)));
1034 }
1035 }
1036
1037
1038
1039
1040 static inline void
1041 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
1042 {
1043
1044
1045
1046 se->exec_start = rq_clock_task(rq_of(cfs_rq));
1047 }
1048
1049
1050
1051
1052
1053 #ifdef CONFIG_NUMA_BALANCING
1054
1055
1056
1057
1058
1059 unsigned int sysctl_numa_balancing_scan_period_min = 1000;
1060 unsigned int sysctl_numa_balancing_scan_period_max = 60000;
1061
1062
1063 unsigned int sysctl_numa_balancing_scan_size = 256;
1064
1065
1066 unsigned int sysctl_numa_balancing_scan_delay = 1000;
1067
1068 struct numa_group {
1069 refcount_t refcount;
1070
1071 spinlock_t lock;
1072 int nr_tasks;
1073 pid_t gid;
1074 int active_nodes;
1075
1076 struct rcu_head rcu;
1077 unsigned long total_faults;
1078 unsigned long max_faults_cpu;
1079
1080
1081
1082
1083
1084 unsigned long *faults_cpu;
1085 unsigned long faults[0];
1086 };
1087
1088
1089
1090
1091
1092 static struct numa_group *deref_task_numa_group(struct task_struct *p)
1093 {
1094 return rcu_dereference_check(p->numa_group, p == current ||
1095 (lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
1096 }
1097
1098 static struct numa_group *deref_curr_numa_group(struct task_struct *p)
1099 {
1100 return rcu_dereference_protected(p->numa_group, p == current);
1101 }
1102
1103 static inline unsigned long group_faults_priv(struct numa_group *ng);
1104 static inline unsigned long group_faults_shared(struct numa_group *ng);
1105
1106 static unsigned int task_nr_scan_windows(struct task_struct *p)
1107 {
1108 unsigned long rss = 0;
1109 unsigned long nr_scan_pages;
1110
1111
1112
1113
1114
1115
1116 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1117 rss = get_mm_rss(p->mm);
1118 if (!rss)
1119 rss = nr_scan_pages;
1120
1121 rss = round_up(rss, nr_scan_pages);
1122 return rss / nr_scan_pages;
1123 }
1124
1125
1126 #define MAX_SCAN_WINDOW 2560
1127
1128 static unsigned int task_scan_min(struct task_struct *p)
1129 {
1130 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
1131 unsigned int scan, floor;
1132 unsigned int windows = 1;
1133
1134 if (scan_size < MAX_SCAN_WINDOW)
1135 windows = MAX_SCAN_WINDOW / scan_size;
1136 floor = 1000 / windows;
1137
1138 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1139 return max_t(unsigned int, floor, scan);
1140 }
1141
1142 static unsigned int task_scan_start(struct task_struct *p)
1143 {
1144 unsigned long smin = task_scan_min(p);
1145 unsigned long period = smin;
1146 struct numa_group *ng;
1147
1148
1149 rcu_read_lock();
1150 ng = rcu_dereference(p->numa_group);
1151 if (ng) {
1152 unsigned long shared = group_faults_shared(ng);
1153 unsigned long private = group_faults_priv(ng);
1154
1155 period *= refcount_read(&ng->refcount);
1156 period *= shared + 1;
1157 period /= private + shared + 1;
1158 }
1159 rcu_read_unlock();
1160
1161 return max(smin, period);
1162 }
1163
1164 static unsigned int task_scan_max(struct task_struct *p)
1165 {
1166 unsigned long smin = task_scan_min(p);
1167 unsigned long smax;
1168 struct numa_group *ng;
1169
1170
1171 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1172
1173
1174 ng = deref_curr_numa_group(p);
1175 if (ng) {
1176 unsigned long shared = group_faults_shared(ng);
1177 unsigned long private = group_faults_priv(ng);
1178 unsigned long period = smax;
1179
1180 period *= refcount_read(&ng->refcount);
1181 period *= shared + 1;
1182 period /= private + shared + 1;
1183
1184 smax = max(smax, period);
1185 }
1186
1187 return max(smin, smax);
1188 }
1189
1190 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1191 {
1192 rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
1193 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1194 }
1195
1196 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1197 {
1198 rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
1199 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1200 }
1201
1202
1203 #define NR_NUMA_HINT_FAULT_TYPES 2
1204
1205
1206 #define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1207
1208
1209 #define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1210
1211 pid_t task_numa_group_id(struct task_struct *p)
1212 {
1213 struct numa_group *ng;
1214 pid_t gid = 0;
1215
1216 rcu_read_lock();
1217 ng = rcu_dereference(p->numa_group);
1218 if (ng)
1219 gid = ng->gid;
1220 rcu_read_unlock();
1221
1222 return gid;
1223 }
1224
1225
1226
1227
1228
1229
1230
1231 static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1232 {
1233 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1234 }
1235
1236 static inline unsigned long task_faults(struct task_struct *p, int nid)
1237 {
1238 if (!p->numa_faults)
1239 return 0;
1240
1241 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1242 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1243 }
1244
1245 static inline unsigned long group_faults(struct task_struct *p, int nid)
1246 {
1247 struct numa_group *ng = deref_task_numa_group(p);
1248
1249 if (!ng)
1250 return 0;
1251
1252 return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1253 ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1254 }
1255
1256 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1257 {
1258 return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1259 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
1260 }
1261
1262 static inline unsigned long group_faults_priv(struct numa_group *ng)
1263 {
1264 unsigned long faults = 0;
1265 int node;
1266
1267 for_each_online_node(node) {
1268 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
1269 }
1270
1271 return faults;
1272 }
1273
1274 static inline unsigned long group_faults_shared(struct numa_group *ng)
1275 {
1276 unsigned long faults = 0;
1277 int node;
1278
1279 for_each_online_node(node) {
1280 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
1281 }
1282
1283 return faults;
1284 }
1285
1286
1287
1288
1289
1290
1291 #define ACTIVE_NODE_FRACTION 3
1292
1293 static bool numa_is_active_node(int nid, struct numa_group *ng)
1294 {
1295 return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1296 }
1297
1298
1299 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1300 int maxdist, bool task)
1301 {
1302 unsigned long score = 0;
1303 int node;
1304
1305
1306
1307
1308
1309 if (sched_numa_topology_type == NUMA_DIRECT)
1310 return 0;
1311
1312
1313
1314
1315
1316 for_each_online_node(node) {
1317 unsigned long faults;
1318 int dist = node_distance(nid, node);
1319
1320
1321
1322
1323
1324 if (dist == sched_max_numa_distance || node == nid)
1325 continue;
1326
1327
1328
1329
1330
1331
1332
1333
1334 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1335 dist >= maxdist)
1336 continue;
1337
1338
1339 if (task)
1340 faults = task_faults(p, node);
1341 else
1342 faults = group_faults(p, node);
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1353 faults *= (sched_max_numa_distance - dist);
1354 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1355 }
1356
1357 score += faults;
1358 }
1359
1360 return score;
1361 }
1362
1363
1364
1365
1366
1367
1368
1369 static inline unsigned long task_weight(struct task_struct *p, int nid,
1370 int dist)
1371 {
1372 unsigned long faults, total_faults;
1373
1374 if (!p->numa_faults)
1375 return 0;
1376
1377 total_faults = p->total_numa_faults;
1378
1379 if (!total_faults)
1380 return 0;
1381
1382 faults = task_faults(p, nid);
1383 faults += score_nearby_nodes(p, nid, dist, true);
1384
1385 return 1000 * faults / total_faults;
1386 }
1387
1388 static inline unsigned long group_weight(struct task_struct *p, int nid,
1389 int dist)
1390 {
1391 struct numa_group *ng = deref_task_numa_group(p);
1392 unsigned long faults, total_faults;
1393
1394 if (!ng)
1395 return 0;
1396
1397 total_faults = ng->total_faults;
1398
1399 if (!total_faults)
1400 return 0;
1401
1402 faults = group_faults(p, nid);
1403 faults += score_nearby_nodes(p, nid, dist, false);
1404
1405 return 1000 * faults / total_faults;
1406 }
1407
1408 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1409 int src_nid, int dst_cpu)
1410 {
1411 struct numa_group *ng = deref_curr_numa_group(p);
1412 int dst_nid = cpu_to_node(dst_cpu);
1413 int last_cpupid, this_cpupid;
1414
1415 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1416 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1417
1418
1419
1420
1421
1422
1423
1424 if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
1425 (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
1426 return true;
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445 if (!cpupid_pid_unset(last_cpupid) &&
1446 cpupid_to_nid(last_cpupid) != dst_nid)
1447 return false;
1448
1449
1450 if (cpupid_match_pid(p, last_cpupid))
1451 return true;
1452
1453
1454 if (!ng)
1455 return true;
1456
1457
1458
1459
1460
1461 if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1462 ACTIVE_NODE_FRACTION)
1463 return true;
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473 return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1474 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
1475 }
1476
1477 static unsigned long cpu_runnable_load(struct rq *rq);
1478
1479
1480 struct numa_stats {
1481 unsigned long load;
1482
1483
1484 unsigned long compute_capacity;
1485 };
1486
1487
1488
1489
1490 static void update_numa_stats(struct numa_stats *ns, int nid)
1491 {
1492 int cpu;
1493
1494 memset(ns, 0, sizeof(*ns));
1495 for_each_cpu(cpu, cpumask_of_node(nid)) {
1496 struct rq *rq = cpu_rq(cpu);
1497
1498 ns->load += cpu_runnable_load(rq);
1499 ns->compute_capacity += capacity_of(cpu);
1500 }
1501
1502 }
1503
1504 struct task_numa_env {
1505 struct task_struct *p;
1506
1507 int src_cpu, src_nid;
1508 int dst_cpu, dst_nid;
1509
1510 struct numa_stats src_stats, dst_stats;
1511
1512 int imbalance_pct;
1513 int dist;
1514
1515 struct task_struct *best_task;
1516 long best_imp;
1517 int best_cpu;
1518 };
1519
1520 static void task_numa_assign(struct task_numa_env *env,
1521 struct task_struct *p, long imp)
1522 {
1523 struct rq *rq = cpu_rq(env->dst_cpu);
1524
1525
1526 if (xchg(&rq->numa_migrate_on, 1))
1527 return;
1528
1529
1530
1531
1532
1533 if (env->best_cpu != -1) {
1534 rq = cpu_rq(env->best_cpu);
1535 WRITE_ONCE(rq->numa_migrate_on, 0);
1536 }
1537
1538 if (env->best_task)
1539 put_task_struct(env->best_task);
1540 if (p)
1541 get_task_struct(p);
1542
1543 env->best_task = p;
1544 env->best_imp = imp;
1545 env->best_cpu = env->dst_cpu;
1546 }
1547
1548 static bool load_too_imbalanced(long src_load, long dst_load,
1549 struct task_numa_env *env)
1550 {
1551 long imb, old_imb;
1552 long orig_src_load, orig_dst_load;
1553 long src_capacity, dst_capacity;
1554
1555
1556
1557
1558
1559
1560
1561
1562 src_capacity = env->src_stats.compute_capacity;
1563 dst_capacity = env->dst_stats.compute_capacity;
1564
1565 imb = abs(dst_load * src_capacity - src_load * dst_capacity);
1566
1567 orig_src_load = env->src_stats.load;
1568 orig_dst_load = env->dst_stats.load;
1569
1570 old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
1571
1572
1573 return (imb > old_imb);
1574 }
1575
1576
1577
1578
1579
1580
1581 #define SMALLIMP 30
1582
1583
1584
1585
1586
1587
1588
1589 static void task_numa_compare(struct task_numa_env *env,
1590 long taskimp, long groupimp, bool maymove)
1591 {
1592 struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
1593 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1594 long imp = p_ng ? groupimp : taskimp;
1595 struct task_struct *cur;
1596 long src_load, dst_load;
1597 int dist = env->dist;
1598 long moveimp = imp;
1599 long load;
1600
1601 if (READ_ONCE(dst_rq->numa_migrate_on))
1602 return;
1603
1604 rcu_read_lock();
1605 cur = rcu_dereference(dst_rq->curr);
1606 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
1607 cur = NULL;
1608
1609
1610
1611
1612
1613 if (cur == env->p)
1614 goto unlock;
1615
1616 if (!cur) {
1617 if (maymove && moveimp >= env->best_imp)
1618 goto assign;
1619 else
1620 goto unlock;
1621 }
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631 if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1632 goto unlock;
1633
1634
1635
1636
1637
1638 cur_ng = rcu_dereference(cur->numa_group);
1639 if (cur_ng == p_ng) {
1640 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1641 task_weight(cur, env->dst_nid, dist);
1642
1643
1644
1645
1646 if (cur_ng)
1647 imp -= imp / 16;
1648 } else {
1649
1650
1651
1652
1653 if (cur_ng && p_ng)
1654 imp += group_weight(cur, env->src_nid, dist) -
1655 group_weight(cur, env->dst_nid, dist);
1656 else
1657 imp += task_weight(cur, env->src_nid, dist) -
1658 task_weight(cur, env->dst_nid, dist);
1659 }
1660
1661 if (maymove && moveimp > imp && moveimp > env->best_imp) {
1662 imp = moveimp;
1663 cur = NULL;
1664 goto assign;
1665 }
1666
1667
1668
1669
1670
1671
1672
1673 if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
1674 goto unlock;
1675
1676
1677
1678
1679 load = task_h_load(env->p) - task_h_load(cur);
1680 if (!load)
1681 goto assign;
1682
1683 dst_load = env->dst_stats.load + load;
1684 src_load = env->src_stats.load - load;
1685
1686 if (load_too_imbalanced(src_load, dst_load, env))
1687 goto unlock;
1688
1689 assign:
1690
1691
1692
1693
1694 if (!cur) {
1695
1696
1697
1698
1699 local_irq_disable();
1700 env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1701 env->dst_cpu);
1702 local_irq_enable();
1703 }
1704
1705 task_numa_assign(env, cur, imp);
1706 unlock:
1707 rcu_read_unlock();
1708 }
1709
1710 static void task_numa_find_cpu(struct task_numa_env *env,
1711 long taskimp, long groupimp)
1712 {
1713 long src_load, dst_load, load;
1714 bool maymove = false;
1715 int cpu;
1716
1717 load = task_h_load(env->p);
1718 dst_load = env->dst_stats.load + load;
1719 src_load = env->src_stats.load - load;
1720
1721
1722
1723
1724
1725 maymove = !load_too_imbalanced(src_load, dst_load, env);
1726
1727 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1728
1729 if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
1730 continue;
1731
1732 env->dst_cpu = cpu;
1733 task_numa_compare(env, taskimp, groupimp, maymove);
1734 }
1735 }
1736
1737 static int task_numa_migrate(struct task_struct *p)
1738 {
1739 struct task_numa_env env = {
1740 .p = p,
1741
1742 .src_cpu = task_cpu(p),
1743 .src_nid = task_node(p),
1744
1745 .imbalance_pct = 112,
1746
1747 .best_task = NULL,
1748 .best_imp = 0,
1749 .best_cpu = -1,
1750 };
1751 unsigned long taskweight, groupweight;
1752 struct sched_domain *sd;
1753 long taskimp, groupimp;
1754 struct numa_group *ng;
1755 struct rq *best_rq;
1756 int nid, ret, dist;
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766 rcu_read_lock();
1767 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1768 if (sd)
1769 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1770 rcu_read_unlock();
1771
1772
1773
1774
1775
1776
1777
1778 if (unlikely(!sd)) {
1779 sched_setnuma(p, task_node(p));
1780 return -EINVAL;
1781 }
1782
1783 env.dst_nid = p->numa_preferred_nid;
1784 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1785 taskweight = task_weight(p, env.src_nid, dist);
1786 groupweight = group_weight(p, env.src_nid, dist);
1787 update_numa_stats(&env.src_stats, env.src_nid);
1788 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1789 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1790 update_numa_stats(&env.dst_stats, env.dst_nid);
1791
1792
1793 task_numa_find_cpu(&env, taskimp, groupimp);
1794
1795
1796
1797
1798
1799
1800
1801
1802 ng = deref_curr_numa_group(p);
1803 if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
1804 for_each_online_node(nid) {
1805 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1806 continue;
1807
1808 dist = node_distance(env.src_nid, env.dst_nid);
1809 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1810 dist != env.dist) {
1811 taskweight = task_weight(p, env.src_nid, dist);
1812 groupweight = group_weight(p, env.src_nid, dist);
1813 }
1814
1815
1816 taskimp = task_weight(p, nid, dist) - taskweight;
1817 groupimp = group_weight(p, nid, dist) - groupweight;
1818 if (taskimp < 0 && groupimp < 0)
1819 continue;
1820
1821 env.dist = dist;
1822 env.dst_nid = nid;
1823 update_numa_stats(&env.dst_stats, env.dst_nid);
1824 task_numa_find_cpu(&env, taskimp, groupimp);
1825 }
1826 }
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836 if (ng) {
1837 if (env.best_cpu == -1)
1838 nid = env.src_nid;
1839 else
1840 nid = cpu_to_node(env.best_cpu);
1841
1842 if (nid != p->numa_preferred_nid)
1843 sched_setnuma(p, nid);
1844 }
1845
1846
1847 if (env.best_cpu == -1)
1848 return -EAGAIN;
1849
1850 best_rq = cpu_rq(env.best_cpu);
1851 if (env.best_task == NULL) {
1852 ret = migrate_task_to(p, env.best_cpu);
1853 WRITE_ONCE(best_rq->numa_migrate_on, 0);
1854 if (ret != 0)
1855 trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1856 return ret;
1857 }
1858
1859 ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
1860 WRITE_ONCE(best_rq->numa_migrate_on, 0);
1861
1862 if (ret != 0)
1863 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1864 put_task_struct(env.best_task);
1865 return ret;
1866 }
1867
1868
1869 static void numa_migrate_preferred(struct task_struct *p)
1870 {
1871 unsigned long interval = HZ;
1872
1873
1874 if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
1875 return;
1876
1877
1878 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1879 p->numa_migrate_retry = jiffies + interval;
1880
1881
1882 if (task_node(p) == p->numa_preferred_nid)
1883 return;
1884
1885
1886 task_numa_migrate(p);
1887 }
1888
1889
1890
1891
1892
1893
1894
1895 static void numa_group_count_active_nodes(struct numa_group *numa_group)
1896 {
1897 unsigned long faults, max_faults = 0;
1898 int nid, active_nodes = 0;
1899
1900 for_each_online_node(nid) {
1901 faults = group_faults_cpu(numa_group, nid);
1902 if (faults > max_faults)
1903 max_faults = faults;
1904 }
1905
1906 for_each_online_node(nid) {
1907 faults = group_faults_cpu(numa_group, nid);
1908 if (faults * ACTIVE_NODE_FRACTION > max_faults)
1909 active_nodes++;
1910 }
1911
1912 numa_group->max_faults_cpu = max_faults;
1913 numa_group->active_nodes = active_nodes;
1914 }
1915
1916
1917
1918
1919
1920
1921
1922
1923 #define NUMA_PERIOD_SLOTS 10
1924 #define NUMA_PERIOD_THRESHOLD 7
1925
1926
1927
1928
1929
1930
1931
1932 static void update_task_scan_period(struct task_struct *p,
1933 unsigned long shared, unsigned long private)
1934 {
1935 unsigned int period_slot;
1936 int lr_ratio, ps_ratio;
1937 int diff;
1938
1939 unsigned long remote = p->numa_faults_locality[0];
1940 unsigned long local = p->numa_faults_locality[1];
1941
1942
1943
1944
1945
1946
1947
1948
1949 if (local + shared == 0 || p->numa_faults_locality[2]) {
1950 p->numa_scan_period = min(p->numa_scan_period_max,
1951 p->numa_scan_period << 1);
1952
1953 p->mm->numa_next_scan = jiffies +
1954 msecs_to_jiffies(p->numa_scan_period);
1955
1956 return;
1957 }
1958
1959
1960
1961
1962
1963
1964
1965 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1966 lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1967 ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
1968
1969 if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
1970
1971
1972
1973
1974 int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
1975 if (!slot)
1976 slot = 1;
1977 diff = slot * period_slot;
1978 } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
1979
1980
1981
1982
1983
1984 int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
1985 if (!slot)
1986 slot = 1;
1987 diff = slot * period_slot;
1988 } else {
1989
1990
1991
1992
1993
1994 int ratio = max(lr_ratio, ps_ratio);
1995 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1996 }
1997
1998 p->numa_scan_period = clamp(p->numa_scan_period + diff,
1999 task_scan_min(p), task_scan_max(p));
2000 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2001 }
2002
2003
2004
2005
2006
2007
2008
2009
2010 static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2011 {
2012 u64 runtime, delta, now;
2013
2014 now = p->se.exec_start;
2015 runtime = p->se.sum_exec_runtime;
2016
2017 if (p->last_task_numa_placement) {
2018 delta = runtime - p->last_sum_exec_runtime;
2019 *period = now - p->last_task_numa_placement;
2020
2021
2022 if (unlikely((s64)*period < 0))
2023 *period = 0;
2024 } else {
2025 delta = p->se.avg.load_sum;
2026 *period = LOAD_AVG_MAX;
2027 }
2028
2029 p->last_sum_exec_runtime = runtime;
2030 p->last_task_numa_placement = now;
2031
2032 return delta;
2033 }
2034
2035
2036
2037
2038
2039
2040 static int preferred_group_nid(struct task_struct *p, int nid)
2041 {
2042 nodemask_t nodes;
2043 int dist;
2044
2045
2046 if (sched_numa_topology_type == NUMA_DIRECT)
2047 return nid;
2048
2049
2050
2051
2052
2053
2054 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
2055 unsigned long score, max_score = 0;
2056 int node, max_node = nid;
2057
2058 dist = sched_max_numa_distance;
2059
2060 for_each_online_node(node) {
2061 score = group_weight(p, node, dist);
2062 if (score > max_score) {
2063 max_score = score;
2064 max_node = node;
2065 }
2066 }
2067 return max_node;
2068 }
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079 nodes = node_online_map;
2080 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
2081 unsigned long max_faults = 0;
2082 nodemask_t max_group = NODE_MASK_NONE;
2083 int a, b;
2084
2085
2086 if (!find_numa_distance(dist))
2087 continue;
2088
2089 for_each_node_mask(a, nodes) {
2090 unsigned long faults = 0;
2091 nodemask_t this_group;
2092 nodes_clear(this_group);
2093
2094
2095 for_each_node_mask(b, nodes) {
2096 if (node_distance(a, b) < dist) {
2097 faults += group_faults(p, b);
2098 node_set(b, this_group);
2099 node_clear(b, nodes);
2100 }
2101 }
2102
2103
2104 if (faults > max_faults) {
2105 max_faults = faults;
2106 max_group = this_group;
2107
2108
2109
2110
2111
2112 nid = a;
2113 }
2114 }
2115
2116 if (!max_faults)
2117 break;
2118 nodes = max_group;
2119 }
2120 return nid;
2121 }
2122
2123 static void task_numa_placement(struct task_struct *p)
2124 {
2125 int seq, nid, max_nid = NUMA_NO_NODE;
2126 unsigned long max_faults = 0;
2127 unsigned long fault_types[2] = { 0, 0 };
2128 unsigned long total_faults;
2129 u64 runtime, period;
2130 spinlock_t *group_lock = NULL;
2131 struct numa_group *ng;
2132
2133
2134
2135
2136
2137
2138 seq = READ_ONCE(p->mm->numa_scan_seq);
2139 if (p->numa_scan_seq == seq)
2140 return;
2141 p->numa_scan_seq = seq;
2142 p->numa_scan_period_max = task_scan_max(p);
2143
2144 total_faults = p->numa_faults_locality[0] +
2145 p->numa_faults_locality[1];
2146 runtime = numa_get_avg_runtime(p, &period);
2147
2148
2149 ng = deref_curr_numa_group(p);
2150 if (ng) {
2151 group_lock = &ng->lock;
2152 spin_lock_irq(group_lock);
2153 }
2154
2155
2156 for_each_online_node(nid) {
2157
2158 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2159 unsigned long faults = 0, group_faults = 0;
2160 int priv;
2161
2162 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2163 long diff, f_diff, f_weight;
2164
2165 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2166 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2167 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2168 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2169
2170
2171 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2172 fault_types[priv] += p->numa_faults[membuf_idx];
2173 p->numa_faults[membuf_idx] = 0;
2174
2175
2176
2177
2178
2179
2180
2181
2182 f_weight = div64_u64(runtime << 16, period + 1);
2183 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2184 (total_faults + 1);
2185 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2186 p->numa_faults[cpubuf_idx] = 0;
2187
2188 p->numa_faults[mem_idx] += diff;
2189 p->numa_faults[cpu_idx] += f_diff;
2190 faults += p->numa_faults[mem_idx];
2191 p->total_numa_faults += diff;
2192 if (ng) {
2193
2194
2195
2196
2197
2198
2199
2200 ng->faults[mem_idx] += diff;
2201 ng->faults_cpu[mem_idx] += f_diff;
2202 ng->total_faults += diff;
2203 group_faults += ng->faults[mem_idx];
2204 }
2205 }
2206
2207 if (!ng) {
2208 if (faults > max_faults) {
2209 max_faults = faults;
2210 max_nid = nid;
2211 }
2212 } else if (group_faults > max_faults) {
2213 max_faults = group_faults;
2214 max_nid = nid;
2215 }
2216 }
2217
2218 if (ng) {
2219 numa_group_count_active_nodes(ng);
2220 spin_unlock_irq(group_lock);
2221 max_nid = preferred_group_nid(p, max_nid);
2222 }
2223
2224 if (max_faults) {
2225
2226 if (max_nid != p->numa_preferred_nid)
2227 sched_setnuma(p, max_nid);
2228 }
2229
2230 update_task_scan_period(p, fault_types[0], fault_types[1]);
2231 }
2232
2233 static inline int get_numa_group(struct numa_group *grp)
2234 {
2235 return refcount_inc_not_zero(&grp->refcount);
2236 }
2237
2238 static inline void put_numa_group(struct numa_group *grp)
2239 {
2240 if (refcount_dec_and_test(&grp->refcount))
2241 kfree_rcu(grp, rcu);
2242 }
2243
2244 static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2245 int *priv)
2246 {
2247 struct numa_group *grp, *my_grp;
2248 struct task_struct *tsk;
2249 bool join = false;
2250 int cpu = cpupid_to_cpu(cpupid);
2251 int i;
2252
2253 if (unlikely(!deref_curr_numa_group(p))) {
2254 unsigned int size = sizeof(struct numa_group) +
2255 4*nr_node_ids*sizeof(unsigned long);
2256
2257 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2258 if (!grp)
2259 return;
2260
2261 refcount_set(&grp->refcount, 1);
2262 grp->active_nodes = 1;
2263 grp->max_faults_cpu = 0;
2264 spin_lock_init(&grp->lock);
2265 grp->gid = p->pid;
2266
2267 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2268 nr_node_ids;
2269
2270 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2271 grp->faults[i] = p->numa_faults[i];
2272
2273 grp->total_faults = p->total_numa_faults;
2274
2275 grp->nr_tasks++;
2276 rcu_assign_pointer(p->numa_group, grp);
2277 }
2278
2279 rcu_read_lock();
2280 tsk = READ_ONCE(cpu_rq(cpu)->curr);
2281
2282 if (!cpupid_match_pid(tsk, cpupid))
2283 goto no_join;
2284
2285 grp = rcu_dereference(tsk->numa_group);
2286 if (!grp)
2287 goto no_join;
2288
2289 my_grp = deref_curr_numa_group(p);
2290 if (grp == my_grp)
2291 goto no_join;
2292
2293
2294
2295
2296
2297 if (my_grp->nr_tasks > grp->nr_tasks)
2298 goto no_join;
2299
2300
2301
2302
2303 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2304 goto no_join;
2305
2306
2307 if (tsk->mm == current->mm)
2308 join = true;
2309
2310
2311 if (flags & TNF_SHARED)
2312 join = true;
2313
2314
2315 *priv = !join;
2316
2317 if (join && !get_numa_group(grp))
2318 goto no_join;
2319
2320 rcu_read_unlock();
2321
2322 if (!join)
2323 return;
2324
2325 BUG_ON(irqs_disabled());
2326 double_lock_irq(&my_grp->lock, &grp->lock);
2327
2328 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2329 my_grp->faults[i] -= p->numa_faults[i];
2330 grp->faults[i] += p->numa_faults[i];
2331 }
2332 my_grp->total_faults -= p->total_numa_faults;
2333 grp->total_faults += p->total_numa_faults;
2334
2335 my_grp->nr_tasks--;
2336 grp->nr_tasks++;
2337
2338 spin_unlock(&my_grp->lock);
2339 spin_unlock_irq(&grp->lock);
2340
2341 rcu_assign_pointer(p->numa_group, grp);
2342
2343 put_numa_group(my_grp);
2344 return;
2345
2346 no_join:
2347 rcu_read_unlock();
2348 return;
2349 }
2350
2351
2352
2353
2354
2355
2356
2357
2358 void task_numa_free(struct task_struct *p, bool final)
2359 {
2360
2361 struct numa_group *grp = rcu_dereference_raw(p->numa_group);
2362 unsigned long *numa_faults = p->numa_faults;
2363 unsigned long flags;
2364 int i;
2365
2366 if (!numa_faults)
2367 return;
2368
2369 if (grp) {
2370 spin_lock_irqsave(&grp->lock, flags);
2371 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2372 grp->faults[i] -= p->numa_faults[i];
2373 grp->total_faults -= p->total_numa_faults;
2374
2375 grp->nr_tasks--;
2376 spin_unlock_irqrestore(&grp->lock, flags);
2377 RCU_INIT_POINTER(p->numa_group, NULL);
2378 put_numa_group(grp);
2379 }
2380
2381 if (final) {
2382 p->numa_faults = NULL;
2383 kfree(numa_faults);
2384 } else {
2385 p->total_numa_faults = 0;
2386 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2387 numa_faults[i] = 0;
2388 }
2389 }
2390
2391
2392
2393
2394 void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2395 {
2396 struct task_struct *p = current;
2397 bool migrated = flags & TNF_MIGRATED;
2398 int cpu_node = task_node(current);
2399 int local = !!(flags & TNF_FAULT_LOCAL);
2400 struct numa_group *ng;
2401 int priv;
2402
2403 if (!static_branch_likely(&sched_numa_balancing))
2404 return;
2405
2406
2407 if (!p->mm)
2408 return;
2409
2410
2411 if (unlikely(!p->numa_faults)) {
2412 int size = sizeof(*p->numa_faults) *
2413 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2414
2415 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2416 if (!p->numa_faults)
2417 return;
2418
2419 p->total_numa_faults = 0;
2420 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2421 }
2422
2423 /*
2424  * First faults on a page (no last_cpupid recorded yet) are treated as
2425  * private; otherwise the access is private only if the accessing pid
2426  * has not changed.
 */
2427 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2428 priv = 1;
2429 } else {
2430 priv = cpupid_match_pid(p, last_cpupid);
2431 if (!priv && !(flags & TNF_NO_GROUP))
2432 task_numa_group(p, last_cpupid, flags, &priv);
2433 }
2434
2435 /*
2436  * If a workload spans multiple NUMA nodes, a shared fault that
2437  * occurs wholly within the set of nodes the workload is actively
2438  * using should be counted as local.  This allows the scan rate to
2439  * slow down once a workload has settled.
2440  */
2441 ng = deref_curr_numa_group(p);
2442 if (!priv && !local && ng && ng->active_nodes > 1 &&
2443 numa_is_active_node(cpu_node, ng) &&
2444 numa_is_active_node(mem_node, ng))
2445 local = 1;
2446
2447 /*
2448  * Periodically retry migrating the task to its preferred node, in
2449  * case an earlier attempt failed or the scheduler moved it elsewhere.
2450  */
2451 if (time_after(jiffies, p->numa_migrate_retry)) {
2452 task_numa_placement(p);
2453 numa_migrate_preferred(p);
2454 }
2455
2456 if (migrated)
2457 p->numa_pages_migrated += pages;
2458 if (flags & TNF_MIGRATE_FAIL)
2459 p->numa_faults_locality[2] += pages;
2460
2461 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2462 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2463 p->numa_faults_locality[local] += pages;
2464 }
2465
2466 static void reset_ptenuma_scan(struct task_struct *p)
2467 {
2468
2469
2470
2471
2472
2473
2474
2475
2476 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2477 p->mm->numa_scan_offset = 0;
2478 }
2479
2480
2481
2482
2483
2484 static void task_numa_work(struct callback_head *work)
2485 {
2486 unsigned long migrate, next_scan, now = jiffies;
2487 struct task_struct *p = current;
2488 struct mm_struct *mm = p->mm;
2489 u64 runtime = p->se.sum_exec_runtime;
2490 struct vm_area_struct *vma;
2491 unsigned long start, end;
2492 unsigned long nr_pte_updates = 0;
2493 long pages, virtpages;
2494
2495 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
2496
2497 work->next = work;
2498
2499
2500
2501
2502
2503
2504
2505
2506 if (p->flags & PF_EXITING)
2507 return;
2508
2509 if (!mm->numa_next_scan) {
2510 mm->numa_next_scan = now +
2511 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2512 }
2513
2514
2515
2516
2517 migrate = mm->numa_next_scan;
2518 if (time_before(now, migrate))
2519 return;
2520
2521 if (p->numa_scan_period == 0) {
2522 p->numa_scan_period_max = task_scan_max(p);
2523 p->numa_scan_period = task_scan_start(p);
2524 }
2525
2526 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2527 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2528 return;
2529
2530
2531
2532
2533
2534 p->node_stamp += 2 * TICK_NSEC;
2535
2536 start = mm->numa_scan_offset;
2537 pages = sysctl_numa_balancing_scan_size;
2538 pages <<= 20 - PAGE_SHIFT;
2539 virtpages = pages * 8;
2540 if (!pages)
2541 return;
2542
2543
2544 if (!down_read_trylock(&mm->mmap_sem))
2545 return;
2546 vma = find_vma(mm, start);
2547 if (!vma) {
2548 reset_ptenuma_scan(p);
2549 start = 0;
2550 vma = mm->mmap;
2551 }
2552 for (; vma; vma = vma->vm_next) {
2553 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2554 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2555 continue;
2556 }
2557
2558 /*
2559  * Shared library pages mapped by many processes are not migrated,
2560  * as they are expected to be cache-replicated.  Hinting faults in
2561  * read-only file-backed mappings (or the vdso) are of marginal
2562  * benefit, so skip those VMAs as well.
2563  */
2564 if (!vma->vm_mm ||
2565 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2566 continue;
2567
2568 /*
2569  * Skip inaccessible VMAs to avoid confusing PROT_NONE protections
2570  * with NUMA hinting faults.
2571  */
2572 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2573 continue;
2574
2575 do {
2576 start = max(start, vma->vm_start);
2577 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2578 end = min(end, vma->vm_end);
2579 nr_pte_updates = change_prot_numa(vma, start, end);
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589 if (nr_pte_updates)
2590 pages -= (end - start) >> PAGE_SHIFT;
2591 virtpages -= (end - start) >> PAGE_SHIFT;
2592
2593 start = end;
2594 if (pages <= 0 || virtpages <= 0)
2595 goto out;
2596
2597 cond_resched();
2598 } while (end != vma->vm_end);
2599 }
2600
2601 out:
2602
2603
2604
2605
2606
2607
2608 if (vma)
2609 mm->numa_scan_offset = start;
2610 else
2611 reset_ptenuma_scan(p);
2612 up_read(&mm->mmap_sem);
2613
2614
2615
2616
2617
2618
2619
2620 if (unlikely(p->se.sum_exec_runtime != runtime)) {
2621 u64 diff = p->se.sum_exec_runtime - runtime;
2622 p->node_stamp += 32 * diff;
2623 }
2624 }
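/*
 * Illustrative, standalone sketch (not part of this source file): how the
 * scan budget used above is derived.  scan_size_mb stands in for
 * sysctl_numa_balancing_scan_size (given in MB) and a 4 KiB page size
 * (PAGE_SHIFT == 12) is assumed for the example; up to 8x that much virtual
 * address space may be skipped over so a sparse address space does not
 * stall the scanner.
 */
#include <stdio.h>

#define EXAMPLE_PAGE_SHIFT 12			/* 4 KiB pages assumed */

int main(void)
{
	long scan_size_mb = 256;				/* sysctl value, in MB  */
	long pages = scan_size_mb << (20 - EXAMPLE_PAGE_SHIFT);	/* MB -> pages          */
	long virtpages = pages * 8;				/* virtual-scan ceiling */

	printf("PTEs to update: up to %ld pages, scanning at most %ld virtual pages\n",
	       pages, virtpages);
	return 0;
}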
2625
2626 void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
2627 {
2628 int mm_users = 0;
2629 struct mm_struct *mm = p->mm;
2630
2631 if (mm) {
2632 mm_users = atomic_read(&mm->mm_users);
2633 if (mm_users == 1) {
2634 mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2635 mm->numa_scan_seq = 0;
2636 }
2637 }
2638 p->node_stamp = 0;
2639 p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
2640 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
2641
2642 p->numa_work.next = &p->numa_work;
2643 p->numa_faults = NULL;
2644 RCU_INIT_POINTER(p->numa_group, NULL);
2645 p->last_task_numa_placement = 0;
2646 p->last_sum_exec_runtime = 0;
2647
2648 init_task_work(&p->numa_work, task_numa_work);
2649
2650
2651 if (!(clone_flags & CLONE_VM)) {
2652 p->numa_preferred_nid = NUMA_NO_NODE;
2653 return;
2654 }
2655
2656
2657
2658
2659
2660 if (mm) {
2661 unsigned int delay;
2662
2663 delay = min_t(unsigned int, task_scan_max(current),
2664 current->numa_scan_period * mm_users * NSEC_PER_MSEC);
2665 delay += 2 * TICK_NSEC;
2666 p->node_stamp = delay;
2667 }
2668 }
2669
2670
2671
2672
2673 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2674 {
2675 struct callback_head *work = &curr->numa_work;
2676 u64 period, now;
2677
2678
2679
2680
2681 if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
2682 return;
2683
2684
2685
2686
2687
2688
2689
2690 now = curr->se.sum_exec_runtime;
2691 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2692
2693 if (now > curr->node_stamp + period) {
2694 if (!curr->node_stamp)
2695 curr->numa_scan_period = task_scan_start(curr);
2696 curr->node_stamp += period;
2697
2698 if (!time_before(jiffies, curr->mm->numa_next_scan))
2699 task_work_add(curr, work, true);
2700 }
2701 }
2702
2703 static void update_scan_period(struct task_struct *p, int new_cpu)
2704 {
2705 int src_nid = cpu_to_node(task_cpu(p));
2706 int dst_nid = cpu_to_node(new_cpu);
2707
2708 if (!static_branch_likely(&sched_numa_balancing))
2709 return;
2710
2711 if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
2712 return;
2713
2714 if (src_nid == dst_nid)
2715 return;
2716
2717
2718
2719
2720
2721
2722 if (p->numa_scan_seq) {
2723
2724
2725
2726
2727
2728 if (dst_nid == p->numa_preferred_nid ||
2729 (p->numa_preferred_nid != NUMA_NO_NODE &&
2730 src_nid != p->numa_preferred_nid))
2731 return;
2732 }
2733
2734 p->numa_scan_period = task_scan_start(p);
2735 }
2736
2737 #else
2738 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2739 {
2740 }
2741
2742 static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2743 {
2744 }
2745
2746 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2747 {
2748 }
2749
2750 static inline void update_scan_period(struct task_struct *p, int new_cpu)
2751 {
2752 }
2753
2754 #endif
2755
2756 static void
2757 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2758 {
2759 update_load_add(&cfs_rq->load, se->load.weight);
2760 #ifdef CONFIG_SMP
2761 if (entity_is_task(se)) {
2762 struct rq *rq = rq_of(cfs_rq);
2763
2764 account_numa_enqueue(rq, task_of(se));
2765 list_add(&se->group_node, &rq->cfs_tasks);
2766 }
2767 #endif
2768 cfs_rq->nr_running++;
2769 }
2770
2771 static void
2772 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2773 {
2774 update_load_sub(&cfs_rq->load, se->load.weight);
2775 #ifdef CONFIG_SMP
2776 if (entity_is_task(se)) {
2777 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2778 list_del_init(&se->group_node);
2779 }
2780 #endif
2781 cfs_rq->nr_running--;
2782 }
2783
2784
2785
2786
2787
2788
2789
2790
2791 #define add_positive(_ptr, _val) do { \
2792 typeof(_ptr) ptr = (_ptr); \
2793 typeof(_val) val = (_val); \
2794 typeof(*ptr) res, var = READ_ONCE(*ptr); \
2795 \
2796 res = var + val; \
2797 \
2798 if (val < 0 && res > var) \
2799 res = 0; \
2800 \
2801 WRITE_ONCE(*ptr, res); \
2802 } while (0)
2803
2804
2805
2806
2807
2808
2809
2810
2811 #define sub_positive(_ptr, _val) do { \
2812 typeof(_ptr) ptr = (_ptr); \
2813 typeof(*ptr) val = (_val); \
2814 typeof(*ptr) res, var = READ_ONCE(*ptr); \
2815 res = var - val; \
2816 if (res > var) \
2817 res = 0; \
2818 WRITE_ONCE(*ptr, res); \
2819 } while (0)
2820
2821
2822
2823
2824
2825
2826
2827 #define lsub_positive(_ptr, _val) do { \
2828 typeof(_ptr) ptr = (_ptr); \
2829 *ptr -= min_t(typeof(*ptr), *ptr, _val); \
2830 } while (0)
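/*
 * Illustrative, standalone sketch (not part of this source file): the
 * saturation semantics shared by sub_positive() and lsub_positive() above.
 * An unsigned accumulator is clamped at zero instead of being allowed to
 * wrap around; the kernel macros additionally go through READ_ONCE() and
 * WRITE_ONCE() so concurrent readers never see a transiently wrapped value.
 */
#include <assert.h>

static unsigned long sub_clamped(unsigned long var, unsigned long val)
{
	unsigned long res = var - val;

	return res > var ? 0 : res;	/* wrap-around means the result went "negative" */
}

int main(void)
{
	assert(sub_clamped(100, 30) == 70);
	assert(sub_clamped(100, 130) == 0);	/* clamped rather than wrapped */
	return 0;
}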
2831
2832 #ifdef CONFIG_SMP
2833 static inline void
2834 enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2835 {
2836 cfs_rq->runnable_weight += se->runnable_weight;
2837
2838 cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
2839 cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
2840 }
2841
2842 static inline void
2843 dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2844 {
2845 cfs_rq->runnable_weight -= se->runnable_weight;
2846
2847 sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
2848 sub_positive(&cfs_rq->avg.runnable_load_sum,
2849 se_runnable(se) * se->avg.runnable_load_sum);
2850 }
2851
2852 static inline void
2853 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2854 {
2855 cfs_rq->avg.load_avg += se->avg.load_avg;
2856 cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
2857 }
2858
2859 static inline void
2860 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2861 {
2862 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
2863 sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
2864 }
2865 #else
2866 static inline void
2867 enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2868 static inline void
2869 dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2870 static inline void
2871 enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2872 static inline void
2873 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2874 #endif
2875
2876 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2877 unsigned long weight, unsigned long runnable)
2878 {
2879 if (se->on_rq) {
2880
2881 if (cfs_rq->curr == se)
2882 update_curr(cfs_rq);
2883 account_entity_dequeue(cfs_rq, se);
2884 dequeue_runnable_load_avg(cfs_rq, se);
2885 }
2886 dequeue_load_avg(cfs_rq, se);
2887
2888 se->runnable_weight = runnable;
2889 update_load_set(&se->load, weight);
2890
2891 #ifdef CONFIG_SMP
2892 do {
2893 u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
2894
2895 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
2896 se->avg.runnable_load_avg =
2897 div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
2898 } while (0);
2899 #endif
2900
2901 enqueue_load_avg(cfs_rq, se);
2902 if (se->on_rq) {
2903 account_entity_enqueue(cfs_rq, se);
2904 enqueue_runnable_load_avg(cfs_rq, se);
2905 }
2906 }
2907
2908 void reweight_task(struct task_struct *p, int prio)
2909 {
2910 struct sched_entity *se = &p->se;
2911 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2912 struct load_weight *load = &se->load;
2913 unsigned long weight = scale_load(sched_prio_to_weight[prio]);
2914
2915 reweight_entity(cfs_rq, se, weight, weight);
2916 load->inv_weight = sched_prio_to_wmult[prio];
2917 }
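/*
 * Illustrative, standalone sketch (not part of this source file): the
 * rescaling done inside reweight_entity() above.  The PELT load_sum tracks
 * time and is independent of the weight, so a priority change takes effect
 * immediately by recomputing load_avg = weight * load_sum / divider.  The
 * value 47742 is LOAD_AVG_MAX, the asymptotic maximum of the PELT geometric
 * series; treat the exact constants here as assumptions of the example.
 */
#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_LOAD_AVG_MAX	47742u

static uint64_t rescale_load_avg(uint64_t weight, uint64_t load_sum,
				 uint32_t period_contrib)
{
	/* Same divider as above: the maximum sum attainable given the
	 * not-yet-complete current PELT period. */
	uint32_t divider = EXAMPLE_LOAD_AVG_MAX - 1024 + period_contrib;

	return weight * load_sum / divider;
}

int main(void)
{
	/* A fully-runnable entity (saturated load_sum) ends up with a
	 * load_avg equal to its new weight. */
	uint32_t contrib = 512;
	uint64_t saturated = EXAMPLE_LOAD_AVG_MAX - 1024 + contrib;

	printf("%llu\n", (unsigned long long)rescale_load_avg(1024, saturated, contrib));
	return 0;
}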
2918
2919 #ifdef CONFIG_FAIR_GROUP_SCHED
2920 #ifdef CONFIG_SMP
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994 static long calc_group_shares(struct cfs_rq *cfs_rq)
2995 {
2996 long tg_weight, tg_shares, load, shares;
2997 struct task_group *tg = cfs_rq->tg;
2998
2999 tg_shares = READ_ONCE(tg->shares);
3000
3001 load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
3002
3003 tg_weight = atomic_long_read(&tg->load_avg);
3004
3005
3006 tg_weight -= cfs_rq->tg_load_avg_contrib;
3007 tg_weight += load;
3008
3009 shares = (tg_shares * load);
3010 if (tg_weight)
3011 shares /= tg_weight;
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025 return clamp_t(long, shares, MIN_SHARES, tg_shares);
3026 }
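/*
 * Illustrative, standalone sketch (not part of this source file): the
 * approximation computed by calc_group_shares() above,
 *
 *	shares ~= tg_shares * grq_load / (tg_load_avg - grq_contrib + grq_load)
 *
 * clamped to [MIN_SHARES, tg_shares].  MIN_SHARES is taken as 2 here, as an
 * assumption of the example.
 */
#include <stdio.h>

#define EXAMPLE_MIN_SHARES 2

static long group_shares(long tg_shares, long tg_load_avg,
			 long grq_contrib, long grq_load)
{
	/* Replace this runqueue's stale contribution with its current load. */
	long tg_weight = tg_load_avg - grq_contrib + grq_load;
	long shares = tg_shares * grq_load;

	if (tg_weight)
		shares /= tg_weight;

	if (shares < EXAMPLE_MIN_SHARES)
		shares = EXAMPLE_MIN_SHARES;
	if (shares > tg_shares)
		shares = tg_shares;
	return shares;
}

int main(void)
{
	/* This CPU carries 512 out of the group's total load of 2048,
	 * so it receives a quarter of the group's 1024 shares. */
	printf("%ld\n", group_shares(1024, 2048, 512, 512));	/* prints 256 */
	return 0;
}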
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055 static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
3056 {
3057 long runnable, load_avg;
3058
3059 load_avg = max(cfs_rq->avg.load_avg,
3060 scale_load_down(cfs_rq->load.weight));
3061
3062 runnable = max(cfs_rq->avg.runnable_load_avg,
3063 scale_load_down(cfs_rq->runnable_weight));
3064
3065 runnable *= shares;
3066 if (load_avg)
3067 runnable /= load_avg;
3068
3069 return clamp_t(long, runnable, MIN_SHARES, shares);
3070 }
3071 #endif
3072
3073 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
3074
3075
3076
3077
3078
3079 static void update_cfs_group(struct sched_entity *se)
3080 {
3081 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3082 long shares, runnable;
3083
3084 if (!gcfs_rq)
3085 return;
3086
3087 if (throttled_hierarchy(gcfs_rq))
3088 return;
3089
3090 #ifndef CONFIG_SMP
3091 runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
3092
3093 if (likely(se->load.weight == shares))
3094 return;
3095 #else
3096 shares = calc_group_shares(gcfs_rq);
3097 runnable = calc_group_runnable(gcfs_rq, shares);
3098 #endif
3099
3100 reweight_entity(cfs_rq_of(se), se, shares, runnable);
3101 }
3102
3103 #else
3104 static inline void update_cfs_group(struct sched_entity *se)
3105 {
3106 }
3107 #endif
3108
3109 static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
3110 {
3111 struct rq *rq = rq_of(cfs_rq);
3112
3113 if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128 cpufreq_update_util(rq, flags);
3129 }
3130 }
3131
3132 #ifdef CONFIG_SMP
3133 #ifdef CONFIG_FAIR_GROUP_SCHED
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3150 {
3151 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
3152
3153
3154
3155
3156 if (cfs_rq->tg == &root_task_group)
3157 return;
3158
3159 if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3160 atomic_long_add(delta, &cfs_rq->tg->load_avg);
3161 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
3162 }
3163 }
3164
3165
3166
3167
3168
3169
3170 void set_task_rq_fair(struct sched_entity *se,
3171 struct cfs_rq *prev, struct cfs_rq *next)
3172 {
3173 u64 p_last_update_time;
3174 u64 n_last_update_time;
3175
3176 if (!sched_feat(ATTACH_AGE_LOAD))
3177 return;
3178
3179
3180
3181
3182
3183
3184
3185
3186 if (!(se->avg.last_update_time && prev))
3187 return;
3188
3189 #ifndef CONFIG_64BIT
3190 {
3191 u64 p_last_update_time_copy;
3192 u64 n_last_update_time_copy;
3193
3194 do {
3195 p_last_update_time_copy = prev->load_last_update_time_copy;
3196 n_last_update_time_copy = next->load_last_update_time_copy;
3197
3198 smp_rmb();
3199
3200 p_last_update_time = prev->avg.last_update_time;
3201 n_last_update_time = next->avg.last_update_time;
3202
3203 } while (p_last_update_time != p_last_update_time_copy ||
3204 n_last_update_time != n_last_update_time_copy);
3205 }
3206 #else
3207 p_last_update_time = prev->avg.last_update_time;
3208 n_last_update_time = next->avg.last_update_time;
3209 #endif
3210 __update_load_avg_blocked_se(p_last_update_time, se);
3211 se->avg.last_update_time = n_last_update_time;
3212 }
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283 static inline void
3284 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3285 {
3286 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3287
3288
3289 if (!delta)
3290 return;
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301 se->avg.util_avg = gcfs_rq->avg.util_avg;
3302 se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
3303
3304
3305 add_positive(&cfs_rq->avg.util_avg, delta);
3306 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
3307 }
3308
3309 static inline void
3310 update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3311 {
3312 long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3313 unsigned long runnable_load_avg, load_avg;
3314 u64 runnable_load_sum, load_sum = 0;
3315 s64 delta_sum;
3316
3317 if (!runnable_sum)
3318 return;
3319
3320 gcfs_rq->prop_runnable_sum = 0;
3321
3322 if (runnable_sum >= 0) {
3323
3324
3325
3326
3327 runnable_sum += se->avg.load_sum;
3328 runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
3329 } else {
3330
3331
3332
3333
3334 if (scale_load_down(gcfs_rq->load.weight)) {
3335 load_sum = div_s64(gcfs_rq->avg.load_sum,
3336 scale_load_down(gcfs_rq->load.weight));
3337 }
3338
3339
3340 runnable_sum = min(se->avg.load_sum, load_sum);
3341 }
3342
3343
3344
3345
3346
3347
3348
3349 running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
3350 runnable_sum = max(runnable_sum, running_sum);
3351
3352 load_sum = (s64)se_weight(se) * runnable_sum;
3353 load_avg = div_s64(load_sum, LOAD_AVG_MAX);
3354
3355 delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
3356 delta_avg = load_avg - se->avg.load_avg;
3357
3358 se->avg.load_sum = runnable_sum;
3359 se->avg.load_avg = load_avg;
3360 add_positive(&cfs_rq->avg.load_avg, delta_avg);
3361 add_positive(&cfs_rq->avg.load_sum, delta_sum);
3362
3363 runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
3364 runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
3365 delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
3366 delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
3367
3368 se->avg.runnable_load_sum = runnable_sum;
3369 se->avg.runnable_load_avg = runnable_load_avg;
3370
3371 if (se->on_rq) {
3372 add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
3373 add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
3374 }
3375 }
3376
3377 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
3378 {
3379 cfs_rq->propagate = 1;
3380 cfs_rq->prop_runnable_sum += runnable_sum;
3381 }
3382
3383
3384 static inline int propagate_entity_load_avg(struct sched_entity *se)
3385 {
3386 struct cfs_rq *cfs_rq, *gcfs_rq;
3387
3388 if (entity_is_task(se))
3389 return 0;
3390
3391 gcfs_rq = group_cfs_rq(se);
3392 if (!gcfs_rq->propagate)
3393 return 0;
3394
3395 gcfs_rq->propagate = 0;
3396
3397 cfs_rq = cfs_rq_of(se);
3398
3399 add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
3400
3401 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
3402 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
3403
3404 trace_pelt_cfs_tp(cfs_rq);
3405 trace_pelt_se_tp(se);
3406
3407 return 1;
3408 }
3409
3410
3411
3412
3413
3414 static inline bool skip_blocked_update(struct sched_entity *se)
3415 {
3416 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3417
3418
3419
3420
3421
3422 if (se->avg.load_avg || se->avg.util_avg)
3423 return false;
3424
3425
3426
3427
3428
3429 if (gcfs_rq->propagate)
3430 return false;
3431
3432
3433
3434
3435
3436
3437 return true;
3438 }
3439
3440 #else
3441
3442 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
3443
3444 static inline int propagate_entity_load_avg(struct sched_entity *se)
3445 {
3446 return 0;
3447 }
3448
3449 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
3450
3451 #endif
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469 static inline int
3470 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
3471 {
3472 unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
3473 struct sched_avg *sa = &cfs_rq->avg;
3474 int decayed = 0;
3475
3476 if (cfs_rq->removed.nr) {
3477 unsigned long r;
3478 u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
3479
3480 raw_spin_lock(&cfs_rq->removed.lock);
3481 swap(cfs_rq->removed.util_avg, removed_util);
3482 swap(cfs_rq->removed.load_avg, removed_load);
3483 swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
3484 cfs_rq->removed.nr = 0;
3485 raw_spin_unlock(&cfs_rq->removed.lock);
3486
3487 r = removed_load;
3488 sub_positive(&sa->load_avg, r);
3489 sub_positive(&sa->load_sum, r * divider);
3490
3491 r = removed_util;
3492 sub_positive(&sa->util_avg, r);
3493 sub_positive(&sa->util_sum, r * divider);
3494
3495 add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
3496
3497 decayed = 1;
3498 }
3499
3500 decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
3501
3502 #ifndef CONFIG_64BIT
3503 smp_wmb();
3504 cfs_rq->load_last_update_time_copy = sa->last_update_time;
3505 #endif
3506
3507 return decayed;
3508 }
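/*
 * Illustrative, standalone sketch (not part of this source file): the
 * "accumulate under a lock, drain by swapping to zero" pattern used for
 * cfs_rq->removed above.  remove_entity_load_avg() (further down) plays the
 * producer role and update_cfs_rq_load_avg() the consumer; a pthread mutex
 * stands in for the raw spinlock here.
 */
#include <pthread.h>
#include <stdio.h>

struct removed_acc {
	pthread_mutex_t lock;
	unsigned long load_avg;
	unsigned long util_avg;
};

/* Producer: a departing entity queues its contribution for later removal. */
static void removed_add(struct removed_acc *acc, unsigned long load,
			unsigned long util)
{
	pthread_mutex_lock(&acc->lock);
	acc->load_avg += load;
	acc->util_avg += util;
	pthread_mutex_unlock(&acc->lock);
}

/* Consumer: take everything queued so far and leave the accumulator empty. */
static void removed_drain(struct removed_acc *acc, unsigned long *load,
			  unsigned long *util)
{
	pthread_mutex_lock(&acc->lock);
	*load = acc->load_avg;
	*util = acc->util_avg;
	acc->load_avg = 0;
	acc->util_avg = 0;
	pthread_mutex_unlock(&acc->lock);
}

int main(void)
{
	struct removed_acc acc = { .lock = PTHREAD_MUTEX_INITIALIZER };
	unsigned long load, util;

	removed_add(&acc, 300, 150);
	removed_add(&acc, 200, 50);
	removed_drain(&acc, &load, &util);
	printf("drained load %lu, util %lu\n", load, util);	/* 500, 200 */
	return 0;
}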
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3520 {
3521 u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
3522
3523
3524
3525
3526
3527
3528
3529
3530 se->avg.last_update_time = cfs_rq->avg.last_update_time;
3531 se->avg.period_contrib = cfs_rq->avg.period_contrib;
3532
3533
3534
3535
3536
3537
3538
3539 se->avg.util_sum = se->avg.util_avg * divider;
3540
3541 se->avg.load_sum = divider;
3542 if (se_weight(se)) {
3543 se->avg.load_sum =
3544 div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
3545 }
3546
3547 se->avg.runnable_load_sum = se->avg.load_sum;
3548
3549 enqueue_load_avg(cfs_rq, se);
3550 cfs_rq->avg.util_avg += se->avg.util_avg;
3551 cfs_rq->avg.util_sum += se->avg.util_sum;
3552
3553 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
3554
3555 cfs_rq_util_change(cfs_rq, flags);
3556
3557 trace_pelt_cfs_tp(cfs_rq);
3558 }
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3569 {
3570 dequeue_load_avg(cfs_rq, se);
3571 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3572 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3573
3574 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
3575
3576 cfs_rq_util_change(cfs_rq, 0);
3577
3578 trace_pelt_cfs_tp(cfs_rq);
3579 }
3580
3581
3582
3583
3584 #define UPDATE_TG 0x1
3585 #define SKIP_AGE_LOAD 0x2
3586 #define DO_ATTACH 0x4
3587
3588
3589 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3590 {
3591 u64 now = cfs_rq_clock_pelt(cfs_rq);
3592 int decayed;
3593
3594
3595
3596
3597
3598 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
3599 __update_load_avg_se(now, cfs_rq, se);
3600
3601 decayed = update_cfs_rq_load_avg(now, cfs_rq);
3602 decayed |= propagate_entity_load_avg(se);
3603
3604 if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
3605
3606
3607
3608
3609
3610
3611
3612
3613 attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
3614 update_tg_load_avg(cfs_rq, 0);
3615
3616 } else if (decayed) {
3617 cfs_rq_util_change(cfs_rq, 0);
3618
3619 if (flags & UPDATE_TG)
3620 update_tg_load_avg(cfs_rq, 0);
3621 }
3622 }
3623
3624 #ifndef CONFIG_64BIT
3625 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3626 {
3627 u64 last_update_time_copy;
3628 u64 last_update_time;
3629
3630 do {
3631 last_update_time_copy = cfs_rq->load_last_update_time_copy;
3632 smp_rmb();
3633 last_update_time = cfs_rq->avg.last_update_time;
3634 } while (last_update_time != last_update_time_copy);
3635
3636 return last_update_time;
3637 }
3638 #else
3639 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3640 {
3641 return cfs_rq->avg.last_update_time;
3642 }
3643 #endif
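/*
 * Illustrative, standalone sketch (not part of this source file): the
 * copy-and-retry read used by cfs_rq_last_update_time() on 32-bit builds,
 * where a 64-bit timestamp cannot be loaded in a single instruction.  C11
 * release/acquire atomics stand in for the kernel's smp_wmb()/smp_rmb()
 * pairing, and the plain "value" field models the 64-bit slot that cannot
 * be read atomically; the retry loop rejects a torn read.
 */
#include <stdatomic.h>
#include <stdint.h>

struct stamped {
	uint64_t value;			/* written first                   */
	_Atomic uint64_t value_copy;	/* published second, with release  */
};

static void stamped_write(struct stamped *s, uint64_t v)
{
	s->value = v;
	atomic_store_explicit(&s->value_copy, v, memory_order_release);
}

static uint64_t stamped_read(const struct stamped *s)
{
	uint64_t copy, val;

	do {
		copy = atomic_load_explicit(&s->value_copy, memory_order_acquire);
		val = s->value;
	} while (val != copy);		/* raced with a writer: try again */

	return val;
}

int main(void)
{
	struct stamped s = { 0 };

	stamped_write(&s, 123456789ULL);
	return stamped_read(&s) == 123456789ULL ? 0 : 1;
}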
3644
3645
3646
3647
3648
3649 static void sync_entity_load_avg(struct sched_entity *se)
3650 {
3651 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3652 u64 last_update_time;
3653
3654 last_update_time = cfs_rq_last_update_time(cfs_rq);
3655 __update_load_avg_blocked_se(last_update_time, se);
3656 }
3657
3658
3659
3660
3661
3662 static void remove_entity_load_avg(struct sched_entity *se)
3663 {
3664 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3665 unsigned long flags;
3666
3667
3668
3669
3670
3671
3672
3673 sync_entity_load_avg(se);
3674
3675 raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
3676 ++cfs_rq->removed.nr;
3677 cfs_rq->removed.util_avg += se->avg.util_avg;
3678 cfs_rq->removed.load_avg += se->avg.load_avg;
3679 cfs_rq->removed.runnable_sum += se->avg.load_sum;
3680 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
3681 }
3682
3683 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3684 {
3685 return cfs_rq->avg.runnable_load_avg;
3686 }
3687
3688 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3689 {
3690 return cfs_rq->avg.load_avg;
3691 }
3692
3693 static inline unsigned long task_util(struct task_struct *p)
3694 {
3695 return READ_ONCE(p->se.avg.util_avg);
3696 }
3697
3698 static inline unsigned long _task_util_est(struct task_struct *p)
3699 {
3700 struct util_est ue = READ_ONCE(p->se.avg.util_est);
3701
3702 return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
3703 }
3704
3705 static inline unsigned long task_util_est(struct task_struct *p)
3706 {
3707 return max(task_util(p), _task_util_est(p));
3708 }
3709
3710 static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
3711 struct task_struct *p)
3712 {
3713 unsigned int enqueued;
3714
3715 if (!sched_feat(UTIL_EST))
3716 return;
3717
3718
3719 enqueued = cfs_rq->avg.util_est.enqueued;
3720 enqueued += _task_util_est(p);
3721 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3722 }
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732 static inline bool within_margin(int value, int margin)
3733 {
3734 return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
3735 }
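/*
 * Illustrative, standalone sketch (not part of this source file):
 * within_margin() above tests -margin < value < margin with a single
 * unsigned comparison by shifting the whole range, so every in-range value
 * maps onto [0, 2*margin - 2] and everything else wraps far above it.
 */
#include <assert.h>

static int within_margin_example(int value, int margin)
{
	return (unsigned int)(value + margin - 1) < (unsigned int)(2 * margin - 1);
}

int main(void)
{
	assert( within_margin_example(  0, 10));
	assert( within_margin_example(  9, 10));
	assert( within_margin_example( -9, 10));
	assert(!within_margin_example( 10, 10));	/* boundaries are excluded */
	assert(!within_margin_example(-10, 10));
	return 0;
}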
3736
3737 static void
3738 util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
3739 {
3740 long last_ewma_diff;
3741 struct util_est ue;
3742 int cpu;
3743
3744 if (!sched_feat(UTIL_EST))
3745 return;
3746
3747
3748 ue.enqueued = cfs_rq->avg.util_est.enqueued;
3749 ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
3750 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
3751
3752
3753
3754
3755
3756 if (!task_sleep)
3757 return;
3758
3759
3760
3761
3762
3763 ue = p->se.avg.util_est;
3764 if (ue.enqueued & UTIL_AVG_UNCHANGED)
3765 return;
3766
3767
3768
3769
3770
3771 ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
3772 last_ewma_diff = ue.enqueued - ue.ewma;
3773 if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
3774 return;
3775
3776
3777
3778
3779
3780 cpu = cpu_of(rq_of(cfs_rq));
3781 if (task_util(p) > capacity_orig_of(cpu))
3782 return;
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801 ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
3802 ue.ewma += last_ewma_diff;
3803 ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
3804 WRITE_ONCE(p->se.avg.util_est, ue);
3805 }
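/*
 * Illustrative, standalone sketch (not part of this source file): the three
 * shift operations at the end of util_est_dequeue() implement an
 * exponentially weighted moving average with weight 1/4
 * (UTIL_EST_WEIGHT_SHIFT == 2 is assumed here):
 *
 *	ewma' = ewma + (sample - ewma) / 4  =  0.25 * sample + 0.75 * ewma
 */
#include <stdio.h>

#define EXAMPLE_WEIGHT_SHIFT 2

static long ewma_update(long ewma, long sample)
{
	long diff = sample - ewma;

	ewma <<= EXAMPLE_WEIGHT_SHIFT;
	ewma  += diff;
	ewma >>= EXAMPLE_WEIGHT_SHIFT;
	return ewma;
}

int main(void)
{
	long ewma = 0;
	int i;

	/* A task that settles around 400 units of utilization converges
	 * toward that value over a handful of sleep/wake cycles. */
	for (i = 0; i < 8; i++)
		ewma = ewma_update(ewma, 400);
	printf("ewma after 8 samples: %ld\n", ewma);
	return 0;
}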
3806
3807 static inline int task_fits_capacity(struct task_struct *p, long capacity)
3808 {
3809 return fits_capacity(task_util_est(p), capacity);
3810 }
3811
3812 static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
3813 {
3814 if (!static_branch_unlikely(&sched_asym_cpucapacity))
3815 return;
3816
3817 if (!p) {
3818 rq->misfit_task_load = 0;
3819 return;
3820 }
3821
3822 if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
3823 rq->misfit_task_load = 0;
3824 return;
3825 }
3826
3827 rq->misfit_task_load = task_h_load(p);
3828 }
3829
3830 #else
3831
3832 #define UPDATE_TG 0x0
3833 #define SKIP_AGE_LOAD 0x0
3834 #define DO_ATTACH 0x0
3835
3836 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
3837 {
3838 cfs_rq_util_change(cfs_rq, 0);
3839 }
3840
3841 static inline void remove_entity_load_avg(struct sched_entity *se) {}
3842
3843 static inline void
3844 attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
3845 static inline void
3846 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3847
3848 static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
3849 {
3850 return 0;
3851 }
3852
3853 static inline void
3854 util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
3855
3856 static inline void
3857 util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
3858 bool task_sleep) {}
3859 static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
3860
3861 #endif
3862
3863 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3864 {
3865 #ifdef CONFIG_SCHED_DEBUG
3866 s64 d = se->vruntime - cfs_rq->min_vruntime;
3867
3868 if (d < 0)
3869 d = -d;
3870
3871 if (d > 3*sysctl_sched_latency)
3872 schedstat_inc(cfs_rq->nr_spread_over);
3873 #endif
3874 }
3875
3876 static void
3877 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
3878 {
3879 u64 vruntime = cfs_rq->min_vruntime;
3880
3881
3882
3883
3884
3885
3886
3887 if (initial && sched_feat(START_DEBIT))
3888 vruntime += sched_vslice(cfs_rq, se);
3889
3890
3891 if (!initial) {
3892 unsigned long thresh = sysctl_sched_latency;
3893
3894
3895
3896
3897
3898 if (sched_feat(GENTLE_FAIR_SLEEPERS))
3899 thresh >>= 1;
3900
3901 vruntime -= thresh;
3902 }
3903
3904
3905 se->vruntime = max_vruntime(se->vruntime, vruntime);
3906 }
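/*
 * Illustrative, standalone sketch (not part of this source file): the
 * placement policy of place_entity() above reduced to its arithmetic.
 * latency_ns stands for sysctl_sched_latency, vslice for sched_vslice(),
 * and gentle_sleepers for the GENTLE_FAIR_SLEEPERS feature; new entities
 * start one vslice "in debt" (START_DEBIT) while waking sleepers get at
 * most half a latency period of credit.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t place_vruntime(uint64_t min_vruntime, uint64_t se_vruntime,
			       int initial, uint64_t vslice,
			       uint64_t latency_ns, int gentle_sleepers)
{
	uint64_t vruntime = min_vruntime;

	if (initial) {
		vruntime += vslice;	/* START_DEBIT: the new task waits its turn */
	} else {
		uint64_t thresh = latency_ns;

		if (gentle_sleepers)
			thresh /= 2;	/* halve the wakeup bonus */
		vruntime -= thresh;
	}

	/* Never move an entity backwards in virtual time.  (The kernel uses
	 * the wrap-safe max_vruntime() for this comparison.) */
	return vruntime > se_vruntime ? vruntime : se_vruntime;
}

int main(void)
{
	uint64_t min_vr = 100000000ULL;		/* 100 ms of virtual time */

	printf("woken sleeper: %llu\n", (unsigned long long)
	       place_vruntime(min_vr, 0, 0, 0, 24000000ULL, 1));		/* 88000000  */
	printf("new task:      %llu\n", (unsigned long long)
	       place_vruntime(min_vr, 0, 1, 3000000ULL, 24000000ULL, 1));	/* 103000000 */
	return 0;
}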
3907
3908 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
3909
3910 static inline void check_schedstat_required(void)
3911 {
3912 #ifdef CONFIG_SCHEDSTATS
3913 if (schedstat_enabled())
3914 return;
3915
3916
3917 if (trace_sched_stat_wait_enabled() ||
3918 trace_sched_stat_sleep_enabled() ||
3919 trace_sched_stat_iowait_enabled() ||
3920 trace_sched_stat_blocked_enabled() ||
3921 trace_sched_stat_runtime_enabled()) {
3922 printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
3923 "stat_blocked and stat_runtime require the "
3924 "kernel parameter schedstats=enable or "
3925 "kernel.sched_schedstats=1\n");
3926 }
3927 #endif
3928 }
3929
3930 static inline bool cfs_bandwidth_used(void);
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962 static void
3963 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3964 {
3965 bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
3966 bool curr = cfs_rq->curr == se;
3967
3968
3969
3970
3971
3972 if (renorm && curr)
3973 se->vruntime += cfs_rq->min_vruntime;
3974
3975 update_curr(cfs_rq);
3976
3977
3978
3979
3980
3981
3982
3983 if (renorm && !curr)
3984 se->vruntime += cfs_rq->min_vruntime;
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
3995 update_cfs_group(se);
3996 enqueue_runnable_load_avg(cfs_rq, se);
3997 account_entity_enqueue(cfs_rq, se);
3998
3999 if (flags & ENQUEUE_WAKEUP)
4000 place_entity(cfs_rq, se, 0);
4001
4002 check_schedstat_required();
4003 update_stats_enqueue(cfs_rq, se, flags);
4004 check_spread(cfs_rq, se);
4005 if (!curr)
4006 __enqueue_entity(cfs_rq, se);
4007 se->on_rq = 1;
4008
4009
4010
4011
4012
4013
4014 if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
4015 list_add_leaf_cfs_rq(cfs_rq);
4016
4017 if (cfs_rq->nr_running == 1)
4018 check_enqueue_throttle(cfs_rq);
4019 }
4020
4021 static void __clear_buddies_last(struct sched_entity *se)
4022 {
4023 for_each_sched_entity(se) {
4024 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4025 if (cfs_rq->last != se)
4026 break;
4027
4028 cfs_rq->last = NULL;
4029 }
4030 }
4031
4032 static void __clear_buddies_next(struct sched_entity *se)
4033 {
4034 for_each_sched_entity(se) {
4035 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4036 if (cfs_rq->next != se)
4037 break;
4038
4039 cfs_rq->next = NULL;
4040 }
4041 }
4042
4043 static void __clear_buddies_skip(struct sched_entity *se)
4044 {
4045 for_each_sched_entity(se) {
4046 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4047 if (cfs_rq->skip != se)
4048 break;
4049
4050 cfs_rq->skip = NULL;
4051 }
4052 }
4053
4054 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
4055 {
4056 if (cfs_rq->last == se)
4057 __clear_buddies_last(se);
4058
4059 if (cfs_rq->next == se)
4060 __clear_buddies_next(se);
4061
4062 if (cfs_rq->skip == se)
4063 __clear_buddies_skip(se);
4064 }
4065
4066 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4067
4068 static void
4069 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
4070 {
4071
4072
4073
4074 update_curr(cfs_rq);
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084 update_load_avg(cfs_rq, se, UPDATE_TG);
4085 dequeue_runnable_load_avg(cfs_rq, se);
4086
4087 update_stats_dequeue(cfs_rq, se, flags);
4088
4089 clear_buddies(cfs_rq, se);
4090
4091 if (se != cfs_rq->curr)
4092 __dequeue_entity(cfs_rq, se);
4093 se->on_rq = 0;
4094 account_entity_dequeue(cfs_rq, se);
4095
4096
4097
4098
4099
4100
4101
4102 if (!(flags & DEQUEUE_SLEEP))
4103 se->vruntime -= cfs_rq->min_vruntime;
4104
4105
4106 return_cfs_rq_runtime(cfs_rq);
4107
4108 update_cfs_group(se);
4109
4110
4111
4112
4113
4114
4115
4116 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
4117 update_min_vruntime(cfs_rq);
4118 }
4119
4120
4121
4122
4123 static void
4124 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4125 {
4126 unsigned long ideal_runtime, delta_exec;
4127 struct sched_entity *se;
4128 s64 delta;
4129
4130 ideal_runtime = sched_slice(cfs_rq, curr);
4131 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
4132 if (delta_exec > ideal_runtime) {
4133 resched_curr(rq_of(cfs_rq));
4134
4135
4136
4137
4138 clear_buddies(cfs_rq, curr);
4139 return;
4140 }
4141
4142
4143
4144
4145
4146
4147 if (delta_exec < sysctl_sched_min_granularity)
4148 return;
4149
4150 se = __pick_first_entity(cfs_rq);
4151 delta = curr->vruntime - se->vruntime;
4152
4153 if (delta < 0)
4154 return;
4155
4156 if (delta > ideal_runtime)
4157 resched_curr(rq_of(cfs_rq));
4158 }
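/*
 * Illustrative, standalone sketch (not part of this source file): the
 * tick-preemption decision of check_preempt_tick() above written as a pure
 * function.  vruntime_lead is curr->vruntime minus the leftmost entity's
 * vruntime; min_granularity stands for sysctl_sched_min_granularity.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool should_resched_at_tick(uint64_t delta_exec, uint64_t ideal_runtime,
				   uint64_t min_granularity,
				   int64_t vruntime_lead)
{
	if (delta_exec > ideal_runtime)
		return true;		/* the slice has been fully consumed */

	if (delta_exec < min_granularity)
		return false;		/* never preempt over-eagerly */

	if (vruntime_lead < 0)
		return false;		/* curr is still the most deserving */

	return (uint64_t)vruntime_lead > ideal_runtime;
}

int main(void)
{
	/* Ran past the 4 ms slice. */
	assert(should_resched_at_tick(5000000, 4000000, 750000, 0));
	/* Too little runtime so far to bother preempting. */
	assert(!should_resched_at_tick(500000, 4000000, 750000, 1000000));
	/* curr is far ahead of the leftmost entity in virtual time. */
	assert(should_resched_at_tick(1000000, 4000000, 750000, 5000000));
	return 0;
}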
4159
4160 static void
4161 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4162 {
4163
4164 if (se->on_rq) {
4165
4166
4167
4168
4169
4170 update_stats_wait_end(cfs_rq, se);
4171 __dequeue_entity(cfs_rq, se);
4172 update_load_avg(cfs_rq, se, UPDATE_TG);
4173 }
4174
4175 update_stats_curr_start(cfs_rq, se);
4176 cfs_rq->curr = se;
4177
4178
4179
4180
4181
4182
4183 if (schedstat_enabled() &&
4184 rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
4185 schedstat_set(se->statistics.slice_max,
4186 max((u64)schedstat_val(se->statistics.slice_max),
4187 se->sum_exec_runtime - se->prev_sum_exec_runtime));
4188 }
4189
4190 se->prev_sum_exec_runtime = se->sum_exec_runtime;
4191 }
4192
4193 static int
4194 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
4195
4196
4197
4198
4199
4200
4201
4202
4203 static struct sched_entity *
4204 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
4205 {
4206 struct sched_entity *left = __pick_first_entity(cfs_rq);
4207 struct sched_entity *se;
4208
4209
4210
4211
4212
4213 if (!left || (curr && entity_before(curr, left)))
4214 left = curr;
4215
4216 se = left;
4217
4218
4219
4220
4221
4222 if (cfs_rq->skip == se) {
4223 struct sched_entity *second;
4224
4225 if (se == curr) {
4226 second = __pick_first_entity(cfs_rq);
4227 } else {
4228 second = __pick_next_entity(se);
4229 if (!second || (curr && entity_before(curr, second)))
4230 second = curr;
4231 }
4232
4233 if (second && wakeup_preempt_entity(second, left) < 1)
4234 se = second;
4235 }
4236
4237
4238
4239
4240 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
4241 se = cfs_rq->last;
4242
4243
4244
4245
4246 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
4247 se = cfs_rq->next;
4248
4249 clear_buddies(cfs_rq, se);
4250
4251 return se;
4252 }
4253
4254 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
4255
4256 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
4257 {
4258
4259
4260
4261
4262 if (prev->on_rq)
4263 update_curr(cfs_rq);
4264
4265
4266 check_cfs_rq_runtime(cfs_rq);
4267
4268 check_spread(cfs_rq, prev);
4269
4270 if (prev->on_rq) {
4271 update_stats_wait_start(cfs_rq, prev);
4272
4273 __enqueue_entity(cfs_rq, prev);
4274
4275 update_load_avg(cfs_rq, prev, 0);
4276 }
4277 cfs_rq->curr = NULL;
4278 }
4279
4280 static void
4281 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
4282 {
4283
4284
4285
4286 update_curr(cfs_rq);
4287
4288
4289
4290
4291 update_load_avg(cfs_rq, curr, UPDATE_TG);
4292 update_cfs_group(curr);
4293
4294 #ifdef CONFIG_SCHED_HRTICK
4295
4296
4297
4298
4299 if (queued) {
4300 resched_curr(rq_of(cfs_rq));
4301 return;
4302 }
4303
4304
4305
4306 if (!sched_feat(DOUBLE_TICK) &&
4307 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
4308 return;
4309 #endif
4310
4311 if (cfs_rq->nr_running > 1)
4312 check_preempt_tick(cfs_rq, curr);
4313 }
4314
4315
4316
4317
4318
4319
4320 #ifdef CONFIG_CFS_BANDWIDTH
4321
4322 #ifdef CONFIG_JUMP_LABEL
4323 static struct static_key __cfs_bandwidth_used;
4324
4325 static inline bool cfs_bandwidth_used(void)
4326 {
4327 return static_key_false(&__cfs_bandwidth_used);
4328 }
4329
4330 void cfs_bandwidth_usage_inc(void)
4331 {
4332 static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
4333 }
4334
4335 void cfs_bandwidth_usage_dec(void)
4336 {
4337 static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
4338 }
4339 #else
4340 static bool cfs_bandwidth_used(void)
4341 {
4342 return true;
4343 }
4344
4345 void cfs_bandwidth_usage_inc(void) {}
4346 void cfs_bandwidth_usage_dec(void) {}
4347 #endif
4348
4349
4350
4351
4352
4353 static inline u64 default_cfs_period(void)
4354 {
4355 return 100000000ULL;
4356 }
4357
4358 static inline u64 sched_cfs_bandwidth_slice(void)
4359 {
4360 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
4361 }
4362
4363
4364
4365
4366
4367
4368
4369
4370 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
4371 {
4372 if (cfs_b->quota != RUNTIME_INF)
4373 cfs_b->runtime = cfs_b->quota;
4374 }
4375
4376 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4377 {
4378 return &tg->cfs_bandwidth;
4379 }
4380
4381
4382 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4383 {
4384 struct task_group *tg = cfs_rq->tg;
4385 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
4386 u64 amount = 0, min_amount;
4387
4388
4389 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
4390
4391 raw_spin_lock(&cfs_b->lock);
4392 if (cfs_b->quota == RUNTIME_INF)
4393 amount = min_amount;
4394 else {
4395 start_cfs_bandwidth(cfs_b);
4396
4397 if (cfs_b->runtime > 0) {
4398 amount = min(cfs_b->runtime, min_amount);
4399 cfs_b->runtime -= amount;
4400 cfs_b->idle = 0;
4401 }
4402 }
4403 raw_spin_unlock(&cfs_b->lock);
4404
4405 cfs_rq->runtime_remaining += amount;
4406
4407 return cfs_rq->runtime_remaining > 0;
4408 }
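/*
 * Illustrative, standalone sketch (not part of this source file): the
 * per-CPU top-up performed by assign_cfs_rq_runtime() above.  pool_runtime
 * plays the role of cfs_b->runtime and local_remaining that of
 * cfs_rq->runtime_remaining; slice_ns corresponds to
 * sched_cfs_bandwidth_slice().  All values are in nanoseconds.
 */
#include <stdint.h>
#include <stdio.h>

static int take_slice(int64_t *pool_runtime, int64_t *local_remaining,
		      int64_t slice_ns)
{
	int64_t want = slice_ns - *local_remaining;	/* top up to one full slice */
	int64_t amount = want < *pool_runtime ? want : *pool_runtime;

	if (amount < 0)
		amount = 0;
	*pool_runtime -= amount;
	*local_remaining += amount;

	return *local_remaining > 0;	/* nonzero: the cfs_rq may keep running */
}

int main(void)
{
	int64_t pool = 10000000;	/* 10 ms left in the group-wide pool */
	int64_t local = -200000;	/* this cfs_rq overran by 0.2 ms     */

	take_slice(&pool, &local, 5000000);
	printf("pool now %lld ns, local now %lld ns\n",
	       (long long)pool, (long long)local);	/* 4800000 and 5000000 */
	return 0;
}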
4409
4410 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4411 {
4412
4413 cfs_rq->runtime_remaining -= delta_exec;
4414
4415 if (likely(cfs_rq->runtime_remaining > 0))
4416 return;
4417
4418 if (cfs_rq->throttled)
4419 return;
4420
4421
4422
4423
4424 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
4425 resched_curr(rq_of(cfs_rq));
4426 }
4427
4428 static __always_inline
4429 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4430 {
4431 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
4432 return;
4433
4434 __account_cfs_rq_runtime(cfs_rq, delta_exec);
4435 }
4436
4437 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4438 {
4439 return cfs_bandwidth_used() && cfs_rq->throttled;
4440 }
4441
4442
4443 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4444 {
4445 return cfs_bandwidth_used() && cfs_rq->throttle_count;
4446 }
4447
4448
4449
4450
4451
4452
4453 static inline int throttled_lb_pair(struct task_group *tg,
4454 int src_cpu, int dest_cpu)
4455 {
4456 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
4457
4458 src_cfs_rq = tg->cfs_rq[src_cpu];
4459 dest_cfs_rq = tg->cfs_rq[dest_cpu];
4460
4461 return throttled_hierarchy(src_cfs_rq) ||
4462 throttled_hierarchy(dest_cfs_rq);
4463 }
4464
4465 static int tg_unthrottle_up(struct task_group *tg, void *data)
4466 {
4467 struct rq *rq = data;
4468 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4469
4470 cfs_rq->throttle_count--;
4471 if (!cfs_rq->throttle_count) {
4472 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
4473 cfs_rq->throttled_clock_task;
4474
4475
4476 if (cfs_rq->nr_running >= 1)
4477 list_add_leaf_cfs_rq(cfs_rq);
4478 }
4479
4480 return 0;
4481 }
4482
4483 static int tg_throttle_down(struct task_group *tg, void *data)
4484 {
4485 struct rq *rq = data;
4486 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4487
4488
4489 if (!cfs_rq->throttle_count) {
4490 cfs_rq->throttled_clock_task = rq_clock_task(rq);
4491 list_del_leaf_cfs_rq(cfs_rq);
4492 }
4493 cfs_rq->throttle_count++;
4494
4495 return 0;
4496 }
4497
4498 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
4499 {
4500 struct rq *rq = rq_of(cfs_rq);
4501 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4502 struct sched_entity *se;
4503 long task_delta, idle_task_delta, dequeue = 1;
4504 bool empty;
4505
4506 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
4507
4508
4509 rcu_read_lock();
4510 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
4511 rcu_read_unlock();
4512
4513 task_delta = cfs_rq->h_nr_running;
4514 idle_task_delta = cfs_rq->idle_h_nr_running;
4515 for_each_sched_entity(se) {
4516 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4517
4518 if (!se->on_rq)
4519 break;
4520
4521 if (dequeue)
4522 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
4523 qcfs_rq->h_nr_running -= task_delta;
4524 qcfs_rq->idle_h_nr_running -= idle_task_delta;
4525
4526 if (qcfs_rq->load.weight)
4527 dequeue = 0;
4528 }
4529
4530 if (!se)
4531 sub_nr_running(rq, task_delta);
4532
4533 cfs_rq->throttled = 1;
4534 cfs_rq->throttled_clock = rq_clock(rq);
4535 raw_spin_lock(&cfs_b->lock);
4536 empty = list_empty(&cfs_b->throttled_cfs_rq);
4537
4538
4539
4540
4541
4542
4543 if (cfs_b->distribute_running)
4544 list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4545 else
4546 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4547
4548
4549
4550
4551
4552 if (empty)
4553 start_cfs_bandwidth(cfs_b);
4554
4555 raw_spin_unlock(&cfs_b->lock);
4556 }
4557
4558 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
4559 {
4560 struct rq *rq = rq_of(cfs_rq);
4561 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4562 struct sched_entity *se;
4563 int enqueue = 1;
4564 long task_delta, idle_task_delta;
4565
4566 se = cfs_rq->tg->se[cpu_of(rq)];
4567
4568 cfs_rq->throttled = 0;
4569
4570 update_rq_clock(rq);
4571
4572 raw_spin_lock(&cfs_b->lock);
4573 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
4574 list_del_rcu(&cfs_rq->throttled_list);
4575 raw_spin_unlock(&cfs_b->lock);
4576
4577
4578 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
4579
4580 if (!cfs_rq->load.weight)
4581 return;
4582
4583 task_delta = cfs_rq->h_nr_running;
4584 idle_task_delta = cfs_rq->idle_h_nr_running;
4585 for_each_sched_entity(se) {
4586 if (se->on_rq)
4587 enqueue = 0;
4588
4589 cfs_rq = cfs_rq_of(se);
4590 if (enqueue)
4591 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
4592 cfs_rq->h_nr_running += task_delta;
4593 cfs_rq->idle_h_nr_running += idle_task_delta;
4594
4595 if (cfs_rq_throttled(cfs_rq))
4596 break;
4597 }
4598
4599 if (!se)
4600 add_nr_running(rq, task_delta);
4601
4602
4603
4604
4605
4606
4607 for_each_sched_entity(se) {
4608 cfs_rq = cfs_rq_of(se);
4609
4610 list_add_leaf_cfs_rq(cfs_rq);
4611 }
4612
4613 assert_list_leaf_cfs_rq(rq);
4614
4615
4616 if (rq->curr == rq->idle && rq->cfs.nr_running)
4617 resched_curr(rq);
4618 }
4619
4620 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
4621 {
4622 struct cfs_rq *cfs_rq;
4623 u64 runtime;
4624 u64 starting_runtime = remaining;
4625
4626 rcu_read_lock();
4627 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
4628 throttled_list) {
4629 struct rq *rq = rq_of(cfs_rq);
4630 struct rq_flags rf;
4631
4632 rq_lock_irqsave(rq, &rf);
4633 if (!cfs_rq_throttled(cfs_rq))
4634 goto next;
4635
4636
4637 SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
4638
4639 runtime = -cfs_rq->runtime_remaining + 1;
4640 if (runtime > remaining)
4641 runtime = remaining;
4642 remaining -= runtime;
4643
4644 cfs_rq->runtime_remaining += runtime;
4645
4646
4647 if (cfs_rq->runtime_remaining > 0)
4648 unthrottle_cfs_rq(cfs_rq);
4649
4650 next:
4651 rq_unlock_irqrestore(rq, &rf);
4652
4653 if (!remaining)
4654 break;
4655 }
4656 rcu_read_unlock();
4657
4658 return starting_runtime - remaining;
4659 }
4660
4661
4662
4663
4664
4665
4666
4667 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
4668 {
4669 u64 runtime;
4670 int throttled;
4671
4672
4673 if (cfs_b->quota == RUNTIME_INF)
4674 goto out_deactivate;
4675
4676 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4677 cfs_b->nr_periods += overrun;
4678
4679
4680
4681
4682
4683 if (cfs_b->idle && !throttled)
4684 goto out_deactivate;
4685
4686 __refill_cfs_bandwidth_runtime(cfs_b);
4687
4688 if (!throttled) {
4689
4690 cfs_b->idle = 1;
4691 return 0;
4692 }
4693
4694
4695 cfs_b->nr_throttled += overrun;
4696
4697
4698
4699
4700
4701
4702
4703
4704 while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
4705 runtime = cfs_b->runtime;
4706 cfs_b->distribute_running = 1;
4707 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4708
4709 runtime = distribute_cfs_runtime(cfs_b, runtime);
4710 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4711
4712 cfs_b->distribute_running = 0;
4713 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4714
4715 lsub_positive(&cfs_b->runtime, runtime);
4716 }
4717
4718
4719
4720
4721
4722
4723
4724 cfs_b->idle = 0;
4725
4726 return 0;
4727
4728 out_deactivate:
4729 return 1;
4730 }
4731
4732
4733 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
4734
4735 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
4736
4737 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
4738
4739
4740
4741
4742
4743
4744
4745
4746 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
4747 {
4748 struct hrtimer *refresh_timer = &cfs_b->period_timer;
4749 u64 remaining;
4750
4751
4752 if (hrtimer_callback_running(refresh_timer))
4753 return 1;
4754
4755
4756 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
4757 if (remaining < min_expire)
4758 return 1;
4759
4760 return 0;
4761 }
4762
4763 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
4764 {
4765 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
4766
4767
4768 if (runtime_refresh_within(cfs_b, min_left))
4769 return;
4770
4771
4772 if (cfs_b->slack_started)
4773 return;
4774 cfs_b->slack_started = true;
4775
4776 hrtimer_start(&cfs_b->slack_timer,
4777 ns_to_ktime(cfs_bandwidth_slack_period),
4778 HRTIMER_MODE_REL);
4779 }
4780
4781
4782 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4783 {
4784 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4785 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
4786
4787 if (slack_runtime <= 0)
4788 return;
4789
4790 raw_spin_lock(&cfs_b->lock);
4791 if (cfs_b->quota != RUNTIME_INF) {
4792 cfs_b->runtime += slack_runtime;
4793
4794
4795 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
4796 !list_empty(&cfs_b->throttled_cfs_rq))
4797 start_cfs_slack_bandwidth(cfs_b);
4798 }
4799 raw_spin_unlock(&cfs_b->lock);
4800
4801
4802 cfs_rq->runtime_remaining -= slack_runtime;
4803 }
4804
4805 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4806 {
4807 if (!cfs_bandwidth_used())
4808 return;
4809
4810 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
4811 return;
4812
4813 __return_cfs_rq_runtime(cfs_rq);
4814 }
4815
4816
4817
4818
4819
4820 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4821 {
4822 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
4823 unsigned long flags;
4824
4825
4826 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4827 cfs_b->slack_started = false;
4828 if (cfs_b->distribute_running) {
4829 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4830 return;
4831 }
4832
4833 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
4834 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4835 return;
4836 }
4837
4838 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
4839 runtime = cfs_b->runtime;
4840
4841 if (runtime)
4842 cfs_b->distribute_running = 1;
4843
4844 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4845
4846 if (!runtime)
4847 return;
4848
4849 runtime = distribute_cfs_runtime(cfs_b, runtime);
4850
4851 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4852 lsub_positive(&cfs_b->runtime, runtime);
4853 cfs_b->distribute_running = 0;
4854 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4855 }
4856
4857
4858
4859
4860
4861
4862 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
4863 {
4864 if (!cfs_bandwidth_used())
4865 return;
4866
4867
4868 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
4869 return;
4870
4871
4872 if (cfs_rq_throttled(cfs_rq))
4873 return;
4874
4875
4876 account_cfs_rq_runtime(cfs_rq, 0);
4877 if (cfs_rq->runtime_remaining <= 0)
4878 throttle_cfs_rq(cfs_rq);
4879 }
4880
4881 static void sync_throttle(struct task_group *tg, int cpu)
4882 {
4883 struct cfs_rq *pcfs_rq, *cfs_rq;
4884
4885 if (!cfs_bandwidth_used())
4886 return;
4887
4888 if (!tg->parent)
4889 return;
4890
4891 cfs_rq = tg->cfs_rq[cpu];
4892 pcfs_rq = tg->parent->cfs_rq[cpu];
4893
4894 cfs_rq->throttle_count = pcfs_rq->throttle_count;
4895 cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
4896 }
4897
4898
4899 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4900 {
4901 if (!cfs_bandwidth_used())
4902 return false;
4903
4904 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
4905 return false;
4906
4907
4908
4909
4910
4911 if (cfs_rq_throttled(cfs_rq))
4912 return true;
4913
4914 throttle_cfs_rq(cfs_rq);
4915 return true;
4916 }
4917
4918 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
4919 {
4920 struct cfs_bandwidth *cfs_b =
4921 container_of(timer, struct cfs_bandwidth, slack_timer);
4922
4923 do_sched_cfs_slack_timer(cfs_b);
4924
4925 return HRTIMER_NORESTART;
4926 }
4927
4928 extern const u64 max_cfs_quota_period;
4929
4930 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4931 {
4932 struct cfs_bandwidth *cfs_b =
4933 container_of(timer, struct cfs_bandwidth, period_timer);
4934 unsigned long flags;
4935 int overrun;
4936 int idle = 0;
4937 int count = 0;
4938
4939 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4940 for (;;) {
4941 overrun = hrtimer_forward_now(timer, cfs_b->period);
4942 if (!overrun)
4943 break;
4944
4945 if (++count > 3) {
4946 u64 new, old = ktime_to_ns(cfs_b->period);
4947
4948
4949
4950
4951
4952
4953 new = old * 2;
4954 if (new < max_cfs_quota_period) {
4955 cfs_b->period = ns_to_ktime(new);
4956 cfs_b->quota *= 2;
4957
4958 pr_warn_ratelimited(
4959 "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
4960 smp_processor_id(),
4961 div_u64(new, NSEC_PER_USEC),
4962 div_u64(cfs_b->quota, NSEC_PER_USEC));
4963 } else {
4964 pr_warn_ratelimited(
4965 "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
4966 smp_processor_id(),
4967 div_u64(old, NSEC_PER_USEC),
4968 div_u64(cfs_b->quota, NSEC_PER_USEC));
4969 }
4970
4971
4972 count = 0;
4973 }
4974
4975 idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
4976 }
4977 if (idle)
4978 cfs_b->period_active = 0;
4979 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4980
4981 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
4982 }
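/*
 * Illustrative, standalone sketch (not part of this source file): the
 * rescaling done by sched_cfs_period_timer() above when the period timer
 * keeps overrunning.  Doubling the period and the quota together leaves the
 * quota/period ratio, i.e. the configured bandwidth, unchanged while making
 * the timer fire less often.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t period_ns = 1000000;		/* 1 ms: pathologically short */
	uint64_t quota_ns  =  500000;		/* half of one CPU            */
	int i;

	for (i = 0; i < 3; i++) {
		period_ns *= 2;
		quota_ns  *= 2;
		printf("period %8llu ns, quota %8llu ns, ratio %.2f\n",
		       (unsigned long long)period_ns,
		       (unsigned long long)quota_ns,
		       (double)quota_ns / period_ns);	/* stays at 0.50 */
	}
	return 0;
}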
4983
4984 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4985 {
4986 raw_spin_lock_init(&cfs_b->lock);
4987 cfs_b->runtime = 0;
4988 cfs_b->quota = RUNTIME_INF;
4989 cfs_b->period = ns_to_ktime(default_cfs_period());
4990
4991 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
4992 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
4993 cfs_b->period_timer.function = sched_cfs_period_timer;
4994 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4995 cfs_b->slack_timer.function = sched_cfs_slack_timer;
4996 cfs_b->distribute_running = 0;
4997 cfs_b->slack_started = false;
4998 }
4999
5000 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5001 {
5002 cfs_rq->runtime_enabled = 0;
5003 INIT_LIST_HEAD(&cfs_rq->throttled_list);
5004 }
5005
5006 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5007 {
5008 lockdep_assert_held(&cfs_b->lock);
5009
5010 if (cfs_b->period_active)
5011 return;
5012
5013 cfs_b->period_active = 1;
5014 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
5015 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
5016 }
5017
5018 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5019 {
5020
5021 if (!cfs_b->throttled_cfs_rq.next)
5022 return;
5023
5024 hrtimer_cancel(&cfs_b->period_timer);
5025 hrtimer_cancel(&cfs_b->slack_timer);
5026 }
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036 static void __maybe_unused update_runtime_enabled(struct rq *rq)
5037 {
5038 struct task_group *tg;
5039
5040 lockdep_assert_held(&rq->lock);
5041
5042 rcu_read_lock();
5043 list_for_each_entry_rcu(tg, &task_groups, list) {
5044 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
5045 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5046
5047 raw_spin_lock(&cfs_b->lock);
5048 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
5049 raw_spin_unlock(&cfs_b->lock);
5050 }
5051 rcu_read_unlock();
5052 }
5053
5054
5055 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
5056 {
5057 struct task_group *tg;
5058
5059 lockdep_assert_held(&rq->lock);
5060
5061 rcu_read_lock();
5062 list_for_each_entry_rcu(tg, &task_groups, list) {
5063 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5064
5065 if (!cfs_rq->runtime_enabled)
5066 continue;
5067
5068
5069
5070
5071
5072 cfs_rq->runtime_remaining = 1;
5073
5074
5075
5076
5077 cfs_rq->runtime_enabled = 0;
5078
5079 if (cfs_rq_throttled(cfs_rq))
5080 unthrottle_cfs_rq(cfs_rq);
5081 }
5082 rcu_read_unlock();
5083 }
5084
5085 #else
5086
5087 static inline bool cfs_bandwidth_used(void)
5088 {
5089 return false;
5090 }
5091
5092 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
5093 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
5094 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
5095 static inline void sync_throttle(struct task_group *tg, int cpu) {}
5096 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5097
5098 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5099 {
5100 return 0;
5101 }
5102
5103 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5104 {
5105 return 0;
5106 }
5107
5108 static inline int throttled_lb_pair(struct task_group *tg,
5109 int src_cpu, int dest_cpu)
5110 {
5111 return 0;
5112 }
5113
5114 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5115
5116 #ifdef CONFIG_FAIR_GROUP_SCHED
5117 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
5118 #endif
5119
5120 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5121 {
5122 return NULL;
5123 }
5124 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5125 static inline void update_runtime_enabled(struct rq *rq) {}
5126 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
5127
5128 #endif
5129
5130
5131
5132
5133
5134 #ifdef CONFIG_SCHED_HRTICK
5135 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
5136 {
5137 struct sched_entity *se = &p->se;
5138 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5139
5140 SCHED_WARN_ON(task_rq(p) != rq);
5141
5142 if (rq->cfs.h_nr_running > 1) {
5143 u64 slice = sched_slice(cfs_rq, se);
5144 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
5145 s64 delta = slice - ran;
5146
5147 if (delta < 0) {
5148 if (rq->curr == p)
5149 resched_curr(rq);
5150 return;
5151 }
5152 hrtick_start(rq, delta);
5153 }
5154 }
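/*
 * Minimal userspace sketch of the arithmetic in hrtick_start_fair() above
 * (the single-runnable-task short circuit is skipped): the high-resolution
 * tick is armed for whatever is left of the entity's slice, and a negative
 * remainder means the slice is already used up, so reschedule immediately.
 * All values are nanoseconds.
 */
#include <stdint.h>
#include <stdio.h>

/* Returns the ns to wait before forcing preemption, or 0 for "resched now". */
static int64_t hrtick_delay_ns(uint64_t slice_ns,
			       uint64_t sum_exec_ns, uint64_t prev_sum_exec_ns)
{
	int64_t ran = (int64_t)(sum_exec_ns - prev_sum_exec_ns);
	int64_t delta = (int64_t)slice_ns - ran;

	return delta < 0 ? 0 : delta;
}

int main(void)
{
	/* 6ms slice, 4.5ms already consumed since the task was picked. */
	printf("arm hrtick in %lld ns\n",
	       (long long)hrtick_delay_ns(6000000, 10500000, 6000000));
	return 0;
}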
5155
5156
5157
5158
5159
5160
5161 static void hrtick_update(struct rq *rq)
5162 {
5163 struct task_struct *curr = rq->curr;
5164
5165 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
5166 return;
5167
5168 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
5169 hrtick_start_fair(rq, curr);
5170 }
5171 #else
5172 static inline void
5173 hrtick_start_fair(struct rq *rq, struct task_struct *p)
5174 {
5175 }
5176
5177 static inline void hrtick_update(struct rq *rq)
5178 {
5179 }
5180 #endif
5181
5182 #ifdef CONFIG_SMP
5183 static inline unsigned long cpu_util(int cpu);
5184
5185 static inline bool cpu_overutilized(int cpu)
5186 {
5187 return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
5188 }
5189
5190 static inline void update_overutilized_status(struct rq *rq)
5191 {
5192 if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
5193 WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
5194 trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
5195 }
5196 }
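/*
 * Sketch of the overutilization test behind cpu_overutilized() above.
 * fits_capacity() is defined elsewhere in this file; the 1280/1024
 * constants below follow the definition used in this kernel series
 * (roughly 20% headroom) and should be read as an assumption of the
 * sketch rather than the authoritative macro.
 */
#include <stdbool.h>
#include <stdio.h>

static bool fits_capacity_sketch(unsigned long util, unsigned long capacity)
{
	return util * 1280 < capacity * 1024;	/* util must stay under ~80% */
}

int main(void)
{
	/* A 1024-capacity CPU counts as overutilized once util crosses ~819. */
	printf("util 800 fits: %d, util 850 fits: %d\n",
	       fits_capacity_sketch(800, 1024), fits_capacity_sketch(850, 1024));
	return 0;
}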
5197 #else
5198 static inline void update_overutilized_status(struct rq *rq) { }
5199 #endif
5200
5201
5202
5203
5204
5205
5206 static void
5207 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5208 {
5209 struct cfs_rq *cfs_rq;
5210 struct sched_entity *se = &p->se;
5211 int idle_h_nr_running = task_has_idle_policy(p);
5212
5213
5214
5215
5216
5217
5218
5219 util_est_enqueue(&rq->cfs, p);
5220
5221
5222
5223
5224
5225
5226 if (p->in_iowait)
5227 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
5228
5229 for_each_sched_entity(se) {
5230 if (se->on_rq)
5231 break;
5232 cfs_rq = cfs_rq_of(se);
5233 enqueue_entity(cfs_rq, se, flags);
5234
5235 cfs_rq->h_nr_running++;
5236 cfs_rq->idle_h_nr_running += idle_h_nr_running;
5237
5238
5239 if (cfs_rq_throttled(cfs_rq))
5240 goto enqueue_throttle;
5241
5242 flags = ENQUEUE_WAKEUP;
5243 }
5244
5245 for_each_sched_entity(se) {
5246 cfs_rq = cfs_rq_of(se);
5247
5248 update_load_avg(cfs_rq, se, UPDATE_TG);
5249 update_cfs_group(se);
5250
5251 cfs_rq->h_nr_running++;
5252 cfs_rq->idle_h_nr_running += idle_h_nr_running;
5253
5254
5255 if (cfs_rq_throttled(cfs_rq))
5256 goto enqueue_throttle;
5257
5258
5259
5260
5261
5262 if (throttled_hierarchy(cfs_rq))
5263 list_add_leaf_cfs_rq(cfs_rq);
5264 }
5265
5266 enqueue_throttle:
5267 if (!se) {
5268 add_nr_running(rq, 1);
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283 if (flags & ENQUEUE_WAKEUP)
5284 update_overutilized_status(rq);
5285
5286 }
5287
5288 if (cfs_bandwidth_used()) {
5289
5290
5291
5292
5293
5294
5295 for_each_sched_entity(se) {
5296 cfs_rq = cfs_rq_of(se);
5297
5298 if (list_add_leaf_cfs_rq(cfs_rq))
5299 break;
5300 }
5301 }
5302
5303 assert_list_leaf_cfs_rq(rq);
5304
5305 hrtick_update(rq);
5306 }
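/*
 * Shape of the two loops in enqueue_task_fair() above, as a generic sketch
 * (the throttling early exits are omitted): walk up the group hierarchy
 * enqueueing entities until one is already queued, since its ancestors
 * must then be queued too, and keep walking only to update per-level
 * accounting.  The node type and fields are illustrative, not the kernel's.
 */
#include <stdbool.h>
#include <stddef.h>

struct node {
	struct node *parent;
	bool on_rq;
	unsigned int h_nr_running;
};

static void enqueue_hierarchy(struct node *se)
{
	struct node *n;

	/* Phase 1: actually enqueue until we meet an already-queued level. */
	for (n = se; n; n = n->parent) {
		if (n->on_rq)
			break;
		n->on_rq = true;
		n->h_nr_running++;
	}
	/* Phase 2: the remaining ancestors only need their counters bumped. */
	for (; n; n = n->parent)
		n->h_nr_running++;
}

int main(void)
{
	struct node root  = { .parent = NULL,  .on_rq = true,  .h_nr_running = 3 };
	struct node group = { .parent = &root, .on_rq = false, .h_nr_running = 0 };
	struct node task  = { .parent = &group, .on_rq = false, .h_nr_running = 0 };

	enqueue_hierarchy(&task);
	return (root.h_nr_running == 4 && group.h_nr_running == 1) ? 0 : 1;
}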
5307
5308 static void set_next_buddy(struct sched_entity *se);
5309
5310
5311
5312
5313
5314
5315 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5316 {
5317 struct cfs_rq *cfs_rq;
5318 struct sched_entity *se = &p->se;
5319 int task_sleep = flags & DEQUEUE_SLEEP;
5320 int idle_h_nr_running = task_has_idle_policy(p);
5321
5322 for_each_sched_entity(se) {
5323 cfs_rq = cfs_rq_of(se);
5324 dequeue_entity(cfs_rq, se, flags);
5325
5326 cfs_rq->h_nr_running--;
5327 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5328
5329
5330 if (cfs_rq_throttled(cfs_rq))
5331 goto dequeue_throttle;
5332
5333
5334 if (cfs_rq->load.weight) {
5335
5336 se = parent_entity(se);
5337
5338
5339
5340
5341 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
5342 set_next_buddy(se);
5343 break;
5344 }
5345 flags |= DEQUEUE_SLEEP;
5346 }
5347
5348 for_each_sched_entity(se) {
5349 cfs_rq = cfs_rq_of(se);
5350
5351 update_load_avg(cfs_rq, se, UPDATE_TG);
5352 update_cfs_group(se);
5353
5354 cfs_rq->h_nr_running--;
5355 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5356
5357
5358 if (cfs_rq_throttled(cfs_rq))
5359 goto dequeue_throttle;
5360
5361 }
5362
5363 dequeue_throttle:
5364 if (!se)
5365 sub_nr_running(rq, 1);
5366
5367 util_est_dequeue(&rq->cfs, p, task_sleep);
5368 hrtick_update(rq);
5369 }
5370
5371 #ifdef CONFIG_SMP
5372
5373
5374 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5375 DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5376
5377 #ifdef CONFIG_NO_HZ_COMMON
5378
5379 static struct {
5380 cpumask_var_t idle_cpus_mask;
5381 atomic_t nr_cpus;
5382 int has_blocked;
5383 unsigned long next_balance;
5384 unsigned long next_blocked;
5385 } nohz ____cacheline_aligned;
5386
5387 #endif
5388
5389
5390 static int sched_idle_cpu(int cpu)
5391 {
5392 struct rq *rq = cpu_rq(cpu);
5393
5394 return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
5395 rq->nr_running);
5396 }
5397
5398 static unsigned long cpu_runnable_load(struct rq *rq)
5399 {
5400 return cfs_rq_runnable_load_avg(&rq->cfs);
5401 }
5402
5403 static unsigned long capacity_of(int cpu)
5404 {
5405 return cpu_rq(cpu)->cpu_capacity;
5406 }
5407
5408 static unsigned long cpu_avg_load_per_task(int cpu)
5409 {
5410 struct rq *rq = cpu_rq(cpu);
5411 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
5412 unsigned long load_avg = cpu_runnable_load(rq);
5413
5414 if (nr_running)
5415 return load_avg / nr_running;
5416
5417 return 0;
5418 }
5419
5420 static void record_wakee(struct task_struct *p)
5421 {
5422
5423
5424
5425
5426 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
5427 current->wakee_flips >>= 1;
5428 current->wakee_flip_decay_ts = jiffies;
5429 }
5430
5431 if (current->last_wakee != p) {
5432 current->last_wakee = p;
5433 current->wakee_flips++;
5434 }
5435 }
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454 static int wake_wide(struct task_struct *p)
5455 {
5456 unsigned int master = current->wakee_flips;
5457 unsigned int slave = p->wakee_flips;
5458 int factor = this_cpu_read(sd_llc_size);
5459
5460 if (master < slave)
5461 swap(master, slave);
5462 if (slave < factor || master < slave * factor)
5463 return 0;
5464 return 1;
5465 }
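/*
 * Standalone sketch of the wake_wide() heuristic above: the waker/wakee
 * "flip" counts approximate how many distinct partners each task wakes.
 * Only when the smaller flip count reaches the LLC size and the larger
 * one exceeds the smaller by that factor is the wakeup spread beyond the
 * waker's LLC.  llc_size here stands in for sd_llc_size.
 */
#include <stdio.h>

static int wake_wide_sketch(unsigned int waker_flips, unsigned int wakee_flips,
			    unsigned int llc_size)
{
	unsigned int master = waker_flips, slave = wakee_flips;

	if (master < slave) {		/* keep master as the larger count */
		unsigned int tmp = master;
		master = slave;
		slave = tmp;
	}
	if (slave < llc_size || master < slave * llc_size)
		return 0;		/* 1:1-ish wakeups: stay cache-affine */
	return 1;			/* 1:N wakeups: search the wider domain */
}

int main(void)
{
	/* A server waking hundreds of distinct clients on a 4-CPU LLC. */
	printf("%d %d\n", wake_wide_sketch(400, 8, 4), wake_wide_sketch(3, 2, 4));
	return 0;
}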
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479 static int
5480 wake_affine_idle(int this_cpu, int prev_cpu, int sync)
5481 {
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494 if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
5495 return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
5496
5497 if (sync && cpu_rq(this_cpu)->nr_running == 1)
5498 return this_cpu;
5499
5500 return nr_cpumask_bits;
5501 }
5502
5503 static int
5504 wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5505 int this_cpu, int prev_cpu, int sync)
5506 {
5507 s64 this_eff_load, prev_eff_load;
5508 unsigned long task_load;
5509
5510 this_eff_load = cpu_runnable_load(cpu_rq(this_cpu));
5511
5512 if (sync) {
5513 unsigned long current_load = task_h_load(current);
5514
5515 if (current_load > this_eff_load)
5516 return this_cpu;
5517
5518 this_eff_load -= current_load;
5519 }
5520
5521 task_load = task_h_load(p);
5522
5523 this_eff_load += task_load;
5524 if (sched_feat(WA_BIAS))
5525 this_eff_load *= 100;
5526 this_eff_load *= capacity_of(prev_cpu);
5527
5528 prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu));
5529 prev_eff_load -= task_load;
5530 if (sched_feat(WA_BIAS))
5531 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
5532 prev_eff_load *= capacity_of(this_cpu);
5533
5534
5535
5536
5537
5538
5539
5540 if (sync)
5541 prev_eff_load += 1;
5542
5543 return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
5544 }
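/*
 * The core comparison in wake_affine_weight() above as a small sketch
 * (the sync-wakeup adjustment is left out): because the two CPUs may have
 * different capacities, each side's would-be load is cross-multiplied by
 * the other CPU's capacity before comparing, and the previous CPU gets
 * the imbalance_pct bias.  Inputs are the PELT-style load and capacity
 * numbers the kernel uses.
 */
#include <stdio.h>

/* Return 1 to pull the task to the waking CPU, 0 to leave it on prev. */
static int prefer_this_cpu(long this_load, long prev_load, long task_load,
			   long this_cap, long prev_cap, int imbalance_pct)
{
	long long this_eff = (long long)(this_load + task_load) * 100 * prev_cap;
	long long prev_eff = (long long)(prev_load - task_load) *
			     (100 + (imbalance_pct - 100) / 2) * this_cap;

	return this_eff < prev_eff;
}

int main(void)
{
	/* Lightly loaded big CPU (cap 1024) waking a task on a busier little CPU (cap 460). */
	printf("%d\n", prefer_this_cpu(100, 600, 200, 1024, 460, 125));
	return 0;
}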
5545
5546 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5547 int this_cpu, int prev_cpu, int sync)
5548 {
5549 int target = nr_cpumask_bits;
5550
5551 if (sched_feat(WA_IDLE))
5552 target = wake_affine_idle(this_cpu, prev_cpu, sync);
5553
5554 if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
5555 target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
5556
5557 schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
5558 if (target == nr_cpumask_bits)
5559 return prev_cpu;
5560
5561 schedstat_inc(sd->ttwu_move_affine);
5562 schedstat_inc(p->se.statistics.nr_wakeups_affine);
5563 return target;
5564 }
5565
5566 static unsigned long cpu_util_without(int cpu, struct task_struct *p);
5567
5568 static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
5569 {
5570 return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
5571 }
5572
5573
5574
5575
5576
5577
5578
5579 static struct sched_group *
5580 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5581 int this_cpu, int sd_flag)
5582 {
5583 struct sched_group *idlest = NULL, *group = sd->groups;
5584 struct sched_group *most_spare_sg = NULL;
5585 unsigned long min_runnable_load = ULONG_MAX;
5586 unsigned long this_runnable_load = ULONG_MAX;
5587 unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
5588 unsigned long most_spare = 0, this_spare = 0;
5589 int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
5590 unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
5591 (sd->imbalance_pct-100) / 100;
5592
5593 do {
5594 unsigned long load, avg_load, runnable_load;
5595 unsigned long spare_cap, max_spare_cap;
5596 int local_group;
5597 int i;
5598
5599
5600 if (!cpumask_intersects(sched_group_span(group),
5601 p->cpus_ptr))
5602 continue;
5603
5604 local_group = cpumask_test_cpu(this_cpu,
5605 sched_group_span(group));
5606
5607
5608
5609
5610
5611 avg_load = 0;
5612 runnable_load = 0;
5613 max_spare_cap = 0;
5614
5615 for_each_cpu(i, sched_group_span(group)) {
5616 load = cpu_runnable_load(cpu_rq(i));
5617 runnable_load += load;
5618
5619 avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
5620
5621 spare_cap = capacity_spare_without(i, p);
5622
5623 if (spare_cap > max_spare_cap)
5624 max_spare_cap = spare_cap;
5625 }
5626
5627
5628 avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
5629 group->sgc->capacity;
5630 runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
5631 group->sgc->capacity;
5632
5633 if (local_group) {
5634 this_runnable_load = runnable_load;
5635 this_avg_load = avg_load;
5636 this_spare = max_spare_cap;
5637 } else {
5638 if (min_runnable_load > (runnable_load + imbalance)) {
5639
5640
5641
5642
5643 min_runnable_load = runnable_load;
5644 min_avg_load = avg_load;
5645 idlest = group;
5646 } else if ((runnable_load < (min_runnable_load + imbalance)) &&
5647 (100*min_avg_load > imbalance_scale*avg_load)) {
5648
5649
5650
5651
5652 min_avg_load = avg_load;
5653 idlest = group;
5654 }
5655
5656 if (most_spare < max_spare_cap) {
5657 most_spare = max_spare_cap;
5658 most_spare_sg = group;
5659 }
5660 }
5661 } while (group = group->next, group != sd->groups);
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674 if (sd_flag & SD_BALANCE_FORK)
5675 goto skip_spare;
5676
5677 if (this_spare > task_util(p) / 2 &&
5678 imbalance_scale*this_spare > 100*most_spare)
5679 return NULL;
5680
5681 if (most_spare > task_util(p) / 2)
5682 return most_spare_sg;
5683
5684 skip_spare:
5685 if (!idlest)
5686 return NULL;
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696 if ((sd->flags & SD_NUMA) &&
5697 min_runnable_load + imbalance >= this_runnable_load)
5698 return NULL;
5699
5700 if (min_runnable_load > (this_runnable_load + imbalance))
5701 return NULL;
5702
5703 if ((this_runnable_load < (min_runnable_load + imbalance)) &&
5704 (100*this_avg_load < imbalance_scale*min_avg_load))
5705 return NULL;
5706
5707 return idlest;
5708 }
5709
5710
5711
5712
5713 static int
5714 find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5715 {
5716 unsigned long load, min_load = ULONG_MAX;
5717 unsigned int min_exit_latency = UINT_MAX;
5718 u64 latest_idle_timestamp = 0;
5719 int least_loaded_cpu = this_cpu;
5720 int shallowest_idle_cpu = -1, si_cpu = -1;
5721 int i;
5722
5723
5724 if (group->group_weight == 1)
5725 return cpumask_first(sched_group_span(group));
5726
5727
5728 for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
5729 if (available_idle_cpu(i)) {
5730 struct rq *rq = cpu_rq(i);
5731 struct cpuidle_state *idle = idle_get_state(rq);
5732 if (idle && idle->exit_latency < min_exit_latency) {
5733
5734
5735
5736
5737
5738 min_exit_latency = idle->exit_latency;
5739 latest_idle_timestamp = rq->idle_stamp;
5740 shallowest_idle_cpu = i;
5741 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
5742 rq->idle_stamp > latest_idle_timestamp) {
5743
5744
5745
5746
5747
5748 latest_idle_timestamp = rq->idle_stamp;
5749 shallowest_idle_cpu = i;
5750 }
5751 } else if (shallowest_idle_cpu == -1 && si_cpu == -1) {
5752 if (sched_idle_cpu(i)) {
5753 si_cpu = i;
5754 continue;
5755 }
5756
5757 load = cpu_runnable_load(cpu_rq(i));
5758 if (load < min_load) {
5759 min_load = load;
5760 least_loaded_cpu = i;
5761 }
5762 }
5763 }
5764
5765 if (shallowest_idle_cpu != -1)
5766 return shallowest_idle_cpu;
5767 if (si_cpu != -1)
5768 return si_cpu;
5769 return least_loaded_cpu;
5770 }
5771
5772 static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
5773 int cpu, int prev_cpu, int sd_flag)
5774 {
5775 int new_cpu = cpu;
5776
5777 if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
5778 return prev_cpu;
5779
5780
5781
5782
5783
5784 if (!(sd_flag & SD_BALANCE_FORK))
5785 sync_entity_load_avg(&p->se);
5786
5787 while (sd) {
5788 struct sched_group *group;
5789 struct sched_domain *tmp;
5790 int weight;
5791
5792 if (!(sd->flags & sd_flag)) {
5793 sd = sd->child;
5794 continue;
5795 }
5796
5797 group = find_idlest_group(sd, p, cpu, sd_flag);
5798 if (!group) {
5799 sd = sd->child;
5800 continue;
5801 }
5802
5803 new_cpu = find_idlest_group_cpu(group, p, cpu);
5804 if (new_cpu == cpu) {
5805
5806 sd = sd->child;
5807 continue;
5808 }
5809
5810
5811 cpu = new_cpu;
5812 weight = sd->span_weight;
5813 sd = NULL;
5814 for_each_domain(cpu, tmp) {
5815 if (weight <= tmp->span_weight)
5816 break;
5817 if (tmp->flags & sd_flag)
5818 sd = tmp;
5819 }
5820 }
5821
5822 return new_cpu;
5823 }
5824
5825 #ifdef CONFIG_SCHED_SMT
5826 DEFINE_STATIC_KEY_FALSE(sched_smt_present);
5827 EXPORT_SYMBOL_GPL(sched_smt_present);
5828
5829 static inline void set_idle_cores(int cpu, int val)
5830 {
5831 struct sched_domain_shared *sds;
5832
5833 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5834 if (sds)
5835 WRITE_ONCE(sds->has_idle_cores, val);
5836 }
5837
5838 static inline bool test_idle_cores(int cpu, bool def)
5839 {
5840 struct sched_domain_shared *sds;
5841
5842 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5843 if (sds)
5844 return READ_ONCE(sds->has_idle_cores);
5845
5846 return def;
5847 }
5848
5849
5850
5851
5852
5853
5854
5855
5856 void __update_idle_core(struct rq *rq)
5857 {
5858 int core = cpu_of(rq);
5859 int cpu;
5860
5861 rcu_read_lock();
5862 if (test_idle_cores(core, true))
5863 goto unlock;
5864
5865 for_each_cpu(cpu, cpu_smt_mask(core)) {
5866 if (cpu == core)
5867 continue;
5868
5869 if (!available_idle_cpu(cpu))
5870 goto unlock;
5871 }
5872
5873 set_idle_cores(core, 1);
5874 unlock:
5875 rcu_read_unlock();
5876 }
5877
5878
5879
5880
5881
5882
5883 static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
5884 {
5885 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
5886 int core, cpu;
5887
5888 if (!static_branch_likely(&sched_smt_present))
5889 return -1;
5890
5891 if (!test_idle_cores(target, false))
5892 return -1;
5893
5894 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
5895
5896 for_each_cpu_wrap(core, cpus, target) {
5897 bool idle = true;
5898
5899 for_each_cpu(cpu, cpu_smt_mask(core)) {
5900 __cpumask_clear_cpu(cpu, cpus);
5901 if (!available_idle_cpu(cpu))
5902 idle = false;
5903 }
5904
5905 if (idle)
5906 return core;
5907 }
5908
5909
5910
5911
5912 set_idle_cores(target, 0);
5913
5914 return -1;
5915 }
5916
5917
5918
5919
5920 static int select_idle_smt(struct task_struct *p, int target)
5921 {
5922 int cpu, si_cpu = -1;
5923
5924 if (!static_branch_likely(&sched_smt_present))
5925 return -1;
5926
5927 for_each_cpu(cpu, cpu_smt_mask(target)) {
5928 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
5929 continue;
5930 if (available_idle_cpu(cpu))
5931 return cpu;
5932 if (si_cpu == -1 && sched_idle_cpu(cpu))
5933 si_cpu = cpu;
5934 }
5935
5936 return si_cpu;
5937 }
5938
5939 #else
5940
5941 static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
5942 {
5943 return -1;
5944 }
5945
5946 static inline int select_idle_smt(struct task_struct *p, int target)
5947 {
5948 return -1;
5949 }
5950
5951 #endif
5952
5953
5954
5955
5956
5957
5958 static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
5959 {
5960 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
5961 struct sched_domain *this_sd;
5962 u64 avg_cost, avg_idle;
5963 u64 time, cost;
5964 s64 delta;
5965 int this = smp_processor_id();
5966 int cpu, nr = INT_MAX, si_cpu = -1;
5967
5968 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
5969 if (!this_sd)
5970 return -1;
5971
5972
5973
5974
5975
5976 avg_idle = this_rq()->avg_idle / 512;
5977 avg_cost = this_sd->avg_scan_cost + 1;
5978
5979 if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
5980 return -1;
5981
5982 if (sched_feat(SIS_PROP)) {
5983 u64 span_avg = sd->span_weight * avg_idle;
5984 if (span_avg > 4*avg_cost)
5985 nr = div_u64(span_avg, avg_cost);
5986 else
5987 nr = 4;
5988 }
5989
5990 time = cpu_clock(this);
5991
5992 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
5993
5994 for_each_cpu_wrap(cpu, cpus, target) {
5995 if (!--nr)
5996 return si_cpu;
5997 if (available_idle_cpu(cpu))
5998 break;
5999 if (si_cpu == -1 && sched_idle_cpu(cpu))
6000 si_cpu = cpu;
6001 }
6002
6003 time = cpu_clock(this) - time;
6004 cost = this_sd->avg_scan_cost;
6005 delta = (s64)(time - cost) / 8;
6006 this_sd->avg_scan_cost += delta;
6007
6008 return cpu;
6009 }
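/*
 * Sketch of the SIS_PROP scan budget computed in select_idle_cpu() above:
 * the number of CPUs probed is proportional to how idle this CPU has
 * recently been relative to the average cost of a previous scan, with a
 * floor of 4, and the per-domain average scan cost is maintained as a
 * 1/8 exponentially weighted moving average.
 */
#include <stdint.h>
#include <stdio.h>

static int scan_budget(uint64_t avg_idle_ns, uint64_t avg_scan_cost_ns,
		       unsigned int span_weight)
{
	uint64_t avg_idle = avg_idle_ns / 512;		/* same scaling as above */
	uint64_t avg_cost = avg_scan_cost_ns + 1;
	uint64_t span_avg = (uint64_t)span_weight * avg_idle;

	if (span_avg > 4 * avg_cost)
		return (int)(span_avg / avg_cost);
	return 4;
}

/* 1/8 EWMA update of the scan cost, mirroring the delta/8 step above. */
static uint64_t update_avg_scan_cost(uint64_t old_cost, uint64_t this_scan_ns)
{
	return old_cost + (int64_t)(this_scan_ns - old_cost) / 8;
}

int main(void)
{
	/* ~2ms of recent idle time, 3us average scan cost, 24-CPU LLC. */
	printf("budget=%d new_cost=%llu\n",
	       scan_budget(2000000, 3000, 24),
	       (unsigned long long)update_avg_scan_cost(3000, 5000));
	return 0;
}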
6010
6011
6012
6013
6014 static int select_idle_sibling(struct task_struct *p, int prev, int target)
6015 {
6016 struct sched_domain *sd;
6017 int i, recent_used_cpu;
6018
6019 if (available_idle_cpu(target) || sched_idle_cpu(target))
6020 return target;
6021
6022
6023
6024
6025 if (prev != target && cpus_share_cache(prev, target) &&
6026 (available_idle_cpu(prev) || sched_idle_cpu(prev)))
6027 return prev;
6028
6029
6030 recent_used_cpu = p->recent_used_cpu;
6031 if (recent_used_cpu != prev &&
6032 recent_used_cpu != target &&
6033 cpus_share_cache(recent_used_cpu, target) &&
6034 (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
6035 cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
6036
6037
6038
6039
6040 p->recent_used_cpu = prev;
6041 return recent_used_cpu;
6042 }
6043
6044 sd = rcu_dereference(per_cpu(sd_llc, target));
6045 if (!sd)
6046 return target;
6047
6048 i = select_idle_core(p, sd, target);
6049 if ((unsigned)i < nr_cpumask_bits)
6050 return i;
6051
6052 i = select_idle_cpu(p, sd, target);
6053 if ((unsigned)i < nr_cpumask_bits)
6054 return i;
6055
6056 i = select_idle_smt(p, target);
6057 if ((unsigned)i < nr_cpumask_bits)
6058 return i;
6059
6060 return target;
6061 }
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101 static inline unsigned long cpu_util(int cpu)
6102 {
6103 struct cfs_rq *cfs_rq;
6104 unsigned int util;
6105
6106 cfs_rq = &cpu_rq(cpu)->cfs;
6107 util = READ_ONCE(cfs_rq->avg.util_avg);
6108
6109 if (sched_feat(UTIL_EST))
6110 util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
6111
6112 return min_t(unsigned long, util, capacity_orig_of(cpu));
6113 }
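/*
 * The clamp performed by cpu_util() above, in isolation: CPU utilization
 * is the larger of the decayed PELT average and (with UTIL_EST enabled)
 * the util_est sum of enqueued tasks, capped at the CPU's original
 * capacity.  Values use the usual 0..1024 capacity scale.
 */
#include <stdio.h>

static unsigned long cpu_util_sketch(unsigned long util_avg,
				     unsigned long util_est_enqueued,
				     unsigned long capacity_orig)
{
	unsigned long util = util_avg;

	if (util_est_enqueued > util)	/* UTIL_EST: trust recent enqueues */
		util = util_est_enqueued;
	return util < capacity_orig ? util : capacity_orig;
}

int main(void)
{
	/* PELT has decayed to 300 but the enqueued tasks recently ran at 700. */
	printf("%lu\n", cpu_util_sketch(300, 700, 1024));
	return 0;
}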
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128 static unsigned long cpu_util_without(int cpu, struct task_struct *p)
6129 {
6130 struct cfs_rq *cfs_rq;
6131 unsigned int util;
6132
6133
6134 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
6135 return cpu_util(cpu);
6136
6137 cfs_rq = &cpu_rq(cpu)->cfs;
6138 util = READ_ONCE(cfs_rq->avg.util_avg);
6139
6140
6141 lsub_positive(&util, task_util(p));
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169 if (sched_feat(UTIL_EST)) {
6170 unsigned int estimated =
6171 READ_ONCE(cfs_rq->avg.util_est.enqueued);
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190 if (unlikely(task_on_rq_queued(p) || current == p))
6191 lsub_positive(&estimated, _task_util_est(p));
6192
6193 util = max(util, estimated);
6194 }
6195
6196
6197
6198
6199
6200
6201 return min_t(unsigned long, util, capacity_orig_of(cpu));
6202 }
6203
6204
6205
6206
6207
6208
6209
6210
6211 static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
6212 {
6213 long min_cap, max_cap;
6214
6215 if (!static_branch_unlikely(&sched_asym_cpucapacity))
6216 return 0;
6217
6218 min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
6219 max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
6220
6221
6222 if (max_cap - min_cap < max_cap >> 3)
6223 return 0;
6224
6225
6226 sync_entity_load_avg(&p->se);
6227
6228 return !task_fits_capacity(p, min_cap);
6229 }
6230
6231
6232
6233
6234
6235 static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
6236 {
6237 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
6238 unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
6239
6240
6241
6242
6243
6244
6245
6246 if (task_cpu(p) == cpu && dst_cpu != cpu)
6247 sub_positive(&util, task_util(p));
6248 else if (task_cpu(p) != cpu && dst_cpu == cpu)
6249 util += task_util(p);
6250
6251 if (sched_feat(UTIL_EST)) {
6252 util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
6253
6254
6255
6256
6257
6258
6259
6260 if (dst_cpu == cpu)
6261 util_est += _task_util_est(p);
6262
6263 util = max(util, util_est);
6264 }
6265
6266 return min(util, capacity_orig_of(cpu));
6267 }
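/*
 * What-if variant sketched from cpu_util_next() above: predict a CPU's
 * utilization assuming task p ends up on dst_cpu, by removing p's
 * contribution from its current CPU and adding it to the destination.
 * Simplified: no util_est term, same 0..1024 scale.
 */
#include <stdio.h>

static unsigned long cpu_util_next_sketch(unsigned long cpu_util,
					  unsigned long task_util,
					  int cpu, int task_cpu, int dst_cpu)
{
	if (task_cpu == cpu && dst_cpu != cpu)		/* task would leave */
		cpu_util = cpu_util > task_util ? cpu_util - task_util : 0;
	else if (task_cpu != cpu && dst_cpu == cpu)	/* task would arrive */
		cpu_util += task_util;
	return cpu_util;
}

int main(void)
{
	/* Task of util 150 currently on CPU1, evaluated for a move to CPU2. */
	printf("cpu1=%lu cpu2=%lu\n",
	       cpu_util_next_sketch(500, 150, 1, 1, 2),
	       cpu_util_next_sketch(200, 150, 2, 1, 2));
	return 0;
}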
6268
6269
6270
6271
6272
6273
6274
6275
6276 static long
6277 compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
6278 {
6279 struct cpumask *pd_mask = perf_domain_span(pd);
6280 unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
6281 unsigned long max_util = 0, sum_util = 0;
6282 int cpu;
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293 for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
6294 unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
6295 struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
6296
6297
6298
6299
6300
6301
6302
6303 sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6304 ENERGY_UTIL, NULL);
6305
6306
6307
6308
6309
6310
6311
6312
6313 cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6314 FREQUENCY_UTIL, tsk);
6315 max_util = max(max_util, cpu_util);
6316 }
6317
6318 return em_pd_energy(pd->em_pd, max_util, sum_util);
6319 }
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360 static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
6361 {
6362 unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
6363 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
6364 unsigned long cpu_cap, util, base_energy = 0;
6365 int cpu, best_energy_cpu = prev_cpu;
6366 struct sched_domain *sd;
6367 struct perf_domain *pd;
6368
6369 rcu_read_lock();
6370 pd = rcu_dereference(rd->pd);
6371 if (!pd || READ_ONCE(rd->overutilized))
6372 goto fail;
6373
6374
6375
6376
6377
6378 sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
6379 while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
6380 sd = sd->parent;
6381 if (!sd)
6382 goto fail;
6383
6384 sync_entity_load_avg(&p->se);
6385 if (!task_util_est(p))
6386 goto unlock;
6387
6388 for (; pd; pd = pd->next) {
6389 unsigned long cur_delta, spare_cap, max_spare_cap = 0;
6390 unsigned long base_energy_pd;
6391 int max_spare_cap_cpu = -1;
6392
6393
6394 base_energy_pd = compute_energy(p, -1, pd);
6395 base_energy += base_energy_pd;
6396
6397 for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
6398 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6399 continue;
6400
6401
6402 util = cpu_util_next(cpu, p, cpu);
6403 cpu_cap = capacity_of(cpu);
6404 if (!fits_capacity(util, cpu_cap))
6405 continue;
6406
6407
6408 if (cpu == prev_cpu) {
6409 prev_delta = compute_energy(p, prev_cpu, pd);
6410 prev_delta -= base_energy_pd;
6411 best_delta = min(best_delta, prev_delta);
6412 }
6413
6414
6415
6416
6417
6418 spare_cap = cpu_cap - util;
6419 if (spare_cap > max_spare_cap) {
6420 max_spare_cap = spare_cap;
6421 max_spare_cap_cpu = cpu;
6422 }
6423 }
6424
6425
6426 if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) {
6427 cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
6428 cur_delta -= base_energy_pd;
6429 if (cur_delta < best_delta) {
6430 best_delta = cur_delta;
6431 best_energy_cpu = max_spare_cap_cpu;
6432 }
6433 }
6434 }
6435 unlock:
6436 rcu_read_unlock();
6437
6438
6439
6440
6441
6442 if (prev_delta == ULONG_MAX)
6443 return best_energy_cpu;
6444
6445 if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
6446 return best_energy_cpu;
6447
6448 return prev_cpu;
6449
6450 fail:
6451 rcu_read_unlock();
6452
6453 return -1;
6454 }
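/*
 * The acceptance test at the end of find_energy_efficient_cpu() above,
 * spelled out: a candidate other than prev_cpu is only chosen when the
 * estimated saving exceeds roughly 1/16 (about 6%) of prev_cpu's total
 * energy estimate, which filters out moves whose benefit is within the
 * noise of the energy model.
 */
#include <stdio.h>

static int accept_candidate(unsigned long prev_delta, unsigned long best_delta,
			    unsigned long base_energy)
{
	if (prev_delta == (unsigned long)-1)	/* prev_cpu was not a valid fit */
		return 1;
	return (prev_delta - best_delta) > ((prev_delta + base_energy) >> 4);
}

int main(void)
{
	/* A saving of 10 units against a ~200-unit baseline is not worth moving. */
	printf("%d %d\n", accept_candidate(50, 40, 150),
			  accept_candidate(50, 20, 150));
	return 0;
}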
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468 static int
6469 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
6470 {
6471 struct sched_domain *tmp, *sd = NULL;
6472 int cpu = smp_processor_id();
6473 int new_cpu = prev_cpu;
6474 int want_affine = 0;
6475 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
6476
6477 if (sd_flag & SD_BALANCE_WAKE) {
6478 record_wakee(p);
6479
6480 if (sched_energy_enabled()) {
6481 new_cpu = find_energy_efficient_cpu(p, prev_cpu);
6482 if (new_cpu >= 0)
6483 return new_cpu;
6484 new_cpu = prev_cpu;
6485 }
6486
6487 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
6488 cpumask_test_cpu(cpu, p->cpus_ptr);
6489 }
6490
6491 rcu_read_lock();
6492 for_each_domain(cpu, tmp) {
6493 if (!(tmp->flags & SD_LOAD_BALANCE))
6494 break;
6495
6496
6497
6498
6499
6500 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
6501 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
6502 if (cpu != prev_cpu)
6503 new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
6504
6505 sd = NULL;
6506 break;
6507 }
6508
6509 if (tmp->flags & sd_flag)
6510 sd = tmp;
6511 else if (!want_affine)
6512 break;
6513 }
6514
6515 if (unlikely(sd)) {
6516
6517 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
6518 } else if (sd_flag & SD_BALANCE_WAKE) {
6519
6520
6521 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
6522
6523 if (want_affine)
6524 current->recent_used_cpu = cpu;
6525 }
6526 rcu_read_unlock();
6527
6528 return new_cpu;
6529 }
6530
6531 static void detach_entity_cfs_rq(struct sched_entity *se);
6532
6533
6534
6535
6536
6537
6538 static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
6539 {
6540
6541
6542
6543
6544
6545
6546 if (p->state == TASK_WAKING) {
6547 struct sched_entity *se = &p->se;
6548 struct cfs_rq *cfs_rq = cfs_rq_of(se);
6549 u64 min_vruntime;
6550
6551 #ifndef CONFIG_64BIT
6552 u64 min_vruntime_copy;
6553
6554 do {
6555 min_vruntime_copy = cfs_rq->min_vruntime_copy;
6556 smp_rmb();
6557 min_vruntime = cfs_rq->min_vruntime;
6558 } while (min_vruntime != min_vruntime_copy);
6559 #else
6560 min_vruntime = cfs_rq->min_vruntime;
6561 #endif
6562
6563 se->vruntime -= min_vruntime;
6564 }
6565
6566 if (p->on_rq == TASK_ON_RQ_MIGRATING) {
6567
6568
6569
6570
6571 lockdep_assert_held(&task_rq(p)->lock);
6572 detach_entity_cfs_rq(&p->se);
6573
6574 } else {
6575
6576
6577
6578
6579
6580
6581
6582
6583 remove_entity_load_avg(&p->se);
6584 }
6585
6586
6587 p->se.avg.last_update_time = 0;
6588
6589
6590 p->se.exec_start = 0;
6591
6592 update_scan_period(p, new_cpu);
6593 }
6594
6595 static void task_dead_fair(struct task_struct *p)
6596 {
6597 remove_entity_load_avg(&p->se);
6598 }
6599
6600 static int
6601 balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
6602 {
6603 if (rq->nr_running)
6604 return 1;
6605
6606 return newidle_balance(rq, rf) != 0;
6607 }
6608 #endif
6609
6610 static unsigned long wakeup_gran(struct sched_entity *se)
6611 {
6612 unsigned long gran = sysctl_sched_wakeup_granularity;
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627 return calc_delta_fair(gran, se);
6628 }
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644 static int
6645 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
6646 {
6647 s64 gran, vdiff = curr->vruntime - se->vruntime;
6648
6649 if (vdiff <= 0)
6650 return -1;
6651
6652 gran = wakeup_gran(se);
6653 if (vdiff > gran)
6654 return 1;
6655
6656 return 0;
6657 }
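/*
 * The three-way decision of wakeup_preempt_entity() above as a sketch:
 * a waking entity only preempts the current one when it is behind in
 * virtual time by more than the wakeup granularity (which the real code
 * scales by the wakee's weight via calc_delta_fair(); a plain nanosecond
 * value is used here).
 */
#include <stdint.h>
#include <stdio.h>

/* -1: don't preempt, 0: behind but within granularity, 1: preempt. */
static int wakeup_preempt_sketch(uint64_t curr_vruntime, uint64_t se_vruntime,
				 uint64_t wakeup_gran_ns)
{
	int64_t vdiff = (int64_t)(curr_vruntime - se_vruntime);

	if (vdiff <= 0)
		return -1;
	if (vdiff > (int64_t)wakeup_gran_ns)
		return 1;
	return 0;
}

int main(void)
{
	/* Wakeup granularity of 1ms for the example. */
	printf("%d %d %d\n",
	       wakeup_preempt_sketch(10000000, 12000000, 1000000),
	       wakeup_preempt_sketch(10000000,  9500000, 1000000),
	       wakeup_preempt_sketch(10000000,  8000000, 1000000));
	return 0;
}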
6658
6659 static void set_last_buddy(struct sched_entity *se)
6660 {
6661 if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
6662 return;
6663
6664 for_each_sched_entity(se) {
6665 if (SCHED_WARN_ON(!se->on_rq))
6666 return;
6667 cfs_rq_of(se)->last = se;
6668 }
6669 }
6670
6671 static void set_next_buddy(struct sched_entity *se)
6672 {
6673 if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
6674 return;
6675
6676 for_each_sched_entity(se) {
6677 if (SCHED_WARN_ON(!se->on_rq))
6678 return;
6679 cfs_rq_of(se)->next = se;
6680 }
6681 }
6682
6683 static void set_skip_buddy(struct sched_entity *se)
6684 {
6685 for_each_sched_entity(se)
6686 cfs_rq_of(se)->skip = se;
6687 }
6688
6689
6690
6691
6692 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
6693 {
6694 struct task_struct *curr = rq->curr;
6695 struct sched_entity *se = &curr->se, *pse = &p->se;
6696 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
6697 int scale = cfs_rq->nr_running >= sched_nr_latency;
6698 int next_buddy_marked = 0;
6699
6700 if (unlikely(se == pse))
6701 return;
6702
6703
6704
6705
6706
6707
6708
6709 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
6710 return;
6711
6712 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
6713 set_next_buddy(pse);
6714 next_buddy_marked = 1;
6715 }
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727 if (test_tsk_need_resched(curr))
6728 return;
6729
6730
6731 if (unlikely(task_has_idle_policy(curr)) &&
6732 likely(!task_has_idle_policy(p)))
6733 goto preempt;
6734
6735
6736
6737
6738
6739 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
6740 return;
6741
6742 find_matching_se(&se, &pse);
6743 update_curr(cfs_rq_of(se));
6744 BUG_ON(!pse);
6745 if (wakeup_preempt_entity(se, pse) == 1) {
6746
6747
6748
6749
6750 if (!next_buddy_marked)
6751 set_next_buddy(pse);
6752 goto preempt;
6753 }
6754
6755 return;
6756
6757 preempt:
6758 resched_curr(rq);
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768 if (unlikely(!se->on_rq || curr == rq->idle))
6769 return;
6770
6771 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
6772 set_last_buddy(se);
6773 }
6774
6775 static struct task_struct *
6776 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
6777 {
6778 struct cfs_rq *cfs_rq = &rq->cfs;
6779 struct sched_entity *se;
6780 struct task_struct *p;
6781 int new_tasks;
6782
6783 again:
6784 if (!sched_fair_runnable(rq))
6785 goto idle;
6786
6787 #ifdef CONFIG_FAIR_GROUP_SCHED
6788 if (!prev || prev->sched_class != &fair_sched_class)
6789 goto simple;
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799 do {
6800 struct sched_entity *curr = cfs_rq->curr;
6801
6802
6803
6804
6805
6806
6807
6808 if (curr) {
6809 if (curr->on_rq)
6810 update_curr(cfs_rq);
6811 else
6812 curr = NULL;
6813
6814
6815
6816
6817
6818
6819
6820 if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
6821 cfs_rq = &rq->cfs;
6822
6823 if (!cfs_rq->nr_running)
6824 goto idle;
6825
6826 goto simple;
6827 }
6828 }
6829
6830 se = pick_next_entity(cfs_rq, curr);
6831 cfs_rq = group_cfs_rq(se);
6832 } while (cfs_rq);
6833
6834 p = task_of(se);
6835
6836
6837
6838
6839
6840
6841 if (prev != p) {
6842 struct sched_entity *pse = &prev->se;
6843
6844 while (!(cfs_rq = is_same_group(se, pse))) {
6845 int se_depth = se->depth;
6846 int pse_depth = pse->depth;
6847
6848 if (se_depth <= pse_depth) {
6849 put_prev_entity(cfs_rq_of(pse), pse);
6850 pse = parent_entity(pse);
6851 }
6852 if (se_depth >= pse_depth) {
6853 set_next_entity(cfs_rq_of(se), se);
6854 se = parent_entity(se);
6855 }
6856 }
6857
6858 put_prev_entity(cfs_rq, pse);
6859 set_next_entity(cfs_rq, se);
6860 }
6861
6862 goto done;
6863 simple:
6864 #endif
6865 if (prev)
6866 put_prev_task(rq, prev);
6867
6868 do {
6869 se = pick_next_entity(cfs_rq, NULL);
6870 set_next_entity(cfs_rq, se);
6871 cfs_rq = group_cfs_rq(se);
6872 } while (cfs_rq);
6873
6874 p = task_of(se);
6875
6876 done: __maybe_unused;
6877 #ifdef CONFIG_SMP
6878
6879
6880
6881
6882
6883 list_move(&p->se.group_node, &rq->cfs_tasks);
6884 #endif
6885
6886 if (hrtick_enabled(rq))
6887 hrtick_start_fair(rq, p);
6888
6889 update_misfit_status(p, rq);
6890
6891 return p;
6892
6893 idle:
6894 if (!rf)
6895 return NULL;
6896
6897 new_tasks = newidle_balance(rq, rf);
6898
6899
6900
6901
6902
6903
6904 if (new_tasks < 0)
6905 return RETRY_TASK;
6906
6907 if (new_tasks > 0)
6908 goto again;
6909
6910
6911
6912
6913
6914 update_idle_rq_clock_pelt(rq);
6915
6916 return NULL;
6917 }
6918
6919
6920
6921
6922 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
6923 {
6924 struct sched_entity *se = &prev->se;
6925 struct cfs_rq *cfs_rq;
6926
6927 for_each_sched_entity(se) {
6928 cfs_rq = cfs_rq_of(se);
6929 put_prev_entity(cfs_rq, se);
6930 }
6931 }
6932
6933
6934
6935
6936
6937
6938 static void yield_task_fair(struct rq *rq)
6939 {
6940 struct task_struct *curr = rq->curr;
6941 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
6942 struct sched_entity *se = &curr->se;
6943
6944
6945
6946
6947 if (unlikely(rq->nr_running == 1))
6948 return;
6949
6950 clear_buddies(cfs_rq, se);
6951
6952 if (curr->policy != SCHED_BATCH) {
6953 update_rq_clock(rq);
6954
6955
6956
6957 update_curr(cfs_rq);
6958
6959
6960
6961
6962
6963 rq_clock_skip_update(rq);
6964 }
6965
6966 set_skip_buddy(se);
6967 }
6968
6969 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
6970 {
6971 struct sched_entity *se = &p->se;
6972
6973
6974 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
6975 return false;
6976
6977
6978 set_next_buddy(se);
6979
6980 yield_task_fair(rq);
6981
6982 return true;
6983 }
6984
6985 #ifdef CONFIG_SMP
/*
 * (Source lines 6986-7103 hold a long documentation comment on the
 *  fairness model behind load balancing, i.e. spreading each task group's
 *  weight across CPUs in proportion to its per-CPU load; the comment text
 *  is not reproduced in this listing.)
 */

7104 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
7105
7106 enum fbq_type { regular, remote, all };
7107
7108 enum group_type {
7109 group_other = 0,
7110 group_misfit_task,
7111 group_imbalanced,
7112 group_overloaded,
7113 };
7114
7115 #define LBF_ALL_PINNED 0x01
7116 #define LBF_NEED_BREAK 0x02
7117 #define LBF_DST_PINNED 0x04
7118 #define LBF_SOME_PINNED 0x08
7119 #define LBF_NOHZ_STATS 0x10
7120 #define LBF_NOHZ_AGAIN 0x20
7121
7122 struct lb_env {
7123 struct sched_domain *sd;
7124
7125 struct rq *src_rq;
7126 int src_cpu;
7127
7128 int dst_cpu;
7129 struct rq *dst_rq;
7130
7131 struct cpumask *dst_grpmask;
7132 int new_dst_cpu;
7133 enum cpu_idle_type idle;
7134 long imbalance;
7135
7136 struct cpumask *cpus;
7137
7138 unsigned int flags;
7139
7140 unsigned int loop;
7141 unsigned int loop_break;
7142 unsigned int loop_max;
7143
7144 enum fbq_type fbq_type;
7145 enum group_type src_grp_type;
7146 struct list_head tasks;
7147 };
7148
7149
7150
7151
7152 static int task_hot(struct task_struct *p, struct lb_env *env)
7153 {
7154 s64 delta;
7155
7156 lockdep_assert_held(&env->src_rq->lock);
7157
7158 if (p->sched_class != &fair_sched_class)
7159 return 0;
7160
7161 if (unlikely(task_has_idle_policy(p)))
7162 return 0;
7163
7164
7165
7166
7167 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
7168 (&p->se == cfs_rq_of(&p->se)->next ||
7169 &p->se == cfs_rq_of(&p->se)->last))
7170 return 1;
7171
7172 if (sysctl_sched_migration_cost == -1)
7173 return 1;
7174 if (sysctl_sched_migration_cost == 0)
7175 return 0;
7176
7177 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
7178
7179 return delta < (s64)sysctl_sched_migration_cost;
7180 }
7181
7182 #ifdef CONFIG_NUMA_BALANCING
7183
7184
7185
7186
7187
7188 static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
7189 {
7190 struct numa_group *numa_group = rcu_dereference(p->numa_group);
7191 unsigned long src_weight, dst_weight;
7192 int src_nid, dst_nid, dist;
7193
7194 if (!static_branch_likely(&sched_numa_balancing))
7195 return -1;
7196
7197 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
7198 return -1;
7199
7200 src_nid = cpu_to_node(env->src_cpu);
7201 dst_nid = cpu_to_node(env->dst_cpu);
7202
7203 if (src_nid == dst_nid)
7204 return -1;
7205
7206
7207 if (src_nid == p->numa_preferred_nid) {
7208 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
7209 return 1;
7210 else
7211 return -1;
7212 }
7213
7214
7215 if (dst_nid == p->numa_preferred_nid)
7216 return 0;
7217
7218
7219 if (env->idle == CPU_IDLE)
7220 return -1;
7221
7222 dist = node_distance(src_nid, dst_nid);
7223 if (numa_group) {
7224 src_weight = group_weight(p, src_nid, dist);
7225 dst_weight = group_weight(p, dst_nid, dist);
7226 } else {
7227 src_weight = task_weight(p, src_nid, dist);
7228 dst_weight = task_weight(p, dst_nid, dist);
7229 }
7230
7231 return dst_weight < src_weight;
7232 }
7233
7234 #else
7235 static inline int migrate_degrades_locality(struct task_struct *p,
7236 struct lb_env *env)
7237 {
7238 return -1;
7239 }
7240 #endif
7241
7242
7243
7244
7245 static
7246 int can_migrate_task(struct task_struct *p, struct lb_env *env)
7247 {
7248 int tsk_cache_hot;
7249
7250 lockdep_assert_held(&env->src_rq->lock);
7251
7252
7253
7254
7255
7256
7257
7258
7259 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
7260 return 0;
7261
7262 if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
7263 int cpu;
7264
7265 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
7266
7267 env->flags |= LBF_SOME_PINNED;
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277 if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
7278 return 0;
7279
7280
7281 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
7282 if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
7283 env->flags |= LBF_DST_PINNED;
7284 env->new_dst_cpu = cpu;
7285 break;
7286 }
7287 }
7288
7289 return 0;
7290 }
7291
7292
7293 env->flags &= ~LBF_ALL_PINNED;
7294
7295 if (task_running(env->src_rq, p)) {
7296 schedstat_inc(p->se.statistics.nr_failed_migrations_running);
7297 return 0;
7298 }
7299
7300
7301
7302
7303
7304
7305
7306 tsk_cache_hot = migrate_degrades_locality(p, env);
7307 if (tsk_cache_hot == -1)
7308 tsk_cache_hot = task_hot(p, env);
7309
7310 if (tsk_cache_hot <= 0 ||
7311 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
7312 if (tsk_cache_hot == 1) {
7313 schedstat_inc(env->sd->lb_hot_gained[env->idle]);
7314 schedstat_inc(p->se.statistics.nr_forced_migrations);
7315 }
7316 return 1;
7317 }
7318
7319 schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
7320 return 0;
7321 }
7322
7323
7324
7325
7326 static void detach_task(struct task_struct *p, struct lb_env *env)
7327 {
7328 lockdep_assert_held(&env->src_rq->lock);
7329
7330 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
7331 set_task_cpu(p, env->dst_cpu);
7332 }
7333
7334
7335
7336
7337
7338
7339
7340 static struct task_struct *detach_one_task(struct lb_env *env)
7341 {
7342 struct task_struct *p;
7343
7344 lockdep_assert_held(&env->src_rq->lock);
7345
7346 list_for_each_entry_reverse(p,
7347 &env->src_rq->cfs_tasks, se.group_node) {
7348 if (!can_migrate_task(p, env))
7349 continue;
7350
7351 detach_task(p, env);
7352
7353
7354
7355
7356
7357
7358
7359 schedstat_inc(env->sd->lb_gained[env->idle]);
7360 return p;
7361 }
7362 return NULL;
7363 }
7364
7365 static const unsigned int sched_nr_migrate_break = 32;
7366
7367
7368
7369
7370
7371
7372
7373 static int detach_tasks(struct lb_env *env)
7374 {
7375 struct list_head *tasks = &env->src_rq->cfs_tasks;
7376 struct task_struct *p;
7377 unsigned long load;
7378 int detached = 0;
7379
7380 lockdep_assert_held(&env->src_rq->lock);
7381
7382 if (env->imbalance <= 0)
7383 return 0;
7384
7385 while (!list_empty(tasks)) {
7386
7387
7388
7389
7390 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
7391 break;
7392
7393 p = list_last_entry(tasks, struct task_struct, se.group_node);
7394
7395 env->loop++;
7396
7397 if (env->loop > env->loop_max)
7398 break;
7399
7400
7401 if (env->loop > env->loop_break) {
7402 env->loop_break += sched_nr_migrate_break;
7403 env->flags |= LBF_NEED_BREAK;
7404 break;
7405 }
7406
7407 if (!can_migrate_task(p, env))
7408 goto next;
7409
7410 load = task_h_load(p);
7411
7412 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
7413 goto next;
7414
7415 if ((load / 2) > env->imbalance)
7416 goto next;
7417
7418 detach_task(p, env);
7419 list_add(&p->se.group_node, &env->tasks);
7420
7421 detached++;
7422 env->imbalance -= load;
7423
7424 #ifdef CONFIG_PREEMPTION
7425
7426
7427
7428
7429
7430 if (env->idle == CPU_NEWLY_IDLE)
7431 break;
7432 #endif
7433
7434
7435
7436
7437
7438 if (env->imbalance <= 0)
7439 break;
7440
7441 continue;
7442 next:
7443 list_move(&p->se.group_node, tasks);
7444 }
7445
7446
7447
7448
7449
7450
7451 schedstat_add(env->sd->lb_gained[env->idle], detached);
7452
7453 return detached;
7454 }
7455
7456
7457
7458
7459 static void attach_task(struct rq *rq, struct task_struct *p)
7460 {
7461 lockdep_assert_held(&rq->lock);
7462
7463 BUG_ON(task_rq(p) != rq);
7464 activate_task(rq, p, ENQUEUE_NOCLOCK);
7465 check_preempt_curr(rq, p, 0);
7466 }
7467
7468
7469
7470
7471
7472 static void attach_one_task(struct rq *rq, struct task_struct *p)
7473 {
7474 struct rq_flags rf;
7475
7476 rq_lock(rq, &rf);
7477 update_rq_clock(rq);
7478 attach_task(rq, p);
7479 rq_unlock(rq, &rf);
7480 }
7481
7482
7483
7484
7485
7486 static void attach_tasks(struct lb_env *env)
7487 {
7488 struct list_head *tasks = &env->tasks;
7489 struct task_struct *p;
7490 struct rq_flags rf;
7491
7492 rq_lock(env->dst_rq, &rf);
7493 update_rq_clock(env->dst_rq);
7494
7495 while (!list_empty(tasks)) {
7496 p = list_first_entry(tasks, struct task_struct, se.group_node);
7497 list_del_init(&p->se.group_node);
7498
7499 attach_task(env->dst_rq, p);
7500 }
7501
7502 rq_unlock(env->dst_rq, &rf);
7503 }
7504
7505 #ifdef CONFIG_NO_HZ_COMMON
7506 static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
7507 {
7508 if (cfs_rq->avg.load_avg)
7509 return true;
7510
7511 if (cfs_rq->avg.util_avg)
7512 return true;
7513
7514 return false;
7515 }
7516
7517 static inline bool others_have_blocked(struct rq *rq)
7518 {
7519 if (READ_ONCE(rq->avg_rt.util_avg))
7520 return true;
7521
7522 if (READ_ONCE(rq->avg_dl.util_avg))
7523 return true;
7524
7525 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
7526 if (READ_ONCE(rq->avg_irq.util_avg))
7527 return true;
7528 #endif
7529
7530 return false;
7531 }
7532
7533 static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
7534 {
7535 rq->last_blocked_load_update_tick = jiffies;
7536
7537 if (!has_blocked)
7538 rq->has_blocked_load = 0;
7539 }
7540 #else
7541 static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
7542 static inline bool others_have_blocked(struct rq *rq) { return false; }
7543 static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
7544 #endif
7545
7546 static bool __update_blocked_others(struct rq *rq, bool *done)
7547 {
7548 const struct sched_class *curr_class;
7549 u64 now = rq_clock_pelt(rq);
7550 bool decayed;
7551
7552
7553
7554
7555
7556 curr_class = rq->curr->sched_class;
7557
7558 decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
7559 update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
7560 update_irq_load_avg(rq, 0);
7561
7562 if (others_have_blocked(rq))
7563 *done = false;
7564
7565 return decayed;
7566 }
7567
7568 #ifdef CONFIG_FAIR_GROUP_SCHED
7569
7570 static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
7571 {
7572 if (cfs_rq->load.weight)
7573 return false;
7574
7575 if (cfs_rq->avg.load_sum)
7576 return false;
7577
7578 if (cfs_rq->avg.util_sum)
7579 return false;
7580
7581 if (cfs_rq->avg.runnable_load_sum)
7582 return false;
7583
7584 return true;
7585 }
7586
7587 static bool __update_blocked_fair(struct rq *rq, bool *done)
7588 {
7589 struct cfs_rq *cfs_rq, *pos;
7590 bool decayed = false;
7591 int cpu = cpu_of(rq);
7592
7593
7594
7595
7596
7597 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
7598 struct sched_entity *se;
7599
7600 if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
7601 update_tg_load_avg(cfs_rq, 0);
7602
7603 if (cfs_rq == &rq->cfs)
7604 decayed = true;
7605 }
7606
7607
7608 se = cfs_rq->tg->se[cpu];
7609 if (se && !skip_blocked_update(se))
7610 update_load_avg(cfs_rq_of(se), se, 0);
7611
7612
7613
7614
7615
7616 if (cfs_rq_is_decayed(cfs_rq))
7617 list_del_leaf_cfs_rq(cfs_rq);
7618
7619
7620 if (cfs_rq_has_blocked(cfs_rq))
7621 *done = false;
7622 }
7623
7624 return decayed;
7625 }
7626
7627
7628
7629
7630
7631
7632 static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
7633 {
7634 struct rq *rq = rq_of(cfs_rq);
7635 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
7636 unsigned long now = jiffies;
7637 unsigned long load;
7638
7639 if (cfs_rq->last_h_load_update == now)
7640 return;
7641
7642 WRITE_ONCE(cfs_rq->h_load_next, NULL);
7643 for_each_sched_entity(se) {
7644 cfs_rq = cfs_rq_of(se);
7645 WRITE_ONCE(cfs_rq->h_load_next, se);
7646 if (cfs_rq->last_h_load_update == now)
7647 break;
7648 }
7649
7650 if (!se) {
7651 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
7652 cfs_rq->last_h_load_update = now;
7653 }
7654
7655 while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
7656 load = cfs_rq->h_load;
7657 load = div64_ul(load * se->avg.load_avg,
7658 cfs_rq_load_avg(cfs_rq) + 1);
7659 cfs_rq = group_cfs_rq(se);
7660 cfs_rq->h_load = load;
7661 cfs_rq->last_h_load_update = now;
7662 }
7663 }
7664
7665 static unsigned long task_h_load(struct task_struct *p)
7666 {
7667 struct cfs_rq *cfs_rq = task_cfs_rq(p);
7668
7669 update_cfs_rq_h_load(cfs_rq);
7670 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
7671 cfs_rq_load_avg(cfs_rq) + 1);
7672 }
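/*
 * Sketch of the hierarchical load computed by task_h_load() above: a
 * task's contribution to CPU-level load is its own load_avg scaled by the
 * fraction of each ancestor group's runqueue load that flows down to it,
 * which is what the h_load chain walk accumulates.  The +1 below mirrors
 * the kernel's guard against dividing by an empty runqueue; the array
 * layout is an assumption of the sketch.
 */
#include <stdio.h>

static unsigned long task_h_load_sketch(unsigned long task_load_avg,
					const unsigned long *grp_se_load,
					const unsigned long *grp_rq_load,
					int depth)
{
	unsigned long h_load = grp_rq_load[0];	/* root cfs_rq load */
	int i;

	/* Walk from the root group down to the task's own group. */
	for (i = 0; i < depth; i++)
		h_load = h_load * grp_se_load[i] / (grp_rq_load[i] + 1);

	return task_load_avg * h_load / (grp_rq_load[depth] + 1);
}

int main(void)
{
	/* One group at 50% of a 2048-load root rq; the task is half that group. */
	unsigned long se_load[] = { 1024 };		/* group entity load on root */
	unsigned long rq_load[] = { 2048, 512 };	/* root rq, group rq */

	printf("%lu\n", task_h_load_sketch(256, se_load, rq_load, 1));
	return 0;
}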
7673 #else
7674 static bool __update_blocked_fair(struct rq *rq, bool *done)
7675 {
7676 struct cfs_rq *cfs_rq = &rq->cfs;
7677 bool decayed;
7678
7679 decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
7680 if (cfs_rq_has_blocked(cfs_rq))
7681 *done = false;
7682
7683 return decayed;
7684 }
7685
7686 static unsigned long task_h_load(struct task_struct *p)
7687 {
7688 return p->se.avg.load_avg;
7689 }
7690 #endif
7691
7692 static void update_blocked_averages(int cpu)
7693 {
7694 bool decayed = false, done = true;
7695 struct rq *rq = cpu_rq(cpu);
7696 struct rq_flags rf;
7697
7698 rq_lock_irqsave(rq, &rf);
7699 update_rq_clock(rq);
7700
7701 decayed |= __update_blocked_others(rq, &done);
7702 decayed |= __update_blocked_fair(rq, &done);
7703
7704 update_blocked_load_status(rq, !done);
7705 if (decayed)
7706 cpufreq_update_util(rq, 0);
7707 rq_unlock_irqrestore(rq, &rf);
7708 }
7709
7710
7711
7712
7713
7714
7715 struct sg_lb_stats {
7716 unsigned long avg_load;
7717 unsigned long group_load;
7718 unsigned long load_per_task;
7719 unsigned long group_capacity;
7720 unsigned long group_util;
7721 unsigned int sum_nr_running;
7722 unsigned int idle_cpus;
7723 unsigned int group_weight;
7724 enum group_type group_type;
7725 int group_no_capacity;
7726 unsigned long group_misfit_task_load;
7727 #ifdef CONFIG_NUMA_BALANCING
7728 unsigned int nr_numa_running;
7729 unsigned int nr_preferred_running;
7730 #endif
7731 };
7732
7733
7734
7735
7736
7737 struct sd_lb_stats {
7738 struct sched_group *busiest;
7739 struct sched_group *local;
7740 unsigned long total_running;
7741 unsigned long total_load;
7742 unsigned long total_capacity;
7743 unsigned long avg_load;
7744
7745 struct sg_lb_stats busiest_stat;
7746 struct sg_lb_stats local_stat;
7747 };
7748
7749 static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
7750 {
7751
7752
7753
7754
7755
7756
7757 *sds = (struct sd_lb_stats){
7758 .busiest = NULL,
7759 .local = NULL,
7760 .total_running = 0UL,
7761 .total_load = 0UL,
7762 .total_capacity = 0UL,
7763 .busiest_stat = {
7764 .avg_load = 0UL,
7765 .sum_nr_running = 0,
7766 .group_type = group_other,
7767 },
7768 };
7769 }
7770
7771 static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
7772 {
7773 struct rq *rq = cpu_rq(cpu);
7774 unsigned long max = arch_scale_cpu_capacity(cpu);
7775 unsigned long used, free;
7776 unsigned long irq;
7777
7778 irq = cpu_util_irq(rq);
7779
7780 if (unlikely(irq >= max))
7781 return 1;
7782
7783 used = READ_ONCE(rq->avg_rt.util_avg);
7784 used += READ_ONCE(rq->avg_dl.util_avg);
7785
7786 if (unlikely(used >= max))
7787 return 1;
7788
7789 free = max - used;
7790
7791 return scale_irq_capacity(free, irq, max);
7792 }
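/*
 * The capacity reduction computed by scale_rt_capacity() above, written
 * out: time consumed by the RT and deadline classes is subtracted
 * outright, and the remainder is scaled down by the fraction of time
 * stolen by interrupts.  scale_irq_capacity() is defined elsewhere; the
 * free * (max - irq) / max form below is the assumption used for this
 * sketch.
 */
#include <stdio.h>

static unsigned long cfs_capacity_sketch(unsigned long max,
					 unsigned long rt_util,
					 unsigned long dl_util,
					 unsigned long irq_util)
{
	unsigned long used = rt_util + dl_util;
	unsigned long free;

	if (irq_util >= max || used >= max)
		return 1;			/* keep a non-zero capacity */
	free = max - used;
	return free * (max - irq_util) / max;	/* irq scaling */
}

int main(void)
{
	/* 1024-capacity CPU losing 10% to RT, 5% to deadline, 5% to IRQs. */
	printf("%lu\n", cfs_capacity_sketch(1024, 102, 51, 51));
	return 0;
}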
7793
7794 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
7795 {
7796 unsigned long capacity = scale_rt_capacity(sd, cpu);
7797 struct sched_group *sdg = sd->groups;
7798
7799 cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
7800
7801 if (!capacity)
7802 capacity = 1;
7803
7804 cpu_rq(cpu)->cpu_capacity = capacity;
7805 sdg->sgc->capacity = capacity;
7806 sdg->sgc->min_capacity = capacity;
7807 sdg->sgc->max_capacity = capacity;
7808 }
7809
7810 void update_group_capacity(struct sched_domain *sd, int cpu)
7811 {
7812 struct sched_domain *child = sd->child;
7813 struct sched_group *group, *sdg = sd->groups;
7814 unsigned long capacity, min_capacity, max_capacity;
7815 unsigned long interval;
7816
7817 interval = msecs_to_jiffies(sd->balance_interval);
7818 interval = clamp(interval, 1UL, max_load_balance_interval);
7819 sdg->sgc->next_update = jiffies + interval;
7820
7821 if (!child) {
7822 update_cpu_capacity(sd, cpu);
7823 return;
7824 }
7825
7826 capacity = 0;
7827 min_capacity = ULONG_MAX;
7828 max_capacity = 0;
7829
7830 if (child->flags & SD_OVERLAP) {
7831
7832
7833
7834
7835
7836 for_each_cpu(cpu, sched_group_span(sdg)) {
7837 struct sched_group_capacity *sgc;
7838 struct rq *rq = cpu_rq(cpu);
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851 if (unlikely(!rq->sd)) {
7852 capacity += capacity_of(cpu);
7853 } else {
7854 sgc = rq->sd->groups->sgc;
7855 capacity += sgc->capacity;
7856 }
7857
7858 min_capacity = min(capacity, min_capacity);
7859 max_capacity = max(capacity, max_capacity);
7860 }
7861 } else {
7862
7863
7864
7865
7866
7867 group = child->groups;
7868 do {
7869 struct sched_group_capacity *sgc = group->sgc;
7870
7871 capacity += sgc->capacity;
7872 min_capacity = min(sgc->min_capacity, min_capacity);
7873 max_capacity = max(sgc->max_capacity, max_capacity);
7874 group = group->next;
7875 } while (group != child->groups);
7876 }
7877
7878 sdg->sgc->capacity = capacity;
7879 sdg->sgc->min_capacity = min_capacity;
7880 sdg->sgc->max_capacity = max_capacity;
7881 }
7882
7883
7884
7885
7886
7887
7888 static inline int
7889 check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
7890 {
7891 return ((rq->cpu_capacity * sd->imbalance_pct) <
7892 (rq->cpu_capacity_orig * 100));
7893 }
7894
7895
7896
7897
7898
7899
7900 static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
7901 {
7902 return rq->misfit_task_load &&
7903 (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
7904 check_cpu_capacity(rq, sd));
7905 }
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936 static inline int sg_imbalanced(struct sched_group *group)
7937 {
7938 return group->sgc->imbalance;
7939 }
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953 static inline bool
7954 group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
7955 {
7956 if (sgs->sum_nr_running < sgs->group_weight)
7957 return true;
7958
7959 if ((sgs->group_capacity * 100) >
7960 (sgs->group_util * env->sd->imbalance_pct))
7961 return true;
7962
7963 return false;
7964 }
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974 static inline bool
7975 group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
7976 {
7977 if (sgs->sum_nr_running <= sgs->group_weight)
7978 return false;
7979
7980 if ((sgs->group_capacity * 100) <
7981 (sgs->group_util * env->sd->imbalance_pct))
7982 return true;
7983
7984 return false;
7985 }
7986
7987
7988
7989
7990
7991 static inline bool
7992 group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
7993 {
7994 return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
7995 }
7996
7997
7998
7999
8000
8001 static inline bool
8002 group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
8003 {
8004 return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
8005 }
8006
8007 static inline enum
8008 group_type group_classify(struct sched_group *group,
8009 struct sg_lb_stats *sgs)
8010 {
8011 if (sgs->group_no_capacity)
8012 return group_overloaded;
8013
8014 if (sg_imbalanced(group))
8015 return group_imbalanced;
8016
8017 if (sgs->group_misfit_task_load)
8018 return group_misfit_task;
8019
8020 return group_other;
8021 }
8022
8023 static bool update_nohz_stats(struct rq *rq, bool force)
8024 {
8025 #ifdef CONFIG_NO_HZ_COMMON
8026 unsigned int cpu = rq->cpu;
8027
8028 if (!rq->has_blocked_load)
8029 return false;
8030
8031 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
8032 return false;
8033
8034 if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
8035 return true;
8036
8037 update_blocked_averages(cpu);
8038
8039 return rq->has_blocked_load;
8040 #else
8041 return false;
8042 #endif
8043 }
8044
8045
8046
8047
8048
8049
8050
8051
8052 static inline void update_sg_lb_stats(struct lb_env *env,
8053 struct sched_group *group,
8054 struct sg_lb_stats *sgs,
8055 int *sg_status)
8056 {
8057 int i, nr_running;
8058
8059 memset(sgs, 0, sizeof(*sgs));
8060
8061 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
8062 struct rq *rq = cpu_rq(i);
8063
8064 if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
8065 env->flags |= LBF_NOHZ_AGAIN;
8066
8067 sgs->group_load += cpu_runnable_load(rq);
8068 sgs->group_util += cpu_util(i);
8069 sgs->sum_nr_running += rq->cfs.h_nr_running;
8070
8071 nr_running = rq->nr_running;
8072 if (nr_running > 1)
8073 *sg_status |= SG_OVERLOAD;
8074
8075 if (cpu_overutilized(i))
8076 *sg_status |= SG_OVERUTILIZED;
8077
8078 #ifdef CONFIG_NUMA_BALANCING
8079 sgs->nr_numa_running += rq->nr_numa_running;
8080 sgs->nr_preferred_running += rq->nr_preferred_running;
8081 #endif
8082
8083
8084
8085 if (!nr_running && idle_cpu(i))
8086 sgs->idle_cpus++;
8087
8088 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
8089 sgs->group_misfit_task_load < rq->misfit_task_load) {
8090 sgs->group_misfit_task_load = rq->misfit_task_load;
8091 *sg_status |= SG_OVERLOAD;
8092 }
8093 }
8094
8095
8096 sgs->group_capacity = group->sgc->capacity;
8097 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
8098
8099 if (sgs->sum_nr_running)
8100 sgs->load_per_task = sgs->group_load / sgs->sum_nr_running;
8101
8102 sgs->group_weight = group->group_weight;
8103
8104 sgs->group_no_capacity = group_is_overloaded(env, sgs);
8105 sgs->group_type = group_classify(group, sgs);
8106 }
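/*
 * Illustrative, userspace-only sketch (not part of fair.c): how the avg_load
 * computed by update_sg_lb_stats() above normalizes raw group load by group
 * capacity. SCHED_CAPACITY_SCALE is 1024 in the kernel; the load/capacity
 * figures below are made-up demo values.
 */
#include <stdio.h>

#define DEMO_CAPACITY_SCALE 1024UL

static unsigned long demo_avg_load(unsigned long group_load,
				   unsigned long group_capacity)
{
	/*
	 * Normalizing by capacity lets groups of different sizes (or of
	 * big/little CPUs) be compared: the same absolute load weighs more
	 * on a lower-capacity group.
	 */
	return (group_load * DEMO_CAPACITY_SCALE) / group_capacity;
}

int main(void)
{
	printf("%lu\n", demo_avg_load(3000, 4096)); /* 4-CPU group:  750 */
	printf("%lu\n", demo_avg_load(3000, 2048)); /* 2-CPU group: 1500 */
	return 0;
}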
8107
8108 /*
8109  * update_sd_pick_busiest - return 1 on busiest group
8110  * @env: The load balancing environment.
8111  * @sds: sched_domain statistics
8112  * @sg: sched_group candidate to be checked for being the busiest
8113  * @sgs: sched_group statistics
8114  *
8115  * Determine if @sg is a busier group than the previously selected
8116  * busiest group, taking asymmetric capacity and asymmetric packing
8117  * into account.
8118  *
8119  * Return: true if @sg is busier than the current busiest, else false.
8120  */
8121 static bool update_sd_pick_busiest(struct lb_env *env,
8122 struct sd_lb_stats *sds,
8123 struct sched_group *sg,
8124 struct sg_lb_stats *sgs)
8125 {
8126 struct sg_lb_stats *busiest = &sds->busiest_stat;
8127
8128
8129
8130
8131
8132
8133
8134 if (sgs->group_type == group_misfit_task &&
8135 (!group_smaller_max_cpu_capacity(sg, sds->local) ||
8136 !group_has_capacity(env, &sds->local_stat)))
8137 return false;
8138
8139 if (sgs->group_type > busiest->group_type)
8140 return true;
8141
8142 if (sgs->group_type < busiest->group_type)
8143 return false;
8144
8145 if (sgs->avg_load <= busiest->avg_load)
8146 return false;
8147
8148 if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
8149 goto asym_packing;
8150
8151
8152
8153
8154
8155
8156
8157 if (sgs->sum_nr_running <= sgs->group_weight &&
8158 group_smaller_min_cpu_capacity(sds->local, sg))
8159 return false;
8160
8161
8162
8163
8164 if (sgs->group_type == group_misfit_task &&
8165 sgs->group_misfit_task_load < busiest->group_misfit_task_load)
8166 return false;
8167
8168 asym_packing:
8169
8170 if (!(env->sd->flags & SD_ASYM_PACKING))
8171 return true;
8172
8173
8174 if (env->idle == CPU_NOT_IDLE)
8175 return true;
8176
8177
8178
8179
8180
8181 if (sgs->sum_nr_running &&
8182 sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
8183 if (!sds->busiest)
8184 return true;
8185
8186
8187 if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
8188 sg->asym_prefer_cpu))
8189 return true;
8190 }
8191
8192 return false;
8193 }
8194
8195 #ifdef CONFIG_NUMA_BALANCING
8196 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8197 {
8198 if (sgs->sum_nr_running > sgs->nr_numa_running)
8199 return regular;
8200 if (sgs->sum_nr_running > sgs->nr_preferred_running)
8201 return remote;
8202 return all;
8203 }
8204
8205 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8206 {
8207 if (rq->nr_running > rq->nr_numa_running)
8208 return regular;
8209 if (rq->nr_running > rq->nr_preferred_running)
8210 return remote;
8211 return all;
8212 }
8213 #else
8214 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8215 {
8216 return all;
8217 }
8218
8219 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8220 {
8221 return regular;
8222 }
8223 #endif
8224
8225 /*
8226  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
8227  * @env: The load balancing environment.
8228  * @sds: variable to hold the statistics for this sched_domain.
8229  */
8230 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
8231 {
8232 struct sched_domain *child = env->sd->child;
8233 struct sched_group *sg = env->sd->groups;
8234 struct sg_lb_stats *local = &sds->local_stat;
8235 struct sg_lb_stats tmp_sgs;
8236 bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
8237 int sg_status = 0;
8238
8239 #ifdef CONFIG_NO_HZ_COMMON
8240 if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
8241 env->flags |= LBF_NOHZ_STATS;
8242 #endif
8243
8244 do {
8245 struct sg_lb_stats *sgs = &tmp_sgs;
8246 int local_group;
8247
8248 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
8249 if (local_group) {
8250 sds->local = sg;
8251 sgs = local;
8252
8253 if (env->idle != CPU_NEWLY_IDLE ||
8254 time_after_eq(jiffies, sg->sgc->next_update))
8255 update_group_capacity(env->sd, env->dst_cpu);
8256 }
8257
8258 update_sg_lb_stats(env, sg, sgs, &sg_status);
8259
8260 if (local_group)
8261 goto next_group;
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273 if (prefer_sibling && sds->local &&
8274 group_has_capacity(env, local) &&
8275 (sgs->sum_nr_running > local->sum_nr_running + 1)) {
8276 sgs->group_no_capacity = 1;
8277 sgs->group_type = group_classify(sg, sgs);
8278 }
8279
8280 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
8281 sds->busiest = sg;
8282 sds->busiest_stat = *sgs;
8283 }
8284
8285 next_group:
8286
8287 sds->total_running += sgs->sum_nr_running;
8288 sds->total_load += sgs->group_load;
8289 sds->total_capacity += sgs->group_capacity;
8290
8291 sg = sg->next;
8292 } while (sg != env->sd->groups);
8293
8294 #ifdef CONFIG_NO_HZ_COMMON
8295 if ((env->flags & LBF_NOHZ_AGAIN) &&
8296 cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
8297
8298 WRITE_ONCE(nohz.next_blocked,
8299 jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
8300 }
8301 #endif
8302
8303 if (env->sd->flags & SD_NUMA)
8304 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
8305
8306 if (!env->sd->parent) {
8307 struct root_domain *rd = env->dst_rq->rd;
8308
8309
8310 WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
8311
8312
8313 WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
8314 trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
8315 } else if (sg_status & SG_OVERUTILIZED) {
8316 struct root_domain *rd = env->dst_rq->rd;
8317
8318 WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
8319 trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
8320 }
8321 }
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337 /*
8338  * check_asym_packing - Check to see if the group is packed into the
8339  *                      sched domain.
8340  *
8341  * With SD_ASYM_PACKING set, work should be packed onto the CPUs with
8342  * the highest priority. If this (idle) dst_cpu has priority over the
8343  * busiest group's preferred CPU, set env->imbalance to that group's
8344  * load so its tasks get pulled towards dst_cpu; otherwise return 0.
8345  */
8346 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
8347 {
8348 int busiest_cpu;
8349
8350 if (!(env->sd->flags & SD_ASYM_PACKING))
8351 return 0;
8352
8353 if (env->idle == CPU_NOT_IDLE)
8354 return 0;
8355
8356 if (!sds->busiest)
8357 return 0;
8358
8359 busiest_cpu = sds->busiest->asym_prefer_cpu;
8360 if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
8361 return 0;
8362
8363 env->imbalance = sds->busiest_stat.group_load;
8364
8365 return 1;
8366 }
8367
8368 /*
8369  * fix_small_imbalance - Calculate the minor imbalance that exists
8370  * amongst the groups of a sched_domain during load balancing, when
8371  * the per-group averages alone are too close to call.
8372  * @env: The load balancing environment.
8373  * @sds: Statistics of the sched_domain whose imbalance is calculated.
8374  */
8375 static inline
8376 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
8377 {
8378 unsigned long tmp, capa_now = 0, capa_move = 0;
8379 unsigned int imbn = 2;
8380 unsigned long scaled_busy_load_per_task;
8381 struct sg_lb_stats *local, *busiest;
8382
8383 local = &sds->local_stat;
8384 busiest = &sds->busiest_stat;
8385
8386 if (!local->sum_nr_running)
8387 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
8388 else if (busiest->load_per_task > local->load_per_task)
8389 imbn = 1;
8390
8391 scaled_busy_load_per_task =
8392 (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
8393 busiest->group_capacity;
8394
8395 if (busiest->avg_load + scaled_busy_load_per_task >=
8396 local->avg_load + (scaled_busy_load_per_task * imbn)) {
8397 env->imbalance = busiest->load_per_task;
8398 return;
8399 }
8400
8401
8402
8403
8404
8405
8406
8407 capa_now += busiest->group_capacity *
8408 min(busiest->load_per_task, busiest->avg_load);
8409 capa_now += local->group_capacity *
8410 min(local->load_per_task, local->avg_load);
8411 capa_now /= SCHED_CAPACITY_SCALE;
8412
8413
8414 if (busiest->avg_load > scaled_busy_load_per_task) {
8415 capa_move += busiest->group_capacity *
8416 min(busiest->load_per_task,
8417 busiest->avg_load - scaled_busy_load_per_task);
8418 }
8419
8420
8421 if (busiest->avg_load * busiest->group_capacity <
8422 busiest->load_per_task * SCHED_CAPACITY_SCALE) {
8423 tmp = (busiest->avg_load * busiest->group_capacity) /
8424 local->group_capacity;
8425 } else {
8426 tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
8427 local->group_capacity;
8428 }
8429 capa_move += local->group_capacity *
8430 min(local->load_per_task, local->avg_load + tmp);
8431 capa_move /= SCHED_CAPACITY_SCALE;
8432
8433
8434 if (capa_move > capa_now)
8435 env->imbalance = busiest->load_per_task;
8436 }
8437
8438
8439 /*
8440  * calculate_imbalance - Calculate the amount of imbalance present
8441  * within the groups of a given sched_domain during load balance.
8442  * @env: load balance environment; @sds: domain statistics.
8443  */
8444 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
8445 {
8446 unsigned long max_pull, load_above_capacity = ~0UL;
8447 struct sg_lb_stats *local, *busiest;
8448
8449 local = &sds->local_stat;
8450 busiest = &sds->busiest_stat;
8451
8452 if (busiest->group_type == group_imbalanced) {
8453
8454
8455
8456
8457 busiest->load_per_task =
8458 min(busiest->load_per_task, sds->avg_load);
8459 }
8460
8461
8462
8463
8464
8465
8466
8467 if (busiest->group_type != group_misfit_task &&
8468 (busiest->avg_load <= sds->avg_load ||
8469 local->avg_load >= sds->avg_load)) {
8470 env->imbalance = 0;
8471 return fix_small_imbalance(env, sds);
8472 }
8473
8474
8475
8476
8477 if (busiest->group_type == group_overloaded &&
8478 local->group_type == group_overloaded) {
8479 load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
8480 if (load_above_capacity > busiest->group_capacity) {
8481 load_above_capacity -= busiest->group_capacity;
8482 load_above_capacity *= scale_load_down(NICE_0_LOAD);
8483 load_above_capacity /= busiest->group_capacity;
8484 } else
8485 load_above_capacity = ~0UL;
8486 }
8487
8488
8489
8490
8491
8492
8493
8494
8495 max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
8496
8497
8498 env->imbalance = min(
8499 max_pull * busiest->group_capacity,
8500 (sds->avg_load - local->avg_load) * local->group_capacity
8501 ) / SCHED_CAPACITY_SCALE;
8502
8503
8504 if (busiest->group_type == group_misfit_task) {
8505 env->imbalance = max_t(long, env->imbalance,
8506 busiest->group_misfit_task_load);
8507 }
8508
8509
8510
8511
8512
8513
8514
8515 if (env->imbalance < busiest->load_per_task)
8516 return fix_small_imbalance(env, sds);
8517 }
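/*
 * Illustrative, userspace-only sketch (not part of fair.c): the core min()
 * in calculate_imbalance() above - pull no more than the busiest group's
 * excess over the domain average, and push the local group no higher than
 * that average. The load_above_capacity clamp is omitted and all numbers
 * are made-up demo values (loads are capacity-normalized, scale 1024).
 */
#include <stdio.h>

#define DEMO_CAPACITY_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

static unsigned long demo_imbalance(unsigned long busiest_avg,
				    unsigned long local_avg,
				    unsigned long domain_avg,
				    unsigned long busiest_cap,
				    unsigned long local_cap)
{
	unsigned long max_pull = busiest_avg - domain_avg;

	return min_ul(max_pull * busiest_cap,
		      (domain_avg - local_avg) * local_cap) /
	       DEMO_CAPACITY_SCALE;
}

int main(void)
{
	/* busiest group at 1400, local at 600, domain average 1000 */
	printf("%lu\n", demo_imbalance(1400, 600, 1000, 1024, 1024)); /* 400 */
	return 0;
}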
8518
8519 /*
8520  * find_busiest_group - Returns the busiest group within the
8521  * sched_domain if there is an imbalance, or NULL when balanced.
8522  * @env: The load balancing environment.
8523  *
8524  * Also calculates the amount of runnable load which should be moved
8525  * to restore balance, in env->imbalance.
8526  *
8527  * Balance is forced for imbalanced or misfit busiest groups, and when
8528  * this (idle) CPU's local group still has capacity while the busiest
8529  * group has none. Otherwise the group averages are compared against
8530  * the domain average and imbalance_pct before giving up as balanced.
8531  */
8532 static struct sched_group *find_busiest_group(struct lb_env *env)
8533 {
8534 struct sg_lb_stats *local, *busiest;
8535 struct sd_lb_stats sds;
8536
8537 init_sd_lb_stats(&sds);
8538
8539
8540
8541
8542
8543 update_sd_lb_stats(env, &sds);
8544
8545 if (sched_energy_enabled()) {
8546 struct root_domain *rd = env->dst_rq->rd;
8547
8548 if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
8549 goto out_balanced;
8550 }
8551
8552 local = &sds.local_stat;
8553 busiest = &sds.busiest_stat;
8554
8555
8556 if (check_asym_packing(env, &sds))
8557 return sds.busiest;
8558
8559
8560 if (!sds.busiest || busiest->sum_nr_running == 0)
8561 goto out_balanced;
8562
8563
8564 sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
8565 / sds.total_capacity;
8566
8567
8568
8569
8570
8571
8572 if (busiest->group_type == group_imbalanced)
8573 goto force_balance;
8574
8575
8576
8577
8578
8579 if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
8580 busiest->group_no_capacity)
8581 goto force_balance;
8582
8583
8584 if (busiest->group_type == group_misfit_task)
8585 goto force_balance;
8586
8587
8588
8589
8590
8591 if (local->avg_load >= busiest->avg_load)
8592 goto out_balanced;
8593
8594
8595
8596
8597
8598 if (local->avg_load >= sds.avg_load)
8599 goto out_balanced;
8600
8601 if (env->idle == CPU_IDLE) {
8602
8603
8604
8605
8606
8607
8608
8609 if ((busiest->group_type != group_overloaded) &&
8610 (local->idle_cpus <= (busiest->idle_cpus + 1)))
8611 goto out_balanced;
8612 } else {
8613
8614
8615
8616
8617 if (100 * busiest->avg_load <=
8618 env->sd->imbalance_pct * local->avg_load)
8619 goto out_balanced;
8620 }
8621
8622 force_balance:
8623
8624 env->src_grp_type = busiest->group_type;
8625 calculate_imbalance(env, &sds);
8626 return env->imbalance ? sds.busiest : NULL;
8627
8628 out_balanced:
8629 env->imbalance = 0;
8630 return NULL;
8631 }
8632
8633 /*
8634  * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
8635  */
8636 static struct rq *find_busiest_queue(struct lb_env *env,
8637 struct sched_group *group)
8638 {
8639 struct rq *busiest = NULL, *rq;
8640 unsigned long busiest_load = 0, busiest_capacity = 1;
8641 int i;
8642
8643 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
8644 unsigned long capacity, load;
8645 enum fbq_type rt;
8646
8647 rq = cpu_rq(i);
8648 rt = fbq_classify_rq(rq);
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669 if (rt > env->fbq_type)
8670 continue;
8671
8672
8673
8674
8675
8676 if (env->src_grp_type == group_misfit_task) {
8677 if (rq->misfit_task_load > busiest_load) {
8678 busiest_load = rq->misfit_task_load;
8679 busiest = rq;
8680 }
8681
8682 continue;
8683 }
8684
8685 capacity = capacity_of(i);
8686
8687
8688
8689
8690
8691
8692
8693 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
8694 capacity_of(env->dst_cpu) < capacity &&
8695 rq->nr_running == 1)
8696 continue;
8697
8698 load = cpu_runnable_load(rq);
8699
8700
8701
8702
8703
8704
8705 if (rq->nr_running == 1 && load > env->imbalance &&
8706 !check_cpu_capacity(rq, env->sd))
8707 continue;
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720 if (load * busiest_capacity > busiest_load * capacity) {
8721 busiest_load = load;
8722 busiest_capacity = capacity;
8723 busiest = rq;
8724 }
8725 }
8726
8727 return busiest;
8728 }
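/*
 * Illustrative, userspace-only sketch (not part of fair.c): the division-free
 * comparison find_busiest_queue() above uses to rank runqueues. Comparing
 * load_a/capacity_a against load_b/capacity_b is done as cross products
 * (load_a*capacity_b vs load_b*capacity_a), avoiding integer division and
 * its rounding loss. The values below are made-up demo numbers.
 */
#include <stdio.h>

/* Nonzero when rq A carries more load relative to its capacity than rq B. */
static int busier_than(unsigned long load_a, unsigned long cap_a,
		       unsigned long load_b, unsigned long cap_b)
{
	return load_a * cap_b > load_b * cap_a;
}

int main(void)
{
	/*
	 * 900 load on a 512-capacity CPU (~1.76 per capacity unit) beats
	 * 1200 load on a 1024-capacity CPU (~1.17 per capacity unit).
	 */
	printf("%d\n", busier_than(900, 512, 1200, 1024)); /* prints 1 */
	return 0;
}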
8729
8730 /*
8731  * Maximum backoff of the balance interval when we keep encountering
8732  * pinned tasks; fairly arbitrary, it just needs to be large enough.
8733  */
8734 #define MAX_PINNED_INTERVAL 512
8735
8736 static inline bool
8737 asym_active_balance(struct lb_env *env)
8738 {
8739
8740
8741
8742
8743
8744 return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
8745 sched_asym_prefer(env->dst_cpu, env->src_cpu);
8746 }
8747
8748 static inline bool
8749 voluntary_active_balance(struct lb_env *env)
8750 {
8751 struct sched_domain *sd = env->sd;
8752
8753 if (asym_active_balance(env))
8754 return 1;
8755
8756
8757
8758
8759
8760
8761
8762 if ((env->idle != CPU_NOT_IDLE) &&
8763 (env->src_rq->cfs.h_nr_running == 1)) {
8764 if ((check_cpu_capacity(env->src_rq, sd)) &&
8765 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
8766 return 1;
8767 }
8768
8769 if (env->src_grp_type == group_misfit_task)
8770 return 1;
8771
8772 return 0;
8773 }
8774
8775 static int need_active_balance(struct lb_env *env)
8776 {
8777 struct sched_domain *sd = env->sd;
8778
8779 if (voluntary_active_balance(env))
8780 return 1;
8781
8782 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
8783 }
8784
8785 static int active_load_balance_cpu_stop(void *data);
8786
8787 static int should_we_balance(struct lb_env *env)
8788 {
8789 struct sched_group *sg = env->sd->groups;
8790 int cpu, balance_cpu = -1;
8791
8792
8793
8794
8795
8796 if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
8797 return 0;
8798
8799
8800
8801
8802
8803 if (env->idle == CPU_NEWLY_IDLE)
8804 return 1;
8805
8806
8807 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
8808 if (!idle_cpu(cpu))
8809 continue;
8810
8811 balance_cpu = cpu;
8812 break;
8813 }
8814
8815 if (balance_cpu == -1)
8816 balance_cpu = group_balance_cpu(sg);
8817
8818
8819
8820
8821
8822 return balance_cpu == env->dst_cpu;
8823 }
8824
8825 /*
8826  * Check this_cpu to ensure it is balanced within the sched_domain; if
8827  * not, attempt to move tasks from the busiest group's busiest queue.
8828  */
8829 static int load_balance(int this_cpu, struct rq *this_rq,
8830 struct sched_domain *sd, enum cpu_idle_type idle,
8831 int *continue_balancing)
8832 {
8833 int ld_moved, cur_ld_moved, active_balance = 0;
8834 struct sched_domain *sd_parent = sd->parent;
8835 struct sched_group *group;
8836 struct rq *busiest;
8837 struct rq_flags rf;
8838 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
8839
8840 struct lb_env env = {
8841 .sd = sd,
8842 .dst_cpu = this_cpu,
8843 .dst_rq = this_rq,
8844 .dst_grpmask = sched_group_span(sd->groups),
8845 .idle = idle,
8846 .loop_break = sched_nr_migrate_break,
8847 .cpus = cpus,
8848 .fbq_type = all,
8849 .tasks = LIST_HEAD_INIT(env.tasks),
8850 };
8851
8852 cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
8853
8854 schedstat_inc(sd->lb_count[idle]);
8855
8856 redo:
8857 if (!should_we_balance(&env)) {
8858 *continue_balancing = 0;
8859 goto out_balanced;
8860 }
8861
8862 group = find_busiest_group(&env);
8863 if (!group) {
8864 schedstat_inc(sd->lb_nobusyg[idle]);
8865 goto out_balanced;
8866 }
8867
8868 busiest = find_busiest_queue(&env, group);
8869 if (!busiest) {
8870 schedstat_inc(sd->lb_nobusyq[idle]);
8871 goto out_balanced;
8872 }
8873
8874 BUG_ON(busiest == env.dst_rq);
8875
8876 schedstat_add(sd->lb_imbalance[idle], env.imbalance);
8877
8878 env.src_cpu = busiest->cpu;
8879 env.src_rq = busiest;
8880
8881 ld_moved = 0;
8882 if (busiest->nr_running > 1) {
8883
8884
8885
8886
8887
8888
8889 env.flags |= LBF_ALL_PINNED;
8890 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
8891
8892 more_balance:
8893 rq_lock_irqsave(busiest, &rf);
8894 update_rq_clock(busiest);
8895
8896
8897
8898
8899
8900 cur_ld_moved = detach_tasks(&env);
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910 rq_unlock(busiest, &rf);
8911
8912 if (cur_ld_moved) {
8913 attach_tasks(&env);
8914 ld_moved += cur_ld_moved;
8915 }
8916
8917 local_irq_restore(rf.flags);
8918
8919 if (env.flags & LBF_NEED_BREAK) {
8920 env.flags &= ~LBF_NEED_BREAK;
8921 goto more_balance;
8922 }
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936
8937
8938
8939
8940
8941
8942
8943 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
8944
8945
8946 __cpumask_clear_cpu(env.dst_cpu, env.cpus);
8947
8948 env.dst_rq = cpu_rq(env.new_dst_cpu);
8949 env.dst_cpu = env.new_dst_cpu;
8950 env.flags &= ~LBF_DST_PINNED;
8951 env.loop = 0;
8952 env.loop_break = sched_nr_migrate_break;
8953
8954
8955
8956
8957
8958 goto more_balance;
8959 }
8960
8961
8962
8963
8964 if (sd_parent) {
8965 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
8966
8967 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
8968 *group_imbalance = 1;
8969 }
8970
8971
8972 if (unlikely(env.flags & LBF_ALL_PINNED)) {
8973 __cpumask_clear_cpu(cpu_of(busiest), cpus);
8974
8975
8976
8977
8978
8979
8980
8981
8982 if (!cpumask_subset(cpus, env.dst_grpmask)) {
8983 env.loop = 0;
8984 env.loop_break = sched_nr_migrate_break;
8985 goto redo;
8986 }
8987 goto out_all_pinned;
8988 }
8989 }
8990
8991 if (!ld_moved) {
8992 schedstat_inc(sd->lb_failed[idle]);
8993
8994
8995
8996
8997
8998
8999 if (idle != CPU_NEWLY_IDLE)
9000 sd->nr_balance_failed++;
9001
9002 if (need_active_balance(&env)) {
9003 unsigned long flags;
9004
9005 raw_spin_lock_irqsave(&busiest->lock, flags);
9006
9007
9008
9009
9010
9011
9012 if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
9013 raw_spin_unlock_irqrestore(&busiest->lock,
9014 flags);
9015 env.flags |= LBF_ALL_PINNED;
9016 goto out_one_pinned;
9017 }
9018
9019
9020
9021
9022
9023
9024 if (!busiest->active_balance) {
9025 busiest->active_balance = 1;
9026 busiest->push_cpu = this_cpu;
9027 active_balance = 1;
9028 }
9029 raw_spin_unlock_irqrestore(&busiest->lock, flags);
9030
9031 if (active_balance) {
9032 stop_one_cpu_nowait(cpu_of(busiest),
9033 active_load_balance_cpu_stop, busiest,
9034 &busiest->active_balance_work);
9035 }
9036
9037
9038 sd->nr_balance_failed = sd->cache_nice_tries+1;
9039 }
9040 } else
9041 sd->nr_balance_failed = 0;
9042
9043 if (likely(!active_balance) || voluntary_active_balance(&env)) {
9044
9045 sd->balance_interval = sd->min_interval;
9046 } else {
9047
9048
9049
9050
9051
9052
9053 if (sd->balance_interval < sd->max_interval)
9054 sd->balance_interval *= 2;
9055 }
9056
9057 goto out;
9058
9059 out_balanced:
9060
9061
9062
9063
9064
9065 if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
9066 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
9067
9068 if (*group_imbalance)
9069 *group_imbalance = 0;
9070 }
9071
9072 out_all_pinned:
9073
9074
9075
9076
9077
9078 schedstat_inc(sd->lb_balanced[idle]);
9079
9080 sd->nr_balance_failed = 0;
9081
9082 out_one_pinned:
9083 ld_moved = 0;
9084
9085
9086
9087
9088
9089
9090
9091 if (env.idle == CPU_NEWLY_IDLE)
9092 goto out;
9093
9094
9095 if ((env.flags & LBF_ALL_PINNED &&
9096 sd->balance_interval < MAX_PINNED_INTERVAL) ||
9097 sd->balance_interval < sd->max_interval)
9098 sd->balance_interval *= 2;
9099 out:
9100 return ld_moved;
9101 }
9102
9103 static inline unsigned long
9104 get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
9105 {
9106 unsigned long interval = sd->balance_interval;
9107
9108 if (cpu_busy)
9109 interval *= sd->busy_factor;
9110
9111
9112 interval = msecs_to_jiffies(interval);
9113 interval = clamp(interval, 1UL, max_load_balance_interval);
9114
9115 return interval;
9116 }
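/*
 * Illustrative, userspace-only sketch (not part of fair.c): the interval
 * scaling done by get_sd_balance_interval() above. Busy CPUs stretch the
 * base rebalance period by sd->busy_factor, the result is converted to
 * jiffies and clamped. HZ=250, the rounding-up conversion and the example
 * parameters are assumptions chosen for the demo.
 */
#include <stdio.h>

#define DEMO_HZ 250UL

static unsigned long demo_msecs_to_jiffies(unsigned long ms)
{
	return (ms * DEMO_HZ + 999) / 1000;	/* round up */
}

static unsigned long demo_balance_interval(unsigned long interval_ms,
					   unsigned int busy_factor,
					   int cpu_busy,
					   unsigned long max_interval)
{
	unsigned long j;

	if (cpu_busy)
		interval_ms *= busy_factor;

	j = demo_msecs_to_jiffies(interval_ms);
	if (j < 1)
		return 1;
	return j > max_interval ? max_interval : j;
}

int main(void)
{
	/* 8 ms base interval, busy_factor 32, clamp at 100 jiffies */
	printf("%lu\n", demo_balance_interval(8, 32, 1, 100)); /* prints 64 */
	printf("%lu\n", demo_balance_interval(8, 32, 0, 100)); /* prints 2  */
	return 0;
}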
9117
9118 static inline void
9119 update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
9120 {
9121 unsigned long interval, next;
9122
9123
9124 interval = get_sd_balance_interval(sd, 0);
9125 next = sd->last_balance + interval;
9126
9127 if (time_after(*next_balance, next))
9128 *next_balance = next;
9129 }
9130
9131 /*
9132  * active_load_balance_cpu_stop is run by the CPU stopper. It pushes a
9133  * running task off the busiest CPU onto an idle CPU. It requires at
9134  * least one task to be running on each physical CPU where possible,
9135  * and avoids physical/logical imbalances.
9136  */
9137 static int active_load_balance_cpu_stop(void *data)
9138 {
9139 struct rq *busiest_rq = data;
9140 int busiest_cpu = cpu_of(busiest_rq);
9141 int target_cpu = busiest_rq->push_cpu;
9142 struct rq *target_rq = cpu_rq(target_cpu);
9143 struct sched_domain *sd;
9144 struct task_struct *p = NULL;
9145 struct rq_flags rf;
9146
9147 rq_lock_irq(busiest_rq, &rf);
9148
9149
9150
9151
9152
9153 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
9154 goto out_unlock;
9155
9156
9157 if (unlikely(busiest_cpu != smp_processor_id() ||
9158 !busiest_rq->active_balance))
9159 goto out_unlock;
9160
9161
9162 if (busiest_rq->nr_running <= 1)
9163 goto out_unlock;
9164
9165
9166
9167
9168
9169
9170 BUG_ON(busiest_rq == target_rq);
9171
9172
9173 rcu_read_lock();
9174 for_each_domain(target_cpu, sd) {
9175 if ((sd->flags & SD_LOAD_BALANCE) &&
9176 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
9177 break;
9178 }
9179
9180 if (likely(sd)) {
9181 struct lb_env env = {
9182 .sd = sd,
9183 .dst_cpu = target_cpu,
9184 .dst_rq = target_rq,
9185 .src_cpu = busiest_rq->cpu,
9186 .src_rq = busiest_rq,
9187 .idle = CPU_IDLE,
9188
9189
9190
9191
9192
9193
9194 .flags = LBF_DST_PINNED,
9195 };
9196
9197 schedstat_inc(sd->alb_count);
9198 update_rq_clock(busiest_rq);
9199
9200 p = detach_one_task(&env);
9201 if (p) {
9202 schedstat_inc(sd->alb_pushed);
9203
9204 sd->nr_balance_failed = 0;
9205 } else {
9206 schedstat_inc(sd->alb_failed);
9207 }
9208 }
9209 rcu_read_unlock();
9210 out_unlock:
9211 busiest_rq->active_balance = 0;
9212 rq_unlock(busiest_rq, &rf);
9213
9214 if (p)
9215 attach_one_task(target_rq, p);
9216
9217 local_irq_enable();
9218
9219 return 0;
9220 }
9221
9222 static DEFINE_SPINLOCK(balancing);
9223
9224
9225
9226
9227
9228 void update_max_interval(void)
9229 {
9230 max_load_balance_interval = HZ*num_online_cpus()/10;
9231 }
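/*
 * Illustrative, userspace-only sketch (not part of fair.c): the formula in
 * update_max_interval() above caps the balance interval at 0.1 s worth of
 * jiffies per online CPU, so larger machines are allowed to rebalance less
 * often. HZ=250 below is an assumption chosen for the demo.
 */
#include <stdio.h>

#define DEMO_HZ 250UL

static unsigned long demo_max_interval(unsigned int online_cpus)
{
	return DEMO_HZ * online_cpus / 10;
}

int main(void)
{
	printf("%lu\n", demo_max_interval(4));  /* 100 jiffies = 0.4 s  */
	printf("%lu\n", demo_max_interval(64)); /* 1600 jiffies = 6.4 s */
	return 0;
}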
9232
9233 /*
9234  * rebalance_domains() checks each scheduling domain of this CPU to see
9235  * if it is due to be balanced, and initiates a balancing operation if
9236  * so. Balancing parameters (intervals, busy factors) come from the
9237  * sched_domain itself.
9238  */
9239 static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
9240 {
9241 int continue_balancing = 1;
9242 int cpu = rq->cpu;
9243 unsigned long interval;
9244 struct sched_domain *sd;
9245
9246 unsigned long next_balance = jiffies + 60*HZ;
9247 int update_next_balance = 0;
9248 int need_serialize, need_decay = 0;
9249 u64 max_cost = 0;
9250
9251 rcu_read_lock();
9252 for_each_domain(cpu, sd) {
9253
9254
9255
9256
9257 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
9258 sd->max_newidle_lb_cost =
9259 (sd->max_newidle_lb_cost * 253) / 256;
9260 sd->next_decay_max_lb_cost = jiffies + HZ;
9261 need_decay = 1;
9262 }
9263 max_cost += sd->max_newidle_lb_cost;
9264
9265 if (!(sd->flags & SD_LOAD_BALANCE))
9266 continue;
9267
9268
9269
9270
9271
9272
9273 if (!continue_balancing) {
9274 if (need_decay)
9275 continue;
9276 break;
9277 }
9278
9279 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
9280
9281 need_serialize = sd->flags & SD_SERIALIZE;
9282 if (need_serialize) {
9283 if (!spin_trylock(&balancing))
9284 goto out;
9285 }
9286
9287 if (time_after_eq(jiffies, sd->last_balance + interval)) {
9288 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
9289
9290
9291
9292
9293
9294 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
9295 }
9296 sd->last_balance = jiffies;
9297 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
9298 }
9299 if (need_serialize)
9300 spin_unlock(&balancing);
9301 out:
9302 if (time_after(next_balance, sd->last_balance + interval)) {
9303 next_balance = sd->last_balance + interval;
9304 update_next_balance = 1;
9305 }
9306 }
9307 if (need_decay) {
9308
9309
9310
9311
9312 rq->max_idle_balance_cost =
9313 max((u64)sysctl_sched_migration_cost, max_cost);
9314 }
9315 rcu_read_unlock();
9316
9317
9318
9319
9320
9321
9322 if (likely(update_next_balance)) {
9323 rq->next_balance = next_balance;
9324
9325 #ifdef CONFIG_NO_HZ_COMMON
9326
9327
9328
9329
9330
9331
9332
9333
9334 if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
9335 nohz.next_balance = rq->next_balance;
9336 #endif
9337 }
9338 }
9339
9340 static inline int on_null_domain(struct rq *rq)
9341 {
9342 return unlikely(!rcu_dereference_sched(rq->sd));
9343 }
9344
9345 #ifdef CONFIG_NO_HZ_COMMON
9346
9347 /*
9348  * idle load balancing details
9349  * - When one of the busy CPUs notices that there may be an idle
9350  *   rebalancing needed, it kicks an idle CPU (the ilb CPU), which then
9351  *   does idle load balancing on behalf of all the idle CPUs.
9352  * - Only housekeeping (HK_FLAG_MISC) CPUs are considered for this
9353  *   role; find_new_ilb() picks the first idle one.
9354  */
9355 static inline int find_new_ilb(void)
9356 {
9357 int ilb;
9358
9359 for_each_cpu_and(ilb, nohz.idle_cpus_mask,
9360 housekeeping_cpumask(HK_FLAG_MISC)) {
9361 if (idle_cpu(ilb))
9362 return ilb;
9363 }
9364
9365 return nr_cpu_ids;
9366 }
9367
9368 /*
9369  * Kick a CPU to do the nohz balancing, if it is time for it: set the
9370  * request flags for the chosen ilb CPU and send it a reschedule IPI.
9371  */
9372 static void kick_ilb(unsigned int flags)
9373 {
9374 int ilb_cpu;
9375
9376 nohz.next_balance++;
9377
9378 ilb_cpu = find_new_ilb();
9379
9380 if (ilb_cpu >= nr_cpu_ids)
9381 return;
9382
9383 flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
9384 if (flags & NOHZ_KICK_MASK)
9385 return;
9386
9387
9388
9389
9390
9391
9392
9393 smp_send_reschedule(ilb_cpu);
9394 }
9395
9396 /*
9397  * Current decision point for kicking the idle load balancer in the
9398  * presence of idle CPUs in the system.
9399  */
9400 static void nohz_balancer_kick(struct rq *rq)
9401 {
9402 unsigned long now = jiffies;
9403 struct sched_domain_shared *sds;
9404 struct sched_domain *sd;
9405 int nr_busy, i, cpu = rq->cpu;
9406 unsigned int flags = 0;
9407
9408 if (unlikely(rq->idle_balance))
9409 return;
9410
9411
9412
9413
9414
9415 nohz_balance_exit_idle(rq);
9416
9417
9418
9419
9420
9421 if (likely(!atomic_read(&nohz.nr_cpus)))
9422 return;
9423
9424 if (READ_ONCE(nohz.has_blocked) &&
9425 time_after(now, READ_ONCE(nohz.next_blocked)))
9426 flags = NOHZ_STATS_KICK;
9427
9428 if (time_before(now, nohz.next_balance))
9429 goto out;
9430
9431 if (rq->nr_running >= 2) {
9432 flags = NOHZ_KICK_MASK;
9433 goto out;
9434 }
9435
9436 rcu_read_lock();
9437
9438 sd = rcu_dereference(rq->sd);
9439 if (sd) {
9440
9441
9442
9443
9444
9445 if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
9446 flags = NOHZ_KICK_MASK;
9447 goto unlock;
9448 }
9449 }
9450
9451 sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
9452 if (sd) {
9453
9454
9455
9456
9457
9458 for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
9459 if (sched_asym_prefer(i, cpu)) {
9460 flags = NOHZ_KICK_MASK;
9461 goto unlock;
9462 }
9463 }
9464 }
9465
9466 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
9467 if (sd) {
9468
9469
9470
9471
9472 if (check_misfit_status(rq, sd)) {
9473 flags = NOHZ_KICK_MASK;
9474 goto unlock;
9475 }
9476
9477
9478
9479
9480
9481
9482
9483
9484 goto unlock;
9485 }
9486
9487 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
9488 if (sds) {
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498 nr_busy = atomic_read(&sds->nr_busy_cpus);
9499 if (nr_busy > 1) {
9500 flags = NOHZ_KICK_MASK;
9501 goto unlock;
9502 }
9503 }
9504 unlock:
9505 rcu_read_unlock();
9506 out:
9507 if (flags)
9508 kick_ilb(flags);
9509 }
9510
9511 static void set_cpu_sd_state_busy(int cpu)
9512 {
9513 struct sched_domain *sd;
9514
9515 rcu_read_lock();
9516 sd = rcu_dereference(per_cpu(sd_llc, cpu));
9517
9518 if (!sd || !sd->nohz_idle)
9519 goto unlock;
9520 sd->nohz_idle = 0;
9521
9522 atomic_inc(&sd->shared->nr_busy_cpus);
9523 unlock:
9524 rcu_read_unlock();
9525 }
9526
9527 void nohz_balance_exit_idle(struct rq *rq)
9528 {
9529 SCHED_WARN_ON(rq != this_rq());
9530
9531 if (likely(!rq->nohz_tick_stopped))
9532 return;
9533
9534 rq->nohz_tick_stopped = 0;
9535 cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
9536 atomic_dec(&nohz.nr_cpus);
9537
9538 set_cpu_sd_state_busy(rq->cpu);
9539 }
9540
9541 static void set_cpu_sd_state_idle(int cpu)
9542 {
9543 struct sched_domain *sd;
9544
9545 rcu_read_lock();
9546 sd = rcu_dereference(per_cpu(sd_llc, cpu));
9547
9548 if (!sd || sd->nohz_idle)
9549 goto unlock;
9550 sd->nohz_idle = 1;
9551
9552 atomic_dec(&sd->shared->nr_busy_cpus);
9553 unlock:
9554 rcu_read_unlock();
9555 }
9556
9557 /*
9558  * This routine records that the CPU is going idle with the tick
9559  * stopped. This info will be used when performing idle load balancing.
9560  */
9561 void nohz_balance_enter_idle(int cpu)
9562 {
9563 struct rq *rq = cpu_rq(cpu);
9564
9565 SCHED_WARN_ON(cpu != smp_processor_id());
9566
9567
9568 if (!cpu_active(cpu))
9569 return;
9570
9571
9572 if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
9573 return;
9574
9575
9576
9577
9578
9579
9580 rq->has_blocked_load = 1;
9581
9582
9583
9584
9585
9586
9587
9588 if (rq->nohz_tick_stopped)
9589 goto out;
9590
9591
9592 if (on_null_domain(rq))
9593 return;
9594
9595 rq->nohz_tick_stopped = 1;
9596
9597 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
9598 atomic_inc(&nohz.nr_cpus);
9599
9600
9601
9602
9603
9604
9605 smp_mb__after_atomic();
9606
9607 set_cpu_sd_state_idle(cpu);
9608
9609 out:
9610
9611
9612
9613
9614 WRITE_ONCE(nohz.has_blocked, 1);
9615 }
9616
9617
9618 /*
9619  * Internal function that runs load balance for all idle CPUs. The load
9620  * balance can be required for the whole system (NOHZ_BALANCE_KICK) or
9621  * only for updating blocked load (NOHZ_STATS_KICK). Returns true if
9622  * the work completed without being aborted by need_resched().
9623  */
9624 static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
9625 enum cpu_idle_type idle)
9626 {
9627
9628 unsigned long now = jiffies;
9629 unsigned long next_balance = now + 60*HZ;
9630 bool has_blocked_load = false;
9631 int update_next_balance = 0;
9632 int this_cpu = this_rq->cpu;
9633 int balance_cpu;
9634 int ret = false;
9635 struct rq *rq;
9636
9637 SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
9638
9639
9640
9641
9642
9643
9644
9645
9646
9647 WRITE_ONCE(nohz.has_blocked, 0);
9648
9649
9650
9651
9652
9653 smp_mb();
9654
9655 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
9656 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
9657 continue;
9658
9659
9660
9661
9662
9663
9664 if (need_resched()) {
9665 has_blocked_load = true;
9666 goto abort;
9667 }
9668
9669 rq = cpu_rq(balance_cpu);
9670
9671 has_blocked_load |= update_nohz_stats(rq, true);
9672
9673
9674
9675
9676
9677 if (time_after_eq(jiffies, rq->next_balance)) {
9678 struct rq_flags rf;
9679
9680 rq_lock_irqsave(rq, &rf);
9681 update_rq_clock(rq);
9682 rq_unlock_irqrestore(rq, &rf);
9683
9684 if (flags & NOHZ_BALANCE_KICK)
9685 rebalance_domains(rq, CPU_IDLE);
9686 }
9687
9688 if (time_after(next_balance, rq->next_balance)) {
9689 next_balance = rq->next_balance;
9690 update_next_balance = 1;
9691 }
9692 }
9693
9694
9695 if (idle != CPU_NEWLY_IDLE) {
9696 update_blocked_averages(this_cpu);
9697 has_blocked_load |= this_rq->has_blocked_load;
9698 }
9699
9700 if (flags & NOHZ_BALANCE_KICK)
9701 rebalance_domains(this_rq, CPU_IDLE);
9702
9703 WRITE_ONCE(nohz.next_blocked,
9704 now + msecs_to_jiffies(LOAD_AVG_PERIOD));
9705
9706
9707 ret = true;
9708
9709 abort:
9710
9711 if (has_blocked_load)
9712 WRITE_ONCE(nohz.has_blocked, 1);
9713
9714
9715
9716
9717
9718
9719 if (likely(update_next_balance))
9720 nohz.next_balance = next_balance;
9721
9722 return ret;
9723 }
9724
9725 /*
9726  * In the CONFIG_NO_HZ_COMMON case, the idle-balance kickee does the
9727  * idle load balancing for all the idle CPUs.
9728  */
9729 static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9730 {
9731 int this_cpu = this_rq->cpu;
9732 unsigned int flags;
9733
9734 if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
9735 return false;
9736
9737 if (idle != CPU_IDLE) {
9738 atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
9739 return false;
9740 }
9741
9742
9743 flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
9744 if (!(flags & NOHZ_KICK_MASK))
9745 return false;
9746
9747 _nohz_idle_balance(this_rq, flags, idle);
9748
9749 return true;
9750 }
9751
9752 static void nohz_newidle_balance(struct rq *this_rq)
9753 {
9754 int this_cpu = this_rq->cpu;
9755
9756
9757
9758
9759
9760 if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
9761 return;
9762
9763
9764 if (this_rq->avg_idle < sysctl_sched_migration_cost)
9765 return;
9766
9767
9768 if (!READ_ONCE(nohz.has_blocked) ||
9769 time_before(jiffies, READ_ONCE(nohz.next_blocked)))
9770 return;
9771
9772 raw_spin_unlock(&this_rq->lock);
9773
9774
9775
9776
9777
9778
9779 if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
9780 kick_ilb(NOHZ_STATS_KICK);
9781 raw_spin_lock(&this_rq->lock);
9782 }
9783
9784 #else
9785 static inline void nohz_balancer_kick(struct rq *rq) { }
9786
9787 static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9788 {
9789 return false;
9790 }
9791
9792 static inline void nohz_newidle_balance(struct rq *this_rq) { }
9793 #endif
9794
9795 /*
9796  * newidle_balance is called by schedule() if this_cpu is about to
9797  * become idle. It attempts to pull tasks from other CPUs.
9798  */
9799 int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
9800 {
9801 unsigned long next_balance = jiffies + HZ;
9802 int this_cpu = this_rq->cpu;
9803 struct sched_domain *sd;
9804 int pulled_task = 0;
9805 u64 curr_cost = 0;
9806
9807 update_misfit_status(NULL, this_rq);
9808
9809
9810
9811
9812 this_rq->idle_stamp = rq_clock(this_rq);
9813
9814
9815
9816
9817 if (!cpu_active(this_cpu))
9818 return 0;
9819
9820
9821
9822
9823
9824
9825
9826 rq_unpin_lock(this_rq, rf);
9827
9828 if (this_rq->avg_idle < sysctl_sched_migration_cost ||
9829 !READ_ONCE(this_rq->rd->overload)) {
9830
9831 rcu_read_lock();
9832 sd = rcu_dereference_check_sched_domain(this_rq->sd);
9833 if (sd)
9834 update_next_balance(sd, &next_balance);
9835 rcu_read_unlock();
9836
9837 nohz_newidle_balance(this_rq);
9838
9839 goto out;
9840 }
9841
9842 raw_spin_unlock(&this_rq->lock);
9843
9844 update_blocked_averages(this_cpu);
9845 rcu_read_lock();
9846 for_each_domain(this_cpu, sd) {
9847 int continue_balancing = 1;
9848 u64 t0, domain_cost;
9849
9850 if (!(sd->flags & SD_LOAD_BALANCE))
9851 continue;
9852
9853 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
9854 update_next_balance(sd, &next_balance);
9855 break;
9856 }
9857
9858 if (sd->flags & SD_BALANCE_NEWIDLE) {
9859 t0 = sched_clock_cpu(this_cpu);
9860
9861 pulled_task = load_balance(this_cpu, this_rq,
9862 sd, CPU_NEWLY_IDLE,
9863 &continue_balancing);
9864
9865 domain_cost = sched_clock_cpu(this_cpu) - t0;
9866 if (domain_cost > sd->max_newidle_lb_cost)
9867 sd->max_newidle_lb_cost = domain_cost;
9868
9869 curr_cost += domain_cost;
9870 }
9871
9872 update_next_balance(sd, &next_balance);
9873
9874
9875
9876
9877
9878 if (pulled_task || this_rq->nr_running > 0)
9879 break;
9880 }
9881 rcu_read_unlock();
9882
9883 raw_spin_lock(&this_rq->lock);
9884
9885 if (curr_cost > this_rq->max_idle_balance_cost)
9886 this_rq->max_idle_balance_cost = curr_cost;
9887
9888 out:
9889
9890
9891
9892
9893
9894 if (this_rq->cfs.h_nr_running && !pulled_task)
9895 pulled_task = 1;
9896
9897
9898 if (time_after(this_rq->next_balance, next_balance))
9899 this_rq->next_balance = next_balance;
9900
9901
9902 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
9903 pulled_task = -1;
9904
9905 if (pulled_task)
9906 this_rq->idle_stamp = 0;
9907
9908 rq_repin_lock(this_rq, rf);
9909
9910 return pulled_task;
9911 }
9912
9913 /*
9914  * run_rebalance_domains is triggered when needed from the scheduler
9915  * tick. Also triggered for NOHZ idle balancing from the kicked ilb CPU.
9916  */
9917 static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
9918 {
9919 struct rq *this_rq = this_rq();
9920 enum cpu_idle_type idle = this_rq->idle_balance ?
9921 CPU_IDLE : CPU_NOT_IDLE;
9922
9923
9924
9925
9926
9927
9928
9929
9930
9931 if (nohz_idle_balance(this_rq, idle))
9932 return;
9933
9934
9935 update_blocked_averages(this_rq->cpu);
9936 rebalance_domains(this_rq, idle);
9937 }
9938
9939 /*
9940  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
9941  */
9942 void trigger_load_balance(struct rq *rq)
9943 {
9944
9945 if (unlikely(on_null_domain(rq)))
9946 return;
9947
9948 if (time_after_eq(jiffies, rq->next_balance))
9949 raise_softirq(SCHED_SOFTIRQ);
9950
9951 nohz_balancer_kick(rq);
9952 }
9953
9954 static void rq_online_fair(struct rq *rq)
9955 {
9956 update_sysctl();
9957
9958 update_runtime_enabled(rq);
9959 }
9960
9961 static void rq_offline_fair(struct rq *rq)
9962 {
9963 update_sysctl();
9964
9965
9966 unthrottle_offline_cfs_rqs(rq);
9967 }
9968
9969 #endif
9970
9971
9972 /*
9973  * Scheduler tick hitting a task of our scheduling class: run
9974  * entity_tick() on the task's entity and each parent group entity,
9975  * drive NUMA balancing scans when enabled, and refresh the misfit and
9976  * overutilized status of the runqueue.
9977  */
9978
9979 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
9980 {
9981 struct cfs_rq *cfs_rq;
9982 struct sched_entity *se = &curr->se;
9983
9984 for_each_sched_entity(se) {
9985 cfs_rq = cfs_rq_of(se);
9986 entity_tick(cfs_rq, se, queued);
9987 }
9988
9989 if (static_branch_unlikely(&sched_numa_balancing))
9990 task_tick_numa(rq, curr);
9991
9992 update_misfit_status(curr, rq);
9993 update_overutilized_status(task_rq(curr));
9994 }
9995
9996 /*
9997  * Called on fork with the child task as argument from the parent's
9998  * context - the child is not yet on the tasklist and preemption is
9999  * disabled.
10000  */
10001 static void task_fork_fair(struct task_struct *p)
10002 {
10003 struct cfs_rq *cfs_rq;
10004 struct sched_entity *se = &p->se, *curr;
10005 struct rq *rq = this_rq();
10006 struct rq_flags rf;
10007
10008 rq_lock(rq, &rf);
10009 update_rq_clock(rq);
10010
10011 cfs_rq = task_cfs_rq(current);
10012 curr = cfs_rq->curr;
10013 if (curr) {
10014 update_curr(cfs_rq);
10015 se->vruntime = curr->vruntime;
10016 }
10017 place_entity(cfs_rq, se, 1);
10018
10019 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
10020
10021
10022
10023
10024 swap(curr->vruntime, se->vruntime);
10025 resched_curr(rq);
10026 }
10027
10028 se->vruntime -= cfs_rq->min_vruntime;
10029 rq_unlock(rq, &rf);
10030 }
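/*
 * Illustrative, userspace-only sketch (not part of fair.c): why
 * task_fork_fair() above subtracts min_vruntime at the end. The child may
 * be enqueued on a different CPU whose cfs_rq has its own min_vruntime, so
 * only the offset from the source queue is kept; the enqueue path adds the
 * destination queue's min_vruntime back. All numbers are made-up demo values.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long parent_vruntime = 1000500;
	unsigned long long src_min = 1000000; /* parent CPU's min_vruntime   */
	unsigned long long dst_min = 5000000; /* CPU the child is enqueued on */

	/* At fork: keep only the offset above the source queue's minimum. */
	unsigned long long child_offset = parent_vruntime - src_min; /* 500 */

	/* At enqueue: rebase the offset onto the destination queue. */
	unsigned long long child_vruntime = dst_min + child_offset;

	printf("%llu\n", child_vruntime); /* prints 5000500 */
	return 0;
}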
10031
10032 /*
10033  * Priority of the task has changed. Check to see if we preempt
10034  * the current task.
10035  */
10036 static void
10037 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
10038 {
10039 if (!task_on_rq_queued(p))
10040 return;
10041
10042
10043
10044
10045
10046
10047 if (rq->curr == p) {
10048 if (p->prio > oldprio)
10049 resched_curr(rq);
10050 } else
10051 check_preempt_curr(rq, p, 0);
10052 }
10053
10054 static inline bool vruntime_normalized(struct task_struct *p)
10055 {
10056 struct sched_entity *se = &p->se;
10057
10058
10059
10060
10061
10062
10063 if (p->on_rq)
10064 return true;
10065
10066
10067
10068
10069
10070
10071
10072
10073
10074
10075 if (!se->sum_exec_runtime ||
10076 (p->state == TASK_WAKING && p->sched_remote_wakeup))
10077 return true;
10078
10079 return false;
10080 }
10081
10082 #ifdef CONFIG_FAIR_GROUP_SCHED
10083
10084
10085
10086
10087 static void propagate_entity_cfs_rq(struct sched_entity *se)
10088 {
10089 struct cfs_rq *cfs_rq;
10090
10091
10092 se = se->parent;
10093
10094 for_each_sched_entity(se) {
10095 cfs_rq = cfs_rq_of(se);
10096
10097 if (cfs_rq_throttled(cfs_rq))
10098 break;
10099
10100 update_load_avg(cfs_rq, se, UPDATE_TG);
10101 }
10102 }
10103 #else
10104 static void propagate_entity_cfs_rq(struct sched_entity *se) { }
10105 #endif
10106
10107 static void detach_entity_cfs_rq(struct sched_entity *se)
10108 {
10109 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10110
10111
10112 update_load_avg(cfs_rq, se, 0);
10113 detach_entity_load_avg(cfs_rq, se);
10114 update_tg_load_avg(cfs_rq, false);
10115 propagate_entity_cfs_rq(se);
10116 }
10117
10118 static void attach_entity_cfs_rq(struct sched_entity *se)
10119 {
10120 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10121
10122 #ifdef CONFIG_FAIR_GROUP_SCHED
10123
10124
10125
10126
10127 se->depth = se->parent ? se->parent->depth + 1 : 0;
10128 #endif
10129
10130
10131 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
10132 attach_entity_load_avg(cfs_rq, se, 0);
10133 update_tg_load_avg(cfs_rq, false);
10134 propagate_entity_cfs_rq(se);
10135 }
10136
10137 static void detach_task_cfs_rq(struct task_struct *p)
10138 {
10139 struct sched_entity *se = &p->se;
10140 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10141
10142 if (!vruntime_normalized(p)) {
10143
10144
10145
10146
10147 place_entity(cfs_rq, se, 0);
10148 se->vruntime -= cfs_rq->min_vruntime;
10149 }
10150
10151 detach_entity_cfs_rq(se);
10152 }
10153
10154 static void attach_task_cfs_rq(struct task_struct *p)
10155 {
10156 struct sched_entity *se = &p->se;
10157 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10158
10159 attach_entity_cfs_rq(se);
10160
10161 if (!vruntime_normalized(p))
10162 se->vruntime += cfs_rq->min_vruntime;
10163 }
10164
10165 static void switched_from_fair(struct rq *rq, struct task_struct *p)
10166 {
10167 detach_task_cfs_rq(p);
10168 }
10169
10170 static void switched_to_fair(struct rq *rq, struct task_struct *p)
10171 {
10172 attach_task_cfs_rq(p);
10173
10174 if (task_on_rq_queued(p)) {
10175
10176
10177
10178
10179
10180 if (rq->curr == p)
10181 resched_curr(rq);
10182 else
10183 check_preempt_curr(rq, p, 0);
10184 }
10185 }
10186
10187 /*
10188  * Account for a task changing its policy or group. This routine is
10189  * mostly called to set cfs_rq->curr field when a task migrates between
10190  * groups/classes.
10191  */
10192 static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
10193 {
10194 struct sched_entity *se = &p->se;
10195
10196 #ifdef CONFIG_SMP
10197 if (task_on_rq_queued(p)) {
10198
10199
10200
10201
10202 list_move(&se->group_node, &rq->cfs_tasks);
10203 }
10204 #endif
10205
10206 for_each_sched_entity(se) {
10207 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10208
10209 set_next_entity(cfs_rq, se);
10210
10211 account_cfs_rq_runtime(cfs_rq, 0);
10212 }
10213 }
10214
10215 void init_cfs_rq(struct cfs_rq *cfs_rq)
10216 {
10217 cfs_rq->tasks_timeline = RB_ROOT_CACHED;
10218 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
10219 #ifndef CONFIG_64BIT
10220 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
10221 #endif
10222 #ifdef CONFIG_SMP
10223 raw_spin_lock_init(&cfs_rq->removed.lock);
10224 #endif
10225 }
10226
10227 #ifdef CONFIG_FAIR_GROUP_SCHED
10228 static void task_set_group_fair(struct task_struct *p)
10229 {
10230 struct sched_entity *se = &p->se;
10231
10232 set_task_rq(p, task_cpu(p));
10233 se->depth = se->parent ? se->parent->depth + 1 : 0;
10234 }
10235
10236 static void task_move_group_fair(struct task_struct *p)
10237 {
10238 detach_task_cfs_rq(p);
10239 set_task_rq(p, task_cpu(p));
10240
10241 #ifdef CONFIG_SMP
10242
10243 p->se.avg.last_update_time = 0;
10244 #endif
10245 attach_task_cfs_rq(p);
10246 }
10247
10248 static void task_change_group_fair(struct task_struct *p, int type)
10249 {
10250 switch (type) {
10251 case TASK_SET_GROUP:
10252 task_set_group_fair(p);
10253 break;
10254
10255 case TASK_MOVE_GROUP:
10256 task_move_group_fair(p);
10257 break;
10258 }
10259 }
10260
10261 void free_fair_sched_group(struct task_group *tg)
10262 {
10263 int i;
10264
10265 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
10266
10267 for_each_possible_cpu(i) {
10268 if (tg->cfs_rq)
10269 kfree(tg->cfs_rq[i]);
10270 if (tg->se)
10271 kfree(tg->se[i]);
10272 }
10273
10274 kfree(tg->cfs_rq);
10275 kfree(tg->se);
10276 }
10277
10278 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
10279 {
10280 struct sched_entity *se;
10281 struct cfs_rq *cfs_rq;
10282 int i;
10283
10284 tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
10285 if (!tg->cfs_rq)
10286 goto err;
10287 tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
10288 if (!tg->se)
10289 goto err;
10290
10291 tg->shares = NICE_0_LOAD;
10292
10293 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
10294
10295 for_each_possible_cpu(i) {
10296 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
10297 GFP_KERNEL, cpu_to_node(i));
10298 if (!cfs_rq)
10299 goto err;
10300
10301 se = kzalloc_node(sizeof(struct sched_entity),
10302 GFP_KERNEL, cpu_to_node(i));
10303 if (!se)
10304 goto err_free_rq;
10305
10306 init_cfs_rq(cfs_rq);
10307 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
10308 init_entity_runnable_average(se);
10309 }
10310
10311 return 1;
10312
10313 err_free_rq:
10314 kfree(cfs_rq);
10315 err:
10316 return 0;
10317 }
10318
10319 void online_fair_sched_group(struct task_group *tg)
10320 {
10321 struct sched_entity *se;
10322 struct rq_flags rf;
10323 struct rq *rq;
10324 int i;
10325
10326 for_each_possible_cpu(i) {
10327 rq = cpu_rq(i);
10328 se = tg->se[i];
10329 rq_lock_irq(rq, &rf);
10330 update_rq_clock(rq);
10331 attach_entity_cfs_rq(se);
10332 sync_throttle(tg, i);
10333 rq_unlock_irq(rq, &rf);
10334 }
10335 }
10336
10337 void unregister_fair_sched_group(struct task_group *tg)
10338 {
10339 unsigned long flags;
10340 struct rq *rq;
10341 int cpu;
10342
10343 for_each_possible_cpu(cpu) {
10344 if (tg->se[cpu])
10345 remove_entity_load_avg(tg->se[cpu]);
10346
10347
10348
10349
10350
10351 if (!tg->cfs_rq[cpu]->on_list)
10352 continue;
10353
10354 rq = cpu_rq(cpu);
10355
10356 raw_spin_lock_irqsave(&rq->lock, flags);
10357 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
10358 raw_spin_unlock_irqrestore(&rq->lock, flags);
10359 }
10360 }
10361
10362 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
10363 struct sched_entity *se, int cpu,
10364 struct sched_entity *parent)
10365 {
10366 struct rq *rq = cpu_rq(cpu);
10367
10368 cfs_rq->tg = tg;
10369 cfs_rq->rq = rq;
10370 init_cfs_rq_runtime(cfs_rq);
10371
10372 tg->cfs_rq[cpu] = cfs_rq;
10373 tg->se[cpu] = se;
10374
10375
10376 if (!se)
10377 return;
10378
10379 if (!parent) {
10380 se->cfs_rq = &rq->cfs;
10381 se->depth = 0;
10382 } else {
10383 se->cfs_rq = parent->my_q;
10384 se->depth = parent->depth + 1;
10385 }
10386
10387 se->my_q = cfs_rq;
10388
10389 update_load_set(&se->load, NICE_0_LOAD);
10390 se->parent = parent;
10391 }
10392
10393 static DEFINE_MUTEX(shares_mutex);
10394
10395 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
10396 {
10397 int i;
10398
10399
10400
10401
10402 if (!tg->se[0])
10403 return -EINVAL;
10404
10405 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
10406
10407 mutex_lock(&shares_mutex);
10408 if (tg->shares == shares)
10409 goto done;
10410
10411 tg->shares = shares;
10412 for_each_possible_cpu(i) {
10413 struct rq *rq = cpu_rq(i);
10414 struct sched_entity *se = tg->se[i];
10415 struct rq_flags rf;
10416
10417
10418 rq_lock_irqsave(rq, &rf);
10419 update_rq_clock(rq);
10420 for_each_sched_entity(se) {
10421 update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
10422 update_cfs_group(se);
10423 }
10424 rq_unlock_irqrestore(rq, &rf);
10425 }
10426
10427 done:
10428 mutex_unlock(&shares_mutex);
10429 return 0;
10430 }
10431 #else
10432
10433 void free_fair_sched_group(struct task_group *tg) { }
10434
10435 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
10436 {
10437 return 1;
10438 }
10439
10440 void online_fair_sched_group(struct task_group *tg) { }
10441
10442 void unregister_fair_sched_group(struct task_group *tg) { }
10443
10444 #endif
10445
10446
10447 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
10448 {
10449 struct sched_entity *se = &task->se;
10450 unsigned int rr_interval = 0;
10451
10452
10453
10454
10455
10456 if (rq->cfs.load.weight)
10457 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
10458
10459 return rr_interval;
10460 }
10461
10462 /*
10463  * All the scheduling class methods:
10464  */
10465 const struct sched_class fair_sched_class = {
10466 .next = &idle_sched_class,
10467 .enqueue_task = enqueue_task_fair,
10468 .dequeue_task = dequeue_task_fair,
10469 .yield_task = yield_task_fair,
10470 .yield_to_task = yield_to_task_fair,
10471
10472 .check_preempt_curr = check_preempt_wakeup,
10473
10474 .pick_next_task = pick_next_task_fair,
10475 .put_prev_task = put_prev_task_fair,
10476 .set_next_task = set_next_task_fair,
10477
10478 #ifdef CONFIG_SMP
10479 .balance = balance_fair,
10480 .select_task_rq = select_task_rq_fair,
10481 .migrate_task_rq = migrate_task_rq_fair,
10482
10483 .rq_online = rq_online_fair,
10484 .rq_offline = rq_offline_fair,
10485
10486 .task_dead = task_dead_fair,
10487 .set_cpus_allowed = set_cpus_allowed_common,
10488 #endif
10489
10490 .task_tick = task_tick_fair,
10491 .task_fork = task_fork_fair,
10492
10493 .prio_changed = prio_changed_fair,
10494 .switched_from = switched_from_fair,
10495 .switched_to = switched_to_fair,
10496
10497 .get_rr_interval = get_rr_interval_fair,
10498
10499 .update_curr = update_curr_fair,
10500
10501 #ifdef CONFIG_FAIR_GROUP_SCHED
10502 .task_change_group = task_change_group_fair,
10503 #endif
10504
10505 #ifdef CONFIG_UCLAMP_TASK
10506 .uclamp_enabled = 1,
10507 #endif
10508 };
10509
10510 #ifdef CONFIG_SCHED_DEBUG
10511 void print_cfs_stats(struct seq_file *m, int cpu)
10512 {
10513 struct cfs_rq *cfs_rq, *pos;
10514
10515 rcu_read_lock();
10516 for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
10517 print_cfs_rq(m, cpu, cfs_rq);
10518 rcu_read_unlock();
10519 }
10520
10521 #ifdef CONFIG_NUMA_BALANCING
10522 void show_numa_stats(struct task_struct *p, struct seq_file *m)
10523 {
10524 int node;
10525 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
10526 struct numa_group *ng;
10527
10528 rcu_read_lock();
10529 ng = rcu_dereference(p->numa_group);
10530 for_each_online_node(node) {
10531 if (p->numa_faults) {
10532 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
10533 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
10534 }
10535 if (ng) {
10536 gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)],
10537 gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
10538 }
10539 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
10540 }
10541 rcu_read_unlock();
10542 }
10543 #endif
10544 #endif
10545
10546 __init void init_sched_fair_class(void)
10547 {
10548 #ifdef CONFIG_SMP
10549 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
10550
10551 #ifdef CONFIG_NO_HZ_COMMON
10552 nohz.next_balance = jiffies;
10553 nohz.next_blocked = jiffies;
10554 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
10555 #endif
10556 #endif
10557
10558 }
10559
10560 /*
10561  * Helper functions to facilitate extracting info from tracepoints.
10562  */
10563
10564 const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
10565 {
10566 #ifdef CONFIG_SMP
10567 return cfs_rq ? &cfs_rq->avg : NULL;
10568 #else
10569 return NULL;
10570 #endif
10571 }
10572 EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
10573
10574 char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
10575 {
10576 if (!cfs_rq) {
10577 if (str)
10578 strlcpy(str, "(null)", len);
10579 else
10580 return NULL;
10581 }
10582
10583 cfs_rq_tg_path(cfs_rq, str, len);
10584 return str;
10585 }
10586 EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
10587
10588 int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
10589 {
10590 return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
10591 }
10592 EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
10593
10594 const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
10595 {
10596 #ifdef CONFIG_SMP
10597 return rq ? &rq->avg_rt : NULL;
10598 #else
10599 return NULL;
10600 #endif
10601 }
10602 EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
10603
10604 const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
10605 {
10606 #ifdef CONFIG_SMP
10607 return rq ? &rq->avg_dl : NULL;
10608 #else
10609 return NULL;
10610 #endif
10611 }
10612 EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
10613
10614 const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
10615 {
10616 #if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
10617 return rq ? &rq->avg_irq : NULL;
10618 #else
10619 return NULL;
10620 #endif
10621 }
10622 EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
10623
10624 int sched_trace_rq_cpu(struct rq *rq)
10625 {
10626 return rq ? cpu_of(rq) : -1;
10627 }
10628 EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
10629
10630 const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
10631 {
10632 #ifdef CONFIG_SMP
10633 return rd ? rd->span : NULL;
10634 #else
10635 return NULL;
10636 #endif
10637 }
10638 EXPORT_SYMBOL_GPL(sched_trace_rd_span);