This source file includes the following definitions:
- sched_debug_setup
- sched_debug
- sched_domain_debug_one
- sched_domain_debug
- sched_debug
- sd_degenerate
- sd_parent_degenerate
- sched_energy_aware_handler
- free_pd
- find_pd
- pd_init
- perf_domain_debug
- destroy_perf_domain_rcu
- sched_energy_set
- build_perf_domains
- free_pd
- free_rootdomain
- rq_attach_root
- sched_get_rd
- sched_put_rd
- init_rootdomain
- init_defrootdomain
- alloc_rootdomain
- free_sched_groups
- destroy_sched_domain
- destroy_sched_domains_rcu
- destroy_sched_domains
- update_top_cache_domain
- cpu_attach_domain
- group_balance_cpu
- build_balance_mask
- build_group_from_child_sched_domain
- init_overlap_sched_group
- build_overlap_sched_groups
- get_group
- build_sched_groups
- init_sched_groups_capacity
- setup_relax_domain_level
- set_domain_attribute
- __free_domain_allocs
- __visit_domain_allocation_hell
- claim_allocations
- sd_init
- set_sched_topology
- sd_numa_mask
- sched_numa_warn
- find_numa_distance
- init_numa_topology_type
- sched_init_numa
- sched_domains_numa_masks_set
- sched_domains_numa_masks_clear
- sched_numa_find_closest
- __sdt_alloc
- __sdt_free
- build_sched_domain
- topology_span_sane
- asym_cpu_capacity_level
- build_sched_domains
- arch_update_cpu_topology
- alloc_sched_domains
- free_sched_domains
- sched_init_domains
- detach_destroy_domains
- dattrs_equal
- partition_sched_domains_locked
- partition_sched_domains
1
2
3
4
5 #include "sched.h"
6
7 DEFINE_MUTEX(sched_domains_mutex);
8
9
10 static cpumask_var_t sched_domains_tmpmask;
11 static cpumask_var_t sched_domains_tmpmask2;
12
13 #ifdef CONFIG_SCHED_DEBUG
14
15 static int __init sched_debug_setup(char *str)
16 {
17 sched_debug_enabled = true;
18
19 return 0;
20 }
21 early_param("sched_debug", sched_debug_setup);
22
23 static inline bool sched_debug(void)
24 {
25 return sched_debug_enabled;
26 }
27
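/*
 * Dump one level of the sched_domain hierarchy for @cpu and sanity-check it:
 * the span must contain @cpu, the groups must cover the span exactly, and
 * (for non-overlapping domains) no CPU may appear in two groups.  A non-zero
 * return stops the caller's walk up the hierarchy.
 */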
28 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
29 struct cpumask *groupmask)
30 {
31 struct sched_group *group = sd->groups;
32
33 cpumask_clear(groupmask);
34
35 printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
36
37 if (!(sd->flags & SD_LOAD_BALANCE)) {
38 printk("does not load-balance\n");
39 if (sd->parent)
40 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
41 return -1;
42 }
43
44 printk(KERN_CONT "span=%*pbl level=%s\n",
45 cpumask_pr_args(sched_domain_span(sd)), sd->name);
46
47 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
48 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
49 }
50 if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
51 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
52 }
53
54 printk(KERN_DEBUG "%*s groups:", level + 1, "");
55 do {
56 if (!group) {
57 printk("\n");
58 printk(KERN_ERR "ERROR: group is NULL\n");
59 break;
60 }
61
62 if (!cpumask_weight(sched_group_span(group))) {
63 printk(KERN_CONT "\n");
64 printk(KERN_ERR "ERROR: empty group\n");
65 break;
66 }
67
68 if (!(sd->flags & SD_OVERLAP) &&
69 cpumask_intersects(groupmask, sched_group_span(group))) {
70 printk(KERN_CONT "\n");
71 printk(KERN_ERR "ERROR: repeated CPUs\n");
72 break;
73 }
74
75 cpumask_or(groupmask, groupmask, sched_group_span(group));
76
77 printk(KERN_CONT " %d:{ span=%*pbl",
78 group->sgc->id,
79 cpumask_pr_args(sched_group_span(group)));
80
81 if ((sd->flags & SD_OVERLAP) &&
82 !cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
83 printk(KERN_CONT " mask=%*pbl",
84 cpumask_pr_args(group_balance_mask(group)));
85 }
86
87 if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
88 printk(KERN_CONT " cap=%lu", group->sgc->capacity);
89
90 if (group == sd->groups && sd->child &&
91 !cpumask_equal(sched_domain_span(sd->child),
92 sched_group_span(group))) {
93 printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
94 }
95
96 printk(KERN_CONT " }");
97
98 group = group->next;
99
100 if (group != sd->groups)
101 printk(KERN_CONT ",");
102
103 } while (group != sd->groups);
104 printk(KERN_CONT "\n");
105
106 if (!cpumask_equal(sched_domain_span(sd), groupmask))
107 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
108
109 if (sd->parent &&
110 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
111 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
112 return 0;
113 }
114
115 static void sched_domain_debug(struct sched_domain *sd, int cpu)
116 {
117 int level = 0;
118
119 if (!sched_debug_enabled)
120 return;
121
122 if (!sd) {
123 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
124 return;
125 }
126
127 printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);
128
129 for (;;) {
130 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
131 break;
132 level++;
133 sd = sd->parent;
134 if (!sd)
135 break;
136 }
137 }
138 #else
139
140 # define sched_debug_enabled 0
141 # define sched_domain_debug(sd, cpu) do { } while (0)
142 static inline bool sched_debug(void)
143 {
144 return false;
145 }
146 #endif
147
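/*
 * A sched_domain is "degenerate" when it can be dropped without losing
 * anything: it spans a single CPU, or it carries no balancing/affinity flags
 * that need more than one group.  sd_parent_degenerate() answers the related
 * question of whether @parent is redundant given @sd: identical span and no
 * useful flags beyond those @sd already has.
 */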
148 static int sd_degenerate(struct sched_domain *sd)
149 {
150 if (cpumask_weight(sched_domain_span(sd)) == 1)
151 return 1;
152
153
154 if (sd->flags & (SD_LOAD_BALANCE |
155 SD_BALANCE_NEWIDLE |
156 SD_BALANCE_FORK |
157 SD_BALANCE_EXEC |
158 SD_SHARE_CPUCAPACITY |
159 SD_ASYM_CPUCAPACITY |
160 SD_SHARE_PKG_RESOURCES |
161 SD_SHARE_POWERDOMAIN)) {
162 if (sd->groups != sd->groups->next)
163 return 0;
164 }
165
166
167 if (sd->flags & (SD_WAKE_AFFINE))
168 return 0;
169
170 return 1;
171 }
172
173 static int
174 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
175 {
176 unsigned long cflags = sd->flags, pflags = parent->flags;
177
178 if (sd_degenerate(parent))
179 return 1;
180
181 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
182 return 0;
183
184
185 if (parent->groups == parent->groups->next) {
186 pflags &= ~(SD_LOAD_BALANCE |
187 SD_BALANCE_NEWIDLE |
188 SD_BALANCE_FORK |
189 SD_BALANCE_EXEC |
190 SD_ASYM_CPUCAPACITY |
191 SD_SHARE_CPUCAPACITY |
192 SD_SHARE_PKG_RESOURCES |
193 SD_PREFER_SIBLING |
194 SD_SHARE_POWERDOMAIN);
195 if (nr_node_ids == 1)
196 pflags &= ~SD_SERIALIZE;
197 }
198 if (~cflags & pflags)
199 return 0;
200
201 return 1;
202 }
203
204 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
205 DEFINE_STATIC_KEY_FALSE(sched_energy_present);
206 unsigned int sysctl_sched_energy_aware = 1;
207 DEFINE_MUTEX(sched_energy_mutex);
208 bool sched_energy_update;
209
210 #ifdef CONFIG_PROC_SYSCTL
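/*
 * sysctl handler for sched_energy_aware.  Writing a value that disagrees
 * with the current state of the sched_energy_present key triggers a rebuild
 * of the sched domains (and hence of the perf domains) under
 * sched_energy_mutex.
 */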
211 int sched_energy_aware_handler(struct ctl_table *table, int write,
212 void __user *buffer, size_t *lenp, loff_t *ppos)
213 {
214 int ret, state;
215
216 if (write && !capable(CAP_SYS_ADMIN))
217 return -EPERM;
218
219 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
220 if (!ret && write) {
221 state = static_branch_unlikely(&sched_energy_present);
222 if (state != sysctl_sched_energy_aware) {
223 mutex_lock(&sched_energy_mutex);
224 sched_energy_update = 1;
225 rebuild_sched_domains();
226 sched_energy_update = 0;
227 mutex_unlock(&sched_energy_mutex);
228 }
229 }
230
231 return ret;
232 }
233 #endif
234
235 static void free_pd(struct perf_domain *pd)
236 {
237 struct perf_domain *tmp;
238
239 while (pd) {
240 tmp = pd->next;
241 kfree(pd);
242 pd = tmp;
243 }
244 }
245
246 static struct perf_domain *find_pd(struct perf_domain *pd, int cpu)
247 {
248 while (pd) {
249 if (cpumask_test_cpu(cpu, perf_domain_span(pd)))
250 return pd;
251 pd = pd->next;
252 }
253
254 return NULL;
255 }
256
257 static struct perf_domain *pd_init(int cpu)
258 {
259 struct em_perf_domain *obj = em_cpu_get(cpu);
260 struct perf_domain *pd;
261
262 if (!obj) {
263 if (sched_debug())
264 pr_info("%s: no EM found for CPU%d\n", __func__, cpu);
265 return NULL;
266 }
267
268 pd = kzalloc(sizeof(*pd), GFP_KERNEL);
269 if (!pd)
270 return NULL;
271 pd->em_pd = obj;
272
273 return pd;
274 }
275
276 static void perf_domain_debug(const struct cpumask *cpu_map,
277 struct perf_domain *pd)
278 {
279 if (!sched_debug() || !pd)
280 return;
281
282 printk(KERN_DEBUG "root_domain %*pbl:", cpumask_pr_args(cpu_map));
283
284 while (pd) {
285 printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_cstate=%d }",
286 cpumask_first(perf_domain_span(pd)),
287 cpumask_pr_args(perf_domain_span(pd)),
288 em_pd_nr_cap_states(pd->em_pd));
289 pd = pd->next;
290 }
291
292 printk(KERN_CONT "\n");
293 }
294
295 static void destroy_perf_domain_rcu(struct rcu_head *rp)
296 {
297 struct perf_domain *pd;
298
299 pd = container_of(rp, struct perf_domain, rcu);
300 free_pd(pd);
301 }
302
303 static void sched_energy_set(bool has_eas)
304 {
305 if (!has_eas && static_branch_unlikely(&sched_energy_present)) {
306 if (sched_debug())
307 pr_info("%s: stopping EAS\n", __func__);
308 static_branch_disable_cpuslocked(&sched_energy_present);
309 } else if (has_eas && !static_branch_unlikely(&sched_energy_present)) {
310 if (sched_debug())
311 pr_info("%s: starting EAS\n", __func__);
312 static_branch_enable_cpuslocked(&sched_energy_present);
313 }
314 }
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
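/*
 * EAS setup for a root domain, driven by build_perf_domains().  A perf
 * domain is built per frequency domain, but only when:
 *
 *  - sched_energy_aware is enabled,
 *  - the CPUs have asymmetric capacities (sd_asym_cpucapacity is set),
 *  - every CPU is covered by an Energy Model (em_cpu_get() succeeds),
 *  - schedutil is the cpufreq governor on every CPU, and
 *  - the resulting complexity, nr_pd * (nr_cs + nr_cpus), with nr_cs the
 *    total number of capacity states, stays below EM_MAX_COMPLEXITY.
 *
 * If any condition fails, the root domain's perf-domain list is torn down
 * and EAS stays disabled for it.
 */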
339 #define EM_MAX_COMPLEXITY 2048
340
341 extern struct cpufreq_governor schedutil_gov;
342 static bool build_perf_domains(const struct cpumask *cpu_map)
343 {
344 int i, nr_pd = 0, nr_cs = 0, nr_cpus = cpumask_weight(cpu_map);
345 struct perf_domain *pd = NULL, *tmp;
346 int cpu = cpumask_first(cpu_map);
347 struct root_domain *rd = cpu_rq(cpu)->rd;
348 struct cpufreq_policy *policy;
349 struct cpufreq_governor *gov;
350
351 if (!sysctl_sched_energy_aware)
352 goto free;
353
354
355 if (!per_cpu(sd_asym_cpucapacity, cpu)) {
356 if (sched_debug()) {
357 pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n",
358 cpumask_pr_args(cpu_map));
359 }
360 goto free;
361 }
362
363 for_each_cpu(i, cpu_map) {
364
365 if (find_pd(pd, i))
366 continue;
367
368
369 policy = cpufreq_cpu_get(i);
370 if (!policy)
371 goto free;
372 gov = policy->governor;
373 cpufreq_cpu_put(policy);
374 if (gov != &schedutil_gov) {
375 if (rd->pd)
376 pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n",
377 cpumask_pr_args(cpu_map));
378 goto free;
379 }
380
381
382 tmp = pd_init(i);
383 if (!tmp)
384 goto free;
385 tmp->next = pd;
386 pd = tmp;
387
388
389
390
391
392 nr_pd++;
393 nr_cs += em_pd_nr_cap_states(pd->em_pd);
394 }
395
396
397 if (nr_pd * (nr_cs + nr_cpus) > EM_MAX_COMPLEXITY) {
398 WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n",
399 cpumask_pr_args(cpu_map));
400 goto free;
401 }
402
403 perf_domain_debug(cpu_map, pd);
404
405
406 tmp = rd->pd;
407 rcu_assign_pointer(rd->pd, pd);
408 if (tmp)
409 call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
410
411 return !!pd;
412
413 free:
414 free_pd(pd);
415 tmp = rd->pd;
416 rcu_assign_pointer(rd->pd, NULL);
417 if (tmp)
418 call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
419
420 return false;
421 }
422 #else
423 static void free_pd(struct perf_domain *pd) { }
424 #endif
425
426 static void free_rootdomain(struct rcu_head *rcu)
427 {
428 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
429
430 cpupri_cleanup(&rd->cpupri);
431 cpudl_cleanup(&rd->cpudl);
432 free_cpumask_var(rd->dlo_mask);
433 free_cpumask_var(rd->rto_mask);
434 free_cpumask_var(rd->online);
435 free_cpumask_var(rd->span);
436 free_pd(rd->pd);
437 kfree(rd);
438 }
439
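/*
 * Attach @rq to root domain @rd: take a reference on @rd, move the CPU's
 * span/online bits over, and drop the old root domain's reference.  If the
 * old root domain became unreferenced it is freed after an RCU grace period.
 */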
440 void rq_attach_root(struct rq *rq, struct root_domain *rd)
441 {
442 struct root_domain *old_rd = NULL;
443 unsigned long flags;
444
445 raw_spin_lock_irqsave(&rq->lock, flags);
446
447 if (rq->rd) {
448 old_rd = rq->rd;
449
450 if (cpumask_test_cpu(rq->cpu, old_rd->online))
451 set_rq_offline(rq);
452
453 cpumask_clear_cpu(rq->cpu, old_rd->span);
454
455
456
457
458
459
460 if (!atomic_dec_and_test(&old_rd->refcount))
461 old_rd = NULL;
462 }
463
464 atomic_inc(&rd->refcount);
465 rq->rd = rd;
466
467 cpumask_set_cpu(rq->cpu, rd->span);
468 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
469 set_rq_online(rq);
470
471 raw_spin_unlock_irqrestore(&rq->lock, flags);
472
473 if (old_rd)
474 call_rcu(&old_rd->rcu, free_rootdomain);
475 }
476
477 void sched_get_rd(struct root_domain *rd)
478 {
479 atomic_inc(&rd->refcount);
480 }
481
482 void sched_put_rd(struct root_domain *rd)
483 {
484 if (!atomic_dec_and_test(&rd->refcount))
485 return;
486
487 call_rcu(&rd->rcu, free_rootdomain);
488 }
489
490 static int init_rootdomain(struct root_domain *rd)
491 {
492 if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
493 goto out;
494 if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
495 goto free_span;
496 if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
497 goto free_online;
498 if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
499 goto free_dlo_mask;
500
501 #ifdef HAVE_RT_PUSH_IPI
502 rd->rto_cpu = -1;
503 raw_spin_lock_init(&rd->rto_lock);
504 init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
505 #endif
506
507 init_dl_bw(&rd->dl_bw);
508 if (cpudl_init(&rd->cpudl) != 0)
509 goto free_rto_mask;
510
511 if (cpupri_init(&rd->cpupri) != 0)
512 goto free_cpudl;
513 return 0;
514
515 free_cpudl:
516 cpudl_cleanup(&rd->cpudl);
517 free_rto_mask:
518 free_cpumask_var(rd->rto_mask);
519 free_dlo_mask:
520 free_cpumask_var(rd->dlo_mask);
521 free_online:
522 free_cpumask_var(rd->online);
523 free_span:
524 free_cpumask_var(rd->span);
525 out:
526 return -ENOMEM;
527 }
528
529
530
531
532
533 struct root_domain def_root_domain;
534
535 void init_defrootdomain(void)
536 {
537 init_rootdomain(&def_root_domain);
538
539 atomic_set(&def_root_domain.refcount, 1);
540 }
541
542 static struct root_domain *alloc_rootdomain(void)
543 {
544 struct root_domain *rd;
545
546 rd = kzalloc(sizeof(*rd), GFP_KERNEL);
547 if (!rd)
548 return NULL;
549
550 if (init_rootdomain(rd) != 0) {
551 kfree(rd);
552 return NULL;
553 }
554
555 return rd;
556 }
557
558 static void free_sched_groups(struct sched_group *sg, int free_sgc)
559 {
560 struct sched_group *tmp, *first;
561
562 if (!sg)
563 return;
564
565 first = sg;
566 do {
567 tmp = sg->next;
568
569 if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
570 kfree(sg->sgc);
571
572 if (atomic_dec_and_test(&sg->ref))
573 kfree(sg);
574 sg = tmp;
575 } while (sg != first);
576 }
577
578 static void destroy_sched_domain(struct sched_domain *sd)
579 {
580
581
582
583
584
585 free_sched_groups(sd->groups, 1);
586
587 if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
588 kfree(sd->shared);
589 kfree(sd);
590 }
591
592 static void destroy_sched_domains_rcu(struct rcu_head *rcu)
593 {
594 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
595
596 while (sd) {
597 struct sched_domain *parent = sd->parent;
598 destroy_sched_domain(sd);
599 sd = parent;
600 }
601 }
602
603 static void destroy_sched_domains(struct sched_domain *sd)
604 {
605 if (sd)
606 call_rcu(&sd->rcu, destroy_sched_domains_rcu);
607 }
608
609
610
611
612
613
614
615
616
617
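/*
 * Per-CPU shortcut pointers to commonly used domains: the last-level-cache
 * domain (highest domain with SD_SHARE_PKG_RESOURCES) plus its size, id and
 * shared state, the lowest SD_NUMA domain, the highest SD_ASYM_PACKING
 * domain and the lowest SD_ASYM_CPUCAPACITY domain.  They are refreshed by
 * update_top_cache_domain() whenever a new hierarchy is attached.
 */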
618 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
619 DEFINE_PER_CPU(int, sd_llc_size);
620 DEFINE_PER_CPU(int, sd_llc_id);
621 DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
622 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
623 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
624 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
625 DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
626
627 static void update_top_cache_domain(int cpu)
628 {
629 struct sched_domain_shared *sds = NULL;
630 struct sched_domain *sd;
631 int id = cpu;
632 int size = 1;
633
634 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
635 if (sd) {
636 id = cpumask_first(sched_domain_span(sd));
637 size = cpumask_weight(sched_domain_span(sd));
638 sds = sd->shared;
639 }
640
641 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
642 per_cpu(sd_llc_size, cpu) = size;
643 per_cpu(sd_llc_id, cpu) = id;
644 rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
645
646 sd = lowest_flag_domain(cpu, SD_NUMA);
647 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
648
649 sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
650 rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd);
651
652 sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY);
653 rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
654 }
655
656
657
658
659
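/*
 * Attach @sd to @cpu as its base domain.  Degenerate levels are collapsed
 * out of the hierarchy first, then the runqueue is moved onto @rd and the
 * new hierarchy is published with rcu_assign_pointer(); the old hierarchy
 * is destroyed after an RCU grace period.
 */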
660 static void
661 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
662 {
663 struct rq *rq = cpu_rq(cpu);
664 struct sched_domain *tmp;
665
666
667 for (tmp = sd; tmp; ) {
668 struct sched_domain *parent = tmp->parent;
669 if (!parent)
670 break;
671
672 if (sd_parent_degenerate(tmp, parent)) {
673 tmp->parent = parent->parent;
674 if (parent->parent)
675 parent->parent->child = tmp;
676
677
678
679
680
681 if (parent->flags & SD_PREFER_SIBLING)
682 tmp->flags |= SD_PREFER_SIBLING;
683 destroy_sched_domain(parent);
684 } else
685 tmp = tmp->parent;
686 }
687
688 if (sd && sd_degenerate(sd)) {
689 tmp = sd;
690 sd = sd->parent;
691 destroy_sched_domain(tmp);
692 if (sd)
693 sd->child = NULL;
694 }
695
696 sched_domain_debug(sd, cpu);
697
698 rq_attach_root(rq, rd);
699 tmp = rq->sd;
700 rcu_assign_pointer(rq->sd, sd);
701 dirty_sched_domain_sysctl(cpu);
702 destroy_sched_domains(tmp);
703
704 update_top_cache_domain(cpu);
705 }
706
707 struct s_data {
708 struct sched_domain * __percpu *sd;
709 struct root_domain *rd;
710 };
711
712 enum s_alloc {
713 sa_rootdomain,
714 sa_sd,
715 sa_sd_storage,
716 sa_none,
717 };
718
719
720
721
722
723
724
725
726
727
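/*
 * Return the canonical balance CPU for @sg: the first CPU of the group's
 * balance mask.
 */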
728 int group_balance_cpu(struct sched_group *sg)
729 {
730 return cpumask_first(group_balance_mask(sg));
731 }
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
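/*
 * Overlapping (SD_OVERLAP / NUMA) domains build their groups from sibling
 * domains, so the group spans seen from different CPUs can overlap.  Each
 * such group therefore carries a "balance mask": the CPUs in the group whose
 * child domain span equals the group's span, i.e. the CPUs that may act as
 * the group's balance CPU.  build_balance_mask() computes that mask.
 */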
839 static void
840 build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
841 {
842 const struct cpumask *sg_span = sched_group_span(sg);
843 struct sd_data *sdd = sd->private;
844 struct sched_domain *sibling;
845 int i;
846
847 cpumask_clear(mask);
848
849 for_each_cpu(i, sg_span) {
850 sibling = *per_cpu_ptr(sdd->sd, i);
851
852
853
854
855
856
857 if (!sibling->child)
858 continue;
859
860
861 if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
862 continue;
863
864 cpumask_set_cpu(i, mask);
865 }
866
867
868 WARN_ON_ONCE(cpumask_empty(mask));
869 }
870
871
872
873
874
875
876 static struct sched_group *
877 build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
878 {
879 struct sched_group *sg;
880 struct cpumask *sg_span;
881
882 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
883 GFP_KERNEL, cpu_to_node(cpu));
884
885 if (!sg)
886 return NULL;
887
888 sg_span = sched_group_span(sg);
889 if (sd->child)
890 cpumask_copy(sg_span, sched_domain_span(sd->child));
891 else
892 cpumask_copy(sg_span, sched_domain_span(sd));
893
894 atomic_inc(&sg->ref);
895 return sg;
896 }
897
898 static void init_overlap_sched_group(struct sched_domain *sd,
899 struct sched_group *sg)
900 {
901 struct cpumask *mask = sched_domains_tmpmask2;
902 struct sd_data *sdd = sd->private;
903 struct cpumask *sg_span;
904 int cpu;
905
906 build_balance_mask(sd, sg, mask);
907 cpu = cpumask_first_and(sched_group_span(sg), mask);
908
909 sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
910 if (atomic_inc_return(&sg->sgc->ref) == 1)
911 cpumask_copy(group_balance_mask(sg), mask);
912 else
913 WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));
914
915
916
917
918
919
920 sg_span = sched_group_span(sg);
921 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
922 sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
923 sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
924 }
925
926 static int
927 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
928 {
929 struct sched_group *first = NULL, *last = NULL, *sg;
930 const struct cpumask *span = sched_domain_span(sd);
931 struct cpumask *covered = sched_domains_tmpmask;
932 struct sd_data *sdd = sd->private;
933 struct sched_domain *sibling;
934 int i;
935
936 cpumask_clear(covered);
937
938 for_each_cpu_wrap(i, span, cpu) {
939 struct cpumask *sg_span;
940
941 if (cpumask_test_cpu(i, covered))
942 continue;
943
944 sibling = *per_cpu_ptr(sdd->sd, i);
945
946
947
948
949
950
951
952
953
954
955
956 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
957 continue;
958
959 sg = build_group_from_child_sched_domain(sibling, cpu);
960 if (!sg)
961 goto fail;
962
963 sg_span = sched_group_span(sg);
964 cpumask_or(covered, covered, sg_span);
965
966 init_overlap_sched_group(sd, sg);
967
968 if (!first)
969 first = sg;
970 if (last)
971 last->next = sg;
972 last = sg;
973 last->next = first;
974 }
975 sd->groups = first;
976
977 return 0;
978
979 fail:
980 free_sched_groups(first, 0);
981
982 return -ENOMEM;
983 }
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
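/*
 * Non-overlapping domains use a simpler scheme: each topology level owns one
 * sched_group per CPU (allocated in __sdt_alloc()), and a domain with a
 * child uses the group owned by the first CPU of the child's span.
 * get_group() returns that group, initializing its span, balance mask and
 * capacity on the first visit (the reference counts double as
 * "already visited" markers).
 */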
1057 static struct sched_group *get_group(int cpu, struct sd_data *sdd)
1058 {
1059 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
1060 struct sched_domain *child = sd->child;
1061 struct sched_group *sg;
1062 bool already_visited;
1063
1064 if (child)
1065 cpu = cpumask_first(sched_domain_span(child));
1066
1067 sg = *per_cpu_ptr(sdd->sg, cpu);
1068 sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
1069
1070
1071 already_visited = atomic_inc_return(&sg->ref) > 1;
1072
1073 WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1));
1074
1075
1076 if (already_visited)
1077 return sg;
1078
1079 if (child) {
1080 cpumask_copy(sched_group_span(sg), sched_domain_span(child));
1081 cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
1082 } else {
1083 cpumask_set_cpu(cpu, sched_group_span(sg));
1084 cpumask_set_cpu(cpu, group_balance_mask(sg));
1085 }
1086
1087 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
1088 sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
1089 sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
1090
1091 return sg;
1092 }
1093
1094
1095
1096
1097
1098
1099
1100
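/*
 * Build a circular list of sched_groups covering @sd's span for the
 * non-SD_OVERLAP case, starting with the group containing @cpu.
 */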
1101 static int
1102 build_sched_groups(struct sched_domain *sd, int cpu)
1103 {
1104 struct sched_group *first = NULL, *last = NULL;
1105 struct sd_data *sdd = sd->private;
1106 const struct cpumask *span = sched_domain_span(sd);
1107 struct cpumask *covered;
1108 int i;
1109
1110 lockdep_assert_held(&sched_domains_mutex);
1111 covered = sched_domains_tmpmask;
1112
1113 cpumask_clear(covered);
1114
1115 for_each_cpu_wrap(i, span, cpu) {
1116 struct sched_group *sg;
1117
1118 if (cpumask_test_cpu(i, covered))
1119 continue;
1120
1121 sg = get_group(i, sdd);
1122
1123 cpumask_or(covered, covered, sched_group_span(sg));
1124
1125 if (!first)
1126 first = sg;
1127 if (last)
1128 last->next = sg;
1129 last = sg;
1130 }
1131 last->next = first;
1132 sd->groups = first;
1133
1134 return 0;
1135 }
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
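/*
 * Initialize each group's weight and, for SD_ASYM_PACKING domains, pick the
 * group's preferred (highest-priority) CPU.  Only the group's balance CPU
 * goes on to compute the actual group capacity.
 */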
1147 static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
1148 {
1149 struct sched_group *sg = sd->groups;
1150
1151 WARN_ON(!sg);
1152
1153 do {
1154 int cpu, max_cpu = -1;
1155
1156 sg->group_weight = cpumask_weight(sched_group_span(sg));
1157
1158 if (!(sd->flags & SD_ASYM_PACKING))
1159 goto next;
1160
1161 for_each_cpu(cpu, sched_group_span(sg)) {
1162 if (max_cpu < 0)
1163 max_cpu = cpu;
1164 else if (sched_asym_prefer(cpu, max_cpu))
1165 max_cpu = cpu;
1166 }
1167 sg->asym_prefer_cpu = max_cpu;
1168
1169 next:
1170 sg = sg->next;
1171 } while (sg != sd->groups);
1172
1173 if (cpu != group_balance_cpu(sg))
1174 return;
1175
1176 update_group_capacity(sd, cpu);
1177 }
1178
1179
1180
1181
1182
1183
1184 static int default_relax_domain_level = -1;
1185 int sched_domain_level_max;
1186
1187 static int __init setup_relax_domain_level(char *str)
1188 {
1189 if (kstrtoint(str, 0, &default_relax_domain_level))
1190 pr_warn("Unable to set relax_domain_level\n");
1191
1192 return 1;
1193 }
1194 __setup("relax_domain_level=", setup_relax_domain_level);
1195
1196 static void set_domain_attribute(struct sched_domain *sd,
1197 struct sched_domain_attr *attr)
1198 {
1199 int request;
1200
1201 if (!attr || attr->relax_domain_level < 0) {
1202 if (default_relax_domain_level < 0)
1203 return;
1204 else
1205 request = default_relax_domain_level;
1206 } else
1207 request = attr->relax_domain_level;
1208 if (request < sd->level) {
1209
1210 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
1211 } else {
1212
1213 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
1214 }
1215 }
1216
1217 static void __sdt_free(const struct cpumask *cpu_map);
1218 static int __sdt_alloc(const struct cpumask *cpu_map);
1219
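/*
 * Tear down whatever __visit_domain_allocation_hell() managed to allocate.
 * The switch cases intentionally fall through so that each allocation state
 * also releases everything acquired before it.
 */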
1220 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
1221 const struct cpumask *cpu_map)
1222 {
1223 switch (what) {
1224 case sa_rootdomain:
1225 if (!atomic_read(&d->rd->refcount))
1226 free_rootdomain(&d->rd->rcu);
1227
1228 case sa_sd:
1229 free_percpu(d->sd);
1230
1231 case sa_sd_storage:
1232 __sdt_free(cpu_map);
1233
1234 case sa_none:
1235 break;
1236 }
1237 }
1238
1239 static enum s_alloc
1240 __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
1241 {
1242 memset(d, 0, sizeof(*d));
1243
1244 if (__sdt_alloc(cpu_map))
1245 return sa_sd_storage;
1246 d->sd = alloc_percpu(struct sched_domain *);
1247 if (!d->sd)
1248 return sa_sd_storage;
1249 d->rd = alloc_rootdomain();
1250 if (!d->rd)
1251 return sa_sd;
1252
1253 return sa_rootdomain;
1254 }
1255
1256
1257
1258
1259
1260
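/*
 * NULL the sd_data entries now referenced by the constructed topology so
 * that __sdt_free() / __free_domain_allocs() will not free them.
 */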
1261 static void claim_allocations(int cpu, struct sched_domain *sd)
1262 {
1263 struct sd_data *sdd = sd->private;
1264
1265 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
1266 *per_cpu_ptr(sdd->sd, cpu) = NULL;
1267
1268 if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
1269 *per_cpu_ptr(sdd->sds, cpu) = NULL;
1270
1271 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
1272 *per_cpu_ptr(sdd->sg, cpu) = NULL;
1273
1274 if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
1275 *per_cpu_ptr(sdd->sgc, cpu) = NULL;
1276 }
1277
1278 #ifdef CONFIG_NUMA
1279 enum numa_topology_type sched_numa_topology_type;
1280
1281 static int sched_domains_numa_levels;
1282 static int sched_domains_curr_level;
1283
1284 int sched_max_numa_distance;
1285 static int *sched_domains_numa_distance;
1286 static struct cpumask ***sched_domains_numa_masks;
1287 int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
1288 #endif
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
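/*
 * Topology-description flags that a level's ->sd_flags() callback is allowed
 * to request; sd_init() warns about and clears anything else.
 */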
1307 #define TOPOLOGY_SD_FLAGS \
1308 (SD_SHARE_CPUCAPACITY | \
1309 SD_SHARE_PKG_RESOURCES | \
1310 SD_NUMA | \
1311 SD_ASYM_PACKING | \
1312 SD_SHARE_POWERDOMAIN)
1313
1314 static struct sched_domain *
1315 sd_init(struct sched_domain_topology_level *tl,
1316 const struct cpumask *cpu_map,
1317 struct sched_domain *child, int dflags, int cpu)
1318 {
1319 struct sd_data *sdd = &tl->data;
1320 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
1321 int sd_id, sd_weight, sd_flags = 0;
1322
1323 #ifdef CONFIG_NUMA
1324
1325
1326
1327 sched_domains_curr_level = tl->numa_level;
1328 #endif
1329
1330 sd_weight = cpumask_weight(tl->mask(cpu));
1331
1332 if (tl->sd_flags)
1333 sd_flags = (*tl->sd_flags)();
1334 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
1335 "wrong sd_flags in topology description\n"))
1336 sd_flags &= ~TOPOLOGY_SD_FLAGS;
1337
1338
1339 sd_flags |= dflags;
1340
1341 *sd = (struct sched_domain){
1342 .min_interval = sd_weight,
1343 .max_interval = 2*sd_weight,
1344 .busy_factor = 32,
1345 .imbalance_pct = 125,
1346
1347 .cache_nice_tries = 0,
1348
1349 .flags = 1*SD_LOAD_BALANCE
1350 | 1*SD_BALANCE_NEWIDLE
1351 | 1*SD_BALANCE_EXEC
1352 | 1*SD_BALANCE_FORK
1353 | 0*SD_BALANCE_WAKE
1354 | 1*SD_WAKE_AFFINE
1355 | 0*SD_SHARE_CPUCAPACITY
1356 | 0*SD_SHARE_PKG_RESOURCES
1357 | 0*SD_SERIALIZE
1358 | 1*SD_PREFER_SIBLING
1359 | 0*SD_NUMA
1360 | sd_flags
1361 ,
1362
1363 .last_balance = jiffies,
1364 .balance_interval = sd_weight,
1365 .max_newidle_lb_cost = 0,
1366 .next_decay_max_lb_cost = jiffies,
1367 .child = child,
1368 #ifdef CONFIG_SCHED_DEBUG
1369 .name = tl->name,
1370 #endif
1371 };
1372
1373 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
1374 sd_id = cpumask_first(sched_domain_span(sd));
1375
1376
1377
1378
1379
1380 if (sd->flags & SD_ASYM_CPUCAPACITY) {
1381 struct sched_domain *t = sd;
1382
1383
1384
1385
1386 if (sd->child)
1387 sd->child->flags &= ~SD_PREFER_SIBLING;
1388
1389 for_each_lower_domain(t)
1390 t->flags |= SD_BALANCE_WAKE;
1391 }
1392
1393 if (sd->flags & SD_SHARE_CPUCAPACITY) {
1394 sd->imbalance_pct = 110;
1395
1396 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
1397 sd->imbalance_pct = 117;
1398 sd->cache_nice_tries = 1;
1399
1400 #ifdef CONFIG_NUMA
1401 } else if (sd->flags & SD_NUMA) {
1402 sd->cache_nice_tries = 2;
1403
1404 sd->flags &= ~SD_PREFER_SIBLING;
1405 sd->flags |= SD_SERIALIZE;
1406 if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
1407 sd->flags &= ~(SD_BALANCE_EXEC |
1408 SD_BALANCE_FORK |
1409 SD_WAKE_AFFINE);
1410 }
1411
1412 #endif
1413 } else {
1414 sd->cache_nice_tries = 1;
1415 }
1416
1417
1418
1419
1420
1421 if (sd->flags & SD_SHARE_PKG_RESOURCES) {
1422 sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
1423 atomic_inc(&sd->shared->ref);
1424 atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
1425 }
1426
1427 sd->private = sdd;
1428
1429 return sd;
1430 }
1431
1432
1433
1434
1435 static struct sched_domain_topology_level default_topology[] = {
1436 #ifdef CONFIG_SCHED_SMT
1437 { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
1438 #endif
1439 #ifdef CONFIG_SCHED_MC
1440 { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
1441 #endif
1442 { cpu_cpu_mask, SD_INIT_NAME(DIE) },
1443 { NULL, },
1444 };
1445
1446 static struct sched_domain_topology_level *sched_domain_topology =
1447 default_topology;
1448
1449 #define for_each_sd_topology(tl) \
1450 for (tl = sched_domain_topology; tl->mask; tl++)
1451
1452 void set_sched_topology(struct sched_domain_topology_level *tl)
1453 {
1454 if (WARN_ON_ONCE(sched_smp_initialized))
1455 return;
1456
1457 sched_domain_topology = tl;
1458 }
1459
1460 #ifdef CONFIG_NUMA
1461
1462 static const struct cpumask *sd_numa_mask(int cpu)
1463 {
1464 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
1465 }
1466
1467 static void sched_numa_warn(const char *str)
1468 {
1469 static int done = false;
1470 int i,j;
1471
1472 if (done)
1473 return;
1474
1475 done = true;
1476
1477 printk(KERN_WARNING "ERROR: %s\n\n", str);
1478
1479 for (i = 0; i < nr_node_ids; i++) {
1480 printk(KERN_WARNING " ");
1481 for (j = 0; j < nr_node_ids; j++)
1482 printk(KERN_CONT "%02d ", node_distance(i,j));
1483 printk(KERN_CONT "\n");
1484 }
1485 printk(KERN_WARNING "\n");
1486 }
1487
1488 bool find_numa_distance(int distance)
1489 {
1490 int i;
1491
1492 if (distance == node_distance(0, 0))
1493 return true;
1494
1495 for (i = 0; i < sched_domains_numa_levels; i++) {
1496 if (sched_domains_numa_distance[i] == distance)
1497 return true;
1498 }
1499
1500 return false;
1501 }
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522 static void init_numa_topology_type(void)
1523 {
1524 int a, b, c, n;
1525
1526 n = sched_max_numa_distance;
1527
1528 if (sched_domains_numa_levels <= 2) {
1529 sched_numa_topology_type = NUMA_DIRECT;
1530 return;
1531 }
1532
1533 for_each_online_node(a) {
1534 for_each_online_node(b) {
1535
1536 if (node_distance(a, b) < n)
1537 continue;
1538
1539
1540 for_each_online_node(c) {
1541 if (node_distance(a, c) < n &&
1542 node_distance(b, c) < n) {
1543 sched_numa_topology_type =
1544 NUMA_GLUELESS_MESH;
1545 return;
1546 }
1547 }
1548
1549 sched_numa_topology_type = NUMA_BACKPLANE;
1550 return;
1551 }
1552 }
1553 }
1554
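/*
 * Discover the set of unique node distances in the system, build for each
 * distance level a per-node cpumask of all nodes within that distance, and
 * extend the topology table with a NODE level plus one (overlapping) NUMA
 * level per remaining distance.
 */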
1555 void sched_init_numa(void)
1556 {
1557 int next_distance, curr_distance = node_distance(0, 0);
1558 struct sched_domain_topology_level *tl;
1559 int level = 0;
1560 int i, j, k;
1561
1562 sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL);
1563 if (!sched_domains_numa_distance)
1564 return;
1565
1566
1567 sched_domains_numa_distance[level++] = curr_distance;
1568 sched_domains_numa_levels = level;
1569
1570
1571
1572
1573
1574
1575
1576
1577 next_distance = curr_distance;
1578 for (i = 0; i < nr_node_ids; i++) {
1579 for (j = 0; j < nr_node_ids; j++) {
1580 for (k = 0; k < nr_node_ids; k++) {
1581 int distance = node_distance(i, k);
1582
1583 if (distance > curr_distance &&
1584 (distance < next_distance ||
1585 next_distance == curr_distance))
1586 next_distance = distance;
1587
1588
1589
1590
1591
1592
1593 if (sched_debug() && node_distance(k, i) != distance)
1594 sched_numa_warn("Node-distance not symmetric");
1595
1596 if (sched_debug() && i && !find_numa_distance(distance))
1597 sched_numa_warn("Node-0 not representative");
1598 }
1599 if (next_distance != curr_distance) {
1600 sched_domains_numa_distance[level++] = next_distance;
1601 sched_domains_numa_levels = level;
1602 curr_distance = next_distance;
1603 } else break;
1604 }
1605
1606
1607
1608
1609 if (!sched_debug())
1610 break;
1611 }
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629 sched_domains_numa_levels = 0;
1630
1631 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
1632 if (!sched_domains_numa_masks)
1633 return;
1634
1635
1636
1637
1638
1639 for (i = 0; i < level; i++) {
1640 sched_domains_numa_masks[i] =
1641 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
1642 if (!sched_domains_numa_masks[i])
1643 return;
1644
1645 for (j = 0; j < nr_node_ids; j++) {
1646 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
1647 if (!mask)
1648 return;
1649
1650 sched_domains_numa_masks[i][j] = mask;
1651
1652 for_each_node(k) {
1653 if (node_distance(j, k) > sched_domains_numa_distance[i])
1654 continue;
1655
1656 cpumask_or(mask, mask, cpumask_of_node(k));
1657 }
1658 }
1659 }
1660
1661
1662 for (i = 0; sched_domain_topology[i].mask; i++);
1663
1664 tl = kzalloc((i + level + 1) *
1665 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
1666 if (!tl)
1667 return;
1668
1669
1670
1671
1672 for (i = 0; sched_domain_topology[i].mask; i++)
1673 tl[i] = sched_domain_topology[i];
1674
1675
1676
1677
1678 tl[i++] = (struct sched_domain_topology_level){
1679 .mask = sd_numa_mask,
1680 .numa_level = 0,
1681 SD_INIT_NAME(NODE)
1682 };
1683
1684
1685
1686
1687 for (j = 1; j < level; i++, j++) {
1688 tl[i] = (struct sched_domain_topology_level){
1689 .mask = sd_numa_mask,
1690 .sd_flags = cpu_numa_flags,
1691 .flags = SDTL_OVERLAP,
1692 .numa_level = j,
1693 SD_INIT_NAME(NUMA)
1694 };
1695 }
1696
1697 sched_domain_topology = tl;
1698
1699 sched_domains_numa_levels = level;
1700 sched_max_numa_distance = sched_domains_numa_distance[level - 1];
1701
1702 init_numa_topology_type();
1703 }
1704
1705 void sched_domains_numa_masks_set(unsigned int cpu)
1706 {
1707 int node = cpu_to_node(cpu);
1708 int i, j;
1709
1710 for (i = 0; i < sched_domains_numa_levels; i++) {
1711 for (j = 0; j < nr_node_ids; j++) {
1712 if (node_distance(j, node) <= sched_domains_numa_distance[i])
1713 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
1714 }
1715 }
1716 }
1717
1718 void sched_domains_numa_masks_clear(unsigned int cpu)
1719 {
1720 int i, j;
1721
1722 for (i = 0; i < sched_domains_numa_levels; i++) {
1723 for (j = 0; j < nr_node_ids; j++)
1724 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
1725 }
1726 }
1727
1728
1729
1730
1731
1732
1733
1734
1735
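/*
 * sched_numa_find_closest() - find a CPU in @cpus at the smallest NUMA
 * distance from @cpu, by scanning the distance levels outwards.  Returns
 * nr_cpu_ids when nothing suitable is found.
 */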
1736 int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
1737 {
1738 int i, j = cpu_to_node(cpu);
1739
1740 for (i = 0; i < sched_domains_numa_levels; i++) {
1741 cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
1742 if (cpu < nr_cpu_ids)
1743 return cpu;
1744 }
1745 return nr_cpu_ids;
1746 }
1747
1748 #endif
1749
1750 static int __sdt_alloc(const struct cpumask *cpu_map)
1751 {
1752 struct sched_domain_topology_level *tl;
1753 int j;
1754
1755 for_each_sd_topology(tl) {
1756 struct sd_data *sdd = &tl->data;
1757
1758 sdd->sd = alloc_percpu(struct sched_domain *);
1759 if (!sdd->sd)
1760 return -ENOMEM;
1761
1762 sdd->sds = alloc_percpu(struct sched_domain_shared *);
1763 if (!sdd->sds)
1764 return -ENOMEM;
1765
1766 sdd->sg = alloc_percpu(struct sched_group *);
1767 if (!sdd->sg)
1768 return -ENOMEM;
1769
1770 sdd->sgc = alloc_percpu(struct sched_group_capacity *);
1771 if (!sdd->sgc)
1772 return -ENOMEM;
1773
1774 for_each_cpu(j, cpu_map) {
1775 struct sched_domain *sd;
1776 struct sched_domain_shared *sds;
1777 struct sched_group *sg;
1778 struct sched_group_capacity *sgc;
1779
1780 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
1781 GFP_KERNEL, cpu_to_node(j));
1782 if (!sd)
1783 return -ENOMEM;
1784
1785 *per_cpu_ptr(sdd->sd, j) = sd;
1786
1787 sds = kzalloc_node(sizeof(struct sched_domain_shared),
1788 GFP_KERNEL, cpu_to_node(j));
1789 if (!sds)
1790 return -ENOMEM;
1791
1792 *per_cpu_ptr(sdd->sds, j) = sds;
1793
1794 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
1795 GFP_KERNEL, cpu_to_node(j));
1796 if (!sg)
1797 return -ENOMEM;
1798
1799 sg->next = sg;
1800
1801 *per_cpu_ptr(sdd->sg, j) = sg;
1802
1803 sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
1804 GFP_KERNEL, cpu_to_node(j));
1805 if (!sgc)
1806 return -ENOMEM;
1807
1808 #ifdef CONFIG_SCHED_DEBUG
1809 sgc->id = j;
1810 #endif
1811
1812 *per_cpu_ptr(sdd->sgc, j) = sgc;
1813 }
1814 }
1815
1816 return 0;
1817 }
1818
1819 static void __sdt_free(const struct cpumask *cpu_map)
1820 {
1821 struct sched_domain_topology_level *tl;
1822 int j;
1823
1824 for_each_sd_topology(tl) {
1825 struct sd_data *sdd = &tl->data;
1826
1827 for_each_cpu(j, cpu_map) {
1828 struct sched_domain *sd;
1829
1830 if (sdd->sd) {
1831 sd = *per_cpu_ptr(sdd->sd, j);
1832 if (sd && (sd->flags & SD_OVERLAP))
1833 free_sched_groups(sd->groups, 0);
1834 kfree(*per_cpu_ptr(sdd->sd, j));
1835 }
1836
1837 if (sdd->sds)
1838 kfree(*per_cpu_ptr(sdd->sds, j));
1839 if (sdd->sg)
1840 kfree(*per_cpu_ptr(sdd->sg, j));
1841 if (sdd->sgc)
1842 kfree(*per_cpu_ptr(sdd->sgc, j));
1843 }
1844 free_percpu(sdd->sd);
1845 sdd->sd = NULL;
1846 free_percpu(sdd->sds);
1847 sdd->sds = NULL;
1848 free_percpu(sdd->sg);
1849 sdd->sg = NULL;
1850 free_percpu(sdd->sgc);
1851 sdd->sgc = NULL;
1852 }
1853 }
1854
1855 static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
1856 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
1857 struct sched_domain *child, int dflags, int cpu)
1858 {
1859 struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu);
1860
1861 if (child) {
1862 sd->level = child->level + 1;
1863 sched_domain_level_max = max(sched_domain_level_max, sd->level);
1864 child->parent = sd;
1865
1866 if (!cpumask_subset(sched_domain_span(child),
1867 sched_domain_span(sd))) {
1868 pr_err("BUG: arch topology borken\n");
1869 #ifdef CONFIG_SCHED_DEBUG
1870 pr_err(" the %s domain not a subset of the %s domain\n",
1871 child->name, sd->name);
1872 #endif
1873
1874 cpumask_or(sched_domain_span(sd),
1875 sched_domain_span(sd),
1876 sched_domain_span(child));
1877 }
1878
1879 }
1880 set_domain_attribute(sd, attr);
1881
1882 return sd;
1883 }
1884
1885
1886
1887
1888
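/*
 * Verify that, for a non-overlapping topology level, the masks of any two
 * CPUs are either identical or disjoint; anything else means the arch
 * topology description is broken.
 */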
1889 static bool topology_span_sane(struct sched_domain_topology_level *tl,
1890 const struct cpumask *cpu_map, int cpu)
1891 {
1892 int i;
1893
1894
1895 if (tl->flags & SDTL_OVERLAP)
1896 return true;
1897
1898
1899
1900
1901
1902
1903
1904 for_each_cpu(i, cpu_map) {
1905 if (i == cpu)
1906 continue;
1907
1908
1909
1910
1911
1912
1913 if (!cpumask_equal(tl->mask(cpu), tl->mask(i)) &&
1914 cpumask_intersects(tl->mask(cpu), tl->mask(i)))
1915 return false;
1916 }
1917
1918 return true;
1919 }
1920
1921
1922
1923
1924
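/*
 * Find the lowest topology level at which the full range of CPU capacities
 * is visible to every CPU; that level gets SD_ASYM_CPUCAPACITY.  Returns
 * NULL when all CPUs in @cpu_map have the same capacity.
 */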
1925 static struct sched_domain_topology_level
1926 *asym_cpu_capacity_level(const struct cpumask *cpu_map)
1927 {
1928 int i, j, asym_level = 0;
1929 bool asym = false;
1930 struct sched_domain_topology_level *tl, *asym_tl = NULL;
1931 unsigned long cap;
1932
1933
1934 cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
1935
1936 for_each_cpu(i, cpu_map) {
1937 if (arch_scale_cpu_capacity(i) != cap) {
1938 asym = true;
1939 break;
1940 }
1941 }
1942
1943 if (!asym)
1944 return NULL;
1945
1946
1947
1948
1949
1950
1951 for_each_cpu(i, cpu_map) {
1952 unsigned long max_capacity = arch_scale_cpu_capacity(i);
1953 int tl_id = 0;
1954
1955 for_each_sd_topology(tl) {
1956 if (tl_id < asym_level)
1957 goto next_level;
1958
1959 for_each_cpu_and(j, tl->mask(i), cpu_map) {
1960 unsigned long capacity;
1961
1962 capacity = arch_scale_cpu_capacity(j);
1963
1964 if (capacity <= max_capacity)
1965 continue;
1966
1967 max_capacity = capacity;
1968 asym_level = tl_id;
1969 asym_tl = tl;
1970 }
1971 next_level:
1972 tl_id++;
1973 }
1974 }
1975
1976 return asym_tl;
1977 }
1978
1979
1980
1981
1982
1983
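/*
 * Build the sched domains for @cpu_map (one hierarchy per CPU), construct
 * their groups and capacities, and attach everything, together with a new
 * root domain, to the runqueues of the CPUs in @cpu_map.
 */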
1984 static int
1985 build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
1986 {
1987 enum s_alloc alloc_state = sa_none;
1988 struct sched_domain *sd;
1989 struct s_data d;
1990 struct rq *rq = NULL;
1991 int i, ret = -ENOMEM;
1992 struct sched_domain_topology_level *tl_asym;
1993 bool has_asym = false;
1994
1995 if (WARN_ON(cpumask_empty(cpu_map)))
1996 goto error;
1997
1998 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
1999 if (alloc_state != sa_rootdomain)
2000 goto error;
2001
2002 tl_asym = asym_cpu_capacity_level(cpu_map);
2003
2004
2005 for_each_cpu(i, cpu_map) {
2006 struct sched_domain_topology_level *tl;
2007
2008 sd = NULL;
2009 for_each_sd_topology(tl) {
2010 int dflags = 0;
2011
2012 if (tl == tl_asym) {
2013 dflags |= SD_ASYM_CPUCAPACITY;
2014 has_asym = true;
2015 }
2016
2017 if (WARN_ON(!topology_span_sane(tl, cpu_map, i)))
2018 goto error;
2019
2020 sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
2021
2022 if (tl == sched_domain_topology)
2023 *per_cpu_ptr(d.sd, i) = sd;
2024 if (tl->flags & SDTL_OVERLAP)
2025 sd->flags |= SD_OVERLAP;
2026 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
2027 break;
2028 }
2029 }
2030
2031
2032 for_each_cpu(i, cpu_map) {
2033 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
2034 sd->span_weight = cpumask_weight(sched_domain_span(sd));
2035 if (sd->flags & SD_OVERLAP) {
2036 if (build_overlap_sched_groups(sd, i))
2037 goto error;
2038 } else {
2039 if (build_sched_groups(sd, i))
2040 goto error;
2041 }
2042 }
2043 }
2044
2045
2046 for (i = nr_cpumask_bits-1; i >= 0; i--) {
2047 if (!cpumask_test_cpu(i, cpu_map))
2048 continue;
2049
2050 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
2051 claim_allocations(i, sd);
2052 init_sched_groups_capacity(i, sd);
2053 }
2054 }
2055
2056
2057 rcu_read_lock();
2058 for_each_cpu(i, cpu_map) {
2059 rq = cpu_rq(i);
2060 sd = *per_cpu_ptr(d.sd, i);
2061
2062
2063 if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
2064 WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
2065
2066 cpu_attach_domain(sd, d.rd, i);
2067 }
2068 rcu_read_unlock();
2069
2070 if (has_asym)
2071 static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
2072
2073 if (rq && sched_debug_enabled) {
2074 pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
2075 cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
2076 }
2077
2078 ret = 0;
2079 error:
2080 __free_domain_allocs(&d, alloc_state, cpu_map);
2081
2082 return ret;
2083 }
2084
2085
2086 static cpumask_var_t *doms_cur;
2087
2088
2089 static int ndoms_cur;
2090
2091
2092 static struct sched_domain_attr *dattr_cur;
2093
2094
2095
2096
2097
2098
2099 static cpumask_var_t fallback_doms;
2100
2101
2102
2103
2104
2105
2106 int __weak arch_update_cpu_topology(void)
2107 {
2108 return 0;
2109 }
2110
2111 cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
2112 {
2113 int i;
2114 cpumask_var_t *doms;
2115
2116 doms = kmalloc_array(ndoms, sizeof(*doms), GFP_KERNEL);
2117 if (!doms)
2118 return NULL;
2119 for (i = 0; i < ndoms; i++) {
2120 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
2121 free_sched_domains(doms, i);
2122 return NULL;
2123 }
2124 }
2125 return doms;
2126 }
2127
2128 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
2129 {
2130 unsigned int i;
2131 for (i = 0; i < ndoms; i++)
2132 free_cpumask_var(doms[i]);
2133 kfree(doms);
2134 }
2135
2136
2137
2138
2139
2140 int sched_init_domains(const struct cpumask *cpu_map)
2141 {
2142 int err;
2143
2144 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
2145 zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
2146 zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
2147
2148 arch_update_cpu_topology();
2149 ndoms_cur = 1;
2150 doms_cur = alloc_sched_domains(ndoms_cur);
2151 if (!doms_cur)
2152 doms_cur = &fallback_doms;
2153 cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
2154 err = build_sched_domains(doms_cur[0], NULL);
2155 register_sched_domain_sysctl();
2156
2157 return err;
2158 }
2159
2160
2161
2162
2163
2164 static void detach_destroy_domains(const struct cpumask *cpu_map)
2165 {
2166 unsigned int cpu = cpumask_any(cpu_map);
2167 int i;
2168
2169 if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
2170 static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
2171
2172 rcu_read_lock();
2173 for_each_cpu(i, cpu_map)
2174 cpu_attach_domain(NULL, &def_root_domain, i);
2175 rcu_read_unlock();
2176 }
2177
2178
2179 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
2180 struct sched_domain_attr *new, int idx_new)
2181 {
2182 struct sched_domain_attr tmp;
2183
2184
2185 if (!new && !cur)
2186 return 1;
2187
2188 tmp = SD_ATTR_INIT;
2189
2190 return !memcmp(cur ? (cur + idx_cur) : &tmp,
2191 new ? (new + idx_new) : &tmp,
2192 sizeof(struct sched_domain_attr));
2193 }
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
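/*
 * Set up the scheduler's domain partitioning from @doms_new (an array of
 * @ndoms_new cpumasks) and the matching attributes in @dattr_new.  Domain
 * partitions that already exist with identical masks and attributes are left
 * alone; all others are detached/destroyed and rebuilt.  Passing a NULL
 * @doms_new falls back to a single partition covering the active
 * housekeeping CPUs.  When EAS is configured, the perf domains are refreshed
 * the same way.  Caller must hold sched_domains_mutex.
 */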
2221 void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
2222 struct sched_domain_attr *dattr_new)
2223 {
2224 bool __maybe_unused has_eas = false;
2225 int i, j, n;
2226 int new_topology;
2227
2228 lockdep_assert_held(&sched_domains_mutex);
2229
2230
2231 unregister_sched_domain_sysctl();
2232
2233
2234 new_topology = arch_update_cpu_topology();
2235
2236 if (!doms_new) {
2237 WARN_ON_ONCE(dattr_new);
2238 n = 0;
2239 doms_new = alloc_sched_domains(1);
2240 if (doms_new) {
2241 n = 1;
2242 cpumask_and(doms_new[0], cpu_active_mask,
2243 housekeeping_cpumask(HK_FLAG_DOMAIN));
2244 }
2245 } else {
2246 n = ndoms_new;
2247 }
2248
2249
2250 for (i = 0; i < ndoms_cur; i++) {
2251 for (j = 0; j < n && !new_topology; j++) {
2252 if (cpumask_equal(doms_cur[i], doms_new[j]) &&
2253 dattrs_equal(dattr_cur, i, dattr_new, j)) {
2254 struct root_domain *rd;
2255
2256
2257
2258
2259
2260
2261
2262 rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
2263 dl_clear_root_domain(rd);
2264 goto match1;
2265 }
2266 }
2267
2268 detach_destroy_domains(doms_cur[i]);
2269 match1:
2270 ;
2271 }
2272
2273 n = ndoms_cur;
2274 if (!doms_new) {
2275 n = 0;
2276 doms_new = &fallback_doms;
2277 cpumask_and(doms_new[0], cpu_active_mask,
2278 housekeeping_cpumask(HK_FLAG_DOMAIN));
2279 }
2280
2281
2282 for (i = 0; i < ndoms_new; i++) {
2283 for (j = 0; j < n && !new_topology; j++) {
2284 if (cpumask_equal(doms_new[i], doms_cur[j]) &&
2285 dattrs_equal(dattr_new, i, dattr_cur, j))
2286 goto match2;
2287 }
2288
2289 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
2290 match2:
2291 ;
2292 }
2293
2294 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
2295
2296 for (i = 0; i < ndoms_new; i++) {
2297 for (j = 0; j < n && !sched_energy_update; j++) {
2298 if (cpumask_equal(doms_new[i], doms_cur[j]) &&
2299 cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) {
2300 has_eas = true;
2301 goto match3;
2302 }
2303 }
2304
2305 has_eas |= build_perf_domains(doms_new[i]);
2306 match3:
2307 ;
2308 }
2309 sched_energy_set(has_eas);
2310 #endif
2311
2312
2313 if (doms_cur != &fallback_doms)
2314 free_sched_domains(doms_cur, ndoms_cur);
2315
2316 kfree(dattr_cur);
2317 doms_cur = doms_new;
2318 dattr_cur = dattr_new;
2319 ndoms_cur = ndoms_new;
2320
2321 register_sched_domain_sysctl();
2322 }
2323
2324
2325
2326
2327 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
2328 struct sched_domain_attr *dattr_new)
2329 {
2330 mutex_lock(&sched_domains_mutex);
2331 partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
2332 mutex_unlock(&sched_domains_mutex);
2333 }