This source file includes the following definitions.
- css_cs
- task_cs
- parent_cs
- is_cpuset_online
- is_cpu_exclusive
- is_mem_exclusive
- is_mem_hardwall
- is_sched_load_balance
- is_memory_migrate
- is_spread_page
- is_spread_slab
- is_partition_root
- cpuset_read_lock
- cpuset_read_unlock
- is_in_v2_mode
- guarantee_online_cpus
- guarantee_online_mems
- cpuset_update_task_spread_flag
- is_cpuset_subset
- alloc_cpumasks
- free_cpumasks
- alloc_trial_cpuset
- free_cpuset
- validate_change
- cpusets_overlap
- update_domain_attr
- update_domain_attr_tree
- nr_cpusets
- generate_sched_domains
- update_tasks_root_domain
- rebuild_root_domains
- partition_and_rebuild_sched_domains
- rebuild_sched_domains_locked
- rebuild_sched_domains_locked
- rebuild_sched_domains
- update_tasks_cpumask
- compute_effective_cpumask
- update_parent_subparts_cpumask
- update_cpumasks_hier
- update_sibling_cpumasks
- update_cpumask
- cpuset_migrate_mm_workfn
- cpuset_migrate_mm
- cpuset_post_attach
- cpuset_change_task_nodemask
- update_tasks_nodemask
- update_nodemasks_hier
- update_nodemask
- current_cpuset_is_being_rebound
- update_relax_domain_level
- update_tasks_flags
- update_flag
- update_prstate
- fmeter_init
- fmeter_update
- fmeter_markevent
- fmeter_getrate
- cpuset_can_attach
- cpuset_cancel_attach
- cpuset_attach
- cpuset_write_u64
- cpuset_write_s64
- cpuset_write_resmask
- cpuset_common_seq_show
- cpuset_read_u64
- cpuset_read_s64
- sched_partition_show
- sched_partition_write
- cpuset_css_alloc
- cpuset_css_online
- cpuset_css_offline
- cpuset_css_free
- cpuset_bind
- cpuset_fork
- cpuset_init
- remove_tasks_in_empty_cpuset
- hotplug_update_tasks_legacy
- hotplug_update_tasks
- cpuset_force_rebuild
- cpuset_hotplug_update_tasks
- cpuset_hotplug_workfn
- cpuset_update_active_cpus
- cpuset_wait_for_hotplug
- cpuset_track_online_nodes
- cpuset_init_smp
- cpuset_cpus_allowed
- cpuset_cpus_allowed_fallback
- cpuset_init_current_mems_allowed
- cpuset_mems_allowed
- cpuset_nodemask_valid_mems_allowed
- nearest_hardwall_ancestor
- __cpuset_node_allowed
- cpuset_spread_node
- cpuset_mem_spread_node
- cpuset_slab_spread_node
- cpuset_mems_allowed_intersects
- cpuset_print_current_mems_allowed
- __cpuset_memory_pressure_bump
- proc_cpuset_show
- cpuset_task_status_allowed
/*
 * kernel/cgroup/cpuset.c
 *
 * Processor and memory placement constraints for sets of tasks
 * (the cpuset cgroup controller).
 */
25 #include <linux/cpu.h>
26 #include <linux/cpumask.h>
27 #include <linux/cpuset.h>
28 #include <linux/err.h>
29 #include <linux/errno.h>
30 #include <linux/file.h>
31 #include <linux/fs.h>
32 #include <linux/init.h>
33 #include <linux/interrupt.h>
34 #include <linux/kernel.h>
35 #include <linux/kmod.h>
36 #include <linux/list.h>
37 #include <linux/mempolicy.h>
38 #include <linux/mm.h>
39 #include <linux/memory.h>
40 #include <linux/export.h>
41 #include <linux/mount.h>
42 #include <linux/fs_context.h>
43 #include <linux/namei.h>
44 #include <linux/pagemap.h>
45 #include <linux/proc_fs.h>
46 #include <linux/rcupdate.h>
47 #include <linux/sched.h>
48 #include <linux/sched/deadline.h>
49 #include <linux/sched/mm.h>
50 #include <linux/sched/task.h>
51 #include <linux/seq_file.h>
52 #include <linux/security.h>
53 #include <linux/slab.h>
54 #include <linux/spinlock.h>
55 #include <linux/stat.h>
56 #include <linux/string.h>
57 #include <linux/time.h>
58 #include <linux/time64.h>
59 #include <linux/backing-dev.h>
60 #include <linux/sort.h>
61 #include <linux/oom.h>
62 #include <linux/sched/isolation.h>
63 #include <linux/uaccess.h>
64 #include <linux/atomic.h>
65 #include <linux/mutex.h>
66 #include <linux/cgroup.h>
67 #include <linux/wait.h>
68
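/*
 * Static keys that let cpuset hooks in hot paths stay cheap no-ops while
 * only the root cpuset exists.  cpuset_inc()/cpuset_dec(), called from
 * cpuset_css_online()/cpuset_css_offline() below, enable and disable them
 * as non-root cpusets are created and destroyed.
 */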
69 DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
70 DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
71
72
73
74 struct fmeter {
75 int cnt;
76 int val;
77 time64_t time;
78 spinlock_t lock;
79 };
80
81 struct cpuset {
82 struct cgroup_subsys_state css;
83
84 unsigned long flags;
85
/*
 * On the default (v2) hierarchy, cpus_allowed and mems_allowed below hold
 * the user-configured masks, which are not limited by the parent, while
 * effective_cpus and effective_mems hold the masks that actually apply to
 * the tasks in the cpuset:
 *
 *   effective_mask = user-configured mask & parent's effective_mask
 *
 * If that intersection turns out empty, the parent's effective mask is
 * inherited instead.  The effective masks are also updated on CPU and
 * memory hotplug.
 *
 * On the legacy (v1) hierarchy, the user-configured and effective masks
 * are kept identical.
 */
107 cpumask_var_t cpus_allowed;
108 nodemask_t mems_allowed;
109
110
111 cpumask_var_t effective_cpus;
112 nodemask_t effective_mems;
113
/*
 * CPUs allocated to child sub-partitions (default hierarchy only):
 * - CPUs granted by the parent = effective_cpus U subparts_cpus
 * - effective_cpus and subparts_cpus are mutually exclusive.
 *
 * effective_cpus contains only onlined CPUs, but subparts_cpus
 * may contain offlined ones.
 */
122 cpumask_var_t subparts_cpus;
123
/*
 * old_mems_allowed remembers the memory nodes the cpuset's tasks were
 * last bound to.  cpuset_migrate_mm() uses it as the "from" nodemask
 * when memory_migrate is enabled, after which it is updated to the new
 * mems_allowed.
 */
134 nodemask_t old_mems_allowed;
135
136 struct fmeter fmeter;
137
138
139
140
141
142 int attach_in_progress;
143
144
145 int pn;
146
147
148 int relax_domain_level;
149
150
151 int nr_subparts_cpus;
152
153
154 int partition_root_state;
155
/*
 * Default hierarchy only:
 * use_parent_ecpus  - set if this cpuset uses the parent's effective_cpus
 * child_ecpus_count - number of children with use_parent_ecpus set
 */
161 int use_parent_ecpus;
162 int child_ecpus_count;
163 };
164
/*
 * Partition root states (partition_root_state above):
 *
 *   0 - not a partition root ("member")
 *
 *   1 - a valid partition root: the cpuset's CPUs are held in the
 *       parent's subparts_cpus and removed from the parent's
 *       effective_cpus.
 *
 *  -1 - an invalid partition root: none of its CPUs can be granted by
 *       the parent.  The CPUs are returned to the parent's
 *       effective_cpus and the cpuset behaves like a "member" until
 *       the error condition clears.
 */
179 #define PRS_DISABLED 0
180 #define PRS_ENABLED 1
181 #define PRS_ERROR -1
182
/*
 * Temporary cpumasks that are passed among the cpuset functions below to
 * avoid repeated cpumask allocations in inner functions.
 */
187 struct tmpmasks {
188 cpumask_var_t addmask, delmask;
189 cpumask_var_t new_cpus;
190 };
191
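/* Convert a cgroup_subsys_state to the cpuset embedding it (NULL-safe) */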
192 static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
193 {
194 return css ? container_of(css, struct cpuset, css) : NULL;
195 }
196
/* Retrieve the cpuset of a task */
198 static inline struct cpuset *task_cs(struct task_struct *task)
199 {
200 return css_cs(task_css(task, cpuset_cgrp_id));
201 }
202
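/* Return the parent cpuset of @cs, or NULL for the top cpuset */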
203 static inline struct cpuset *parent_cs(struct cpuset *cs)
204 {
205 return css_cs(cs->css.parent);
206 }
207
/* Bits in the flags field of struct cpuset */
209 typedef enum {
210 CS_ONLINE,
211 CS_CPU_EXCLUSIVE,
212 CS_MEM_EXCLUSIVE,
213 CS_MEM_HARDWALL,
214 CS_MEMORY_MIGRATE,
215 CS_SCHED_LOAD_BALANCE,
216 CS_SPREAD_PAGE,
217 CS_SPREAD_SLAB,
218 } cpuset_flagbits_t;
219
220
221 static inline bool is_cpuset_online(struct cpuset *cs)
222 {
223 return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
224 }
225
226 static inline int is_cpu_exclusive(const struct cpuset *cs)
227 {
228 return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
229 }
230
231 static inline int is_mem_exclusive(const struct cpuset *cs)
232 {
233 return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
234 }
235
236 static inline int is_mem_hardwall(const struct cpuset *cs)
237 {
238 return test_bit(CS_MEM_HARDWALL, &cs->flags);
239 }
240
241 static inline int is_sched_load_balance(const struct cpuset *cs)
242 {
243 return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
244 }
245
246 static inline int is_memory_migrate(const struct cpuset *cs)
247 {
248 return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
249 }
250
251 static inline int is_spread_page(const struct cpuset *cs)
252 {
253 return test_bit(CS_SPREAD_PAGE, &cs->flags);
254 }
255
256 static inline int is_spread_slab(const struct cpuset *cs)
257 {
258 return test_bit(CS_SPREAD_SLAB, &cs->flags);
259 }
260
261 static inline int is_partition_root(const struct cpuset *cs)
262 {
263 return cs->partition_root_state > 0;
264 }
265
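/*
 * The top (root) cpuset.  It is permanently online, CPU- and
 * memory-exclusive, and treated as a partition root; cpuset_init()
 * below sets its masks to cover all possible CPUs and memory nodes.
 */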
266 static struct cpuset top_cpuset = {
267 .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
268 (1 << CS_MEM_EXCLUSIVE)),
269 .partition_root_state = PRS_ENABLED,
270 };
271
/**
 * cpuset_for_each_child - traverse the online children of a cpuset
 * @child_cs: loop cursor pointing to the current child
 * @pos_css: used internally for iteration
 * @parent_cs: target cpuset whose children are walked
 *
 * Walks the online children of @parent_cs.  Must be called under
 * rcu_read_lock().
 */
281 #define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
282 css_for_each_child((pos_css), &(parent_cs)->css) \
283 if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
284
/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's subtree
 * @des_cs: loop cursor pointing to the current descendant
 * @pos_css: used internally for iteration
 * @root_cs: target cpuset whose subtree is walked
 *
 * Walks the online descendants of @root_cs (including @root_cs itself)
 * in pre-order.  Must be called under rcu_read_lock().  A subtree may be
 * skipped by advancing @pos_css with css_rightmost_descendant(), as
 * several callers below do.
 */
296 #define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
297 css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
298 if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
299
/*
 * There are two locks guarding the cpuset structures: cpuset_rwsem and
 * callback_lock.
 *
 * cpuset_rwsem is a percpu rw-semaphore.  Anything that changes a cpuset
 * (masks, flags, the hierarchy, partition state) takes it for write;
 * cpuset_read_lock()/cpuset_read_unlock() give other code a read-side
 * hold that keeps the cpuset hierarchy stable.
 *
 * callback_lock is a spinlock, so it can also be taken from contexts
 * that cannot sleep and only need to query cpuset fields.  Writers below
 * take it (with interrupts disabled) around the actual updates of the
 * masks, flags and partition bookkeeping, so holding either lock is
 * enough to read those fields consistently.
 *
 * The locking order is cpuset_rwsem first, then callback_lock.
 */
336 DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);
337
338 void cpuset_read_lock(void)
339 {
340 percpu_down_read(&cpuset_rwsem);
341 }
342
343 void cpuset_read_unlock(void)
344 {
345 percpu_up_read(&cpuset_rwsem);
346 }
347
348 static DEFINE_SPINLOCK(callback_lock);
349
350 static struct workqueue_struct *cpuset_migrate_mm_wq;
351
352
353
354
355 static void cpuset_hotplug_workfn(struct work_struct *work);
356 static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
357
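/*
 * cpuset_attach() wakes this queue when the last in-flight attach to a
 * cpuset completes (attach_in_progress drops to zero); paths that must
 * not race with an attach wait on it.
 */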
358 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
359
360
361
362
363
364 static inline bool is_in_v2_mode(void)
365 {
366 return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
367 (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
368 }
369
/*
 * Return in pmask the portion of a cpuset's effective_cpus that is
 * online.  If none are online, walk up the cpuset hierarchy until a
 * cpuset with online effective CPUs is found; if the walk runs past the
 * top cpuset, fall back to cpu_online_mask.  One way or another, a
 * non-empty subset of cpu_online_mask is returned.
 *
 * Call with callback_lock or cpuset_rwsem held.
 */
380 static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
381 {
382 while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
383 cs = parent_cs(cs);
384 if (unlikely(!cs)) {
385
386
387
388
389
390
391
392 cpumask_copy(pmask, cpu_online_mask);
393 return;
394 }
395 }
396 cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
397 }
398
/*
 * Return in *pmask the portion of a cpuset's effective_mems that has
 * memory online.  If none does, walk up the cpuset hierarchy until a
 * cpuset with memory online is found; the top cpuset always has some
 * memory online, so the walk terminates.  The result is a non-empty
 * subset of node_states[N_MEMORY].
 *
 * Call with callback_lock or cpuset_rwsem held.
 */
410 static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
411 {
412 while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
413 cs = parent_cs(cs);
414 nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
415 }
416
/*
 * Update a task's page/slab spread flags to match its cpuset's settings.
 *
 * Call with callback_lock or cpuset_rwsem held.
 */
422 static void cpuset_update_task_spread_flag(struct cpuset *cs,
423 struct task_struct *tsk)
424 {
425 if (is_spread_page(cs))
426 task_set_spread_page(tsk);
427 else
428 task_clear_spread_page(tsk);
429
430 if (is_spread_slab(cs))
431 task_set_spread_slab(tsk);
432 else
433 task_clear_spread_slab(tsk);
434 }
435
/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and memory
 * nodes are a subset of the other's, and its exclusive flags are only
 * set if the other's are set.  Call holding cpuset_rwsem.
 */
444 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
445 {
446 return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
447 nodes_subset(p->mems_allowed, q->mems_allowed) &&
448 is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
449 is_mem_exclusive(p) <= is_mem_exclusive(q);
450 }
451
/**
 * alloc_cpumasks - allocate three cpumasks, either for a cpuset or for
 *                  a tmpmasks structure
 * @cs:  the cpuset whose masks are allocated, or NULL
 * @tmp: the tmpmasks structure whose masks are allocated, or NULL
 * Return: 0 if successful, -ENOMEM otherwise.
 *
 * Only one of the two arguments should be non-NULL.
 */
460 static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
461 {
462 cpumask_var_t *pmask1, *pmask2, *pmask3;
463
464 if (cs) {
465 pmask1 = &cs->cpus_allowed;
466 pmask2 = &cs->effective_cpus;
467 pmask3 = &cs->subparts_cpus;
468 } else {
469 pmask1 = &tmp->new_cpus;
470 pmask2 = &tmp->addmask;
471 pmask3 = &tmp->delmask;
472 }
473
474 if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
475 return -ENOMEM;
476
477 if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
478 goto free_one;
479
480 if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
481 goto free_two;
482
483 return 0;
484
485 free_two:
486 free_cpumask_var(*pmask2);
487 free_one:
488 free_cpumask_var(*pmask1);
489 return -ENOMEM;
490 }
491
492
493
494
495
496
497 static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
498 {
499 if (cs) {
500 free_cpumask_var(cs->cpus_allowed);
501 free_cpumask_var(cs->effective_cpus);
502 free_cpumask_var(cs->subparts_cpus);
503 }
504 if (tmp) {
505 free_cpumask_var(tmp->new_cpus);
506 free_cpumask_var(tmp->addmask);
507 free_cpumask_var(tmp->delmask);
508 }
509 }
510
511
512
513
514
515 static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
516 {
517 struct cpuset *trial;
518
519 trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
520 if (!trial)
521 return NULL;
522
523 if (alloc_cpumasks(trial, NULL)) {
524 kfree(trial);
525 return NULL;
526 }
527
528 cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
529 cpumask_copy(trial->effective_cpus, cs->effective_cpus);
530 return trial;
531 }
532
533
534
535
536
537 static inline void free_cpuset(struct cpuset *cs)
538 {
539 free_cpumasks(cs, NULL);
540 kfree(cs);
541 }
542
/*
 * validate_change() - check that a proposed cpuset change follows the
 *                     structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset (cur)
 * with those of the trial cpuset (trial), would the various subset and
 * exclusivity rules still hold?  Presumes cpuset_rwsem held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations such as
 * list traversal that depend on the real address must use cur, not trial.
 *
 * 'trial' is a bulk copy of cur with one or more of cpus_allowed,
 * mems_allowed or flags changed to the proposed new values.
 *
 * Return 0 if valid, -errno otherwise.
 */
563 static int validate_change(struct cpuset *cur, struct cpuset *trial)
564 {
565 struct cgroup_subsys_state *css;
566 struct cpuset *c, *par;
567 int ret;
568
569 rcu_read_lock();
570
571
572 ret = -EBUSY;
573 cpuset_for_each_child(c, css, cur)
574 if (!is_cpuset_subset(c, trial))
575 goto out;
576
577
578 ret = 0;
579 if (cur == &top_cpuset)
580 goto out;
581
582 par = parent_cs(cur);
583
584
585 ret = -EACCES;
586 if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
587 goto out;
588
589
590
591
592
593 ret = -EINVAL;
594 cpuset_for_each_child(c, css, par) {
595 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
596 c != cur &&
597 cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
598 goto out;
599 if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
600 c != cur &&
601 nodes_intersects(trial->mems_allowed, c->mems_allowed))
602 goto out;
603 }
604
605
606
607
608
609 ret = -ENOSPC;
610 if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
611 if (!cpumask_empty(cur->cpus_allowed) &&
612 cpumask_empty(trial->cpus_allowed))
613 goto out;
614 if (!nodes_empty(cur->mems_allowed) &&
615 nodes_empty(trial->mems_allowed))
616 goto out;
617 }
618
619
620
621
622
623 ret = -EBUSY;
624 if (is_cpu_exclusive(cur) &&
625 !cpuset_cpumask_can_shrink(cur->cpus_allowed,
626 trial->cpus_allowed))
627 goto out;
628
629 ret = 0;
630 out:
631 rcu_read_unlock();
632 return ret;
633 }
634
635 #ifdef CONFIG_SMP
636
637
638
639
640 static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
641 {
642 return cpumask_intersects(a->effective_cpus, b->effective_cpus);
643 }
644
645 static void
646 update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
647 {
648 if (dattr->relax_domain_level < c->relax_domain_level)
649 dattr->relax_domain_level = c->relax_domain_level;
650 return;
651 }
652
653 static void update_domain_attr_tree(struct sched_domain_attr *dattr,
654 struct cpuset *root_cs)
655 {
656 struct cpuset *cp;
657 struct cgroup_subsys_state *pos_css;
658
659 rcu_read_lock();
660 cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
661
662 if (cpumask_empty(cp->cpus_allowed)) {
663 pos_css = css_rightmost_descendant(pos_css);
664 continue;
665 }
666
667 if (is_sched_load_balance(cp))
668 update_domain_attr(dattr, cp);
669 }
670 rcu_read_unlock();
671 }
672
673
674 static inline int nr_cpusets(void)
675 {
676
677 return static_key_count(&cpusets_enabled_key.key) + 1;
678 }
679
/*
 * generate_sched_domains()
 *
 * Build a partition of the system's CPUs: an array of non-overlapping
 * cpumasks, one per scheduling domain, plus a matching array of
 * sched_domain_attr, and return them to the caller, which hands them to
 * the scheduler.
 *
 * Outline of the algorithm, as implemented below:
 *
 *  - If the top cpuset still load-balances everything and has no
 *    sub-partitions, a single domain covering the housekeeping CPUs in
 *    top_cpuset.effective_cpus suffices.
 *
 *  - Otherwise collect in csa[] every cpuset that should anchor a
 *    domain: the top cpuset (if load balancing) and each descendant
 *    partition root or load-balanced cpuset with non-empty effective
 *    CPUs, skipping subtrees that cannot contribute.
 *
 *  - Repeatedly merge csa[] entries whose effective CPUs overlap by
 *    giving them the same partition number (->pn) and decrementing
 *    ndoms, restarting until no two distinct partitions overlap.
 *
 *  - Allocate doms[]/dattr[] of size ndoms and fill each slot with the
 *    union of its members' effective CPUs (restricted to housekeeping
 *    CPUs) and a sched_domain_attr holding the highest
 *    relax_domain_level among its members.
 *
 * Must be called with cpuset_rwsem held.  Returns the number of sched
 * domains generated and stores the arrays in *domains and *attributes.
 */
733 static int generate_sched_domains(cpumask_var_t **domains,
734 struct sched_domain_attr **attributes)
735 {
736 struct cpuset *cp;
737 struct cpuset **csa;
738 int csn;
739 int i, j, k;
740 cpumask_var_t *doms;
741 struct sched_domain_attr *dattr;
742 int ndoms = 0;
743 int nslot;
744 struct cgroup_subsys_state *pos_css;
745 bool root_load_balance = is_sched_load_balance(&top_cpuset);
746
747 doms = NULL;
748 dattr = NULL;
749 csa = NULL;
750
751
752 if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
753 ndoms = 1;
754 doms = alloc_sched_domains(ndoms);
755 if (!doms)
756 goto done;
757
758 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
759 if (dattr) {
760 *dattr = SD_ATTR_INIT;
761 update_domain_attr_tree(dattr, &top_cpuset);
762 }
763 cpumask_and(doms[0], top_cpuset.effective_cpus,
764 housekeeping_cpumask(HK_FLAG_DOMAIN));
765
766 goto done;
767 }
768
769 csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
770 if (!csa)
771 goto done;
772 csn = 0;
773
774 rcu_read_lock();
775 if (root_load_balance)
776 csa[csn++] = &top_cpuset;
777 cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
778 if (cp == &top_cpuset)
779 continue;
780
781
782
783
784
785
786
787
788
789
790
791 if (!cpumask_empty(cp->cpus_allowed) &&
792 !(is_sched_load_balance(cp) &&
793 cpumask_intersects(cp->cpus_allowed,
794 housekeeping_cpumask(HK_FLAG_DOMAIN))))
795 continue;
796
797 if (root_load_balance &&
798 cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
799 continue;
800
801 if (is_sched_load_balance(cp) &&
802 !cpumask_empty(cp->effective_cpus))
803 csa[csn++] = cp;
804
805
806 if (!is_partition_root(cp))
807 pos_css = css_rightmost_descendant(pos_css);
808 }
809 rcu_read_unlock();
810
811 for (i = 0; i < csn; i++)
812 csa[i]->pn = i;
813 ndoms = csn;
814
815 restart:
816
817 for (i = 0; i < csn; i++) {
818 struct cpuset *a = csa[i];
819 int apn = a->pn;
820
821 for (j = 0; j < csn; j++) {
822 struct cpuset *b = csa[j];
823 int bpn = b->pn;
824
825 if (apn != bpn && cpusets_overlap(a, b)) {
826 for (k = 0; k < csn; k++) {
827 struct cpuset *c = csa[k];
828
829 if (c->pn == bpn)
830 c->pn = apn;
831 }
832 ndoms--;
833 goto restart;
834 }
835 }
836 }
837
838
839
840
841
842 doms = alloc_sched_domains(ndoms);
843 if (!doms)
844 goto done;
845
846
847
848
849
850 dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
851 GFP_KERNEL);
852
853 for (nslot = 0, i = 0; i < csn; i++) {
854 struct cpuset *a = csa[i];
855 struct cpumask *dp;
856 int apn = a->pn;
857
858 if (apn < 0) {
859
860 continue;
861 }
862
863 dp = doms[nslot];
864
865 if (nslot == ndoms) {
866 static int warnings = 10;
867 if (warnings) {
868 pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
869 nslot, ndoms, csn, i, apn);
870 warnings--;
871 }
872 continue;
873 }
874
875 cpumask_clear(dp);
876 if (dattr)
877 *(dattr + nslot) = SD_ATTR_INIT;
878 for (j = i; j < csn; j++) {
879 struct cpuset *b = csa[j];
880
881 if (apn == b->pn) {
882 cpumask_or(dp, dp, b->effective_cpus);
883 cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
884 if (dattr)
885 update_domain_attr_tree(dattr + nslot, b);
886
887
888 b->pn = -1;
889 }
890 }
891 nslot++;
892 }
893 BUG_ON(nslot != ndoms);
894
895 done:
896 kfree(csa);
897
898
899
900
901
902 if (doms == NULL)
903 ndoms = 1;
904
905 *domains = doms;
906 *attributes = dattr;
907 return ndoms;
908 }
909
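/*
 * Add the deadline bandwidth of every task in @cs back to the root
 * domain of the CPU it runs on; used while root domains are rebuilt.
 */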
910 static void update_tasks_root_domain(struct cpuset *cs)
911 {
912 struct css_task_iter it;
913 struct task_struct *task;
914
915 css_task_iter_start(&cs->css, 0, &it);
916
917 while ((task = css_task_iter_next(&it)))
918 dl_add_task_root_domain(task);
919
920 css_task_iter_end(&it);
921 }
922
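/*
 * Rebuild deadline-bandwidth accounting across the root domains after
 * the sched-domain topology has changed: clear def_root_domain and walk
 * every cpuset with a non-empty effective_cpus, re-adding its tasks'
 * bandwidth via update_tasks_root_domain().
 */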
923 static void rebuild_root_domains(void)
924 {
925 struct cpuset *cs = NULL;
926 struct cgroup_subsys_state *pos_css;
927
928 percpu_rwsem_assert_held(&cpuset_rwsem);
929 lockdep_assert_cpus_held();
930 lockdep_assert_held(&sched_domains_mutex);
931
932 cgroup_enable_task_cg_lists();
933
934 rcu_read_lock();
935
936
937
938
939
940 dl_clear_root_domain(&def_root_domain);
941
942 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
943
944 if (cpumask_empty(cs->effective_cpus)) {
945 pos_css = css_rightmost_descendant(pos_css);
946 continue;
947 }
948
949 css_get(&cs->css);
950
951 rcu_read_unlock();
952
953 update_tasks_root_domain(cs);
954
955 rcu_read_lock();
956 css_put(&cs->css);
957 }
958 rcu_read_unlock();
959 }
960
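/*
 * Hand the new set of sched domains to the scheduler and then rebuild
 * the root-domain deadline accounting, all under sched_domains_mutex.
 */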
961 static void
962 partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
963 struct sched_domain_attr *dattr_new)
964 {
965 mutex_lock(&sched_domains_mutex);
966 partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
967 rebuild_root_domains();
968 mutex_unlock(&sched_domains_mutex);
969 }
970
971
972
973
974
975
976
977
978
979
980
981
982 static void rebuild_sched_domains_locked(void)
983 {
984 struct sched_domain_attr *attr;
985 cpumask_var_t *doms;
986 int ndoms;
987
988 lockdep_assert_cpus_held();
989 percpu_rwsem_assert_held(&cpuset_rwsem);
990
991
992
993
994
995
996 if (!top_cpuset.nr_subparts_cpus &&
997 !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
998 return;
999
1000 if (top_cpuset.nr_subparts_cpus &&
1001 !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask))
1002 return;
1003
1004
1005 ndoms = generate_sched_domains(&doms, &attr);
1006
1007
1008 partition_and_rebuild_sched_domains(ndoms, doms, attr);
1009 }
1010 #else
1011 static void rebuild_sched_domains_locked(void)
1012 {
1013 }
1014 #endif
1015
1016 void rebuild_sched_domains(void)
1017 {
1018 get_online_cpus();
1019 percpu_down_write(&cpuset_rwsem);
1020 rebuild_sched_domains_locked();
1021 percpu_up_write(&cpuset_rwsem);
1022 put_online_cpus();
1023 }
1024
/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs changing
 *
 * Iterate through each task of @cs, updating its cpus_allowed to the
 * cpuset's effective_cpus.  Called with cpuset_rwsem held, so cpuset
 * membership stays stable.
 */
1033 static void update_tasks_cpumask(struct cpuset *cs)
1034 {
1035 struct css_task_iter it;
1036 struct task_struct *task;
1037
1038 css_task_iter_start(&cs->css, 0, &it);
1039 while ((task = css_task_iter_next(&it)))
1040 set_cpus_allowed_ptr(task, cs->effective_cpus);
1041 css_task_iter_end(&it);
1042 }
1043
/*
 * compute_effective_cpumask - compute a cpuset's new effective_cpus
 * @new_cpus: temporary variable receiving the new effective_cpus mask
 * @cs: the cpuset whose effective_cpus is being recomputed
 * @parent: the parent cpuset
 *
 * If the parent has sub-partition CPUs, they are included in the set of
 * allowable CPUs as well.  Since offlined CPUs are not removed from
 * subparts_cpus, cpu_active_mask is used to mask those out.
 */
1055 static void compute_effective_cpumask(struct cpumask *new_cpus,
1056 struct cpuset *cs, struct cpuset *parent)
1057 {
1058 if (parent->nr_subparts_cpus) {
1059 cpumask_or(new_cpus, parent->effective_cpus,
1060 parent->subparts_cpus);
1061 cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
1062 cpumask_and(new_cpus, new_cpus, cpu_active_mask);
1063 } else {
1064 cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
1065 }
1066 }
1067
1068
1069
1070
1071 enum subparts_cmd {
1072 partcmd_enable,
1073 partcmd_disable,
1074 partcmd_update,
1075 };
1076
/**
 * update_parent_subparts_cpumask - update the parent's subparts_cpus mask
 * @cpuset:  the cpuset requesting a change of partition root state
 * @cmd:     partcmd_enable, partcmd_disable or partcmd_update
 * @newmask: optional new cpus_allowed (used only by partcmd_update)
 * @tmp:     temporary masks
 * Return:   0 or 1 on success, an error code otherwise
 *
 * partcmd_enable:  @cpuset becomes a partition root; its cpus_allowed are
 *                  moved from the parent's effective_cpus into the
 *                  parent's subparts_cpus.
 *
 * partcmd_disable: @cpuset ceases to be a partition root; any of its CPUs
 *                  in the parent's subparts_cpus are moved back into the
 *                  parent's effective_cpus.
 *
 * partcmd_update:  the cpu list changes from cpus_allowed to @newmask,
 *                  or, with a NULL @newmask, the partition is revalidated
 *                  after a change higher up in the hierarchy.  The
 *                  partition may switch between the valid and invalid
 *                  (PRS_ERROR) states depending on whether the parent can
 *                  still grant its CPUs; an invalid partition gives its
 *                  CPUs back to the parent.  Returns 1 if the parent's
 *                  subparts_cpus or effective_cpus changed, 0 otherwise.
 *
 * partcmd_enable and partcmd_disable are used by update_prstate();
 * partcmd_update is used by update_cpumask() (with @newmask set) and by
 * update_cpumasks_hier() (with @newmask NULL).
 *
 * Called with cpuset_rwsem held.
 */
1119 static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
1120 struct cpumask *newmask,
1121 struct tmpmasks *tmp)
1122 {
1123 struct cpuset *parent = parent_cs(cpuset);
1124 int adding;
1125 int deleting;
1126 bool part_error = false;
1127
1128 percpu_rwsem_assert_held(&cpuset_rwsem);
1129
1130
1131
1132
1133
1134
1135 if (!is_partition_root(parent) ||
1136 (newmask && cpumask_empty(newmask)) ||
1137 (!newmask && cpumask_empty(cpuset->cpus_allowed)))
1138 return -EINVAL;
1139
1140
1141
1142
1143
1144 if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
1145 return -EBUSY;
1146
1147
1148
1149
1150
1151
1152 if ((cmd == partcmd_enable) &&
1153 (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
1154 cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
1155 return -EINVAL;
1156
1157
1158
1159
1160 adding = deleting = false;
1161 if (cmd == partcmd_enable) {
1162 cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
1163 adding = true;
1164 } else if (cmd == partcmd_disable) {
1165 deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
1166 parent->subparts_cpus);
1167 } else if (newmask) {
1168
1169
1170
1171
1172
1173
1174
1175 cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
1176 deleting = cpumask_and(tmp->delmask, tmp->delmask,
1177 parent->subparts_cpus);
1178
1179 cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
1180 adding = cpumask_andnot(tmp->addmask, tmp->addmask,
1181 parent->subparts_cpus);
1182
1183
1184
1185 if (adding &&
1186 cpumask_equal(parent->effective_cpus, tmp->addmask)) {
1187 if (!deleting)
1188 return -EINVAL;
1189
1190
1191
1192
1193
1194 if (!cpumask_and(tmp->addmask, tmp->delmask,
1195 cpu_active_mask))
1196 return -EINVAL;
1197 cpumask_copy(tmp->addmask, parent->effective_cpus);
1198 }
1199 } else {
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209 adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
1210 parent->effective_cpus);
1211 part_error = cpumask_equal(tmp->addmask,
1212 parent->effective_cpus);
1213 }
1214
1215 if (cmd == partcmd_update) {
1216 int prev_prs = cpuset->partition_root_state;
1217
1218
1219
1220
1221
1222 switch (cpuset->partition_root_state) {
1223 case PRS_ENABLED:
1224 if (part_error)
1225 cpuset->partition_root_state = PRS_ERROR;
1226 break;
1227 case PRS_ERROR:
1228 if (!part_error)
1229 cpuset->partition_root_state = PRS_ENABLED;
1230 break;
1231 }
1232
1233
1234
1235 part_error = (prev_prs == PRS_ERROR);
1236 }
1237
1238 if (!part_error && (cpuset->partition_root_state == PRS_ERROR))
1239 return 0;
1240
1241 if (cpuset->partition_root_state == PRS_ERROR) {
1242
1243
1244
1245 adding = false;
1246 deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
1247 parent->subparts_cpus);
1248 }
1249
1250 if (!adding && !deleting)
1251 return 0;
1252
1253
1254
1255
1256
1257
1258 spin_lock_irq(&callback_lock);
1259 if (adding) {
1260 cpumask_or(parent->subparts_cpus,
1261 parent->subparts_cpus, tmp->addmask);
1262 cpumask_andnot(parent->effective_cpus,
1263 parent->effective_cpus, tmp->addmask);
1264 }
1265 if (deleting) {
1266 cpumask_andnot(parent->subparts_cpus,
1267 parent->subparts_cpus, tmp->delmask);
1268
1269
1270
1271 cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
1272 cpumask_or(parent->effective_cpus,
1273 parent->effective_cpus, tmp->delmask);
1274 }
1275
1276 parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
1277 spin_unlock_irq(&callback_lock);
1278
1279 return cmd == partcmd_update;
1280 }
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294 static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
1295 {
1296 struct cpuset *cp;
1297 struct cgroup_subsys_state *pos_css;
1298 bool need_rebuild_sched_domains = false;
1299
1300 rcu_read_lock();
1301 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
1302 struct cpuset *parent = parent_cs(cp);
1303
1304 compute_effective_cpumask(tmp->new_cpus, cp, parent);
1305
1306
1307
1308
1309
1310 if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
1311 cpumask_copy(tmp->new_cpus, parent->effective_cpus);
1312 if (!cp->use_parent_ecpus) {
1313 cp->use_parent_ecpus = true;
1314 parent->child_ecpus_count++;
1315 }
1316 } else if (cp->use_parent_ecpus) {
1317 cp->use_parent_ecpus = false;
1318 WARN_ON_ONCE(!parent->child_ecpus_count);
1319 parent->child_ecpus_count--;
1320 }
1321
1322
1323
1324
1325
1326 if (!cp->partition_root_state &&
1327 cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
1328 pos_css = css_rightmost_descendant(pos_css);
1329 continue;
1330 }
1331
1332
1333
1334
1335
1336
1337
1338 if ((cp != cs) && cp->partition_root_state) {
1339 switch (parent->partition_root_state) {
1340 case PRS_DISABLED:
1341
1342
1343
1344
1345
1346 WARN_ON_ONCE(cp->partition_root_state
1347 != PRS_ERROR);
1348 cp->partition_root_state = 0;
1349
1350
1351
1352
1353
1354
1355
1356
1357 clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
1358 break;
1359
1360 case PRS_ENABLED:
1361 if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
1362 update_tasks_cpumask(parent);
1363 break;
1364
1365 case PRS_ERROR:
1366
1367
1368
1369 cp->partition_root_state = PRS_ERROR;
1370 if (cp->nr_subparts_cpus) {
1371 cp->nr_subparts_cpus = 0;
1372 cpumask_clear(cp->subparts_cpus);
1373 }
1374 break;
1375 }
1376 }
1377
1378 if (!css_tryget_online(&cp->css))
1379 continue;
1380 rcu_read_unlock();
1381
1382 spin_lock_irq(&callback_lock);
1383
1384 cpumask_copy(cp->effective_cpus, tmp->new_cpus);
1385 if (cp->nr_subparts_cpus &&
1386 (cp->partition_root_state != PRS_ENABLED)) {
1387 cp->nr_subparts_cpus = 0;
1388 cpumask_clear(cp->subparts_cpus);
1389 } else if (cp->nr_subparts_cpus) {
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399 cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
1400 cp->subparts_cpus);
1401 if (cpumask_empty(cp->effective_cpus)) {
1402 cpumask_copy(cp->effective_cpus, tmp->new_cpus);
1403 cpumask_clear(cp->subparts_cpus);
1404 cp->nr_subparts_cpus = 0;
1405 } else if (!cpumask_subset(cp->subparts_cpus,
1406 tmp->new_cpus)) {
1407 cpumask_andnot(cp->subparts_cpus,
1408 cp->subparts_cpus, tmp->new_cpus);
1409 cp->nr_subparts_cpus
1410 = cpumask_weight(cp->subparts_cpus);
1411 }
1412 }
1413 spin_unlock_irq(&callback_lock);
1414
1415 WARN_ON(!is_in_v2_mode() &&
1416 !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
1417
1418 update_tasks_cpumask(cp);
1419
1420
1421
1422
1423
1424
1425
1426 if (!cpumask_empty(cp->cpus_allowed) &&
1427 is_sched_load_balance(cp) &&
1428 (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
1429 is_partition_root(cp)))
1430 need_rebuild_sched_domains = true;
1431
1432 rcu_read_lock();
1433 css_put(&cp->css);
1434 }
1435 rcu_read_unlock();
1436
1437 if (need_rebuild_sched_domains)
1438 rebuild_sched_domains_locked();
1439 }
1440
1441
1442
1443
1444
1445
1446
1447 static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
1448 struct tmpmasks *tmp)
1449 {
1450 struct cpuset *sibling;
1451 struct cgroup_subsys_state *pos_css;
1452
1453
1454
1455
1456
1457
1458 rcu_read_lock();
1459 cpuset_for_each_child(sibling, pos_css, parent) {
1460 if (sibling == cs)
1461 continue;
1462 if (!sibling->use_parent_ecpus)
1463 continue;
1464
1465 update_cpumasks_hier(sibling, tmp);
1466 }
1467 rcu_read_unlock();
1468 }
1469
1470
1471
1472
1473
1474
1475
1476 static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
1477 const char *buf)
1478 {
1479 int retval;
1480 struct tmpmasks tmp;
1481
1482
1483 if (cs == &top_cpuset)
1484 return -EACCES;
1485
1486
1487
1488
1489
1490
1491
1492 if (!*buf) {
1493 cpumask_clear(trialcs->cpus_allowed);
1494 } else {
1495 retval = cpulist_parse(buf, trialcs->cpus_allowed);
1496 if (retval < 0)
1497 return retval;
1498
1499 if (!cpumask_subset(trialcs->cpus_allowed,
1500 top_cpuset.cpus_allowed))
1501 return -EINVAL;
1502 }
1503
1504
1505 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
1506 return 0;
1507
1508 retval = validate_change(cs, trialcs);
1509 if (retval < 0)
1510 return retval;
1511
1512 #ifdef CONFIG_CPUMASK_OFFSTACK
1513
1514
1515
1516
1517 tmp.addmask = trialcs->subparts_cpus;
1518 tmp.delmask = trialcs->effective_cpus;
1519 tmp.new_cpus = trialcs->cpus_allowed;
1520 #endif
1521
1522 if (cs->partition_root_state) {
1523
1524 if (cpumask_empty(trialcs->cpus_allowed))
1525 return -EINVAL;
1526 if (update_parent_subparts_cpumask(cs, partcmd_update,
1527 trialcs->cpus_allowed, &tmp) < 0)
1528 return -EINVAL;
1529 }
1530
1531 spin_lock_irq(&callback_lock);
1532 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
1533
1534
1535
1536
1537 if (cs->nr_subparts_cpus) {
1538 cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus,
1539 cs->cpus_allowed);
1540 cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
1541 }
1542 spin_unlock_irq(&callback_lock);
1543
1544 update_cpumasks_hier(cs, &tmp);
1545
1546 if (cs->partition_root_state) {
1547 struct cpuset *parent = parent_cs(cs);
1548
1549
1550
1551
1552
1553 if (parent->child_ecpus_count)
1554 update_sibling_cpumasks(parent, cs, &tmp);
1555 }
1556 return 0;
1557 }
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567 struct cpuset_migrate_mm_work {
1568 struct work_struct work;
1569 struct mm_struct *mm;
1570 nodemask_t from;
1571 nodemask_t to;
1572 };
1573
1574 static void cpuset_migrate_mm_workfn(struct work_struct *work)
1575 {
1576 struct cpuset_migrate_mm_work *mwork =
1577 container_of(work, struct cpuset_migrate_mm_work, work);
1578
1579
1580 do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
1581 mmput(mwork->mm);
1582 kfree(mwork);
1583 }
1584
1585 static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
1586 const nodemask_t *to)
1587 {
1588 struct cpuset_migrate_mm_work *mwork;
1589
1590 mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
1591 if (mwork) {
1592 mwork->mm = mm;
1593 mwork->from = *from;
1594 mwork->to = *to;
1595 INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
1596 queue_work(cpuset_migrate_mm_wq, &mwork->work);
1597 } else {
1598 mmput(mm);
1599 }
1600 }
1601
1602 static void cpuset_post_attach(void)
1603 {
1604 flush_workqueue(cpuset_migrate_mm_wq);
1605 }
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617 static void cpuset_change_task_nodemask(struct task_struct *tsk,
1618 nodemask_t *newmems)
1619 {
1620 task_lock(tsk);
1621
1622 local_irq_disable();
1623 write_seqcount_begin(&tsk->mems_allowed_seq);
1624
1625 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
1626 mpol_rebind_task(tsk, newmems);
1627 tsk->mems_allowed = *newmems;
1628
1629 write_seqcount_end(&tsk->mems_allowed_seq);
1630 local_irq_enable();
1631
1632 task_unlock(tsk);
1633 }
1634
1635 static void *cpuset_being_rebound;
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645 static void update_tasks_nodemask(struct cpuset *cs)
1646 {
1647 static nodemask_t newmems;
1648 struct css_task_iter it;
1649 struct task_struct *task;
1650
1651 cpuset_being_rebound = cs;
1652
1653 guarantee_online_mems(cs, &newmems);
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665 css_task_iter_start(&cs->css, 0, &it);
1666 while ((task = css_task_iter_next(&it))) {
1667 struct mm_struct *mm;
1668 bool migrate;
1669
1670 cpuset_change_task_nodemask(task, &newmems);
1671
1672 mm = get_task_mm(task);
1673 if (!mm)
1674 continue;
1675
1676 migrate = is_memory_migrate(cs);
1677
1678 mpol_rebind_mm(mm, &cs->mems_allowed);
1679 if (migrate)
1680 cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
1681 else
1682 mmput(mm);
1683 }
1684 css_task_iter_end(&it);
1685
1686
1687
1688
1689
1690 cs->old_mems_allowed = newmems;
1691
1692
1693 cpuset_being_rebound = NULL;
1694 }
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708 static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
1709 {
1710 struct cpuset *cp;
1711 struct cgroup_subsys_state *pos_css;
1712
1713 rcu_read_lock();
1714 cpuset_for_each_descendant_pre(cp, pos_css, cs) {
1715 struct cpuset *parent = parent_cs(cp);
1716
1717 nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
1718
1719
1720
1721
1722
1723 if (is_in_v2_mode() && nodes_empty(*new_mems))
1724 *new_mems = parent->effective_mems;
1725
1726
1727 if (nodes_equal(*new_mems, cp->effective_mems)) {
1728 pos_css = css_rightmost_descendant(pos_css);
1729 continue;
1730 }
1731
1732 if (!css_tryget_online(&cp->css))
1733 continue;
1734 rcu_read_unlock();
1735
1736 spin_lock_irq(&callback_lock);
1737 cp->effective_mems = *new_mems;
1738 spin_unlock_irq(&callback_lock);
1739
1740 WARN_ON(!is_in_v2_mode() &&
1741 !nodes_equal(cp->mems_allowed, cp->effective_mems));
1742
1743 update_tasks_nodemask(cp);
1744
1745 rcu_read_lock();
1746 css_put(&cp->css);
1747 }
1748 rcu_read_unlock();
1749 }
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764 static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1765 const char *buf)
1766 {
1767 int retval;
1768
1769
1770
1771
1772
1773 if (cs == &top_cpuset) {
1774 retval = -EACCES;
1775 goto done;
1776 }
1777
1778
1779
1780
1781
1782
1783
1784 if (!*buf) {
1785 nodes_clear(trialcs->mems_allowed);
1786 } else {
1787 retval = nodelist_parse(buf, trialcs->mems_allowed);
1788 if (retval < 0)
1789 goto done;
1790
1791 if (!nodes_subset(trialcs->mems_allowed,
1792 top_cpuset.mems_allowed)) {
1793 retval = -EINVAL;
1794 goto done;
1795 }
1796 }
1797
1798 if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
1799 retval = 0;
1800 goto done;
1801 }
1802 retval = validate_change(cs, trialcs);
1803 if (retval < 0)
1804 goto done;
1805
1806 spin_lock_irq(&callback_lock);
1807 cs->mems_allowed = trialcs->mems_allowed;
1808 spin_unlock_irq(&callback_lock);
1809
1810
1811 update_nodemasks_hier(cs, &trialcs->mems_allowed);
1812 done:
1813 return retval;
1814 }
1815
1816 bool current_cpuset_is_being_rebound(void)
1817 {
1818 bool ret;
1819
1820 rcu_read_lock();
1821 ret = task_cs(current) == cpuset_being_rebound;
1822 rcu_read_unlock();
1823
1824 return ret;
1825 }
1826
1827 static int update_relax_domain_level(struct cpuset *cs, s64 val)
1828 {
1829 #ifdef CONFIG_SMP
1830 if (val < -1 || val >= sched_domain_level_max)
1831 return -EINVAL;
1832 #endif
1833
1834 if (val != cs->relax_domain_level) {
1835 cs->relax_domain_level = val;
1836 if (!cpumask_empty(cs->cpus_allowed) &&
1837 is_sched_load_balance(cs))
1838 rebuild_sched_domains_locked();
1839 }
1840
1841 return 0;
1842 }
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852 static void update_tasks_flags(struct cpuset *cs)
1853 {
1854 struct css_task_iter it;
1855 struct task_struct *task;
1856
1857 css_task_iter_start(&cs->css, 0, &it);
1858 while ((task = css_task_iter_next(&it)))
1859 cpuset_update_task_spread_flag(cs, task);
1860 css_task_iter_end(&it);
1861 }
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1873 int turning_on)
1874 {
1875 struct cpuset *trialcs;
1876 int balance_flag_changed;
1877 int spread_flag_changed;
1878 int err;
1879
1880 trialcs = alloc_trial_cpuset(cs);
1881 if (!trialcs)
1882 return -ENOMEM;
1883
1884 if (turning_on)
1885 set_bit(bit, &trialcs->flags);
1886 else
1887 clear_bit(bit, &trialcs->flags);
1888
1889 err = validate_change(cs, trialcs);
1890 if (err < 0)
1891 goto out;
1892
1893 balance_flag_changed = (is_sched_load_balance(cs) !=
1894 is_sched_load_balance(trialcs));
1895
1896 spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
1897 || (is_spread_page(cs) != is_spread_page(trialcs)));
1898
1899 spin_lock_irq(&callback_lock);
1900 cs->flags = trialcs->flags;
1901 spin_unlock_irq(&callback_lock);
1902
1903 if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
1904 rebuild_sched_domains_locked();
1905
1906 if (spread_flag_changed)
1907 update_tasks_flags(cs);
1908 out:
1909 free_cpuset(trialcs);
1910 return err;
1911 }
1912
1913
1914
1915
1916
1917
1918
1919
1920 static int update_prstate(struct cpuset *cs, int val)
1921 {
1922 int err;
1923 struct cpuset *parent = parent_cs(cs);
1924 struct tmpmasks tmp;
1925
1926 if ((val != 0) && (val != 1))
1927 return -EINVAL;
1928 if (val == cs->partition_root_state)
1929 return 0;
1930
1931
1932
1933
1934
1935 if (val && cs->partition_root_state)
1936 return -EINVAL;
1937
1938 if (alloc_cpumasks(NULL, &tmp))
1939 return -ENOMEM;
1940
1941 err = -EINVAL;
1942 if (!cs->partition_root_state) {
1943
1944
1945
1946
1947
1948 if (cpumask_empty(cs->cpus_allowed))
1949 goto out;
1950
1951 err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
1952 if (err)
1953 goto out;
1954
1955 err = update_parent_subparts_cpumask(cs, partcmd_enable,
1956 NULL, &tmp);
1957 if (err) {
1958 update_flag(CS_CPU_EXCLUSIVE, cs, 0);
1959 goto out;
1960 }
1961 cs->partition_root_state = PRS_ENABLED;
1962 } else {
1963
1964
1965
1966
1967 if (cs->partition_root_state == PRS_ERROR) {
1968 cs->partition_root_state = 0;
1969 update_flag(CS_CPU_EXCLUSIVE, cs, 0);
1970 err = 0;
1971 goto out;
1972 }
1973
1974 err = update_parent_subparts_cpumask(cs, partcmd_disable,
1975 NULL, &tmp);
1976 if (err)
1977 goto out;
1978
1979 cs->partition_root_state = 0;
1980
1981
1982 update_flag(CS_CPU_EXCLUSIVE, cs, 0);
1983 }
1984
1985
1986
1987
1988
1989 if (parent != &top_cpuset)
1990 update_tasks_cpumask(parent);
1991
1992 if (parent->child_ecpus_count)
1993 update_sibling_cpumasks(parent, cs, &tmp);
1994
1995 rebuild_sched_domains_locked();
1996 out:
1997 free_cpumasks(NULL, &tmp);
1998 return err;
1999 }
2000
/*
 * Frequency meter - how fast is some event occurring?
 *
 * These routines implement a simple digital low-pass filter used to
 * estimate the rate of events (here, cpuset memory_pressure bumps).
 *
 * fmeter_markevent() adds FM_SCALE to a pending event count (capped at
 * FM_MAXCNT).  fmeter_update() first decays the running value by a
 * factor of FM_COEF/FM_SCALE (0.933) for every elapsed second, then
 * folds the pending count in with weight (FM_SCALE - FM_COEF)/FM_SCALE.
 * With a per-second decay of 0.933, the value falls to roughly half in
 * ten seconds (0.933^10 ~= 0.50), so fmeter_getrate() reports an
 * exponentially weighted recent event rate, scaled by FM_SCALE.
 *
 * The per-fmeter spinlock serializes updates; fmeter_getrate() also
 * folds in any pending events before returning the value.
 */
2046 #define FM_COEF 933
2047 #define FM_MAXTICKS ((u32)99)
2048 #define FM_MAXCNT 1000000
2049 #define FM_SCALE 1000
2050
2051
2052 static void fmeter_init(struct fmeter *fmp)
2053 {
2054 fmp->cnt = 0;
2055 fmp->val = 0;
2056 fmp->time = 0;
2057 spin_lock_init(&fmp->lock);
2058 }
2059
2060
2061 static void fmeter_update(struct fmeter *fmp)
2062 {
2063 time64_t now;
2064 u32 ticks;
2065
2066 now = ktime_get_seconds();
2067 ticks = now - fmp->time;
2068
2069 if (ticks == 0)
2070 return;
2071
2072 ticks = min(FM_MAXTICKS, ticks);
2073 while (ticks-- > 0)
2074 fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
2075 fmp->time = now;
2076
2077 fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
2078 fmp->cnt = 0;
2079 }
2080
2081
2082 static void fmeter_markevent(struct fmeter *fmp)
2083 {
2084 spin_lock(&fmp->lock);
2085 fmeter_update(fmp);
2086 fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
2087 spin_unlock(&fmp->lock);
2088 }
2089
2090
2091 static int fmeter_getrate(struct fmeter *fmp)
2092 {
2093 int val;
2094
2095 spin_lock(&fmp->lock);
2096 fmeter_update(fmp);
2097 val = fmp->val;
2098 spin_unlock(&fmp->lock);
2099 return val;
2100 }
2101
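/*
 * Cpuset the first task of the taskset belonged to before the attach;
 * recorded in cpuset_can_attach() and consumed by cpuset_attach() when
 * migrating memory (old_mems_allowed is taken from it).
 */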
2102 static struct cpuset *cpuset_attach_old_cs;
2103
/* Called by cgroups to determine if a cpuset is usable; takes cpuset_rwsem */
2105 static int cpuset_can_attach(struct cgroup_taskset *tset)
2106 {
2107 struct cgroup_subsys_state *css;
2108 struct cpuset *cs;
2109 struct task_struct *task;
2110 int ret;
2111
2112
2113 cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
2114 cs = css_cs(css);
2115
2116 percpu_down_write(&cpuset_rwsem);
2117
2118
2119 ret = -ENOSPC;
2120 if (!is_in_v2_mode() &&
2121 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
2122 goto out_unlock;
2123
2124 cgroup_taskset_for_each(task, css, tset) {
2125 ret = task_can_attach(task, cs->cpus_allowed);
2126 if (ret)
2127 goto out_unlock;
2128 ret = security_task_setscheduler(task);
2129 if (ret)
2130 goto out_unlock;
2131 }
2132
2133
2134
2135
2136
2137 cs->attach_in_progress++;
2138 ret = 0;
2139 out_unlock:
2140 percpu_up_write(&cpuset_rwsem);
2141 return ret;
2142 }
2143
2144 static void cpuset_cancel_attach(struct cgroup_taskset *tset)
2145 {
2146 struct cgroup_subsys_state *css;
2147
2148 cgroup_taskset_first(tset, &css);
2149
2150 percpu_down_write(&cpuset_rwsem);
2151 css_cs(css)->attach_in_progress--;
2152 percpu_up_write(&cpuset_rwsem);
2153 }
2154
2155
2156
2157
2158
2159
2160 static cpumask_var_t cpus_attach;
2161
2162 static void cpuset_attach(struct cgroup_taskset *tset)
2163 {
2164
2165 static nodemask_t cpuset_attach_nodemask_to;
2166 struct task_struct *task;
2167 struct task_struct *leader;
2168 struct cgroup_subsys_state *css;
2169 struct cpuset *cs;
2170 struct cpuset *oldcs = cpuset_attach_old_cs;
2171
2172 cgroup_taskset_first(tset, &css);
2173 cs = css_cs(css);
2174
2175 percpu_down_write(&cpuset_rwsem);
2176
2177
2178 if (cs == &top_cpuset)
2179 cpumask_copy(cpus_attach, cpu_possible_mask);
2180 else
2181 guarantee_online_cpus(cs, cpus_attach);
2182
2183 guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
2184
2185 cgroup_taskset_for_each(task, css, tset) {
2186
2187
2188
2189
2190 WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
2191
2192 cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
2193 cpuset_update_task_spread_flag(cs, task);
2194 }
2195
2196
2197
2198
2199
2200 cpuset_attach_nodemask_to = cs->effective_mems;
2201 cgroup_taskset_for_each_leader(leader, css, tset) {
2202 struct mm_struct *mm = get_task_mm(leader);
2203
2204 if (mm) {
2205 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215 if (is_memory_migrate(cs))
2216 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
2217 &cpuset_attach_nodemask_to);
2218 else
2219 mmput(mm);
2220 }
2221 }
2222
2223 cs->old_mems_allowed = cpuset_attach_nodemask_to;
2224
2225 cs->attach_in_progress--;
2226 if (!cs->attach_in_progress)
2227 wake_up(&cpuset_attach_wq);
2228
2229 percpu_up_write(&cpuset_rwsem);
2230 }
2231
2232
2233
2234 typedef enum {
2235 FILE_MEMORY_MIGRATE,
2236 FILE_CPULIST,
2237 FILE_MEMLIST,
2238 FILE_EFFECTIVE_CPULIST,
2239 FILE_EFFECTIVE_MEMLIST,
2240 FILE_SUBPARTS_CPULIST,
2241 FILE_CPU_EXCLUSIVE,
2242 FILE_MEM_EXCLUSIVE,
2243 FILE_MEM_HARDWALL,
2244 FILE_SCHED_LOAD_BALANCE,
2245 FILE_PARTITION_ROOT,
2246 FILE_SCHED_RELAX_DOMAIN_LEVEL,
2247 FILE_MEMORY_PRESSURE_ENABLED,
2248 FILE_MEMORY_PRESSURE,
2249 FILE_SPREAD_PAGE,
2250 FILE_SPREAD_SLAB,
2251 } cpuset_filetype_t;
2252
2253 static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
2254 u64 val)
2255 {
2256 struct cpuset *cs = css_cs(css);
2257 cpuset_filetype_t type = cft->private;
2258 int retval = 0;
2259
2260 get_online_cpus();
2261 percpu_down_write(&cpuset_rwsem);
2262 if (!is_cpuset_online(cs)) {
2263 retval = -ENODEV;
2264 goto out_unlock;
2265 }
2266
2267 switch (type) {
2268 case FILE_CPU_EXCLUSIVE:
2269 retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
2270 break;
2271 case FILE_MEM_EXCLUSIVE:
2272 retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
2273 break;
2274 case FILE_MEM_HARDWALL:
2275 retval = update_flag(CS_MEM_HARDWALL, cs, val);
2276 break;
2277 case FILE_SCHED_LOAD_BALANCE:
2278 retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
2279 break;
2280 case FILE_MEMORY_MIGRATE:
2281 retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
2282 break;
2283 case FILE_MEMORY_PRESSURE_ENABLED:
2284 cpuset_memory_pressure_enabled = !!val;
2285 break;
2286 case FILE_SPREAD_PAGE:
2287 retval = update_flag(CS_SPREAD_PAGE, cs, val);
2288 break;
2289 case FILE_SPREAD_SLAB:
2290 retval = update_flag(CS_SPREAD_SLAB, cs, val);
2291 break;
2292 default:
2293 retval = -EINVAL;
2294 break;
2295 }
2296 out_unlock:
2297 percpu_up_write(&cpuset_rwsem);
2298 put_online_cpus();
2299 return retval;
2300 }
2301
2302 static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
2303 s64 val)
2304 {
2305 struct cpuset *cs = css_cs(css);
2306 cpuset_filetype_t type = cft->private;
2307 int retval = -ENODEV;
2308
2309 get_online_cpus();
2310 percpu_down_write(&cpuset_rwsem);
2311 if (!is_cpuset_online(cs))
2312 goto out_unlock;
2313
2314 switch (type) {
2315 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
2316 retval = update_relax_domain_level(cs, val);
2317 break;
2318 default:
2319 retval = -EINVAL;
2320 break;
2321 }
2322 out_unlock:
2323 percpu_up_write(&cpuset_rwsem);
2324 put_online_cpus();
2325 return retval;
2326 }
2327
2328
2329
2330
2331 static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
2332 char *buf, size_t nbytes, loff_t off)
2333 {
2334 struct cpuset *cs = css_cs(of_css(of));
2335 struct cpuset *trialcs;
2336 int retval = -ENODEV;
2337
2338 buf = strstrip(buf);
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359 css_get(&cs->css);
2360 kernfs_break_active_protection(of->kn);
2361 flush_work(&cpuset_hotplug_work);
2362
2363 get_online_cpus();
2364 percpu_down_write(&cpuset_rwsem);
2365 if (!is_cpuset_online(cs))
2366 goto out_unlock;
2367
2368 trialcs = alloc_trial_cpuset(cs);
2369 if (!trialcs) {
2370 retval = -ENOMEM;
2371 goto out_unlock;
2372 }
2373
2374 switch (of_cft(of)->private) {
2375 case FILE_CPULIST:
2376 retval = update_cpumask(cs, trialcs, buf);
2377 break;
2378 case FILE_MEMLIST:
2379 retval = update_nodemask(cs, trialcs, buf);
2380 break;
2381 default:
2382 retval = -EINVAL;
2383 break;
2384 }
2385
2386 free_cpuset(trialcs);
2387 out_unlock:
2388 percpu_up_write(&cpuset_rwsem);
2389 put_online_cpus();
2390 kernfs_unbreak_active_protection(of->kn);
2391 css_put(&cs->css);
2392 flush_workqueue(cpuset_migrate_mm_wq);
2393 return retval ?: nbytes;
2394 }
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404 static int cpuset_common_seq_show(struct seq_file *sf, void *v)
2405 {
2406 struct cpuset *cs = css_cs(seq_css(sf));
2407 cpuset_filetype_t type = seq_cft(sf)->private;
2408 int ret = 0;
2409
2410 spin_lock_irq(&callback_lock);
2411
2412 switch (type) {
2413 case FILE_CPULIST:
2414 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
2415 break;
2416 case FILE_MEMLIST:
2417 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
2418 break;
2419 case FILE_EFFECTIVE_CPULIST:
2420 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
2421 break;
2422 case FILE_EFFECTIVE_MEMLIST:
2423 seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
2424 break;
2425 case FILE_SUBPARTS_CPULIST:
2426 seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
2427 break;
2428 default:
2429 ret = -EINVAL;
2430 }
2431
2432 spin_unlock_irq(&callback_lock);
2433 return ret;
2434 }
2435
2436 static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
2437 {
2438 struct cpuset *cs = css_cs(css);
2439 cpuset_filetype_t type = cft->private;
2440 switch (type) {
2441 case FILE_CPU_EXCLUSIVE:
2442 return is_cpu_exclusive(cs);
2443 case FILE_MEM_EXCLUSIVE:
2444 return is_mem_exclusive(cs);
2445 case FILE_MEM_HARDWALL:
2446 return is_mem_hardwall(cs);
2447 case FILE_SCHED_LOAD_BALANCE:
2448 return is_sched_load_balance(cs);
2449 case FILE_MEMORY_MIGRATE:
2450 return is_memory_migrate(cs);
2451 case FILE_MEMORY_PRESSURE_ENABLED:
2452 return cpuset_memory_pressure_enabled;
2453 case FILE_MEMORY_PRESSURE:
2454 return fmeter_getrate(&cs->fmeter);
2455 case FILE_SPREAD_PAGE:
2456 return is_spread_page(cs);
2457 case FILE_SPREAD_SLAB:
2458 return is_spread_slab(cs);
2459 default:
2460 BUG();
2461 }
2462
2463
2464 return 0;
2465 }
2466
2467 static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
2468 {
2469 struct cpuset *cs = css_cs(css);
2470 cpuset_filetype_t type = cft->private;
2471 switch (type) {
2472 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
2473 return cs->relax_domain_level;
2474 default:
2475 BUG();
2476 }
2477
2478
2479 return 0;
2480 }
2481
2482 static int sched_partition_show(struct seq_file *seq, void *v)
2483 {
2484 struct cpuset *cs = css_cs(seq_css(seq));
2485
2486 switch (cs->partition_root_state) {
2487 case PRS_ENABLED:
2488 seq_puts(seq, "root\n");
2489 break;
2490 case PRS_DISABLED:
2491 seq_puts(seq, "member\n");
2492 break;
2493 case PRS_ERROR:
2494 seq_puts(seq, "root invalid\n");
2495 break;
2496 }
2497 return 0;
2498 }
2499
2500 static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
2501 size_t nbytes, loff_t off)
2502 {
2503 struct cpuset *cs = css_cs(of_css(of));
2504 int val;
2505 int retval = -ENODEV;
2506
2507 buf = strstrip(buf);
2508
2509
2510
2511
2512 if (!strcmp(buf, "root"))
2513 val = PRS_ENABLED;
2514 else if (!strcmp(buf, "member"))
2515 val = PRS_DISABLED;
2516 else
2517 return -EINVAL;
2518
2519 css_get(&cs->css);
2520 get_online_cpus();
2521 percpu_down_write(&cpuset_rwsem);
2522 if (!is_cpuset_online(cs))
2523 goto out_unlock;
2524
2525 retval = update_prstate(cs, val);
2526 out_unlock:
2527 percpu_up_write(&cpuset_rwsem);
2528 put_online_cpus();
2529 css_put(&cs->css);
2530 return retval ?: nbytes;
2531 }
2532
2533
2534
2535
2536
2537 static struct cftype legacy_files[] = {
2538 {
2539 .name = "cpus",
2540 .seq_show = cpuset_common_seq_show,
2541 .write = cpuset_write_resmask,
2542 .max_write_len = (100U + 6 * NR_CPUS),
2543 .private = FILE_CPULIST,
2544 },
2545
2546 {
2547 .name = "mems",
2548 .seq_show = cpuset_common_seq_show,
2549 .write = cpuset_write_resmask,
2550 .max_write_len = (100U + 6 * MAX_NUMNODES),
2551 .private = FILE_MEMLIST,
2552 },
2553
2554 {
2555 .name = "effective_cpus",
2556 .seq_show = cpuset_common_seq_show,
2557 .private = FILE_EFFECTIVE_CPULIST,
2558 },
2559
2560 {
2561 .name = "effective_mems",
2562 .seq_show = cpuset_common_seq_show,
2563 .private = FILE_EFFECTIVE_MEMLIST,
2564 },
2565
2566 {
2567 .name = "cpu_exclusive",
2568 .read_u64 = cpuset_read_u64,
2569 .write_u64 = cpuset_write_u64,
2570 .private = FILE_CPU_EXCLUSIVE,
2571 },
2572
2573 {
2574 .name = "mem_exclusive",
2575 .read_u64 = cpuset_read_u64,
2576 .write_u64 = cpuset_write_u64,
2577 .private = FILE_MEM_EXCLUSIVE,
2578 },
2579
2580 {
2581 .name = "mem_hardwall",
2582 .read_u64 = cpuset_read_u64,
2583 .write_u64 = cpuset_write_u64,
2584 .private = FILE_MEM_HARDWALL,
2585 },
2586
2587 {
2588 .name = "sched_load_balance",
2589 .read_u64 = cpuset_read_u64,
2590 .write_u64 = cpuset_write_u64,
2591 .private = FILE_SCHED_LOAD_BALANCE,
2592 },
2593
2594 {
2595 .name = "sched_relax_domain_level",
2596 .read_s64 = cpuset_read_s64,
2597 .write_s64 = cpuset_write_s64,
2598 .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
2599 },
2600
2601 {
2602 .name = "memory_migrate",
2603 .read_u64 = cpuset_read_u64,
2604 .write_u64 = cpuset_write_u64,
2605 .private = FILE_MEMORY_MIGRATE,
2606 },
2607
2608 {
2609 .name = "memory_pressure",
2610 .read_u64 = cpuset_read_u64,
2611 .private = FILE_MEMORY_PRESSURE,
2612 },
2613
2614 {
2615 .name = "memory_spread_page",
2616 .read_u64 = cpuset_read_u64,
2617 .write_u64 = cpuset_write_u64,
2618 .private = FILE_SPREAD_PAGE,
2619 },
2620
2621 {
2622 .name = "memory_spread_slab",
2623 .read_u64 = cpuset_read_u64,
2624 .write_u64 = cpuset_write_u64,
2625 .private = FILE_SPREAD_SLAB,
2626 },
2627
2628 {
2629 .name = "memory_pressure_enabled",
2630 .flags = CFTYPE_ONLY_ON_ROOT,
2631 .read_u64 = cpuset_read_u64,
2632 .write_u64 = cpuset_write_u64,
2633 .private = FILE_MEMORY_PRESSURE_ENABLED,
2634 },
2635
2636 { }
2637 };
2638
2639
2640
2641
2642
2643 static struct cftype dfl_files[] = {
2644 {
2645 .name = "cpus",
2646 .seq_show = cpuset_common_seq_show,
2647 .write = cpuset_write_resmask,
2648 .max_write_len = (100U + 6 * NR_CPUS),
2649 .private = FILE_CPULIST,
2650 .flags = CFTYPE_NOT_ON_ROOT,
2651 },
2652
2653 {
2654 .name = "mems",
2655 .seq_show = cpuset_common_seq_show,
2656 .write = cpuset_write_resmask,
2657 .max_write_len = (100U + 6 * MAX_NUMNODES),
2658 .private = FILE_MEMLIST,
2659 .flags = CFTYPE_NOT_ON_ROOT,
2660 },
2661
2662 {
2663 .name = "cpus.effective",
2664 .seq_show = cpuset_common_seq_show,
2665 .private = FILE_EFFECTIVE_CPULIST,
2666 },
2667
2668 {
2669 .name = "mems.effective",
2670 .seq_show = cpuset_common_seq_show,
2671 .private = FILE_EFFECTIVE_MEMLIST,
2672 },
2673
2674 {
2675 .name = "cpus.partition",
2676 .seq_show = sched_partition_show,
2677 .write = sched_partition_write,
2678 .private = FILE_PARTITION_ROOT,
2679 .flags = CFTYPE_NOT_ON_ROOT,
2680 },
2681
2682 {
2683 .name = "cpus.subpartitions",
2684 .seq_show = cpuset_common_seq_show,
2685 .private = FILE_SUBPARTS_CPULIST,
2686 .flags = CFTYPE_DEBUG,
2687 },
2688
2689 { }
2690 };
2691
2692
2693
2694
2695
2696
2697
2698 static struct cgroup_subsys_state *
2699 cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
2700 {
2701 struct cpuset *cs;
2702
2703 if (!parent_css)
2704 return &top_cpuset.css;
2705
2706 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
2707 if (!cs)
2708 return ERR_PTR(-ENOMEM);
2709
2710 if (alloc_cpumasks(cs, NULL)) {
2711 kfree(cs);
2712 return ERR_PTR(-ENOMEM);
2713 }
2714
2715 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
2716 nodes_clear(cs->mems_allowed);
2717 nodes_clear(cs->effective_mems);
2718 fmeter_init(&cs->fmeter);
2719 cs->relax_domain_level = -1;
2720
2721 return &cs->css;
2722 }
2723
2724 static int cpuset_css_online(struct cgroup_subsys_state *css)
2725 {
2726 struct cpuset *cs = css_cs(css);
2727 struct cpuset *parent = parent_cs(cs);
2728 struct cpuset *tmp_cs;
2729 struct cgroup_subsys_state *pos_css;
2730
2731 if (!parent)
2732 return 0;
2733
2734 get_online_cpus();
2735 percpu_down_write(&cpuset_rwsem);
2736
2737 set_bit(CS_ONLINE, &cs->flags);
2738 if (is_spread_page(parent))
2739 set_bit(CS_SPREAD_PAGE, &cs->flags);
2740 if (is_spread_slab(parent))
2741 set_bit(CS_SPREAD_SLAB, &cs->flags);
2742
2743 cpuset_inc();
2744
2745 spin_lock_irq(&callback_lock);
2746 if (is_in_v2_mode()) {
2747 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
2748 cs->effective_mems = parent->effective_mems;
2749 cs->use_parent_ecpus = true;
2750 parent->child_ecpus_count++;
2751 }
2752 spin_unlock_irq(&callback_lock);
2753
2754 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
2755 goto out_unlock;
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770 rcu_read_lock();
2771 cpuset_for_each_child(tmp_cs, pos_css, parent) {
2772 if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
2773 rcu_read_unlock();
2774 goto out_unlock;
2775 }
2776 }
2777 rcu_read_unlock();
2778
2779 spin_lock_irq(&callback_lock);
2780 cs->mems_allowed = parent->mems_allowed;
2781 cs->effective_mems = parent->mems_allowed;
2782 cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
2783 cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
2784 spin_unlock_irq(&callback_lock);
2785 out_unlock:
2786 percpu_up_write(&cpuset_rwsem);
2787 put_online_cpus();
2788 return 0;
2789 }
2790
/*
 * If the cpuset being removed has its flag 'sched_load_balance' enabled,
 * then simulate turning sched_load_balance off, which will call
 * rebuild_sched_domains_locked().  That is not needed on the default
 * hierarchy, where only a change in partition state causes
 * repartitioning.
 *
 * If the cpuset being removed is a partition root, simulate turning the
 * partition root off first.
 */
2802 static void cpuset_css_offline(struct cgroup_subsys_state *css)
2803 {
2804 struct cpuset *cs = css_cs(css);
2805
2806 get_online_cpus();
2807 percpu_down_write(&cpuset_rwsem);
2808
2809 if (is_partition_root(cs))
2810 update_prstate(cs, 0);
2811
2812 if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
2813 is_sched_load_balance(cs))
2814 update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
2815
2816 if (cs->use_parent_ecpus) {
2817 struct cpuset *parent = parent_cs(cs);
2818
2819 cs->use_parent_ecpus = false;
2820 parent->child_ecpus_count--;
2821 }
2822
2823 cpuset_dec();
2824 clear_bit(CS_ONLINE, &cs->flags);
2825
2826 percpu_up_write(&cpuset_rwsem);
2827 put_online_cpus();
2828 }
2829
2830 static void cpuset_css_free(struct cgroup_subsys_state *css)
2831 {
2832 struct cpuset *cs = css_cs(css);
2833
2834 free_cpuset(cs);
2835 }
2836
2837 static void cpuset_bind(struct cgroup_subsys_state *root_css)
2838 {
2839 percpu_down_write(&cpuset_rwsem);
2840 spin_lock_irq(&callback_lock);
2841
2842 if (is_in_v2_mode()) {
2843 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
2844 top_cpuset.mems_allowed = node_possible_map;
2845 } else {
2846 cpumask_copy(top_cpuset.cpus_allowed,
2847 top_cpuset.effective_cpus);
2848 top_cpuset.mems_allowed = top_cpuset.effective_mems;
2849 }
2850
2851 spin_unlock_irq(&callback_lock);
2852 percpu_up_write(&cpuset_rwsem);
2853 }
2854
2855 /*
2856  * Make sure the new task conforms to the current state of its
2857  * parent's cpuset, which may have changed after the child copied
2858  * the parent's state but before it joined the cgroup's task list.
2859  */
2860 static void cpuset_fork(struct task_struct *task)
2861 {
2862 if (task_css_is_root(task, cpuset_cgrp_id))
2863 return;
2864
2865 set_cpus_allowed_ptr(task, current->cpus_ptr);
2866 task->mems_allowed = current->mems_allowed;
2867 }
2868
2869 struct cgroup_subsys cpuset_cgrp_subsys = {
2870 .css_alloc = cpuset_css_alloc,
2871 .css_online = cpuset_css_online,
2872 .css_offline = cpuset_css_offline,
2873 .css_free = cpuset_css_free,
2874 .can_attach = cpuset_can_attach,
2875 .cancel_attach = cpuset_cancel_attach,
2876 .attach = cpuset_attach,
2877 .post_attach = cpuset_post_attach,
2878 .bind = cpuset_bind,
2879 .fork = cpuset_fork,
2880 .legacy_cftypes = legacy_files,
2881 .dfl_cftypes = dfl_files,
2882 .early_init = true,
2883 .threaded = true,
2884 };
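/*
 * Annotation: the structure above registers cpuset as a cgroup controller.
 * legacy_cftypes/dfl_cftypes select the control files exposed on the v1 and
 * v2 (default) hierarchies respectively, .early_init = true makes the root
 * cpuset available early during boot, and .threaded = true allows the
 * controller to be enabled in threaded cgroup subtrees.  The callbacks map
 * onto cgroup core events, roughly (illustrative pairing, not code from
 * this file):
 *
 *	mkdir <cgroupfs>/A        ->  .css_alloc, then .css_online
 *	writing to cgroup.procs   ->  .can_attach, .attach, .post_attach
 *	rmdir <cgroupfs>/A        ->  .css_offline, later .css_free
 */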
2885
2886 /**
2887  * cpuset_init - initialize cpusets at system boot
2888  *
2889  * Description: Initialize top_cpuset and the cpuset internal state.
2890  **/
2891
2892 int __init cpuset_init(void)
2893 {
2894 BUG_ON(percpu_init_rwsem(&cpuset_rwsem));
2895
2896 BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
2897 BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
2898 BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
2899
2900 cpumask_setall(top_cpuset.cpus_allowed);
2901 nodes_setall(top_cpuset.mems_allowed);
2902 cpumask_setall(top_cpuset.effective_cpus);
2903 nodes_setall(top_cpuset.effective_mems);
2904
2905 fmeter_init(&top_cpuset.fmeter);
2906 set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
2907 top_cpuset.relax_domain_level = -1;
2908
2909 BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
2910
2911 return 0;
2912 }
2913
2914
2915
2916
2917
2918
2919
2920
2921 static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
2922 {
2923 struct cpuset *parent;
2924
2925 /*
2926  * Find its next-highest non-empty parent; the top cpuset always
2927  * has online cpus and memory nodes, so the walk terminates.
2928  */
2929 parent = parent_cs(cs);
2930 while (cpumask_empty(parent->cpus_allowed) ||
2931 nodes_empty(parent->mems_allowed))
2932 parent = parent_cs(parent);
2933
2934 if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
2935 pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
2936 pr_cont_cgroup_name(cs->css.cgroup);
2937 pr_cont("\n");
2938 }
2939 }
2940
2941 static void
2942 hotplug_update_tasks_legacy(struct cpuset *cs,
2943 struct cpumask *new_cpus, nodemask_t *new_mems,
2944 bool cpus_updated, bool mems_updated)
2945 {
2946 bool is_empty;
2947
2948 spin_lock_irq(&callback_lock);
2949 cpumask_copy(cs->cpus_allowed, new_cpus);
2950 cpumask_copy(cs->effective_cpus, new_cpus);
2951 cs->mems_allowed = *new_mems;
2952 cs->effective_mems = *new_mems;
2953 spin_unlock_irq(&callback_lock);
2954
2955 /*
2956  * Don't call update_tasks_cpumask() or update_tasks_nodemask() if
2957  * the cpuset has become empty; its tasks are about to be moved away.
2958  */
2959 if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
2960 update_tasks_cpumask(cs);
2961 if (mems_updated && !nodes_empty(cs->mems_allowed))
2962 update_tasks_nodemask(cs);
2963
2964 is_empty = cpumask_empty(cs->cpus_allowed) ||
2965 nodes_empty(cs->mems_allowed);
2966
2967 percpu_up_write(&cpuset_rwsem);
2968
2969 /*
2970  * Move tasks to the nearest ancestor with execution resources.
2971  * This is a full cgroup operation which may call back into the
2972  * cpuset code, so it has to run with cpuset_rwsem dropped.
2973  */
2974 if (is_empty)
2975 remove_tasks_in_empty_cpuset(cs);
2976
2977 percpu_down_write(&cpuset_rwsem);
2978 }
2979
2980 static void
2981 hotplug_update_tasks(struct cpuset *cs,
2982 struct cpumask *new_cpus, nodemask_t *new_mems,
2983 bool cpus_updated, bool mems_updated)
2984 {
2985 if (cpumask_empty(new_cpus))
2986 cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
2987 if (nodes_empty(*new_mems))
2988 *new_mems = parent_cs(cs)->effective_mems;
2989
2990 spin_lock_irq(&callback_lock);
2991 cpumask_copy(cs->effective_cpus, new_cpus);
2992 cs->effective_mems = *new_mems;
2993 spin_unlock_irq(&callback_lock);
2994
2995 if (cpus_updated)
2996 update_tasks_cpumask(cs);
2997 if (mems_updated)
2998 update_tasks_nodemask(cs);
2999 }
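/*
 * Annotation: the two hotplug helpers above diverge by hierarchy mode.
 * hotplug_update_tasks_legacy() (cgroup v1) rewrites cpus_allowed and
 * mems_allowed as well as the effective masks and, should the cpuset end
 * up empty, migrates its tasks to the nearest non-empty ancestor.
 * hotplug_update_tasks() (cpuset v2 mode) only touches the effective masks
 * and falls back to the parent's effective masks instead of going empty,
 * so the user-visible configuration is preserved.  The dispatch in
 * cpuset_hotplug_update_tasks() below is simply:
 *
 *	if (is_in_v2_mode())
 *		hotplug_update_tasks(cs, &new_cpus, &new_mems,
 *				     cpus_updated, mems_updated);
 *	else
 *		hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
 *					    cpus_updated, mems_updated);
 */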
3000
3001 static bool force_rebuild;
3002
3003 void cpuset_force_rebuild(void)
3004 {
3005 force_rebuild = true;
3006 }
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017 static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
3018 {
3019 static cpumask_t new_cpus;
3020 static nodemask_t new_mems;
3021 bool cpus_updated;
3022 bool mems_updated;
3023 struct cpuset *parent;
3024 retry:
3025 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
3026
3027 percpu_down_write(&cpuset_rwsem);
3028
3029 /*
3030  * We have raced with task attaching; wait until attaching is
3031  * finished so we don't attach a task to an empty cpuset.
3032  */
3033 if (cs->attach_in_progress) {
3034 percpu_up_write(&cpuset_rwsem);
3035 goto retry;
3036 }
3037
3038 parent = parent_cs(cs);
3039 compute_effective_cpumask(&new_cpus, cs, parent);
3040 nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
3041
3042 if (cs->nr_subparts_cpus)
3043 /*
3044  * Make sure that CPUs allocated to child partitions
3045  * do not show up in effective_cpus.
3046  */
3047 cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);
3048
3049 if (!tmp || !cs->partition_root_state)
3050 goto update_tasks;
3051
3052
3053
3054
3055
3056
3057 if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
3058 (parent->partition_root_state == PRS_ERROR))) {
3059 if (cs->nr_subparts_cpus) {
3060 cs->nr_subparts_cpus = 0;
3061 cpumask_clear(cs->subparts_cpus);
3062 compute_effective_cpumask(&new_cpus, cs, parent);
3063 }
3064
3065
3066
3067
3068
3069
3070
3071 if ((parent->partition_root_state == PRS_ERROR) ||
3072 cpumask_empty(&new_cpus)) {
3073 update_parent_subparts_cpumask(cs, partcmd_disable,
3074 NULL, tmp);
3075 cs->partition_root_state = PRS_ERROR;
3076 }
3077 cpuset_force_rebuild();
3078 }
3079
3080
3081
3082
3083
3084
3085 if (is_partition_root(parent) &&
3086 ((cs->partition_root_state == PRS_ERROR) ||
3087 !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
3088 update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
3089 cpuset_force_rebuild();
3090
3091 update_tasks:
3092 cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
3093 mems_updated = !nodes_equal(new_mems, cs->effective_mems);
3094
3095 if (is_in_v2_mode())
3096 hotplug_update_tasks(cs, &new_cpus, &new_mems,
3097 cpus_updated, mems_updated);
3098 else
3099 hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
3100 cpus_updated, mems_updated);
3101
3102 percpu_up_write(&cpuset_rwsem);
3103 }
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121 static void cpuset_hotplug_workfn(struct work_struct *work)
3122 {
3123 static cpumask_t new_cpus;
3124 static nodemask_t new_mems;
3125 bool cpus_updated, mems_updated;
3126 bool on_dfl = is_in_v2_mode();
3127 struct tmpmasks tmp, *ptmp = NULL;
3128
3129 if (on_dfl && !alloc_cpumasks(NULL, &tmp))
3130 ptmp = &tmp;
3131
3132 percpu_down_write(&cpuset_rwsem);
3133
3134
3135 cpumask_copy(&new_cpus, cpu_active_mask);
3136 new_mems = node_states[N_MEMORY];
3137
3138
3139
3140
3141
3142
3143 cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
3144 mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
3145
3146
3147 if (cpus_updated) {
3148 spin_lock_irq(&callback_lock);
3149 if (!on_dfl)
3150 cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
3151
3152
3153
3154
3155
3156
3157 if (top_cpuset.nr_subparts_cpus) {
3158 if (cpumask_subset(&new_cpus,
3159 top_cpuset.subparts_cpus)) {
3160 top_cpuset.nr_subparts_cpus = 0;
3161 cpumask_clear(top_cpuset.subparts_cpus);
3162 } else {
3163 cpumask_andnot(&new_cpus, &new_cpus,
3164 top_cpuset.subparts_cpus);
3165 }
3166 }
3167 cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
3168 spin_unlock_irq(&callback_lock);
3169
3170 }
3171
3172
3173 if (mems_updated) {
3174 spin_lock_irq(&callback_lock);
3175 if (!on_dfl)
3176 top_cpuset.mems_allowed = new_mems;
3177 top_cpuset.effective_mems = new_mems;
3178 spin_unlock_irq(&callback_lock);
3179 update_tasks_nodemask(&top_cpuset);
3180 }
3181
3182 percpu_up_write(&cpuset_rwsem);
3183
3184
3185 if (cpus_updated || mems_updated) {
3186 struct cpuset *cs;
3187 struct cgroup_subsys_state *pos_css;
3188
3189 rcu_read_lock();
3190 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
3191 if (cs == &top_cpuset || !css_tryget_online(&cs->css))
3192 continue;
3193 rcu_read_unlock();
3194
3195 cpuset_hotplug_update_tasks(cs, ptmp);
3196
3197 rcu_read_lock();
3198 css_put(&cs->css);
3199 }
3200 rcu_read_unlock();
3201 }
3202
3203
3204 if (cpus_updated || force_rebuild) {
3205 force_rebuild = false;
3206 rebuild_sched_domains();
3207 }
3208
3209 free_cpumasks(NULL, ptmp);
3210 }
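/*
 * Flow summary (annotation added for clarity): cpuset_hotplug_workfn() runs
 * from the cpuset_hotplug_work item.  It (1) resynchronizes top_cpuset with
 * cpu_active_mask and node_states[N_MEMORY], (2) walks every descendant
 * cpuset and lets cpuset_hotplug_update_tasks() fix up its effective masks
 * and task placement, and (3) rebuilds the scheduler domains when the
 * active CPU set changed or a rebuild was forced.  The work item is driven
 * by the two helpers that follow, roughly:
 *
 *	schedule_work(&cpuset_hotplug_work);	// cpuset_update_active_cpus()
 *	flush_work(&cpuset_hotplug_work);	// cpuset_wait_for_hotplug()
 */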
3211
3212 void cpuset_update_active_cpus(void)
3213 {
3214 /*
3215  * We're inside the cpu hotplug critical region, which usually nests
3216  * inside cgroup synchronization.  Bounce the actual hotplug processing
3217  * to a work item to avoid a reverse locking order.
3218  */
3219 schedule_work(&cpuset_hotplug_work);
3220 }
3221
3222 void cpuset_wait_for_hotplug(void)
3223 {
3224 flush_work(&cpuset_hotplug_work);
3225 }
3226
3227
3228
3229
3230
3231
3232 static int cpuset_track_online_nodes(struct notifier_block *self,
3233 unsigned long action, void *arg)
3234 {
3235 schedule_work(&cpuset_hotplug_work);
3236 return NOTIFY_OK;
3237 }
3238
3239 static struct notifier_block cpuset_track_online_nodes_nb = {
3240 .notifier_call = cpuset_track_online_nodes,
3241 .priority = 10,
3242 };
3243
3244
3245
3246
3247
3248
3249 void __init cpuset_init_smp(void)
3250 {
3251 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
3252 top_cpuset.mems_allowed = node_states[N_MEMORY];
3253 top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
3254
3255 cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
3256 top_cpuset.effective_mems = node_states[N_MEMORY];
3257
3258 register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
3259
3260 cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
3261 BUG_ON(!cpuset_migrate_mm_wq);
3262 }
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
3276 {
3277 unsigned long flags;
3278
3279 spin_lock_irqsave(&callback_lock, flags);
3280 rcu_read_lock();
3281 guarantee_online_cpus(task_cs(tsk), pmask);
3282 rcu_read_unlock();
3283 spin_unlock_irqrestore(&callback_lock, flags);
3284 }
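/*
 * Usage sketch (hypothetical caller, added for illustration): anyone needing
 * the set of CPUs a task may run on according to its cpuset can do:
 *
 *	cpumask_var_t mask;
 *
 *	if (alloc_cpumask_var(&mask, GFP_KERNEL)) {
 *		cpuset_cpus_allowed(tsk, mask);
 *		// mask now holds tsk's cpuset-permitted online CPUs
 *		// (an ancestor's, if tsk's own cpuset has none online)
 *		free_cpumask_var(mask);
 *	}
 *
 * sched_setaffinity(), for example, uses this to bound the mask a user may
 * request for a task.
 */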
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
3299 {
3300 rcu_read_lock();
3301 do_set_cpus_allowed(tsk, is_in_v2_mode() ?
3302 task_cs(tsk)->cpus_allowed : cpu_possible_mask);
3303 rcu_read_unlock();
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322 }
3323
3324 void __init cpuset_init_current_mems_allowed(void)
3325 {
3326 nodes_setall(current->mems_allowed);
3327 }
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339 nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
3340 {
3341 nodemask_t mask;
3342 unsigned long flags;
3343
3344 spin_lock_irqsave(&callback_lock, flags);
3345 rcu_read_lock();
3346 guarantee_online_mems(task_cs(tsk), &mask);
3347 rcu_read_unlock();
3348 spin_unlock_irqrestore(&callback_lock, flags);
3349
3350 return mask;
3351 }
3352
3353
3354
3355
3356
3357
3358
3359 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
3360 {
3361 return nodes_intersects(*nodemask, current->mems_allowed);
3362 }
3363
3364
3365
3366
3367
3368
3369
3370 static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
3371 {
3372 while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
3373 cs = parent_cs(cs);
3374 return cs;
3375 }
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417 bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
3418 {
3419 struct cpuset *cs;
3420 int allowed;
3421 unsigned long flags;
3422
3423 if (in_interrupt())
3424 return true;
3425 if (node_isset(node, current->mems_allowed))
3426 return true;
3427
3428 /*
3429  * Allow tasks that have been OOM killed to get memory anywhere.
3430  */
3431 if (unlikely(tsk_is_oom_victim(current)))
3432 return true;
3433 if (gfp_mask & __GFP_HARDWALL)
3434 return false;
3435
3436 if (current->flags & PF_EXITING)
3437 return true;
3438
3439
3440 spin_lock_irqsave(&callback_lock, flags);
3441
3442 rcu_read_lock();
3443 cs = nearest_hardwall_ancestor(task_cs(current));
3444 allowed = node_isset(node, cs->mems_allowed);
3445 rcu_read_unlock();
3446
3447 spin_unlock_irqrestore(&callback_lock, flags);
3448 return allowed;
3449 }
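/*
 * Annotation with a hedged example: the function above implements the
 * "hardwall" allocation policy.  In interrupt context or for OOM victims
 * everything is allowed; with __GFP_HARDWALL (e.g. GFP_USER) the allocation
 * is confined to the task's own mems_allowed; without it (e.g. GFP_KERNEL)
 * the nearest mem_exclusive/mem_hardwall ancestor's nodes may also be used.
 * Callers normally go through the cpuset_node_allowed() wrapper in
 * <linux/cpuset.h>, along the lines of:
 *
 *	// hypothetical check, mirroring what an allocator path does
 *	if (cpuset_node_allowed(nid, GFP_USER))
 *		;	// nid is in current's own cpuset
 *	if (cpuset_node_allowed(nid, GFP_KERNEL))
 *		;	// nid may also come from a hardwall ancestor
 */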
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478 static int cpuset_spread_node(int *rotor)
3479 {
3480 return *rotor = next_node_in(*rotor, current->mems_allowed);
3481 }
3482
3483 int cpuset_mem_spread_node(void)
3484 {
3485 if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
3486 current->cpuset_mem_spread_rotor =
3487 node_random(&current->mems_allowed);
3488
3489 return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
3490 }
3491
3492 int cpuset_slab_spread_node(void)
3493 {
3494 if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
3495 current->cpuset_slab_spread_rotor =
3496 node_random(&current->mems_allowed);
3497
3498 return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
3499 }
3500
3501 EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
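/*
 * Usage sketch (hypothetical caller): a filesystem or allocator that honours
 * the cpuset "memory_spread_page" policy can pick the node for a new page
 * like this (cpuset_do_page_mem_spread() is the predicate in
 * <linux/cpuset.h>; the helper name below is made up for illustration):
 *
 *	static struct page *spread_alloc_page(gfp_t gfp)
 *	{
 *		if (cpuset_do_page_mem_spread())
 *			return __alloc_pages_node(cpuset_mem_spread_node(),
 *						  gfp, 0);
 *		return alloc_page(gfp);
 *	}
 *
 * cpuset_slab_spread_node() plays the same role for slab caches via the
 * per-task cpuset_slab_spread_rotor.
 */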
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514 int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
3515 const struct task_struct *tsk2)
3516 {
3517 return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
3518 }
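/*
 * Annotation: the OOM killer is the typical user of the helper above -- when
 * deciding whether killing a candidate could free memory useful to the
 * allocating task, there is little point in selecting a victim whose
 * mems_allowed does not overlap ours.  A hedged sketch of such a check:
 *
 *	if (!cpuset_mems_allowed_intersects(current, victim))
 *		continue;	// victim's memory can't help us, skip it
 */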
3519
3520
3521
3522
3523
3524
3525
3526 void cpuset_print_current_mems_allowed(void)
3527 {
3528 struct cgroup *cgrp;
3529
3530 rcu_read_lock();
3531
3532 cgrp = task_cs(current)->css.cgroup;
3533 pr_cont(",cpuset=");
3534 pr_cont_cgroup_name(cgrp);
3535 pr_cont(",mems_allowed=%*pbl",
3536 nodemask_pr_args(&current->mems_allowed));
3537
3538 rcu_read_unlock();
3539 }
3540
3541
3542
3543
3544
3545
3546
3547 int cpuset_memory_pressure_enabled __read_mostly;
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567 void __cpuset_memory_pressure_bump(void)
3568 {
3569 rcu_read_lock();
3570 fmeter_markevent(&task_cs(current)->fmeter);
3571 rcu_read_unlock();
3572 }
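/*
 * Annotation: this is the slow path behind the cpuset_memory_pressure_bump()
 * wrapper in <linux/cpuset.h>, which only calls here when the root cpuset's
 * "memory_pressure_enabled" file has been set (see
 * cpuset_memory_pressure_enabled above).  Approximately (reproduced from
 * memory, check the header for the authoritative definition):
 *
 *	#define cpuset_memory_pressure_bump()				\
 *		do {							\
 *			if (cpuset_memory_pressure_enabled)		\
 *				__cpuset_memory_pressure_bump();	\
 *		} while (0)
 */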
3573
3574 #ifdef CONFIG_PROC_PID_CPUSET
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584 int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
3585 struct pid *pid, struct task_struct *tsk)
3586 {
3587 char *buf;
3588 struct cgroup_subsys_state *css;
3589 int retval;
3590
3591 retval = -ENOMEM;
3592 buf = kmalloc(PATH_MAX, GFP_KERNEL);
3593 if (!buf)
3594 goto out;
3595
3596 css = task_get_css(tsk, cpuset_cgrp_id);
3597 retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
3598 current->nsproxy->cgroup_ns);
3599 css_put(css);
3600 if (retval >= PATH_MAX)
3601 retval = -ENAMETOOLONG;
3602 if (retval < 0)
3603 goto out_free;
3604 seq_puts(m, buf);
3605 seq_putc(m, '\n');
3606 retval = 0;
3607 out_free:
3608 kfree(buf);
3609 out:
3610 return retval;
3611 }
3612 #endif
3613
3614
3615 void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
3616 {
3617 seq_printf(m, "Mems_allowed:\t%*pb\n",
3618 nodemask_pr_args(&task->mems_allowed));
3619 seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
3620 nodemask_pr_args(&task->mems_allowed));
3621 }
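/*
 * Annotation: the function above backs the Mems_allowed lines in
 * /proc/<pid>/status.  For a task allowed on nodes 0 and 1 the output looks
 * roughly like this (illustrative; the bitmap width depends on the
 * configured MAX_NUMNODES):
 *
 *	Mems_allowed:		00000003
 *	Mems_allowed_list:	0-1
 */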