1/* 2 * kernel/sched/core.c 3 * 4 * Kernel scheduler and related syscalls 5 * 6 * Copyright (C) 1991-2002 Linus Torvalds 7 * 8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and 9 * make semaphores SMP safe 10 * 1998-11-19 Implemented schedule_timeout() and related stuff 11 * by Andrea Arcangeli 12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: 13 * hybrid priority-list and round-robin design with 14 * an array-switch method of distributing timeslices 15 * and per-CPU runqueues. Cleanups and useful suggestions 16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 17 * 2003-09-03 Interactivity tuning by Con Kolivas. 18 * 2004-04-02 Scheduler domains code by Nick Piggin 19 * 2007-04-15 Work begun on replacing all interactivity tuning with a 20 * fair scheduling design by Con Kolivas. 21 * 2007-05-05 Load balancing (smp-nice) and other improvements 22 * by Peter Williams 23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith 24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 26 * Thomas Gleixner, Mike Kravetz 27 */ 28 29#include <linux/mm.h> 30#include <linux/module.h> 31#include <linux/nmi.h> 32#include <linux/init.h> 33#include <linux/uaccess.h> 34#include <linux/highmem.h> 35#include <asm/mmu_context.h> 36#include <linux/interrupt.h> 37#include <linux/capability.h> 38#include <linux/completion.h> 39#include <linux/kernel_stat.h> 40#include <linux/debug_locks.h> 41#include <linux/perf_event.h> 42#include <linux/security.h> 43#include <linux/notifier.h> 44#include <linux/profile.h> 45#include <linux/freezer.h> 46#include <linux/vmalloc.h> 47#include <linux/blkdev.h> 48#include <linux/delay.h> 49#include <linux/pid_namespace.h> 50#include <linux/smp.h> 51#include <linux/threads.h> 52#include <linux/timer.h> 53#include <linux/rcupdate.h> 54#include <linux/cpu.h> 55#include <linux/cpuset.h> 56#include <linux/percpu.h> 57#include <linux/proc_fs.h> 58#include <linux/seq_file.h> 59#include <linux/sysctl.h> 60#include <linux/syscalls.h> 61#include <linux/times.h> 62#include <linux/tsacct_kern.h> 63#include <linux/kprobes.h> 64#include <linux/delayacct.h> 65#include <linux/unistd.h> 66#include <linux/pagemap.h> 67#include <linux/hrtimer.h> 68#include <linux/tick.h> 69#include <linux/debugfs.h> 70#include <linux/ctype.h> 71#include <linux/ftrace.h> 72#include <linux/slab.h> 73#include <linux/init_task.h> 74#include <linux/binfmts.h> 75#include <linux/context_tracking.h> 76#include <linux/compiler.h> 77 78#include <asm/switch_to.h> 79#include <asm/tlb.h> 80#include <asm/irq_regs.h> 81#include <asm/mutex.h> 82#ifdef CONFIG_PARAVIRT 83#include <asm/paravirt.h> 84#endif 85 86#include "sched.h" 87#include "../workqueue_internal.h" 88#include "../smpboot.h" 89 90#define CREATE_TRACE_POINTS 91#include <trace/events/sched.h> 92 93void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) 94{ 95 unsigned long delta; 96 ktime_t soft, hard, now; 97 98 for (;;) { 99 if (hrtimer_active(period_timer)) 100 break; 101 102 now = hrtimer_cb_get_time(period_timer); 103 hrtimer_forward(period_timer, now, period); 104 105 soft = hrtimer_get_softexpires(period_timer); 106 hard = hrtimer_get_expires(period_timer); 107 delta = ktime_to_ns(ktime_sub(hard, soft)); 108 __hrtimer_start_range_ns(period_timer, soft, delta, 109 HRTIMER_MODE_ABS_PINNED, 0); 110 } 111} 112 113DEFINE_MUTEX(sched_domains_mutex); 114DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 115 
116static void update_rq_clock_task(struct rq *rq, s64 delta); 117 118void update_rq_clock(struct rq *rq) 119{ 120 s64 delta; 121 122 lockdep_assert_held(&rq->lock); 123 124 if (rq->clock_skip_update & RQCF_ACT_SKIP) 125 return; 126 127 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 128 if (delta < 0) 129 return; 130 rq->clock += delta; 131 update_rq_clock_task(rq, delta); 132} 133 134/* 135 * Debugging: various feature bits 136 */ 137 138#define SCHED_FEAT(name, enabled) \ 139 (1UL << __SCHED_FEAT_##name) * enabled | 140 141const_debug unsigned int sysctl_sched_features = 142#include "features.h" 143 0; 144 145#undef SCHED_FEAT 146 147#ifdef CONFIG_SCHED_DEBUG 148#define SCHED_FEAT(name, enabled) \ 149 #name , 150 151static const char * const sched_feat_names[] = { 152#include "features.h" 153}; 154 155#undef SCHED_FEAT 156 157static int sched_feat_show(struct seq_file *m, void *v) 158{ 159 int i; 160 161 for (i = 0; i < __SCHED_FEAT_NR; i++) { 162 if (!(sysctl_sched_features & (1UL << i))) 163 seq_puts(m, "NO_"); 164 seq_printf(m, "%s ", sched_feat_names[i]); 165 } 166 seq_puts(m, "\n"); 167 168 return 0; 169} 170 171#ifdef HAVE_JUMP_LABEL 172 173#define jump_label_key__true STATIC_KEY_INIT_TRUE 174#define jump_label_key__false STATIC_KEY_INIT_FALSE 175 176#define SCHED_FEAT(name, enabled) \ 177 jump_label_key__##enabled , 178 179struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { 180#include "features.h" 181}; 182 183#undef SCHED_FEAT 184 185static void sched_feat_disable(int i) 186{ 187 if (static_key_enabled(&sched_feat_keys[i])) 188 static_key_slow_dec(&sched_feat_keys[i]); 189} 190 191static void sched_feat_enable(int i) 192{ 193 if (!static_key_enabled(&sched_feat_keys[i])) 194 static_key_slow_inc(&sched_feat_keys[i]); 195} 196#else 197static void sched_feat_disable(int i) { }; 198static void sched_feat_enable(int i) { }; 199#endif /* HAVE_JUMP_LABEL */ 200 201static int sched_feat_set(char *cmp) 202{ 203 int i; 204 int neg = 0; 205 206 if (strncmp(cmp, "NO_", 3) == 0) { 207 neg = 1; 208 cmp += 3; 209 } 210 211 for (i = 0; i < __SCHED_FEAT_NR; i++) { 212 if (strcmp(cmp, sched_feat_names[i]) == 0) { 213 if (neg) { 214 sysctl_sched_features &= ~(1UL << i); 215 sched_feat_disable(i); 216 } else { 217 sysctl_sched_features |= (1UL << i); 218 sched_feat_enable(i); 219 } 220 break; 221 } 222 } 223 224 return i; 225} 226 227static ssize_t 228sched_feat_write(struct file *filp, const char __user *ubuf, 229 size_t cnt, loff_t *ppos) 230{ 231 char buf[64]; 232 char *cmp; 233 int i; 234 struct inode *inode; 235 236 if (cnt > 63) 237 cnt = 63; 238 239 if (copy_from_user(&buf, ubuf, cnt)) 240 return -EFAULT; 241 242 buf[cnt] = 0; 243 cmp = strstrip(buf); 244 245 /* Ensure the static_key remains in a consistent state */ 246 inode = file_inode(filp); 247 mutex_lock(&inode->i_mutex); 248 i = sched_feat_set(cmp); 249 mutex_unlock(&inode->i_mutex); 250 if (i == __SCHED_FEAT_NR) 251 return -EINVAL; 252 253 *ppos += cnt; 254 255 return cnt; 256} 257 258static int sched_feat_open(struct inode *inode, struct file *filp) 259{ 260 return single_open(filp, sched_feat_show, NULL); 261} 262 263static const struct file_operations sched_feat_fops = { 264 .open = sched_feat_open, 265 .write = sched_feat_write, 266 .read = seq_read, 267 .llseek = seq_lseek, 268 .release = single_release, 269}; 270 271static __init int sched_init_debug(void) 272{ 273 debugfs_create_file("sched_features", 0644, NULL, NULL, 274 &sched_feat_fops); 275 276 return 0; 277} 278late_initcall(sched_init_debug); 279#endif /* 
CONFIG_SCHED_DEBUG */ 280 281/* 282 * Number of tasks to iterate in a single balance run. 283 * Limited because this is done with IRQs disabled. 284 */ 285const_debug unsigned int sysctl_sched_nr_migrate = 32; 286 287/* 288 * period over which we average the RT time consumption, measured 289 * in ms. 290 * 291 * default: 1s 292 */ 293const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; 294 295/* 296 * period over which we measure -rt task cpu usage in us. 297 * default: 1s 298 */ 299unsigned int sysctl_sched_rt_period = 1000000; 300 301__read_mostly int scheduler_running; 302 303/* 304 * part of the period that we allow rt tasks to run in us. 305 * default: 0.95s 306 */ 307int sysctl_sched_rt_runtime = 950000; 308 309/* cpus with isolated domains */ 310cpumask_var_t cpu_isolated_map; 311 312/* 313 * this_rq_lock - lock this runqueue and disable interrupts. 314 */ 315static struct rq *this_rq_lock(void) 316 __acquires(rq->lock) 317{ 318 struct rq *rq; 319 320 local_irq_disable(); 321 rq = this_rq(); 322 raw_spin_lock(&rq->lock); 323 324 return rq; 325} 326 327#ifdef CONFIG_SCHED_HRTICK 328/* 329 * Use HR-timers to deliver accurate preemption points. 330 */ 331 332static void hrtick_clear(struct rq *rq) 333{ 334 if (hrtimer_active(&rq->hrtick_timer)) 335 hrtimer_cancel(&rq->hrtick_timer); 336} 337 338/* 339 * High-resolution timer tick. 340 * Runs from hardirq context with interrupts disabled. 341 */ 342static enum hrtimer_restart hrtick(struct hrtimer *timer) 343{ 344 struct rq *rq = container_of(timer, struct rq, hrtick_timer); 345 346 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 347 348 raw_spin_lock(&rq->lock); 349 update_rq_clock(rq); 350 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 351 raw_spin_unlock(&rq->lock); 352 353 return HRTIMER_NORESTART; 354} 355 356#ifdef CONFIG_SMP 357 358static int __hrtick_restart(struct rq *rq) 359{ 360 struct hrtimer *timer = &rq->hrtick_timer; 361 ktime_t time = hrtimer_get_softexpires(timer); 362 363 return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0); 364} 365 366/* 367 * called from hardirq (IPI) context 368 */ 369static void __hrtick_start(void *arg) 370{ 371 struct rq *rq = arg; 372 373 raw_spin_lock(&rq->lock); 374 __hrtick_restart(rq); 375 rq->hrtick_csd_pending = 0; 376 raw_spin_unlock(&rq->lock); 377} 378 379/* 380 * Called to set the hrtick timer state. 381 * 382 * called with rq->lock held and irqs disabled 383 */ 384void hrtick_start(struct rq *rq, u64 delay) 385{ 386 struct hrtimer *timer = &rq->hrtick_timer; 387 ktime_t time; 388 s64 delta; 389 390 /* 391 * Don't schedule slices shorter than 10000ns, that just 392 * doesn't make sense and can cause timer DoS. 
393 */ 394 delta = max_t(s64, delay, 10000LL); 395 time = ktime_add_ns(timer->base->get_time(), delta); 396 397 hrtimer_set_expires(timer, time); 398 399 if (rq == this_rq()) { 400 __hrtick_restart(rq); 401 } else if (!rq->hrtick_csd_pending) { 402 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); 403 rq->hrtick_csd_pending = 1; 404 } 405} 406 407static int 408hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) 409{ 410 int cpu = (int)(long)hcpu; 411 412 switch (action) { 413 case CPU_UP_CANCELED: 414 case CPU_UP_CANCELED_FROZEN: 415 case CPU_DOWN_PREPARE: 416 case CPU_DOWN_PREPARE_FROZEN: 417 case CPU_DEAD: 418 case CPU_DEAD_FROZEN: 419 hrtick_clear(cpu_rq(cpu)); 420 return NOTIFY_OK; 421 } 422 423 return NOTIFY_DONE; 424} 425 426static __init void init_hrtick(void) 427{ 428 hotcpu_notifier(hotplug_hrtick, 0); 429} 430#else 431/* 432 * Called to set the hrtick timer state. 433 * 434 * called with rq->lock held and irqs disabled 435 */ 436void hrtick_start(struct rq *rq, u64 delay) 437{ 438 /* 439 * Don't schedule slices shorter than 10000ns, that just 440 * doesn't make sense. Rely on vruntime for fairness. 441 */ 442 delay = max_t(u64, delay, 10000LL); 443 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 444 HRTIMER_MODE_REL_PINNED, 0); 445} 446 447static inline void init_hrtick(void) 448{ 449} 450#endif /* CONFIG_SMP */ 451 452static void init_rq_hrtick(struct rq *rq) 453{ 454#ifdef CONFIG_SMP 455 rq->hrtick_csd_pending = 0; 456 457 rq->hrtick_csd.flags = 0; 458 rq->hrtick_csd.func = __hrtick_start; 459 rq->hrtick_csd.info = rq; 460#endif 461 462 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 463 rq->hrtick_timer.function = hrtick; 464} 465#else /* CONFIG_SCHED_HRTICK */ 466static inline void hrtick_clear(struct rq *rq) 467{ 468} 469 470static inline void init_rq_hrtick(struct rq *rq) 471{ 472} 473 474static inline void init_hrtick(void) 475{ 476} 477#endif /* CONFIG_SCHED_HRTICK */ 478 479/* 480 * cmpxchg based fetch_or, macro so it works for different integer types 481 */ 482#define fetch_or(ptr, val) \ 483({ typeof(*(ptr)) __old, __val = *(ptr); \ 484 for (;;) { \ 485 __old = cmpxchg((ptr), __val, __val | (val)); \ 486 if (__old == __val) \ 487 break; \ 488 __val = __old; \ 489 } \ 490 __old; \ 491}) 492 493#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) 494/* 495 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, 496 * this avoids any races wrt polling state changes and thereby avoids 497 * spurious IPIs. 498 */ 499static bool set_nr_and_not_polling(struct task_struct *p) 500{ 501 struct thread_info *ti = task_thread_info(p); 502 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); 503} 504 505/* 506 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. 507 * 508 * If this returns true, then the idle task promises to call 509 * sched_ttwu_pending() and reschedule soon. 
510 */ 511static bool set_nr_if_polling(struct task_struct *p) 512{ 513 struct thread_info *ti = task_thread_info(p); 514 typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags); 515 516 for (;;) { 517 if (!(val & _TIF_POLLING_NRFLAG)) 518 return false; 519 if (val & _TIF_NEED_RESCHED) 520 return true; 521 old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); 522 if (old == val) 523 break; 524 val = old; 525 } 526 return true; 527} 528 529#else 530static bool set_nr_and_not_polling(struct task_struct *p) 531{ 532 set_tsk_need_resched(p); 533 return true; 534} 535 536#ifdef CONFIG_SMP 537static bool set_nr_if_polling(struct task_struct *p) 538{ 539 return false; 540} 541#endif 542#endif 543 544/* 545 * resched_curr - mark rq's current task 'to be rescheduled now'. 546 * 547 * On UP this means the setting of the need_resched flag, on SMP it 548 * might also involve a cross-CPU call to trigger the scheduler on 549 * the target CPU. 550 */ 551void resched_curr(struct rq *rq) 552{ 553 struct task_struct *curr = rq->curr; 554 int cpu; 555 556 lockdep_assert_held(&rq->lock); 557 558 if (test_tsk_need_resched(curr)) 559 return; 560 561 cpu = cpu_of(rq); 562 563 if (cpu == smp_processor_id()) { 564 set_tsk_need_resched(curr); 565 set_preempt_need_resched(); 566 return; 567 } 568 569 if (set_nr_and_not_polling(curr)) 570 smp_send_reschedule(cpu); 571 else 572 trace_sched_wake_idle_without_ipi(cpu); 573} 574 575void resched_cpu(int cpu) 576{ 577 struct rq *rq = cpu_rq(cpu); 578 unsigned long flags; 579 580 if (!raw_spin_trylock_irqsave(&rq->lock, flags)) 581 return; 582 resched_curr(rq); 583 raw_spin_unlock_irqrestore(&rq->lock, flags); 584} 585 586#ifdef CONFIG_SMP 587#ifdef CONFIG_NO_HZ_COMMON 588/* 589 * In the semi idle case, use the nearest busy cpu for migrating timers 590 * from an idle cpu. This is good for power-savings. 591 * 592 * We don't do similar optimization for completely idle system, as 593 * selecting an idle cpu will add more delays to the timers than intended 594 * (as that cpu's timer base may not be uptodate wrt jiffies etc). 595 */ 596int get_nohz_timer_target(int pinned) 597{ 598 int cpu = smp_processor_id(); 599 int i; 600 struct sched_domain *sd; 601 602 if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu)) 603 return cpu; 604 605 rcu_read_lock(); 606 for_each_domain(cpu, sd) { 607 for_each_cpu(i, sched_domain_span(sd)) { 608 if (!idle_cpu(i)) { 609 cpu = i; 610 goto unlock; 611 } 612 } 613 } 614unlock: 615 rcu_read_unlock(); 616 return cpu; 617} 618/* 619 * When add_timer_on() enqueues a timer into the timer wheel of an 620 * idle CPU then this timer might expire before the next timer event 621 * which is scheduled to wake up that CPU. In case of a completely 622 * idle system the next event might even be infinite time into the 623 * future. wake_up_idle_cpu() ensures that the CPU is woken up and 624 * leaves the inner idle loop so the newly added timer is taken into 625 * account when the CPU goes back to idle and evaluates the timer 626 * wheel for the next timer event. 627 */ 628static void wake_up_idle_cpu(int cpu) 629{ 630 struct rq *rq = cpu_rq(cpu); 631 632 if (cpu == smp_processor_id()) 633 return; 634 635 if (set_nr_and_not_polling(rq->idle)) 636 smp_send_reschedule(cpu); 637 else 638 trace_sched_wake_idle_without_ipi(cpu); 639} 640 641static bool wake_up_full_nohz_cpu(int cpu) 642{ 643 /* 644 * We just need the target to call irq_exit() and re-evaluate 645 * the next tick. The nohz full kick at least implies that. 
646 * If needed we can still optimize that later with an 647 * empty IRQ. 648 */ 649 if (tick_nohz_full_cpu(cpu)) { 650 if (cpu != smp_processor_id() || 651 tick_nohz_tick_stopped()) 652 tick_nohz_full_kick_cpu(cpu); 653 return true; 654 } 655 656 return false; 657} 658 659void wake_up_nohz_cpu(int cpu) 660{ 661 if (!wake_up_full_nohz_cpu(cpu)) 662 wake_up_idle_cpu(cpu); 663} 664 665static inline bool got_nohz_idle_kick(void) 666{ 667 int cpu = smp_processor_id(); 668 669 if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) 670 return false; 671 672 if (idle_cpu(cpu) && !need_resched()) 673 return true; 674 675 /* 676 * We can't run Idle Load Balance on this CPU for this time so we 677 * cancel it and clear NOHZ_BALANCE_KICK 678 */ 679 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); 680 return false; 681} 682 683#else /* CONFIG_NO_HZ_COMMON */ 684 685static inline bool got_nohz_idle_kick(void) 686{ 687 return false; 688} 689 690#endif /* CONFIG_NO_HZ_COMMON */ 691 692#ifdef CONFIG_NO_HZ_FULL 693bool sched_can_stop_tick(void) 694{ 695 /* 696 * FIFO realtime policy runs the highest priority task. Other runnable 697 * tasks are of a lower priority. The scheduler tick does nothing. 698 */ 699 if (current->policy == SCHED_FIFO) 700 return true; 701 702 /* 703 * Round-robin realtime tasks time slice with other tasks at the same 704 * realtime priority. Is this task the only one at this priority? 705 */ 706 if (current->policy == SCHED_RR) { 707 struct sched_rt_entity *rt_se = ¤t->rt; 708 709 return rt_se->run_list.prev == rt_se->run_list.next; 710 } 711 712 /* 713 * More than one running task need preemption. 714 * nr_running update is assumed to be visible 715 * after IPI is sent from wakers. 716 */ 717 if (this_rq()->nr_running > 1) 718 return false; 719 720 return true; 721} 722#endif /* CONFIG_NO_HZ_FULL */ 723 724void sched_avg_update(struct rq *rq) 725{ 726 s64 period = sched_avg_period(); 727 728 while ((s64)(rq_clock(rq) - rq->age_stamp) > period) { 729 /* 730 * Inline assembly required to prevent the compiler 731 * optimising this loop into a divmod call. 732 * See __iter_div_u64_rem() for another example of this. 733 */ 734 asm("" : "+rm" (rq->age_stamp)); 735 rq->age_stamp += period; 736 rq->rt_avg /= 2; 737 } 738} 739 740#endif /* CONFIG_SMP */ 741 742#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 743 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) 744/* 745 * Iterate task_group tree rooted at *from, calling @down when first entering a 746 * node and @up when leaving it for the final time. 747 * 748 * Caller must hold rcu_lock or sufficient equivalent. 
749 */ 750int walk_tg_tree_from(struct task_group *from, 751 tg_visitor down, tg_visitor up, void *data) 752{ 753 struct task_group *parent, *child; 754 int ret; 755 756 parent = from; 757 758down: 759 ret = (*down)(parent, data); 760 if (ret) 761 goto out; 762 list_for_each_entry_rcu(child, &parent->children, siblings) { 763 parent = child; 764 goto down; 765 766up: 767 continue; 768 } 769 ret = (*up)(parent, data); 770 if (ret || parent == from) 771 goto out; 772 773 child = parent; 774 parent = parent->parent; 775 if (parent) 776 goto up; 777out: 778 return ret; 779} 780 781int tg_nop(struct task_group *tg, void *data) 782{ 783 return 0; 784} 785#endif 786 787static void set_load_weight(struct task_struct *p) 788{ 789 int prio = p->static_prio - MAX_RT_PRIO; 790 struct load_weight *load = &p->se.load; 791 792 /* 793 * SCHED_IDLE tasks get minimal weight: 794 */ 795 if (p->policy == SCHED_IDLE) { 796 load->weight = scale_load(WEIGHT_IDLEPRIO); 797 load->inv_weight = WMULT_IDLEPRIO; 798 return; 799 } 800 801 load->weight = scale_load(prio_to_weight[prio]); 802 load->inv_weight = prio_to_wmult[prio]; 803} 804 805static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 806{ 807 update_rq_clock(rq); 808 sched_info_queued(rq, p); 809 p->sched_class->enqueue_task(rq, p, flags); 810} 811 812static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 813{ 814 update_rq_clock(rq); 815 sched_info_dequeued(rq, p); 816 p->sched_class->dequeue_task(rq, p, flags); 817} 818 819void activate_task(struct rq *rq, struct task_struct *p, int flags) 820{ 821 if (task_contributes_to_load(p)) 822 rq->nr_uninterruptible--; 823 824 enqueue_task(rq, p, flags); 825} 826 827void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 828{ 829 if (task_contributes_to_load(p)) 830 rq->nr_uninterruptible++; 831 832 dequeue_task(rq, p, flags); 833} 834 835static void update_rq_clock_task(struct rq *rq, s64 delta) 836{ 837/* 838 * In theory, the compile should just see 0 here, and optimize out the call 839 * to sched_rt_avg_update. But I don't trust it... 840 */ 841#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) 842 s64 steal = 0, irq_delta = 0; 843#endif 844#ifdef CONFIG_IRQ_TIME_ACCOUNTING 845 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; 846 847 /* 848 * Since irq_time is only updated on {soft,}irq_exit, we might run into 849 * this case when a previous update_rq_clock() happened inside a 850 * {soft,}irq region. 851 * 852 * When this happens, we stop ->clock_task and only update the 853 * prev_irq_time stamp to account for the part that fit, so that a next 854 * update will consume the rest. This ensures ->clock_task is 855 * monotonic. 856 * 857 * It does however cause some slight miss-attribution of {soft,}irq 858 * time, a more accurate solution would be to update the irq_time using 859 * the current rq->clock timestamp, except that would require using 860 * atomic ops. 
861 */ 862 if (irq_delta > delta) 863 irq_delta = delta; 864 865 rq->prev_irq_time += irq_delta; 866 delta -= irq_delta; 867#endif 868#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING 869 if (static_key_false((¶virt_steal_rq_enabled))) { 870 steal = paravirt_steal_clock(cpu_of(rq)); 871 steal -= rq->prev_steal_time_rq; 872 873 if (unlikely(steal > delta)) 874 steal = delta; 875 876 rq->prev_steal_time_rq += steal; 877 delta -= steal; 878 } 879#endif 880 881 rq->clock_task += delta; 882 883#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) 884 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) 885 sched_rt_avg_update(rq, irq_delta + steal); 886#endif 887} 888 889void sched_set_stop_task(int cpu, struct task_struct *stop) 890{ 891 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 892 struct task_struct *old_stop = cpu_rq(cpu)->stop; 893 894 if (stop) { 895 /* 896 * Make it appear like a SCHED_FIFO task, its something 897 * userspace knows about and won't get confused about. 898 * 899 * Also, it will make PI more or less work without too 900 * much confusion -- but then, stop work should not 901 * rely on PI working anyway. 902 */ 903 sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); 904 905 stop->sched_class = &stop_sched_class; 906 } 907 908 cpu_rq(cpu)->stop = stop; 909 910 if (old_stop) { 911 /* 912 * Reset it back to a normal scheduling class so that 913 * it can die in pieces. 914 */ 915 old_stop->sched_class = &rt_sched_class; 916 } 917} 918 919/* 920 * __normal_prio - return the priority that is based on the static prio 921 */ 922static inline int __normal_prio(struct task_struct *p) 923{ 924 return p->static_prio; 925} 926 927/* 928 * Calculate the expected normal priority: i.e. priority 929 * without taking RT-inheritance into account. Might be 930 * boosted by interactivity modifiers. Changes upon fork, 931 * setprio syscalls, and whenever the interactivity 932 * estimator recalculates. 933 */ 934static inline int normal_prio(struct task_struct *p) 935{ 936 int prio; 937 938 if (task_has_dl_policy(p)) 939 prio = MAX_DL_PRIO-1; 940 else if (task_has_rt_policy(p)) 941 prio = MAX_RT_PRIO-1 - p->rt_priority; 942 else 943 prio = __normal_prio(p); 944 return prio; 945} 946 947/* 948 * Calculate the current priority, i.e. the priority 949 * taken into account by the scheduler. This value might 950 * be boosted by RT tasks, or might be boosted by 951 * interactivity modifiers. Will be RT if the task got 952 * RT-boosted. If not then it returns p->normal_prio. 953 */ 954static int effective_prio(struct task_struct *p) 955{ 956 p->normal_prio = normal_prio(p); 957 /* 958 * If we are RT tasks or we were boosted to RT priority, 959 * keep the priority unchanged. Otherwise, update priority 960 * to the normal priority: 961 */ 962 if (!rt_prio(p->prio)) 963 return p->normal_prio; 964 return p->prio; 965} 966 967/** 968 * task_curr - is this task currently executing on a CPU? 969 * @p: the task in question. 970 * 971 * Return: 1 if the task is currently executing. 0 otherwise. 972 */ 973inline int task_curr(const struct task_struct *p) 974{ 975 return cpu_curr(task_cpu(p)) == p; 976} 977 978/* 979 * Can drop rq->lock because from sched_class::switched_from() methods drop it. 
980 */ 981static inline void check_class_changed(struct rq *rq, struct task_struct *p, 982 const struct sched_class *prev_class, 983 int oldprio) 984{ 985 if (prev_class != p->sched_class) { 986 if (prev_class->switched_from) 987 prev_class->switched_from(rq, p); 988 /* Possble rq->lock 'hole'. */ 989 p->sched_class->switched_to(rq, p); 990 } else if (oldprio != p->prio || dl_task(p)) 991 p->sched_class->prio_changed(rq, p, oldprio); 992} 993 994void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 995{ 996 const struct sched_class *class; 997 998 if (p->sched_class == rq->curr->sched_class) { 999 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 1000 } else { 1001 for_each_class(class) { 1002 if (class == rq->curr->sched_class) 1003 break; 1004 if (class == p->sched_class) { 1005 resched_curr(rq); 1006 break; 1007 } 1008 } 1009 } 1010 1011 /* 1012 * A queue event has occurred, and we're going to schedule. In 1013 * this case, we can save a useless back to back clock update. 1014 */ 1015 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) 1016 rq_clock_skip_update(rq, true); 1017} 1018 1019#ifdef CONFIG_SMP 1020void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1021{ 1022#ifdef CONFIG_SCHED_DEBUG 1023 /* 1024 * We should never call set_task_cpu() on a blocked task, 1025 * ttwu() will sort out the placement. 1026 */ 1027 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 1028 !p->on_rq); 1029 1030#ifdef CONFIG_LOCKDEP 1031 /* 1032 * The caller should hold either p->pi_lock or rq->lock, when changing 1033 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. 1034 * 1035 * sched_move_task() holds both and thus holding either pins the cgroup, 1036 * see task_group(). 1037 * 1038 * Furthermore, all task_rq users should acquire both locks, see 1039 * task_rq_lock(). 1040 */ 1041 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || 1042 lockdep_is_held(&task_rq(p)->lock))); 1043#endif 1044#endif 1045 1046 trace_sched_migrate_task(p, new_cpu); 1047 1048 if (task_cpu(p) != new_cpu) { 1049 if (p->sched_class->migrate_task_rq) 1050 p->sched_class->migrate_task_rq(p, new_cpu); 1051 p->se.nr_migrations++; 1052 perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); 1053 } 1054 1055 __set_task_cpu(p, new_cpu); 1056} 1057 1058static void __migrate_swap_task(struct task_struct *p, int cpu) 1059{ 1060 if (task_on_rq_queued(p)) { 1061 struct rq *src_rq, *dst_rq; 1062 1063 src_rq = task_rq(p); 1064 dst_rq = cpu_rq(cpu); 1065 1066 deactivate_task(src_rq, p, 0); 1067 set_task_cpu(p, cpu); 1068 activate_task(dst_rq, p, 0); 1069 check_preempt_curr(dst_rq, p, 0); 1070 } else { 1071 /* 1072 * Task isn't running anymore; make it appear like we migrated 1073 * it before it went to sleep. This means on wakeup we make the 1074 * previous cpu our targer instead of where it really is. 
1075 */ 1076 p->wake_cpu = cpu; 1077 } 1078} 1079 1080struct migration_swap_arg { 1081 struct task_struct *src_task, *dst_task; 1082 int src_cpu, dst_cpu; 1083}; 1084 1085static int migrate_swap_stop(void *data) 1086{ 1087 struct migration_swap_arg *arg = data; 1088 struct rq *src_rq, *dst_rq; 1089 int ret = -EAGAIN; 1090 1091 src_rq = cpu_rq(arg->src_cpu); 1092 dst_rq = cpu_rq(arg->dst_cpu); 1093 1094 double_raw_lock(&arg->src_task->pi_lock, 1095 &arg->dst_task->pi_lock); 1096 double_rq_lock(src_rq, dst_rq); 1097 if (task_cpu(arg->dst_task) != arg->dst_cpu) 1098 goto unlock; 1099 1100 if (task_cpu(arg->src_task) != arg->src_cpu) 1101 goto unlock; 1102 1103 if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task))) 1104 goto unlock; 1105 1106 if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task))) 1107 goto unlock; 1108 1109 __migrate_swap_task(arg->src_task, arg->dst_cpu); 1110 __migrate_swap_task(arg->dst_task, arg->src_cpu); 1111 1112 ret = 0; 1113 1114unlock: 1115 double_rq_unlock(src_rq, dst_rq); 1116 raw_spin_unlock(&arg->dst_task->pi_lock); 1117 raw_spin_unlock(&arg->src_task->pi_lock); 1118 1119 return ret; 1120} 1121 1122/* 1123 * Cross migrate two tasks 1124 */ 1125int migrate_swap(struct task_struct *cur, struct task_struct *p) 1126{ 1127 struct migration_swap_arg arg; 1128 int ret = -EINVAL; 1129 1130 arg = (struct migration_swap_arg){ 1131 .src_task = cur, 1132 .src_cpu = task_cpu(cur), 1133 .dst_task = p, 1134 .dst_cpu = task_cpu(p), 1135 }; 1136 1137 if (arg.src_cpu == arg.dst_cpu) 1138 goto out; 1139 1140 /* 1141 * These three tests are all lockless; this is OK since all of them 1142 * will be re-checked with proper locks held further down the line. 1143 */ 1144 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) 1145 goto out; 1146 1147 if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task))) 1148 goto out; 1149 1150 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) 1151 goto out; 1152 1153 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); 1154 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); 1155 1156out: 1157 return ret; 1158} 1159 1160struct migration_arg { 1161 struct task_struct *task; 1162 int dest_cpu; 1163}; 1164 1165static int migration_cpu_stop(void *data); 1166 1167/* 1168 * wait_task_inactive - wait for a thread to unschedule. 1169 * 1170 * If @match_state is nonzero, it's the @p->state value just checked and 1171 * not expected to change. If it changes, i.e. @p might have woken up, 1172 * then return zero. When we succeed in waiting for @p to be off its CPU, 1173 * we return a positive number (its total switch count). If a second call 1174 * a short while later returns the same number, the caller can be sure that 1175 * @p has remained unscheduled the whole time. 1176 * 1177 * The caller must ensure that the task *will* unschedule sometime soon, 1178 * else this function might spin for a *long* time. This function can't 1179 * be called with interrupts off, or it may introduce deadlock with 1180 * smp_call_function() if an IPI is sent by the same process we are 1181 * waiting to become inactive. 1182 */ 1183unsigned long wait_task_inactive(struct task_struct *p, long match_state) 1184{ 1185 unsigned long flags; 1186 int running, queued; 1187 unsigned long ncsw; 1188 struct rq *rq; 1189 1190 for (;;) { 1191 /* 1192 * We do the initial early heuristics without holding 1193 * any task-queue locks at all. 
We'll only try to get 1194 * the runqueue lock when things look like they will 1195 * work out! 1196 */ 1197 rq = task_rq(p); 1198 1199 /* 1200 * If the task is actively running on another CPU 1201 * still, just relax and busy-wait without holding 1202 * any locks. 1203 * 1204 * NOTE! Since we don't hold any locks, it's not 1205 * even sure that "rq" stays as the right runqueue! 1206 * But we don't care, since "task_running()" will 1207 * return false if the runqueue has changed and p 1208 * is actually now running somewhere else! 1209 */ 1210 while (task_running(rq, p)) { 1211 if (match_state && unlikely(p->state != match_state)) 1212 return 0; 1213 cpu_relax(); 1214 } 1215 1216 /* 1217 * Ok, time to look more closely! We need the rq 1218 * lock now, to be *sure*. If we're wrong, we'll 1219 * just go back and repeat. 1220 */ 1221 rq = task_rq_lock(p, &flags); 1222 trace_sched_wait_task(p); 1223 running = task_running(rq, p); 1224 queued = task_on_rq_queued(p); 1225 ncsw = 0; 1226 if (!match_state || p->state == match_state) 1227 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 1228 task_rq_unlock(rq, p, &flags); 1229 1230 /* 1231 * If it changed from the expected state, bail out now. 1232 */ 1233 if (unlikely(!ncsw)) 1234 break; 1235 1236 /* 1237 * Was it really running after all now that we 1238 * checked with the proper locks actually held? 1239 * 1240 * Oops. Go back and try again.. 1241 */ 1242 if (unlikely(running)) { 1243 cpu_relax(); 1244 continue; 1245 } 1246 1247 /* 1248 * It's not enough that it's not actively running, 1249 * it must be off the runqueue _entirely_, and not 1250 * preempted! 1251 * 1252 * So if it was still runnable (but just not actively 1253 * running right now), it's preempted, and we should 1254 * yield - it could be a while. 1255 */ 1256 if (unlikely(queued)) { 1257 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); 1258 1259 set_current_state(TASK_UNINTERRUPTIBLE); 1260 schedule_hrtimeout(&to, HRTIMER_MODE_REL); 1261 continue; 1262 } 1263 1264 /* 1265 * Ahh, all good. It wasn't running, and it wasn't 1266 * runnable, which means that it will never become 1267 * running in the future either. We're all done! 1268 */ 1269 break; 1270 } 1271 1272 return ncsw; 1273} 1274 1275/*** 1276 * kick_process - kick a running thread to enter/exit the kernel 1277 * @p: the to-be-kicked thread 1278 * 1279 * Cause a process which is running on another CPU to enter 1280 * kernel-mode, without any delay. (to get signals handled.) 1281 * 1282 * NOTE: this function doesn't have to take the runqueue lock, 1283 * because all it wants to ensure is that the remote task enters 1284 * the kernel. If the IPI races and the task has been migrated 1285 * to another CPU then no harm is done and the purpose has been 1286 * achieved as well. 1287 */ 1288void kick_process(struct task_struct *p) 1289{ 1290 int cpu; 1291 1292 preempt_disable(); 1293 cpu = task_cpu(p); 1294 if ((cpu != smp_processor_id()) && task_curr(p)) 1295 smp_send_reschedule(cpu); 1296 preempt_enable(); 1297} 1298EXPORT_SYMBOL_GPL(kick_process); 1299#endif /* CONFIG_SMP */ 1300 1301#ifdef CONFIG_SMP 1302/* 1303 * ->cpus_allowed is protected by both rq->lock and p->pi_lock 1304 */ 1305static int select_fallback_rq(int cpu, struct task_struct *p) 1306{ 1307 int nid = cpu_to_node(cpu); 1308 const struct cpumask *nodemask = NULL; 1309 enum { cpuset, possible, fail } state = cpuset; 1310 int dest_cpu; 1311 1312 /* 1313 * If the node that the cpu is on has been offlined, cpu_to_node() 1314 * will return -1. 
There is no cpu on the node, and we should 1315 * select the cpu on the other node. 1316 */ 1317 if (nid != -1) { 1318 nodemask = cpumask_of_node(nid); 1319 1320 /* Look for allowed, online CPU in same node. */ 1321 for_each_cpu(dest_cpu, nodemask) { 1322 if (!cpu_online(dest_cpu)) 1323 continue; 1324 if (!cpu_active(dest_cpu)) 1325 continue; 1326 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 1327 return dest_cpu; 1328 } 1329 } 1330 1331 for (;;) { 1332 /* Any allowed, online CPU? */ 1333 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { 1334 if (!cpu_online(dest_cpu)) 1335 continue; 1336 if (!cpu_active(dest_cpu)) 1337 continue; 1338 goto out; 1339 } 1340 1341 switch (state) { 1342 case cpuset: 1343 /* No more Mr. Nice Guy. */ 1344 cpuset_cpus_allowed_fallback(p); 1345 state = possible; 1346 break; 1347 1348 case possible: 1349 do_set_cpus_allowed(p, cpu_possible_mask); 1350 state = fail; 1351 break; 1352 1353 case fail: 1354 BUG(); 1355 break; 1356 } 1357 } 1358 1359out: 1360 if (state != cpuset) { 1361 /* 1362 * Don't tell them about moving exiting tasks or 1363 * kernel threads (both mm NULL), since they never 1364 * leave kernel. 1365 */ 1366 if (p->mm && printk_ratelimit()) { 1367 printk_deferred("process %d (%s) no longer affine to cpu%d\n", 1368 task_pid_nr(p), p->comm, cpu); 1369 } 1370 } 1371 1372 return dest_cpu; 1373} 1374 1375/* 1376 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 1377 */ 1378static inline 1379int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 1380{ 1381 if (p->nr_cpus_allowed > 1) 1382 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 1383 1384 /* 1385 * In order not to call set_task_cpu() on a blocking task we need 1386 * to rely on ttwu() to place the task on a valid ->cpus_allowed 1387 * cpu. 1388 * 1389 * Since this is common to all placement strategies, this lives here. 
1390 * 1391 * [ this allows ->select_task() to simply return task_cpu(p) and 1392 * not worry about this generic constraint ] 1393 */ 1394 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || 1395 !cpu_online(cpu))) 1396 cpu = select_fallback_rq(task_cpu(p), p); 1397 1398 return cpu; 1399} 1400 1401static void update_avg(u64 *avg, u64 sample) 1402{ 1403 s64 diff = sample - *avg; 1404 *avg += diff >> 3; 1405} 1406#endif 1407 1408static void 1409ttwu_stat(struct task_struct *p, int cpu, int wake_flags) 1410{ 1411#ifdef CONFIG_SCHEDSTATS 1412 struct rq *rq = this_rq(); 1413 1414#ifdef CONFIG_SMP 1415 int this_cpu = smp_processor_id(); 1416 1417 if (cpu == this_cpu) { 1418 schedstat_inc(rq, ttwu_local); 1419 schedstat_inc(p, se.statistics.nr_wakeups_local); 1420 } else { 1421 struct sched_domain *sd; 1422 1423 schedstat_inc(p, se.statistics.nr_wakeups_remote); 1424 rcu_read_lock(); 1425 for_each_domain(this_cpu, sd) { 1426 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 1427 schedstat_inc(sd, ttwu_wake_remote); 1428 break; 1429 } 1430 } 1431 rcu_read_unlock(); 1432 } 1433 1434 if (wake_flags & WF_MIGRATED) 1435 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 1436 1437#endif /* CONFIG_SMP */ 1438 1439 schedstat_inc(rq, ttwu_count); 1440 schedstat_inc(p, se.statistics.nr_wakeups); 1441 1442 if (wake_flags & WF_SYNC) 1443 schedstat_inc(p, se.statistics.nr_wakeups_sync); 1444 1445#endif /* CONFIG_SCHEDSTATS */ 1446} 1447 1448static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 1449{ 1450 activate_task(rq, p, en_flags); 1451 p->on_rq = TASK_ON_RQ_QUEUED; 1452 1453 /* if a worker is waking up, notify workqueue */ 1454 if (p->flags & PF_WQ_WORKER) 1455 wq_worker_waking_up(p, cpu_of(rq)); 1456} 1457 1458/* 1459 * Mark the task runnable and perform wakeup-preemption. 1460 */ 1461static void 1462ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) 1463{ 1464 check_preempt_curr(rq, p, wake_flags); 1465 trace_sched_wakeup(p, true); 1466 1467 p->state = TASK_RUNNING; 1468#ifdef CONFIG_SMP 1469 if (p->sched_class->task_woken) 1470 p->sched_class->task_woken(rq, p); 1471 1472 if (rq->idle_stamp) { 1473 u64 delta = rq_clock(rq) - rq->idle_stamp; 1474 u64 max = 2*rq->max_idle_balance_cost; 1475 1476 update_avg(&rq->avg_idle, delta); 1477 1478 if (rq->avg_idle > max) 1479 rq->avg_idle = max; 1480 1481 rq->idle_stamp = 0; 1482 } 1483#endif 1484} 1485 1486static void 1487ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) 1488{ 1489#ifdef CONFIG_SMP 1490 if (p->sched_contributes_to_load) 1491 rq->nr_uninterruptible--; 1492#endif 1493 1494 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); 1495 ttwu_do_wakeup(rq, p, wake_flags); 1496} 1497 1498/* 1499 * Called in case the task @p isn't fully descheduled from its runqueue, 1500 * in this case we must do a remote wakeup. Its a 'light' wakeup though, 1501 * since all we need to do is flip p->state to TASK_RUNNING, since 1502 * the task is still ->on_rq. 
1503 */ 1504static int ttwu_remote(struct task_struct *p, int wake_flags) 1505{ 1506 struct rq *rq; 1507 int ret = 0; 1508 1509 rq = __task_rq_lock(p); 1510 if (task_on_rq_queued(p)) { 1511 /* check_preempt_curr() may use rq clock */ 1512 update_rq_clock(rq); 1513 ttwu_do_wakeup(rq, p, wake_flags); 1514 ret = 1; 1515 } 1516 __task_rq_unlock(rq); 1517 1518 return ret; 1519} 1520 1521#ifdef CONFIG_SMP 1522void sched_ttwu_pending(void) 1523{ 1524 struct rq *rq = this_rq(); 1525 struct llist_node *llist = llist_del_all(&rq->wake_list); 1526 struct task_struct *p; 1527 unsigned long flags; 1528 1529 if (!llist) 1530 return; 1531 1532 raw_spin_lock_irqsave(&rq->lock, flags); 1533 1534 while (llist) { 1535 p = llist_entry(llist, struct task_struct, wake_entry); 1536 llist = llist_next(llist); 1537 ttwu_do_activate(rq, p, 0); 1538 } 1539 1540 raw_spin_unlock_irqrestore(&rq->lock, flags); 1541} 1542 1543void scheduler_ipi(void) 1544{ 1545 /* 1546 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting 1547 * TIF_NEED_RESCHED remotely (for the first time) will also send 1548 * this IPI. 1549 */ 1550 preempt_fold_need_resched(); 1551 1552 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 1553 return; 1554 1555 /* 1556 * Not all reschedule IPI handlers call irq_enter/irq_exit, since 1557 * traditionally all their work was done from the interrupt return 1558 * path. Now that we actually do some work, we need to make sure 1559 * we do call them. 1560 * 1561 * Some archs already do call them, luckily irq_enter/exit nest 1562 * properly. 1563 * 1564 * Arguably we should visit all archs and update all handlers, 1565 * however a fair share of IPIs are still resched only so this would 1566 * somewhat pessimize the simple resched case. 1567 */ 1568 irq_enter(); 1569 sched_ttwu_pending(); 1570 1571 /* 1572 * Check if someone kicked us for doing the nohz idle load balance. 
1573 */ 1574 if (unlikely(got_nohz_idle_kick())) { 1575 this_rq()->idle_balance = 1; 1576 raise_softirq_irqoff(SCHED_SOFTIRQ); 1577 } 1578 irq_exit(); 1579} 1580 1581static void ttwu_queue_remote(struct task_struct *p, int cpu) 1582{ 1583 struct rq *rq = cpu_rq(cpu); 1584 1585 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { 1586 if (!set_nr_if_polling(rq->idle)) 1587 smp_send_reschedule(cpu); 1588 else 1589 trace_sched_wake_idle_without_ipi(cpu); 1590 } 1591} 1592 1593void wake_up_if_idle(int cpu) 1594{ 1595 struct rq *rq = cpu_rq(cpu); 1596 unsigned long flags; 1597 1598 rcu_read_lock(); 1599 1600 if (!is_idle_task(rcu_dereference(rq->curr))) 1601 goto out; 1602 1603 if (set_nr_if_polling(rq->idle)) { 1604 trace_sched_wake_idle_without_ipi(cpu); 1605 } else { 1606 raw_spin_lock_irqsave(&rq->lock, flags); 1607 if (is_idle_task(rq->curr)) 1608 smp_send_reschedule(cpu); 1609 /* Else cpu is not in idle, do nothing here */ 1610 raw_spin_unlock_irqrestore(&rq->lock, flags); 1611 } 1612 1613out: 1614 rcu_read_unlock(); 1615} 1616 1617bool cpus_share_cache(int this_cpu, int that_cpu) 1618{ 1619 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 1620} 1621#endif /* CONFIG_SMP */ 1622 1623static void ttwu_queue(struct task_struct *p, int cpu) 1624{ 1625 struct rq *rq = cpu_rq(cpu); 1626 1627#if defined(CONFIG_SMP) 1628 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 1629 sched_clock_cpu(cpu); /* sync clocks x-cpu */ 1630 ttwu_queue_remote(p, cpu); 1631 return; 1632 } 1633#endif 1634 1635 raw_spin_lock(&rq->lock); 1636 ttwu_do_activate(rq, p, 0); 1637 raw_spin_unlock(&rq->lock); 1638} 1639 1640/** 1641 * try_to_wake_up - wake up a thread 1642 * @p: the thread to be awakened 1643 * @state: the mask of task states that can be woken 1644 * @wake_flags: wake modifier flags (WF_*) 1645 * 1646 * Put it on the run-queue if it's not already there. The "current" 1647 * thread is always on the run-queue (except when the actual 1648 * re-schedule is in progress), and as such you're allowed to do 1649 * the simpler "current->state = TASK_RUNNING" to mark yourself 1650 * runnable without the overhead of this. 1651 * 1652 * Return: %true if @p was woken up, %false if it was already running. 1653 * or @state didn't match @p's state. 1654 */ 1655static int 1656try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) 1657{ 1658 unsigned long flags; 1659 int cpu, success = 0; 1660 1661 /* 1662 * If we are going to wake up a thread waiting for CONDITION we 1663 * need to ensure that CONDITION=1 done by the caller can not be 1664 * reordered with p->state check below. This pairs with mb() in 1665 * set_current_state() the waiting thread does. 1666 */ 1667 smp_mb__before_spinlock(); 1668 raw_spin_lock_irqsave(&p->pi_lock, flags); 1669 if (!(p->state & state)) 1670 goto out; 1671 1672 success = 1; /* we're going to change ->state */ 1673 cpu = task_cpu(p); 1674 1675 if (p->on_rq && ttwu_remote(p, wake_flags)) 1676 goto stat; 1677 1678#ifdef CONFIG_SMP 1679 /* 1680 * If the owning (remote) cpu is still in the middle of schedule() with 1681 * this task as prev, wait until its done referencing the task. 1682 */ 1683 while (p->on_cpu) 1684 cpu_relax(); 1685 /* 1686 * Pairs with the smp_wmb() in finish_lock_switch(). 
1687 */ 1688 smp_rmb(); 1689 1690 p->sched_contributes_to_load = !!task_contributes_to_load(p); 1691 p->state = TASK_WAKING; 1692 1693 if (p->sched_class->task_waking) 1694 p->sched_class->task_waking(p); 1695 1696 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); 1697 if (task_cpu(p) != cpu) { 1698 wake_flags |= WF_MIGRATED; 1699 set_task_cpu(p, cpu); 1700 } 1701#endif /* CONFIG_SMP */ 1702 1703 ttwu_queue(p, cpu); 1704stat: 1705 ttwu_stat(p, cpu, wake_flags); 1706out: 1707 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 1708 1709 return success; 1710} 1711 1712/** 1713 * try_to_wake_up_local - try to wake up a local task with rq lock held 1714 * @p: the thread to be awakened 1715 * 1716 * Put @p on the run-queue if it's not already there. The caller must 1717 * ensure that this_rq() is locked, @p is bound to this_rq() and not 1718 * the current task. 1719 */ 1720static void try_to_wake_up_local(struct task_struct *p) 1721{ 1722 struct rq *rq = task_rq(p); 1723 1724 if (WARN_ON_ONCE(rq != this_rq()) || 1725 WARN_ON_ONCE(p == current)) 1726 return; 1727 1728 lockdep_assert_held(&rq->lock); 1729 1730 if (!raw_spin_trylock(&p->pi_lock)) { 1731 raw_spin_unlock(&rq->lock); 1732 raw_spin_lock(&p->pi_lock); 1733 raw_spin_lock(&rq->lock); 1734 } 1735 1736 if (!(p->state & TASK_NORMAL)) 1737 goto out; 1738 1739 if (!task_on_rq_queued(p)) 1740 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 1741 1742 ttwu_do_wakeup(rq, p, 0); 1743 ttwu_stat(p, smp_processor_id(), 0); 1744out: 1745 raw_spin_unlock(&p->pi_lock); 1746} 1747 1748/** 1749 * wake_up_process - Wake up a specific process 1750 * @p: The process to be woken up. 1751 * 1752 * Attempt to wake up the nominated process and move it to the set of runnable 1753 * processes. 1754 * 1755 * Return: 1 if the process was woken up, 0 if it was already running. 1756 * 1757 * It may be assumed that this function implies a write memory barrier before 1758 * changing the task state if and only if any tasks are woken up. 1759 */ 1760int wake_up_process(struct task_struct *p) 1761{ 1762 WARN_ON(task_is_stopped_or_traced(p)); 1763 return try_to_wake_up(p, TASK_NORMAL, 0); 1764} 1765EXPORT_SYMBOL(wake_up_process); 1766 1767int wake_up_state(struct task_struct *p, unsigned int state) 1768{ 1769 return try_to_wake_up(p, state, 0); 1770} 1771 1772/* 1773 * This function clears the sched_dl_entity static params. 1774 */ 1775void __dl_clear_params(struct task_struct *p) 1776{ 1777 struct sched_dl_entity *dl_se = &p->dl; 1778 1779 dl_se->dl_runtime = 0; 1780 dl_se->dl_deadline = 0; 1781 dl_se->dl_period = 0; 1782 dl_se->flags = 0; 1783 dl_se->dl_bw = 0; 1784 1785 dl_se->dl_throttled = 0; 1786 dl_se->dl_new = 1; 1787 dl_se->dl_yielded = 0; 1788} 1789 1790/* 1791 * Perform scheduler related setup for a newly forked process p. 1792 * p is forked by current. 
1793 * 1794 * __sched_fork() is basic setup used by init_idle() too: 1795 */ 1796static void __sched_fork(unsigned long clone_flags, struct task_struct *p) 1797{ 1798 p->on_rq = 0; 1799 1800 p->se.on_rq = 0; 1801 p->se.exec_start = 0; 1802 p->se.sum_exec_runtime = 0; 1803 p->se.prev_sum_exec_runtime = 0; 1804 p->se.nr_migrations = 0; 1805 p->se.vruntime = 0; 1806#ifdef CONFIG_SMP 1807 p->se.avg.decay_count = 0; 1808#endif 1809 INIT_LIST_HEAD(&p->se.group_node); 1810 1811#ifdef CONFIG_SCHEDSTATS 1812 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1813#endif 1814 1815 RB_CLEAR_NODE(&p->dl.rb_node); 1816 init_dl_task_timer(&p->dl); 1817 __dl_clear_params(p); 1818 1819 INIT_LIST_HEAD(&p->rt.run_list); 1820 1821#ifdef CONFIG_PREEMPT_NOTIFIERS 1822 INIT_HLIST_HEAD(&p->preempt_notifiers); 1823#endif 1824 1825#ifdef CONFIG_NUMA_BALANCING 1826 if (p->mm && atomic_read(&p->mm->mm_users) == 1) { 1827 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); 1828 p->mm->numa_scan_seq = 0; 1829 } 1830 1831 if (clone_flags & CLONE_VM) 1832 p->numa_preferred_nid = current->numa_preferred_nid; 1833 else 1834 p->numa_preferred_nid = -1; 1835 1836 p->node_stamp = 0ULL; 1837 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1838 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1839 p->numa_work.next = &p->numa_work; 1840 p->numa_faults = NULL; 1841 p->last_task_numa_placement = 0; 1842 p->last_sum_exec_runtime = 0; 1843 1844 p->numa_group = NULL; 1845#endif /* CONFIG_NUMA_BALANCING */ 1846} 1847 1848#ifdef CONFIG_NUMA_BALANCING 1849#ifdef CONFIG_SCHED_DEBUG 1850void set_numabalancing_state(bool enabled) 1851{ 1852 if (enabled) 1853 sched_feat_set("NUMA"); 1854 else 1855 sched_feat_set("NO_NUMA"); 1856} 1857#else 1858__read_mostly bool numabalancing_enabled; 1859 1860void set_numabalancing_state(bool enabled) 1861{ 1862 numabalancing_enabled = enabled; 1863} 1864#endif /* CONFIG_SCHED_DEBUG */ 1865 1866#ifdef CONFIG_PROC_SYSCTL 1867int sysctl_numa_balancing(struct ctl_table *table, int write, 1868 void __user *buffer, size_t *lenp, loff_t *ppos) 1869{ 1870 struct ctl_table t; 1871 int err; 1872 int state = numabalancing_enabled; 1873 1874 if (write && !capable(CAP_SYS_ADMIN)) 1875 return -EPERM; 1876 1877 t = *table; 1878 t.data = &state; 1879 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); 1880 if (err < 0) 1881 return err; 1882 if (write) 1883 set_numabalancing_state(state); 1884 return err; 1885} 1886#endif 1887#endif 1888 1889/* 1890 * fork()/clone()-time setup: 1891 */ 1892int sched_fork(unsigned long clone_flags, struct task_struct *p) 1893{ 1894 unsigned long flags; 1895 int cpu = get_cpu(); 1896 1897 __sched_fork(clone_flags, p); 1898 /* 1899 * We mark the process as running here. This guarantees that 1900 * nobody will actually run it, and a signal or other external 1901 * event cannot wake it up and insert it on the runqueue either. 1902 */ 1903 p->state = TASK_RUNNING; 1904 1905 /* 1906 * Make sure we do not leak PI boosting priority to the child. 1907 */ 1908 p->prio = current->normal_prio; 1909 1910 /* 1911 * Revert to default priority/policy on fork if requested. 
1912 */ 1913 if (unlikely(p->sched_reset_on_fork)) { 1914 if (task_has_dl_policy(p) || task_has_rt_policy(p)) { 1915 p->policy = SCHED_NORMAL; 1916 p->static_prio = NICE_TO_PRIO(0); 1917 p->rt_priority = 0; 1918 } else if (PRIO_TO_NICE(p->static_prio) < 0) 1919 p->static_prio = NICE_TO_PRIO(0); 1920 1921 p->prio = p->normal_prio = __normal_prio(p); 1922 set_load_weight(p); 1923 1924 /* 1925 * We don't need the reset flag anymore after the fork. It has 1926 * fulfilled its duty: 1927 */ 1928 p->sched_reset_on_fork = 0; 1929 } 1930 1931 if (dl_prio(p->prio)) { 1932 put_cpu(); 1933 return -EAGAIN; 1934 } else if (rt_prio(p->prio)) { 1935 p->sched_class = &rt_sched_class; 1936 } else { 1937 p->sched_class = &fair_sched_class; 1938 } 1939 1940 if (p->sched_class->task_fork) 1941 p->sched_class->task_fork(p); 1942 1943 /* 1944 * The child is not yet in the pid-hash so no cgroup attach races, 1945 * and the cgroup is pinned to this child due to cgroup_fork() 1946 * is ran before sched_fork(). 1947 * 1948 * Silence PROVE_RCU. 1949 */ 1950 raw_spin_lock_irqsave(&p->pi_lock, flags); 1951 set_task_cpu(p, cpu); 1952 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 1953 1954#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1955 if (likely(sched_info_on())) 1956 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1957#endif 1958#if defined(CONFIG_SMP) 1959 p->on_cpu = 0; 1960#endif 1961 init_task_preempt_count(p); 1962#ifdef CONFIG_SMP 1963 plist_node_init(&p->pushable_tasks, MAX_PRIO); 1964 RB_CLEAR_NODE(&p->pushable_dl_tasks); 1965#endif 1966 1967 put_cpu(); 1968 return 0; 1969} 1970 1971unsigned long to_ratio(u64 period, u64 runtime) 1972{ 1973 if (runtime == RUNTIME_INF) 1974 return 1ULL << 20; 1975 1976 /* 1977 * Doing this here saves a lot of checks in all 1978 * the calling paths, and returning zero seems 1979 * safe for them anyway. 1980 */ 1981 if (period == 0) 1982 return 0; 1983 1984 return div64_u64(runtime << 20, period); 1985} 1986 1987#ifdef CONFIG_SMP 1988inline struct dl_bw *dl_bw_of(int i) 1989{ 1990 rcu_lockdep_assert(rcu_read_lock_sched_held(), 1991 "sched RCU must be held"); 1992 return &cpu_rq(i)->rd->dl_bw; 1993} 1994 1995static inline int dl_bw_cpus(int i) 1996{ 1997 struct root_domain *rd = cpu_rq(i)->rd; 1998 int cpus = 0; 1999 2000 rcu_lockdep_assert(rcu_read_lock_sched_held(), 2001 "sched RCU must be held"); 2002 for_each_cpu_and(i, rd->span, cpu_active_mask) 2003 cpus++; 2004 2005 return cpus; 2006} 2007#else 2008inline struct dl_bw *dl_bw_of(int i) 2009{ 2010 return &cpu_rq(i)->dl.dl_bw; 2011} 2012 2013static inline int dl_bw_cpus(int i) 2014{ 2015 return 1; 2016} 2017#endif 2018 2019/* 2020 * We must be sure that accepting a new task (or allowing changing the 2021 * parameters of an existing one) is consistent with the bandwidth 2022 * constraints. If yes, this function also accordingly updates the currently 2023 * allocated bandwidth to reflect the new situation. 2024 * 2025 * This function is called while holding p's rq->lock. 2026 * 2027 * XXX we should delay bw change until the task's 0-lag point, see 2028 * __setparam_dl(). 2029 */ 2030static int dl_overflow(struct task_struct *p, int policy, 2031 const struct sched_attr *attr) 2032{ 2033 2034 struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); 2035 u64 period = attr->sched_period ?: attr->sched_deadline; 2036 u64 runtime = attr->sched_runtime; 2037 u64 new_bw = dl_policy(policy) ? 
to_ratio(period, runtime) : 0; 2038 int cpus, err = -1; 2039 2040 if (new_bw == p->dl.dl_bw) 2041 return 0; 2042 2043 /* 2044 * Either if a task, enters, leave, or stays -deadline but changes 2045 * its parameters, we may need to update accordingly the total 2046 * allocated bandwidth of the container. 2047 */ 2048 raw_spin_lock(&dl_b->lock); 2049 cpus = dl_bw_cpus(task_cpu(p)); 2050 if (dl_policy(policy) && !task_has_dl_policy(p) && 2051 !__dl_overflow(dl_b, cpus, 0, new_bw)) { 2052 __dl_add(dl_b, new_bw); 2053 err = 0; 2054 } else if (dl_policy(policy) && task_has_dl_policy(p) && 2055 !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { 2056 __dl_clear(dl_b, p->dl.dl_bw); 2057 __dl_add(dl_b, new_bw); 2058 err = 0; 2059 } else if (!dl_policy(policy) && task_has_dl_policy(p)) { 2060 __dl_clear(dl_b, p->dl.dl_bw); 2061 err = 0; 2062 } 2063 raw_spin_unlock(&dl_b->lock); 2064 2065 return err; 2066} 2067 2068extern void init_dl_bw(struct dl_bw *dl_b); 2069 2070/* 2071 * wake_up_new_task - wake up a newly created task for the first time. 2072 * 2073 * This function will do some initial scheduler statistics housekeeping 2074 * that must be done for every newly created context, then puts the task 2075 * on the runqueue and wakes it. 2076 */ 2077void wake_up_new_task(struct task_struct *p) 2078{ 2079 unsigned long flags; 2080 struct rq *rq; 2081 2082 raw_spin_lock_irqsave(&p->pi_lock, flags); 2083#ifdef CONFIG_SMP 2084 /* 2085 * Fork balancing, do it here and not earlier because: 2086 * - cpus_allowed can change in the fork path 2087 * - any previously selected cpu might disappear through hotplug 2088 */ 2089 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); 2090#endif 2091 2092 /* Initialize new task's runnable average */ 2093 init_task_runnable_average(p); 2094 rq = __task_rq_lock(p); 2095 activate_task(rq, p, 0); 2096 p->on_rq = TASK_ON_RQ_QUEUED; 2097 trace_sched_wakeup_new(p, true); 2098 check_preempt_curr(rq, p, WF_FORK); 2099#ifdef CONFIG_SMP 2100 if (p->sched_class->task_woken) 2101 p->sched_class->task_woken(rq, p); 2102#endif 2103 task_rq_unlock(rq, p, &flags); 2104} 2105 2106#ifdef CONFIG_PREEMPT_NOTIFIERS 2107 2108/** 2109 * preempt_notifier_register - tell me when current is being preempted & rescheduled 2110 * @notifier: notifier struct to register 2111 */ 2112void preempt_notifier_register(struct preempt_notifier *notifier) 2113{ 2114 hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); 2115} 2116EXPORT_SYMBOL_GPL(preempt_notifier_register); 2117 2118/** 2119 * preempt_notifier_unregister - no longer interested in preemption notifications 2120 * @notifier: notifier struct to unregister 2121 * 2122 * This is safe to call from within a preemption notifier. 
2123 */ 2124void preempt_notifier_unregister(struct preempt_notifier *notifier) 2125{ 2126 hlist_del(¬ifier->link); 2127} 2128EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 2129 2130static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2131{ 2132 struct preempt_notifier *notifier; 2133 2134 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) 2135 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 2136} 2137 2138static void 2139fire_sched_out_preempt_notifiers(struct task_struct *curr, 2140 struct task_struct *next) 2141{ 2142 struct preempt_notifier *notifier; 2143 2144 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) 2145 notifier->ops->sched_out(notifier, next); 2146} 2147 2148#else /* !CONFIG_PREEMPT_NOTIFIERS */ 2149 2150static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2151{ 2152} 2153 2154static void 2155fire_sched_out_preempt_notifiers(struct task_struct *curr, 2156 struct task_struct *next) 2157{ 2158} 2159 2160#endif /* CONFIG_PREEMPT_NOTIFIERS */ 2161 2162/** 2163 * prepare_task_switch - prepare to switch tasks 2164 * @rq: the runqueue preparing to switch 2165 * @prev: the current task that is being switched out 2166 * @next: the task we are going to switch to. 2167 * 2168 * This is called with the rq lock held and interrupts off. It must 2169 * be paired with a subsequent finish_task_switch after the context 2170 * switch. 2171 * 2172 * prepare_task_switch sets up locking and calls architecture specific 2173 * hooks. 2174 */ 2175static inline void 2176prepare_task_switch(struct rq *rq, struct task_struct *prev, 2177 struct task_struct *next) 2178{ 2179 trace_sched_switch(prev, next); 2180 sched_info_switch(rq, prev, next); 2181 perf_event_task_sched_out(prev, next); 2182 fire_sched_out_preempt_notifiers(prev, next); 2183 prepare_lock_switch(rq, next); 2184 prepare_arch_switch(next); 2185} 2186 2187/** 2188 * finish_task_switch - clean up after a task-switch 2189 * @prev: the thread we just switched away from. 2190 * 2191 * finish_task_switch must be called after the context switch, paired 2192 * with a prepare_task_switch call before the context switch. 2193 * finish_task_switch will reconcile locking set up by prepare_task_switch, 2194 * and do any other architecture-specific cleanup actions. 2195 * 2196 * Note that we may have delayed dropping an mm in context_switch(). If 2197 * so, we finish that here outside of the runqueue lock. (Doing it 2198 * with the lock held can cause deadlocks; see schedule() for 2199 * details.) 2200 * 2201 * The context switch have flipped the stack from under us and restored the 2202 * local variables which were saved when this task called schedule() in the 2203 * past. prev == current is still correct but we need to recalculate this_rq 2204 * because prev may have moved to another CPU. 2205 */ 2206static struct rq *finish_task_switch(struct task_struct *prev) 2207 __releases(rq->lock) 2208{ 2209 struct rq *rq = this_rq(); 2210 struct mm_struct *mm = rq->prev_mm; 2211 long prev_state; 2212 2213 rq->prev_mm = NULL; 2214 2215 /* 2216 * A task struct has one reference for the use as "current". 2217 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 2218 * schedule one last time. The schedule call will never return, and 2219 * the scheduled task must drop that reference. 
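	 * That final drop is the put_task_struct(prev) in the TASK_DEAD
	 * branch at the bottom of this function.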
2220 * 2221 * We must observe prev->state before clearing prev->on_cpu (in 2222 * finish_lock_switch), otherwise a concurrent wakeup can get prev 2223 * running on another CPU and we could rave with its RUNNING -> DEAD 2224 * transition, resulting in a double drop. 2225 */ 2226 prev_state = prev->state; 2227 vtime_task_switch(prev); 2228 finish_arch_switch(prev); 2229 perf_event_task_sched_in(prev, current); 2230 finish_lock_switch(rq, prev); 2231 finish_arch_post_lock_switch(); 2232 2233 fire_sched_in_preempt_notifiers(current); 2234 if (mm) 2235 mmdrop(mm); 2236 if (unlikely(prev_state == TASK_DEAD)) { 2237 if (prev->sched_class->task_dead) 2238 prev->sched_class->task_dead(prev); 2239 2240 /* 2241 * Remove function-return probe instances associated with this 2242 * task and put them back on the free list. 2243 */ 2244 kprobe_flush_task(prev); 2245 put_task_struct(prev); 2246 } 2247 2248 tick_nohz_task_switch(current); 2249 return rq; 2250} 2251 2252#ifdef CONFIG_SMP 2253 2254/* rq->lock is NOT held, but preemption is disabled */ 2255static inline void post_schedule(struct rq *rq) 2256{ 2257 if (rq->post_schedule) { 2258 unsigned long flags; 2259 2260 raw_spin_lock_irqsave(&rq->lock, flags); 2261 if (rq->curr->sched_class->post_schedule) 2262 rq->curr->sched_class->post_schedule(rq); 2263 raw_spin_unlock_irqrestore(&rq->lock, flags); 2264 2265 rq->post_schedule = 0; 2266 } 2267} 2268 2269#else 2270 2271static inline void post_schedule(struct rq *rq) 2272{ 2273} 2274 2275#endif 2276 2277/** 2278 * schedule_tail - first thing a freshly forked thread must call. 2279 * @prev: the thread we just switched away from. 2280 */ 2281asmlinkage __visible void schedule_tail(struct task_struct *prev) 2282 __releases(rq->lock) 2283{ 2284 struct rq *rq; 2285 2286 /* finish_task_switch() drops rq->lock and enables preemtion */ 2287 preempt_disable(); 2288 rq = finish_task_switch(prev); 2289 post_schedule(rq); 2290 preempt_enable(); 2291 2292 if (current->set_child_tid) 2293 put_user(task_pid_vnr(current), current->set_child_tid); 2294} 2295 2296/* 2297 * context_switch - switch to the new MM and the new thread's register state. 2298 */ 2299static inline struct rq * 2300context_switch(struct rq *rq, struct task_struct *prev, 2301 struct task_struct *next) 2302{ 2303 struct mm_struct *mm, *oldmm; 2304 2305 prepare_task_switch(rq, prev, next); 2306 2307 mm = next->mm; 2308 oldmm = prev->active_mm; 2309 /* 2310 * For paravirt, this is coupled with an exit in switch_to to 2311 * combine the page table reload and the switch backend into 2312 * one hypercall. 2313 */ 2314 arch_start_context_switch(prev); 2315 2316 if (!mm) { 2317 next->active_mm = oldmm; 2318 atomic_inc(&oldmm->mm_count); 2319 enter_lazy_tlb(oldmm, next); 2320 } else 2321 switch_mm(oldmm, mm, next); 2322 2323 if (!prev->mm) { 2324 prev->active_mm = NULL; 2325 rq->prev_mm = oldmm; 2326 } 2327 /* 2328 * Since the runqueue lock will be released by the next 2329 * task (which is an invalid locking op but in the case 2330 * of the scheduler it's an obvious special-case), so we 2331 * do an early lockdep release here: 2332 */ 2333 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 2334 2335 context_tracking_task_switch(prev, next); 2336 /* Here we just switch the register state and the stack. 
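	 * switch_to() below also hands back, through its third argument,
	 * the task we eventually switch back *from*, so the prev that
	 * finish_task_switch() sees is always the task that ran last.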
*/ 2337 switch_to(prev, next, prev); 2338 barrier(); 2339 2340 return finish_task_switch(prev); 2341} 2342 2343/* 2344 * nr_running and nr_context_switches: 2345 * 2346 * externally visible scheduler statistics: current number of runnable 2347 * threads, total number of context switches performed since bootup. 2348 */ 2349unsigned long nr_running(void) 2350{ 2351 unsigned long i, sum = 0; 2352 2353 for_each_online_cpu(i) 2354 sum += cpu_rq(i)->nr_running; 2355 2356 return sum; 2357} 2358 2359/* 2360 * Check if only the current task is running on the cpu. 2361 * 2362 * Caution: this function does not check that the caller has disabled 2363 * preemption, thus the result might have a time-of-check-to-time-of-use 2364 * race. The caller is responsible to use it correctly, for example: 2365 * 2366 * - from a non-preemptable section (of course) 2367 * 2368 * - from a thread that is bound to a single CPU 2369 * 2370 * - in a loop with very short iterations (e.g. a polling loop) 2371 */ 2372bool single_task_running(void) 2373{ 2374 return raw_rq()->nr_running == 1; 2375} 2376EXPORT_SYMBOL(single_task_running); 2377 2378unsigned long long nr_context_switches(void) 2379{ 2380 int i; 2381 unsigned long long sum = 0; 2382 2383 for_each_possible_cpu(i) 2384 sum += cpu_rq(i)->nr_switches; 2385 2386 return sum; 2387} 2388 2389unsigned long nr_iowait(void) 2390{ 2391 unsigned long i, sum = 0; 2392 2393 for_each_possible_cpu(i) 2394 sum += atomic_read(&cpu_rq(i)->nr_iowait); 2395 2396 return sum; 2397} 2398 2399unsigned long nr_iowait_cpu(int cpu) 2400{ 2401 struct rq *this = cpu_rq(cpu); 2402 return atomic_read(&this->nr_iowait); 2403} 2404 2405void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) 2406{ 2407 struct rq *this = this_rq(); 2408 *nr_waiters = atomic_read(&this->nr_iowait); 2409 *load = this->cpu_load[0]; 2410} 2411 2412#ifdef CONFIG_SMP 2413 2414/* 2415 * sched_exec - execve() is a valuable balancing opportunity, because at 2416 * this point the task has the smallest effective memory and cache footprint. 2417 */ 2418void sched_exec(void) 2419{ 2420 struct task_struct *p = current; 2421 unsigned long flags; 2422 int dest_cpu; 2423 2424 raw_spin_lock_irqsave(&p->pi_lock, flags); 2425 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 2426 if (dest_cpu == smp_processor_id()) 2427 goto unlock; 2428 2429 if (likely(cpu_active(dest_cpu))) { 2430 struct migration_arg arg = { p, dest_cpu }; 2431 2432 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2433 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); 2434 return; 2435 } 2436unlock: 2437 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2438} 2439 2440#endif 2441 2442DEFINE_PER_CPU(struct kernel_stat, kstat); 2443DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); 2444 2445EXPORT_PER_CPU_SYMBOL(kstat); 2446EXPORT_PER_CPU_SYMBOL(kernel_cpustat); 2447 2448/* 2449 * Return accounted runtime for the task. 2450 * In case the task is currently running, return the runtime plus current's 2451 * pending runtime that have not been accounted yet. 2452 */ 2453unsigned long long task_sched_runtime(struct task_struct *p) 2454{ 2455 unsigned long flags; 2456 struct rq *rq; 2457 u64 ns; 2458 2459#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) 2460 /* 2461 * 64-bit doesn't need locks to atomically read a 64bit value. 2462 * So we have a optimization chance when the task's delta_exec is 0. 2463 * Reading ->on_cpu is racy, but this is ok. 2464 * 2465 * If we race with it leaving cpu, we'll take a lock. So we're correct. 
2466 * If we race with it entering cpu, unaccounted time is 0. This is 2467 * indistinguishable from the read occurring a few cycles earlier. 2468 * If we see ->on_cpu without ->on_rq, the task is leaving, and has 2469 * been accounted, so we're correct here as well. 2470 */ 2471 if (!p->on_cpu || !task_on_rq_queued(p)) 2472 return p->se.sum_exec_runtime; 2473#endif 2474 2475 rq = task_rq_lock(p, &flags); 2476 /* 2477 * Must be ->curr _and_ ->on_rq. If dequeued, we would 2478 * project cycles that may never be accounted to this 2479 * thread, breaking clock_gettime(). 2480 */ 2481 if (task_current(rq, p) && task_on_rq_queued(p)) { 2482 update_rq_clock(rq); 2483 p->sched_class->update_curr(rq); 2484 } 2485 ns = p->se.sum_exec_runtime; 2486 task_rq_unlock(rq, p, &flags); 2487 2488 return ns; 2489} 2490 2491/* 2492 * This function gets called by the timer code, with HZ frequency. 2493 * We call it with interrupts disabled. 2494 */ 2495void scheduler_tick(void) 2496{ 2497 int cpu = smp_processor_id(); 2498 struct rq *rq = cpu_rq(cpu); 2499 struct task_struct *curr = rq->curr; 2500 2501 sched_clock_tick(); 2502 2503 raw_spin_lock(&rq->lock); 2504 update_rq_clock(rq); 2505 curr->sched_class->task_tick(rq, curr, 0); 2506 update_cpu_load_active(rq); 2507 raw_spin_unlock(&rq->lock); 2508 2509 perf_event_task_tick(); 2510 2511#ifdef CONFIG_SMP 2512 rq->idle_balance = idle_cpu(cpu); 2513 trigger_load_balance(rq); 2514#endif 2515 rq_last_tick_reset(rq); 2516} 2517 2518#ifdef CONFIG_NO_HZ_FULL 2519/** 2520 * scheduler_tick_max_deferment 2521 * 2522 * Keep at least one tick per second when a single 2523 * active task is running because the scheduler doesn't 2524 * yet completely support full dynticks environment. 2525 * 2526 * This makes sure that uptime, CFS vruntime, load 2527 * balancing, etc... continue to move forward, even 2528 * with a very low granularity. 2529 * 2530 * Return: Maximum deferment in nanoseconds. 2531 */ 2532u64 scheduler_tick_max_deferment(void) 2533{ 2534 struct rq *rq = this_rq(); 2535 unsigned long next, now = ACCESS_ONCE(jiffies); 2536 2537 next = rq->last_sched_tick + HZ; 2538 2539 if (time_before_eq(next, now)) 2540 return 0; 2541 2542 return jiffies_to_nsecs(next - now); 2543} 2544#endif 2545 2546notrace unsigned long get_parent_ip(unsigned long addr) 2547{ 2548 if (in_lock_functions(addr)) { 2549 addr = CALLER_ADDR2; 2550 if (in_lock_functions(addr)) 2551 addr = CALLER_ADDR3; 2552 } 2553 return addr; 2554} 2555 2556#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 2557 defined(CONFIG_PREEMPT_TRACER)) 2558 2559void preempt_count_add(int val) 2560{ 2561#ifdef CONFIG_DEBUG_PREEMPT 2562 /* 2563 * Underflow? 2564 */ 2565 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 2566 return; 2567#endif 2568 __preempt_count_add(val); 2569#ifdef CONFIG_DEBUG_PREEMPT 2570 /* 2571 * Spinlock count overflowing soon? 2572 */ 2573 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 2574 PREEMPT_MASK - 10); 2575#endif 2576 if (preempt_count() == val) { 2577 unsigned long ip = get_parent_ip(CALLER_ADDR1); 2578#ifdef CONFIG_DEBUG_PREEMPT 2579 current->preempt_disable_ip = ip; 2580#endif 2581 trace_preempt_off(CALLER_ADDR0, ip); 2582 } 2583} 2584EXPORT_SYMBOL(preempt_count_add); 2585NOKPROBE_SYMBOL(preempt_count_add); 2586 2587void preempt_count_sub(int val) 2588{ 2589#ifdef CONFIG_DEBUG_PREEMPT 2590 /* 2591 * Underflow? 2592 */ 2593 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 2594 return; 2595 /* 2596 * Is the spinlock portion underflowing? 
2597 */ 2598 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 2599 !(preempt_count() & PREEMPT_MASK))) 2600 return; 2601#endif 2602 2603 if (preempt_count() == val) 2604 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2605 __preempt_count_sub(val); 2606} 2607EXPORT_SYMBOL(preempt_count_sub); 2608NOKPROBE_SYMBOL(preempt_count_sub); 2609 2610#endif 2611 2612/* 2613 * Print scheduling while atomic bug: 2614 */ 2615static noinline void __schedule_bug(struct task_struct *prev) 2616{ 2617 if (oops_in_progress) 2618 return; 2619 2620 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 2621 prev->comm, prev->pid, preempt_count()); 2622 2623 debug_show_held_locks(prev); 2624 print_modules(); 2625 if (irqs_disabled()) 2626 print_irqtrace_events(prev); 2627#ifdef CONFIG_DEBUG_PREEMPT 2628 if (in_atomic_preempt_off()) { 2629 pr_err("Preemption disabled at:"); 2630 print_ip_sym(current->preempt_disable_ip); 2631 pr_cont("\n"); 2632 } 2633#endif 2634 dump_stack(); 2635 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 2636} 2637 2638/* 2639 * Various schedule()-time debugging checks and statistics: 2640 */ 2641static inline void schedule_debug(struct task_struct *prev) 2642{ 2643#ifdef CONFIG_SCHED_STACK_END_CHECK 2644 BUG_ON(unlikely(task_stack_end_corrupted(prev))); 2645#endif 2646 /* 2647 * Test if we are atomic. Since do_exit() needs to call into 2648 * schedule() atomically, we ignore that path. Otherwise whine 2649 * if we are scheduling when we should not. 2650 */ 2651 if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD)) 2652 __schedule_bug(prev); 2653 rcu_sleep_check(); 2654 2655 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 2656 2657 schedstat_inc(this_rq(), sched_count); 2658} 2659 2660/* 2661 * Pick up the highest-prio task: 2662 */ 2663static inline struct task_struct * 2664pick_next_task(struct rq *rq, struct task_struct *prev) 2665{ 2666 const struct sched_class *class = &fair_sched_class; 2667 struct task_struct *p; 2668 2669 /* 2670 * Optimization: we know that if all tasks are in 2671 * the fair class we can call that function directly: 2672 */ 2673 if (likely(prev->sched_class == class && 2674 rq->nr_running == rq->cfs.h_nr_running)) { 2675 p = fair_sched_class.pick_next_task(rq, prev); 2676 if (unlikely(p == RETRY_TASK)) 2677 goto again; 2678 2679 /* assumes fair_sched_class->next == idle_sched_class */ 2680 if (unlikely(!p)) 2681 p = idle_sched_class.pick_next_task(rq, prev); 2682 2683 return p; 2684 } 2685 2686again: 2687 for_each_class(class) { 2688 p = class->pick_next_task(rq, prev); 2689 if (p) { 2690 if (unlikely(p == RETRY_TASK)) 2691 goto again; 2692 return p; 2693 } 2694 } 2695 2696 BUG(); /* the idle class will always have a runnable task */ 2697} 2698 2699/* 2700 * __schedule() is the main scheduler function. 2701 * 2702 * The main means of driving the scheduler and thus entering this function are: 2703 * 2704 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. 2705 * 2706 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return 2707 * paths. For example, see arch/x86/entry_64.S. 2708 * 2709 * To drive preemption between tasks, the scheduler sets the flag in timer 2710 * interrupt handler scheduler_tick(). 2711 * 2712 * 3. Wakeups don't really cause entry into schedule(). They add a 2713 * task to the run-queue and that's it. 
2714 * 2715 * Now, if the new task added to the run-queue preempts the current 2716 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets 2717 * called on the nearest possible occasion: 2718 * 2719 * - If the kernel is preemptible (CONFIG_PREEMPT=y): 2720 * 2721 * - in syscall or exception context, at the next outmost 2722 * preempt_enable(). (this might be as soon as the wake_up()'s 2723 * spin_unlock()!) 2724 * 2725 * - in IRQ context, return from interrupt-handler to 2726 * preemptible context 2727 * 2728 * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) 2729 * then at the next: 2730 * 2731 * - cond_resched() call 2732 * - explicit schedule() call 2733 * - return from syscall or exception to user-space 2734 * - return from interrupt-handler to user-space 2735 * 2736 * WARNING: all callers must re-check need_resched() afterward and reschedule 2737 * accordingly in case an event triggered the need for rescheduling (such as 2738 * an interrupt waking up a task) while preemption was disabled in __schedule(). 2739 */ 2740static void __sched __schedule(void) 2741{ 2742 struct task_struct *prev, *next; 2743 unsigned long *switch_count; 2744 struct rq *rq; 2745 int cpu; 2746 2747 preempt_disable(); 2748 cpu = smp_processor_id(); 2749 rq = cpu_rq(cpu); 2750 rcu_note_context_switch(); 2751 prev = rq->curr; 2752 2753 schedule_debug(prev); 2754 2755 if (sched_feat(HRTICK)) 2756 hrtick_clear(rq); 2757 2758 /* 2759 * Make sure that signal_pending_state()->signal_pending() below 2760 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) 2761 * done by the caller to avoid the race with signal_wake_up(). 2762 */ 2763 smp_mb__before_spinlock(); 2764 raw_spin_lock_irq(&rq->lock); 2765 2766 rq->clock_skip_update <<= 1; /* promote REQ to ACT */ 2767 2768 switch_count = &prev->nivcsw; 2769 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 2770 if (unlikely(signal_pending_state(prev->state, prev))) { 2771 prev->state = TASK_RUNNING; 2772 } else { 2773 deactivate_task(rq, prev, DEQUEUE_SLEEP); 2774 prev->on_rq = 0; 2775 2776 /* 2777 * If a worker went to sleep, notify and ask workqueue 2778 * whether it wants to wake up a task to maintain 2779 * concurrency. 2780 */ 2781 if (prev->flags & PF_WQ_WORKER) { 2782 struct task_struct *to_wakeup; 2783 2784 to_wakeup = wq_worker_sleeping(prev, cpu); 2785 if (to_wakeup) 2786 try_to_wake_up_local(to_wakeup); 2787 } 2788 } 2789 switch_count = &prev->nvcsw; 2790 } 2791 2792 if (task_on_rq_queued(prev)) 2793 update_rq_clock(rq); 2794 2795 next = pick_next_task(rq, prev); 2796 clear_tsk_need_resched(prev); 2797 clear_preempt_need_resched(); 2798 rq->clock_skip_update = 0; 2799 2800 if (likely(prev != next)) { 2801 rq->nr_switches++; 2802 rq->curr = next; 2803 ++*switch_count; 2804 2805 rq = context_switch(rq, prev, next); /* unlocks the rq */ 2806 cpu = cpu_of(rq); 2807 } else 2808 raw_spin_unlock_irq(&rq->lock); 2809 2810 post_schedule(rq); 2811 2812 sched_preempt_enable_no_resched(); 2813} 2814 2815static inline void sched_submit_work(struct task_struct *tsk) 2816{ 2817 if (!tsk->state || tsk_is_pi_blocked(tsk)) 2818 return; 2819 /* 2820 * If we are going to sleep and we have plugged IO queued, 2821 * make sure to submit it to avoid deadlocks. 
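	 * (e.g. a task that blocks waiting for I/O it has itself left
	 * sitting in its block plug would otherwise never be woken).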
2822 */ 2823 if (blk_needs_flush_plug(tsk)) 2824 blk_schedule_flush_plug(tsk); 2825} 2826 2827asmlinkage __visible void __sched schedule(void) 2828{ 2829 struct task_struct *tsk = current; 2830 2831 sched_submit_work(tsk); 2832 do { 2833 __schedule(); 2834 } while (need_resched()); 2835} 2836EXPORT_SYMBOL(schedule); 2837 2838#ifdef CONFIG_CONTEXT_TRACKING 2839asmlinkage __visible void __sched schedule_user(void) 2840{ 2841 /* 2842 * If we come here after a random call to set_need_resched(), 2843 * or we have been woken up remotely but the IPI has not yet arrived, 2844 * we haven't yet exited the RCU idle mode. Do it here manually until 2845 * we find a better solution. 2846 * 2847 * NB: There are buggy callers of this function. Ideally we 2848 * should warn if prev_state != CONTEXT_USER, but that will trigger 2849 * too frequently to make sense yet. 2850 */ 2851 enum ctx_state prev_state = exception_enter(); 2852 schedule(); 2853 exception_exit(prev_state); 2854} 2855#endif 2856 2857/** 2858 * schedule_preempt_disabled - called with preemption disabled 2859 * 2860 * Returns with preemption disabled. Note: preempt_count must be 1 2861 */ 2862void __sched schedule_preempt_disabled(void) 2863{ 2864 sched_preempt_enable_no_resched(); 2865 schedule(); 2866 preempt_disable(); 2867} 2868 2869static void __sched notrace preempt_schedule_common(void) 2870{ 2871 do { 2872 __preempt_count_add(PREEMPT_ACTIVE); 2873 __schedule(); 2874 __preempt_count_sub(PREEMPT_ACTIVE); 2875 2876 /* 2877 * Check again in case we missed a preemption opportunity 2878 * between schedule and now. 2879 */ 2880 barrier(); 2881 } while (need_resched()); 2882} 2883 2884#ifdef CONFIG_PREEMPT 2885/* 2886 * this is the entry point to schedule() from in-kernel preemption 2887 * off of preempt_enable. Kernel preemptions off return from interrupt 2888 * occur there and call schedule directly. 2889 */ 2890asmlinkage __visible void __sched notrace preempt_schedule(void) 2891{ 2892 /* 2893 * If there is a non-zero preempt_count or interrupts are disabled, 2894 * we do not want to preempt the current task. Just return.. 2895 */ 2896 if (likely(!preemptible())) 2897 return; 2898 2899 preempt_schedule_common(); 2900} 2901NOKPROBE_SYMBOL(preempt_schedule); 2902EXPORT_SYMBOL(preempt_schedule); 2903 2904#ifdef CONFIG_CONTEXT_TRACKING 2905/** 2906 * preempt_schedule_context - preempt_schedule called by tracing 2907 * 2908 * The tracing infrastructure uses preempt_enable_notrace to prevent 2909 * recursion and tracing preempt enabling caused by the tracing 2910 * infrastructure itself. But as tracing can happen in areas coming 2911 * from userspace or just about to enter userspace, a preempt enable 2912 * can occur before user_exit() is called. This will cause the scheduler 2913 * to be called when the system is still in usermode. 2914 * 2915 * To prevent this, the preempt_enable_notrace will use this function 2916 * instead of preempt_schedule() to exit user context if needed before 2917 * calling the scheduler. 2918 */ 2919asmlinkage __visible void __sched notrace preempt_schedule_context(void) 2920{ 2921 enum ctx_state prev_ctx; 2922 2923 if (likely(!preemptible())) 2924 return; 2925 2926 do { 2927 __preempt_count_add(PREEMPT_ACTIVE); 2928 /* 2929 * Needs preempt disabled in case user_exit() is traced 2930 * and the tracer calls preempt_enable_notrace() causing 2931 * an infinite recursion. 
2932 */ 2933 prev_ctx = exception_enter(); 2934 __schedule(); 2935 exception_exit(prev_ctx); 2936 2937 __preempt_count_sub(PREEMPT_ACTIVE); 2938 barrier(); 2939 } while (need_resched()); 2940} 2941EXPORT_SYMBOL_GPL(preempt_schedule_context); 2942#endif /* CONFIG_CONTEXT_TRACKING */ 2943 2944#endif /* CONFIG_PREEMPT */ 2945 2946/* 2947 * this is the entry point to schedule() from kernel preemption 2948 * off of irq context. 2949 * Note, that this is called and return with irqs disabled. This will 2950 * protect us against recursive calling from irq. 2951 */ 2952asmlinkage __visible void __sched preempt_schedule_irq(void) 2953{ 2954 enum ctx_state prev_state; 2955 2956 /* Catch callers which need to be fixed */ 2957 BUG_ON(preempt_count() || !irqs_disabled()); 2958 2959 prev_state = exception_enter(); 2960 2961 do { 2962 __preempt_count_add(PREEMPT_ACTIVE); 2963 local_irq_enable(); 2964 __schedule(); 2965 local_irq_disable(); 2966 __preempt_count_sub(PREEMPT_ACTIVE); 2967 2968 /* 2969 * Check again in case we missed a preemption opportunity 2970 * between schedule and now. 2971 */ 2972 barrier(); 2973 } while (need_resched()); 2974 2975 exception_exit(prev_state); 2976} 2977 2978int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 2979 void *key) 2980{ 2981 return try_to_wake_up(curr->private, mode, wake_flags); 2982} 2983EXPORT_SYMBOL(default_wake_function); 2984 2985#ifdef CONFIG_RT_MUTEXES 2986 2987/* 2988 * rt_mutex_setprio - set the current priority of a task 2989 * @p: task 2990 * @prio: prio value (kernel-internal form) 2991 * 2992 * This function changes the 'effective' priority of a task. It does 2993 * not touch ->normal_prio like __setscheduler(). 2994 * 2995 * Used by the rt_mutex code to implement priority inheritance 2996 * logic. Call site only calls if the priority of the task changed. 2997 */ 2998void rt_mutex_setprio(struct task_struct *p, int prio) 2999{ 3000 int oldprio, queued, running, enqueue_flag = 0; 3001 struct rq *rq; 3002 const struct sched_class *prev_class; 3003 3004 BUG_ON(prio > MAX_PRIO); 3005 3006 rq = __task_rq_lock(p); 3007 3008 /* 3009 * Idle task boosting is a nono in general. There is one 3010 * exception, when PREEMPT_RT and NOHZ is active: 3011 * 3012 * The idle task calls get_next_timer_interrupt() and holds 3013 * the timer wheel base->lock on the CPU and another CPU wants 3014 * to access the timer (probably to cancel it). We can safely 3015 * ignore the boosting request, as the idle CPU runs this code 3016 * with interrupts disabled and will complete the lock 3017 * protected section without being interrupted. So there is no 3018 * real need to boost. 3019 */ 3020 if (unlikely(p == rq->idle)) { 3021 WARN_ON(p != rq->curr); 3022 WARN_ON(p->pi_blocked_on); 3023 goto out_unlock; 3024 } 3025 3026 trace_sched_pi_setprio(p, prio); 3027 oldprio = p->prio; 3028 prev_class = p->sched_class; 3029 queued = task_on_rq_queued(p); 3030 running = task_current(rq, p); 3031 if (queued) 3032 dequeue_task(rq, p, 0); 3033 if (running) 3034 put_prev_task(rq, p); 3035 3036 /* 3037 * Boosting condition are: 3038 * 1. -rt task is running and holds mutex A 3039 * --> -dl task blocks on mutex A 3040 * 3041 * 2. 
-dl task is running and holds mutex A 3042 * --> -dl task blocks on mutex A and could preempt the 3043 * running task 3044 */ 3045 if (dl_prio(prio)) { 3046 struct task_struct *pi_task = rt_mutex_get_top_task(p); 3047 if (!dl_prio(p->normal_prio) || 3048 (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { 3049 p->dl.dl_boosted = 1; 3050 p->dl.dl_throttled = 0; 3051 enqueue_flag = ENQUEUE_REPLENISH; 3052 } else 3053 p->dl.dl_boosted = 0; 3054 p->sched_class = &dl_sched_class; 3055 } else if (rt_prio(prio)) { 3056 if (dl_prio(oldprio)) 3057 p->dl.dl_boosted = 0; 3058 if (oldprio < prio) 3059 enqueue_flag = ENQUEUE_HEAD; 3060 p->sched_class = &rt_sched_class; 3061 } else { 3062 if (dl_prio(oldprio)) 3063 p->dl.dl_boosted = 0; 3064 if (rt_prio(oldprio)) 3065 p->rt.timeout = 0; 3066 p->sched_class = &fair_sched_class; 3067 } 3068 3069 p->prio = prio; 3070 3071 if (running) 3072 p->sched_class->set_curr_task(rq); 3073 if (queued) 3074 enqueue_task(rq, p, enqueue_flag); 3075 3076 check_class_changed(rq, p, prev_class, oldprio); 3077out_unlock: 3078 __task_rq_unlock(rq); 3079} 3080#endif 3081 3082void set_user_nice(struct task_struct *p, long nice) 3083{ 3084 int old_prio, delta, queued; 3085 unsigned long flags; 3086 struct rq *rq; 3087 3088 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) 3089 return; 3090 /* 3091 * We have to be careful, if called from sys_setpriority(), 3092 * the task might be in the middle of scheduling on another CPU. 3093 */ 3094 rq = task_rq_lock(p, &flags); 3095 /* 3096 * The RT priorities are set via sched_setscheduler(), but we still 3097 * allow the 'normal' nice value to be set - but as expected 3098 * it wont have any effect on scheduling until the task is 3099 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: 3100 */ 3101 if (task_has_dl_policy(p) || task_has_rt_policy(p)) { 3102 p->static_prio = NICE_TO_PRIO(nice); 3103 goto out_unlock; 3104 } 3105 queued = task_on_rq_queued(p); 3106 if (queued) 3107 dequeue_task(rq, p, 0); 3108 3109 p->static_prio = NICE_TO_PRIO(nice); 3110 set_load_weight(p); 3111 old_prio = p->prio; 3112 p->prio = effective_prio(p); 3113 delta = p->prio - old_prio; 3114 3115 if (queued) { 3116 enqueue_task(rq, p, 0); 3117 /* 3118 * If the task increased its priority or is running and 3119 * lowered its priority, then reschedule its CPU: 3120 */ 3121 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3122 resched_curr(rq); 3123 } 3124out_unlock: 3125 task_rq_unlock(rq, p, &flags); 3126} 3127EXPORT_SYMBOL(set_user_nice); 3128 3129/* 3130 * can_nice - check if a task can reduce its nice value 3131 * @p: task 3132 * @nice: nice value 3133 */ 3134int can_nice(const struct task_struct *p, const int nice) 3135{ 3136 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3137 int nice_rlim = nice_to_rlimit(nice); 3138 3139 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 3140 capable(CAP_SYS_NICE)); 3141} 3142 3143#ifdef __ARCH_WANT_SYS_NICE 3144 3145/* 3146 * sys_nice - change the priority of the current process. 3147 * @increment: priority increment 3148 * 3149 * sys_setpriority is a more generic, but much slower function that 3150 * does similar things. 3151 */ 3152SYSCALL_DEFINE1(nice, int, increment) 3153{ 3154 long nice, retval; 3155 3156 /* 3157 * Setpriority might change our priority at the same moment. 3158 * We don't have to worry. Conceptually one call occurs first 3159 * and we have a single winner. 
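	 *
	 * Worked example of the clamping below: with a current nice of 10
	 * and an increment of +15, the increment survives the first clamp
	 * (it is within +/-NICE_WIDTH), and the resulting nice of 25 is
	 * then clamped to MAX_NICE (19) before being applied.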
3160 */ 3161 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); 3162 nice = task_nice(current) + increment; 3163 3164 nice = clamp_val(nice, MIN_NICE, MAX_NICE); 3165 if (increment < 0 && !can_nice(current, nice)) 3166 return -EPERM; 3167 3168 retval = security_task_setnice(current, nice); 3169 if (retval) 3170 return retval; 3171 3172 set_user_nice(current, nice); 3173 return 0; 3174} 3175 3176#endif 3177 3178/** 3179 * task_prio - return the priority value of a given task. 3180 * @p: the task in question. 3181 * 3182 * Return: The priority value as seen by users in /proc. 3183 * RT tasks are offset by -200. Normal tasks are centered 3184 * around 0, value goes from -16 to +15. 3185 */ 3186int task_prio(const struct task_struct *p) 3187{ 3188 return p->prio - MAX_RT_PRIO; 3189} 3190 3191/** 3192 * idle_cpu - is a given cpu idle currently? 3193 * @cpu: the processor in question. 3194 * 3195 * Return: 1 if the CPU is currently idle. 0 otherwise. 3196 */ 3197int idle_cpu(int cpu) 3198{ 3199 struct rq *rq = cpu_rq(cpu); 3200 3201 if (rq->curr != rq->idle) 3202 return 0; 3203 3204 if (rq->nr_running) 3205 return 0; 3206 3207#ifdef CONFIG_SMP 3208 if (!llist_empty(&rq->wake_list)) 3209 return 0; 3210#endif 3211 3212 return 1; 3213} 3214 3215/** 3216 * idle_task - return the idle task for a given cpu. 3217 * @cpu: the processor in question. 3218 * 3219 * Return: The idle task for the cpu @cpu. 3220 */ 3221struct task_struct *idle_task(int cpu) 3222{ 3223 return cpu_rq(cpu)->idle; 3224} 3225 3226/** 3227 * find_process_by_pid - find a process with a matching PID value. 3228 * @pid: the pid in question. 3229 * 3230 * The task of @pid, if found. %NULL otherwise. 3231 */ 3232static struct task_struct *find_process_by_pid(pid_t pid) 3233{ 3234 return pid ? find_task_by_vpid(pid) : current; 3235} 3236 3237/* 3238 * This function initializes the sched_dl_entity of a newly becoming 3239 * SCHED_DEADLINE task. 3240 * 3241 * Only the static values are considered here, the actual runtime and the 3242 * absolute deadline will be properly calculated when the task is enqueued 3243 * for the first time with its new policy. 3244 */ 3245static void 3246__setparam_dl(struct task_struct *p, const struct sched_attr *attr) 3247{ 3248 struct sched_dl_entity *dl_se = &p->dl; 3249 3250 dl_se->dl_runtime = attr->sched_runtime; 3251 dl_se->dl_deadline = attr->sched_deadline; 3252 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; 3253 dl_se->flags = attr->sched_flags; 3254 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); 3255 3256 /* 3257 * Changing the parameters of a task is 'tricky' and we're not doing 3258 * the correct thing -- also see task_dead_dl() and switched_from_dl(). 3259 * 3260 * What we SHOULD do is delay the bandwidth release until the 0-lag 3261 * point. This would include retaining the task_struct until that time 3262 * and change dl_overflow() to not immediately decrement the current 3263 * amount. 3264 * 3265 * Instead we retain the current runtime/deadline and let the new 3266 * parameters take effect after the current reservation period lapses. 3267 * This is safe (albeit pessimistic) because the 0-lag point is always 3268 * before the current scheduling deadline. 3269 * 3270 * We can still have temporary overloads because we do not delay the 3271 * change in bandwidth until that time; so admission control is 3272 * not on the safe side. It does however guarantee tasks will never 3273 * consume more than promised. 
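	 *
	 * As a worked example of the dl_bw computed above: a task asking
	 * for sched_runtime = 10ms over sched_period = 100ms gets
	 * dl_bw = to_ratio(100ms, 10ms) = (10^7 << 20) / 10^8 = 104857,
	 * i.e. roughly 10% of the 1 << 20 full-utilization scale.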
3274 */ 3275} 3276 3277/* 3278 * sched_setparam() passes in -1 for its policy, to let the functions 3279 * it calls know not to change it. 3280 */ 3281#define SETPARAM_POLICY -1 3282 3283static void __setscheduler_params(struct task_struct *p, 3284 const struct sched_attr *attr) 3285{ 3286 int policy = attr->sched_policy; 3287 3288 if (policy == SETPARAM_POLICY) 3289 policy = p->policy; 3290 3291 p->policy = policy; 3292 3293 if (dl_policy(policy)) 3294 __setparam_dl(p, attr); 3295 else if (fair_policy(policy)) 3296 p->static_prio = NICE_TO_PRIO(attr->sched_nice); 3297 3298 /* 3299 * __sched_setscheduler() ensures attr->sched_priority == 0 when 3300 * !rt_policy. Always setting this ensures that things like 3301 * getparam()/getattr() don't report silly values for !rt tasks. 3302 */ 3303 p->rt_priority = attr->sched_priority; 3304 p->normal_prio = normal_prio(p); 3305 set_load_weight(p); 3306} 3307 3308/* Actually do priority change: must hold pi & rq lock. */ 3309static void __setscheduler(struct rq *rq, struct task_struct *p, 3310 const struct sched_attr *attr, bool keep_boost) 3311{ 3312 __setscheduler_params(p, attr); 3313 3314 /* 3315 * Keep a potential priority boosting if called from 3316 * sched_setscheduler(). 3317 */ 3318 if (keep_boost) 3319 p->prio = rt_mutex_get_effective_prio(p, normal_prio(p)); 3320 else 3321 p->prio = normal_prio(p); 3322 3323 if (dl_prio(p->prio)) 3324 p->sched_class = &dl_sched_class; 3325 else if (rt_prio(p->prio)) 3326 p->sched_class = &rt_sched_class; 3327 else 3328 p->sched_class = &fair_sched_class; 3329} 3330 3331static void 3332__getparam_dl(struct task_struct *p, struct sched_attr *attr) 3333{ 3334 struct sched_dl_entity *dl_se = &p->dl; 3335 3336 attr->sched_priority = p->rt_priority; 3337 attr->sched_runtime = dl_se->dl_runtime; 3338 attr->sched_deadline = dl_se->dl_deadline; 3339 attr->sched_period = dl_se->dl_period; 3340 attr->sched_flags = dl_se->flags; 3341} 3342 3343/* 3344 * This function validates the new parameters of a -deadline task. 3345 * We ask for the deadline not being zero, and greater or equal 3346 * than the runtime, as well as the period of being zero or 3347 * greater than deadline. Furthermore, we have to be sure that 3348 * user parameters are above the internal resolution of 1us (we 3349 * check sched_runtime only since it is always the smaller one) and 3350 * below 2^63 ns (we have to check both sched_deadline and 3351 * sched_period, as the latter can be zero). 3352 */ 3353static bool 3354__checkparam_dl(const struct sched_attr *attr) 3355{ 3356 /* deadline != 0 */ 3357 if (attr->sched_deadline == 0) 3358 return false; 3359 3360 /* 3361 * Since we truncate DL_SCALE bits, make sure we're at least 3362 * that big. 3363 */ 3364 if (attr->sched_runtime < (1ULL << DL_SCALE)) 3365 return false; 3366 3367 /* 3368 * Since we use the MSB for wrap-around and sign issues, make 3369 * sure it's not set (mind that period can be equal to zero). 
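	 *
	 * Taken together with the other checks in this function, e.g.
	 * runtime = 5ms, deadline = 10ms, period = 100ms is accepted:
	 * the runtime is above the internal resolution (1ULL << DL_SCALE),
	 * no MSB is set, and runtime <= deadline <= period holds.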
3370 */ 3371 if (attr->sched_deadline & (1ULL << 63) || 3372 attr->sched_period & (1ULL << 63)) 3373 return false; 3374 3375 /* runtime <= deadline <= period (if period != 0) */ 3376 if ((attr->sched_period != 0 && 3377 attr->sched_period < attr->sched_deadline) || 3378 attr->sched_deadline < attr->sched_runtime) 3379 return false; 3380 3381 return true; 3382} 3383 3384/* 3385 * check the target process has a UID that matches the current process's 3386 */ 3387static bool check_same_owner(struct task_struct *p) 3388{ 3389 const struct cred *cred = current_cred(), *pcred; 3390 bool match; 3391 3392 rcu_read_lock(); 3393 pcred = __task_cred(p); 3394 match = (uid_eq(cred->euid, pcred->euid) || 3395 uid_eq(cred->euid, pcred->uid)); 3396 rcu_read_unlock(); 3397 return match; 3398} 3399 3400static bool dl_param_changed(struct task_struct *p, 3401 const struct sched_attr *attr) 3402{ 3403 struct sched_dl_entity *dl_se = &p->dl; 3404 3405 if (dl_se->dl_runtime != attr->sched_runtime || 3406 dl_se->dl_deadline != attr->sched_deadline || 3407 dl_se->dl_period != attr->sched_period || 3408 dl_se->flags != attr->sched_flags) 3409 return true; 3410 3411 return false; 3412} 3413 3414static int __sched_setscheduler(struct task_struct *p, 3415 const struct sched_attr *attr, 3416 bool user) 3417{ 3418 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : 3419 MAX_RT_PRIO - 1 - attr->sched_priority; 3420 int retval, oldprio, oldpolicy = -1, queued, running; 3421 int new_effective_prio, policy = attr->sched_policy; 3422 unsigned long flags; 3423 const struct sched_class *prev_class; 3424 struct rq *rq; 3425 int reset_on_fork; 3426 3427 /* may grab non-irq protected spin_locks */ 3428 BUG_ON(in_interrupt()); 3429recheck: 3430 /* double check policy once rq lock held */ 3431 if (policy < 0) { 3432 reset_on_fork = p->sched_reset_on_fork; 3433 policy = oldpolicy = p->policy; 3434 } else { 3435 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); 3436 3437 if (policy != SCHED_DEADLINE && 3438 policy != SCHED_FIFO && policy != SCHED_RR && 3439 policy != SCHED_NORMAL && policy != SCHED_BATCH && 3440 policy != SCHED_IDLE) 3441 return -EINVAL; 3442 } 3443 3444 if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) 3445 return -EINVAL; 3446 3447 /* 3448 * Valid priorities for SCHED_FIFO and SCHED_RR are 3449 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 3450 * SCHED_BATCH and SCHED_IDLE is 0. 
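	 *
	 * For instance, SCHED_FIFO with sched_priority = 50 passes the
	 * checks below, while SCHED_NORMAL is only accepted with
	 * sched_priority == 0 (rt_policy() must agree with the priority
	 * being non-zero).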
3451 */ 3452 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || 3453 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) 3454 return -EINVAL; 3455 if ((dl_policy(policy) && !__checkparam_dl(attr)) || 3456 (rt_policy(policy) != (attr->sched_priority != 0))) 3457 return -EINVAL; 3458 3459 /* 3460 * Allow unprivileged RT tasks to decrease priority: 3461 */ 3462 if (user && !capable(CAP_SYS_NICE)) { 3463 if (fair_policy(policy)) { 3464 if (attr->sched_nice < task_nice(p) && 3465 !can_nice(p, attr->sched_nice)) 3466 return -EPERM; 3467 } 3468 3469 if (rt_policy(policy)) { 3470 unsigned long rlim_rtprio = 3471 task_rlimit(p, RLIMIT_RTPRIO); 3472 3473 /* can't set/change the rt policy */ 3474 if (policy != p->policy && !rlim_rtprio) 3475 return -EPERM; 3476 3477 /* can't increase priority */ 3478 if (attr->sched_priority > p->rt_priority && 3479 attr->sched_priority > rlim_rtprio) 3480 return -EPERM; 3481 } 3482 3483 /* 3484 * Can't set/change SCHED_DEADLINE policy at all for now 3485 * (safest behavior); in the future we would like to allow 3486 * unprivileged DL tasks to increase their relative deadline 3487 * or reduce their runtime (both ways reducing utilization) 3488 */ 3489 if (dl_policy(policy)) 3490 return -EPERM; 3491 3492 /* 3493 * Treat SCHED_IDLE as nice 20. Only allow a switch to 3494 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 3495 */ 3496 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { 3497 if (!can_nice(p, task_nice(p))) 3498 return -EPERM; 3499 } 3500 3501 /* can't change other user's priorities */ 3502 if (!check_same_owner(p)) 3503 return -EPERM; 3504 3505 /* Normal users shall not reset the sched_reset_on_fork flag */ 3506 if (p->sched_reset_on_fork && !reset_on_fork) 3507 return -EPERM; 3508 } 3509 3510 if (user) { 3511 retval = security_task_setscheduler(p); 3512 if (retval) 3513 return retval; 3514 } 3515 3516 /* 3517 * make sure no PI-waiters arrive (or leave) while we are 3518 * changing the priority of the task: 3519 * 3520 * To be able to change p->policy safely, the appropriate 3521 * runqueue lock must be held. 3522 */ 3523 rq = task_rq_lock(p, &flags); 3524 3525 /* 3526 * Changing the policy of the stop threads its a very bad idea 3527 */ 3528 if (p == rq->stop) { 3529 task_rq_unlock(rq, p, &flags); 3530 return -EINVAL; 3531 } 3532 3533 /* 3534 * If not changing anything there's no need to proceed further, 3535 * but store a possible modification of reset_on_fork. 3536 */ 3537 if (unlikely(policy == p->policy)) { 3538 if (fair_policy(policy) && attr->sched_nice != task_nice(p)) 3539 goto change; 3540 if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 3541 goto change; 3542 if (dl_policy(policy) && dl_param_changed(p, attr)) 3543 goto change; 3544 3545 p->sched_reset_on_fork = reset_on_fork; 3546 task_rq_unlock(rq, p, &flags); 3547 return 0; 3548 } 3549change: 3550 3551 if (user) { 3552#ifdef CONFIG_RT_GROUP_SCHED 3553 /* 3554 * Do not allow realtime tasks into groups that have no runtime 3555 * assigned. 3556 */ 3557 if (rt_bandwidth_enabled() && rt_policy(policy) && 3558 task_group(p)->rt_bandwidth.rt_runtime == 0 && 3559 !task_group_is_autogroup(task_group(p))) { 3560 task_rq_unlock(rq, p, &flags); 3561 return -EPERM; 3562 } 3563#endif 3564#ifdef CONFIG_SMP 3565 if (dl_bandwidth_enabled() && dl_policy(policy)) { 3566 cpumask_t *span = rq->rd->span; 3567 3568 /* 3569 * Don't allow tasks with an affinity mask smaller than 3570 * the entire root_domain to become SCHED_DEADLINE. 
We 3571 * will also fail if there's no bandwidth available. 3572 */ 3573 if (!cpumask_subset(span, &p->cpus_allowed) || 3574 rq->rd->dl_bw.bw == 0) { 3575 task_rq_unlock(rq, p, &flags); 3576 return -EPERM; 3577 } 3578 } 3579#endif 3580 } 3581 3582 /* recheck policy now with rq lock held */ 3583 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 3584 policy = oldpolicy = -1; 3585 task_rq_unlock(rq, p, &flags); 3586 goto recheck; 3587 } 3588 3589 /* 3590 * If setscheduling to SCHED_DEADLINE (or changing the parameters 3591 * of a SCHED_DEADLINE task) we need to check if enough bandwidth 3592 * is available. 3593 */ 3594 if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { 3595 task_rq_unlock(rq, p, &flags); 3596 return -EBUSY; 3597 } 3598 3599 p->sched_reset_on_fork = reset_on_fork; 3600 oldprio = p->prio; 3601 3602 /* 3603 * Take priority boosted tasks into account. If the new 3604 * effective priority is unchanged, we just store the new 3605 * normal parameters and do not touch the scheduler class and 3606 * the runqueue. This will be done when the task deboost 3607 * itself. 3608 */ 3609 new_effective_prio = rt_mutex_get_effective_prio(p, newprio); 3610 if (new_effective_prio == oldprio) { 3611 __setscheduler_params(p, attr); 3612 task_rq_unlock(rq, p, &flags); 3613 return 0; 3614 } 3615 3616 queued = task_on_rq_queued(p); 3617 running = task_current(rq, p); 3618 if (queued) 3619 dequeue_task(rq, p, 0); 3620 if (running) 3621 put_prev_task(rq, p); 3622 3623 prev_class = p->sched_class; 3624 __setscheduler(rq, p, attr, true); 3625 3626 if (running) 3627 p->sched_class->set_curr_task(rq); 3628 if (queued) { 3629 /* 3630 * We enqueue to tail when the priority of a task is 3631 * increased (user space view). 3632 */ 3633 enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); 3634 } 3635 3636 check_class_changed(rq, p, prev_class, oldprio); 3637 task_rq_unlock(rq, p, &flags); 3638 3639 rt_mutex_adjust_pi(p); 3640 3641 return 0; 3642} 3643 3644static int _sched_setscheduler(struct task_struct *p, int policy, 3645 const struct sched_param *param, bool check) 3646{ 3647 struct sched_attr attr = { 3648 .sched_policy = policy, 3649 .sched_priority = param->sched_priority, 3650 .sched_nice = PRIO_TO_NICE(p->static_prio), 3651 }; 3652 3653 /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ 3654 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { 3655 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 3656 policy &= ~SCHED_RESET_ON_FORK; 3657 attr.sched_policy = policy; 3658 } 3659 3660 return __sched_setscheduler(p, &attr, check); 3661} 3662/** 3663 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 3664 * @p: the task in question. 3665 * @policy: new policy. 3666 * @param: structure containing the new RT priority. 3667 * 3668 * Return: 0 on success. An error code otherwise. 3669 * 3670 * NOTE that the task may be already dead. 3671 */ 3672int sched_setscheduler(struct task_struct *p, int policy, 3673 const struct sched_param *param) 3674{ 3675 return _sched_setscheduler(p, policy, param, true); 3676} 3677EXPORT_SYMBOL_GPL(sched_setscheduler); 3678 3679int sched_setattr(struct task_struct *p, const struct sched_attr *attr) 3680{ 3681 return __sched_setscheduler(p, attr, true); 3682} 3683EXPORT_SYMBOL_GPL(sched_setattr); 3684 3685/** 3686 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 3687 * @p: the task in question. 3688 * @policy: new policy. 
3689 * @param: structure containing the new RT priority. 3690 * 3691 * Just like sched_setscheduler, only don't bother checking if the 3692 * current context has permission. For example, this is needed in 3693 * stop_machine(): we create temporary high priority worker threads, 3694 * but our caller might not have that capability. 3695 * 3696 * Return: 0 on success. An error code otherwise. 3697 */ 3698int sched_setscheduler_nocheck(struct task_struct *p, int policy, 3699 const struct sched_param *param) 3700{ 3701 return _sched_setscheduler(p, policy, param, false); 3702} 3703 3704static int 3705do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 3706{ 3707 struct sched_param lparam; 3708 struct task_struct *p; 3709 int retval; 3710 3711 if (!param || pid < 0) 3712 return -EINVAL; 3713 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 3714 return -EFAULT; 3715 3716 rcu_read_lock(); 3717 retval = -ESRCH; 3718 p = find_process_by_pid(pid); 3719 if (p != NULL) 3720 retval = sched_setscheduler(p, policy, &lparam); 3721 rcu_read_unlock(); 3722 3723 return retval; 3724} 3725 3726/* 3727 * Mimics kernel/events/core.c perf_copy_attr(). 3728 */ 3729static int sched_copy_attr(struct sched_attr __user *uattr, 3730 struct sched_attr *attr) 3731{ 3732 u32 size; 3733 int ret; 3734 3735 if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) 3736 return -EFAULT; 3737 3738 /* 3739 * zero the full structure, so that a short copy will be nice. 3740 */ 3741 memset(attr, 0, sizeof(*attr)); 3742 3743 ret = get_user(size, &uattr->size); 3744 if (ret) 3745 return ret; 3746 3747 if (size > PAGE_SIZE) /* silly large */ 3748 goto err_size; 3749 3750 if (!size) /* abi compat */ 3751 size = SCHED_ATTR_SIZE_VER0; 3752 3753 if (size < SCHED_ATTR_SIZE_VER0) 3754 goto err_size; 3755 3756 /* 3757 * If we're handed a bigger struct than we know of, 3758 * ensure all the unknown bits are 0 - i.e. new 3759 * user-space does not rely on any kernel feature 3760 * extensions we dont know about yet. 3761 */ 3762 if (size > sizeof(*attr)) { 3763 unsigned char __user *addr; 3764 unsigned char __user *end; 3765 unsigned char val; 3766 3767 addr = (void __user *)uattr + sizeof(*attr); 3768 end = (void __user *)uattr + size; 3769 3770 for (; addr < end; addr++) { 3771 ret = get_user(val, addr); 3772 if (ret) 3773 return ret; 3774 if (val) 3775 goto err_size; 3776 } 3777 size = sizeof(*attr); 3778 } 3779 3780 ret = copy_from_user(attr, uattr, size); 3781 if (ret) 3782 return -EFAULT; 3783 3784 /* 3785 * XXX: do we want to be lenient like existing syscalls; or do we want 3786 * to be strict and return an error on out-of-bounds values? 3787 */ 3788 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); 3789 3790 return 0; 3791 3792err_size: 3793 put_user(sizeof(*attr), &uattr->size); 3794 return -E2BIG; 3795} 3796 3797/** 3798 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 3799 * @pid: the pid in question. 3800 * @policy: new policy. 3801 * @param: structure containing the new RT priority. 3802 * 3803 * Return: 0 on success. An error code otherwise. 3804 */ 3805SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 3806 struct sched_param __user *, param) 3807{ 3808 /* negative values for policy are not valid */ 3809 if (policy < 0) 3810 return -EINVAL; 3811 3812 return do_sched_setscheduler(pid, policy, param); 3813} 3814 3815/** 3816 * sys_sched_setparam - set/change the RT priority of a thread 3817 * @pid: the pid in question. 
3818 * @param: structure containing the new RT priority. 3819 * 3820 * Return: 0 on success. An error code otherwise. 3821 */ 3822SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 3823{ 3824 return do_sched_setscheduler(pid, SETPARAM_POLICY, param); 3825} 3826 3827/** 3828 * sys_sched_setattr - same as above, but with extended sched_attr 3829 * @pid: the pid in question. 3830 * @uattr: structure containing the extended parameters. 3831 * @flags: for future extension. 3832 */ 3833SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, 3834 unsigned int, flags) 3835{ 3836 struct sched_attr attr; 3837 struct task_struct *p; 3838 int retval; 3839 3840 if (!uattr || pid < 0 || flags) 3841 return -EINVAL; 3842 3843 retval = sched_copy_attr(uattr, &attr); 3844 if (retval) 3845 return retval; 3846 3847 if ((int)attr.sched_policy < 0) 3848 return -EINVAL; 3849 3850 rcu_read_lock(); 3851 retval = -ESRCH; 3852 p = find_process_by_pid(pid); 3853 if (p != NULL) 3854 retval = sched_setattr(p, &attr); 3855 rcu_read_unlock(); 3856 3857 return retval; 3858} 3859 3860/** 3861 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 3862 * @pid: the pid in question. 3863 * 3864 * Return: On success, the policy of the thread. Otherwise, a negative error 3865 * code. 3866 */ 3867SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 3868{ 3869 struct task_struct *p; 3870 int retval; 3871 3872 if (pid < 0) 3873 return -EINVAL; 3874 3875 retval = -ESRCH; 3876 rcu_read_lock(); 3877 p = find_process_by_pid(pid); 3878 if (p) { 3879 retval = security_task_getscheduler(p); 3880 if (!retval) 3881 retval = p->policy 3882 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 3883 } 3884 rcu_read_unlock(); 3885 return retval; 3886} 3887 3888/** 3889 * sys_sched_getparam - get the RT priority of a thread 3890 * @pid: the pid in question. 3891 * @param: structure containing the RT priority. 3892 * 3893 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error 3894 * code. 3895 */ 3896SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 3897{ 3898 struct sched_param lp = { .sched_priority = 0 }; 3899 struct task_struct *p; 3900 int retval; 3901 3902 if (!param || pid < 0) 3903 return -EINVAL; 3904 3905 rcu_read_lock(); 3906 p = find_process_by_pid(pid); 3907 retval = -ESRCH; 3908 if (!p) 3909 goto out_unlock; 3910 3911 retval = security_task_getscheduler(p); 3912 if (retval) 3913 goto out_unlock; 3914 3915 if (task_has_rt_policy(p)) 3916 lp.sched_priority = p->rt_priority; 3917 rcu_read_unlock(); 3918 3919 /* 3920 * This one might sleep, we cannot do it with a spinlock held ... 3921 */ 3922 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 3923 3924 return retval; 3925 3926out_unlock: 3927 rcu_read_unlock(); 3928 return retval; 3929} 3930 3931static int sched_read_attr(struct sched_attr __user *uattr, 3932 struct sched_attr *attr, 3933 unsigned int usize) 3934{ 3935 int ret; 3936 3937 if (!access_ok(VERIFY_WRITE, uattr, usize)) 3938 return -EFAULT; 3939 3940 /* 3941 * If we're handed a smaller struct than we know of, 3942 * ensure all the unknown bits are 0 - i.e. old 3943 * user-space does not get uncomplete information. 
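	 *
	 * E.g. an old binary passing a short usize only gets a copy if
	 * every byte of the kernel's sched_attr past usize is zero;
	 * otherwise -EFBIG is returned below.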
3944 */ 3945 if (usize < sizeof(*attr)) { 3946 unsigned char *addr; 3947 unsigned char *end; 3948 3949 addr = (void *)attr + usize; 3950 end = (void *)attr + sizeof(*attr); 3951 3952 for (; addr < end; addr++) { 3953 if (*addr) 3954 return -EFBIG; 3955 } 3956 3957 attr->size = usize; 3958 } 3959 3960 ret = copy_to_user(uattr, attr, attr->size); 3961 if (ret) 3962 return -EFAULT; 3963 3964 return 0; 3965} 3966 3967/** 3968 * sys_sched_getattr - similar to sched_getparam, but with sched_attr 3969 * @pid: the pid in question. 3970 * @uattr: structure containing the extended parameters. 3971 * @size: sizeof(attr) for fwd/bwd comp. 3972 * @flags: for future extension. 3973 */ 3974SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, 3975 unsigned int, size, unsigned int, flags) 3976{ 3977 struct sched_attr attr = { 3978 .size = sizeof(struct sched_attr), 3979 }; 3980 struct task_struct *p; 3981 int retval; 3982 3983 if (!uattr || pid < 0 || size > PAGE_SIZE || 3984 size < SCHED_ATTR_SIZE_VER0 || flags) 3985 return -EINVAL; 3986 3987 rcu_read_lock(); 3988 p = find_process_by_pid(pid); 3989 retval = -ESRCH; 3990 if (!p) 3991 goto out_unlock; 3992 3993 retval = security_task_getscheduler(p); 3994 if (retval) 3995 goto out_unlock; 3996 3997 attr.sched_policy = p->policy; 3998 if (p->sched_reset_on_fork) 3999 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 4000 if (task_has_dl_policy(p)) 4001 __getparam_dl(p, &attr); 4002 else if (task_has_rt_policy(p)) 4003 attr.sched_priority = p->rt_priority; 4004 else 4005 attr.sched_nice = task_nice(p); 4006 4007 rcu_read_unlock(); 4008 4009 retval = sched_read_attr(uattr, &attr, size); 4010 return retval; 4011 4012out_unlock: 4013 rcu_read_unlock(); 4014 return retval; 4015} 4016 4017long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 4018{ 4019 cpumask_var_t cpus_allowed, new_mask; 4020 struct task_struct *p; 4021 int retval; 4022 4023 rcu_read_lock(); 4024 4025 p = find_process_by_pid(pid); 4026 if (!p) { 4027 rcu_read_unlock(); 4028 return -ESRCH; 4029 } 4030 4031 /* Prevent p going away */ 4032 get_task_struct(p); 4033 rcu_read_unlock(); 4034 4035 if (p->flags & PF_NO_SETAFFINITY) { 4036 retval = -EINVAL; 4037 goto out_put_task; 4038 } 4039 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 4040 retval = -ENOMEM; 4041 goto out_put_task; 4042 } 4043 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 4044 retval = -ENOMEM; 4045 goto out_free_cpus_allowed; 4046 } 4047 retval = -EPERM; 4048 if (!check_same_owner(p)) { 4049 rcu_read_lock(); 4050 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { 4051 rcu_read_unlock(); 4052 goto out_free_new_mask; 4053 } 4054 rcu_read_unlock(); 4055 } 4056 4057 retval = security_task_setscheduler(p); 4058 if (retval) 4059 goto out_free_new_mask; 4060 4061 4062 cpuset_cpus_allowed(p, cpus_allowed); 4063 cpumask_and(new_mask, in_mask, cpus_allowed); 4064 4065 /* 4066 * Since bandwidth control happens on root_domain basis, 4067 * if admission test is enabled, we only admit -deadline 4068 * tasks allowed to run on all the CPUs in the task's 4069 * root_domain. 
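	 *
	 * For example, a SCHED_DEADLINE task whose root_domain spans
	 * CPUs 0-3 cannot have its affinity reduced to CPUs 0-1 while
	 * admission control is enabled; the cpumask_subset() check
	 * below is not satisfied and -EBUSY is returned.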
4070 */ 4071#ifdef CONFIG_SMP 4072 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) { 4073 rcu_read_lock(); 4074 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) { 4075 retval = -EBUSY; 4076 rcu_read_unlock(); 4077 goto out_free_new_mask; 4078 } 4079 rcu_read_unlock(); 4080 } 4081#endif 4082again: 4083 retval = set_cpus_allowed_ptr(p, new_mask); 4084 4085 if (!retval) { 4086 cpuset_cpus_allowed(p, cpus_allowed); 4087 if (!cpumask_subset(new_mask, cpus_allowed)) { 4088 /* 4089 * We must have raced with a concurrent cpuset 4090 * update. Just reset the cpus_allowed to the 4091 * cpuset's cpus_allowed 4092 */ 4093 cpumask_copy(new_mask, cpus_allowed); 4094 goto again; 4095 } 4096 } 4097out_free_new_mask: 4098 free_cpumask_var(new_mask); 4099out_free_cpus_allowed: 4100 free_cpumask_var(cpus_allowed); 4101out_put_task: 4102 put_task_struct(p); 4103 return retval; 4104} 4105 4106static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 4107 struct cpumask *new_mask) 4108{ 4109 if (len < cpumask_size()) 4110 cpumask_clear(new_mask); 4111 else if (len > cpumask_size()) 4112 len = cpumask_size(); 4113 4114 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 4115} 4116 4117/** 4118 * sys_sched_setaffinity - set the cpu affinity of a process 4119 * @pid: pid of the process 4120 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4121 * @user_mask_ptr: user-space pointer to the new cpu mask 4122 * 4123 * Return: 0 on success. An error code otherwise. 4124 */ 4125SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 4126 unsigned long __user *, user_mask_ptr) 4127{ 4128 cpumask_var_t new_mask; 4129 int retval; 4130 4131 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 4132 return -ENOMEM; 4133 4134 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 4135 if (retval == 0) 4136 retval = sched_setaffinity(pid, new_mask); 4137 free_cpumask_var(new_mask); 4138 return retval; 4139} 4140 4141long sched_getaffinity(pid_t pid, struct cpumask *mask) 4142{ 4143 struct task_struct *p; 4144 unsigned long flags; 4145 int retval; 4146 4147 rcu_read_lock(); 4148 4149 retval = -ESRCH; 4150 p = find_process_by_pid(pid); 4151 if (!p) 4152 goto out_unlock; 4153 4154 retval = security_task_getscheduler(p); 4155 if (retval) 4156 goto out_unlock; 4157 4158 raw_spin_lock_irqsave(&p->pi_lock, flags); 4159 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); 4160 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4161 4162out_unlock: 4163 rcu_read_unlock(); 4164 4165 return retval; 4166} 4167 4168/** 4169 * sys_sched_getaffinity - get the cpu affinity of a process 4170 * @pid: pid of the process 4171 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4172 * @user_mask_ptr: user-space pointer to hold the current cpu mask 4173 * 4174 * Return: 0 on success. An error code otherwise. 
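 * (As implemented, a successful call actually returns the number of
 * bytes copied into @user_mask_ptr rather than 0.)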
4175 */ 4176SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 4177 unsigned long __user *, user_mask_ptr) 4178{ 4179 int ret; 4180 cpumask_var_t mask; 4181 4182 if ((len * BITS_PER_BYTE) < nr_cpu_ids) 4183 return -EINVAL; 4184 if (len & (sizeof(unsigned long)-1)) 4185 return -EINVAL; 4186 4187 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 4188 return -ENOMEM; 4189 4190 ret = sched_getaffinity(pid, mask); 4191 if (ret == 0) { 4192 size_t retlen = min_t(size_t, len, cpumask_size()); 4193 4194 if (copy_to_user(user_mask_ptr, mask, retlen)) 4195 ret = -EFAULT; 4196 else 4197 ret = retlen; 4198 } 4199 free_cpumask_var(mask); 4200 4201 return ret; 4202} 4203 4204/** 4205 * sys_sched_yield - yield the current processor to other threads. 4206 * 4207 * This function yields the current CPU to other tasks. If there are no 4208 * other threads running on this CPU then this function will return. 4209 * 4210 * Return: 0. 4211 */ 4212SYSCALL_DEFINE0(sched_yield) 4213{ 4214 struct rq *rq = this_rq_lock(); 4215 4216 schedstat_inc(rq, yld_count); 4217 current->sched_class->yield_task(rq); 4218 4219 /* 4220 * Since we are going to call schedule() anyway, there's 4221 * no need to preempt or enable interrupts: 4222 */ 4223 __release(rq->lock); 4224 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 4225 do_raw_spin_unlock(&rq->lock); 4226 sched_preempt_enable_no_resched(); 4227 4228 schedule(); 4229 4230 return 0; 4231} 4232 4233int __sched _cond_resched(void) 4234{ 4235 if (should_resched(0)) { 4236 preempt_schedule_common(); 4237 return 1; 4238 } 4239 return 0; 4240} 4241EXPORT_SYMBOL(_cond_resched); 4242 4243/* 4244 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 4245 * call schedule, and on return reacquire the lock. 4246 * 4247 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 4248 * operations here to prevent schedule() from being called twice (once via 4249 * spin_unlock(), once by hand). 4250 */ 4251int __cond_resched_lock(spinlock_t *lock) 4252{ 4253 int resched = should_resched(PREEMPT_LOCK_OFFSET); 4254 int ret = 0; 4255 4256 lockdep_assert_held(lock); 4257 4258 if (spin_needbreak(lock) || resched) { 4259 spin_unlock(lock); 4260 if (resched) 4261 preempt_schedule_common(); 4262 else 4263 cpu_relax(); 4264 ret = 1; 4265 spin_lock(lock); 4266 } 4267 return ret; 4268} 4269EXPORT_SYMBOL(__cond_resched_lock); 4270 4271int __sched __cond_resched_softirq(void) 4272{ 4273 BUG_ON(!in_softirq()); 4274 4275 if (should_resched(SOFTIRQ_DISABLE_OFFSET)) { 4276 local_bh_enable(); 4277 preempt_schedule_common(); 4278 local_bh_disable(); 4279 return 1; 4280 } 4281 return 0; 4282} 4283EXPORT_SYMBOL(__cond_resched_softirq); 4284 4285/** 4286 * yield - yield the current processor to other threads. 4287 * 4288 * Do not ever use this function, there's a 99% chance you're doing it wrong. 4289 * 4290 * The scheduler is at all times free to pick the calling task as the most 4291 * eligible task to run, if removing the yield() call from your code breaks 4292 * it, its already broken. 4293 * 4294 * Typical broken usage is: 4295 * 4296 * while (!event) 4297 * yield(); 4298 * 4299 * where one assumes that yield() will let 'the other' process run that will 4300 * make event true. If the current task is a SCHED_FIFO task that will never 4301 * happen. Never use yield() as a progress guarantee!! 4302 * 4303 * If you want to use yield() to wait for something, use wait_event(). 4304 * If you want to use yield() to be 'nice' for others, use cond_resched(). 
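 *
 * For example, the broken loop above is better written with a waitqueue
 * (illustrative sketch, assuming a wait_queue_head_t wq that the producer
 * wakes with wake_up(&wq) after setting event):
 *
 *	wait_event(wq, event);
 *
 * which sleeps until the condition holds instead of busy-yielding.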
4305 * If you still want to use yield(), do not! 4306 */ 4307void __sched yield(void) 4308{ 4309 set_current_state(TASK_RUNNING); 4310 sys_sched_yield(); 4311} 4312EXPORT_SYMBOL(yield); 4313 4314/** 4315 * yield_to - yield the current processor to another thread in 4316 * your thread group, or accelerate that thread toward the 4317 * processor it's on. 4318 * @p: target task 4319 * @preempt: whether task preemption is allowed or not 4320 * 4321 * It's the caller's job to ensure that the target task struct 4322 * can't go away on us before we can do any checks. 4323 * 4324 * Return: 4325 * true (>0) if we indeed boosted the target task. 4326 * false (0) if we failed to boost the target. 4327 * -ESRCH if there's no task to yield to. 4328 */ 4329int __sched yield_to(struct task_struct *p, bool preempt) 4330{ 4331 struct task_struct *curr = current; 4332 struct rq *rq, *p_rq; 4333 unsigned long flags; 4334 int yielded = 0; 4335 4336 local_irq_save(flags); 4337 rq = this_rq(); 4338 4339again: 4340 p_rq = task_rq(p); 4341 /* 4342 * If we're the only runnable task on the rq and target rq also 4343 * has only one task, there's absolutely no point in yielding. 4344 */ 4345 if (rq->nr_running == 1 && p_rq->nr_running == 1) { 4346 yielded = -ESRCH; 4347 goto out_irq; 4348 } 4349 4350 double_rq_lock(rq, p_rq); 4351 if (task_rq(p) != p_rq) { 4352 double_rq_unlock(rq, p_rq); 4353 goto again; 4354 } 4355 4356 if (!curr->sched_class->yield_to_task) 4357 goto out_unlock; 4358 4359 if (curr->sched_class != p->sched_class) 4360 goto out_unlock; 4361 4362 if (task_running(p_rq, p) || p->state) 4363 goto out_unlock; 4364 4365 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 4366 if (yielded) { 4367 schedstat_inc(rq, yld_count); 4368 /* 4369 * Make p's CPU reschedule; pick_next_entity takes care of 4370 * fairness. 4371 */ 4372 if (preempt && rq != p_rq) 4373 resched_curr(p_rq); 4374 } 4375 4376out_unlock: 4377 double_rq_unlock(rq, p_rq); 4378out_irq: 4379 local_irq_restore(flags); 4380 4381 if (yielded > 0) 4382 schedule(); 4383 4384 return yielded; 4385} 4386EXPORT_SYMBOL_GPL(yield_to); 4387 4388/* 4389 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 4390 * that process accounting knows that this is a task in IO wait state. 4391 */ 4392long __sched io_schedule_timeout(long timeout) 4393{ 4394 int old_iowait = current->in_iowait; 4395 struct rq *rq; 4396 long ret; 4397 4398 current->in_iowait = 1; 4399 blk_schedule_flush_plug(current); 4400 4401 delayacct_blkio_start(); 4402 rq = raw_rq(); 4403 atomic_inc(&rq->nr_iowait); 4404 ret = schedule_timeout(timeout); 4405 current->in_iowait = old_iowait; 4406 atomic_dec(&rq->nr_iowait); 4407 delayacct_blkio_end(); 4408 4409 return ret; 4410} 4411EXPORT_SYMBOL(io_schedule_timeout); 4412 4413/** 4414 * sys_sched_get_priority_max - return maximum RT priority. 4415 * @policy: scheduling class. 4416 * 4417 * Return: On success, this syscall returns the maximum 4418 * rt_priority that can be used by a given scheduling class. 4419 * On failure, a negative error code is returned. 4420 */ 4421SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 4422{ 4423 int ret = -EINVAL; 4424 4425 switch (policy) { 4426 case SCHED_FIFO: 4427 case SCHED_RR: 4428 ret = MAX_USER_RT_PRIO-1; 4429 break; 4430 case SCHED_DEADLINE: 4431 case SCHED_NORMAL: 4432 case SCHED_BATCH: 4433 case SCHED_IDLE: 4434 ret = 0; 4435 break; 4436 } 4437 return ret; 4438} 4439 4440/** 4441 * sys_sched_get_priority_min - return minimum RT priority. 4442 * @policy: scheduling class. 
4443 * 4444 * Return: On success, this syscall returns the minimum 4445 * rt_priority that can be used by a given scheduling class. 4446 * On failure, a negative error code is returned. 4447 */ 4448SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 4449{ 4450 int ret = -EINVAL; 4451 4452 switch (policy) { 4453 case SCHED_FIFO: 4454 case SCHED_RR: 4455 ret = 1; 4456 break; 4457 case SCHED_DEADLINE: 4458 case SCHED_NORMAL: 4459 case SCHED_BATCH: 4460 case SCHED_IDLE: 4461 ret = 0; 4462 } 4463 return ret; 4464} 4465 4466/** 4467 * sys_sched_rr_get_interval - return the default timeslice of a process. 4468 * @pid: pid of the process. 4469 * @interval: userspace pointer to the timeslice value. 4470 * 4471 * this syscall writes the default timeslice value of a given process 4472 * into the user-space timespec buffer. A value of '0' means infinity. 4473 * 4474 * Return: On success, 0 and the timeslice is in @interval. Otherwise, 4475 * an error code. 4476 */ 4477SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 4478 struct timespec __user *, interval) 4479{ 4480 struct task_struct *p; 4481 unsigned int time_slice; 4482 unsigned long flags; 4483 struct rq *rq; 4484 int retval; 4485 struct timespec t; 4486 4487 if (pid < 0) 4488 return -EINVAL; 4489 4490 retval = -ESRCH; 4491 rcu_read_lock(); 4492 p = find_process_by_pid(pid); 4493 if (!p) 4494 goto out_unlock; 4495 4496 retval = security_task_getscheduler(p); 4497 if (retval) 4498 goto out_unlock; 4499 4500 rq = task_rq_lock(p, &flags); 4501 time_slice = 0; 4502 if (p->sched_class->get_rr_interval) 4503 time_slice = p->sched_class->get_rr_interval(rq, p); 4504 task_rq_unlock(rq, p, &flags); 4505 4506 rcu_read_unlock(); 4507 jiffies_to_timespec(time_slice, &t); 4508 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4509 return retval; 4510 4511out_unlock: 4512 rcu_read_unlock(); 4513 return retval; 4514} 4515 4516static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; 4517 4518void sched_show_task(struct task_struct *p) 4519{ 4520 unsigned long free = 0; 4521 int ppid; 4522 unsigned long state = p->state; 4523 4524 if (state) 4525 state = __ffs(state) + 1; 4526 printk(KERN_INFO "%-15.15s %c", p->comm, 4527 state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); 4528#if BITS_PER_LONG == 32 4529 if (state == TASK_RUNNING) 4530 printk(KERN_CONT " running "); 4531 else 4532 printk(KERN_CONT " %08lx ", thread_saved_pc(p)); 4533#else 4534 if (state == TASK_RUNNING) 4535 printk(KERN_CONT " running task "); 4536 else 4537 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 4538#endif 4539#ifdef CONFIG_DEBUG_STACK_USAGE 4540 free = stack_not_used(p); 4541#endif 4542 ppid = 0; 4543 rcu_read_lock(); 4544 if (pid_alive(p)) 4545 ppid = task_pid_nr(rcu_dereference(p->real_parent)); 4546 rcu_read_unlock(); 4547 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4548 task_pid_nr(p), ppid, 4549 (unsigned long)task_thread_info(p)->flags); 4550 4551 print_worker_info(KERN_INFO, p); 4552 show_stack(p, NULL); 4553} 4554 4555void show_state_filter(unsigned long state_filter) 4556{ 4557 struct task_struct *g, *p; 4558 4559#if BITS_PER_LONG == 32 4560 printk(KERN_INFO 4561 " task PC stack pid father\n"); 4562#else 4563 printk(KERN_INFO 4564 " task PC stack pid father\n"); 4565#endif 4566 rcu_read_lock(); 4567 for_each_process_thread(g, p) { 4568 /* 4569 * reset the NMI-timeout, listing all files on a slow 4570 * console might take a lot of time: 4571 */ 4572 touch_nmi_watchdog(); 4573 if (!state_filter || (p->state & state_filter)) 4574 sched_show_task(p); 4575 } 4576 4577 touch_all_softlockup_watchdogs(); 4578 4579#ifdef CONFIG_SCHED_DEBUG 4580 sysrq_sched_debug_show(); 4581#endif 4582 rcu_read_unlock(); 4583 /* 4584 * Only show locks if all tasks are dumped: 4585 */ 4586 if (!state_filter) 4587 debug_show_all_locks(); 4588} 4589 4590void init_idle_bootup_task(struct task_struct *idle) 4591{ 4592 idle->sched_class = &idle_sched_class; 4593} 4594 4595/** 4596 * init_idle - set up an idle thread for a given CPU 4597 * @idle: task in question 4598 * @cpu: cpu the idle task belongs to 4599 * 4600 * NOTE: this function does not set the idle thread's NEED_RESCHED 4601 * flag, to make booting more robust. 4602 */ 4603void init_idle(struct task_struct *idle, int cpu) 4604{ 4605 struct rq *rq = cpu_rq(cpu); 4606 unsigned long flags; 4607 4608 raw_spin_lock_irqsave(&rq->lock, flags); 4609 4610 __sched_fork(0, idle); 4611 idle->state = TASK_RUNNING; 4612 idle->se.exec_start = sched_clock(); 4613 4614 do_set_cpus_allowed(idle, cpumask_of(cpu)); 4615 /* 4616 * We're having a chicken and egg problem, even though we are 4617 * holding rq->lock, the cpu isn't yet set to this cpu so the 4618 * lockdep check in task_group() will fail. 4619 * 4620 * Similar case to sched_fork(). / Alternatively we could 4621 * use task_rq_lock() here and obtain the other rq->lock. 4622 * 4623 * Silence PROVE_RCU 4624 */ 4625 rcu_read_lock(); 4626 __set_task_cpu(idle, cpu); 4627 rcu_read_unlock(); 4628 4629 rq->curr = rq->idle = idle; 4630 idle->on_rq = TASK_ON_RQ_QUEUED; 4631#if defined(CONFIG_SMP) 4632 idle->on_cpu = 1; 4633#endif 4634 raw_spin_unlock_irqrestore(&rq->lock, flags); 4635 4636 /* Set the preempt count _outside_ the spinlocks! 
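 * (On PREEMPT kernels the raw_spin_lock above bumped the preempt count;
 * resetting the count while rq->lock was still held would have left it
 * unbalanced at unlock time, hence "outside" the spinlocks.)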
*/ 4637 init_idle_preempt_count(idle, cpu); 4638 4639 /* 4640 * The idle tasks have their own, simple scheduling class: 4641 */ 4642 idle->sched_class = &idle_sched_class; 4643 ftrace_graph_init_idle_task(idle, cpu); 4644 vtime_init_idle(idle, cpu); 4645#if defined(CONFIG_SMP) 4646 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 4647#endif 4648} 4649 4650int cpuset_cpumask_can_shrink(const struct cpumask *cur, 4651 const struct cpumask *trial) 4652{ 4653 int ret = 1, trial_cpus; 4654 struct dl_bw *cur_dl_b; 4655 unsigned long flags; 4656 4657 if (!cpumask_weight(cur)) 4658 return ret; 4659 4660 rcu_read_lock_sched(); 4661 cur_dl_b = dl_bw_of(cpumask_any(cur)); 4662 trial_cpus = cpumask_weight(trial); 4663 4664 raw_spin_lock_irqsave(&cur_dl_b->lock, flags); 4665 if (cur_dl_b->bw != -1 && 4666 cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) 4667 ret = 0; 4668 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); 4669 rcu_read_unlock_sched(); 4670 4671 return ret; 4672} 4673 4674int task_can_attach(struct task_struct *p, 4675 const struct cpumask *cs_cpus_allowed) 4676{ 4677 int ret = 0; 4678 4679 /* 4680 * Kthreads which disallow setaffinity shouldn't be moved 4681 * to a new cpuset; we don't want to change their cpu 4682 * affinity and isolating such threads by their set of 4683 * allowed nodes is unnecessary. Thus, cpusets are not 4684 * applicable for such threads. This prevents checking for 4685 * success of set_cpus_allowed_ptr() on all attached tasks 4686 * before cpus_allowed may be changed. 4687 */ 4688 if (p->flags & PF_NO_SETAFFINITY) { 4689 ret = -EINVAL; 4690 goto out; 4691 } 4692 4693#ifdef CONFIG_SMP 4694 if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span, 4695 cs_cpus_allowed)) { 4696 unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, 4697 cs_cpus_allowed); 4698 struct dl_bw *dl_b; 4699 bool overflow; 4700 int cpus; 4701 unsigned long flags; 4702 4703 rcu_read_lock_sched(); 4704 dl_b = dl_bw_of(dest_cpu); 4705 raw_spin_lock_irqsave(&dl_b->lock, flags); 4706 cpus = dl_bw_cpus(dest_cpu); 4707 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); 4708 if (overflow) 4709 ret = -EBUSY; 4710 else { 4711 /* 4712 * We reserve space for this task in the destination 4713 * root_domain, as we can't fail after this point. 4714 * We will free resources in the source root_domain 4715 * later on (see set_cpus_allowed_dl()). 4716 */ 4717 __dl_add(dl_b, p->dl.dl_bw); 4718 } 4719 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 4720 rcu_read_unlock_sched(); 4721 4722 } 4723#endif 4724out: 4725 return ret; 4726} 4727 4728#ifdef CONFIG_SMP 4729/* 4730 * move_queued_task - move a queued task to new rq. 4731 * 4732 * Returns (locked) new rq. Old rq's lock is released. 
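 *
 * Illustrative caller pattern (a sketch of how set_cpus_allowed_ptr()
 * below uses it): the caller enters holding the old rq->lock via
 * task_rq_lock() and leaves holding the new rq's lock instead:
 *
 *	rq = task_rq_lock(p, &flags);
 *	...
 *	rq = move_queued_task(p, new_cpu);
 *	task_rq_unlock(rq, p, &flags);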
4733 */ 4734static struct rq *move_queued_task(struct task_struct *p, int new_cpu) 4735{ 4736 struct rq *rq = task_rq(p); 4737 4738 lockdep_assert_held(&rq->lock); 4739 4740 dequeue_task(rq, p, 0); 4741 p->on_rq = TASK_ON_RQ_MIGRATING; 4742 set_task_cpu(p, new_cpu); 4743 raw_spin_unlock(&rq->lock); 4744 4745 rq = cpu_rq(new_cpu); 4746 4747 raw_spin_lock(&rq->lock); 4748 BUG_ON(task_cpu(p) != new_cpu); 4749 p->on_rq = TASK_ON_RQ_QUEUED; 4750 enqueue_task(rq, p, 0); 4751 check_preempt_curr(rq, p, 0); 4752 4753 return rq; 4754} 4755 4756void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4757{ 4758 if (p->sched_class->set_cpus_allowed) 4759 p->sched_class->set_cpus_allowed(p, new_mask); 4760 4761 cpumask_copy(&p->cpus_allowed, new_mask); 4762 p->nr_cpus_allowed = cpumask_weight(new_mask); 4763} 4764 4765/* 4766 * This is how migration works: 4767 * 4768 * 1) we invoke migration_cpu_stop() on the target CPU using 4769 * stop_one_cpu(). 4770 * 2) stopper starts to run (implicitly forcing the migrated thread 4771 * off the CPU) 4772 * 3) it checks whether the migrated task is still in the wrong runqueue. 4773 * 4) if it's in the wrong runqueue then the migration thread removes 4774 * it and puts it into the right queue. 4775 * 5) stopper completes and stop_one_cpu() returns and the migration 4776 * is done. 4777 */ 4778 4779/* 4780 * Change a given task's CPU affinity. Migrate the thread to a 4781 * proper CPU and schedule it away if the CPU it's executing on 4782 * is removed from the allowed bitmask. 4783 * 4784 * NOTE: the caller must have a valid reference to the task, the 4785 * task must not exit() & deallocate itself prematurely. The 4786 * call is not atomic; no spinlocks may be held. 4787 */ 4788int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 4789{ 4790 unsigned long flags; 4791 struct rq *rq; 4792 unsigned int dest_cpu; 4793 int ret = 0; 4794 4795 rq = task_rq_lock(p, &flags); 4796 4797 if (cpumask_equal(&p->cpus_allowed, new_mask)) 4798 goto out; 4799 4800 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 4801 ret = -EINVAL; 4802 goto out; 4803 } 4804 4805 do_set_cpus_allowed(p, new_mask); 4806 4807 /* Can the task run on the task's current CPU? If so, we're done */ 4808 if (cpumask_test_cpu(task_cpu(p), new_mask)) 4809 goto out; 4810 4811 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 4812 if (task_running(rq, p) || p->state == TASK_WAKING) { 4813 struct migration_arg arg = { p, dest_cpu }; 4814 /* Need help from migration thread: drop lock and wait. */ 4815 task_rq_unlock(rq, p, &flags); 4816 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 4817 tlb_migrate_finish(p->mm); 4818 return 0; 4819 } else if (task_on_rq_queued(p)) 4820 rq = move_queued_task(p, dest_cpu); 4821out: 4822 task_rq_unlock(rq, p, &flags); 4823 4824 return ret; 4825} 4826EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 4827 4828/* 4829 * Move (not current) task off this cpu, onto dest cpu. We're doing 4830 * this because either it can't run here any more (set_cpus_allowed() 4831 * away from this CPU, or CPU going down), or because we're 4832 * attempting to rebalance this task on exec (sched_exec). 4833 * 4834 * So we race with normal scheduler movements, but that's OK, as long 4835 * as the task is no longer on this CPU. 4836 * 4837 * Returns non-zero if task was successfully migrated. 
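 *
 * (Illustrative note: kernel code normally reaches this path through
 * set_cpus_allowed_ptr(), e.g. a per-cpu worker pinning itself with
 *
 *	set_cpus_allowed_ptr(current, cpumask_of(cpu));
 *
 * rather than by calling __migrate_task() directly.)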
4838 */ 4839static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4840{ 4841 struct rq *rq; 4842 int ret = 0; 4843 4844 if (unlikely(!cpu_active(dest_cpu))) 4845 return ret; 4846 4847 rq = cpu_rq(src_cpu); 4848 4849 raw_spin_lock(&p->pi_lock); 4850 raw_spin_lock(&rq->lock); 4851 /* Already moved. */ 4852 if (task_cpu(p) != src_cpu) 4853 goto done; 4854 4855 /* Affinity changed (again). */ 4856 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 4857 goto fail; 4858 4859 /* 4860 * If we're not on a rq, the next wake-up will ensure we're 4861 * placed properly. 4862 */ 4863 if (task_on_rq_queued(p)) 4864 rq = move_queued_task(p, dest_cpu); 4865done: 4866 ret = 1; 4867fail: 4868 raw_spin_unlock(&rq->lock); 4869 raw_spin_unlock(&p->pi_lock); 4870 return ret; 4871} 4872 4873#ifdef CONFIG_NUMA_BALANCING 4874/* Migrate current task p to target_cpu */ 4875int migrate_task_to(struct task_struct *p, int target_cpu) 4876{ 4877 struct migration_arg arg = { p, target_cpu }; 4878 int curr_cpu = task_cpu(p); 4879 4880 if (curr_cpu == target_cpu) 4881 return 0; 4882 4883 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p))) 4884 return -EINVAL; 4885 4886 /* TODO: This is not properly updating schedstats */ 4887 4888 trace_sched_move_numa(p, curr_cpu, target_cpu); 4889 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); 4890} 4891 4892/* 4893 * Requeue a task on a given node and accurately track the number of NUMA 4894 * tasks on the runqueues 4895 */ 4896void sched_setnuma(struct task_struct *p, int nid) 4897{ 4898 struct rq *rq; 4899 unsigned long flags; 4900 bool queued, running; 4901 4902 rq = task_rq_lock(p, &flags); 4903 queued = task_on_rq_queued(p); 4904 running = task_current(rq, p); 4905 4906 if (queued) 4907 dequeue_task(rq, p, 0); 4908 if (running) 4909 put_prev_task(rq, p); 4910 4911 p->numa_preferred_nid = nid; 4912 4913 if (running) 4914 p->sched_class->set_curr_task(rq); 4915 if (queued) 4916 enqueue_task(rq, p, 0); 4917 task_rq_unlock(rq, p, &flags); 4918} 4919#endif 4920 4921/* 4922 * migration_cpu_stop - this will be executed by a highprio stopper thread 4923 * and performs thread migration by bumping thread off CPU then 4924 * 'pushing' onto another runqueue. 4925 */ 4926static int migration_cpu_stop(void *data) 4927{ 4928 struct migration_arg *arg = data; 4929 4930 /* 4931 * The original target cpu might have gone down and we might 4932 * be on another cpu but it doesn't matter. 4933 */ 4934 local_irq_disable(); 4935 /* 4936 * We need to explicitly wake pending tasks before running 4937 * __migrate_task() such that we will not miss enforcing cpus_allowed 4938 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. 4939 */ 4940 sched_ttwu_pending(); 4941 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); 4942 local_irq_enable(); 4943 return 0; 4944} 4945 4946#ifdef CONFIG_HOTPLUG_CPU 4947 4948/* 4949 * Ensures that the idle task is using init_mm right before its cpu goes 4950 * offline. 4951 */ 4952void idle_task_exit(void) 4953{ 4954 struct mm_struct *mm = current->active_mm; 4955 4956 BUG_ON(cpu_online(smp_processor_id())); 4957 4958 if (mm != &init_mm) { 4959 switch_mm(mm, &init_mm, current); 4960 finish_arch_post_lock_switch(); 4961 } 4962 mmdrop(mm); 4963} 4964 4965/* 4966 * Since this CPU is going 'away' for a while, fold any nr_active delta 4967 * we might have. Assumes we're called after migrate_tasks() so that the 4968 * nr_active count is stable. 4969 * 4970 * Also see the comment "Global load-average calculations". 
4971 */ 4972static void calc_load_migrate(struct rq *rq) 4973{ 4974 long delta = calc_load_fold_active(rq); 4975 if (delta) 4976 atomic_long_add(delta, &calc_load_tasks); 4977} 4978 4979static void put_prev_task_fake(struct rq *rq, struct task_struct *prev) 4980{ 4981} 4982 4983static const struct sched_class fake_sched_class = { 4984 .put_prev_task = put_prev_task_fake, 4985}; 4986 4987static struct task_struct fake_task = { 4988 /* 4989 * Avoid pull_{rt,dl}_task() 4990 */ 4991 .prio = MAX_PRIO + 1, 4992 .sched_class = &fake_sched_class, 4993}; 4994 4995/* 4996 * Migrate all tasks from the rq, sleeping tasks will be migrated by 4997 * try_to_wake_up()->select_task_rq(). 4998 * 4999 * Called with rq->lock held even though we'er in stop_machine() and 5000 * there's no concurrency possible, we hold the required locks anyway 5001 * because of lock validation efforts. 5002 */ 5003static void migrate_tasks(unsigned int dead_cpu) 5004{ 5005 struct rq *rq = cpu_rq(dead_cpu); 5006 struct task_struct *next, *stop = rq->stop; 5007 int dest_cpu; 5008 5009 /* 5010 * Fudge the rq selection such that the below task selection loop 5011 * doesn't get stuck on the currently eligible stop task. 5012 * 5013 * We're currently inside stop_machine() and the rq is either stuck 5014 * in the stop_machine_cpu_stop() loop, or we're executing this code, 5015 * either way we should never end up calling schedule() until we're 5016 * done here. 5017 */ 5018 rq->stop = NULL; 5019 5020 /* 5021 * put_prev_task() and pick_next_task() sched 5022 * class method both need to have an up-to-date 5023 * value of rq->clock[_task] 5024 */ 5025 update_rq_clock(rq); 5026 5027 for ( ; ; ) { 5028 /* 5029 * There's this thread running, bail when that's the only 5030 * remaining thread. 5031 */ 5032 if (rq->nr_running == 1) 5033 break; 5034 5035 next = pick_next_task(rq, &fake_task); 5036 BUG_ON(!next); 5037 next->sched_class->put_prev_task(rq, next); 5038 5039 /* Find suitable destination for @next, with force if needed. */ 5040 dest_cpu = select_fallback_rq(dead_cpu, next); 5041 raw_spin_unlock(&rq->lock); 5042 5043 __migrate_task(next, dead_cpu, dest_cpu); 5044 5045 raw_spin_lock(&rq->lock); 5046 } 5047 5048 rq->stop = stop; 5049} 5050 5051#endif /* CONFIG_HOTPLUG_CPU */ 5052 5053#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5054 5055static struct ctl_table sd_ctl_dir[] = { 5056 { 5057 .procname = "sched_domain", 5058 .mode = 0555, 5059 }, 5060 {} 5061}; 5062 5063static struct ctl_table sd_ctl_root[] = { 5064 { 5065 .procname = "kernel", 5066 .mode = 0555, 5067 .child = sd_ctl_dir, 5068 }, 5069 {} 5070}; 5071 5072static struct ctl_table *sd_alloc_ctl_entry(int n) 5073{ 5074 struct ctl_table *entry = 5075 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 5076 5077 return entry; 5078} 5079 5080static void sd_free_ctl_entry(struct ctl_table **tablep) 5081{ 5082 struct ctl_table *entry; 5083 5084 /* 5085 * In the intermediate directories, both the child directory and 5086 * procname are dynamically allocated and could fail but the mode 5087 * will always be set. In the lowest directory the names are 5088 * static strings and all have proc handlers. 
5089 */ 5090 for (entry = *tablep; entry->mode; entry++) { 5091 if (entry->child) 5092 sd_free_ctl_entry(&entry->child); 5093 if (entry->proc_handler == NULL) 5094 kfree(entry->procname); 5095 } 5096 5097 kfree(*tablep); 5098 *tablep = NULL; 5099} 5100 5101static int min_load_idx = 0; 5102static int max_load_idx = CPU_LOAD_IDX_MAX-1; 5103 5104static void 5105set_table_entry(struct ctl_table *entry, 5106 const char *procname, void *data, int maxlen, 5107 umode_t mode, proc_handler *proc_handler, 5108 bool load_idx) 5109{ 5110 entry->procname = procname; 5111 entry->data = data; 5112 entry->maxlen = maxlen; 5113 entry->mode = mode; 5114 entry->proc_handler = proc_handler; 5115 5116 if (load_idx) { 5117 entry->extra1 = &min_load_idx; 5118 entry->extra2 = &max_load_idx; 5119 } 5120} 5121 5122static struct ctl_table * 5123sd_alloc_ctl_domain_table(struct sched_domain *sd) 5124{ 5125 struct ctl_table *table = sd_alloc_ctl_entry(14); 5126 5127 if (table == NULL) 5128 return NULL; 5129 5130 set_table_entry(&table[0], "min_interval", &sd->min_interval, 5131 sizeof(long), 0644, proc_doulongvec_minmax, false); 5132 set_table_entry(&table[1], "max_interval", &sd->max_interval, 5133 sizeof(long), 0644, proc_doulongvec_minmax, false); 5134 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 5135 sizeof(int), 0644, proc_dointvec_minmax, true); 5136 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 5137 sizeof(int), 0644, proc_dointvec_minmax, true); 5138 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 5139 sizeof(int), 0644, proc_dointvec_minmax, true); 5140 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 5141 sizeof(int), 0644, proc_dointvec_minmax, true); 5142 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 5143 sizeof(int), 0644, proc_dointvec_minmax, true); 5144 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 5145 sizeof(int), 0644, proc_dointvec_minmax, false); 5146 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 5147 sizeof(int), 0644, proc_dointvec_minmax, false); 5148 set_table_entry(&table[9], "cache_nice_tries", 5149 &sd->cache_nice_tries, 5150 sizeof(int), 0644, proc_dointvec_minmax, false); 5151 set_table_entry(&table[10], "flags", &sd->flags, 5152 sizeof(int), 0644, proc_dointvec_minmax, false); 5153 set_table_entry(&table[11], "max_newidle_lb_cost", 5154 &sd->max_newidle_lb_cost, 5155 sizeof(long), 0644, proc_doulongvec_minmax, false); 5156 set_table_entry(&table[12], "name", sd->name, 5157 CORENAME_MAX_SIZE, 0444, proc_dostring, false); 5158 /* &table[13] is terminator */ 5159 5160 return table; 5161} 5162 5163static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) 5164{ 5165 struct ctl_table *entry, *table; 5166 struct sched_domain *sd; 5167 int domain_num = 0, i; 5168 char buf[32]; 5169 5170 for_each_domain(cpu, sd) 5171 domain_num++; 5172 entry = table = sd_alloc_ctl_entry(domain_num + 1); 5173 if (table == NULL) 5174 return NULL; 5175 5176 i = 0; 5177 for_each_domain(cpu, sd) { 5178 snprintf(buf, 32, "domain%d", i); 5179 entry->procname = kstrdup(buf, GFP_KERNEL); 5180 entry->mode = 0555; 5181 entry->child = sd_alloc_ctl_domain_table(sd); 5182 entry++; 5183 i++; 5184 } 5185 return table; 5186} 5187 5188static struct ctl_table_header *sd_sysctl_header; 5189static void register_sched_domain_sysctl(void) 5190{ 5191 int i, cpu_num = num_possible_cpus(); 5192 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 5193 char buf[32]; 5194 5195 WARN_ON(sd_ctl_dir[0].child); 5196 sd_ctl_dir[0].child = entry; 5197 5198 if 
(entry == NULL) 5199 return; 5200 5201 for_each_possible_cpu(i) { 5202 snprintf(buf, 32, "cpu%d", i); 5203 entry->procname = kstrdup(buf, GFP_KERNEL); 5204 entry->mode = 0555; 5205 entry->child = sd_alloc_ctl_cpu_table(i); 5206 entry++; 5207 } 5208 5209 WARN_ON(sd_sysctl_header); 5210 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 5211} 5212 5213/* may be called multiple times per register */ 5214static void unregister_sched_domain_sysctl(void) 5215{ 5216 if (sd_sysctl_header) 5217 unregister_sysctl_table(sd_sysctl_header); 5218 sd_sysctl_header = NULL; 5219 if (sd_ctl_dir[0].child) 5220 sd_free_ctl_entry(&sd_ctl_dir[0].child); 5221} 5222#else 5223static void register_sched_domain_sysctl(void) 5224{ 5225} 5226static void unregister_sched_domain_sysctl(void) 5227{ 5228} 5229#endif 5230 5231static void set_rq_online(struct rq *rq) 5232{ 5233 if (!rq->online) { 5234 const struct sched_class *class; 5235 5236 cpumask_set_cpu(rq->cpu, rq->rd->online); 5237 rq->online = 1; 5238 5239 for_each_class(class) { 5240 if (class->rq_online) 5241 class->rq_online(rq); 5242 } 5243 } 5244} 5245 5246static void set_rq_offline(struct rq *rq) 5247{ 5248 if (rq->online) { 5249 const struct sched_class *class; 5250 5251 for_each_class(class) { 5252 if (class->rq_offline) 5253 class->rq_offline(rq); 5254 } 5255 5256 cpumask_clear_cpu(rq->cpu, rq->rd->online); 5257 rq->online = 0; 5258 } 5259} 5260 5261/* 5262 * migration_call - callback that gets triggered when a CPU is added. 5263 * Here we can start up the necessary migration thread for the new CPU. 5264 */ 5265static int 5266migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5267{ 5268 int cpu = (long)hcpu; 5269 unsigned long flags; 5270 struct rq *rq = cpu_rq(cpu); 5271 5272 switch (action & ~CPU_TASKS_FROZEN) { 5273 5274 case CPU_UP_PREPARE: 5275 rq->calc_load_update = calc_load_update; 5276 break; 5277 5278 case CPU_ONLINE: 5279 /* Update our root-domain */ 5280 raw_spin_lock_irqsave(&rq->lock, flags); 5281 if (rq->rd) { 5282 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5283 5284 set_rq_online(rq); 5285 } 5286 raw_spin_unlock_irqrestore(&rq->lock, flags); 5287 break; 5288 5289#ifdef CONFIG_HOTPLUG_CPU 5290 case CPU_DYING: 5291 sched_ttwu_pending(); 5292 /* Update our root-domain */ 5293 raw_spin_lock_irqsave(&rq->lock, flags); 5294 if (rq->rd) { 5295 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5296 set_rq_offline(rq); 5297 } 5298 migrate_tasks(cpu); 5299 BUG_ON(rq->nr_running != 1); /* the migration thread */ 5300 raw_spin_unlock_irqrestore(&rq->lock, flags); 5301 break; 5302 5303 case CPU_DEAD: 5304 calc_load_migrate(rq); 5305 break; 5306#endif 5307 } 5308 5309 update_max_interval(); 5310 5311 return NOTIFY_OK; 5312} 5313 5314/* 5315 * Register at high priority so that task migration (migrate_all_tasks) 5316 * happens before everything else. This has to be lower priority than 5317 * the notifier in the perf_event subsystem, though. 
5318 */ 5319static struct notifier_block migration_notifier = { 5320 .notifier_call = migration_call, 5321 .priority = CPU_PRI_MIGRATION, 5322}; 5323 5324static void __cpuinit set_cpu_rq_start_time(void) 5325{ 5326 int cpu = smp_processor_id(); 5327 struct rq *rq = cpu_rq(cpu); 5328 rq->age_stamp = sched_clock_cpu(cpu); 5329} 5330 5331static int sched_cpu_active(struct notifier_block *nfb, 5332 unsigned long action, void *hcpu) 5333{ 5334 switch (action & ~CPU_TASKS_FROZEN) { 5335 case CPU_STARTING: 5336 set_cpu_rq_start_time(); 5337 return NOTIFY_OK; 5338 case CPU_ONLINE: 5339 /* 5340 * At this point a starting CPU has marked itself as online via 5341 * set_cpu_online(). But it might not yet have marked itself 5342 * as active, which is essential from here on. 5343 * 5344 * Thus, fall-through and help the starting CPU along. 5345 */ 5346 case CPU_DOWN_FAILED: 5347 set_cpu_active((long)hcpu, true); 5348 return NOTIFY_OK; 5349 default: 5350 return NOTIFY_DONE; 5351 } 5352} 5353 5354static int sched_cpu_inactive(struct notifier_block *nfb, 5355 unsigned long action, void *hcpu) 5356{ 5357 switch (action & ~CPU_TASKS_FROZEN) { 5358 case CPU_DOWN_PREPARE: 5359 set_cpu_active((long)hcpu, false); 5360 return NOTIFY_OK; 5361 default: 5362 return NOTIFY_DONE; 5363 } 5364} 5365 5366static int __init migration_init(void) 5367{ 5368 void *cpu = (void *)(long)smp_processor_id(); 5369 int err; 5370 5371 /* Initialize migration for the boot CPU */ 5372 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 5373 BUG_ON(err == NOTIFY_BAD); 5374 migration_call(&migration_notifier, CPU_ONLINE, cpu); 5375 register_cpu_notifier(&migration_notifier); 5376 5377 /* Register cpu active notifiers */ 5378 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); 5379 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); 5380 5381 return 0; 5382} 5383early_initcall(migration_init); 5384#endif 5385 5386#ifdef CONFIG_SMP 5387 5388static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ 5389 5390#ifdef CONFIG_SCHED_DEBUG 5391 5392static __read_mostly int sched_debug_enabled; 5393 5394static int __init sched_debug_setup(char *str) 5395{ 5396 sched_debug_enabled = 1; 5397 5398 return 0; 5399} 5400early_param("sched_debug", sched_debug_setup); 5401 5402static inline bool sched_debug(void) 5403{ 5404 return sched_debug_enabled; 5405} 5406 5407static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 5408 struct cpumask *groupmask) 5409{ 5410 struct sched_group *group = sd->groups; 5411 5412 cpumask_clear(groupmask); 5413 5414 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 5415 5416 if (!(sd->flags & SD_LOAD_BALANCE)) { 5417 printk("does not load-balance\n"); 5418 if (sd->parent) 5419 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 5420 " has parent"); 5421 return -1; 5422 } 5423 5424 printk(KERN_CONT "span %*pbl level %s\n", 5425 cpumask_pr_args(sched_domain_span(sd)), sd->name); 5426 5427 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 5428 printk(KERN_ERR "ERROR: domain->span does not contain " 5429 "CPU%d\n", cpu); 5430 } 5431 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { 5432 printk(KERN_ERR "ERROR: domain->groups does not contain" 5433 " CPU%d\n", cpu); 5434 } 5435 5436 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 5437 do { 5438 if (!group) { 5439 printk("\n"); 5440 printk(KERN_ERR "ERROR: group is NULL\n"); 5441 break; 5442 } 5443 5444 if (!cpumask_weight(sched_group_cpus(group))) { 5445 printk(KERN_CONT "\n"); 5446 printk(KERN_ERR "ERROR: empty 
group\n"); 5447 break; 5448 } 5449 5450 if (!(sd->flags & SD_OVERLAP) && 5451 cpumask_intersects(groupmask, sched_group_cpus(group))) { 5452 printk(KERN_CONT "\n"); 5453 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5454 break; 5455 } 5456 5457 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 5458 5459 printk(KERN_CONT " %*pbl", 5460 cpumask_pr_args(sched_group_cpus(group))); 5461 if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { 5462 printk(KERN_CONT " (cpu_capacity = %d)", 5463 group->sgc->capacity); 5464 } 5465 5466 group = group->next; 5467 } while (group != sd->groups); 5468 printk(KERN_CONT "\n"); 5469 5470 if (!cpumask_equal(sched_domain_span(sd), groupmask)) 5471 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 5472 5473 if (sd->parent && 5474 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 5475 printk(KERN_ERR "ERROR: parent span is not a superset " 5476 "of domain->span\n"); 5477 return 0; 5478} 5479 5480static void sched_domain_debug(struct sched_domain *sd, int cpu) 5481{ 5482 int level = 0; 5483 5484 if (!sched_debug_enabled) 5485 return; 5486 5487 if (!sd) { 5488 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 5489 return; 5490 } 5491 5492 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 5493 5494 for (;;) { 5495 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) 5496 break; 5497 level++; 5498 sd = sd->parent; 5499 if (!sd) 5500 break; 5501 } 5502} 5503#else /* !CONFIG_SCHED_DEBUG */ 5504# define sched_domain_debug(sd, cpu) do { } while (0) 5505static inline bool sched_debug(void) 5506{ 5507 return false; 5508} 5509#endif /* CONFIG_SCHED_DEBUG */ 5510 5511static int sd_degenerate(struct sched_domain *sd) 5512{ 5513 if (cpumask_weight(sched_domain_span(sd)) == 1) 5514 return 1; 5515 5516 /* Following flags need at least 2 groups */ 5517 if (sd->flags & (SD_LOAD_BALANCE | 5518 SD_BALANCE_NEWIDLE | 5519 SD_BALANCE_FORK | 5520 SD_BALANCE_EXEC | 5521 SD_SHARE_CPUCAPACITY | 5522 SD_SHARE_PKG_RESOURCES | 5523 SD_SHARE_POWERDOMAIN)) { 5524 if (sd->groups != sd->groups->next) 5525 return 0; 5526 } 5527 5528 /* Following flags don't use groups */ 5529 if (sd->flags & (SD_WAKE_AFFINE)) 5530 return 0; 5531 5532 return 1; 5533} 5534 5535static int 5536sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 5537{ 5538 unsigned long cflags = sd->flags, pflags = parent->flags; 5539 5540 if (sd_degenerate(parent)) 5541 return 1; 5542 5543 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 5544 return 0; 5545 5546 /* Flags needing groups don't count if only 1 group in parent */ 5547 if (parent->groups == parent->groups->next) { 5548 pflags &= ~(SD_LOAD_BALANCE | 5549 SD_BALANCE_NEWIDLE | 5550 SD_BALANCE_FORK | 5551 SD_BALANCE_EXEC | 5552 SD_SHARE_CPUCAPACITY | 5553 SD_SHARE_PKG_RESOURCES | 5554 SD_PREFER_SIBLING | 5555 SD_SHARE_POWERDOMAIN); 5556 if (nr_node_ids == 1) 5557 pflags &= ~SD_SERIALIZE; 5558 } 5559 if (~cflags & pflags) 5560 return 0; 5561 5562 return 1; 5563} 5564 5565static void free_rootdomain(struct rcu_head *rcu) 5566{ 5567 struct root_domain *rd = container_of(rcu, struct root_domain, rcu); 5568 5569 cpupri_cleanup(&rd->cpupri); 5570 cpudl_cleanup(&rd->cpudl); 5571 free_cpumask_var(rd->dlo_mask); 5572 free_cpumask_var(rd->rto_mask); 5573 free_cpumask_var(rd->online); 5574 free_cpumask_var(rd->span); 5575 kfree(rd); 5576} 5577 5578static void rq_attach_root(struct rq *rq, struct root_domain *rd) 5579{ 5580 struct root_domain *old_rd = NULL; 5581 unsigned long flags; 
5582 5583 raw_spin_lock_irqsave(&rq->lock, flags); 5584 5585 if (rq->rd) { 5586 old_rd = rq->rd; 5587 5588 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 5589 set_rq_offline(rq); 5590 5591 cpumask_clear_cpu(rq->cpu, old_rd->span); 5592 5593 /* 5594 * If we dont want to free the old_rd yet then 5595 * set old_rd to NULL to skip the freeing later 5596 * in this function: 5597 */ 5598 if (!atomic_dec_and_test(&old_rd->refcount)) 5599 old_rd = NULL; 5600 } 5601 5602 atomic_inc(&rd->refcount); 5603 rq->rd = rd; 5604 5605 cpumask_set_cpu(rq->cpu, rd->span); 5606 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 5607 set_rq_online(rq); 5608 5609 raw_spin_unlock_irqrestore(&rq->lock, flags); 5610 5611 if (old_rd) 5612 call_rcu_sched(&old_rd->rcu, free_rootdomain); 5613} 5614 5615static int init_rootdomain(struct root_domain *rd) 5616{ 5617 memset(rd, 0, sizeof(*rd)); 5618 5619 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) 5620 goto out; 5621 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 5622 goto free_span; 5623 if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) 5624 goto free_online; 5625 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 5626 goto free_dlo_mask; 5627 5628 init_dl_bw(&rd->dl_bw); 5629 if (cpudl_init(&rd->cpudl) != 0) 5630 goto free_dlo_mask; 5631 5632 if (cpupri_init(&rd->cpupri) != 0) 5633 goto free_rto_mask; 5634 return 0; 5635 5636free_rto_mask: 5637 free_cpumask_var(rd->rto_mask); 5638free_dlo_mask: 5639 free_cpumask_var(rd->dlo_mask); 5640free_online: 5641 free_cpumask_var(rd->online); 5642free_span: 5643 free_cpumask_var(rd->span); 5644out: 5645 return -ENOMEM; 5646} 5647 5648/* 5649 * By default the system creates a single root-domain with all cpus as 5650 * members (mimicking the global state we have today). 5651 */ 5652struct root_domain def_root_domain; 5653 5654static void init_defrootdomain(void) 5655{ 5656 init_rootdomain(&def_root_domain); 5657 5658 atomic_set(&def_root_domain.refcount, 1); 5659} 5660 5661static struct root_domain *alloc_rootdomain(void) 5662{ 5663 struct root_domain *rd; 5664 5665 rd = kmalloc(sizeof(*rd), GFP_KERNEL); 5666 if (!rd) 5667 return NULL; 5668 5669 if (init_rootdomain(rd) != 0) { 5670 kfree(rd); 5671 return NULL; 5672 } 5673 5674 return rd; 5675} 5676 5677static void free_sched_groups(struct sched_group *sg, int free_sgc) 5678{ 5679 struct sched_group *tmp, *first; 5680 5681 if (!sg) 5682 return; 5683 5684 first = sg; 5685 do { 5686 tmp = sg->next; 5687 5688 if (free_sgc && atomic_dec_and_test(&sg->sgc->ref)) 5689 kfree(sg->sgc); 5690 5691 kfree(sg); 5692 sg = tmp; 5693 } while (sg != first); 5694} 5695 5696static void free_sched_domain(struct rcu_head *rcu) 5697{ 5698 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); 5699 5700 /* 5701 * If its an overlapping domain it has private groups, iterate and 5702 * nuke them all. 
5703 */ 5704 if (sd->flags & SD_OVERLAP) { 5705 free_sched_groups(sd->groups, 1); 5706 } else if (atomic_dec_and_test(&sd->groups->ref)) { 5707 kfree(sd->groups->sgc); 5708 kfree(sd->groups); 5709 } 5710 kfree(sd); 5711} 5712 5713static void destroy_sched_domain(struct sched_domain *sd, int cpu) 5714{ 5715 call_rcu(&sd->rcu, free_sched_domain); 5716} 5717 5718static void destroy_sched_domains(struct sched_domain *sd, int cpu) 5719{ 5720 for (; sd; sd = sd->parent) 5721 destroy_sched_domain(sd, cpu); 5722} 5723 5724/* 5725 * Keep a special pointer to the highest sched_domain that has 5726 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this 5727 * allows us to avoid some pointer chasing select_idle_sibling(). 5728 * 5729 * Also keep a unique ID per domain (we use the first cpu number in 5730 * the cpumask of the domain), this allows us to quickly tell if 5731 * two cpus are in the same cache domain, see cpus_share_cache(). 5732 */ 5733DEFINE_PER_CPU(struct sched_domain *, sd_llc); 5734DEFINE_PER_CPU(int, sd_llc_size); 5735DEFINE_PER_CPU(int, sd_llc_id); 5736DEFINE_PER_CPU(struct sched_domain *, sd_numa); 5737DEFINE_PER_CPU(struct sched_domain *, sd_busy); 5738DEFINE_PER_CPU(struct sched_domain *, sd_asym); 5739 5740static void update_top_cache_domain(int cpu) 5741{ 5742 struct sched_domain *sd; 5743 struct sched_domain *busy_sd = NULL; 5744 int id = cpu; 5745 int size = 1; 5746 5747 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); 5748 if (sd) { 5749 id = cpumask_first(sched_domain_span(sd)); 5750 size = cpumask_weight(sched_domain_span(sd)); 5751 busy_sd = sd->parent; /* sd_busy */ 5752 } 5753 rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd); 5754 5755 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 5756 per_cpu(sd_llc_size, cpu) = size; 5757 per_cpu(sd_llc_id, cpu) = id; 5758 5759 sd = lowest_flag_domain(cpu, SD_NUMA); 5760 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); 5761 5762 sd = highest_flag_domain(cpu, SD_ASYM_PACKING); 5763 rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); 5764} 5765 5766/* 5767 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 5768 * hold the hotplug lock. 5769 */ 5770static void 5771cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 5772{ 5773 struct rq *rq = cpu_rq(cpu); 5774 struct sched_domain *tmp; 5775 5776 /* Remove the sched domains which do not contribute to scheduling. */ 5777 for (tmp = sd; tmp; ) { 5778 struct sched_domain *parent = tmp->parent; 5779 if (!parent) 5780 break; 5781 5782 if (sd_parent_degenerate(tmp, parent)) { 5783 tmp->parent = parent->parent; 5784 if (parent->parent) 5785 parent->parent->child = tmp; 5786 /* 5787 * Transfer SD_PREFER_SIBLING down in case of a 5788 * degenerate parent; the spans match for this 5789 * so the property transfers. 
5790 */ 5791 if (parent->flags & SD_PREFER_SIBLING) 5792 tmp->flags |= SD_PREFER_SIBLING; 5793 destroy_sched_domain(parent, cpu); 5794 } else 5795 tmp = tmp->parent; 5796 } 5797 5798 if (sd && sd_degenerate(sd)) { 5799 tmp = sd; 5800 sd = sd->parent; 5801 destroy_sched_domain(tmp, cpu); 5802 if (sd) 5803 sd->child = NULL; 5804 } 5805 5806 sched_domain_debug(sd, cpu); 5807 5808 rq_attach_root(rq, rd); 5809 tmp = rq->sd; 5810 rcu_assign_pointer(rq->sd, sd); 5811 destroy_sched_domains(tmp, cpu); 5812 5813 update_top_cache_domain(cpu); 5814} 5815 5816/* Setup the mask of cpus configured for isolated domains */ 5817static int __init isolated_cpu_setup(char *str) 5818{ 5819 alloc_bootmem_cpumask_var(&cpu_isolated_map); 5820 cpulist_parse(str, cpu_isolated_map); 5821 return 1; 5822} 5823 5824__setup("isolcpus=", isolated_cpu_setup); 5825 5826struct s_data { 5827 struct sched_domain ** __percpu sd; 5828 struct root_domain *rd; 5829}; 5830 5831enum s_alloc { 5832 sa_rootdomain, 5833 sa_sd, 5834 sa_sd_storage, 5835 sa_none, 5836}; 5837 5838/* 5839 * Build an iteration mask that can exclude certain CPUs from the upwards 5840 * domain traversal. 5841 * 5842 * Asymmetric node setups can result in situations where the domain tree is of 5843 * unequal depth, make sure to skip domains that already cover the entire 5844 * range. 5845 * 5846 * In that case build_sched_domains() will have terminated the iteration early 5847 * and our sibling sd spans will be empty. Domains should always include the 5848 * cpu they're built on, so check that. 5849 * 5850 */ 5851static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) 5852{ 5853 const struct cpumask *span = sched_domain_span(sd); 5854 struct sd_data *sdd = sd->private; 5855 struct sched_domain *sibling; 5856 int i; 5857 5858 for_each_cpu(i, span) { 5859 sibling = *per_cpu_ptr(sdd->sd, i); 5860 if (!cpumask_test_cpu(i, sched_domain_span(sibling))) 5861 continue; 5862 5863 cpumask_set_cpu(i, sched_group_mask(sg)); 5864 } 5865} 5866 5867/* 5868 * Return the canonical balance cpu for this group, this is the first cpu 5869 * of this group that's also in the iteration mask. 5870 */ 5871int group_balance_cpu(struct sched_group *sg) 5872{ 5873 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); 5874} 5875 5876static int 5877build_overlap_sched_groups(struct sched_domain *sd, int cpu) 5878{ 5879 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; 5880 const struct cpumask *span = sched_domain_span(sd); 5881 struct cpumask *covered = sched_domains_tmpmask; 5882 struct sd_data *sdd = sd->private; 5883 struct sched_domain *sibling; 5884 int i; 5885 5886 cpumask_clear(covered); 5887 5888 for_each_cpu(i, span) { 5889 struct cpumask *sg_span; 5890 5891 if (cpumask_test_cpu(i, covered)) 5892 continue; 5893 5894 sibling = *per_cpu_ptr(sdd->sd, i); 5895 5896 /* See the comment near build_group_mask(). 
*/ 5897 if (!cpumask_test_cpu(i, sched_domain_span(sibling))) 5898 continue; 5899 5900 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5901 GFP_KERNEL, cpu_to_node(cpu)); 5902 5903 if (!sg) 5904 goto fail; 5905 5906 sg_span = sched_group_cpus(sg); 5907 if (sibling->child) 5908 cpumask_copy(sg_span, sched_domain_span(sibling->child)); 5909 else 5910 cpumask_set_cpu(i, sg_span); 5911 5912 cpumask_or(covered, covered, sg_span); 5913 5914 sg->sgc = *per_cpu_ptr(sdd->sgc, i); 5915 if (atomic_inc_return(&sg->sgc->ref) == 1) 5916 build_group_mask(sd, sg); 5917 5918 /* 5919 * Initialize sgc->capacity such that even if we mess up the 5920 * domains and no possible iteration will get us here, we won't 5921 * die on a /0 trap. 5922 */ 5923 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); 5924 5925 /* 5926 * Make sure the first group of this domain contains the 5927 * canonical balance cpu. Otherwise the sched_domain iteration 5928 * breaks. See update_sg_lb_stats(). 5929 */ 5930 if ((!groups && cpumask_test_cpu(cpu, sg_span)) || 5931 group_balance_cpu(sg) == cpu) 5932 groups = sg; 5933 5934 if (!first) 5935 first = sg; 5936 if (last) 5937 last->next = sg; 5938 last = sg; 5939 last->next = first; 5940 } 5941 sd->groups = groups; 5942 5943 return 0; 5944 5945fail: 5946 free_sched_groups(first, 0); 5947 5948 return -ENOMEM; 5949} 5950 5951static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) 5952{ 5953 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 5954 struct sched_domain *child = sd->child; 5955 5956 if (child) 5957 cpu = cpumask_first(sched_domain_span(child)); 5958 5959 if (sg) { 5960 *sg = *per_cpu_ptr(sdd->sg, cpu); 5961 (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu); 5962 atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */ 5963 } 5964 5965 return cpu; 5966} 5967 5968/* 5969 * build_sched_groups will build a circular linked list of the groups 5970 * covered by the given span, and will set each group's ->cpumask correctly, 5971 * and ->cpu_capacity to 0. 5972 * 5973 * Assumes the sched_domain tree is fully constructed 5974 */ 5975static int 5976build_sched_groups(struct sched_domain *sd, int cpu) 5977{ 5978 struct sched_group *first = NULL, *last = NULL; 5979 struct sd_data *sdd = sd->private; 5980 const struct cpumask *span = sched_domain_span(sd); 5981 struct cpumask *covered; 5982 int i; 5983 5984 get_group(cpu, sdd, &sd->groups); 5985 atomic_inc(&sd->groups->ref); 5986 5987 if (cpu != cpumask_first(span)) 5988 return 0; 5989 5990 lockdep_assert_held(&sched_domains_mutex); 5991 covered = sched_domains_tmpmask; 5992 5993 cpumask_clear(covered); 5994 5995 for_each_cpu(i, span) { 5996 struct sched_group *sg; 5997 int group, j; 5998 5999 if (cpumask_test_cpu(i, covered)) 6000 continue; 6001 6002 group = get_group(i, sdd, &sg); 6003 cpumask_setall(sched_group_mask(sg)); 6004 6005 for_each_cpu(j, span) { 6006 if (get_group(j, sdd, NULL) != group) 6007 continue; 6008 6009 cpumask_set_cpu(j, covered); 6010 cpumask_set_cpu(j, sched_group_cpus(sg)); 6011 } 6012 6013 if (!first) 6014 first = sg; 6015 if (last) 6016 last->next = sg; 6017 last = sg; 6018 } 6019 last->next = first; 6020 6021 return 0; 6022} 6023 6024/* 6025 * Initialize sched groups cpu_capacity. 6026 * 6027 * cpu_capacity indicates the capacity of sched group, which is used while 6028 * distributing the load between different sched groups in a sched domain. 
6029 * Typically cpu_capacity for all the groups in a sched domain will be same 6030 * unless there are asymmetries in the topology. If there are asymmetries, 6031 * group having more cpu_capacity will pickup more load compared to the 6032 * group having less cpu_capacity. 6033 */ 6034static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) 6035{ 6036 struct sched_group *sg = sd->groups; 6037 6038 WARN_ON(!sg); 6039 6040 do { 6041 sg->group_weight = cpumask_weight(sched_group_cpus(sg)); 6042 sg = sg->next; 6043 } while (sg != sd->groups); 6044 6045 if (cpu != group_balance_cpu(sg)) 6046 return; 6047 6048 update_group_capacity(sd, cpu); 6049 atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight); 6050} 6051 6052/* 6053 * Initializers for schedule domains 6054 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 6055 */ 6056 6057static int default_relax_domain_level = -1; 6058int sched_domain_level_max; 6059 6060static int __init setup_relax_domain_level(char *str) 6061{ 6062 if (kstrtoint(str, 0, &default_relax_domain_level)) 6063 pr_warn("Unable to set relax_domain_level\n"); 6064 6065 return 1; 6066} 6067__setup("relax_domain_level=", setup_relax_domain_level); 6068 6069static void set_domain_attribute(struct sched_domain *sd, 6070 struct sched_domain_attr *attr) 6071{ 6072 int request; 6073 6074 if (!attr || attr->relax_domain_level < 0) { 6075 if (default_relax_domain_level < 0) 6076 return; 6077 else 6078 request = default_relax_domain_level; 6079 } else 6080 request = attr->relax_domain_level; 6081 if (request < sd->level) { 6082 /* turn off idle balance on this domain */ 6083 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 6084 } else { 6085 /* turn on idle balance on this domain */ 6086 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 6087 } 6088} 6089 6090static void __sdt_free(const struct cpumask *cpu_map); 6091static int __sdt_alloc(const struct cpumask *cpu_map); 6092 6093static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 6094 const struct cpumask *cpu_map) 6095{ 6096 switch (what) { 6097 case sa_rootdomain: 6098 if (!atomic_read(&d->rd->refcount)) 6099 free_rootdomain(&d->rd->rcu); /* fall through */ 6100 case sa_sd: 6101 free_percpu(d->sd); /* fall through */ 6102 case sa_sd_storage: 6103 __sdt_free(cpu_map); /* fall through */ 6104 case sa_none: 6105 break; 6106 } 6107} 6108 6109static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 6110 const struct cpumask *cpu_map) 6111{ 6112 memset(d, 0, sizeof(*d)); 6113 6114 if (__sdt_alloc(cpu_map)) 6115 return sa_sd_storage; 6116 d->sd = alloc_percpu(struct sched_domain *); 6117 if (!d->sd) 6118 return sa_sd_storage; 6119 d->rd = alloc_rootdomain(); 6120 if (!d->rd) 6121 return sa_sd; 6122 return sa_rootdomain; 6123} 6124 6125/* 6126 * NULL the sd_data elements we've used to build the sched_domain and 6127 * sched_group structure so that the subsequent __free_domain_allocs() 6128 * will not free the data we're using. 
6129 */ 6130static void claim_allocations(int cpu, struct sched_domain *sd) 6131{ 6132 struct sd_data *sdd = sd->private; 6133 6134 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 6135 *per_cpu_ptr(sdd->sd, cpu) = NULL; 6136 6137 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) 6138 *per_cpu_ptr(sdd->sg, cpu) = NULL; 6139 6140 if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref)) 6141 *per_cpu_ptr(sdd->sgc, cpu) = NULL; 6142} 6143 6144#ifdef CONFIG_NUMA 6145static int sched_domains_numa_levels; 6146enum numa_topology_type sched_numa_topology_type; 6147static int *sched_domains_numa_distance; 6148int sched_max_numa_distance; 6149static struct cpumask ***sched_domains_numa_masks; 6150static int sched_domains_curr_level; 6151#endif 6152 6153/* 6154 * SD_flags allowed in topology descriptions. 6155 * 6156 * SD_SHARE_CPUCAPACITY - describes SMT topologies 6157 * SD_SHARE_PKG_RESOURCES - describes shared caches 6158 * SD_NUMA - describes NUMA topologies 6159 * SD_SHARE_POWERDOMAIN - describes shared power domain 6160 * 6161 * Odd one out: 6162 * SD_ASYM_PACKING - describes SMT quirks 6163 */ 6164#define TOPOLOGY_SD_FLAGS \ 6165 (SD_SHARE_CPUCAPACITY | \ 6166 SD_SHARE_PKG_RESOURCES | \ 6167 SD_NUMA | \ 6168 SD_ASYM_PACKING | \ 6169 SD_SHARE_POWERDOMAIN) 6170 6171static struct sched_domain * 6172sd_init(struct sched_domain_topology_level *tl, int cpu) 6173{ 6174 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); 6175 int sd_weight, sd_flags = 0; 6176 6177#ifdef CONFIG_NUMA 6178 /* 6179 * Ugly hack to pass state to sd_numa_mask()... 6180 */ 6181 sched_domains_curr_level = tl->numa_level; 6182#endif 6183 6184 sd_weight = cpumask_weight(tl->mask(cpu)); 6185 6186 if (tl->sd_flags) 6187 sd_flags = (*tl->sd_flags)(); 6188 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, 6189 "wrong sd_flags in topology description\n")) 6190 sd_flags &= ~TOPOLOGY_SD_FLAGS; 6191 6192 *sd = (struct sched_domain){ 6193 .min_interval = sd_weight, 6194 .max_interval = 2*sd_weight, 6195 .busy_factor = 32, 6196 .imbalance_pct = 125, 6197 6198 .cache_nice_tries = 0, 6199 .busy_idx = 0, 6200 .idle_idx = 0, 6201 .newidle_idx = 0, 6202 .wake_idx = 0, 6203 .forkexec_idx = 0, 6204 6205 .flags = 1*SD_LOAD_BALANCE 6206 | 1*SD_BALANCE_NEWIDLE 6207 | 1*SD_BALANCE_EXEC 6208 | 1*SD_BALANCE_FORK 6209 | 0*SD_BALANCE_WAKE 6210 | 1*SD_WAKE_AFFINE 6211 | 0*SD_SHARE_CPUCAPACITY 6212 | 0*SD_SHARE_PKG_RESOURCES 6213 | 0*SD_SERIALIZE 6214 | 0*SD_PREFER_SIBLING 6215 | 0*SD_NUMA 6216 | sd_flags 6217 , 6218 6219 .last_balance = jiffies, 6220 .balance_interval = sd_weight, 6221 .smt_gain = 0, 6222 .max_newidle_lb_cost = 0, 6223 .next_decay_max_lb_cost = jiffies, 6224#ifdef CONFIG_SCHED_DEBUG 6225 .name = tl->name, 6226#endif 6227 }; 6228 6229 /* 6230 * Convert topological properties into behaviour. 
6231 */ 6232 6233 if (sd->flags & SD_SHARE_CPUCAPACITY) { 6234 sd->flags |= SD_PREFER_SIBLING; 6235 sd->imbalance_pct = 110; 6236 sd->smt_gain = 1178; /* ~15% */ 6237 6238 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { 6239 sd->imbalance_pct = 117; 6240 sd->cache_nice_tries = 1; 6241 sd->busy_idx = 2; 6242 6243#ifdef CONFIG_NUMA 6244 } else if (sd->flags & SD_NUMA) { 6245 sd->cache_nice_tries = 2; 6246 sd->busy_idx = 3; 6247 sd->idle_idx = 2; 6248 6249 sd->flags |= SD_SERIALIZE; 6250 if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { 6251 sd->flags &= ~(SD_BALANCE_EXEC | 6252 SD_BALANCE_FORK | 6253 SD_WAKE_AFFINE); 6254 } 6255 6256#endif 6257 } else { 6258 sd->flags |= SD_PREFER_SIBLING; 6259 sd->cache_nice_tries = 1; 6260 sd->busy_idx = 2; 6261 sd->idle_idx = 1; 6262 } 6263 6264 sd->private = &tl->data; 6265 6266 return sd; 6267} 6268 6269/* 6270 * Topology list, bottom-up. 6271 */ 6272static struct sched_domain_topology_level default_topology[] = { 6273#ifdef CONFIG_SCHED_SMT 6274 { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, 6275#endif 6276#ifdef CONFIG_SCHED_MC 6277 { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, 6278#endif 6279 { cpu_cpu_mask, SD_INIT_NAME(DIE) }, 6280 { NULL, }, 6281}; 6282 6283struct sched_domain_topology_level *sched_domain_topology = default_topology; 6284 6285#define for_each_sd_topology(tl) \ 6286 for (tl = sched_domain_topology; tl->mask; tl++) 6287 6288void set_sched_topology(struct sched_domain_topology_level *tl) 6289{ 6290 sched_domain_topology = tl; 6291} 6292 6293#ifdef CONFIG_NUMA 6294 6295static const struct cpumask *sd_numa_mask(int cpu) 6296{ 6297 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 6298} 6299 6300static void sched_numa_warn(const char *str) 6301{ 6302 static int done = false; 6303 int i,j; 6304 6305 if (done) 6306 return; 6307 6308 done = true; 6309 6310 printk(KERN_WARNING "ERROR: %s\n\n", str); 6311 6312 for (i = 0; i < nr_node_ids; i++) { 6313 printk(KERN_WARNING " "); 6314 for (j = 0; j < nr_node_ids; j++) 6315 printk(KERN_CONT "%02d ", node_distance(i,j)); 6316 printk(KERN_CONT "\n"); 6317 } 6318 printk(KERN_WARNING "\n"); 6319} 6320 6321bool find_numa_distance(int distance) 6322{ 6323 int i; 6324 6325 if (distance == node_distance(0, 0)) 6326 return true; 6327 6328 for (i = 0; i < sched_domains_numa_levels; i++) { 6329 if (sched_domains_numa_distance[i] == distance) 6330 return true; 6331 } 6332 6333 return false; 6334} 6335 6336/* 6337 * A system can have three types of NUMA topology: 6338 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system 6339 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes 6340 * NUMA_BACKPLANE: nodes can reach other nodes through a backplane 6341 * 6342 * The difference between a glueless mesh topology and a backplane 6343 * topology lies in whether communication between not directly 6344 * connected nodes goes through intermediary nodes (where programs 6345 * could run), or through backplane controllers. This affects 6346 * placement of programs. 6347 * 6348 * The type of topology can be discerned with the following tests: 6349 * - If the maximum distance between any nodes is 1 hop, the system 6350 * is directly connected. 6351 * - If for two nodes A and B, located N > 1 hops away from each other, 6352 * there is an intermediary node C, which is < N hops away from both 6353 * nodes A and B, the system is a glueless mesh. 
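 *
 * Illustrative example (hypothetical SLIT distances, not taken from real
 * firmware):
 *
 *	node   0   1   2   3
 *	  0:  10  20  20  30
 *	  1:  20  10  20  20
 *	  2:  20  20  10  20
 *	  3:  30  20  20  10
 *
 * Nodes 0 and 3 are the furthest apart, but node 1 (or 2) is closer to
 * both of them, so this is classified as a glueless mesh; without such an
 * intermediary it would be classified as a backplane topology.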
6354 */ 6355static void init_numa_topology_type(void) 6356{ 6357 int a, b, c, n; 6358 6359 n = sched_max_numa_distance; 6360 6361 if (n <= 1) 6362 sched_numa_topology_type = NUMA_DIRECT; 6363 6364 for_each_online_node(a) { 6365 for_each_online_node(b) { 6366 /* Find two nodes furthest removed from each other. */ 6367 if (node_distance(a, b) < n) 6368 continue; 6369 6370 /* Is there an intermediary node between a and b? */ 6371 for_each_online_node(c) { 6372 if (node_distance(a, c) < n && 6373 node_distance(b, c) < n) { 6374 sched_numa_topology_type = 6375 NUMA_GLUELESS_MESH; 6376 return; 6377 } 6378 } 6379 6380 sched_numa_topology_type = NUMA_BACKPLANE; 6381 return; 6382 } 6383 } 6384} 6385 6386static void sched_init_numa(void) 6387{ 6388 int next_distance, curr_distance = node_distance(0, 0); 6389 struct sched_domain_topology_level *tl; 6390 int level = 0; 6391 int i, j, k; 6392 6393 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); 6394 if (!sched_domains_numa_distance) 6395 return; 6396 6397 /* 6398 * O(nr_nodes^2) deduplicating selection sort -- in order to find the 6399 * unique distances in the node_distance() table. 6400 * 6401 * Assumes node_distance(0,j) includes all distances in 6402 * node_distance(i,j) in order to avoid cubic time. 6403 */ 6404 next_distance = curr_distance; 6405 for (i = 0; i < nr_node_ids; i++) { 6406 for (j = 0; j < nr_node_ids; j++) { 6407 for (k = 0; k < nr_node_ids; k++) { 6408 int distance = node_distance(i, k); 6409 6410 if (distance > curr_distance && 6411 (distance < next_distance || 6412 next_distance == curr_distance)) 6413 next_distance = distance; 6414 6415 /* 6416 * While not a strong assumption it would be nice to know 6417 * about cases where if node A is connected to B, B is not 6418 * equally connected to A. 6419 */ 6420 if (sched_debug() && node_distance(k, i) != distance) 6421 sched_numa_warn("Node-distance not symmetric"); 6422 6423 if (sched_debug() && i && !find_numa_distance(distance)) 6424 sched_numa_warn("Node-0 not representative"); 6425 } 6426 if (next_distance != curr_distance) { 6427 sched_domains_numa_distance[level++] = next_distance; 6428 sched_domains_numa_levels = level; 6429 curr_distance = next_distance; 6430 } else break; 6431 } 6432 6433 /* 6434 * In case of sched_debug() we verify the above assumption. 6435 */ 6436 if (!sched_debug()) 6437 break; 6438 } 6439 6440 if (!level) 6441 return; 6442 6443 /* 6444 * 'level' contains the number of unique distances, excluding the 6445 * identity distance node_distance(i,i). 6446 * 6447 * The sched_domains_numa_distance[] array includes the actual distance 6448 * numbers. 6449 */ 6450 6451 /* 6452 * Here, we should temporarily reset sched_domains_numa_levels to 0. 6453 * If it fails to allocate memory for array sched_domains_numa_masks[][], 6454 * the array will contain less then 'level' members. This could be 6455 * dangerous when we use it to iterate array sched_domains_numa_masks[][] 6456 * in other functions. 6457 * 6458 * We reset it to 'level' at the end of this function. 6459 */ 6460 sched_domains_numa_levels = 0; 6461 6462 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); 6463 if (!sched_domains_numa_masks) 6464 return; 6465 6466 /* 6467 * Now for each level, construct a mask per node which contains all 6468 * cpus of nodes that are that many hops away from us. 
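 *
 * Continuing the hypothetical distance table sketched earlier (whose first
 * row already contains every distance, as the deduplication above assumes):
 * the two non-local distances {20, 30} become levels 0 and 1, the level-0
 * (distance 20) mask for node 0 covers the cpus of nodes 0, 1 and 2, and
 * the level-1 (distance 30) mask covers all four nodes.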
6469 */ 6470 for (i = 0; i < level; i++) { 6471 sched_domains_numa_masks[i] = 6472 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); 6473 if (!sched_domains_numa_masks[i]) 6474 return; 6475 6476 for (j = 0; j < nr_node_ids; j++) { 6477 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); 6478 if (!mask) 6479 return; 6480 6481 sched_domains_numa_masks[i][j] = mask; 6482 6483 for_each_node(k) { 6484 if (node_distance(j, k) > sched_domains_numa_distance[i]) 6485 continue; 6486 6487 cpumask_or(mask, mask, cpumask_of_node(k)); 6488 } 6489 } 6490 } 6491 6492 /* Compute default topology size */ 6493 for (i = 0; sched_domain_topology[i].mask; i++); 6494 6495 tl = kzalloc((i + level + 1) * 6496 sizeof(struct sched_domain_topology_level), GFP_KERNEL); 6497 if (!tl) 6498 return; 6499 6500 /* 6501 * Copy the default topology bits.. 6502 */ 6503 for (i = 0; sched_domain_topology[i].mask; i++) 6504 tl[i] = sched_domain_topology[i]; 6505 6506 /* 6507 * .. and append 'j' levels of NUMA goodness. 6508 */ 6509 for (j = 0; j < level; i++, j++) { 6510 tl[i] = (struct sched_domain_topology_level){ 6511 .mask = sd_numa_mask, 6512 .sd_flags = cpu_numa_flags, 6513 .flags = SDTL_OVERLAP, 6514 .numa_level = j, 6515 SD_INIT_NAME(NUMA) 6516 }; 6517 } 6518 6519 sched_domain_topology = tl; 6520 6521 sched_domains_numa_levels = level; 6522 sched_max_numa_distance = sched_domains_numa_distance[level - 1]; 6523 6524 init_numa_topology_type(); 6525} 6526 6527static void sched_domains_numa_masks_set(int cpu) 6528{ 6529 int i, j; 6530 int node = cpu_to_node(cpu); 6531 6532 for (i = 0; i < sched_domains_numa_levels; i++) { 6533 for (j = 0; j < nr_node_ids; j++) { 6534 if (node_distance(j, node) <= sched_domains_numa_distance[i]) 6535 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); 6536 } 6537 } 6538} 6539 6540static void sched_domains_numa_masks_clear(int cpu) 6541{ 6542 int i, j; 6543 for (i = 0; i < sched_domains_numa_levels; i++) { 6544 for (j = 0; j < nr_node_ids; j++) 6545 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); 6546 } 6547} 6548 6549/* 6550 * Update sched_domains_numa_masks[level][node] array when new cpus 6551 * are onlined. 
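 *
 * For example, when a cpu on node 1 comes online it is added to every
 * sched_domains_numa_masks[level][j] whose node j lies within that level's
 * distance of node 1, and it is cleared from those masks again when it
 * goes down.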
6552 */ 6553static int sched_domains_numa_masks_update(struct notifier_block *nfb, 6554 unsigned long action, 6555 void *hcpu) 6556{ 6557 int cpu = (long)hcpu; 6558 6559 switch (action & ~CPU_TASKS_FROZEN) { 6560 case CPU_ONLINE: 6561 sched_domains_numa_masks_set(cpu); 6562 break; 6563 6564 case CPU_DEAD: 6565 sched_domains_numa_masks_clear(cpu); 6566 break; 6567 6568 default: 6569 return NOTIFY_DONE; 6570 } 6571 6572 return NOTIFY_OK; 6573} 6574#else 6575static inline void sched_init_numa(void) 6576{ 6577} 6578 6579static int sched_domains_numa_masks_update(struct notifier_block *nfb, 6580 unsigned long action, 6581 void *hcpu) 6582{ 6583 return 0; 6584} 6585#endif /* CONFIG_NUMA */ 6586 6587static int __sdt_alloc(const struct cpumask *cpu_map) 6588{ 6589 struct sched_domain_topology_level *tl; 6590 int j; 6591 6592 for_each_sd_topology(tl) { 6593 struct sd_data *sdd = &tl->data; 6594 6595 sdd->sd = alloc_percpu(struct sched_domain *); 6596 if (!sdd->sd) 6597 return -ENOMEM; 6598 6599 sdd->sg = alloc_percpu(struct sched_group *); 6600 if (!sdd->sg) 6601 return -ENOMEM; 6602 6603 sdd->sgc = alloc_percpu(struct sched_group_capacity *); 6604 if (!sdd->sgc) 6605 return -ENOMEM; 6606 6607 for_each_cpu(j, cpu_map) { 6608 struct sched_domain *sd; 6609 struct sched_group *sg; 6610 struct sched_group_capacity *sgc; 6611 6612 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), 6613 GFP_KERNEL, cpu_to_node(j)); 6614 if (!sd) 6615 return -ENOMEM; 6616 6617 *per_cpu_ptr(sdd->sd, j) = sd; 6618 6619 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6620 GFP_KERNEL, cpu_to_node(j)); 6621 if (!sg) 6622 return -ENOMEM; 6623 6624 sg->next = sg; 6625 6626 *per_cpu_ptr(sdd->sg, j) = sg; 6627 6628 sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(), 6629 GFP_KERNEL, cpu_to_node(j)); 6630 if (!sgc) 6631 return -ENOMEM; 6632 6633 *per_cpu_ptr(sdd->sgc, j) = sgc; 6634 } 6635 } 6636 6637 return 0; 6638} 6639 6640static void __sdt_free(const struct cpumask *cpu_map) 6641{ 6642 struct sched_domain_topology_level *tl; 6643 int j; 6644 6645 for_each_sd_topology(tl) { 6646 struct sd_data *sdd = &tl->data; 6647 6648 for_each_cpu(j, cpu_map) { 6649 struct sched_domain *sd; 6650 6651 if (sdd->sd) { 6652 sd = *per_cpu_ptr(sdd->sd, j); 6653 if (sd && (sd->flags & SD_OVERLAP)) 6654 free_sched_groups(sd->groups, 0); 6655 kfree(*per_cpu_ptr(sdd->sd, j)); 6656 } 6657 6658 if (sdd->sg) 6659 kfree(*per_cpu_ptr(sdd->sg, j)); 6660 if (sdd->sgc) 6661 kfree(*per_cpu_ptr(sdd->sgc, j)); 6662 } 6663 free_percpu(sdd->sd); 6664 sdd->sd = NULL; 6665 free_percpu(sdd->sg); 6666 sdd->sg = NULL; 6667 free_percpu(sdd->sgc); 6668 sdd->sgc = NULL; 6669 } 6670} 6671 6672struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, 6673 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6674 struct sched_domain *child, int cpu) 6675{ 6676 struct sched_domain *sd = sd_init(tl, cpu); 6677 if (!sd) 6678 return child; 6679 6680 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 6681 if (child) { 6682 sd->level = child->level + 1; 6683 sched_domain_level_max = max(sched_domain_level_max, sd->level); 6684 child->parent = sd; 6685 sd->child = child; 6686 6687 if (!cpumask_subset(sched_domain_span(child), 6688 sched_domain_span(sd))) { 6689 pr_err("BUG: arch topology borken\n"); 6690#ifdef CONFIG_SCHED_DEBUG 6691 pr_err(" the %s domain not a subset of the %s domain\n", 6692 child->name, sd->name); 6693#endif 6694 /* Fixup, ensure @sd has at least @child cpus. 
*/ 6695 cpumask_or(sched_domain_span(sd), 6696 sched_domain_span(sd), 6697 sched_domain_span(child)); 6698 } 6699 6700 } 6701 set_domain_attribute(sd, attr); 6702 6703 return sd; 6704} 6705 6706/* 6707 * Build sched domains for a given set of cpus and attach the sched domains 6708 * to the individual cpus 6709 */ 6710static int build_sched_domains(const struct cpumask *cpu_map, 6711 struct sched_domain_attr *attr) 6712{ 6713 enum s_alloc alloc_state; 6714 struct sched_domain *sd; 6715 struct s_data d; 6716 int i, ret = -ENOMEM; 6717 6718 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 6719 if (alloc_state != sa_rootdomain) 6720 goto error; 6721 6722 /* Set up domains for cpus specified by the cpu_map. */ 6723 for_each_cpu(i, cpu_map) { 6724 struct sched_domain_topology_level *tl; 6725 6726 sd = NULL; 6727 for_each_sd_topology(tl) { 6728 sd = build_sched_domain(tl, cpu_map, attr, sd, i); 6729 if (tl == sched_domain_topology) 6730 *per_cpu_ptr(d.sd, i) = sd; 6731 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) 6732 sd->flags |= SD_OVERLAP; 6733 if (cpumask_equal(cpu_map, sched_domain_span(sd))) 6734 break; 6735 } 6736 } 6737 6738 /* Build the groups for the domains */ 6739 for_each_cpu(i, cpu_map) { 6740 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6741 sd->span_weight = cpumask_weight(sched_domain_span(sd)); 6742 if (sd->flags & SD_OVERLAP) { 6743 if (build_overlap_sched_groups(sd, i)) 6744 goto error; 6745 } else { 6746 if (build_sched_groups(sd, i)) 6747 goto error; 6748 } 6749 } 6750 } 6751 6752 /* Calculate CPU capacity for physical packages and nodes */ 6753 for (i = nr_cpumask_bits-1; i >= 0; i--) { 6754 if (!cpumask_test_cpu(i, cpu_map)) 6755 continue; 6756 6757 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6758 claim_allocations(i, sd); 6759 init_sched_groups_capacity(i, sd); 6760 } 6761 } 6762 6763 /* Attach the domains */ 6764 rcu_read_lock(); 6765 for_each_cpu(i, cpu_map) { 6766 sd = *per_cpu_ptr(d.sd, i); 6767 cpu_attach_domain(sd, d.rd, i); 6768 } 6769 rcu_read_unlock(); 6770 6771 ret = 0; 6772error: 6773 __free_domain_allocs(&d, alloc_state, cpu_map); 6774 return ret; 6775} 6776 6777static cpumask_var_t *doms_cur; /* current sched domains */ 6778static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 6779static struct sched_domain_attr *dattr_cur; 6780 /* attribues of custom domains in 'doms_cur' */ 6781 6782/* 6783 * Special case: If a kmalloc of a doms_cur partition (array of 6784 * cpumask) fails, then fallback to a single sched domain, 6785 * as determined by the single cpumask fallback_doms. 6786 */ 6787static cpumask_var_t fallback_doms; 6788 6789/* 6790 * arch_update_cpu_topology lets virtualized architectures update the 6791 * cpu core maps. It is supposed to return 1 if the topology changed 6792 * or 0 if it stayed the same. 
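 *
 * A minimal sketch of such an override (my_arch_rescan_core_maps() is a
 * made-up placeholder, not a real kernel symbol):
 *
 *	int arch_update_cpu_topology(void)
 *	{
 *		// rescan firmware/hypervisor provided core maps
 *		return my_arch_rescan_core_maps() ? 1 : 0;
 *	}
 *
 * s390, for instance, provides a real override along these lines.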
6793 */ 6794int __weak arch_update_cpu_topology(void) 6795{ 6796 return 0; 6797} 6798 6799cpumask_var_t *alloc_sched_domains(unsigned int ndoms) 6800{ 6801 int i; 6802 cpumask_var_t *doms; 6803 6804 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); 6805 if (!doms) 6806 return NULL; 6807 for (i = 0; i < ndoms; i++) { 6808 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { 6809 free_sched_domains(doms, i); 6810 return NULL; 6811 } 6812 } 6813 return doms; 6814} 6815 6816void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) 6817{ 6818 unsigned int i; 6819 for (i = 0; i < ndoms; i++) 6820 free_cpumask_var(doms[i]); 6821 kfree(doms); 6822} 6823 6824/* 6825 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6826 * For now this just excludes isolated cpus, but could be used to 6827 * exclude other special cases in the future. 6828 */ 6829static int init_sched_domains(const struct cpumask *cpu_map) 6830{ 6831 int err; 6832 6833 arch_update_cpu_topology(); 6834 ndoms_cur = 1; 6835 doms_cur = alloc_sched_domains(ndoms_cur); 6836 if (!doms_cur) 6837 doms_cur = &fallback_doms; 6838 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 6839 err = build_sched_domains(doms_cur[0], NULL); 6840 register_sched_domain_sysctl(); 6841 6842 return err; 6843} 6844 6845/* 6846 * Detach sched domains from a group of cpus specified in cpu_map 6847 * These cpus will now be attached to the NULL domain 6848 */ 6849static void detach_destroy_domains(const struct cpumask *cpu_map) 6850{ 6851 int i; 6852 6853 rcu_read_lock(); 6854 for_each_cpu(i, cpu_map) 6855 cpu_attach_domain(NULL, &def_root_domain, i); 6856 rcu_read_unlock(); 6857} 6858 6859/* handle null as "default" */ 6860static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, 6861 struct sched_domain_attr *new, int idx_new) 6862{ 6863 struct sched_domain_attr tmp; 6864 6865 /* fast path */ 6866 if (!new && !cur) 6867 return 1; 6868 6869 tmp = SD_ATTR_INIT; 6870 return !memcmp(cur ? (cur + idx_cur) : &tmp, 6871 new ? (new + idx_new) : &tmp, 6872 sizeof(struct sched_domain_attr)); 6873} 6874 6875/* 6876 * Partition sched domains as specified by the 'ndoms_new' 6877 * cpumasks in the array doms_new[] of cpumasks. This compares 6878 * doms_new[] to the current sched domain partitioning, doms_cur[]. 6879 * It destroys each deleted domain and builds each new domain. 6880 * 6881 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. 6882 * The masks don't intersect (don't overlap.) We should setup one 6883 * sched domain for each mask. CPUs not in any of the cpumasks will 6884 * not be load balanced. If the same cpumask appears both in the 6885 * current 'doms_cur' domains and in the new 'doms_new', we can leave 6886 * it as it is. 6887 * 6888 * The passed in 'doms_new' should be allocated using 6889 * alloc_sched_domains. This routine takes ownership of it and will 6890 * free_sched_domains it when done with it. If the caller failed the 6891 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, 6892 * and partition_sched_domains() will fallback to the single partition 6893 * 'fallback_doms', it also forces the domains to be rebuilt. 6894 * 6895 * If doms_new == NULL it will be replaced with cpu_online_mask. 6896 * ndoms_new == 0 is a special case for destroying existing domains, 6897 * and it will not create the default domain. 
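 *
 * A minimal caller sketch (illustrative only; assumes the hotplug lock is
 * held and that 'mask_a' and 'mask_b' are disjoint cpumasks prepared by
 * the caller):
 *
 *	cpumask_var_t *doms = alloc_sched_domains(2);
 *
 *	if (!doms) {
 *		partition_sched_domains(1, NULL, NULL);	// single fallback domain
 *	} else {
 *		cpumask_copy(doms[0], mask_a);
 *		cpumask_copy(doms[1], mask_b);
 *		partition_sched_domains(2, doms, NULL);	// takes ownership of doms
 *	}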
6898 * 6899 * Call with hotplug lock held 6900 */ 6901void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 6902 struct sched_domain_attr *dattr_new) 6903{ 6904 int i, j, n; 6905 int new_topology; 6906 6907 mutex_lock(&sched_domains_mutex); 6908 6909 /* always unregister in case we don't destroy any domains */ 6910 unregister_sched_domain_sysctl(); 6911 6912 /* Let architecture update cpu core mappings. */ 6913 new_topology = arch_update_cpu_topology(); 6914 6915 n = doms_new ? ndoms_new : 0; 6916 6917 /* Destroy deleted domains */ 6918 for (i = 0; i < ndoms_cur; i++) { 6919 for (j = 0; j < n && !new_topology; j++) { 6920 if (cpumask_equal(doms_cur[i], doms_new[j]) 6921 && dattrs_equal(dattr_cur, i, dattr_new, j)) 6922 goto match1; 6923 } 6924 /* no match - a current sched domain not in new doms_new[] */ 6925 detach_destroy_domains(doms_cur[i]); 6926match1: 6927 ; 6928 } 6929 6930 n = ndoms_cur; 6931 if (doms_new == NULL) { 6932 n = 0; 6933 doms_new = &fallback_doms; 6934 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 6935 WARN_ON_ONCE(dattr_new); 6936 } 6937 6938 /* Build new domains */ 6939 for (i = 0; i < ndoms_new; i++) { 6940 for (j = 0; j < n && !new_topology; j++) { 6941 if (cpumask_equal(doms_new[i], doms_cur[j]) 6942 && dattrs_equal(dattr_new, i, dattr_cur, j)) 6943 goto match2; 6944 } 6945 /* no match - add a new doms_new */ 6946 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); 6947match2: 6948 ; 6949 } 6950 6951 /* Remember the new sched domains */ 6952 if (doms_cur != &fallback_doms) 6953 free_sched_domains(doms_cur, ndoms_cur); 6954 kfree(dattr_cur); /* kfree(NULL) is safe */ 6955 doms_cur = doms_new; 6956 dattr_cur = dattr_new; 6957 ndoms_cur = ndoms_new; 6958 6959 register_sched_domain_sysctl(); 6960 6961 mutex_unlock(&sched_domains_mutex); 6962} 6963 6964static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ 6965 6966/* 6967 * Update cpusets according to cpu_active mask. If cpusets are 6968 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 6969 * around partition_sched_domains(). 6970 * 6971 * If we come here as part of a suspend/resume, don't touch cpusets because we 6972 * want to restore it back to its original state upon resume anyway. 6973 */ 6974static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, 6975 void *hcpu) 6976{ 6977 switch (action) { 6978 case CPU_ONLINE_FROZEN: 6979 case CPU_DOWN_FAILED_FROZEN: 6980 6981 /* 6982 * num_cpus_frozen tracks how many CPUs are involved in suspend 6983 * resume sequence. As long as this is not the last online 6984 * operation in the resume sequence, just build a single sched 6985 * domain, ignoring cpusets. 6986 */ 6987 num_cpus_frozen--; 6988 if (likely(num_cpus_frozen)) { 6989 partition_sched_domains(1, NULL, NULL); 6990 break; 6991 } 6992 6993 /* 6994 * This is the last CPU online operation. So fall through and 6995 * restore the original sched domains by considering the 6996 * cpuset configurations. 
6997 */ 6998 6999 case CPU_ONLINE: 7000 cpuset_update_active_cpus(true); 7001 break; 7002 default: 7003 return NOTIFY_DONE; 7004 } 7005 return NOTIFY_OK; 7006} 7007 7008static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, 7009 void *hcpu) 7010{ 7011 unsigned long flags; 7012 long cpu = (long)hcpu; 7013 struct dl_bw *dl_b; 7014 bool overflow; 7015 int cpus; 7016 7017 switch (action) { 7018 case CPU_DOWN_PREPARE: 7019 rcu_read_lock_sched(); 7020 dl_b = dl_bw_of(cpu); 7021 7022 raw_spin_lock_irqsave(&dl_b->lock, flags); 7023 cpus = dl_bw_cpus(cpu); 7024 overflow = __dl_overflow(dl_b, cpus, 0, 0); 7025 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7026 7027 rcu_read_unlock_sched(); 7028 7029 if (overflow) 7030 return notifier_from_errno(-EBUSY); 7031 cpuset_update_active_cpus(false); 7032 break; 7033 case CPU_DOWN_PREPARE_FROZEN: 7034 num_cpus_frozen++; 7035 partition_sched_domains(1, NULL, NULL); 7036 break; 7037 default: 7038 return NOTIFY_DONE; 7039 } 7040 return NOTIFY_OK; 7041} 7042 7043void __init sched_init_smp(void) 7044{ 7045 cpumask_var_t non_isolated_cpus; 7046 7047 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7048 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7049 7050 sched_init_numa(); 7051 7052 /* 7053 * There's no userspace yet to cause hotplug operations; hence all the 7054 * cpu masks are stable and all blatant races in the below code cannot 7055 * happen. 7056 */ 7057 mutex_lock(&sched_domains_mutex); 7058 init_sched_domains(cpu_active_mask); 7059 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7060 if (cpumask_empty(non_isolated_cpus)) 7061 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7062 mutex_unlock(&sched_domains_mutex); 7063 7064 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); 7065 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 7066 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 7067 7068 init_hrtick(); 7069 7070 /* Move init over to a non-isolated CPU */ 7071 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) 7072 BUG(); 7073 sched_init_granularity(); 7074 free_cpumask_var(non_isolated_cpus); 7075 7076 init_sched_rt_class(); 7077 init_sched_dl_class(); 7078} 7079#else 7080void __init sched_init_smp(void) 7081{ 7082 sched_init_granularity(); 7083} 7084#endif /* CONFIG_SMP */ 7085 7086const_debug unsigned int sysctl_timer_migration = 1; 7087 7088int in_sched_functions(unsigned long addr) 7089{ 7090 return in_lock_functions(addr) || 7091 (addr >= (unsigned long)__sched_text_start 7092 && addr < (unsigned long)__sched_text_end); 7093} 7094 7095#ifdef CONFIG_CGROUP_SCHED 7096/* 7097 * Default task group. 7098 * Every task in system belongs to this group at bootup. 
7099 */ 7100struct task_group root_task_group; 7101LIST_HEAD(task_groups); 7102#endif 7103 7104DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); 7105 7106void __init sched_init(void) 7107{ 7108 int i, j; 7109 unsigned long alloc_size = 0, ptr; 7110 7111#ifdef CONFIG_FAIR_GROUP_SCHED 7112 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7113#endif 7114#ifdef CONFIG_RT_GROUP_SCHED 7115 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7116#endif 7117 if (alloc_size) { 7118 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 7119 7120#ifdef CONFIG_FAIR_GROUP_SCHED 7121 root_task_group.se = (struct sched_entity **)ptr; 7122 ptr += nr_cpu_ids * sizeof(void **); 7123 7124 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 7125 ptr += nr_cpu_ids * sizeof(void **); 7126 7127#endif /* CONFIG_FAIR_GROUP_SCHED */ 7128#ifdef CONFIG_RT_GROUP_SCHED 7129 root_task_group.rt_se = (struct sched_rt_entity **)ptr; 7130 ptr += nr_cpu_ids * sizeof(void **); 7131 7132 root_task_group.rt_rq = (struct rt_rq **)ptr; 7133 ptr += nr_cpu_ids * sizeof(void **); 7134 7135#endif /* CONFIG_RT_GROUP_SCHED */ 7136 } 7137#ifdef CONFIG_CPUMASK_OFFSTACK 7138 for_each_possible_cpu(i) { 7139 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( 7140 cpumask_size(), GFP_KERNEL, cpu_to_node(i)); 7141 } 7142#endif /* CONFIG_CPUMASK_OFFSTACK */ 7143 7144 init_rt_bandwidth(&def_rt_bandwidth, 7145 global_rt_period(), global_rt_runtime()); 7146 init_dl_bandwidth(&def_dl_bandwidth, 7147 global_rt_period(), global_rt_runtime()); 7148 7149#ifdef CONFIG_SMP 7150 init_defrootdomain(); 7151#endif 7152 7153#ifdef CONFIG_RT_GROUP_SCHED 7154 init_rt_bandwidth(&root_task_group.rt_bandwidth, 7155 global_rt_period(), global_rt_runtime()); 7156#endif /* CONFIG_RT_GROUP_SCHED */ 7157 7158#ifdef CONFIG_CGROUP_SCHED 7159 list_add(&root_task_group.list, &task_groups); 7160 INIT_LIST_HEAD(&root_task_group.children); 7161 INIT_LIST_HEAD(&root_task_group.siblings); 7162 autogroup_init(&init_task); 7163 7164#endif /* CONFIG_CGROUP_SCHED */ 7165 7166 for_each_possible_cpu(i) { 7167 struct rq *rq; 7168 7169 rq = cpu_rq(i); 7170 raw_spin_lock_init(&rq->lock); 7171 rq->nr_running = 0; 7172 rq->calc_load_active = 0; 7173 rq->calc_load_update = jiffies + LOAD_FREQ; 7174 init_cfs_rq(&rq->cfs); 7175 init_rt_rq(&rq->rt); 7176 init_dl_rq(&rq->dl); 7177#ifdef CONFIG_FAIR_GROUP_SCHED 7178 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 7179 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7180 /* 7181 * How much cpu bandwidth does root_task_group get? 7182 * 7183 * In case of task-groups formed thr' the cgroup filesystem, it 7184 * gets 100% of the cpu resources in the system. This overall 7185 * system cpu resource is divided among the tasks of 7186 * root_task_group and its child task-groups in a fair manner, 7187 * based on each entity's (task or task-group's) weight 7188 * (se->load.weight). 7189 * 7190 * In other words, if root_task_group has 10 tasks of weight 7191 * 1024) and two child groups A0 and A1 (of weight 1024 each), 7192 * then A0's share of the cpu resource is: 7193 * 7194 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 7195 * 7196 * We achieve this by letting root_task_group's tasks sit 7197 * directly in rq->cfs (i.e root_task_group->se[] = NULL). 
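 *
 * (That is 1024 / 12288: each of the twelve equal-weight entities at the
 * root level, the ten tasks plus groups A0 and A1, gets one twelfth of
 * the cpu.)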
7198 */ 7199 init_cfs_bandwidth(&root_task_group.cfs_bandwidth); 7200 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 7201#endif /* CONFIG_FAIR_GROUP_SCHED */ 7202 7203 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 7204#ifdef CONFIG_RT_GROUP_SCHED 7205 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 7206#endif 7207 7208 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7209 rq->cpu_load[j] = 0; 7210 7211 rq->last_load_update_tick = jiffies; 7212 7213#ifdef CONFIG_SMP 7214 rq->sd = NULL; 7215 rq->rd = NULL; 7216 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; 7217 rq->post_schedule = 0; 7218 rq->active_balance = 0; 7219 rq->next_balance = jiffies; 7220 rq->push_cpu = 0; 7221 rq->cpu = i; 7222 rq->online = 0; 7223 rq->idle_stamp = 0; 7224 rq->avg_idle = 2*sysctl_sched_migration_cost; 7225 rq->max_idle_balance_cost = sysctl_sched_migration_cost; 7226 7227 INIT_LIST_HEAD(&rq->cfs_tasks); 7228 7229 rq_attach_root(rq, &def_root_domain); 7230#ifdef CONFIG_NO_HZ_COMMON 7231 rq->nohz_flags = 0; 7232#endif 7233#ifdef CONFIG_NO_HZ_FULL 7234 rq->last_sched_tick = 0; 7235#endif 7236#endif 7237 init_rq_hrtick(rq); 7238 atomic_set(&rq->nr_iowait, 0); 7239 } 7240 7241 set_load_weight(&init_task); 7242 7243#ifdef CONFIG_PREEMPT_NOTIFIERS 7244 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 7245#endif 7246 7247 /* 7248 * The boot idle thread does lazy MMU switching as well: 7249 */ 7250 atomic_inc(&init_mm.mm_count); 7251 enter_lazy_tlb(&init_mm, current); 7252 7253 /* 7254 * During early bootup we pretend to be a normal task: 7255 */ 7256 current->sched_class = &fair_sched_class; 7257 7258 /* 7259 * Make us the idle thread. Technically, schedule() should not be 7260 * called from this thread, however somewhere below it might be, 7261 * but because we are the idle thread, we just pick up running again 7262 * when this runqueue becomes "idle". 7263 */ 7264 init_idle(current, smp_processor_id()); 7265 7266 calc_load_update = jiffies + LOAD_FREQ; 7267 7268#ifdef CONFIG_SMP 7269 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 7270 /* May be allocated at isolcpus cmdline parse time */ 7271 if (cpu_isolated_map == NULL) 7272 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 7273 idle_thread_set_boot_cpu(); 7274 set_cpu_rq_start_time(); 7275#endif 7276 init_sched_fair_class(); 7277 7278 scheduler_running = 1; 7279} 7280 7281#ifdef CONFIG_DEBUG_ATOMIC_SLEEP 7282static inline int preempt_count_equals(int preempt_offset) 7283{ 7284 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 7285 7286 return (nested == preempt_offset); 7287} 7288 7289void __might_sleep(const char *file, int line, int preempt_offset) 7290{ 7291 /* 7292 * Blocking primitives will set (and therefore destroy) current->state, 7293 * since we will exit with TASK_RUNNING make sure we enter with it, 7294 * otherwise we will destroy state. 7295 */ 7296 WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, 7297 "do not call blocking ops when !TASK_RUNNING; " 7298 "state=%lx set at [<%p>] %pS\n", 7299 current->state, 7300 (void *)current->task_state_change, 7301 (void *)current->task_state_change); 7302 7303 ___might_sleep(file, line, preempt_offset); 7304} 7305EXPORT_SYMBOL(__might_sleep); 7306 7307void ___might_sleep(const char *file, int line, int preempt_offset) 7308{ 7309 static unsigned long prev_jiffy; /* ratelimiting */ 7310 7311 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. 
*/ 7312 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && 7313 !is_idle_task(current)) || 7314 system_state != SYSTEM_RUNNING || oops_in_progress) 7315 return; 7316 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 7317 return; 7318 prev_jiffy = jiffies; 7319 7320 printk(KERN_ERR 7321 "BUG: sleeping function called from invalid context at %s:%d\n", 7322 file, line); 7323 printk(KERN_ERR 7324 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 7325 in_atomic(), irqs_disabled(), 7326 current->pid, current->comm); 7327 7328 if (task_stack_end_corrupted(current)) 7329 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); 7330 7331 debug_show_held_locks(current); 7332 if (irqs_disabled()) 7333 print_irqtrace_events(current); 7334#ifdef CONFIG_DEBUG_PREEMPT 7335 if (!preempt_count_equals(preempt_offset)) { 7336 pr_err("Preemption disabled at:"); 7337 print_ip_sym(current->preempt_disable_ip); 7338 pr_cont("\n"); 7339 } 7340#endif 7341 dump_stack(); 7342} 7343EXPORT_SYMBOL(___might_sleep); 7344#endif 7345 7346#ifdef CONFIG_MAGIC_SYSRQ 7347static void normalize_task(struct rq *rq, struct task_struct *p) 7348{ 7349 const struct sched_class *prev_class = p->sched_class; 7350 struct sched_attr attr = { 7351 .sched_policy = SCHED_NORMAL, 7352 }; 7353 int old_prio = p->prio; 7354 int queued; 7355 7356 queued = task_on_rq_queued(p); 7357 if (queued) 7358 dequeue_task(rq, p, 0); 7359 __setscheduler(rq, p, &attr, false); 7360 if (queued) { 7361 enqueue_task(rq, p, 0); 7362 resched_curr(rq); 7363 } 7364 7365 check_class_changed(rq, p, prev_class, old_prio); 7366} 7367 7368void normalize_rt_tasks(void) 7369{ 7370 struct task_struct *g, *p; 7371 unsigned long flags; 7372 struct rq *rq; 7373 7374 read_lock(&tasklist_lock); 7375 for_each_process_thread(g, p) { 7376 /* 7377 * Only normalize user tasks: 7378 */ 7379 if (p->flags & PF_KTHREAD) 7380 continue; 7381 7382 p->se.exec_start = 0; 7383#ifdef CONFIG_SCHEDSTATS 7384 p->se.statistics.wait_start = 0; 7385 p->se.statistics.sleep_start = 0; 7386 p->se.statistics.block_start = 0; 7387#endif 7388 7389 if (!dl_task(p) && !rt_task(p)) { 7390 /* 7391 * Renice negative nice level userspace 7392 * tasks back to 0: 7393 */ 7394 if (task_nice(p) < 0) 7395 set_user_nice(p, 0); 7396 continue; 7397 } 7398 7399 rq = task_rq_lock(p, &flags); 7400 normalize_task(rq, p); 7401 task_rq_unlock(rq, p, &flags); 7402 } 7403 read_unlock(&tasklist_lock); 7404} 7405 7406#endif /* CONFIG_MAGIC_SYSRQ */ 7407 7408#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) 7409/* 7410 * These functions are only useful for the IA64 MCA handling, or kdb. 7411 * 7412 * They can only be called when the whole system has been 7413 * stopped - every CPU needs to be quiescent, and no scheduling 7414 * activity can take place. Using them for anything else would 7415 * be a serious bug, and as a result, they aren't even visible 7416 * under any other configuration. 7417 */ 7418 7419/** 7420 * curr_task - return the current task for a given cpu. 7421 * @cpu: the processor in question. 7422 * 7423 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7424 * 7425 * Return: The current task for @cpu. 7426 */ 7427struct task_struct *curr_task(int cpu) 7428{ 7429 return cpu_curr(cpu); 7430} 7431 7432#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ 7433 7434#ifdef CONFIG_IA64 7435/** 7436 * set_curr_task - set the current task for a given cpu. 7437 * @cpu: the processor in question. 7438 * @p: the task pointer to set. 
7439 * 7440 * Description: This function must only be used when non-maskable interrupts 7441 * are serviced on a separate stack. It allows the architecture to switch the 7442 * notion of the current task on a cpu in a non-blocking manner. This function 7443 * must be called with all CPUs synchronized, and interrupts disabled; the 7444 * caller must save the original value of the current task (see 7445 * curr_task() above) and restore that value before reenabling interrupts and 7446 * re-starting the system. 7447 * 7448 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7449 */ 7450void set_curr_task(int cpu, struct task_struct *p) 7451{ 7452 cpu_curr(cpu) = p; 7453} 7454 7455#endif 7456 7457#ifdef CONFIG_CGROUP_SCHED 7458/* task_group_lock serializes the addition/removal of task groups */ 7459static DEFINE_SPINLOCK(task_group_lock); 7460 7461static void free_sched_group(struct task_group *tg) 7462{ 7463 free_fair_sched_group(tg); 7464 free_rt_sched_group(tg); 7465 autogroup_free(tg); 7466 kfree(tg); 7467} 7468 7469/* allocate runqueue etc for a new task group */ 7470struct task_group *sched_create_group(struct task_group *parent) 7471{ 7472 struct task_group *tg; 7473 7474 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 7475 if (!tg) 7476 return ERR_PTR(-ENOMEM); 7477 7478 if (!alloc_fair_sched_group(tg, parent)) 7479 goto err; 7480 7481 if (!alloc_rt_sched_group(tg, parent)) 7482 goto err; 7483 7484 return tg; 7485 7486err: 7487 free_sched_group(tg); 7488 return ERR_PTR(-ENOMEM); 7489} 7490 7491void sched_online_group(struct task_group *tg, struct task_group *parent) 7492{ 7493 unsigned long flags; 7494 7495 spin_lock_irqsave(&task_group_lock, flags); 7496 list_add_rcu(&tg->list, &task_groups); 7497 7498 WARN_ON(!parent); /* root should already exist */ 7499 7500 tg->parent = parent; 7501 INIT_LIST_HEAD(&tg->children); 7502 list_add_rcu(&tg->siblings, &parent->children); 7503 spin_unlock_irqrestore(&task_group_lock, flags); 7504} 7505 7506/* rcu callback to free various structures associated with a task group */ 7507static void free_sched_group_rcu(struct rcu_head *rhp) 7508{ 7509 /* now it should be safe to free those cfs_rqs */ 7510 free_sched_group(container_of(rhp, struct task_group, rcu)); 7511} 7512 7513/* Destroy runqueue etc associated with a task group */ 7514void sched_destroy_group(struct task_group *tg) 7515{ 7516 /* wait for possible concurrent references to cfs_rqs to complete */ 7517 call_rcu(&tg->rcu, free_sched_group_rcu); 7518} 7519 7520void sched_offline_group(struct task_group *tg) 7521{ 7522 unsigned long flags; 7523 int i; 7524 7525 /* end participation in shares distribution */ 7526 for_each_possible_cpu(i) 7527 unregister_fair_sched_group(tg, i); 7528 7529 spin_lock_irqsave(&task_group_lock, flags); 7530 list_del_rcu(&tg->list); 7531 list_del_rcu(&tg->siblings); 7532 spin_unlock_irqrestore(&task_group_lock, flags); 7533} 7534 7535/* Change a task's runqueue when it moves between groups. 7536 * The caller of this function should have put the task in its new group 7537 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to 7538 * reflect its new group.
7539 */ 7540void sched_move_task(struct task_struct *tsk) 7541{ 7542 struct task_group *tg; 7543 int queued, running; 7544 unsigned long flags; 7545 struct rq *rq; 7546 7547 rq = task_rq_lock(tsk, &flags); 7548 7549 running = task_current(rq, tsk); 7550 queued = task_on_rq_queued(tsk); 7551 7552 if (queued) 7553 dequeue_task(rq, tsk, 0); 7554 if (unlikely(running)) 7555 put_prev_task(rq, tsk); 7556 7557 /* 7558 * All callers are synchronized by task_rq_lock(); we do not use RCU 7559 * which is pointless here. Thus, we pass "true" to task_css_check() 7560 * to prevent lockdep warnings. 7561 */ 7562 tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), 7563 struct task_group, css); 7564 tg = autogroup_task_group(tsk, tg); 7565 tsk->sched_task_group = tg; 7566 7567#ifdef CONFIG_FAIR_GROUP_SCHED 7568 if (tsk->sched_class->task_move_group) 7569 tsk->sched_class->task_move_group(tsk, queued); 7570 else 7571#endif 7572 set_task_rq(tsk, task_cpu(tsk)); 7573 7574 if (unlikely(running)) 7575 tsk->sched_class->set_curr_task(rq); 7576 if (queued) 7577 enqueue_task(rq, tsk, 0); 7578 7579 task_rq_unlock(rq, tsk, &flags); 7580} 7581#endif /* CONFIG_CGROUP_SCHED */ 7582 7583#ifdef CONFIG_RT_GROUP_SCHED 7584/* 7585 * Ensure that the real time constraints are schedulable. 7586 */ 7587static DEFINE_MUTEX(rt_constraints_mutex); 7588 7589/* Must be called with tasklist_lock held */ 7590static inline int tg_has_rt_tasks(struct task_group *tg) 7591{ 7592 struct task_struct *g, *p; 7593 7594 /* 7595 * Autogroups do not have RT tasks; see autogroup_create(). 7596 */ 7597 if (task_group_is_autogroup(tg)) 7598 return 0; 7599 7600 for_each_process_thread(g, p) { 7601 if (rt_task(p) && task_group(p) == tg) 7602 return 1; 7603 } 7604 7605 return 0; 7606} 7607 7608struct rt_schedulable_data { 7609 struct task_group *tg; 7610 u64 rt_period; 7611 u64 rt_runtime; 7612}; 7613 7614static int tg_rt_schedulable(struct task_group *tg, void *data) 7615{ 7616 struct rt_schedulable_data *d = data; 7617 struct task_group *child; 7618 unsigned long total, sum = 0; 7619 u64 period, runtime; 7620 7621 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 7622 runtime = tg->rt_bandwidth.rt_runtime; 7623 7624 if (tg == d->tg) { 7625 period = d->rt_period; 7626 runtime = d->rt_runtime; 7627 } 7628 7629 /* 7630 * Cannot have more runtime than the period. 7631 */ 7632 if (runtime > period && runtime != RUNTIME_INF) 7633 return -EINVAL; 7634 7635 /* 7636 * Ensure we don't starve existing RT tasks. 7637 */ 7638 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 7639 return -EBUSY; 7640 7641 total = to_ratio(period, runtime); 7642 7643 /* 7644 * Nobody can have more than the global setting allows. 7645 */ 7646 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 7647 return -EINVAL; 7648 7649 /* 7650 * The sum of our children's runtime should not exceed our own. 
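 *
 * For example (with made-up numbers): a group granted 50ms of runtime per
 * 100ms period has a total ratio of 0.5; two children granted 30ms/100ms
 * each would sum to 0.6 and be rejected with -EINVAL below.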
7651 */ 7652 list_for_each_entry_rcu(child, &tg->children, siblings) { 7653 period = ktime_to_ns(child->rt_bandwidth.rt_period); 7654 runtime = child->rt_bandwidth.rt_runtime; 7655 7656 if (child == d->tg) { 7657 period = d->rt_period; 7658 runtime = d->rt_runtime; 7659 } 7660 7661 sum += to_ratio(period, runtime); 7662 } 7663 7664 if (sum > total) 7665 return -EINVAL; 7666 7667 return 0; 7668} 7669 7670static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 7671{ 7672 int ret; 7673 7674 struct rt_schedulable_data data = { 7675 .tg = tg, 7676 .rt_period = period, 7677 .rt_runtime = runtime, 7678 }; 7679 7680 rcu_read_lock(); 7681 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); 7682 rcu_read_unlock(); 7683 7684 return ret; 7685} 7686 7687static int tg_set_rt_bandwidth(struct task_group *tg, 7688 u64 rt_period, u64 rt_runtime) 7689{ 7690 int i, err = 0; 7691 7692 /* 7693 * Disallowing the root group RT runtime is BAD, it would disallow the 7694 * kernel creating (and or operating) RT threads. 7695 */ 7696 if (tg == &root_task_group && rt_runtime == 0) 7697 return -EINVAL; 7698 7699 /* No period doesn't make any sense. */ 7700 if (rt_period == 0) 7701 return -EINVAL; 7702 7703 mutex_lock(&rt_constraints_mutex); 7704 read_lock(&tasklist_lock); 7705 err = __rt_schedulable(tg, rt_period, rt_runtime); 7706 if (err) 7707 goto unlock; 7708 7709 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 7710 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 7711 tg->rt_bandwidth.rt_runtime = rt_runtime; 7712 7713 for_each_possible_cpu(i) { 7714 struct rt_rq *rt_rq = tg->rt_rq[i]; 7715 7716 raw_spin_lock(&rt_rq->rt_runtime_lock); 7717 rt_rq->rt_runtime = rt_runtime; 7718 raw_spin_unlock(&rt_rq->rt_runtime_lock); 7719 } 7720 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 7721unlock: 7722 read_unlock(&tasklist_lock); 7723 mutex_unlock(&rt_constraints_mutex); 7724 7725 return err; 7726} 7727 7728static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 7729{ 7730 u64 rt_runtime, rt_period; 7731 7732 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 7733 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 7734 if (rt_runtime_us < 0) 7735 rt_runtime = RUNTIME_INF; 7736 7737 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7738} 7739 7740static long sched_group_rt_runtime(struct task_group *tg) 7741{ 7742 u64 rt_runtime_us; 7743 7744 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 7745 return -1; 7746 7747 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 7748 do_div(rt_runtime_us, NSEC_PER_USEC); 7749 return rt_runtime_us; 7750} 7751 7752static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 7753{ 7754 u64 rt_runtime, rt_period; 7755 7756 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 7757 rt_runtime = tg->rt_bandwidth.rt_runtime; 7758 7759 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7760} 7761 7762static long sched_group_rt_period(struct task_group *tg) 7763{ 7764 u64 rt_period_us; 7765 7766 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 7767 do_div(rt_period_us, NSEC_PER_USEC); 7768 return rt_period_us; 7769} 7770#endif /* CONFIG_RT_GROUP_SCHED */ 7771 7772#ifdef CONFIG_RT_GROUP_SCHED 7773static int sched_rt_global_constraints(void) 7774{ 7775 int ret = 0; 7776 7777 mutex_lock(&rt_constraints_mutex); 7778 read_lock(&tasklist_lock); 7779 ret = __rt_schedulable(NULL, 0, 0); 7780 read_unlock(&tasklist_lock); 7781 mutex_unlock(&rt_constraints_mutex); 7782 7783 return ret; 7784} 7785 7786static int 
sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 7787{ 7788 /* Don't accept realtime tasks when there is no way for them to run */ 7789 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 7790 return 0; 7791 7792 return 1; 7793} 7794 7795#else /* !CONFIG_RT_GROUP_SCHED */ 7796static int sched_rt_global_constraints(void) 7797{ 7798 unsigned long flags; 7799 int i, ret = 0; 7800 7801 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 7802 for_each_possible_cpu(i) { 7803 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 7804 7805 raw_spin_lock(&rt_rq->rt_runtime_lock); 7806 rt_rq->rt_runtime = global_rt_runtime(); 7807 raw_spin_unlock(&rt_rq->rt_runtime_lock); 7808 } 7809 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 7810 7811 return ret; 7812} 7813#endif /* CONFIG_RT_GROUP_SCHED */ 7814 7815static int sched_dl_global_validate(void) 7816{ 7817 u64 runtime = global_rt_runtime(); 7818 u64 period = global_rt_period(); 7819 u64 new_bw = to_ratio(period, runtime); 7820 struct dl_bw *dl_b; 7821 int cpu, ret = 0; 7822 unsigned long flags; 7823 7824 /* 7825 * Here we want to check the bandwidth not being set to some 7826 * value smaller than the currently allocated bandwidth in 7827 * any of the root_domains. 7828 * 7829 * FIXME: Cycling on all the CPUs is overdoing, but simpler than 7830 * cycling on root_domains... Discussion on different/better 7831 * solutions is welcome! 7832 */ 7833 for_each_possible_cpu(cpu) { 7834 rcu_read_lock_sched(); 7835 dl_b = dl_bw_of(cpu); 7836 7837 raw_spin_lock_irqsave(&dl_b->lock, flags); 7838 if (new_bw < dl_b->total_bw) 7839 ret = -EBUSY; 7840 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7841 7842 rcu_read_unlock_sched(); 7843 7844 if (ret) 7845 break; 7846 } 7847 7848 return ret; 7849} 7850 7851static void sched_dl_do_global(void) 7852{ 7853 u64 new_bw = -1; 7854 struct dl_bw *dl_b; 7855 int cpu; 7856 unsigned long flags; 7857 7858 def_dl_bandwidth.dl_period = global_rt_period(); 7859 def_dl_bandwidth.dl_runtime = global_rt_runtime(); 7860 7861 if (global_rt_runtime() != RUNTIME_INF) 7862 new_bw = to_ratio(global_rt_period(), global_rt_runtime()); 7863 7864 /* 7865 * FIXME: As above... 
7866 */ 7867 for_each_possible_cpu(cpu) { 7868 rcu_read_lock_sched(); 7869 dl_b = dl_bw_of(cpu); 7870 7871 raw_spin_lock_irqsave(&dl_b->lock, flags); 7872 dl_b->bw = new_bw; 7873 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7874 7875 rcu_read_unlock_sched(); 7876 } 7877} 7878 7879static int sched_rt_global_validate(void) 7880{ 7881 if (sysctl_sched_rt_period <= 0) 7882 return -EINVAL; 7883 7884 if ((sysctl_sched_rt_runtime != RUNTIME_INF) && 7885 (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) 7886 return -EINVAL; 7887 7888 return 0; 7889} 7890 7891static void sched_rt_do_global(void) 7892{ 7893 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 7894 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); 7895} 7896 7897int sched_rt_handler(struct ctl_table *table, int write, 7898 void __user *buffer, size_t *lenp, 7899 loff_t *ppos) 7900{ 7901 int old_period, old_runtime; 7902 static DEFINE_MUTEX(mutex); 7903 int ret; 7904 7905 mutex_lock(&mutex); 7906 old_period = sysctl_sched_rt_period; 7907 old_runtime = sysctl_sched_rt_runtime; 7908 7909 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7910 7911 if (!ret && write) { 7912 ret = sched_rt_global_validate(); 7913 if (ret) 7914 goto undo; 7915 7916 ret = sched_dl_global_validate(); 7917 if (ret) 7918 goto undo; 7919 7920 ret = sched_rt_global_constraints(); 7921 if (ret) 7922 goto undo; 7923 7924 sched_rt_do_global(); 7925 sched_dl_do_global(); 7926 } 7927 if (0) { 7928undo: 7929 sysctl_sched_rt_period = old_period; 7930 sysctl_sched_rt_runtime = old_runtime; 7931 } 7932 mutex_unlock(&mutex); 7933 7934 return ret; 7935} 7936 7937int sched_rr_handler(struct ctl_table *table, int write, 7938 void __user *buffer, size_t *lenp, 7939 loff_t *ppos) 7940{ 7941 int ret; 7942 static DEFINE_MUTEX(mutex); 7943 7944 mutex_lock(&mutex); 7945 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7946 /* make sure that internally we keep jiffies */ 7947 /* also, writing zero resets timeslice to default */ 7948 if (!ret && write) { 7949 sched_rr_timeslice = sched_rr_timeslice <= 0 ? 7950 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); 7951 } 7952 mutex_unlock(&mutex); 7953 return ret; 7954} 7955 7956#ifdef CONFIG_CGROUP_SCHED 7957 7958static inline struct task_group *css_tg(struct cgroup_subsys_state *css) 7959{ 7960 return css ? 
container_of(css, struct task_group, css) : NULL; 7961} 7962 7963static struct cgroup_subsys_state * 7964cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 7965{ 7966 struct task_group *parent = css_tg(parent_css); 7967 struct task_group *tg; 7968 7969 if (!parent) { 7970 /* This is early initialization for the top cgroup */ 7971 return &root_task_group.css; 7972 } 7973 7974 tg = sched_create_group(parent); 7975 if (IS_ERR(tg)) 7976 return ERR_PTR(-ENOMEM); 7977 7978 return &tg->css; 7979} 7980 7981static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) 7982{ 7983 struct task_group *tg = css_tg(css); 7984 struct task_group *parent = css_tg(css->parent); 7985 7986 if (parent) 7987 sched_online_group(tg, parent); 7988 return 0; 7989} 7990 7991static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) 7992{ 7993 struct task_group *tg = css_tg(css); 7994 7995 sched_destroy_group(tg); 7996} 7997 7998static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) 7999{ 8000 struct task_group *tg = css_tg(css); 8001 8002 sched_offline_group(tg); 8003} 8004 8005static void cpu_cgroup_fork(struct task_struct *task) 8006{ 8007 sched_move_task(task); 8008} 8009 8010static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, 8011 struct cgroup_taskset *tset) 8012{ 8013 struct task_struct *task; 8014 8015 cgroup_taskset_for_each(task, tset) { 8016#ifdef CONFIG_RT_GROUP_SCHED 8017 if (!sched_rt_can_attach(css_tg(css), task)) 8018 return -EINVAL; 8019#else 8020 /* We don't support RT-tasks being in separate groups */ 8021 if (task->sched_class != &fair_sched_class) 8022 return -EINVAL; 8023#endif 8024 } 8025 return 0; 8026} 8027 8028static void cpu_cgroup_attach(struct cgroup_subsys_state *css, 8029 struct cgroup_taskset *tset) 8030{ 8031 struct task_struct *task; 8032 8033 cgroup_taskset_for_each(task, tset) 8034 sched_move_task(task); 8035} 8036 8037static void cpu_cgroup_exit(struct cgroup_subsys_state *css, 8038 struct cgroup_subsys_state *old_css, 8039 struct task_struct *task) 8040{ 8041 /* 8042 * cgroup_exit() is called in the copy_process() failure path. 8043 * Ignore this case since the task hasn't ran yet, this avoids 8044 * trying to poke a half freed task state from generic code. 8045 */ 8046 if (!(task->flags & PF_EXITING)) 8047 return; 8048 8049 sched_move_task(task); 8050} 8051 8052#ifdef CONFIG_FAIR_GROUP_SCHED 8053static int cpu_shares_write_u64(struct cgroup_subsys_state *css, 8054 struct cftype *cftype, u64 shareval) 8055{ 8056 return sched_group_set_shares(css_tg(css), scale_load(shareval)); 8057} 8058 8059static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, 8060 struct cftype *cft) 8061{ 8062 struct task_group *tg = css_tg(css); 8063 8064 return (u64) scale_load_down(tg->shares); 8065} 8066 8067#ifdef CONFIG_CFS_BANDWIDTH 8068static DEFINE_MUTEX(cfs_constraints_mutex); 8069 8070const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ 8071const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ 8072 8073static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); 8074 8075static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) 8076{ 8077 int i, ret = 0, runtime_enabled, runtime_was_enabled; 8078 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 8079 8080 if (tg == &root_task_group) 8081 return -EINVAL; 8082 8083 /* 8084 * Ensure we have at some amount of bandwidth every period. 
This is 8085 * to prevent reaching a state of large arrears when throttled via 8086 * entity_tick() resulting in prolonged exit starvation. 8087 */ 8088 if (quota < min_cfs_quota_period || period < min_cfs_quota_period) 8089 return -EINVAL; 8090 8091 /* 8092 * Likewise, bound things on the otherside by preventing insane quota 8093 * periods. This also allows us to normalize in computing quota 8094 * feasibility. 8095 */ 8096 if (period > max_cfs_quota_period) 8097 return -EINVAL; 8098 8099 /* 8100 * Prevent race between setting of cfs_rq->runtime_enabled and 8101 * unthrottle_offline_cfs_rqs(). 8102 */ 8103 get_online_cpus(); 8104 mutex_lock(&cfs_constraints_mutex); 8105 ret = __cfs_schedulable(tg, period, quota); 8106 if (ret) 8107 goto out_unlock; 8108 8109 runtime_enabled = quota != RUNTIME_INF; 8110 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 8111 /* 8112 * If we need to toggle cfs_bandwidth_used, off->on must occur 8113 * before making related changes, and on->off must occur afterwards 8114 */ 8115 if (runtime_enabled && !runtime_was_enabled) 8116 cfs_bandwidth_usage_inc(); 8117 raw_spin_lock_irq(&cfs_b->lock); 8118 cfs_b->period = ns_to_ktime(period); 8119 cfs_b->quota = quota; 8120 8121 __refill_cfs_bandwidth_runtime(cfs_b); 8122 /* restart the period timer (if active) to handle new period expiry */ 8123 if (runtime_enabled && cfs_b->timer_active) { 8124 /* force a reprogram */ 8125 __start_cfs_bandwidth(cfs_b, true); 8126 } 8127 raw_spin_unlock_irq(&cfs_b->lock); 8128 8129 for_each_online_cpu(i) { 8130 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 8131 struct rq *rq = cfs_rq->rq; 8132 8133 raw_spin_lock_irq(&rq->lock); 8134 cfs_rq->runtime_enabled = runtime_enabled; 8135 cfs_rq->runtime_remaining = 0; 8136 8137 if (cfs_rq->throttled) 8138 unthrottle_cfs_rq(cfs_rq); 8139 raw_spin_unlock_irq(&rq->lock); 8140 } 8141 if (runtime_was_enabled && !runtime_enabled) 8142 cfs_bandwidth_usage_dec(); 8143out_unlock: 8144 mutex_unlock(&cfs_constraints_mutex); 8145 put_online_cpus(); 8146 8147 return ret; 8148} 8149 8150int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) 8151{ 8152 u64 quota, period; 8153 8154 period = ktime_to_ns(tg->cfs_bandwidth.period); 8155 if (cfs_quota_us < 0) 8156 quota = RUNTIME_INF; 8157 else 8158 quota = (u64)cfs_quota_us * NSEC_PER_USEC; 8159 8160 return tg_set_cfs_bandwidth(tg, period, quota); 8161} 8162 8163long tg_get_cfs_quota(struct task_group *tg) 8164{ 8165 u64 quota_us; 8166 8167 if (tg->cfs_bandwidth.quota == RUNTIME_INF) 8168 return -1; 8169 8170 quota_us = tg->cfs_bandwidth.quota; 8171 do_div(quota_us, NSEC_PER_USEC); 8172 8173 return quota_us; 8174} 8175 8176int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) 8177{ 8178 u64 quota, period; 8179 8180 period = (u64)cfs_period_us * NSEC_PER_USEC; 8181 quota = tg->cfs_bandwidth.quota; 8182 8183 return tg_set_cfs_bandwidth(tg, period, quota); 8184} 8185 8186long tg_get_cfs_period(struct task_group *tg) 8187{ 8188 u64 cfs_period_us; 8189 8190 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); 8191 do_div(cfs_period_us, NSEC_PER_USEC); 8192 8193 return cfs_period_us; 8194} 8195 8196static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, 8197 struct cftype *cft) 8198{ 8199 return tg_get_cfs_quota(css_tg(css)); 8200} 8201 8202static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, 8203 struct cftype *cftype, s64 cfs_quota_us) 8204{ 8205 return tg_set_cfs_quota(css_tg(css), cfs_quota_us); 8206} 8207 8208static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state 
*css, 8209 struct cftype *cft) 8210{ 8211 return tg_get_cfs_period(css_tg(css)); 8212} 8213 8214static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, 8215 struct cftype *cftype, u64 cfs_period_us) 8216{ 8217 return tg_set_cfs_period(css_tg(css), cfs_period_us); 8218} 8219 8220struct cfs_schedulable_data { 8221 struct task_group *tg; 8222 u64 period, quota; 8223}; 8224 8225/* 8226 * normalize group quota/period to be quota/max_period 8227 * note: units are usecs 8228 */ 8229static u64 normalize_cfs_quota(struct task_group *tg, 8230 struct cfs_schedulable_data *d) 8231{ 8232 u64 quota, period; 8233 8234 if (tg == d->tg) { 8235 period = d->period; 8236 quota = d->quota; 8237 } else { 8238 period = tg_get_cfs_period(tg); 8239 quota = tg_get_cfs_quota(tg); 8240 } 8241 8242 /* note: these should typically be equivalent */ 8243 if (quota == RUNTIME_INF || quota == -1) 8244 return RUNTIME_INF; 8245 8246 return to_ratio(period, quota); 8247} 8248 8249static int tg_cfs_schedulable_down(struct task_group *tg, void *data) 8250{ 8251 struct cfs_schedulable_data *d = data; 8252 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 8253 s64 quota = 0, parent_quota = -1; 8254 8255 if (!tg->parent) { 8256 quota = RUNTIME_INF; 8257 } else { 8258 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; 8259 8260 quota = normalize_cfs_quota(tg, d); 8261 parent_quota = parent_b->hierarchical_quota; 8262 8263 /* 8264 * ensure max(child_quota) <= parent_quota, inherit when no 8265 * limit is set 8266 */ 8267 if (quota == RUNTIME_INF) 8268 quota = parent_quota; 8269 else if (parent_quota != RUNTIME_INF && quota > parent_quota) 8270 return -EINVAL; 8271 } 8272 cfs_b->hierarchical_quota = quota; 8273 8274 return 0; 8275} 8276 8277static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) 8278{ 8279 int ret; 8280 struct cfs_schedulable_data data = { 8281 .tg = tg, 8282 .period = period, 8283 .quota = quota, 8284 }; 8285 8286 if (quota != RUNTIME_INF) { 8287 do_div(data.period, NSEC_PER_USEC); 8288 do_div(data.quota, NSEC_PER_USEC); 8289 } 8290 8291 rcu_read_lock(); 8292 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); 8293 rcu_read_unlock(); 8294 8295 return ret; 8296} 8297 8298static int cpu_stats_show(struct seq_file *sf, void *v) 8299{ 8300 struct task_group *tg = css_tg(seq_css(sf)); 8301 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 8302 8303 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods); 8304 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); 8305 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); 8306 8307 return 0; 8308} 8309#endif /* CONFIG_CFS_BANDWIDTH */ 8310#endif /* CONFIG_FAIR_GROUP_SCHED */ 8311 8312#ifdef CONFIG_RT_GROUP_SCHED 8313static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, 8314 struct cftype *cft, s64 val) 8315{ 8316 return sched_group_set_rt_runtime(css_tg(css), val); 8317} 8318 8319static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css, 8320 struct cftype *cft) 8321{ 8322 return sched_group_rt_runtime(css_tg(css)); 8323} 8324 8325static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css, 8326 struct cftype *cftype, u64 rt_period_us) 8327{ 8328 return sched_group_set_rt_period(css_tg(css), rt_period_us); 8329} 8330 8331static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, 8332 struct cftype *cft) 8333{ 8334 return sched_group_rt_period(css_tg(css)); 8335} 8336#endif /* CONFIG_RT_GROUP_SCHED */ 8337 8338static struct cftype cpu_files[] = { 8339#ifdef CONFIG_FAIR_GROUP_SCHED 
8340 { 8341 .name = "shares", 8342 .read_u64 = cpu_shares_read_u64, 8343 .write_u64 = cpu_shares_write_u64, 8344 }, 8345#endif 8346#ifdef CONFIG_CFS_BANDWIDTH 8347 { 8348 .name = "cfs_quota_us", 8349 .read_s64 = cpu_cfs_quota_read_s64, 8350 .write_s64 = cpu_cfs_quota_write_s64, 8351 }, 8352 { 8353 .name = "cfs_period_us", 8354 .read_u64 = cpu_cfs_period_read_u64, 8355 .write_u64 = cpu_cfs_period_write_u64, 8356 }, 8357 { 8358 .name = "stat", 8359 .seq_show = cpu_stats_show, 8360 }, 8361#endif 8362#ifdef CONFIG_RT_GROUP_SCHED 8363 { 8364 .name = "rt_runtime_us", 8365 .read_s64 = cpu_rt_runtime_read, 8366 .write_s64 = cpu_rt_runtime_write, 8367 }, 8368 { 8369 .name = "rt_period_us", 8370 .read_u64 = cpu_rt_period_read_uint, 8371 .write_u64 = cpu_rt_period_write_uint, 8372 }, 8373#endif 8374 { } /* terminate */ 8375}; 8376 8377struct cgroup_subsys cpu_cgrp_subsys = { 8378 .css_alloc = cpu_cgroup_css_alloc, 8379 .css_free = cpu_cgroup_css_free, 8380 .css_online = cpu_cgroup_css_online, 8381 .css_offline = cpu_cgroup_css_offline, 8382 .fork = cpu_cgroup_fork, 8383 .can_attach = cpu_cgroup_can_attach, 8384 .attach = cpu_cgroup_attach, 8385 .exit = cpu_cgroup_exit, 8386 .legacy_cftypes = cpu_files, 8387 .early_init = 1, 8388}; 8389 8390#endif /* CONFIG_CGROUP_SCHED */ 8391 8392void dump_cpu_task(int cpu) 8393{ 8394 pr_info("Task dump for CPU %d:\n", cpu); 8395 sched_show_task(cpu_curr(cpu)); 8396} 8397
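/*
 * Depending on which of the options above are configured in, the cftype
 * entries show up as cpu.shares, cpu.cfs_quota_us, cpu.cfs_period_us,
 * cpu.stat, cpu.rt_runtime_us and cpu.rt_period_us in each cpu-controller
 * cgroup directory; e.g. a quota of 50000us against a 100000us period caps
 * a group at half a cpu, and writing -1 to cpu.cfs_quota_us removes the
 * limit again (see tg_set_cfs_quota() above).
 */

/*
 * dump_cpu_task() is used by, for example, the RCU CPU stall detector to
 * show what a suspect CPU is running. A minimal, purely illustrative
 * caller (report_stuck_cpu() is a hypothetical helper, not part of this
 * file):
 *
 *	static void report_stuck_cpu(int cpu)
 *	{
 *		pr_warn("CPU %d appears to be stuck\n", cpu);
 *		dump_cpu_task(cpu);
 *	}
 */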