#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/pm.h>
#include <linux/tick.h>
#include <linux/random.h>
#include <linux/user-return-notifier.h>
#include <linux/dmi.h>
#include <linux/utsname.h>
#include <linux/stackprotector.h>
#include <linux/cpuidle.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/mwait.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
#include <asm/tlbflush.h>

/*
 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
 * no more per-task TSS's. The TSS size is kept cacheline-aligned
 * so they are allowed to end up in the .data..cacheline_aligned
 * section. Since TSS's are completely CPU-local, we want them
 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
 */
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
	.x86_tss = {
		.sp0 = TOP_OF_INIT_STACK,
#ifdef CONFIG_X86_32
		.ss0 = __KERNEL_DS,
		.ss1 = __KERNEL_CS,
		.io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
#endif
	},
#ifdef CONFIG_X86_32
	/*
	 * Note that the .io_bitmap member must be extra-big. This is because
	 * the CPU will access an additional byte beyond the end of the IO
	 * permission bitmap. The extra byte must be all 1 bits, and must
	 * be within the limit.
	 */
	.io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
#endif
};
EXPORT_PER_CPU_SYMBOL(cpu_tss);

#ifdef CONFIG_X86_64
static DEFINE_PER_CPU(unsigned char, is_idle);
static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);
#endif
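
/*
 * Illustrative sketch (editorial note, not part of the original file): on
 * x86-64 the idle notifier chain above lets other kernel code observe idle
 * entry (IDLE_START) and exit (IDLE_END). The names my_idle_notify and
 * my_idle_nb below are made up for illustration:
 *
 *	static int my_idle_notify(struct notifier_block *nb,
 *				  unsigned long action, void *unused)
 *	{
 *		pr_debug("cpu %d %s idle\n", smp_processor_id(),
 *			 action == IDLE_START ? "entering" : "leaving");
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call = my_idle_notify,
 *	};
 *
 *	idle_notifier_register(&my_idle_nb);
 *	...
 *	idle_notifier_unregister(&my_idle_nb);
 */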

struct kmem_cache *task_xstate_cachep;
EXPORT_SYMBOL_GPL(task_xstate_cachep);

/*
 * this gets called so that we can store lazy state into memory and copy the
 * current task into the new thread.
 */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
	*dst = *src;

	dst->thread.fpu_counter = 0;
	dst->thread.fpu.has_fpu = 0;
	dst->thread.fpu.state = NULL;
	task_disable_lazy_fpu_restore(dst);
	if (tsk_used_math(src)) {
		int err = fpu_alloc(&dst->thread.fpu);
		if (err)
			return err;
		fpu_copy(dst, src);
	}
	return 0;
}

void free_thread_xstate(struct task_struct *tsk)
{
	fpu_free(&tsk->thread.fpu);
}

void arch_release_task_struct(struct task_struct *tsk)
{
	free_thread_xstate(tsk);
}

void arch_task_cache_init(void)
{
	task_xstate_cachep =
		kmem_cache_create("task_xstate", xstate_size,
				  __alignof__(union thread_xstate),
				  SLAB_PANIC | SLAB_NOTRACK, NULL);
	setup_xstate_comp();
}

/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;
	unsigned long *bp = t->io_bitmap_ptr;

	if (bp) {
		struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());

		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
		kfree(bp);
	}

	drop_fpu(me);
}

void flush_thread(void)
{
	struct task_struct *tsk = current;

	flush_ptrace_hw_breakpoint(tsk);
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));

	if (!use_eager_fpu()) {
		/* FPU state will be reallocated lazily at the first use. */
		drop_fpu(tsk);
		free_thread_xstate(tsk);
	} else {
		if (!tsk_used_math(tsk)) {
			/* kthread execs. TODO: cleanup this horror. */
			if (WARN_ON(init_fpu(tsk)))
				force_sig(SIGKILL, tsk);
			user_fpu_begin();
		}
		restore_init_xstate();
	}
}

static void hard_disable_TSC(void)
{
	cr4_set_bits(X86_CR4_TSD);
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_disable_TSC();
	preempt_enable();
}

static void hard_enable_TSC(void)
{
	cr4_clear_bits(X86_CR4_TSD);
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		hard_enable_TSC();
	preempt_enable();
}

int get_tsc_mode(unsigned long adr)
{
	unsigned int val;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;
	else
		val = PR_TSC_ENABLE;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}
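
/*
 * Illustrative sketch (editorial note, not part of the original file):
 * get_tsc_mode()/set_tsc_mode() back the PR_GET_TSC/PR_SET_TSC prctl(2)
 * operations, which let a userspace thread make RDTSC fault instead of
 * returning the counter:
 *
 *	#include <sys/prctl.h>
 *
 *	int mode;
 *	prctl(PR_GET_TSC, &mode);		(reads PR_TSC_ENABLE or PR_TSC_SIGSEGV)
 *	prctl(PR_SET_TSC, PR_TSC_SIGSEGV);	(subsequent RDTSC in this thread raises SIGSEGV)
 */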

void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
		      struct tss_struct *tss)
{
	struct thread_struct *prev, *next;

	prev = &prev_p->thread;
	next = &next_p->thread;

	if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
	    test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
		unsigned long debugctl = get_debugctlmsr();

		debugctl &= ~DEBUGCTLMSR_BTF;
		if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
			debugctl |= DEBUGCTLMSR_BTF;

		update_debugctlmsr(debugctl);
	}

	if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
	    test_tsk_thread_flag(next_p, TIF_NOTSC)) {
		/* prev and next are different */
		if (test_tsk_thread_flag(next_p, TIF_NOTSC))
			hard_disable_TSC();
		else
			hard_enable_TSC();
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
	propagate_user_return_notify(prev_p, next_p);
}

/*
 * Idle related variables and functions
 */
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);

static void (*x86_idle)(void);

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

#ifdef CONFIG_X86_64
void enter_idle(void)
{
	this_cpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}
#endif

void arch_cpu_idle_enter(void)
{
	local_touch_nmi();
	enter_idle();
}

void arch_cpu_idle_exit(void)
{
	__exit_idle();
}

void arch_cpu_idle_dead(void)
{
	play_dead();
}

/*
 * Called from the generic idle code.
 */
void arch_cpu_idle(void)
{
	x86_idle();
}

/*
 * We use this if we don't have any better idle routine..
 */
void default_idle(void)
{
	trace_cpu_idle_rcuidle(1, smp_processor_id());
	safe_halt();
	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}
#ifdef CONFIG_APM_MODULE
EXPORT_SYMBOL(default_idle);
#endif

#ifdef CONFIG_XEN
bool xen_set_default_idle(void)
{
	bool ret = !!x86_idle;

	x86_idle = default_idle;

	return ret;
}
#endif

void stop_this_cpu(void *dummy)
{
	local_irq_disable();
	/*
	 * Remove this CPU:
	 */
	set_cpu_online(smp_processor_id(), false);
	disable_local_APIC();

	for (;;)
		halt();
}

bool amd_e400_c1e_detected;
EXPORT_SYMBOL(amd_e400_c1e_detected);

static cpumask_var_t amd_e400_c1e_mask;

void amd_e400_remove_cpu(int cpu)
{
	if (amd_e400_c1e_mask != NULL)
		cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
}

/*
 * AMD Erratum 400 aware idle routine. We check for C1E active in the
 * interrupt pending message MSR. If we detect C1E, then we handle it the
 * same way as C3 power states (local apic timer and TSC stop).
 */
static void amd_e400_idle(void)
{
	if (!amd_e400_c1e_detected) {
		u32 lo, hi;

		rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);

		if (lo & K8_INTP_C1E_ACTIVE_MASK) {
			amd_e400_c1e_detected = true;
			if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
				mark_tsc_unstable("TSC halt in AMD C1E");
			pr_info("System has AMD C1E enabled\n");
		}
	}

	if (amd_e400_c1e_detected) {
		int cpu = smp_processor_id();

		if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
			cpumask_set_cpu(cpu, amd_e400_c1e_mask);
			/* Force broadcast so ACPI can not interfere. */
			tick_broadcast_force();
			pr_info("Switch to broadcast mode on CPU%d\n", cpu);
		}
		tick_broadcast_enter();

		default_idle();

		/*
		 * The switch back from broadcast mode needs to be
		 * called with interrupts disabled.
		 */
		local_irq_disable();
		tick_broadcast_exit();
		local_irq_enable();
	} else
		default_idle();
}

/*
 * Intel Core2 and older machines prefer MWAIT over HALT for C1.
 * We can't rely on cpuidle installing MWAIT, because it will not load
 * on systems that support only C1 -- so the boot default must be MWAIT.
 *
 * Some AMD machines are the opposite, they depend on using HALT.
 *
 * So for default C1, which is used during boot until cpuidle loads,
 * use MWAIT-C1 on Intel HW that has it, else use HALT.
 */
static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
{
	if (c->x86_vendor != X86_VENDOR_INTEL)
		return 0;

	if (!cpu_has(c, X86_FEATURE_MWAIT))
		return 0;

	return 1;
}

/*
 * MONITOR/MWAIT with no hints, used for the default C1 state.
 * This invokes MWAIT with interrupts enabled and no flags,
 * which is backwards compatible with the original MWAIT implementation.
 */
static void mwait_idle(void)
{
	if (!current_set_polling_and_test()) {
		trace_cpu_idle_rcuidle(1, smp_processor_id());
		if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
			smp_mb(); /* quirk */
			clflush((void *)&current_thread_info()->flags);
			smp_mb(); /* quirk */
		}

		__monitor((void *)&current_thread_info()->flags, 0, 0);
		if (!need_resched())
			__sti_mwait(0, 0);
		else
			local_irq_enable();
		trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
	} else {
		local_irq_enable();
	}
	__current_clr_polling();
}

void select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
	if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
		pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
#endif
	if (x86_idle || boot_option_idle_override == IDLE_POLL)
		return;

	if (cpu_has_bug(c, X86_BUG_AMD_APIC_C1E)) {
		/* E400: APIC timer interrupt does not wake up CPU from C1e */
		pr_info("using AMD E400 aware idle routine\n");
		x86_idle = amd_e400_idle;
	} else if (prefer_mwait_c1_over_halt(c)) {
		pr_info("using mwait in idle threads\n");
		x86_idle = mwait_idle;
	} else
		x86_idle = default_idle;
}

void __init init_amd_e400_c1e_mask(void)
{
	/* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
	if (x86_idle == amd_e400_idle)
		zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
}

static int __init idle_setup(char *str)
{
	if (!str)
		return -EINVAL;

	if (!strcmp(str, "poll")) {
		pr_info("using polling idle threads\n");
		boot_option_idle_override = IDLE_POLL;
		cpu_idle_poll_ctrl(true);
	} else if (!strcmp(str, "halt")) {
		/*
		 * With idle=halt, HLT is forced for CPU idle and the deeper
		 * C2/C3 states are not used; the CPU idle driver can still
		 * be loaded.
		 */
		x86_idle = default_idle;
		boot_option_idle_override = IDLE_HALT;
	} else if (!strcmp(str, "nomwait")) {
		/*
		 * With idle=nomwait, MWAIT is not used for the CPU C2/C3
		 * states; the CPU idle driver can still be loaded.
		 */
		boot_option_idle_override = IDLE_NOMWAIT;
	} else
		return -1;

	return 0;
}
early_param("idle", idle_setup);
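
/*
 * Illustrative usage (editorial note, not part of the original file):
 * idle_setup() parses the "idle=" kernel command line option, e.g.:
 *
 *	idle=poll	spin instead of halting; lowest wakeup latency, highest power
 *	idle=halt	always use HLT for C1; deeper C-states (C2/C3) are not used
 *	idle=nomwait	do not use MWAIT for C-states; otherwise keep the default policy
 */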

unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	unsigned long range_end = mm->brk + 0x02000000;
	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}
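
/*
 * Editorial note (not part of the original file): arch_align_stack() jitters
 * the starting user stack pointer down by up to 8 KiB (get_random_int() % 8192)
 * and then 16-byte aligns it, e.g. 0x7fffffffe000 - 5000 = 0x7fffffffcc78,
 * masked to 0x7fffffffcc70. arch_randomize_brk() picks a page-aligned heap
 * start within the 32 MiB (0x02000000 byte) window above the unrandomized brk,
 * falling back to mm->brk when randomize_range() returns 0.
 */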