/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>

#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>
#include <asm/xen/hypervisor.h>

asmlinkage extern void ret_from_fork(void);

__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);

/*
 * Prints also some state that isn't saved in the pt_regs.
 *
 * @regs: saved register frame whose general-purpose registers are printed.
 * @all:  when zero, only the registers held in @regs are printed; when
 *        nonzero, the FS/GS bases and selectors, CS/DS/ES, CR0/CR2/CR3/CR4
 *        and (if not in their default state) the debug registers are
 *        printed as well.
 *
 * Everything that does not come from @regs is read live from the CPU this
 * function is executing on.
 */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip);
	printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	/*
	 * Note: the selector and MSR reads below are performed even when
	 * !all, although their results are only printed in the all case.
	 */
	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = __read_cr4();

	printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);

	/* Only print out debug registers if they are in their non-default state. */
	if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
	    (d6 == DR6_RESERVED) && (d7 == 0x400))
		return;

	printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);

}

/*
 * Sanity check on a dead task: if it still has an mm whose context carries
 * an LDT, warn with the task name, LDT pointer and LDT size, then BUG().
 * Nothing is released here.
 */
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.ldt) {
			pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
				dead_task->comm,
				dead_task->mm->context.ldt,
				dead_task->mm->context.ldt->size);
			BUG();
		}
	}
}

/*
 * Fill slot @tls of @t's thread.tls_array with a descriptor built from a
 * user_desc that has base @addr, limit 0xfffff, and the seg_32bit,
 * limit_in_pages and useable bits set.
 */
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

/* Return the base address stored in slot @tls of @t's thread.tls_array. */
static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * Set up the architecture-specific thread state of a newly created task @p.
 *
 * @clone_flags: clone flags; only CLONE_SETTLS is examined here.
 * @sp:          for a kernel thread (PF_KTHREAD) this is stored in the
 *               child's bx as the function; otherwise, if nonzero, it
 *               becomes the child's user stack pointer.
 * @arg:         for a kernel thread, stored in the child's bp; unused
 *               otherwise.
 * @p:           the child task being set up.
 *
 * Returns 0 on success, -ENOMEM if the I/O bitmap cannot be duplicated, or
 * the error returned by do_set_thread_area()/do_arch_prctl() when installing
 * the new TLS fails.
 */
int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long arg, struct task_struct *p)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	/* sp0 is the very top of the child's stack area. */
	p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
	childregs = task_pt_regs(p);
	/* The child's saved kernel stack pointer starts at its pt_regs. */
	p->thread.sp = (unsigned long) childregs;
	set_tsk_thread_flag(p, TIF_FORK);
	p->thread.io_bitmap_ptr = NULL;

	/*
	 * Snapshot the live segment selectors.  The parent's recorded
	 * gs/fs base is inherited only when the corresponding selector is
	 * zero; otherwise the child's recorded base is 0.
	 */
	savesegment(gs, p->thread.gsindex);
	p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
	savesegment(fs, p->thread.fsindex);
	p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(p->flags & PF_KTHREAD)) {
		/* kernel thread */
		memset(childregs, 0, sizeof(struct pt_regs));
		childregs->sp = (unsigned long)childregs;
		childregs->ss = __KERNEL_DS;
		childregs->bx = sp; /* function */
		childregs->bp = arg;
		childregs->orig_ax = -1;
		childregs->cs = __KERNEL_CS | get_kernel_rpl();
		childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
		return 0;
	}
	/* User task: start from a copy of the parent's register frame. */
	*childregs = *current_pt_regs();

	/* NOTE(review): presumably so the child sees a 0 return value. */
	childregs->ax = 0;
	if (sp)
		childregs->sp = sp;

	err = -ENOMEM;
	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		/* Give the child its own copy of the parent's I/O bitmap. */
		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
						  IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		/* ia32 tasks pass a user_desc pointer in si. */
		if (is_ia32_task())
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			/* Otherwise r8 holds the new FS base. */
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	/* On failure, drop the I/O bitmap copy made above (if any). */
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}

/*
 * Common part of starting a new user program in the current task: clear
 * FS and GS, load ES and DS with @_ds, and rewrite the saved user frame
 * @regs so that it resumes at @new_ip with stack @new_sp, code segment
 * @_cs, stack segment @_ss and flags equal to X86_EFLAGS_IF, then call
 * force_iret().
 */
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);
	regs->ip		= new_ip;
	regs->sp		= new_sp;
	regs->cs		= _cs;
	regs->ss		= _ss;
	regs->flags		= X86_EFLAGS_IF;
	force_iret();
}

/*
 * Start a 64-bit user program: __USER_CS / __USER_DS for CS / SS and a
 * null selector for DS and ES.
 */
void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}

#ifdef CONFIG_IA32_EMULATION
/*
 * Start a compat user program.  CS is __USER_CS when TIF_X32 is set on the
 * current thread and __USER32_CS otherwise; SS, DS and ES are __USER_DS.
 */
void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    test_thread_flag(TIF_X32)
			    ? __USER_CS : __USER32_CS,
			    __USER_DS, __USER_DS);
}
#endif

/*
 *	switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 * Function graph tracer not supported either.
 *
 * @prev_p: task being switched away from; its segment selectors and
 *          preempt count are saved into its thread/thread_info here.
 * @next_p: task being switched to.
 *
 * Returns @prev_p.
 *
 * The order of the steps below matters (see the individual comments);
 * do not reorder them without understanding the dependencies.
 */
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
	unsigned fsindex, gsindex;
	fpu_switch_t fpu;

	fpu = switch_fpu_prepare(prev_p, next_p, cpu);

	/* We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	/*
	 * Load TLS before restoring any segments so that segment loads
	 * reference the correct GDT entries.
	 */
	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.  This
	 * must be done after loading TLS entries in the GDT but before
	 * loading segments that might reference them, and it must
	 * be done before math_state_restore, so the TS bit is up to
	 * date.
	 */
	arch_end_context_switch(next_p);

	/* Switch DS and ES.
	 *
	 * Reading them only returns the selectors, but writing them (if
	 * nonzero) loads the full descriptor from the GDT or LDT.  The
	 * LDT for next is loaded in switch_mm, and the GDT is loaded
	 * above.
	 *
	 * We therefore need to write new values to the segment
	 * registers on every context switch unless both the new and old
	 * values are zero.
	 *
	 * Note that we don't need to do anything for CS and SS, as
	 * those are saved and restored as part of pt_regs.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/*
	 * Switch FS and GS.
	 *
	 * These are even more complicated than DS and ES: they have
	 * 64-bit bases that are controlled by arch_prctl.  Those bases
	 * only differ from the values in the GDT or LDT if the selector
	 * is 0.
	 *
	 * Loading the segment register resets the hidden base part of
	 * the register to 0 or the value from the GDT / LDT.  If the
	 * next base address is zero, writing 0 to the segment register is
	 * much faster than using wrmsr to explicitly zero the base.
	 *
	 * The thread_struct.fs and thread_struct.gs values are 0
	 * if the fs and gs bases respectively are not overridden
	 * from the values implied by fsindex and gsindex.  They
	 * are nonzero, and store the nonzero base addresses, if
	 * the bases are overridden.
	 *
	 * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should
	 * be impossible.
	 *
	 * Therefore we need to reload the segment registers if either
	 * the old or new selector is nonzero, and we need to override
	 * the base address if next thread expects it to be overridden.
	 *
	 * This code is unnecessarily slow in the case where the old and
	 * new indexes are zero and the new base is nonzero -- it will
	 * unnecessarily write 0 to the selector before writing the new
	 * base address.
	 *
	 * Note: This all depends on arch_prctl being the only way that
	 * user code can override the segment base.  Once wrfsbase and
	 * wrgsbase are enabled, most of this code will need to change.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);

		/*
		 * If user code wrote a nonzero value to FS, then it also
		 * cleared the overridden base address.
		 *
		 * XXX: if user code wrote 0 to FS and cleared the base
		 * address itself, we won't notice and we'll incorrectly
		 * restore the prior base address next time we reschedule
		 * the process.
		 */
		if (fsindex)
			prev->fs = 0;
	}
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);

		/* This works (and fails) the same way as fsindex above. */
		if (gsindex)
			prev->gs = 0;
	}
	/* The base for next's GS is written to MSR_KERNEL_GS_BASE. */
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	switch_fpu_finish(next_p, fpu);

	/*
	 * Publish next_p as this CPU's current task.
	 */
	this_cpu_write(current_task, next_p);

	/*
	 * If it were not for PREEMPT_ACTIVE we could guarantee that the
	 * preempt_count of all tasks was equal here and this would not be
	 * needed.
	 */
	task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
	this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);

	/* Reload esp0 and ss1.  This changes current_thread_info(). */
	load_sp0(tss, next);

	/* Per-CPU kernel_stack is set to the top of next's stack area. */
	this_cpu_write(kernel_stack,
		(unsigned long)task_stack_page(next_p) + THREAD_SIZE);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

#ifdef CONFIG_XEN
	/*
	 * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
	 * current_pt_regs()->flags may not match the current task's
	 * intended IOPL.  We need to switch it manually.
	 */
	if (unlikely(xen_pv_domain() &&
		     prev->iopl != next->iopl))
		xen_set_iopl_mask(next->iopl);
#endif

	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
		/*
		 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
		 * does not update the cached descriptor.  As a result, if we
		 * do SYSRET while SS is NULL, we'll end up in user mode with
		 * SS apparently equal to __USER_DS but actually unusable.
		 *
		 * The straightforward workaround would be to fix it up just
		 * before SYSRET, but that would slow down the system call
		 * fast paths.  Instead, we ensure that SS is never NULL in
		 * system call context.  We do this by replacing NULL SS
		 * selectors at every context switch.  SYSCALL sets up a valid
		 * SS, so the only way to get NULL is to re-enter the kernel
		 * from CPL 3 through an interrupt.  Since that can't happen
		 * in the same task as a running syscall, we are guaranteed to
		 * context switch between every interrupt vector entry and a
		 * subsequent SYSRET.
		 *
		 * We read SS first because SS reads are much faster than
		 * writes.  Out of caution, we force SS to __KERNEL_DS even if
		 * it previously had a different non-NULL value.
		 */
		unsigned short ss_sel;
		savesegment(ss, ss_sel);
		if (ss_sel != __KERNEL_DS)
			loadsegment(ss, __KERNEL_DS);
	}

	return prev_p;
}

/*
 * Mark the current task as a 64-bit task: clear TIF_IA32, TIF_ADDR32 and
 * TIF_X32, clear the mm's ia32_compat marker (if there is an mm), and clear
 * READ_IMPLIES_EXEC from the personality.
 */
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);
	clear_thread_flag(TIF_ADDR32);
	clear_thread_flag(TIF_X32);

	/* Ensure the corresponding mm is not marked. */
	if (current->mm)
		current->mm->context.ia32_compat = 0;

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

/*
 * Mark the current task as a 32-bit-address task.
 *
 * @x32: true selects the x32 flavour (TIF_X32 set, TIF_IA32 cleared,
 *       TS_COMPAT cleared, READ_IMPLIES_EXEC cleared); false selects ia32
 *       (TIF_IA32 set, TIF_X32 cleared, TS_COMPAT set, force_personality32
 *       OR-ed into the personality).
 *
 * In both cases TIF_ADDR32 is set and, if the task has an mm, its
 * context.ia32_compat is set to the matching TIF_* value.
 */
void set_personality_ia32(bool x32)
{
	/* inherit personality from parent */

	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_ADDR32);

	/* Mark the associated mm as containing 32-bit tasks. */
	if (x32) {
		clear_thread_flag(TIF_IA32);
		set_thread_flag(TIF_X32);
		if (current->mm)
			current->mm->context.ia32_compat = TIF_X32;
		current->personality &= ~READ_IMPLIES_EXEC;
		/* is_compat_task() uses the presence of the x32
		   syscall bit flag to determine compat status */
		current_thread_info()->status &= ~TS_COMPAT;
	} else {
		set_thread_flag(TIF_IA32);
		clear_thread_flag(TIF_X32);
		if (current->mm)
			current->mm->context.ia32_compat = TIF_IA32;
		current->personality |= force_personality32;
		/* Prepare the first "return" to user space */
		current_thread_info()->status |= TS_COMPAT;
	}
}
EXPORT_SYMBOL_GPL(set_personality_ia32);

/*
 * Called from fs/proc with a reference on @p to find the function
 * which called into schedule(). This needs to be done carefully
 * because the task might wake up and we might look at a stack
 * changing under us.
 *
 * Returns the first return address found on @p's frame-pointer chain that
 * is not inside the scheduler functions, or 0 if @p is NULL, is current,
 * is running, has no stack page, a pointer on the chain falls outside the
 * valid stack range, or no such address is found within the walk limit.
 */
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long start, bottom, top, sp, fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;

	start = (unsigned long)task_stack_page(p);
	if (!start)
		return 0;

	/*
	 * Layout of the stack page:
	 *
	 * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long)
	 * PADDING
	 * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
	 * stack
	 * ----------- bottom = start + sizeof(thread_info)
	 * thread_info
	 * ----------- start
	 *
	 * The tasks stack pointer points at the location where the
	 * framepointer is stored. The data on the stack is:
	 * ... IP FP ... IP FP
	 *
	 * We need to read FP and IP, so we need to adjust the upper
	 * bound by another unsigned long.
	 */
	top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
	top -= 2 * sizeof(unsigned long);
	bottom = start + sizeof(struct thread_info);

	sp = READ_ONCE(p->thread.sp);
	if (sp < bottom || sp > top)
		return 0;

	/* The saved frame pointer lives at the saved stack pointer. */
	fp = READ_ONCE(*(unsigned long *)sp);
	do {
		if (fp < bottom || fp > top)
			return 0;
		/* The return address sits one word above the frame pointer. */
		ip = READ_ONCE(*(unsigned long *)(fp + sizeof(unsigned long)));
		if (!in_sched_functions(ip))
			return ip;
		fp = READ_ONCE(*(unsigned long *)fp);
	} while (count++ < 16 && p->state != TASK_RUNNING);
	return 0;
}

/*
 * Get or set the FS/GS base of @task.
 *
 * @task: target task; the hardware state is only touched when it is
 *        current (doit).
 * @code: ARCH_SET_GS, ARCH_SET_FS, ARCH_GET_FS or ARCH_GET_GS.
 * @addr: for the SET codes, the new base address; for the GET codes, a
 *        user pointer to an unsigned long that receives the base.
 *
 * Returns 0 on success, -EPERM if a SET address is at or above
 * TASK_SIZE_OF(task), -EINVAL for an unknown @code, or the result of
 * wrmsrl_safe()/put_user() when those fail.
 */
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = wrmsrl_safe(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		/*
		 * Base comes from the TLS descriptor when the FS TLS
		 * selector is in use, from the live MSR for the current
		 * task, and from the saved thread state otherwise.
		 */
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			/*
			 * For the current task, read the MSR only when the
			 * live GS selector is nonzero; with a zero selector
			 * the saved thread.gs value is used.
			 */
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/* arch_prctl entry point: applies do_arch_prctl() to the current task. */
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

/* Return the stack pointer saved in @task's pt_regs. */
unsigned long KSTK_ESP(struct task_struct *task)
{
	return task_pt_regs(task)->sp;
}