arch/x86/kernel/process_64.c

DEFINITIONS

This source file includes the following definitions.
  1. __show_regs
  2. release_thread
  3. save_base_legacy
  4. save_fsgs
  5. save_fsgs_for_kvm
  6. loadseg
  7. load_seg_legacy
  8. x86_fsgsbase_load
  9. x86_fsgsbase_read_task
  10. x86_fsbase_read_task
  11. x86_gsbase_read_task
  12. x86_fsbase_write_task
  13. x86_gsbase_write_task
  14. copy_thread_tls
  15. start_thread_common
  16. start_thread
  17. compat_start_thread
  18. __switch_to
  19. set_personality_64bit
  20. __set_personality_x32
  21. __set_personality_ia32
  22. set_personality_ia32
  23. prctl_map_vdso
  24. do_arch_prctl_64
  25. SYSCALL_DEFINE2
  26. COMPAT_SYSCALL_DEFINE2
  27. KSTK_ESP

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  *  Copyright (C) 1995  Linus Torvalds
   4  *
   5  *  Pentium III FXSR, SSE support
   6  *      Gareth Hughes <gareth@valinux.com>, May 2000
   7  *
   8  *  X86-64 port
   9  *      Andi Kleen.
  10  *
  11  *      CPU hotplug support - ashok.raj@intel.com
  12  */
  13 
  14 /*
  15  * This file handles the architecture-dependent parts of process handling.
  16  */
  17 
  18 #include <linux/cpu.h>
  19 #include <linux/errno.h>
  20 #include <linux/sched.h>
  21 #include <linux/sched/task.h>
  22 #include <linux/sched/task_stack.h>
  23 #include <linux/fs.h>
  24 #include <linux/kernel.h>
  25 #include <linux/mm.h>
  26 #include <linux/elfcore.h>
  27 #include <linux/smp.h>
  28 #include <linux/slab.h>
  29 #include <linux/user.h>
  30 #include <linux/interrupt.h>
  31 #include <linux/delay.h>
  32 #include <linux/export.h>
  33 #include <linux/ptrace.h>
  34 #include <linux/notifier.h>
  35 #include <linux/kprobes.h>
  36 #include <linux/kdebug.h>
  37 #include <linux/prctl.h>
  38 #include <linux/uaccess.h>
  39 #include <linux/io.h>
  40 #include <linux/ftrace.h>
  41 #include <linux/syscalls.h>
  42 
  43 #include <asm/pgtable.h>
  44 #include <asm/processor.h>
  45 #include <asm/fpu/internal.h>
  46 #include <asm/mmu_context.h>
  47 #include <asm/prctl.h>
  48 #include <asm/desc.h>
  49 #include <asm/proto.h>
  50 #include <asm/ia32.h>
  51 #include <asm/syscalls.h>
  52 #include <asm/debugreg.h>
  53 #include <asm/switch_to.h>
  54 #include <asm/xen/hypervisor.h>
  55 #include <asm/vdso.h>
  56 #include <asm/resctrl_sched.h>
  57 #include <asm/unistd.h>
  58 #include <asm/fsgsbase.h>
  59 #ifdef CONFIG_IA32_EMULATION
  60 /* Not included via unistd.h */
  61 #include <asm/unistd_32_ia32.h>
  62 #endif
  63 
  64 #include "process.h"
  65 
  66 /* Also prints some state that isn't saved in the pt_regs. */
  67 void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
  68 {
  69         unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
  70         unsigned long d0, d1, d2, d3, d6, d7;
  71         unsigned int fsindex, gsindex;
  72         unsigned int ds, es;
  73 
  74         show_iret_regs(regs);
  75 
  76         if (regs->orig_ax != -1)
  77                 pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
  78         else
  79                 pr_cont("\n");
  80 
  81         printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
  82                regs->ax, regs->bx, regs->cx);
  83         printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
  84                regs->dx, regs->si, regs->di);
  85         printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
  86                regs->bp, regs->r8, regs->r9);
  87         printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
  88                regs->r10, regs->r11, regs->r12);
  89         printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
  90                regs->r13, regs->r14, regs->r15);
  91 
  92         if (mode == SHOW_REGS_SHORT)
  93                 return;
  94 
  95         if (mode == SHOW_REGS_USER) {
  96                 rdmsrl(MSR_FS_BASE, fs);
  97                 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
  98                 printk(KERN_DEFAULT "FS:  %016lx GS:  %016lx\n",
  99                        fs, shadowgs);
 100                 return;
 101         }
 102 
 103         asm("movl %%ds,%0" : "=r" (ds));
 104         asm("movl %%es,%0" : "=r" (es));
 105         asm("movl %%fs,%0" : "=r" (fsindex));
 106         asm("movl %%gs,%0" : "=r" (gsindex));
 107 
 108         rdmsrl(MSR_FS_BASE, fs);
 109         rdmsrl(MSR_GS_BASE, gs);
 110         rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
 111 
 112         cr0 = read_cr0();
 113         cr2 = read_cr2();
 114         cr3 = __read_cr3();
 115         cr4 = __read_cr4();
 116 
 117         printk(KERN_DEFAULT "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
 118                fs, fsindex, gs, gsindex, shadowgs);
 119         printk(KERN_DEFAULT "CS:  %04lx DS: %04x ES: %04x CR0: %016lx\n", regs->cs, ds,
 120                         es, cr0);
 121         printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
 122                         cr4);
 123 
 124         get_debugreg(d0, 0);
 125         get_debugreg(d1, 1);
 126         get_debugreg(d2, 2);
 127         get_debugreg(d3, 3);
 128         get_debugreg(d6, 6);
 129         get_debugreg(d7, 7);
 130 
 131         /* Only print out debug registers if they are in their non-default state. */
 132         if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
 133             (d6 == DR6_RESERVED) && (d7 == 0x400))) {
 134                 printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n",
 135                        d0, d1, d2);
 136                 printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n",
 137                        d3, d6, d7);
 138         }
 139 
 140         if (boot_cpu_has(X86_FEATURE_OSPKE))
 141                 printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru());
 142 }
 143 
 144 void release_thread(struct task_struct *dead_task)
 145 {
 146         WARN_ON(dead_task->mm);
 147 }
 148 
 149 enum which_selector {
 150         FS,
 151         GS
 152 };
 153 
 154 /*
 155  * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
 156  * not available.  The goal is to be reasonably fast on non-FSGSBASE systems.
 157  * It's forcibly inlined because it'll generate better code and this function
 158  * is hot.
 159  */
 160 static __always_inline void save_base_legacy(struct task_struct *prev_p,
 161                                              unsigned short selector,
 162                                              enum which_selector which)
 163 {
 164         if (likely(selector == 0)) {
 165                 /*
 166                  * On Intel (without X86_BUG_NULL_SEG), the segment base could
 167                  * be the pre-existing saved base or it could be zero.  On AMD
 168                  * (with X86_BUG_NULL_SEG), the segment base could be almost
 169                  * anything.
 170                  *
 171                  * This branch is very hot (it's hit twice on almost every
 172                  * context switch between 64-bit programs), and avoiding
 173                  * the RDMSR helps a lot, so we just assume that whatever
 174                  * value is already saved is correct.  This matches historical
 175                  * Linux behavior, so it won't break existing applications.
 176                  *
 177                  * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
 178                  * report that the base is zero, it needs to actually be zero:
 179                  * see the corresponding logic in load_seg_legacy.
 180                  */
 181         } else {
 182                 /*
 183                  * If the selector is 1, 2, or 3, then the base is zero on
 184                  * !X86_BUG_NULL_SEG CPUs and could be anything on
 185                  * X86_BUG_NULL_SEG CPUs.  In the latter case, Linux
 186                  * has never attempted to preserve the base across context
 187                  * switches.
 188                  *
 189                  * If selector > 3, then it refers to a real segment, and
 190                  * saving the base isn't necessary.
 191                  */
 192                 if (which == FS)
 193                         prev_p->thread.fsbase = 0;
 194                 else
 195                         prev_p->thread.gsbase = 0;
 196         }
 197 }
 198 
 199 static __always_inline void save_fsgs(struct task_struct *task)
 200 {
 201         savesegment(fs, task->thread.fsindex);
 202         savesegment(gs, task->thread.gsindex);
 203         save_base_legacy(task, task->thread.fsindex, FS);
 204         save_base_legacy(task, task->thread.gsindex, GS);
 205 }
 206 
 207 #if IS_ENABLED(CONFIG_KVM)
 208 /*
 209  * While a process is running, current->thread.fsbase and current->thread.gsbase
 210  * may not match the corresponding CPU registers (see save_base_legacy()). KVM
 211  * wants an efficient way to save and restore FSBASE and GSBASE.
 212  * When FSGSBASE extensions are enabled, this will have to use RD{FS,GS}BASE.
 213  */
 214 void save_fsgs_for_kvm(void)
 215 {
 216         save_fsgs(current);
 217 }
 218 EXPORT_SYMBOL_GPL(save_fsgs_for_kvm);
 219 #endif
 220 
 221 static __always_inline void loadseg(enum which_selector which,
 222                                     unsigned short sel)
 223 {
 224         if (which == FS)
 225                 loadsegment(fs, sel);
 226         else
 227                 load_gs_index(sel);
 228 }
 229 
 230 static __always_inline void load_seg_legacy(unsigned short prev_index,
 231                                             unsigned long prev_base,
 232                                             unsigned short next_index,
 233                                             unsigned long next_base,
 234                                             enum which_selector which)
 235 {
 236         if (likely(next_index <= 3)) {
 237                 /*
 238                  * The next task is using 64-bit TLS, is not using this
 239                  * segment at all, or is having fun with arcane CPU features.
 240                  */
 241                 if (next_base == 0) {
 242                         /*
 243                          * Nasty case: on AMD CPUs, we need to forcibly zero
 244                          * the base.
 245                          */
 246                         if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
 247                                 loadseg(which, __USER_DS);
 248                                 loadseg(which, next_index);
 249                         } else {
 250                                 /*
 251                                  * We could try to exhaustively detect cases
 252                                  * under which we can skip the segment load,
 253                                  * but there's really only one case that matters
 254                                  * for performance: if both the previous and
 255                                  * next states are fully zeroed, we can skip
 256                                  * the load.
 257                                  *
 258                                  * (This assumes that prev_base == 0 has no
 259                                  * false positives.  This is the case on
 260                                  * Intel-style CPUs.)
 261                                  */
 262                                 if (likely(prev_index | next_index | prev_base))
 263                                         loadseg(which, next_index);
 264                         }
 265                 } else {
 266                         if (prev_index != next_index)
 267                                 loadseg(which, next_index);
 268                         wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
 269                                next_base);
 270                 }
 271         } else {
 272                 /*
 273                  * The next task is using a real segment.  Loading the selector
 274                  * is sufficient.
 275                  */
 276                 loadseg(which, next_index);
 277         }
 278 }
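
/*
 * Illustrative restatement (not kernel code) of the decision made by
 * save_base_legacy() and load_seg_legacy() above, written as a pure
 * function so the branches are easier to follow.  The enum and the
 * function name are hypothetical and exist only for this sketch.
 */
#if 0	/* illustrative sketch, not built */
enum legacy_seg_action {
        SEG_DO_NOTHING,                 /* prev and next state fully zero */
        SEG_LOAD_SELECTOR,              /* write the selector only */
        SEG_LOAD_SELECTOR_AND_BASE,     /* selector (if changed) plus base MSR */
        SEG_ZERO_VIA_USER_DS,           /* X86_BUG_NULL_SEG: scrub the base */
};

static enum legacy_seg_action
legacy_seg_decision(unsigned short prev_index, unsigned long prev_base,
                    unsigned short next_index, unsigned long next_base,
                    bool cpu_has_null_seg_bug)
{
        if (next_index > 3)                     /* real GDT/LDT segment */
                return SEG_LOAD_SELECTOR;
        if (next_base != 0)                     /* 64-bit TLS-style base */
                return SEG_LOAD_SELECTOR_AND_BASE;
        if (cpu_has_null_seg_bug)               /* base must be forced to zero */
                return SEG_ZERO_VIA_USER_DS;
        if (prev_index | next_index | prev_base)
                return SEG_LOAD_SELECTOR;       /* something nonzero to clear */
        return SEG_DO_NOTHING;
}
#endif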
 279 
 280 static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
 281                                               struct thread_struct *next)
 282 {
 283         load_seg_legacy(prev->fsindex, prev->fsbase,
 284                         next->fsindex, next->fsbase, FS);
 285         load_seg_legacy(prev->gsindex, prev->gsbase,
 286                         next->gsindex, next->gsbase, GS);
 287 }
 288 
 289 static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
 290                                             unsigned short selector)
 291 {
 292         unsigned short idx = selector >> 3;
 293         unsigned long base;
 294 
 295         if (likely((selector & SEGMENT_TI_MASK) == 0)) {
 296                 if (unlikely(idx >= GDT_ENTRIES))
 297                         return 0;
 298 
 299                 /*
 300                  * There are no user segments in the GDT with nonzero bases
 301                  * other than the TLS segments.
 302                  */
 303                 if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
 304                         return 0;
 305 
 306                 idx -= GDT_ENTRY_TLS_MIN;
 307                 base = get_desc_base(&task->thread.tls_array[idx]);
 308         } else {
 309 #ifdef CONFIG_MODIFY_LDT_SYSCALL
 310                 struct ldt_struct *ldt;
 311 
 312                 /*
 313                  * If performance here mattered, we could protect the LDT
 314                  * with RCU.  This is a slow path, though, so we can just
 315                  * take the mutex.
 316                  */
 317                 mutex_lock(&task->mm->context.lock);
 318                 ldt = task->mm->context.ldt;
 319                 if (unlikely(!ldt || idx >= ldt->nr_entries))
 320                         base = 0;
 321                 else
 322                         base = get_desc_base(ldt->entries + idx);
 323                 mutex_unlock(&task->mm->context.lock);
 324 #else
 325                 base = 0;
 326 #endif
 327         }
 328 
 329         return base;
 330 }
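
/*
 * Background sketch (not kernel code) for the selector arithmetic above:
 * a segment selector holds the RPL in bits 0-1, the table indicator (TI,
 * 0 = GDT, 1 = LDT) in bit 2, and the descriptor index in bits 3-15,
 * which is where "idx = selector >> 3" and the SEGMENT_TI_MASK test come
 * from.  get_desc_base() reassembles the 32-bit base that a legacy
 * descriptor splits across three fields; this is also why full 64-bit
 * FS/GS bases travel via MSRs and arch_prctl() rather than via a
 * descriptor.  The struct below mirrors the usual desc_struct layout but
 * is a hypothetical stand-in used only for illustration.
 */
#if 0	/* illustrative sketch, not built */
struct demo_desc {
        u16 limit0;
        u16 base0;                                              /* base bits  0..15 */
        u16 base1 : 8, type : 4, s : 1, dpl : 2, p : 1;         /* base bits 16..23 */
        u16 limit1 : 4, avl : 1, l : 1, d : 1, g : 1, base2 : 8;/* base bits 24..31 */
};

static unsigned long demo_desc_base(const struct demo_desc *d)
{
        return d->base0 | ((unsigned long)d->base1 << 16) |
               ((unsigned long)d->base2 << 24);
}

static unsigned int demo_selector_index(unsigned short sel)
{
        return sel >> 3;                        /* strip the TI and RPL bits */
}
#endif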
 331 
 332 unsigned long x86_fsbase_read_task(struct task_struct *task)
 333 {
 334         unsigned long fsbase;
 335 
 336         if (task == current)
 337                 fsbase = x86_fsbase_read_cpu();
 338         else if (task->thread.fsindex == 0)
 339                 fsbase = task->thread.fsbase;
 340         else
 341                 fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
 342 
 343         return fsbase;
 344 }
 345 
 346 unsigned long x86_gsbase_read_task(struct task_struct *task)
 347 {
 348         unsigned long gsbase;
 349 
 350         if (task == current)
 351                 gsbase = x86_gsbase_read_cpu_inactive();
 352         else if (task->thread.gsindex == 0)
 353                 gsbase = task->thread.gsbase;
 354         else
 355                 gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
 356 
 357         return gsbase;
 358 }
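
/*
 * Hedged userspace sketch (not kernel code): the task-read helpers above
 * are what make a stopped task's FS/GS bases visible through interfaces
 * such as ptrace.  On x86-64, struct user_regs_struct from <sys/user.h>
 * exposes them as fs_base and gs_base.  Build this as an ordinary user
 * program.
 */
#if 0	/* userspace example, kept out of the kernel build */
#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/wait.h>

int main(void)
{
        pid_t child = fork();

        if (child == 0) {
                ptrace(PTRACE_TRACEME, 0, NULL, NULL);
                raise(SIGSTOP);                 /* stop until the parent looks */
                _exit(0);
        }

        waitpid(child, NULL, 0);                /* child is now stopped */

        struct user_regs_struct regs;
        if (ptrace(PTRACE_GETREGS, child, NULL, &regs) == 0)
                printf("child fs_base=%#llx gs_base=%#llx\n",
                       (unsigned long long)regs.fs_base,
                       (unsigned long long)regs.gs_base);

        kill(child, SIGKILL);
        waitpid(child, NULL, 0);
        return 0;
}
#endif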
 359 
 360 void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
 361 {
 362         WARN_ON_ONCE(task == current);
 363 
 364         task->thread.fsbase = fsbase;
 365 }
 366 
 367 void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
 368 {
 369         WARN_ON_ONCE(task == current);
 370 
 371         task->thread.gsbase = gsbase;
 372 }
 373 
 374 int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
 375                 unsigned long arg, struct task_struct *p, unsigned long tls)
 376 {
 377         int err;
 378         struct pt_regs *childregs;
 379         struct fork_frame *fork_frame;
 380         struct inactive_task_frame *frame;
 381         struct task_struct *me = current;
 382 
 383         childregs = task_pt_regs(p);
 384         fork_frame = container_of(childregs, struct fork_frame, regs);
 385         frame = &fork_frame->frame;
 386 
 387         frame->bp = 0;
 388         frame->ret_addr = (unsigned long) ret_from_fork;
 389         p->thread.sp = (unsigned long) fork_frame;
 390         p->thread.io_bitmap_ptr = NULL;
 391 
 392         savesegment(gs, p->thread.gsindex);
 393         p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase;
 394         savesegment(fs, p->thread.fsindex);
 395         p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase;
 396         savesegment(es, p->thread.es);
 397         savesegment(ds, p->thread.ds);
 398         memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
 399 
 400         if (unlikely(p->flags & PF_KTHREAD)) {
 401                 /* kernel thread */
 402                 memset(childregs, 0, sizeof(struct pt_regs));
 403                 frame->bx = sp;         /* function */
 404                 frame->r12 = arg;
 405                 return 0;
 406         }
 407         frame->bx = 0;
 408         *childregs = *current_pt_regs();
 409 
 410         childregs->ax = 0;
 411         if (sp)
 412                 childregs->sp = sp;
 413 
 414         err = -ENOMEM;
 415         if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
 416                 p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
 417                                                   IO_BITMAP_BYTES, GFP_KERNEL);
 418                 if (!p->thread.io_bitmap_ptr) {
 419                         p->thread.io_bitmap_max = 0;
 420                         return -ENOMEM;
 421                 }
 422                 set_tsk_thread_flag(p, TIF_IO_BITMAP);
 423         }
 424 
 425         /*
 426          * Set a new TLS for the child thread?
 427          */
 428         if (clone_flags & CLONE_SETTLS) {
 429 #ifdef CONFIG_IA32_EMULATION
 430                 if (in_ia32_syscall())
 431                         err = do_set_thread_area(p, -1,
 432                                 (struct user_desc __user *)tls, 0);
 433                 else
 434 #endif
 435                         err = do_arch_prctl_64(p, ARCH_SET_FS, tls);
 436                 if (err)
 437                         goto out;
 438         }
 439         err = 0;
 440 out:
 441         if (err && p->thread.io_bitmap_ptr) {
 442                 kfree(p->thread.io_bitmap_ptr);
 443                 p->thread.io_bitmap_max = 0;
 444         }
 445 
 446         return err;
 447 }
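
/*
 * Hedged userspace sketch (not kernel code): for a 64-bit caller, the
 * CLONE_SETTLS branch above is how every new thread gets its own FS base.
 * pthread_create() passes the new thread's TCB as the clone() tls
 * argument, which ends up in do_arch_prctl_64(p, ARCH_SET_FS, tls).  The
 * program below just shows that each thread observes a different FS base
 * via ARCH_GET_FS.
 */
#if 0	/* userspace example, kept out of the kernel build */
#include <stdio.h>
#include <pthread.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/prctl.h>                  /* ARCH_GET_FS */

static void *report_fsbase(void *name)
{
        unsigned long base = 0;

        syscall(SYS_arch_prctl, ARCH_GET_FS, &base);
        printf("%s: fs base = %#lx\n", (const char *)name, base);
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, report_fsbase, (void *)"thread A");
        pthread_create(&b, NULL, report_fsbase, (void *)"thread B");
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        report_fsbase((void *)"main thread");
        return 0;
}
#endif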
 448 
 449 static void
 450 start_thread_common(struct pt_regs *regs, unsigned long new_ip,
 451                     unsigned long new_sp,
 452                     unsigned int _cs, unsigned int _ss, unsigned int _ds)
 453 {
 454         WARN_ON_ONCE(regs != current_pt_regs());
 455 
 456         if (static_cpu_has(X86_BUG_NULL_SEG)) {
 457                 /* Loading zero below won't clear the base. */
 458                 loadsegment(fs, __USER_DS);
 459                 load_gs_index(__USER_DS);
 460         }
 461 
 462         loadsegment(fs, 0);
 463         loadsegment(es, _ds);
 464         loadsegment(ds, _ds);
 465         load_gs_index(0);
 466 
 467         regs->ip                = new_ip;
 468         regs->sp                = new_sp;
 469         regs->cs                = _cs;
 470         regs->ss                = _ss;
 471         regs->flags             = X86_EFLAGS_IF;
 472         force_iret();
 473 }
 474 
 475 void
 476 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 477 {
 478         start_thread_common(regs, new_ip, new_sp,
 479                             __USER_CS, __USER_DS, 0);
 480 }
 481 EXPORT_SYMBOL_GPL(start_thread);
 482 
 483 #ifdef CONFIG_COMPAT
 484 void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
 485 {
 486         start_thread_common(regs, new_ip, new_sp,
 487                             test_thread_flag(TIF_X32)
 488                             ? __USER_CS : __USER32_CS,
 489                             __USER_DS, __USER_DS);
 490 }
 491 #endif
 492 
 493 /*
 494  *      switch_to(x,y) should switch tasks from x to y.
 495  *
 496  * This could still be optimized:
 497  * - fold all the options into a flag word and test it with a single test.
 498  * - could test fs/gs bitsliced
 499  *
 500  * Kprobes not supported here. Set the probe on schedule instead.
 501  * Function graph tracer not supported too.
 502  */
 503 __visible __notrace_funcgraph struct task_struct *
 504 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 505 {
 506         struct thread_struct *prev = &prev_p->thread;
 507         struct thread_struct *next = &next_p->thread;
 508         struct fpu *prev_fpu = &prev->fpu;
 509         struct fpu *next_fpu = &next->fpu;
 510         int cpu = smp_processor_id();
 511 
 512         WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
 513                      this_cpu_read(irq_count) != -1);
 514 
 515         if (!test_thread_flag(TIF_NEED_FPU_LOAD))
 516                 switch_fpu_prepare(prev_fpu, cpu);
 517 
 518         /* We must save %fs and %gs before load_TLS() because
 519          * %fs and %gs may be cleared by load_TLS().
 520          *
 521          * (e.g. xen_load_tls())
 522          */
 523         save_fsgs(prev_p);
 524 
 525         /*
 526          * Load TLS before restoring any segments so that segment loads
 527          * reference the correct GDT entries.
 528          */
 529         load_TLS(next, cpu);
 530 
 531         /*
 532          * Leave lazy mode, flushing any hypercalls made here.  This
 533          * must be done after loading TLS entries in the GDT but before
 534          * loading segments that might reference them.
 535          */
 536         arch_end_context_switch(next_p);
 537 
 538         /* Switch DS and ES.
 539          *
 540          * Reading them only returns the selectors, but writing them (if
 541          * nonzero) loads the full descriptor from the GDT or LDT.  The
 542          * LDT for next is loaded in switch_mm, and the GDT is loaded
 543          * above.
 544          *
 545          * We therefore need to write new values to the segment
 546          * registers on every context switch unless both the new and old
 547          * values are zero.
 548          *
 549          * Note that we don't need to do anything for CS and SS, as
 550          * those are saved and restored as part of pt_regs.
 551          */
 552         savesegment(es, prev->es);
 553         if (unlikely(next->es | prev->es))
 554                 loadsegment(es, next->es);
 555 
 556         savesegment(ds, prev->ds);
 557         if (unlikely(next->ds | prev->ds))
 558                 loadsegment(ds, next->ds);
 559 
 560         x86_fsgsbase_load(prev, next);
 561 
 562         /*
 563          * Switch the PDA and FPU contexts.
 564          */
 565         this_cpu_write(current_task, next_p);
 566         this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
 567 
 568         switch_fpu_finish(next_fpu);
 569 
 570         /* Reload sp0. */
 571         update_task_stack(next_p);
 572 
 573         switch_to_extra(prev_p, next_p);
 574 
 575 #ifdef CONFIG_XEN_PV
 576         /*
 577          * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
 578          * current_pt_regs()->flags may not match the current task's
 579          * intended IOPL.  We need to switch it manually.
 580          */
 581         if (unlikely(static_cpu_has(X86_FEATURE_XENPV) &&
 582                      prev->iopl != next->iopl))
 583                 xen_set_iopl_mask(next->iopl);
 584 #endif
 585 
 586         if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
 587                 /*
 588                  * AMD CPUs have a misfeature: SYSRET sets the SS selector but
 589                  * does not update the cached descriptor.  As a result, if we
 590                  * do SYSRET while SS is NULL, we'll end up in user mode with
 591                  * SS apparently equal to __USER_DS but actually unusable.
 592                  *
 593                  * The straightforward workaround would be to fix it up just
 594                  * before SYSRET, but that would slow down the system call
 595                  * fast paths.  Instead, we ensure that SS is never NULL in
 596                  * system call context.  We do this by replacing NULL SS
 597                  * selectors at every context switch.  SYSCALL sets up a valid
 598                  * SS, so the only way to get NULL is to re-enter the kernel
 599                  * from CPL 3 through an interrupt.  Since that can't happen
 600                  * in the same task as a running syscall, we are guaranteed to
 601                  * context switch between every interrupt vector entry and a
 602                  * subsequent SYSRET.
 603                  *
 604                  * We read SS first because SS reads are much faster than
 605                  * writes.  Out of caution, we force SS to __KERNEL_DS even if
 606                  * it previously had a different non-NULL value.
 607                  */
 608                 unsigned short ss_sel;
 609                 savesegment(ss, ss_sel);
 610                 if (ss_sel != __KERNEL_DS)
 611                         loadsegment(ss, __KERNEL_DS);
 612         }
 613 
 614         /* Load the Intel cache allocation PQR MSR. */
 615         resctrl_sched_in();
 616 
 617         return prev_p;
 618 }
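
/*
 * Hedged userspace aside (not kernel code): on x86-64 every task switch
 * funnels through __switch_to() above.  A rough way to see how often that
 * happens for a given task is the per-task context-switch counters in
 * /proc/<pid>/status.
 */
#if 0	/* userspace example, kept out of the kernel build */
#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/proc/self/status", "r");

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f))
                if (strstr(line, "ctxt_switches"))      /* voluntary + nonvoluntary */
                        fputs(line, stdout);
        fclose(f);
        return 0;
}
#endif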
 619 
 620 void set_personality_64bit(void)
 621 {
 622         /* inherit personality from parent */
 623 
 624         /* Make sure to be in 64bit mode */
 625         clear_thread_flag(TIF_IA32);
 626         clear_thread_flag(TIF_ADDR32);
 627         clear_thread_flag(TIF_X32);
 628         /* Pretend that this comes from a 64bit execve */
 629         task_pt_regs(current)->orig_ax = __NR_execve;
 630         current_thread_info()->status &= ~TS_COMPAT;
 631 
 632         /* Ensure the corresponding mm is not marked. */
 633         if (current->mm)
 634                 current->mm->context.ia32_compat = 0;
 635 
 636         /* TBD: overwrites user setup. Should have two bits.
 637            But 64bit processes have always behaved this way,
 638            so it's not too bad. The main problem is just that
 639            32bit children are affected again. */
 640         current->personality &= ~READ_IMPLIES_EXEC;
 641 }
 642 
 643 static void __set_personality_x32(void)
 644 {
 645 #ifdef CONFIG_X86_X32
 646         clear_thread_flag(TIF_IA32);
 647         set_thread_flag(TIF_X32);
 648         if (current->mm)
 649                 current->mm->context.ia32_compat = TIF_X32;
 650         current->personality &= ~READ_IMPLIES_EXEC;
 651         /*
 652          * in_32bit_syscall() uses the presence of the x32 syscall bit
 653          * flag to determine compat status.  The x86 mmap() code relies on
 654          * the syscall bitness, so set the x32 syscall bit right here to make
 655          * in_32bit_syscall() work during exec().
 656          *
 657          * Pretend to come from an x32 execve.
 658          */
 659         task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
 660         current_thread_info()->status &= ~TS_COMPAT;
 661 #endif
 662 }
 663 
 664 static void __set_personality_ia32(void)
 665 {
 666 #ifdef CONFIG_IA32_EMULATION
 667         set_thread_flag(TIF_IA32);
 668         clear_thread_flag(TIF_X32);
 669         if (current->mm)
 670                 current->mm->context.ia32_compat = TIF_IA32;
 671         current->personality |= force_personality32;
 672         /* Prepare the first "return" to user space */
 673         task_pt_regs(current)->orig_ax = __NR_ia32_execve;
 674         current_thread_info()->status |= TS_COMPAT;
 675 #endif
 676 }
 677 
 678 void set_personality_ia32(bool x32)
 679 {
 680         /* Make sure to be in 32bit mode */
 681         set_thread_flag(TIF_ADDR32);
 682 
 683         if (x32)
 684                 __set_personality_x32();
 685         else
 686                 __set_personality_ia32();
 687 }
 688 EXPORT_SYMBOL_GPL(set_personality_ia32);
 689 
 690 #ifdef CONFIG_CHECKPOINT_RESTORE
 691 static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
 692 {
 693         int ret;
 694 
 695         ret = map_vdso_once(image, addr);
 696         if (ret)
 697                 return ret;
 698 
 699         return (long)image->size;
 700 }
 701 #endif
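
/*
 * Hedged userspace sketch (not kernel code): prctl_map_vdso() above backs
 * the ARCH_MAP_VDSO_* cases in do_arch_prctl_64() below, added for
 * checkpoint/restore tools.  On success the call returns the size of the
 * mapped vDSO image; it is only expected to succeed in a process whose
 * original vDSO has already been unmapped (the restore scenario), so this
 * is illustrative rather than something to run as-is.
 */
#if 0	/* userspace example, kept out of the kernel build */
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/prctl.h>                  /* ARCH_MAP_VDSO_64 */

/* Returns the vDSO image size, or -1 with errno set on failure. */
static long map_vdso_at(unsigned long addr)
{
        return syscall(SYS_arch_prctl, ARCH_MAP_VDSO_64, addr);
}
#endif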
 702 
 703 long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
 704 {
 705         int ret = 0;
 706 
 707         switch (option) {
 708         case ARCH_SET_GS: {
 709                 if (unlikely(arg2 >= TASK_SIZE_MAX))
 710                         return -EPERM;
 711 
 712                 preempt_disable();
 713                 /*
 714                  * ARCH_SET_GS has always overwritten the index
 715                  * and the base. Zero is the most sensible value
 716                  * to put in the index, and is the only value that
 717                  * makes any sense if FSGSBASE is unavailable.
 718                  */
 719                 if (task == current) {
 720                         loadseg(GS, 0);
 721                         x86_gsbase_write_cpu_inactive(arg2);
 722 
 723                         /*
 724                          * On non-FSGSBASE systems, save_base_legacy() expects
 725                          * that we also fill in thread.gsbase.
 726                          */
 727                         task->thread.gsbase = arg2;
 728 
 729                 } else {
 730                         task->thread.gsindex = 0;
 731                         x86_gsbase_write_task(task, arg2);
 732                 }
 733                 preempt_enable();
 734                 break;
 735         }
 736         case ARCH_SET_FS: {
 737                 /*
 738                  * Not strictly needed for %fs, but do it for symmetry
 739                  * with %gs
 740                  */
 741                 if (unlikely(arg2 >= TASK_SIZE_MAX))
 742                         return -EPERM;
 743 
 744                 preempt_disable();
 745                 /*
 746                  * Set the selector to 0 for the same reason
 747                  * as %gs above.
 748                  */
 749                 if (task == current) {
 750                         loadseg(FS, 0);
 751                         x86_fsbase_write_cpu(arg2);
 752 
 753                         /*
 754                          * On non-FSGSBASE systems, save_base_legacy() expects
 755                          * that we also fill in thread.fsbase.
 756                          */
 757                         task->thread.fsbase = arg2;
 758                 } else {
 759                         task->thread.fsindex = 0;
 760                         x86_fsbase_write_task(task, arg2);
 761                 }
 762                 preempt_enable();
 763                 break;
 764         }
 765         case ARCH_GET_FS: {
 766                 unsigned long base = x86_fsbase_read_task(task);
 767 
 768                 ret = put_user(base, (unsigned long __user *)arg2);
 769                 break;
 770         }
 771         case ARCH_GET_GS: {
 772                 unsigned long base = x86_gsbase_read_task(task);
 773 
 774                 ret = put_user(base, (unsigned long __user *)arg2);
 775                 break;
 776         }
 777 
 778 #ifdef CONFIG_CHECKPOINT_RESTORE
 779 # ifdef CONFIG_X86_X32_ABI
 780         case ARCH_MAP_VDSO_X32:
 781                 return prctl_map_vdso(&vdso_image_x32, arg2);
 782 # endif
 783 # if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
 784         case ARCH_MAP_VDSO_32:
 785                 return prctl_map_vdso(&vdso_image_32, arg2);
 786 # endif
 787         case ARCH_MAP_VDSO_64:
 788                 return prctl_map_vdso(&vdso_image_64, arg2);
 789 #endif
 790 
 791         default:
 792                 ret = -EINVAL;
 793                 break;
 794         }
 795 
 796         return ret;
 797 }
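
/*
 * Hedged userspace sketch (not kernel code) of the ARCH_SET_GS/ARCH_GET_GS
 * paths handled above: point the user GS base at a private buffer, read
 * the base back, and access memory through a %gs-relative load.  GS is
 * used rather than FS because glibc already owns the FS base for TLS.
 */
#if 0	/* userspace example, kept out of the kernel build */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/prctl.h>                  /* ARCH_SET_GS, ARCH_GET_GS */

int main(void)
{
        static unsigned long buf[4] = { 0xdeadbeef };
        unsigned long base = 0, first = 0;

        if (syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)buf))
                return 1;
        syscall(SYS_arch_prctl, ARCH_GET_GS, &base);

        /* %gs:0 now reads buf[0]. */
        asm volatile("movq %%gs:0, %0" : "=r" (first));

        printf("gs base = %#lx (buf = %p), %%gs:0 = %#lx\n",
               base, (void *)buf, first);
        return 0;
}
#endif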
 798 
 799 SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
 800 {
 801         long ret;
 802 
 803         ret = do_arch_prctl_64(current, option, arg2);
 804         if (ret == -EINVAL)
 805                 ret = do_arch_prctl_common(current, option, arg2);
 806 
 807         return ret;
 808 }
 809 
 810 #ifdef CONFIG_IA32_EMULATION
 811 COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
 812 {
 813         return do_arch_prctl_common(current, option, arg2);
 814 }
 815 #endif
 816 
 817 unsigned long KSTK_ESP(struct task_struct *task)
 818 {
 819         return task_pt_regs(task)->sp;
 820 }
