root/arch/x86/kernel/kprobes/core.c


DEFINITIONS

This source file includes the following definitions.
  1. __synthesize_relative_insn
  2. synthesize_reljump
  3. synthesize_relcall
  4. skip_prefixes
  5. can_boost
  6. __recover_probed_insn
  7. recover_probed_instruction
  8. can_probe
  9. is_IF_modifier
  10. __copy_instruction
  11. prepare_boost
  12. alloc_insn_page
  13. free_insn_page
  14. arch_copy_kprobe
  15. arch_prepare_kprobe
  16. arch_arm_kprobe
  17. arch_disarm_kprobe
  18. arch_remove_kprobe
  19. save_previous_kprobe
  20. restore_previous_kprobe
  21. set_current_kprobe
  22. clear_btf
  23. restore_btf
  24. arch_prepare_kretprobe
  25. setup_singlestep
  26. reenter_kprobe
  27. kprobe_int3_handler
  28. trampoline_handler
  29. resume_execution
  30. kprobe_debug_handler
  31. kprobe_fault_handler
  32. arch_populate_kprobe_blacklist
  33. arch_init_kprobes
  34. arch_trampoline_kprobe

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  *  Kernel Probes (KProbes)
   4  *
   5  * Copyright (C) IBM Corporation, 2002, 2004
   6  *
   7  * 2002-Oct     Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
   8  *              Probes initial implementation ( includes contributions from
   9  *              Rusty Russell).
  10  * 2004-July    Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
  11  *              interface to access function arguments.
  12  * 2004-Oct     Jim Keniston <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
  13  *              <prasanna@in.ibm.com> adapted for x86_64 from i386.
  14  * 2005-Mar     Roland McGrath <roland@redhat.com>
  15  *              Fixed to handle %rip-relative addressing mode correctly.
  16  * 2005-May     Hien Nguyen <hien@us.ibm.com>, Jim Keniston
  17  *              <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
  18  *              <prasanna@in.ibm.com> added function-return probes.
  19  * 2005-May     Rusty Lynch <rusty.lynch@intel.com>
  20  *              Added function return probes functionality
  21  * 2006-Feb     Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added
  22  *              kprobe-booster and kretprobe-booster for i386.
  23  * 2007-Dec     Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster
  24  *              and kretprobe-booster for x86-64
  25  * 2007-Dec     Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven
  26  *              <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
  27  *              unified x86 kprobes code.
  28  */
  29 #include <linux/kprobes.h>
  30 #include <linux/ptrace.h>
  31 #include <linux/string.h>
  32 #include <linux/slab.h>
  33 #include <linux/hardirq.h>
  34 #include <linux/preempt.h>
  35 #include <linux/sched/debug.h>
  36 #include <linux/extable.h>
  37 #include <linux/kdebug.h>
  38 #include <linux/kallsyms.h>
  39 #include <linux/ftrace.h>
  40 #include <linux/frame.h>
  41 #include <linux/kasan.h>
  42 #include <linux/moduleloader.h>
  43 
  44 #include <asm/text-patching.h>
  45 #include <asm/cacheflush.h>
  46 #include <asm/desc.h>
  47 #include <asm/pgtable.h>
  48 #include <linux/uaccess.h>
  49 #include <asm/alternative.h>
  50 #include <asm/insn.h>
  51 #include <asm/debugreg.h>
  52 #include <asm/set_memory.h>
  53 
  54 #include "common.h"
  55 
  56 DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
  57 DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
  58 
  59 #define stack_addr(regs) ((unsigned long *)regs->sp)
  60 
  61 #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
  62         (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) |   \
  63           (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) |   \
  64           (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) |   \
  65           (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf))    \
  66          << (row % 32))
  67         /*
  68          * Undefined/reserved opcodes, conditional jumps, Opcode Extension
  69          * Groups, and some special opcodes cannot be boosted.
  70          * This is non-const and volatile to keep gcc from statically
  71          * optimizing it out, as variable_test_bit makes gcc think only
  72          * *(unsigned long*) is used.
  73          */
  74 static volatile u32 twobyte_is_boostable[256 / 32] = {
  75         /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
  76         /*      ----------------------------------------------          */
  77         W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */
  78         W(0x10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1) , /* 10 */
  79         W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 20 */
  80         W(0x30, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
  81         W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
  82         W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
  83         W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1) | /* 60 */
  84         W(0x70, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
  85         W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 80 */
  86         W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
  87         W(0xa0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* a0 */
  88         W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) , /* b0 */
  89         W(0xc0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
  90         W(0xd0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) , /* d0 */
  91         W(0xe0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1) | /* e0 */
  92         W(0xf0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0)   /* f0 */
  93         /*      -----------------------------------------------         */
  94         /*      0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f          */
  95 };
  96 #undef W
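
/*
 * Example of how can_boost() consults the table above: for the two-byte
 * instruction 0f b6 (movzx), test_bit(0xb6, table) reads bit 0xb6 = 182 of
 * this 256-bit array, i.e. bit 22 of the sixth u32 entry, which packs row
 * 0xa0 in bits 0-15 and row 0xb0 in bits 16-31.  Bit 22 is column 6 of the
 * 0xb0 row, which is 1, so movzx is boostable.
 */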
  97 
  98 struct kretprobe_blackpoint kretprobe_blacklist[] = {
  99         {"__switch_to", }, /* This function only switches the current task, but
 100                               doesn't switch the kernel stack. */
 101         {NULL, NULL}    /* Terminator */
 102 };
 103 
 104 const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
 105 
 106 static nokprobe_inline void
 107 __synthesize_relative_insn(void *dest, void *from, void *to, u8 op)
 108 {
 109         struct __arch_relative_insn {
 110                 u8 op;
 111                 s32 raddr;
 112         } __packed *insn;
 113 
 114         insn = (struct __arch_relative_insn *)dest;
 115         insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
 116         insn->op = op;
 117 }
 118 
 119 /* Synthesize a jump into 'dest', encoded as if it were at 'from', that jumps to address 'to'. */
 120 void synthesize_reljump(void *dest, void *from, void *to)
 121 {
 122         __synthesize_relative_insn(dest, from, to, RELATIVEJUMP_OPCODE);
 123 }
 124 NOKPROBE_SYMBOL(synthesize_reljump);
 125 
 126 /* Synthesize a call into 'dest', encoded as if it were at 'from', that calls address 'to'. */
 127 void synthesize_relcall(void *dest, void *from, void *to)
 128 {
 129         __synthesize_relative_insn(dest, from, to, RELATIVECALL_OPCODE);
 130 }
 131 NOKPROBE_SYMBOL(synthesize_relcall);
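
/*
 * Worked example of the displacement math in __synthesize_relative_insn(),
 * using hypothetical addresses: a reljump that will live at
 * from = 0xffffffffa0001000 and must land on to = 0xffffffffa0000010 gets
 * raddr = to - (from + 5) = -0xff5, the ordinary rel32 encoding of a
 * 5-byte jmp/call.
 */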
 132 
 133 /*
 134  * Skip the prefixes of the instruction.
 135  */
 136 static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn)
 137 {
 138         insn_attr_t attr;
 139 
 140         attr = inat_get_opcode_attribute((insn_byte_t)*insn);
 141         while (inat_is_legacy_prefix(attr)) {
 142                 insn++;
 143                 attr = inat_get_opcode_attribute((insn_byte_t)*insn);
 144         }
 145 #ifdef CONFIG_X86_64
 146         if (inat_is_rex_prefix(attr))
 147                 insn++;
 148 #endif
 149         return insn;
 150 }
 151 NOKPROBE_SYMBOL(skip_prefixes);
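
/*
 * For example, given the bytes f0 48 0f b1 ... (lock cmpxchg with a REX.W
 * prefix on x86_64), skip_prefixes() steps over the legacy LOCK prefix 0xf0
 * and the REX byte 0x48, and returns a pointer to the opcode byte 0x0f.
 */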
 152 
 153 /*
 154  * Returns non-zero if INSN is boostable.
 155  * RIP-relative instructions are adjusted at copy time, in 64-bit mode.
 156  */
 157 int can_boost(struct insn *insn, void *addr)
 158 {
 159         kprobe_opcode_t opcode;
 160 
 161         if (search_exception_tables((unsigned long)addr))
 162                 return 0;       /* Page fault may occur on this address. */
 163 
 164         /* 2nd-byte opcode */
 165         if (insn->opcode.nbytes == 2)
 166                 return test_bit(insn->opcode.bytes[1],
 167                                 (unsigned long *)twobyte_is_boostable);
 168 
 169         if (insn->opcode.nbytes != 1)
 170                 return 0;
 171 
 172         /* Can't boost Address-size override prefix */
 173         if (unlikely(inat_is_address_size_prefix(insn->attr)))
 174                 return 0;
 175 
 176         opcode = insn->opcode.bytes[0];
 177 
 178         switch (opcode & 0xf0) {
 179         case 0x60:
 180                 /* can't boost "bound" */
 181                 return (opcode != 0x62);
 182         case 0x70:
 183                 return 0; /* can't boost conditional jump */
 184         case 0x90:
 185                 return opcode != 0x9a;  /* can't boost call far */
 186         case 0xc0:
 187                 /* can't boost software interrupts */
 188                 return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf;
 189         case 0xd0:
 190                 /* can boost AA* and XLAT */
 191                 return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7);
 192         case 0xe0:
 193                 /* can boost in/out and absolute jmps */
 194                 return ((opcode & 0x04) || opcode == 0xea);
 195         case 0xf0:
 196                 /* clear and set flags are boostable */
 197                 return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe));
 198         default:
 199                 /* CS override prefix and call are not boostable */
 200                 return (opcode != 0x2e && opcode != 0x9a);
 201         }
 202 }
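
/*
 * A minimal sketch of how a caller typically drives can_boost(): decode the
 * (possibly recovered) instruction bytes with the in-kernel decoder, then ask
 * whether a reljump may be placed after the copy.  The helper below is only
 * illustrative and is not used elsewhere in this file.
 */
#if 0
static int example_is_boostable(kprobe_opcode_t *buf, void *addr)
{
	struct insn insn;

	/* Decode prefixes, opcode and total length of the copied bytes. */
	kernel_insn_init(&insn, buf, MAX_INSN_SIZE);
	insn_get_length(&insn);

	return can_boost(&insn, addr);
}
#endif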
 203 
 204 static unsigned long
 205 __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
 206 {
 207         struct kprobe *kp;
 208         unsigned long faddr;
 209 
 210         kp = get_kprobe((void *)addr);
 211         faddr = ftrace_location(addr);
 212         /*
 213          * Addresses inside the ftrace location are refused by
 214          * arch_check_ftrace_location(). Something went terribly wrong
 215          * if such an address is checked here.
 216          */
 217         if (WARN_ON(faddr && faddr != addr))
 218                 return 0UL;
 219         /*
 220          * Use the current code if it is not modified by Kprobe
 221          * and it cannot be modified by ftrace.
 222          */
 223         if (!kp && !faddr)
 224                 return addr;
 225 
 226         /*
 227          * Basically, kp->ainsn.insn has an original instruction.
 228          * Basically, kp->ainsn.insn holds the original instruction.
 229          * However, a RIP-relative instruction cannot be single-stepped
 230          * at a different place, so __copy_instruction() tweaks its
 231          * displacement. In that case, we can't recover the original
 232          * instruction from kp->ainsn.insn.
 233          *
 234          * On the other hand, for a normal kprobe, kp->opcode holds a copy
 235          * of the first byte of the probed instruction, which is overwritten
 236          * by int3. Since the instruction at kp->addr is not modified by
 237          * kprobes except for that first byte, we can recover the original
 238          * instruction from it and kp->opcode.
 239          *
 240          * For kprobes using ftrace, we do not have a copy of
 241          * the original instruction. In fact, the ftrace location might
 242          * be modified at any time and could even be in an inconsistent
 243          * state. Fortunately, we know that the original code is the
 244          * ideal 5-byte-long NOP. */
 245         if (probe_kernel_read(buf, (void *)addr,
 246                 MAX_INSN_SIZE * sizeof(kprobe_opcode_t)))
 247                 return 0UL;
 248 
 249         if (faddr)
 250                 memcpy(buf, ideal_nops[NOP_ATOMIC5], 5);
 251         else
 252                 buf[0] = kp->opcode;
 253         return (unsigned long)buf;
 254 }
 255 
 256 /*
 257  * Recover the probed instruction at addr for further analysis.
 258  * The caller must lock kprobes with kprobe_mutex, or disable preemption,
 259  * to prevent the referenced kprobes from being released.
 260  * Returns zero if the instruction could not be recovered (or access failed).
 261  */
 262 unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
 263 {
 264         unsigned long __addr;
 265 
 266         __addr = __recover_optprobed_insn(buf, addr);
 267         if (__addr != addr)
 268                 return __addr;
 269 
 270         return __recover_probed_insn(buf, addr);
 271 }
 272 
 273 /* Check if paddr is at an instruction boundary */
 274 static int can_probe(unsigned long paddr)
 275 {
 276         unsigned long addr, __addr, offset = 0;
 277         struct insn insn;
 278         kprobe_opcode_t buf[MAX_INSN_SIZE];
 279 
 280         if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
 281                 return 0;
 282 
 283         /* Decode instructions */
 284         addr = paddr - offset;
 285         while (addr < paddr) {
 286                 /*
 287                  * Check if the instruction has been modified by another
 288                  * kprobe, in which case we replace the breakpoint by the
 289                  * original instruction in our buffer.
 290                  * Also, jump optimization will change the breakpoint to
 291                  * a relative jump. Since a relative jump is itself a normally
 292                  * used instruction, we just pass it through if there is no kprobe.
 293                  */
 294                 __addr = recover_probed_instruction(buf, addr);
 295                 if (!__addr)
 296                         return 0;
 297                 kernel_insn_init(&insn, (void *)__addr, MAX_INSN_SIZE);
 298                 insn_get_length(&insn);
 299 
 300                 /*
 301                  * Another debugging subsystem might insert this breakpoint.
 302                  * In that case, we can't recover it.
 303                  */
 304                 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
 305                         return 0;
 306                 addr += insn.length;
 307         }
 308 
 309         return (addr == paddr);
 310 }
 311 
 312 /*
 313  * Returns non-zero if opcode modifies the interrupt flag.
 314  */
 315 static int is_IF_modifier(kprobe_opcode_t *insn)
 316 {
 317         /* Skip prefixes */
 318         insn = skip_prefixes(insn);
 319 
 320         switch (*insn) {
 321         case 0xfa:              /* cli */
 322         case 0xfb:              /* sti */
 323         case 0xcf:              /* iret/iretd */
 324         case 0x9d:              /* popf/popfd */
 325                 return 1;
 326         }
 327 
 328         return 0;
 329 }
 330 
 331 /*
 332  * Copy an instruction, recovering it if it has been modified by kprobes,
 333  * and adjust the displacement if the instruction uses the %rip-relative
 334  * addressing mode. Note that since @real will be the final place of the
 335  * copied instruction, the displacement must be adjusted against @real, not @dest.
 336  * This returns the length of the copied instruction, or 0 on error.
 337  */
 338 int __copy_instruction(u8 *dest, u8 *src, u8 *real, struct insn *insn)
 339 {
 340         kprobe_opcode_t buf[MAX_INSN_SIZE];
 341         unsigned long recovered_insn =
 342                 recover_probed_instruction(buf, (unsigned long)src);
 343 
 344         if (!recovered_insn || !insn)
 345                 return 0;
 346 
 347         /* This can access kernel text if given address is not recovered */
 348         if (probe_kernel_read(dest, (void *)recovered_insn, MAX_INSN_SIZE))
 349                 return 0;
 350 
 351         kernel_insn_init(insn, dest, MAX_INSN_SIZE);
 352         insn_get_length(insn);
 353 
 354         /* Another subsystem puts a breakpoint, failed to recover */
 355         if (insn->opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
 356                 return 0;
 357 
 358         /* We should not singlestep on the exception masking instructions */
 359         if (insn_masking_exception(insn))
 360                 return 0;
 361 
 362 #ifdef CONFIG_X86_64
 363         /* Only x86_64 has RIP relative instructions */
 364         if (insn_rip_relative(insn)) {
 365                 s64 newdisp;
 366                 u8 *disp;
 367                 /*
 368                  * The copied instruction uses the %rip-relative addressing
 369                  * mode.  Adjust the displacement for the difference between
 370                  * the original location of this instruction and the location
 371                  * of the copy that will actually be run.  The tricky bit here
 372                  * is making sure that the sign extension happens correctly in
 373                  * this calculation, since we need a signed 32-bit result to
 374                  * be sign-extended to 64 bits when it's added to the %rip
 375                  * value and yield the same 64-bit result that the sign-
 376                  * extension of the original signed 32-bit displacement would
 377                  * have given.
 378                  */
 379                 newdisp = (u8 *) src + (s64) insn->displacement.value
 380                           - (u8 *) real;
 381                 if ((s64) (s32) newdisp != newdisp) {
 382                         pr_err("Kprobes error: new displacement does not fit into s32 (%llx)\n", newdisp);
 383                         return 0;
 384                 }
 385                 disp = (u8 *) dest + insn_offset_displacement(insn);
 386                 *(s32 *) disp = (s32) newdisp;
 387         }
 388 #endif
 389         return insn->length;
 390 }
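
/*
 * Worked example of the RIP-relative fixup above, with hypothetical
 * addresses: for "mov 0x1000(%rip), %rax" probed at src = 0xffffffff81000000
 * and copied to real = 0xffffffffa0002000, the code computes
 * newdisp = src + disp - real = 0x1000 - 0x1f002000 = -0x1f001000, so the
 * copy still references the original target src + insn_length + 0x1000.
 */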
 391 
 392 /* Prepare a reljump right after the copied instruction to boost it */
 393 static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p,
 394                           struct insn *insn)
 395 {
 396         int len = insn->length;
 397 
 398         if (can_boost(insn, p->addr) &&
 399             MAX_INSN_SIZE - len >= RELATIVEJUMP_SIZE) {
 400                 /*
 401                  * This instruction can be executed directly if it
 402                  * jumps back to the correct address.
 403                  */
 404                 synthesize_reljump(buf + len, p->ainsn.insn + len,
 405                                    p->addr + insn->length);
 406                 len += RELATIVEJUMP_SIZE;
 407                 p->ainsn.boostable = true;
 408         } else {
 409                 p->ainsn.boostable = false;
 410         }
 411 
 412         return len;
 413 }
 414 
 415 /* Make the page read-only when allocating it */
 416 void *alloc_insn_page(void)
 417 {
 418         void *page;
 419 
 420         page = module_alloc(PAGE_SIZE);
 421         if (!page)
 422                 return NULL;
 423 
 424         set_vm_flush_reset_perms(page);
 425         /*
 426          * First make the page read-only, and only then make it executable to
 427          * prevent it from being W+X in between.
 428          */
 429         set_memory_ro((unsigned long)page, 1);
 430 
 431         /*
 432          * TODO: Once additional kernel code protection mechanisms are set, ensure
 433          * that the page was not maliciously altered and it is still zeroed.
 434          */
 435         set_memory_x((unsigned long)page, 1);
 436 
 437         return page;
 438 }
 439 
 440 /* Recover page to RW mode before releasing it */
 441 void free_insn_page(void *page)
 442 {
 443         module_memfree(page);
 444 }
 445 
 446 static int arch_copy_kprobe(struct kprobe *p)
 447 {
 448         struct insn insn;
 449         kprobe_opcode_t buf[MAX_INSN_SIZE];
 450         int len;
 451 
 452         /* Copy the instruction, recovering it if another optprobe has modified it. */
 453         len = __copy_instruction(buf, p->addr, p->ainsn.insn, &insn);
 454         if (!len)
 455                 return -EINVAL;
 456 
 457         /*
 458          * __copy_instruction can modify the displacement of the instruction,
 459          * but that doesn't affect the boostability check.
 460          */
 461         len = prepare_boost(buf, p, &insn);
 462 
 463         /* Check whether the instruction modifies Interrupt Flag or not */
 464         p->ainsn.if_modifier = is_IF_modifier(buf);
 465 
 466         /* Also, displacement change doesn't affect the first byte */
 467         p->opcode = buf[0];
 468 
 469         /* OK, write back the instruction(s) into ROX insn buffer */
 470         text_poke(p->ainsn.insn, buf, len);
 471 
 472         return 0;
 473 }
 474 
 475 int arch_prepare_kprobe(struct kprobe *p)
 476 {
 477         int ret;
 478 
 479         if (alternatives_text_reserved(p->addr, p->addr))
 480                 return -EINVAL;
 481 
 482         if (!can_probe((unsigned long)p->addr))
 483                 return -EILSEQ;
 484         /* insn: must be on special executable page on x86. */
 485         p->ainsn.insn = get_insn_slot();
 486         if (!p->ainsn.insn)
 487                 return -ENOMEM;
 488 
 489         ret = arch_copy_kprobe(p);
 490         if (ret) {
 491                 free_insn_slot(p->ainsn.insn, 0);
 492                 p->ainsn.insn = NULL;
 493         }
 494 
 495         return ret;
 496 }
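
/*
 * arch_prepare_kprobe() is reached through the generic register_kprobe()
 * path.  A minimal, illustrative sketch of module code exercising it
 * follows; the probed symbol and the handler names are only examples
 * (linux/kprobes.h is already included above).
 */
#if 0
static int example_pre(struct kprobe *kp, struct pt_regs *regs)
{
	pr_info("pre-handler: ip = %lx\n", regs->ip);
	return 0;	/* 0: continue with single-stepping as usual */
}

static struct kprobe example_kp = {
	.symbol_name	= "do_sys_open",	/* example target symbol */
	.pre_handler	= example_pre,
};

static int __init example_init(void)
{
	return register_kprobe(&example_kp);	/* ends up in arch_prepare_kprobe() */
}

static void __exit example_exit(void)
{
	unregister_kprobe(&example_kp);
}
#endif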
 497 
 498 void arch_arm_kprobe(struct kprobe *p)
 499 {
 500         text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1);
 501 }
 502 
 503 void arch_disarm_kprobe(struct kprobe *p)
 504 {
 505         text_poke(p->addr, &p->opcode, 1);
 506 }
 507 
 508 void arch_remove_kprobe(struct kprobe *p)
 509 {
 510         if (p->ainsn.insn) {
 511                 free_insn_slot(p->ainsn.insn, p->ainsn.boostable);
 512                 p->ainsn.insn = NULL;
 513         }
 514 }
 515 
 516 static nokprobe_inline void
 517 save_previous_kprobe(struct kprobe_ctlblk *kcb)
 518 {
 519         kcb->prev_kprobe.kp = kprobe_running();
 520         kcb->prev_kprobe.status = kcb->kprobe_status;
 521         kcb->prev_kprobe.old_flags = kcb->kprobe_old_flags;
 522         kcb->prev_kprobe.saved_flags = kcb->kprobe_saved_flags;
 523 }
 524 
 525 static nokprobe_inline void
 526 restore_previous_kprobe(struct kprobe_ctlblk *kcb)
 527 {
 528         __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
 529         kcb->kprobe_status = kcb->prev_kprobe.status;
 530         kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags;
 531         kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
 532 }
 533 
 534 static nokprobe_inline void
 535 set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
 536                    struct kprobe_ctlblk *kcb)
 537 {
 538         __this_cpu_write(current_kprobe, p);
 539         kcb->kprobe_saved_flags = kcb->kprobe_old_flags
 540                 = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
 541         if (p->ainsn.if_modifier)
 542                 kcb->kprobe_saved_flags &= ~X86_EFLAGS_IF;
 543 }
 544 
 545 static nokprobe_inline void clear_btf(void)
 546 {
 547         if (test_thread_flag(TIF_BLOCKSTEP)) {
 548                 unsigned long debugctl = get_debugctlmsr();
 549 
 550                 debugctl &= ~DEBUGCTLMSR_BTF;
 551                 update_debugctlmsr(debugctl);
 552         }
 553 }
 554 
 555 static nokprobe_inline void restore_btf(void)
 556 {
 557         if (test_thread_flag(TIF_BLOCKSTEP)) {
 558                 unsigned long debugctl = get_debugctlmsr();
 559 
 560                 debugctl |= DEBUGCTLMSR_BTF;
 561                 update_debugctlmsr(debugctl);
 562         }
 563 }
 564 
 565 void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
 566 {
 567         unsigned long *sara = stack_addr(regs);
 568 
 569         ri->ret_addr = (kprobe_opcode_t *) *sara;
 570         ri->fp = sara;
 571 
 572         /* Replace the return addr with trampoline addr */
 573         *sara = (unsigned long) &kretprobe_trampoline;
 574 }
 575 NOKPROBE_SYMBOL(arch_prepare_kretprobe);
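
/*
 * arch_prepare_kretprobe() is called from the generic kretprobe machinery
 * when the probed function is entered.  A minimal, illustrative sketch of
 * the registration side follows; the symbol and handler names are only
 * examples.
 */
#if 0
static int example_ret(struct kretprobe_instance *ri, struct pt_regs *regs)
{
	pr_info("return value: %lx\n", regs_return_value(regs));
	return 0;
}

static struct kretprobe example_krp = {
	.handler	= example_ret,
	.maxactive	= 20,			/* concurrent instances to track */
	.kp.symbol_name	= "do_sys_open",	/* example target symbol */
};

/* register_kretprobe(&example_krp); ... unregister_kretprobe(&example_krp); */
#endif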
 576 
 577 static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
 578                              struct kprobe_ctlblk *kcb, int reenter)
 579 {
 580         if (setup_detour_execution(p, regs, reenter))
 581                 return;
 582 
 583 #if !defined(CONFIG_PREEMPTION)
 584         if (p->ainsn.boostable && !p->post_handler) {
 585                 /* Boost up -- we can execute copied instructions directly */
 586                 if (!reenter)
 587                         reset_current_kprobe();
 588                 /*
 589                  * Reentering boosted probe doesn't reset current_kprobe,
 590                  * nor set current_kprobe, because it doesn't use single
 591                  * stepping.
 592                  */
 593                 regs->ip = (unsigned long)p->ainsn.insn;
 594                 return;
 595         }
 596 #endif
 597         if (reenter) {
 598                 save_previous_kprobe(kcb);
 599                 set_current_kprobe(p, regs, kcb);
 600                 kcb->kprobe_status = KPROBE_REENTER;
 601         } else
 602                 kcb->kprobe_status = KPROBE_HIT_SS;
 603         /* Prepare real single stepping */
 604         clear_btf();
 605         regs->flags |= X86_EFLAGS_TF;
 606         regs->flags &= ~X86_EFLAGS_IF;
 607         /* single step inline if the instruction is an int3 */
 608         if (p->opcode == BREAKPOINT_INSTRUCTION)
 609                 regs->ip = (unsigned long)p->addr;
 610         else
 611                 regs->ip = (unsigned long)p->ainsn.insn;
 612 }
 613 NOKPROBE_SYMBOL(setup_singlestep);
 614 
 615 /*
 616  * We have reentered the kprobe_handler(), since another probe was hit while
 617  * within the handler. We save the original kprobes variables and just single
 618  * step on the instruction of the new probe without calling any user handlers.
 619  */
 620 static int reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
 621                           struct kprobe_ctlblk *kcb)
 622 {
 623         switch (kcb->kprobe_status) {
 624         case KPROBE_HIT_SSDONE:
 625         case KPROBE_HIT_ACTIVE:
 626         case KPROBE_HIT_SS:
 627                 kprobes_inc_nmissed_count(p);
 628                 setup_singlestep(p, regs, kcb, 1);
 629                 break;
 630         case KPROBE_REENTER:
 631                 /* A probe has been hit in the codepath leading up to, or just
 632                  * after, single-stepping of a probed instruction. This entire
 633                  * codepath should strictly reside in .kprobes.text section.
 634                  * Raise a BUG or we'll continue in an endless reentering loop
 635                  * and eventually a stack overflow.
 636                  */
 637                 pr_err("Unrecoverable kprobe detected.\n");
 638                 dump_kprobe(p);
 639                 BUG();
 640         default:
 641                 /* impossible cases */
 642                 WARN_ON(1);
 643                 return 0;
 644         }
 645 
 646         return 1;
 647 }
 648 NOKPROBE_SYMBOL(reenter_kprobe);
 649 
 650 /*
 651  * Interrupts are disabled on entry as trap3 is an interrupt gate and they
 652  * remain disabled throughout this function.
 653  */
 654 int kprobe_int3_handler(struct pt_regs *regs)
 655 {
 656         kprobe_opcode_t *addr;
 657         struct kprobe *p;
 658         struct kprobe_ctlblk *kcb;
 659 
 660         if (user_mode(regs))
 661                 return 0;
 662 
 663         addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
 664         /*
 665          * We don't want to be preempted for the entire duration of kprobe
 666          * processing. Since the int3 and debug traps disable irqs and we clear
 667          * IF while single-stepping, this code must not be preemptible.
 668          */
 669 
 670         kcb = get_kprobe_ctlblk();
 671         p = get_kprobe(addr);
 672 
 673         if (p) {
 674                 if (kprobe_running()) {
 675                         if (reenter_kprobe(p, regs, kcb))
 676                                 return 1;
 677                 } else {
 678                         set_current_kprobe(p, regs, kcb);
 679                         kcb->kprobe_status = KPROBE_HIT_ACTIVE;
 680 
 681                         /*
 682                          * If we have no pre-handler or it returned 0, we
 683                          * continue with normal processing.  If we have a
 684                          * pre-handler and it returned non-zero, that means
 685                          * the user handler set up registers to exit to another
 686                          * instruction, so we must skip the single stepping.
 687                          */
 688                         if (!p->pre_handler || !p->pre_handler(p, regs))
 689                                 setup_singlestep(p, regs, kcb, 0);
 690                         else
 691                                 reset_current_kprobe();
 692                         return 1;
 693                 }
 694         } else if (*addr != BREAKPOINT_INSTRUCTION) {
 695                 /*
 696                  * The breakpoint instruction was removed right
 697                  * after we hit it.  Another cpu has removed
 698                  * either a probepoint or a debugger breakpoint
 699                  * at this address.  In either case, no further
 700                  * handling of this interrupt is appropriate.
 701                  * Back up over the (now missing) int3 and run
 702                  * the original instruction.
 703                  */
 704                 regs->ip = (unsigned long)addr;
 705                 return 1;
 706         } /* else: not a kprobe fault; let the kernel handle it */
 707 
 708         return 0;
 709 }
 710 NOKPROBE_SYMBOL(kprobe_int3_handler);
 711 
 712 /*
 713  * When a retprobed function returns, this code saves registers and
 714  * calls trampoline_handler(), which in turn calls the kretprobe's handler.
 715  */
 716 asm(
 717         ".text\n"
 718         ".global kretprobe_trampoline\n"
 719         ".type kretprobe_trampoline, @function\n"
 720         "kretprobe_trampoline:\n"
 721         /* We don't bother saving the ss register */
 722 #ifdef CONFIG_X86_64
 723         "       pushq %rsp\n"
 724         "       pushfq\n"
 725         SAVE_REGS_STRING
 726         "       movq %rsp, %rdi\n"
 727         "       call trampoline_handler\n"
 728         /* Replace saved sp with true return address. */
 729         "       movq %rax, 19*8(%rsp)\n"
 730         RESTORE_REGS_STRING
 731         "       popfq\n"
 732 #else
 733         "       pushl %esp\n"
 734         "       pushfl\n"
 735         SAVE_REGS_STRING
 736         "       movl %esp, %eax\n"
 737         "       call trampoline_handler\n"
 738         /* Replace saved sp with true return address. */
 739         "       movl %eax, 15*4(%esp)\n"
 740         RESTORE_REGS_STRING
 741         "       popfl\n"
 742 #endif
 743         "       ret\n"
 744         ".size kretprobe_trampoline, .-kretprobe_trampoline\n"
 745 );
 746 NOKPROBE_SYMBOL(kretprobe_trampoline);
 747 STACK_FRAME_NON_STANDARD(kretprobe_trampoline);
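
/*
 * Note on the "movq %rax, 19*8(%rsp)" fixup above: after "pushq %rsp",
 * "pushfq" and SAVE_REGS_STRING, the stack mirrors struct pt_regs, whose
 * twentieth field (after r15..rdi, orig_ax, ip, cs and flags) is sp, at
 * offset 19*8.  That slot holds the value pushed by "pushq %rsp";
 * trampoline_handler() returns the real return address in %rax, and storing
 * it there is what makes the final "ret" jump back to the original caller.
 */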
 748 
 749 static struct kprobe kretprobe_kprobe = {
 750         .addr = (void *)kretprobe_trampoline,
 751 };
 752 
 753 /*
 754  * Called from kretprobe_trampoline
 755  */
 756 __used __visible void *trampoline_handler(struct pt_regs *regs)
 757 {
 758         struct kprobe_ctlblk *kcb;
 759         struct kretprobe_instance *ri = NULL;
 760         struct hlist_head *head, empty_rp;
 761         struct hlist_node *tmp;
 762         unsigned long flags, orig_ret_address = 0;
 763         unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
 764         kprobe_opcode_t *correct_ret_addr = NULL;
 765         void *frame_pointer;
 766         bool skipped = false;
 767 
 768         preempt_disable();
 769 
 770         /*
 771          * Set a dummy kprobe for avoiding kretprobe recursion.
 772          * Since a kretprobe never runs inside a kprobe handler, no kprobe
 773          * can be running at this point.
 774          */
 775         kcb = get_kprobe_ctlblk();
 776         __this_cpu_write(current_kprobe, &kretprobe_kprobe);
 777         kcb->kprobe_status = KPROBE_HIT_ACTIVE;
 778 
 779         INIT_HLIST_HEAD(&empty_rp);
 780         kretprobe_hash_lock(current, &head, &flags);
 781         /* fixup registers */
 782         regs->cs = __KERNEL_CS;
 783 #ifdef CONFIG_X86_32
 784         regs->cs |= get_kernel_rpl();
 785         regs->gs = 0;
 786 #endif
 787         /* We use pt_regs->sp for return address holder. */
 788         frame_pointer = &regs->sp;
 789         regs->ip = trampoline_address;
 790         regs->orig_ax = ~0UL;
 791 
 792         /*
 793          * It is possible to have multiple instances associated with a given
 794          * task either because multiple functions in the call path have
 795          * return probes installed on them, and/or more than one
 796          * return probe was registered for a target function.
 797          *
 798          * We can handle this because:
 799          *     - instances are always pushed into the head of the list
 800          *     - when multiple return probes are registered for the same
 801          *       function, the (chronologically) first instance's ret_addr
 802          *       will be the real return address, and all the rest will
 803          *       point to kretprobe_trampoline.
 804          */
 805         hlist_for_each_entry(ri, head, hlist) {
 806                 if (ri->task != current)
 807                         /* another task is sharing our hash bucket */
 808                         continue;
 809                 /*
 810                  * Return probes must be pushed onto this hash list in the
 811                  * correct order (same as the return order) so that they can
 812                  * be popped correctly. However, if we find one pushed in the
 813                  * wrong order, it means we found a function that should not
 814                  * be probed, because the out-of-order entry was pushed while
 815                  * another kretprobe was itself being processed.
 816                  */
 817                 if (ri->fp != frame_pointer) {
 818                         if (!skipped)
 819                                 pr_warn("kretprobe is stacked incorrectly. Trying to fixup.\n");
 820                         skipped = true;
 821                         continue;
 822                 }
 823 
 824                 orig_ret_address = (unsigned long)ri->ret_addr;
 825                 if (skipped)
 826                         pr_warn("%ps must be blacklisted because of incorrect kretprobe order\n",
 827                                 ri->rp->kp.addr);
 828 
 829                 if (orig_ret_address != trampoline_address)
 830                         /*
 831                          * This is the real return address. Any other
 832                          * instances associated with this task are for
 833                          * other calls deeper on the call stack
 834                          */
 835                         break;
 836         }
 837 
 838         kretprobe_assert(ri, orig_ret_address, trampoline_address);
 839 
 840         correct_ret_addr = ri->ret_addr;
 841         hlist_for_each_entry_safe(ri, tmp, head, hlist) {
 842                 if (ri->task != current)
 843                         /* another task is sharing our hash bucket */
 844                         continue;
 845                 if (ri->fp != frame_pointer)
 846                         continue;
 847 
 848                 orig_ret_address = (unsigned long)ri->ret_addr;
 849                 if (ri->rp && ri->rp->handler) {
 850                         __this_cpu_write(current_kprobe, &ri->rp->kp);
 851                         ri->ret_addr = correct_ret_addr;
 852                         ri->rp->handler(ri, regs);
 853                         __this_cpu_write(current_kprobe, &kretprobe_kprobe);
 854                 }
 855 
 856                 recycle_rp_inst(ri, &empty_rp);
 857 
 858                 if (orig_ret_address != trampoline_address)
 859                         /*
 860                          * This is the real return address. Any other
 861                          * instances associated with this task are for
 862                          * other calls deeper on the call stack
 863                          */
 864                         break;
 865         }
 866 
 867         kretprobe_hash_unlock(current, &flags);
 868 
 869         __this_cpu_write(current_kprobe, NULL);
 870         preempt_enable();
 871 
 872         hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
 873                 hlist_del(&ri->hlist);
 874                 kfree(ri);
 875         }
 876         return (void *)orig_ret_address;
 877 }
 878 NOKPROBE_SYMBOL(trampoline_handler);
 879 
 880 /*
 881  * Called after single-stepping.  p->addr is the address of the
 882  * instruction whose first byte has been replaced by the "int 3"
 883  * instruction.  To avoid the SMP problems that can occur when we
 884  * temporarily put back the original opcode to single-step, we
 885  * single-stepped a copy of the instruction.  The address of this
 886  * copy is p->ainsn.insn.
 887  *
 888  * This function prepares to return from the post-single-step
 889  * interrupt.  We have to fix up the stack as follows:
 890  *
 891  * 0) Except in the case of absolute or indirect jump or call instructions,
 892  * the new ip is relative to the copied instruction.  We need to make
 893  * it relative to the original instruction.
 894  *
 895  * 1) If the single-stepped instruction was pushfl, then the TF and IF
 896  * flags are set in the just-pushed flags, and may need to be cleared.
 897  *
 898  * 2) If the single-stepped instruction was a call, the return address
 899  * that is atop the stack is the address following the copied instruction.
 900  * We need to make it the address following the original instruction.
 901  *
 902  * If this is the first time we've single-stepped the instruction at
 903  * this probepoint, and the instruction is boostable, boost it: add a
 904  * jump instruction after the copied instruction, that jumps to the next
 905  * instruction after the probepoint.
 906  */
 907 static void resume_execution(struct kprobe *p, struct pt_regs *regs,
 908                              struct kprobe_ctlblk *kcb)
 909 {
 910         unsigned long *tos = stack_addr(regs);
 911         unsigned long copy_ip = (unsigned long)p->ainsn.insn;
 912         unsigned long orig_ip = (unsigned long)p->addr;
 913         kprobe_opcode_t *insn = p->ainsn.insn;
 914 
 915         /* Skip prefixes */
 916         insn = skip_prefixes(insn);
 917 
 918         regs->flags &= ~X86_EFLAGS_TF;
 919         switch (*insn) {
 920         case 0x9c:      /* pushfl */
 921                 *tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF);
 922                 *tos |= kcb->kprobe_old_flags;
 923                 break;
 924         case 0xc2:      /* iret/ret/lret */
 925         case 0xc3:
 926         case 0xca:
 927         case 0xcb:
 928         case 0xcf:
 929         case 0xea:      /* jmp absolute -- ip is correct */
 930                 /* ip is already adjusted, no more changes required */
 931                 p->ainsn.boostable = true;
 932                 goto no_change;
 933         case 0xe8:      /* call relative - Fix return addr */
 934                 *tos = orig_ip + (*tos - copy_ip);
 935                 break;
 936 #ifdef CONFIG_X86_32
 937         case 0x9a:      /* call absolute -- same as call absolute, indirect */
 938                 *tos = orig_ip + (*tos - copy_ip);
 939                 goto no_change;
 940 #endif
 941         case 0xff:
 942                 if ((insn[1] & 0x30) == 0x10) {
 943                         /*
 944                          * call absolute, indirect
 945                          * Fix return addr; ip is correct.
 946                          * But this is not boostable
 947                          */
 948                         *tos = orig_ip + (*tos - copy_ip);
 949                         goto no_change;
 950                 } else if (((insn[1] & 0x31) == 0x20) ||
 951                            ((insn[1] & 0x31) == 0x21)) {
 952                         /*
 953                          * jmp near and far, absolute indirect
 954                          * ip is correct. And this is boostable
 955                          */
 956                         p->ainsn.boostable = true;
 957                         goto no_change;
 958                 }
 959         default:
 960                 break;
 961         }
 962 
 963         regs->ip += orig_ip - copy_ip;
 964 
 965 no_change:
 966         restore_btf();
 967 }
 968 NOKPROBE_SYMBOL(resume_execution);
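
/*
 * Worked example of the fixups above, with hypothetical addresses: suppose a
 * 5-byte relative call probed at orig_ip = 0xffffffff81000000 was
 * single-stepped from its copy at copy_ip = 0xffffffffa0002000.  The trap
 * leaves the pushed return address at *tos = copy_ip + 5 and regs->ip at the
 * call target as computed from the copy.  The 0xe8 case rewrites
 * *tos = orig_ip + (*tos - copy_ip) = orig_ip + 5, and the final
 * "regs->ip += orig_ip - copy_ip" shifts the resume point by the same delta,
 * so execution continues at the target the original call intended.
 */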
 969 
 970 /*
 971  * Interrupts are disabled on entry as trap1 is an interrupt gate and they
 972  * remain disabled throughout this function.
 973  */
 974 int kprobe_debug_handler(struct pt_regs *regs)
 975 {
 976         struct kprobe *cur = kprobe_running();
 977         struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
 978 
 979         if (!cur)
 980                 return 0;
 981 
 982         resume_execution(cur, regs, kcb);
 983         regs->flags |= kcb->kprobe_saved_flags;
 984 
 985         if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
 986                 kcb->kprobe_status = KPROBE_HIT_SSDONE;
 987                 cur->post_handler(cur, regs, 0);
 988         }
 989 
 990         /* Restore back the original saved kprobes variables and continue. */
 991         if (kcb->kprobe_status == KPROBE_REENTER) {
 992                 restore_previous_kprobe(kcb);
 993                 goto out;
 994         }
 995         reset_current_kprobe();
 996 out:
 997         /*
 998          * if somebody else is singlestepping across a probe point, flags
 999          * will have TF set, in which case, continue the remaining processing
1000          * of do_debug, as if this is not a probe hit.
1001          */
1002         if (regs->flags & X86_EFLAGS_TF)
1003                 return 0;
1004 
1005         return 1;
1006 }
1007 NOKPROBE_SYMBOL(kprobe_debug_handler);
1008 
1009 int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
1010 {
1011         struct kprobe *cur = kprobe_running();
1012         struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1013 
1014         if (unlikely(regs->ip == (unsigned long)cur->ainsn.insn)) {
1015                 /* This must happen on single-stepping */
1016                 WARN_ON(kcb->kprobe_status != KPROBE_HIT_SS &&
1017                         kcb->kprobe_status != KPROBE_REENTER);
1018                 /*
1019                  * We are here because the instruction being single-
1020                  * stepped caused a page fault. We reset the current
1021                  * kprobe, point the ip back at the probe address,
1022                  * and allow the page fault handler to continue as a
1023                  * normal page fault.
1024                  */
1025                 regs->ip = (unsigned long)cur->addr;
1026                 /*
1027                  * Trap flag (TF) has been set here because this fault
1028                  * happened where the single stepping will be done.
1029                  * So clear it by resetting the current kprobe:
1030                  */
1031                 regs->flags &= ~X86_EFLAGS_TF;
1032 
1033                 /*
1034                  * If the TF flag was set before the kprobe hit,
1035                  * don't touch it:
1036                  */
1037                 regs->flags |= kcb->kprobe_old_flags;
1038 
1039                 if (kcb->kprobe_status == KPROBE_REENTER)
1040                         restore_previous_kprobe(kcb);
1041                 else
1042                         reset_current_kprobe();
1043         } else if (kcb->kprobe_status == KPROBE_HIT_ACTIVE ||
1044                    kcb->kprobe_status == KPROBE_HIT_SSDONE) {
1045                 /*
1046                  * We increment the nmissed count for accounting;
1047                  * we could also use the npre/npostfault counts to account
1048                  * for these specific fault cases.
1049                  */
1050                 kprobes_inc_nmissed_count(cur);
1051 
1052                 /*
1053                  * We come here because instructions in the pre/post
1054                  * handler caused the page fault; this could happen
1055                  * if the handler tries to access user space via
1056                  * copy_from_user(), get_user() etc. Let the
1057                  * user-specified handler try to fix it first.
1058                  */
1059                 if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
1060                         return 1;
1061         }
1062 
1063         return 0;
1064 }
1065 NOKPROBE_SYMBOL(kprobe_fault_handler);
1066 
1067 int __init arch_populate_kprobe_blacklist(void)
1068 {
1069         int ret;
1070 
1071         ret = kprobe_add_area_blacklist((unsigned long)__irqentry_text_start,
1072                                          (unsigned long)__irqentry_text_end);
1073         if (ret)
1074                 return ret;
1075 
1076         return kprobe_add_area_blacklist((unsigned long)__entry_text_start,
1077                                          (unsigned long)__entry_text_end);
1078 }
1079 
1080 int __init arch_init_kprobes(void)
1081 {
1082         return 0;
1083 }
1084 
1085 int arch_trampoline_kprobe(struct kprobe *p)
1086 {
1087         return 0;
1088 }
