root/tools/testing/selftests/x86/protection_keys.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. cat_into_file
  2. tracing_root_ok
  3. tracing_on
  4. tracing_off
  5. abort_hooks
  6. __page_o_noops
  7. lots_o_noops_around_write
  8. dump_mem
  9. si_code_str
  10. signal_handler
  11. wait_all_children
  12. sig_chld
  13. setup_sigsegv_handler
  14. setup_handlers
  15. fork_lazy_child
  16. hw_pkey_get
  17. hw_pkey_set
  18. pkey_disable_set
  19. pkey_disable_clear
  20. pkey_write_allow
  21. pkey_write_deny
  22. pkey_access_allow
  23. pkey_access_deny
  24. sys_mprotect_pkey
  25. sys_pkey_alloc
  26. alloc_pkey
  27. sys_pkey_free
  28. alloc_random_pkey
  29. mprotect_pkey
  30. record_pkey_malloc
  31. free_pkey_malloc
  32. malloc_pkey_with_mprotect
  33. malloc_pkey_anon_huge
  34. setup_hugetlbfs
  35. malloc_pkey_hugetlb
  36. malloc_pkey_mmap_dax
  37. malloc_pkey
  38. expected_pk_fault
  39. __save_test_fd
  40. get_test_read_fd
  41. close_test_fds
  42. read_ptr
  43. test_read_of_write_disabled_region
  44. test_read_of_access_disabled_region
  45. test_write_of_write_disabled_region
  46. test_write_of_access_disabled_region
  47. test_kernel_write_of_access_disabled_region
  48. test_kernel_write_of_write_disabled_region
  49. test_kernel_gup_of_access_disabled_region
  50. test_kernel_gup_write_to_write_disabled_region
  51. test_pkey_syscalls_on_non_allocated_pkey
  52. test_pkey_syscalls_bad_args
  53. become_child
  54. test_pkey_alloc_exhaust
  55. test_mprotect_with_pkey_0
  56. test_ptrace_of_child
  57. get_pointer_to_instructions
  58. test_executing_on_unreadable_memory
  59. test_implicit_mprotect_exec_only_memory
  60. test_mprotect_pkey_on_unsupported_cpu
  61. run_tests_once
  62. pkey_setup_shadow
  63. main

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Tests x86 Memory Protection Keys (see Documentation/core-api/protection-keys.rst)
   4  *
   5  * There are examples in here of:
   6  *  * how to set protection keys on memory
   7  *  * how to set/clear bits in PKRU (the rights register)
   8  *  * how to handle SEGV_PKRU signals and extract pkey-relevant
   9  *    information from the siginfo
  10  *
  11  * Things to add:
  12  *      make sure KSM and KSM COW breaking works
  13  *      prefault pages in at malloc, or not
  14  *      protect MPX bounds tables with protection keys?
  15  *      make sure VMA splitting/merging is working correctly
  16  *      OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys
  17  *      look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel
  18  *      do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks
  19  *
  20  * Compile like this:
  21  *      gcc      -o protection_keys    -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
  22  *      gcc -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm
  23  */
  24 #define _GNU_SOURCE
  25 #include <errno.h>
  26 #include <linux/futex.h>
  27 #include <sys/time.h>
  28 #include <sys/syscall.h>
  29 #include <string.h>
  30 #include <stdio.h>
  31 #include <stdint.h>
  32 #include <stdbool.h>
  33 #include <signal.h>
  34 #include <assert.h>
  35 #include <stdlib.h>
  36 #include <ucontext.h>
  37 #include <sys/mman.h>
  38 #include <sys/types.h>
  39 #include <sys/wait.h>
  40 #include <sys/stat.h>
  41 #include <fcntl.h>
  42 #include <unistd.h>
  43 #include <sys/ptrace.h>
  44 #include <setjmp.h>
  45 
  46 #include "pkey-helpers.h"
  47 
  48 int iteration_nr = 1;
  49 int test_nr;
  50 
  51 unsigned int shadow_pkru;
  52 
  53 #define HPAGE_SIZE      (1UL<<21)
  54 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
  55 #define ALIGN_UP(x, align_to)   (((x) + ((align_to)-1)) & ~((align_to)-1))
  56 #define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1))
  57 #define ALIGN_PTR_UP(p, ptr_align_to)   ((typeof(p))ALIGN_UP((unsigned long)(p),        ptr_align_to))
  58 #define ALIGN_PTR_DOWN(p, ptr_align_to) ((typeof(p))ALIGN_DOWN((unsigned long)(p),      ptr_align_to))
  59 #define __stringify_1(x...)     #x
  60 #define __stringify(x...)       __stringify_1(x)
  61 
  62 #define PTR_ERR_ENOTSUP ((void *)-ENOTSUP)
  63 
  64 int dprint_in_signal;
  65 char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
  66 
  67 extern void abort_hooks(void);
  68 #define pkey_assert(condition) do {             \
  69         if (!(condition)) {                     \
  70                 dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \
  71                                 __FILE__, __LINE__,     \
  72                                 test_nr, iteration_nr); \
  73                 dprintf0("errno at assert: %d", errno); \
  74                 abort_hooks();                  \
  75                 exit(__LINE__);                 \
  76         }                                       \
  77 } while (0)
  78 
  79 void cat_into_file(char *str, char *file)
  80 {
  81         int fd = open(file, O_RDWR);
  82         int ret;
  83 
  84         dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file);
  85         /*
  86          * these need to be raw because they are called under
  87          * pkey_assert()
  88          */
  89         if (fd < 0) {
  90                 fprintf(stderr, "error opening '%s'\n", str);
  91                 perror("error: ");
  92                 exit(__LINE__);
  93         }
  94 
  95         ret = write(fd, str, strlen(str));
  96         if (ret != strlen(str)) {
  97                 perror("write to file failed");
  98                 fprintf(stderr, "filename: '%s' str: '%s'\n", file, str);
  99                 exit(__LINE__);
 100         }
 101         close(fd);
 102 }
 103 
 104 #if CONTROL_TRACING > 0
 105 static int warned_tracing;
 106 int tracing_root_ok(void)
 107 {
 108         if (geteuid() != 0) {
 109                 if (!warned_tracing)
 110                         fprintf(stderr, "WARNING: not run as root, "
 111                                         "can not do tracing control\n");
 112                 warned_tracing = 1;
 113                 return 0;
 114         }
 115         return 1;
 116 }
 117 #endif
 118 
 119 void tracing_on(void)
 120 {
 121 #if CONTROL_TRACING > 0
 122 #define TRACEDIR "/sys/kernel/debug/tracing"
 123         char pidstr[32];
 124 
 125         if (!tracing_root_ok())
 126                 return;
 127 
 128         sprintf(pidstr, "%d", getpid());
 129         cat_into_file("0", TRACEDIR "/tracing_on");
 130         cat_into_file("\n", TRACEDIR "/trace");
 131         if (1) {
 132                 cat_into_file("function_graph", TRACEDIR "/current_tracer");
 133                 cat_into_file("1", TRACEDIR "/options/funcgraph-proc");
 134         } else {
 135                 cat_into_file("nop", TRACEDIR "/current_tracer");
 136         }
 137         cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid");
 138         cat_into_file("1", TRACEDIR "/tracing_on");
 139         dprintf1("enabled tracing\n");
 140 #endif
 141 }
 142 
 143 void tracing_off(void)
 144 {
 145 #if CONTROL_TRACING > 0
 146         if (!tracing_root_ok())
 147                 return;
 148         cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on");
 149 #endif
 150 }
 151 
 152 void abort_hooks(void)
 153 {
 154         fprintf(stderr, "running %s()...\n", __func__);
 155         tracing_off();
 156 #ifdef SLEEP_ON_ABORT
 157         sleep(SLEEP_ON_ABORT);
 158 #endif
 159 }
 160 
 161 static inline void __page_o_noops(void)
 162 {
 163         /*
              * Emit 512 copies of a long-form NOP.  The intent is that
              * 512 instructions of about 8 bytes each add up to roughly
              * one page of padding.
              * NOTE(review): the exact encoded size of this nopl depends
              * on the build mode (32- vs 64-bit) - confirm before relying
              * on the padding being a full page.
              */
 164         asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr");
 165 }
 166 
 167 /*
 168  * This attempts to have roughly a page of instructions followed by a few
 169  * instructions that do a write, and another page of instructions.  That
 170  * way, we are pretty sure that the write is in the second page of
 171  * instructions and has at least a page of padding behind it.
 172  *
 173  * *That* lets us be sure to madvise() away the write instruction, which
 174  * will then fault, which makes sure that the fault code handles
 175  * execute-only memory properly.
 176  */
 177 __attribute__((__aligned__(PAGE_SIZE)))
 178 void lots_o_noops_around_write(int *write_to_me)
 179 {
              /*
               * Performs a single store through 'write_to_me', with one
               * __page_o_noops() worth of padding before it and one
               * after it.  The function itself is aligned to PAGE_SIZE.
               */
 180         dprintf3("running %s()\n", __func__);
 181         __page_o_noops();
 182         /*
              * Assume this store lands in the second page of
              * instructions.  The stored value is simply this source
              * line number.
              * NOTE(review): presumably callers only care that a store
              * happens, not which value - confirm against callers.
              */
 183         *write_to_me = __LINE__;
 184         /* pad out by another page so the store is followed by padding: */
 185         __page_o_noops();
 186         dprintf3("%s() done\n", __func__);
 187 }
 188 
 189 /* Define some kernel-like types */
 190 #define  u8 uint8_t
 191 #define u16 uint16_t
 192 #define u32 uint32_t
 193 #define u64 uint64_t
 194 
 195 #ifdef __i386__
 196 
 197 #ifndef SYS_mprotect_key
 198 # define SYS_mprotect_key       380
 199 #endif
 200 
 201 #ifndef SYS_pkey_alloc
 202 # define SYS_pkey_alloc         381
 203 # define SYS_pkey_free          382
 204 #endif
 205 
 206 #define REG_IP_IDX              REG_EIP
 207 #define si_pkey_offset          0x14
 208 
 209 #else
 210 
 211 #ifndef SYS_mprotect_key
 212 # define SYS_mprotect_key       329
 213 #endif
 214 
 215 #ifndef SYS_pkey_alloc
 216 # define SYS_pkey_alloc         330
 217 # define SYS_pkey_free          331
 218 #endif
 219 
 220 #define REG_IP_IDX              REG_RIP
 221 #define si_pkey_offset          0x20
 222 
 223 #endif
 224 
 225 void dump_mem(void *dumpme, int len_bytes)
 226 {
 227         char *c = (void *)dumpme;
 228         int i;
 229 
 230         for (i = 0; i < len_bytes; i += sizeof(u64)) {
 231                 u64 *ptr = (u64 *)(c + i);
 232                 dprintf1("dump[%03d][@%p]: %016jx\n", i, ptr, *ptr);
 233         }
 234 }
 235 
 236 /* Failed address bound checks: */
 237 #ifndef SEGV_BNDERR
 238 # define SEGV_BNDERR            3
 239 #endif
 240 
 241 #ifndef SEGV_PKUERR
 242 # define SEGV_PKUERR            4
 243 #endif
 244 
 245 static char *si_code_str(int si_code)
 246 {
 247         if (si_code == SEGV_MAPERR)
 248                 return "SEGV_MAPERR";
 249         if (si_code == SEGV_ACCERR)
 250                 return "SEGV_ACCERR";
 251         if (si_code == SEGV_BNDERR)
 252                 return "SEGV_BNDERR";
 253         if (si_code == SEGV_PKUERR)
 254                 return "SEGV_PKUERR";
 255         return "UNKNOWN";
 256 }
 257 
 258 int pkru_faults;
 259 int last_si_pkey = -1;
 260 void signal_handler(int signum, siginfo_t *si, void *vucontext)
 261 {
 262         ucontext_t *uctxt = vucontext;
 263         int trapno;
 264         unsigned long ip;
 265         char *fpregs;
 266         u32 *pkru_ptr;
 267         u64 siginfo_pkey;
 268         u32 *si_pkey_ptr;
 269         int pkru_offset;
 270         fpregset_t fpregset;
 271 
 272         dprint_in_signal = 1;
 273         dprintf1(">>>>===============SIGSEGV============================\n");
 274         dprintf1("%s()::%d, pkru: 0x%x shadow: %x\n", __func__, __LINE__,
 275                         __rdpkru(), shadow_pkru);
 276 
 277         trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO];
 278         ip = uctxt->uc_mcontext.gregs[REG_IP_IDX];
 279         fpregset = uctxt->uc_mcontext.fpregs;
 280         fpregs = (void *)fpregset;
 281 
 282         dprintf2("%s() trapno: %d ip: 0x%lx info->si_code: %s/%d\n", __func__,
 283                         trapno, ip, si_code_str(si->si_code), si->si_code);
 284 #ifdef __i386__
 285         /*
 286          * 32-bit has some extra padding so that userspace can tell whether
 287          * the XSTATE header is present in addition to the "legacy" FPU
 288          * state.  We just assume that it is here.
 289          */
 290         fpregs += 0x70;
 291 #endif
 292         pkru_offset = pkru_xstate_offset();
 293         pkru_ptr = (void *)(&fpregs[pkru_offset]);
 294 
 295         dprintf1("siginfo: %p\n", si);
 296         dprintf1(" fpregs: %p\n", fpregs);
 297         /*
 298          * If we got a PKRU fault, we *HAVE* to have at least one bit set in
 299          * here.
 300          */
 301         dprintf1("pkru_xstate_offset: %d\n", pkru_xstate_offset());
 302         if (DEBUG_LEVEL > 4)
 303                 dump_mem(pkru_ptr - 128, 256);
 304         pkey_assert(*pkru_ptr);
 305 
 306         if ((si->si_code == SEGV_MAPERR) ||
 307             (si->si_code == SEGV_ACCERR) ||
 308             (si->si_code == SEGV_BNDERR)) {
 309                 printf("non-PK si_code, exiting...\n");
 310                 exit(4);
 311         }
 312 
 313         si_pkey_ptr = (u32 *)(((u8 *)si) + si_pkey_offset);
 314         dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr);
 315         dump_mem((u8 *)si_pkey_ptr - 8, 24);
 316         siginfo_pkey = *si_pkey_ptr;
 317         pkey_assert(siginfo_pkey < NR_PKEYS);
 318         last_si_pkey = siginfo_pkey;
 319 
 320         dprintf1("signal pkru from xsave: %08x\n", *pkru_ptr);
 321         /* need __rdpkru() version so we do not do shadow_pkru checking */
 322         dprintf1("signal pkru from  pkru: %08x\n", __rdpkru());
 323         dprintf1("pkey from siginfo: %jx\n", siginfo_pkey);
 324         *(u64 *)pkru_ptr = 0x00000000;
 325         dprintf1("WARNING: set PRKU=0 to allow faulting instruction to continue\n");
 326         pkru_faults++;
 327         dprintf1("<<<<==================================================\n");
 328         dprint_in_signal = 0;
 329 }
 330 
 331 int wait_all_children(void)
 332 {
 333         int status;
 334         return waitpid(-1, &status, 0);
 335 }
 336 
 337 void sig_chld(int x)
 338 {
 339         dprint_in_signal = 1;
 340         dprintf2("[%d] SIGCHLD: %d\n", getpid(), x);
 341         dprint_in_signal = 0;
 342 }
 343 
 344 void setup_sigsegv_handler(void)
 345 {
 346         int r, rs;
 347         struct sigaction newact;
 348         struct sigaction oldact;
 349 
 350         /* #PF is mapped to sigsegv */
 351         int signum  = SIGSEGV;
 352 
 353         newact.sa_handler = 0;
 354         newact.sa_sigaction = signal_handler;
 355 
 356         /*sigset_t - signals to block while in the handler */
 357         /* get the old signal mask. */
 358         rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask);
 359         pkey_assert(rs == 0);
 360 
 361         /* call sa_sigaction, not sa_handler*/
 362         newact.sa_flags = SA_SIGINFO;
 363 
 364         newact.sa_restorer = 0;  /* void(*)(), obsolete */
 365         r = sigaction(signum, &newact, &oldact);
 366         r = sigaction(SIGALRM, &newact, &oldact);
 367         pkey_assert(r == 0);
 368 }
 369 
 370 void setup_handlers(void)
 371 {
 372         signal(SIGCHLD, &sig_chld);
 373         setup_sigsegv_handler();
 374 }
 375 
 376 pid_t fork_lazy_child(void)
 377 {
 378         pid_t forkret;
 379 
 380         forkret = fork();
 381         pkey_assert(forkret >= 0);
 382         dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);
 383 
 384         if (!forkret) {
 385                 /* in the child */
 386                 while (1) {
 387                         dprintf1("child sleeping...\n");
 388                         sleep(30);
 389                 }
 390         }
 391         return forkret;
 392 }
 393 
 394 #ifndef PKEY_DISABLE_ACCESS
 395 # define PKEY_DISABLE_ACCESS    0x1
 396 #endif
 397 
 398 #ifndef PKEY_DISABLE_WRITE
 399 # define PKEY_DISABLE_WRITE     0x2
 400 #endif
 401 
 402 static u32 hw_pkey_get(int pkey, unsigned long flags)
 403 {
 404         u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
 405         u32 pkru = __rdpkru();
 406         u32 shifted_pkru;
 407         u32 masked_pkru;
 408 
 409         dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n",
 410                         __func__, pkey, flags, 0, 0);
 411         dprintf2("%s() raw pkru: %x\n", __func__, pkru);
 412 
 413         shifted_pkru = (pkru >> (pkey * PKRU_BITS_PER_PKEY));
 414         dprintf2("%s() shifted_pkru: %x\n", __func__, shifted_pkru);
 415         masked_pkru = shifted_pkru & mask;
 416         dprintf2("%s() masked  pkru: %x\n", __func__, masked_pkru);
 417         /*
 418          * shift down the relevant bits to the lowest two, then
 419          * mask off all the other high bits.
 420          */
 421         return masked_pkru;
 422 }
 423 
 424 static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags)
 425 {
 426         u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
 427         u32 old_pkru = __rdpkru();
 428         u32 new_pkru;
 429 
 430         /* make sure that 'rights' only contains the bits we expect: */
 431         assert(!(rights & ~mask));
 432 
 433         /* copy old pkru */
 434         new_pkru = old_pkru;
 435         /* mask out bits from pkey in old value: */
 436         new_pkru &= ~(mask << (pkey * PKRU_BITS_PER_PKEY));
 437         /* OR in new bits for pkey: */
 438         new_pkru |= (rights << (pkey * PKRU_BITS_PER_PKEY));
 439 
 440         __wrpkru(new_pkru);
 441 
 442         dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x pkru now: %x old_pkru: %x\n",
 443                         __func__, pkey, rights, flags, 0, __rdpkru(), old_pkru);
 444         return 0;
 445 }
 446 
 447 void pkey_disable_set(int pkey, int flags)
 448 {
 449         unsigned long syscall_flags = 0;
 450         int ret;
 451         int pkey_rights;
 452         u32 orig_pkru = rdpkru();
 453 
 454         dprintf1("START->%s(%d, 0x%x)\n", __func__,
 455                 pkey, flags);
 456         pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
 457 
 458         pkey_rights = hw_pkey_get(pkey, syscall_flags);
 459 
 460         dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
 461                         pkey, pkey, pkey_rights);
 462         pkey_assert(pkey_rights >= 0);
 463 
 464         pkey_rights |= flags;
 465 
 466         ret = hw_pkey_set(pkey, pkey_rights, syscall_flags);
 467         assert(!ret);
 468         /*pkru and flags have the same format */
 469         shadow_pkru |= flags << (pkey * 2);
 470         dprintf1("%s(%d) shadow: 0x%x\n", __func__, pkey, shadow_pkru);
 471 
 472         pkey_assert(ret >= 0);
 473 
 474         pkey_rights = hw_pkey_get(pkey, syscall_flags);
 475         dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
 476                         pkey, pkey, pkey_rights);
 477 
 478         dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru());
 479         if (flags)
 480                 pkey_assert(rdpkru() > orig_pkru);
 481         dprintf1("END<---%s(%d, 0x%x)\n", __func__,
 482                 pkey, flags);
 483 }
 484 
 485 void pkey_disable_clear(int pkey, int flags)
 486 {
 487         unsigned long syscall_flags = 0;
 488         int ret;
 489         int pkey_rights = hw_pkey_get(pkey, syscall_flags);
 490         u32 orig_pkru = rdpkru();
 491 
 492         pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
 493 
 494         dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
 495                         pkey, pkey, pkey_rights);
 496         pkey_assert(pkey_rights >= 0);
 497 
 498         pkey_rights |= flags;
 499 
 500         ret = hw_pkey_set(pkey, pkey_rights, 0);
 501         /* pkru and flags have the same format */
 502         shadow_pkru &= ~(flags << (pkey * 2));
 503         pkey_assert(ret >= 0);
 504 
 505         pkey_rights = hw_pkey_get(pkey, syscall_flags);
 506         dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__,
 507                         pkey, pkey, pkey_rights);
 508 
 509         dprintf1("%s(%d) pkru: 0x%x\n", __func__, pkey, rdpkru());
 510         if (flags)
 511                 assert(rdpkru() > orig_pkru);
 512 }
 513 
 514 void pkey_write_allow(int pkey)
 515 {
 516         pkey_disable_clear(pkey, PKEY_DISABLE_WRITE);
 517 }
 518 void pkey_write_deny(int pkey)
 519 {
 520         pkey_disable_set(pkey, PKEY_DISABLE_WRITE);
 521 }
 522 void pkey_access_allow(int pkey)
 523 {
 524         pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS);
 525 }
 526 void pkey_access_deny(int pkey)
 527 {
 528         pkey_disable_set(pkey, PKEY_DISABLE_ACCESS);
 529 }
 530 
 531 int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
 532                 unsigned long pkey)
 533 {
 534         int sret;
 535 
 536         dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__,
 537                         ptr, size, orig_prot, pkey);
 538 
 539         errno = 0;
 540         sret = syscall(SYS_mprotect_key, ptr, size, orig_prot, pkey);
 541         if (errno) {
 542                 dprintf2("SYS_mprotect_key sret: %d\n", sret);
 543                 dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot);
 544                 dprintf2("SYS_mprotect_key failed, errno: %d\n", errno);
 545                 if (DEBUG_LEVEL >= 2)
 546                         perror("SYS_mprotect_pkey");
 547         }
 548         return sret;
 549 }
 550 
 551 int sys_pkey_alloc(unsigned long flags, unsigned long init_val)
 552 {
 553         int ret = syscall(SYS_pkey_alloc, flags, init_val);
 554         dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n",
 555                         __func__, flags, init_val, ret, errno);
 556         return ret;
 557 }
 558 
 559 int alloc_pkey(void)
 560 {
 561         int ret;
 562         unsigned long init_val = 0x0;
 563 
 564         dprintf1("alloc_pkey()::%d, pkru: 0x%x shadow: %x\n",
 565                         __LINE__, __rdpkru(), shadow_pkru);
 566         ret = sys_pkey_alloc(0, init_val);
 567         /*
 568          * pkey_alloc() sets PKRU, so we need to reflect it in
 569          * shadow_pkru:
 570          */
 571         dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
 572                         __LINE__, ret, __rdpkru(), shadow_pkru);
 573         if (ret) {
 574                 /* clear both the bits: */
 575                 shadow_pkru &= ~(0x3      << (ret * 2));
 576                 dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
 577                                 __LINE__, ret, __rdpkru(), shadow_pkru);
 578                 /*
 579                  * move the new state in from init_val
 580                  * (remember, we cheated and init_val == pkru format)
 581                  */
 582                 shadow_pkru |=  (init_val << (ret * 2));
 583         }
 584         dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
 585                         __LINE__, ret, __rdpkru(), shadow_pkru);
 586         dprintf1("alloc_pkey()::%d errno: %d\n", __LINE__, errno);
 587         /* for shadow checking: */
 588         rdpkru();
 589         dprintf4("alloc_pkey()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n",
 590                         __LINE__, ret, __rdpkru(), shadow_pkru);
 591         return ret;
 592 }
 593 
 594 int sys_pkey_free(unsigned long pkey)
 595 {
 596         int ret = syscall(SYS_pkey_free, pkey);
 597         dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret);
 598         return ret;
 599 }
 600 
 601 /*
 602  * I had a bug where pkey bits could be set by mprotect() but
 603  * not cleared.  This ensures we get lots of random bit sets
 604  * and clears on the vma and pte pkey bits.
 605  */
 606 int alloc_random_pkey(void)
 607 {
 608         int max_nr_pkey_allocs;
 609         int ret;
 610         int i;
 611         int alloced_pkeys[NR_PKEYS];
 612         int nr_alloced = 0;
 613         int random_index;
 614         memset(alloced_pkeys, 0, sizeof(alloced_pkeys));
 615 
 616         /* allocate every possible key and make a note of which ones we got */
 617         max_nr_pkey_allocs = NR_PKEYS;
 618         max_nr_pkey_allocs = 1;
 619         for (i = 0; i < max_nr_pkey_allocs; i++) {
 620                 int new_pkey = alloc_pkey();
 621                 if (new_pkey < 0)
 622                         break;
 623                 alloced_pkeys[nr_alloced++] = new_pkey;
 624         }
 625 
 626         pkey_assert(nr_alloced > 0);
 627         /* select a random one out of the allocated ones */
 628         random_index = rand() % nr_alloced;
 629         ret = alloced_pkeys[random_index];
 630         /* now zero it out so we don't free it next */
 631         alloced_pkeys[random_index] = 0;
 632 
 633         /* go through the allocated ones that we did not want and free them */
 634         for (i = 0; i < nr_alloced; i++) {
 635                 int free_ret;
 636                 if (!alloced_pkeys[i])
 637                         continue;
 638                 free_ret = sys_pkey_free(alloced_pkeys[i]);
 639                 pkey_assert(!free_ret);
 640         }
 641         dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
 642                         __LINE__, ret, __rdpkru(), shadow_pkru);
 643         return ret;
 644 }
 645 
 646 int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
 647                 unsigned long pkey)
 648 {
 649         int nr_iterations = random() % 100;
 650         int ret;
 651 
 652         while (0) {
 653                 int rpkey = alloc_random_pkey();
 654                 ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
 655                 dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
 656                                 ptr, size, orig_prot, pkey, ret);
 657                 if (nr_iterations-- < 0)
 658                         break;
 659 
 660                 dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
 661                         __LINE__, ret, __rdpkru(), shadow_pkru);
 662                 sys_pkey_free(rpkey);
 663                 dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
 664                         __LINE__, ret, __rdpkru(), shadow_pkru);
 665         }
 666         pkey_assert(pkey < NR_PKEYS);
 667 
 668         ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey);
 669         dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n",
 670                         ptr, size, orig_prot, pkey, ret);
 671         pkey_assert(!ret);
 672         dprintf1("%s()::%d, ret: %d pkru: 0x%x shadow: 0x%x\n", __func__,
 673                         __LINE__, ret, __rdpkru(), shadow_pkru);
 674         return ret;
 675 }
 676 
 677 struct pkey_malloc_record {
 678         void *ptr;
 679         long size;
 680         int prot;
 681 };
 682 struct pkey_malloc_record *pkey_malloc_records;
 683 struct pkey_malloc_record *pkey_last_malloc_record;
 684 long nr_pkey_malloc_records;
 685 void record_pkey_malloc(void *ptr, long size, int prot)
 686 {
 687         long i;
 688         struct pkey_malloc_record *rec = NULL;
 689 
 690         for (i = 0; i < nr_pkey_malloc_records; i++) {
 691                 rec = &pkey_malloc_records[i];
 692                 /* find a free record */
 693                 if (rec)
 694                         break;
 695         }
 696         if (!rec) {
 697                 /* every record is full */
 698                 size_t old_nr_records = nr_pkey_malloc_records;
 699                 size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1);
 700                 size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record);
 701                 dprintf2("new_nr_records: %zd\n", new_nr_records);
 702                 dprintf2("new_size: %zd\n", new_size);
 703                 pkey_malloc_records = realloc(pkey_malloc_records, new_size);
 704                 pkey_assert(pkey_malloc_records != NULL);
 705                 rec = &pkey_malloc_records[nr_pkey_malloc_records];
 706                 /*
 707                  * realloc() does not initialize memory, so zero it from
 708                  * the first new record all the way to the end.
 709                  */
 710                 for (i = 0; i < new_nr_records - old_nr_records; i++)
 711                         memset(rec + i, 0, sizeof(*rec));
 712         }
 713         dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n",
 714                 (int)(rec - pkey_malloc_records), rec, ptr, size);
 715         rec->ptr = ptr;
 716         rec->size = size;
 717         rec->prot = prot;
 718         pkey_last_malloc_record = rec;
 719         nr_pkey_malloc_records++;
 720 }
 721 
 722 void free_pkey_malloc(void *ptr)
 723 {
 724         long i;
 725         int ret;
 726         dprintf3("%s(%p)\n", __func__, ptr);
 727         for (i = 0; i < nr_pkey_malloc_records; i++) {
 728                 struct pkey_malloc_record *rec = &pkey_malloc_records[i];
 729                 dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n",
 730                                 ptr, i, rec, rec->ptr, rec->size);
 731                 if ((ptr <  rec->ptr) ||
 732                     (ptr >= rec->ptr + rec->size))
 733                         continue;
 734 
 735                 dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n",
 736                                 ptr, i, rec, rec->ptr, rec->size);
 737                 nr_pkey_malloc_records--;
 738                 ret = munmap(rec->ptr, rec->size);
 739                 dprintf3("munmap ret: %d\n", ret);
 740                 pkey_assert(!ret);
 741                 dprintf3("clearing rec->ptr, rec: %p\n", rec);
 742                 rec->ptr = NULL;
 743                 dprintf3("done clearing rec->ptr, rec: %p\n", rec);
 744                 return;
 745         }
 746         pkey_assert(false);
 747 }
 748 
 749 
 750 void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey)
 751 {
 752         void *ptr;
 753         int ret;
 754 
 755         rdpkru();
 756         dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
 757                         size, prot, pkey);
 758         pkey_assert(pkey < NR_PKEYS);
 759         ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
 760         pkey_assert(ptr != (void *)-1);
 761         ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
 762         pkey_assert(!ret);
 763         record_pkey_malloc(ptr, size, prot);
 764         rdpkru();
 765 
 766         dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
 767         return ptr;
 768 }
 769 
 770 void *malloc_pkey_anon_huge(long size, int prot, u16 pkey)
 771 {
 772         int ret;
 773         void *ptr;
 774 
 775         dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
 776                         size, prot, pkey);
 777         /*
 778          * Guarantee we can fit at least one huge page in the resulting
 779          * allocation by allocating space for 2:
 780          */
 781         size = ALIGN_UP(size, HPAGE_SIZE * 2);
 782         ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
 783         pkey_assert(ptr != (void *)-1);
 784         record_pkey_malloc(ptr, size, prot);
 785         mprotect_pkey(ptr, size, prot, pkey);
 786 
 787         dprintf1("unaligned ptr: %p\n", ptr);
 788         ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE);
 789         dprintf1("  aligned ptr: %p\n", ptr);
 790         ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE);
 791         dprintf1("MADV_HUGEPAGE ret: %d\n", ret);
 792         ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED);
 793         dprintf1("MADV_WILLNEED ret: %d\n", ret);
 794         memset(ptr, 0, HPAGE_SIZE);
 795 
 796         dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr);
 797         return ptr;
 798 }
 799 
 800 int hugetlb_setup_ok;
 801 #define GET_NR_HUGE_PAGES 10
 802 void setup_hugetlbfs(void)
 803 {
 804         int err;
 805         int fd;
 806         char buf[] = "123";
 807 
 808         if (geteuid() != 0) {
 809                 fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n");
 810                 return;
 811         }
 812 
 813         cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages");
 814 
 815         /*
 816          * Now go make sure that we got the pages and that they
 817          * are 2M pages.  Someone might have made 1G the default.
 818          */
 819         fd = open("/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages", O_RDONLY);
 820         if (fd < 0) {
 821                 perror("opening sysfs 2M hugetlb config");
 822                 return;
 823         }
 824 
 825         /* -1 to guarantee leaving the trailing \0 */
 826         err = read(fd, buf, sizeof(buf)-1);
 827         close(fd);
 828         if (err <= 0) {
 829                 perror("reading sysfs 2M hugetlb config");
 830                 return;
 831         }
 832 
 833         if (atoi(buf) != GET_NR_HUGE_PAGES) {
 834                 fprintf(stderr, "could not confirm 2M pages, got: '%s' expected %d\n",
 835                         buf, GET_NR_HUGE_PAGES);
 836                 return;
 837         }
 838 
 839         hugetlb_setup_ok = 1;
 840 }
 841 
 842 void *malloc_pkey_hugetlb(long size, int prot, u16 pkey)
 843 {
 844         void *ptr;
 845         int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB;
 846 
 847         if (!hugetlb_setup_ok)
 848                 return PTR_ERR_ENOTSUP;
 849 
 850         dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey);
 851         size = ALIGN_UP(size, HPAGE_SIZE * 2);
 852         pkey_assert(pkey < NR_PKEYS);
 853         ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0);
 854         pkey_assert(ptr != (void *)-1);
 855         mprotect_pkey(ptr, size, prot, pkey);
 856 
 857         record_pkey_malloc(ptr, size, prot);
 858 
 859         dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr);
 860         return ptr;
 861 }
 862 
 863 void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey)
 864 {
 865         void *ptr;
 866         int fd;
 867 
 868         dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
 869                         size, prot, pkey);
 870         pkey_assert(pkey < NR_PKEYS);
 871         fd = open("/dax/foo", O_RDWR);
 872         pkey_assert(fd >= 0);
 873 
 874         ptr = mmap(0, size, prot, MAP_SHARED, fd, 0);
 875         pkey_assert(ptr != (void *)-1);
 876 
 877         mprotect_pkey(ptr, size, prot, pkey);
 878 
 879         record_pkey_malloc(ptr, size, prot);
 880 
 881         dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr);
 882         close(fd);
 883         return ptr;
 884 }
 885 
 886 void *(*pkey_malloc[])(long size, int prot, u16 pkey) = {
 887 
 888         malloc_pkey_with_mprotect,
 889         malloc_pkey_anon_huge,
 890         malloc_pkey_hugetlb
 891 /* can not do direct with the pkey_mprotect() API:
 892         malloc_pkey_mmap_direct,
 893         malloc_pkey_mmap_dax,
 894 */
 895 };
 896 
 897 void *malloc_pkey(long size, int prot, u16 pkey)
 898 {
 899         void *ret;
 900         static int malloc_type;
 901         int nr_malloc_types = ARRAY_SIZE(pkey_malloc);
 902 
 903         pkey_assert(pkey < NR_PKEYS);
 904 
 905         while (1) {
 906                 pkey_assert(malloc_type < nr_malloc_types);
 907 
 908                 ret = pkey_malloc[malloc_type](size, prot, pkey);
 909                 pkey_assert(ret != (void *)-1);
 910 
 911                 malloc_type++;
 912                 if (malloc_type >= nr_malloc_types)
 913                         malloc_type = (random()%nr_malloc_types);
 914 
 915                 /* try again if the malloc_type we tried is unsupported */
 916                 if (ret == PTR_ERR_ENOTSUP)
 917                         continue;
 918 
 919                 break;
 920         }
 921 
 922         dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__,
 923                         size, prot, pkey, ret);
 924         return ret;
 925 }
 926 
 927 int last_pkru_faults;
 928 #define UNKNOWN_PKEY -2
 929 void expected_pk_fault(int pkey)
 930 {
 931         dprintf2("%s(): last_pkru_faults: %d pkru_faults: %d\n",
 932                         __func__, last_pkru_faults, pkru_faults);
 933         dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey);
 934         pkey_assert(last_pkru_faults + 1 == pkru_faults);
 935 
 936        /*
 937         * For exec-only memory, we do not know the pkey in
 938         * advance, so skip this check.
 939         */
 940         if (pkey != UNKNOWN_PKEY)
 941                 pkey_assert(last_si_pkey == pkey);
 942 
 943         /*
 944          * The signal handler shold have cleared out PKRU to let the
 945          * test program continue.  We now have to restore it.
 946          */
 947         if (__rdpkru() != 0)
 948                 pkey_assert(0);
 949 
 950         __wrpkru(shadow_pkru);
 951         dprintf1("%s() set PKRU=%x to restore state after signal nuked it\n",
 952                         __func__, shadow_pkru);
 953         last_pkru_faults = pkru_faults;
 954         last_si_pkey = -1;
 955 }
 956 
 957 #define do_not_expect_pk_fault(msg)     do {                    \
 958         if (last_pkru_faults != pkru_faults)                    \
 959                 dprintf0("unexpected PK fault: %s\n", msg);     \
 960         pkey_assert(last_pkru_faults == pkru_faults);           \
 961 } while (0)
 962 
 963 int test_fds[10] = { -1 };
 964 int nr_test_fds;
 965 void __save_test_fd(int fd)
 966 {
 967         pkey_assert(fd >= 0);
 968         pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds));
 969         test_fds[nr_test_fds] = fd;
 970         nr_test_fds++;
 971 }
 972 
 973 int get_test_read_fd(void)
 974 {
 975         int test_fd = open("/etc/passwd", O_RDONLY);
 976         __save_test_fd(test_fd);
 977         return test_fd;
 978 }
 979 
 980 void close_test_fds(void)
 981 {
 982         int i;
 983 
 984         for (i = 0; i < nr_test_fds; i++) {
 985                 if (test_fds[i] < 0)
 986                         continue;
 987                 close(test_fds[i]);
 988                 test_fds[i] = -1;
 989         }
 990         nr_test_fds = 0;
 991 }
 992 
 993 #define barrier() __asm__ __volatile__("": : :"memory")
 994 __attribute__((noinline)) int read_ptr(int *ptr)
 995 {
 996         /*
 997          * Keep GCC from optimizing this away somehow
 998          */
 999         barrier();
1000         return *ptr;
1001 }
1002 
1003 void test_read_of_write_disabled_region(int *ptr, u16 pkey)
1004 {
1005         int ptr_contents;
1006 
1007         dprintf1("disabling write access to PKEY[1], doing read\n");
1008         pkey_write_deny(pkey);
1009         ptr_contents = read_ptr(ptr);
1010         dprintf1("*ptr: %d\n", ptr_contents);
1011         dprintf1("\n");
1012 }
1013 void test_read_of_access_disabled_region(int *ptr, u16 pkey)
1014 {
1015         int ptr_contents;
1016 
1017         dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr);
1018         rdpkru();
1019         pkey_access_deny(pkey);
1020         ptr_contents = read_ptr(ptr);
1021         dprintf1("*ptr: %d\n", ptr_contents);
1022         expected_pk_fault(pkey);
1023 }
1024 void test_write_of_write_disabled_region(int *ptr, u16 pkey)
1025 {
1026         dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey);
1027         pkey_write_deny(pkey);
1028         *ptr = __LINE__;
1029         expected_pk_fault(pkey);
1030 }
1031 void test_write_of_access_disabled_region(int *ptr, u16 pkey)
1032 {
1033         dprintf1("disabling access to PKEY[%02d], doing write\n", pkey);
1034         pkey_access_deny(pkey);
1035         *ptr = __LINE__;
1036         expected_pk_fault(pkey);
1037 }
1038 void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey)
1039 {
1040         int ret;
1041         int test_fd = get_test_read_fd();
1042 
1043         dprintf1("disabling access to PKEY[%02d], "
1044                  "having kernel read() to buffer\n", pkey);
1045         pkey_access_deny(pkey);
1046         ret = read(test_fd, ptr, 1);
1047         dprintf1("read ret: %d\n", ret);
1048         pkey_assert(ret);
1049 }
1050 void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey)
1051 {
1052         int ret;
1053         int test_fd = get_test_read_fd();
1054 
1055         pkey_write_deny(pkey);
1056         ret = read(test_fd, ptr, 100);
1057         dprintf1("read ret: %d\n", ret);
1058         if (ret < 0 && (DEBUG_LEVEL > 0))
1059                 perror("verbose read result (OK for this to be bad)");
1060         pkey_assert(ret);
1061 }
1062 
1063 void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey)
1064 {
1065         int pipe_ret, vmsplice_ret;
1066         struct iovec iov;
1067         int pipe_fds[2];
1068 
1069         pipe_ret = pipe(pipe_fds);
1070 
1071         pkey_assert(pipe_ret == 0);
1072         dprintf1("disabling access to PKEY[%02d], "
1073                  "having kernel vmsplice from buffer\n", pkey);
1074         pkey_access_deny(pkey);
1075         iov.iov_base = ptr;
1076         iov.iov_len = PAGE_SIZE;
1077         vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT);
1078         dprintf1("vmsplice() ret: %d\n", vmsplice_ret);
1079         pkey_assert(vmsplice_ret == -1);
1080 
1081         close(pipe_fds[0]);
1082         close(pipe_fds[1]);
1083 }
1084 
1085 void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey)
1086 {
1087         int ignored = 0xdada;
1088         int futex_ret;
1089         int some_int = __LINE__;
1090 
1091         dprintf1("disabling write to PKEY[%02d], "
1092                  "doing futex gunk in buffer\n", pkey);
1093         *ptr = some_int;
1094         pkey_write_deny(pkey);
1095         futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL,
1096                         &ignored, ignored);
1097         if (DEBUG_LEVEL > 0)
1098                 perror("futex");
1099         dprintf1("futex() ret: %d\n", futex_ret);
1100 }
1101 
1102 /* Assumes that all pkeys other than 'pkey' are unallocated */
1103 void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey)
1104 {
1105         int err;
1106         int i;
1107 
1108         /* Note: 0 is the default pkey, so don't mess with it */
1109         for (i = 1; i < NR_PKEYS; i++) {
1110                 if (pkey == i)
1111                         continue;
1112 
1113                 dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i);
1114                 err = sys_pkey_free(i);
1115                 pkey_assert(err);
1116 
1117                 err = sys_pkey_free(i);
1118                 pkey_assert(err);
1119 
1120                 err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i);
1121                 pkey_assert(err);
1122         }
1123 }
1124 
1125 /* Assumes that all pkeys other than 'pkey' are unallocated */
1126 void test_pkey_syscalls_bad_args(int *ptr, u16 pkey)
1127 {
1128         int err;
1129         int bad_pkey = NR_PKEYS+99;
1130 
1131         /* pass a known-invalid pkey in: */
1132         err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey);
1133         pkey_assert(err);
1134 }
1135 
/*
 * fork() and let only the child carry on: the parent exits with status
 * 0 right away, the child returns to the caller.
 */
void become_child(void)
{
        pid_t child = fork();

        pkey_assert(child >= 0);
        dprintf3("[%d] fork() ret: %d\n", getpid(), child);

        /* A non-zero return from fork() means we are the parent. */
        if (child != 0)
                exit(0);
}
1150 
1151 /* Assumes that all pkeys other than 'pkey' are unallocated */
1152 void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
1153 {
1154         int err;
1155         int allocated_pkeys[NR_PKEYS] = {0};
1156         int nr_allocated_pkeys = 0;
1157         int i;
1158 
1159         for (i = 0; i < NR_PKEYS*3; i++) {
1160                 int new_pkey;
1161                 dprintf1("%s() alloc loop: %d\n", __func__, i);
1162                 new_pkey = alloc_pkey();
1163                 dprintf4("%s()::%d, err: %d pkru: 0x%x shadow: 0x%x\n", __func__,
1164                                 __LINE__, err, __rdpkru(), shadow_pkru);
1165                 rdpkru(); /* for shadow checking */
1166                 dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC);
1167                 if ((new_pkey == -1) && (errno == ENOSPC)) {
1168                         dprintf2("%s() failed to allocate pkey after %d tries\n",
1169                                 __func__, nr_allocated_pkeys);
1170                 } else {
1171                         /*
1172                          * Ensure the number of successes never
1173                          * exceeds the number of keys supported
1174                          * in the hardware.
1175                          */
1176                         pkey_assert(nr_allocated_pkeys < NR_PKEYS);
1177                         allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
1178                 }
1179 
1180                 /*
1181                  * Make sure that allocation state is properly
1182                  * preserved across fork().
1183                  */
1184                 if (i == NR_PKEYS*2)
1185                         become_child();
1186         }
1187 
1188         dprintf3("%s()::%d\n", __func__, __LINE__);
1189 
1190         /*
1191          * There are 16 pkeys supported in hardware.  Three are
1192          * allocated by the time we get here:
1193          *   1. The default key (0)
1194          *   2. One possibly consumed by an execute-only mapping.
1195          *   3. One allocated by the test code and passed in via
1196          *      'pkey' to this function.
1197          * Ensure that we can allocate at least another 13 (16-3).
1198          */
1199         pkey_assert(i >= NR_PKEYS-3);
1200 
1201         for (i = 0; i < nr_allocated_pkeys; i++) {
1202                 err = sys_pkey_free(allocated_pkeys[i]);
1203                 pkey_assert(!err);
1204                 rdpkru(); /* for shadow checking */
1205         }
1206 }
1207 
1208 /*
1209  * pkey 0 is special.  It is allocated by default, so you do not
1210  * have to call pkey_alloc() to use it first.  Make sure that it
1211  * is usable.
1212  */
1213 void test_mprotect_with_pkey_0(int *ptr, u16 pkey)
1214 {
1215         long size;
1216         int prot;
1217 
1218         assert(pkey_last_malloc_record);
1219         size = pkey_last_malloc_record->size;
1220         /*
1221          * This is a bit of a hack.  But mprotect() requires
1222          * huge-page-aligned sizes when operating on hugetlbfs.
1223          * So, make sure that we use something that's a multiple
1224          * of a huge page when we can.
1225          */
1226         if (size >= HPAGE_SIZE)
1227                 size = HPAGE_SIZE;
1228         prot = pkey_last_malloc_record->prot;
1229 
1230         /* Use pkey 0 */
1231         mprotect_pkey(ptr, size, prot, 0);
1232 
1233         /* Make sure that we can set it back to the original pkey. */
1234         mprotect_pkey(ptr, size, prot, pkey);
1235 }
1236 
1237 void test_ptrace_of_child(int *ptr, u16 pkey)
1238 {
1239         __attribute__((__unused__)) int peek_result;
1240         pid_t child_pid;
1241         void *ignored = 0;
1242         long ret;
1243         int status;
1244         /*
1245          * This is the "control" for our little expermient.  Make sure
1246          * we can always access it when ptracing.
1247          */
1248         int *plain_ptr_unaligned = malloc(HPAGE_SIZE);
1249         int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE);
1250 
1251         /*
1252          * Fork a child which is an exact copy of this process, of course.
1253          * That means we can do all of our tests via ptrace() and then plain
1254          * memory access and ensure they work differently.
1255          */
1256         child_pid = fork_lazy_child();
1257         dprintf1("[%d] child pid: %d\n", getpid(), child_pid);
1258 
1259         ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored);
1260         if (ret)
1261                 perror("attach");
1262         dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__);
1263         pkey_assert(ret != -1);
1264         ret = waitpid(child_pid, &status, WUNTRACED);
1265         if ((ret != child_pid) || !(WIFSTOPPED(status))) {
1266                 fprintf(stderr, "weird waitpid result %ld stat %x\n",
1267                                 ret, status);
1268                 pkey_assert(0);
1269         }
1270         dprintf2("waitpid ret: %ld\n", ret);
1271         dprintf2("waitpid status: %d\n", status);
1272 
1273         pkey_access_deny(pkey);
1274         pkey_write_deny(pkey);
1275 
1276         /* Write access, untested for now:
1277         ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data);
1278         pkey_assert(ret != -1);
1279         dprintf1("poke at %p: %ld\n", peek_at, ret);
1280         */
1281 
1282         /*
1283          * Try to access the pkey-protected "ptr" via ptrace:
1284          */
1285         ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored);
1286         /* expect it to work, without an error: */
1287         pkey_assert(ret != -1);
1288         /* Now access from the current task, and expect an exception: */
1289         peek_result = read_ptr(ptr);
1290         expected_pk_fault(pkey);
1291 
1292         /*
1293          * Try to access the NON-pkey-protected "plain_ptr" via ptrace:
1294          */
1295         ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored);
1296         /* expect it to work, without an error: */
1297         pkey_assert(ret != -1);
1298         /* Now access from the current task, and expect NO exception: */
1299         peek_result = read_ptr(plain_ptr);
1300         do_not_expect_pk_fault("read plain pointer after ptrace");
1301 
1302         ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0);
1303         pkey_assert(ret != -1);
1304 
1305         ret = kill(child_pid, SIGKILL);
1306         pkey_assert(ret != -1);
1307 
1308         wait(&status);
1309 
1310         free(plain_ptr_unaligned);
1311 }
1312 
1313 void *get_pointer_to_instructions(void)
1314 {
1315         void *p1;
1316 
1317         p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE);
1318         dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write);
1319         /* lots_o_noops_around_write should be page-aligned already */
1320         assert(p1 == &lots_o_noops_around_write);
1321 
1322         /* Point 'p1' at the *second* page of the function: */
1323         p1 += PAGE_SIZE;
1324 
1325         /*
1326          * Try to ensure we fault this in on next touch to ensure
1327          * we get an instruction fault as opposed to a data one
1328          */
1329         madvise(p1, PAGE_SIZE, MADV_DONTNEED);
1330 
1331         return p1;
1332 }
1333 
1334 void test_executing_on_unreadable_memory(int *ptr, u16 pkey)
1335 {
1336         void *p1;
1337         int scratch;
1338         int ptr_contents;
1339         int ret;
1340 
1341         p1 = get_pointer_to_instructions();
1342         lots_o_noops_around_write(&scratch);
1343         ptr_contents = read_ptr(p1);
1344         dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
1345 
1346         ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey);
1347         pkey_assert(!ret);
1348         pkey_access_deny(pkey);
1349 
1350         dprintf2("pkru: %x\n", rdpkru());
1351 
1352         /*
1353          * Make sure this is an *instruction* fault
1354          */
1355         madvise(p1, PAGE_SIZE, MADV_DONTNEED);
1356         lots_o_noops_around_write(&scratch);
1357         do_not_expect_pk_fault("executing on PROT_EXEC memory");
1358         ptr_contents = read_ptr(p1);
1359         dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
1360         expected_pk_fault(pkey);
1361 }
1362 
1363 void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey)
1364 {
1365         void *p1;
1366         int scratch;
1367         int ptr_contents;
1368         int ret;
1369 
1370         dprintf1("%s() start\n", __func__);
1371 
1372         p1 = get_pointer_to_instructions();
1373         lots_o_noops_around_write(&scratch);
1374         ptr_contents = read_ptr(p1);
1375         dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
1376 
1377         /* Use a *normal* mprotect(), not mprotect_pkey(): */
1378         ret = mprotect(p1, PAGE_SIZE, PROT_EXEC);
1379         pkey_assert(!ret);
1380 
1381         dprintf2("pkru: %x\n", rdpkru());
1382 
1383         /* Make sure this is an *instruction* fault */
1384         madvise(p1, PAGE_SIZE, MADV_DONTNEED);
1385         lots_o_noops_around_write(&scratch);
1386         do_not_expect_pk_fault("executing on PROT_EXEC memory");
1387         ptr_contents = read_ptr(p1);
1388         dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents);
1389         expected_pk_fault(UNKNOWN_PKEY);
1390 
1391         /*
1392          * Put the memory back to non-PROT_EXEC.  Should clear the
1393          * exec-only pkey off the VMA and allow it to be readable
1394          * again.  Go to PROT_NONE first to check for a kernel bug
1395          * that did not clear the pkey when doing PROT_NONE.
1396          */
1397         ret = mprotect(p1, PAGE_SIZE, PROT_NONE);
1398         pkey_assert(!ret);
1399 
1400         ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC);
1401         pkey_assert(!ret);
1402         ptr_contents = read_ptr(p1);
1403         do_not_expect_pk_fault("plain read on recently PROT_EXEC area");
1404 }
1405 
1406 void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey)
1407 {
1408         int size = PAGE_SIZE;
1409         int sret;
1410 
1411         if (cpu_has_pku()) {
1412                 dprintf1("SKIP: %s: no CPU support\n", __func__);
1413                 return;
1414         }
1415 
1416         sret = syscall(SYS_mprotect_key, ptr, size, PROT_READ, pkey);
1417         pkey_assert(sret < 0);
1418 }
1419 
1420 void (*pkey_tests[])(int *ptr, u16 pkey) = {
1421         test_read_of_write_disabled_region,
1422         test_read_of_access_disabled_region,
1423         test_write_of_write_disabled_region,
1424         test_write_of_access_disabled_region,
1425         test_kernel_write_of_access_disabled_region,
1426         test_kernel_write_of_write_disabled_region,
1427         test_kernel_gup_of_access_disabled_region,
1428         test_kernel_gup_write_to_write_disabled_region,
1429         test_executing_on_unreadable_memory,
1430         test_implicit_mprotect_exec_only_memory,
1431         test_mprotect_with_pkey_0,
1432         test_ptrace_of_child,
1433         test_pkey_syscalls_on_non_allocated_pkey,
1434         test_pkey_syscalls_bad_args,
1435         test_pkey_alloc_exhaust,
1436 };
1437 
1438 void run_tests_once(void)
1439 {
1440         int *ptr;
1441         int prot = PROT_READ|PROT_WRITE;
1442 
1443         for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) {
1444                 int pkey;
1445                 int orig_pkru_faults = pkru_faults;
1446 
1447                 dprintf1("======================\n");
1448                 dprintf1("test %d preparing...\n", test_nr);
1449 
1450                 tracing_on();
1451                 pkey = alloc_random_pkey();
1452                 dprintf1("test %d starting with pkey: %d\n", test_nr, pkey);
1453                 ptr = malloc_pkey(PAGE_SIZE, prot, pkey);
1454                 dprintf1("test %d starting...\n", test_nr);
1455                 pkey_tests[test_nr](ptr, pkey);
1456                 dprintf1("freeing test memory: %p\n", ptr);
1457                 free_pkey_malloc(ptr);
1458                 sys_pkey_free(pkey);
1459 
1460                 dprintf1("pkru_faults: %d\n", pkru_faults);
1461                 dprintf1("orig_pkru_faults: %d\n", orig_pkru_faults);
1462 
1463                 tracing_off();
1464                 close_test_fds();
1465 
1466                 printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr);
1467                 dprintf1("======================\n\n");
1468         }
1469         iteration_nr++;
1470 }
1471 
1472 void pkey_setup_shadow(void)
1473 {
1474         shadow_pkru = __rdpkru();
1475 }
1476 
1477 int main(void)
1478 {
1479         int nr_iterations = 22;
1480 
1481         setup_handlers();
1482 
1483         printf("has pku: %d\n", cpu_has_pku());
1484 
1485         if (!cpu_has_pku()) {
1486                 int size = PAGE_SIZE;
1487                 int *ptr;
1488 
1489                 printf("running PKEY tests for unsupported CPU/OS\n");
1490 
1491                 ptr  = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
1492                 assert(ptr != (void *)-1);
1493                 test_mprotect_pkey_on_unsupported_cpu(ptr, 1);
1494                 exit(0);
1495         }
1496 
1497         pkey_setup_shadow();
1498         printf("startup pkru: %x\n", rdpkru());
1499         setup_hugetlbfs();
1500 
1501         while (nr_iterations-- > 0)
1502                 run_tests_once();
1503 
1504         printf("done (all tests OK)\n");
1505         return 0;
1506 }

/* [<][>][^][v][top][bottom][index][help] */