1/* 2 * Kernel-based Virtual Machine driver for Linux 3 * 4 * This module enables machines with Intel VT-x extensions to run virtual 5 * machines without emulation or binary translation. 6 * 7 * MMU support 8 * 9 * Copyright (C) 2006 Qumranet, Inc. 10 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 11 * 12 * Authors: 13 * Yaniv Kamay <yaniv@qumranet.com> 14 * Avi Kivity <avi@qumranet.com> 15 * 16 * This work is licensed under the terms of the GNU GPL, version 2. See 17 * the COPYING file in the top-level directory. 18 * 19 */ 20 21#include "irq.h" 22#include "mmu.h" 23#include "x86.h" 24#include "kvm_cache_regs.h" 25#include "cpuid.h" 26 27#include <linux/kvm_host.h> 28#include <linux/types.h> 29#include <linux/string.h> 30#include <linux/mm.h> 31#include <linux/highmem.h> 32#include <linux/module.h> 33#include <linux/swap.h> 34#include <linux/hugetlb.h> 35#include <linux/compiler.h> 36#include <linux/srcu.h> 37#include <linux/slab.h> 38#include <linux/uaccess.h> 39 40#include <asm/page.h> 41#include <asm/cmpxchg.h> 42#include <asm/io.h> 43#include <asm/vmx.h> 44 45/* 46 * When setting this variable to true it enables Two-Dimensional-Paging 47 * where the hardware walks 2 page tables: 48 * 1. the guest-virtual to guest-physical 49 * 2. while doing 1. it walks guest-physical to host-physical 50 * If the hardware supports that we don't need to do shadow paging. 51 */ 52bool tdp_enabled = false; 53 54enum { 55 AUDIT_PRE_PAGE_FAULT, 56 AUDIT_POST_PAGE_FAULT, 57 AUDIT_PRE_PTE_WRITE, 58 AUDIT_POST_PTE_WRITE, 59 AUDIT_PRE_SYNC, 60 AUDIT_POST_SYNC 61}; 62 63#undef MMU_DEBUG 64 65#ifdef MMU_DEBUG 66static bool dbg = 0; 67module_param(dbg, bool, 0644); 68 69#define pgprintk(x...) do { if (dbg) printk(x); } while (0) 70#define rmap_printk(x...) do { if (dbg) printk(x); } while (0) 71#define MMU_WARN_ON(x) WARN_ON(x) 72#else 73#define pgprintk(x...) do { } while (0) 74#define rmap_printk(x...) 
do { } while (0) 75#define MMU_WARN_ON(x) do { } while (0) 76#endif 77 78#define PTE_PREFETCH_NUM 8 79 80#define PT_FIRST_AVAIL_BITS_SHIFT 10 81#define PT64_SECOND_AVAIL_BITS_SHIFT 52 82 83#define PT64_LEVEL_BITS 9 84 85#define PT64_LEVEL_SHIFT(level) \ 86 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) 87 88#define PT64_INDEX(address, level)\ 89 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) 90 91 92#define PT32_LEVEL_BITS 10 93 94#define PT32_LEVEL_SHIFT(level) \ 95 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) 96 97#define PT32_LVL_OFFSET_MASK(level) \ 98 (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ 99 * PT32_LEVEL_BITS))) - 1)) 100 101#define PT32_INDEX(address, level)\ 102 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) 103 104 105#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) 106#define PT64_DIR_BASE_ADDR_MASK \ 107 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) 108#define PT64_LVL_ADDR_MASK(level) \ 109 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ 110 * PT64_LEVEL_BITS))) - 1)) 111#define PT64_LVL_OFFSET_MASK(level) \ 112 (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ 113 * PT64_LEVEL_BITS))) - 1)) 114 115#define PT32_BASE_ADDR_MASK PAGE_MASK 116#define PT32_DIR_BASE_ADDR_MASK \ 117 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1)) 118#define PT32_LVL_ADDR_MASK(level) \ 119 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \ 120 * PT32_LEVEL_BITS))) - 1)) 121 122#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ 123 | shadow_x_mask | shadow_nx_mask) 124 125#define ACC_EXEC_MASK 1 126#define ACC_WRITE_MASK PT_WRITABLE_MASK 127#define ACC_USER_MASK PT_USER_MASK 128#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) 129 130#include <trace/events/kvm.h> 131 132#define CREATE_TRACE_POINTS 133#include "mmutrace.h" 134 135#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) 136#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) 137 138#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) 139 140/* make pte_list_desc fit well in cache line */ 141#define PTE_LIST_EXT 3 142 143struct pte_list_desc { 144 u64 *sptes[PTE_LIST_EXT]; 145 struct pte_list_desc *more; 146}; 147 148struct kvm_shadow_walk_iterator { 149 u64 addr; 150 hpa_t shadow_addr; 151 u64 *sptep; 152 int level; 153 unsigned index; 154}; 155 156#define for_each_shadow_entry(_vcpu, _addr, _walker) \ 157 for (shadow_walk_init(&(_walker), _vcpu, _addr); \ 158 shadow_walk_okay(&(_walker)); \ 159 shadow_walk_next(&(_walker))) 160 161#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \ 162 for (shadow_walk_init(&(_walker), _vcpu, _addr); \ 163 shadow_walk_okay(&(_walker)) && \ 164 ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \ 165 __shadow_walk_next(&(_walker), spte)) 166 167static struct kmem_cache *pte_list_desc_cache; 168static struct kmem_cache *mmu_page_header_cache; 169static struct percpu_counter kvm_total_used_mmu_pages; 170 171static u64 __read_mostly shadow_nx_mask; 172static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ 173static u64 __read_mostly shadow_user_mask; 174static u64 __read_mostly shadow_accessed_mask; 175static u64 __read_mostly shadow_dirty_mask; 176static u64 __read_mostly shadow_mmio_mask; 177 178static void mmu_spte_set(u64 *sptep, u64 spte); 179static void mmu_free_roots(struct kvm_vcpu *vcpu); 180 181void 
kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) 182{ 183 shadow_mmio_mask = mmio_mask; 184} 185EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); 186 187/* 188 * the low bit of the generation number is always presumed to be zero. 189 * This disables mmio caching during memslot updates. The concept is 190 * similar to a seqcount but instead of retrying the access we just punt 191 * and ignore the cache. 192 * 193 * spte bits 3-11 are used as bits 1-9 of the generation number, 194 * the bits 52-61 are used as bits 10-19 of the generation number. 195 */ 196#define MMIO_SPTE_GEN_LOW_SHIFT 2 197#define MMIO_SPTE_GEN_HIGH_SHIFT 52 198 199#define MMIO_GEN_SHIFT 20 200#define MMIO_GEN_LOW_SHIFT 10 201#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2) 202#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1) 203 204static u64 generation_mmio_spte_mask(unsigned int gen) 205{ 206 u64 mask; 207 208 WARN_ON(gen & ~MMIO_GEN_MASK); 209 210 mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT; 211 mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT; 212 return mask; 213} 214 215static unsigned int get_mmio_spte_generation(u64 spte) 216{ 217 unsigned int gen; 218 219 spte &= ~shadow_mmio_mask; 220 221 gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK; 222 gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT; 223 return gen; 224} 225 226static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu) 227{ 228 return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK; 229} 230 231static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, 232 unsigned access) 233{ 234 unsigned int gen = kvm_current_mmio_generation(vcpu); 235 u64 mask = generation_mmio_spte_mask(gen); 236 237 access &= ACC_WRITE_MASK | ACC_USER_MASK; 238 mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT; 239 240 trace_mark_mmio_spte(sptep, gfn, access, gen); 241 mmu_spte_set(sptep, mask); 242} 243 244static bool is_mmio_spte(u64 spte) 245{ 246 return (spte & shadow_mmio_mask) == shadow_mmio_mask; 247} 248 249static gfn_t get_mmio_spte_gfn(u64 spte) 250{ 251 u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask; 252 return (spte & ~mask) >> PAGE_SHIFT; 253} 254 255static unsigned get_mmio_spte_access(u64 spte) 256{ 257 u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask; 258 return (spte & ~mask) & ~PAGE_MASK; 259} 260 261static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, 262 pfn_t pfn, unsigned access) 263{ 264 if (unlikely(is_noslot_pfn(pfn))) { 265 mark_mmio_spte(vcpu, sptep, gfn, access); 266 return true; 267 } 268 269 return false; 270} 271 272static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) 273{ 274 unsigned int kvm_gen, spte_gen; 275 276 kvm_gen = kvm_current_mmio_generation(vcpu); 277 spte_gen = get_mmio_spte_generation(spte); 278 279 trace_check_mmio_spte(spte, kvm_gen, spte_gen); 280 return likely(kvm_gen == spte_gen); 281} 282 283void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 284 u64 dirty_mask, u64 nx_mask, u64 x_mask) 285{ 286 shadow_user_mask = user_mask; 287 shadow_accessed_mask = accessed_mask; 288 shadow_dirty_mask = dirty_mask; 289 shadow_nx_mask = nx_mask; 290 shadow_x_mask = x_mask; 291} 292EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); 293 294static int is_cpuid_PSE36(void) 295{ 296 return 1; 297} 298 299static int is_nx(struct kvm_vcpu *vcpu) 300{ 301 return vcpu->arch.efer & EFER_NX; 302} 303 304static int is_shadow_present_pte(u64 pte) 305{ 306 return pte & PT_PRESENT_MASK && 
!is_mmio_spte(pte); 307} 308 309static int is_large_pte(u64 pte) 310{ 311 return pte & PT_PAGE_SIZE_MASK; 312} 313 314static int is_rmap_spte(u64 pte) 315{ 316 return is_shadow_present_pte(pte); 317} 318 319static int is_last_spte(u64 pte, int level) 320{ 321 if (level == PT_PAGE_TABLE_LEVEL) 322 return 1; 323 if (is_large_pte(pte)) 324 return 1; 325 return 0; 326} 327 328static pfn_t spte_to_pfn(u64 pte) 329{ 330 return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 331} 332 333static gfn_t pse36_gfn_delta(u32 gpte) 334{ 335 int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; 336 337 return (gpte & PT32_DIR_PSE36_MASK) << shift; 338} 339 340#ifdef CONFIG_X86_64 341static void __set_spte(u64 *sptep, u64 spte) 342{ 343 *sptep = spte; 344} 345 346static void __update_clear_spte_fast(u64 *sptep, u64 spte) 347{ 348 *sptep = spte; 349} 350 351static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) 352{ 353 return xchg(sptep, spte); 354} 355 356static u64 __get_spte_lockless(u64 *sptep) 357{ 358 return ACCESS_ONCE(*sptep); 359} 360#else 361union split_spte { 362 struct { 363 u32 spte_low; 364 u32 spte_high; 365 }; 366 u64 spte; 367}; 368 369static void count_spte_clear(u64 *sptep, u64 spte) 370{ 371 struct kvm_mmu_page *sp = page_header(__pa(sptep)); 372 373 if (is_shadow_present_pte(spte)) 374 return; 375 376 /* Ensure the spte is completely set before we increase the count */ 377 smp_wmb(); 378 sp->clear_spte_count++; 379} 380 381static void __set_spte(u64 *sptep, u64 spte) 382{ 383 union split_spte *ssptep, sspte; 384 385 ssptep = (union split_spte *)sptep; 386 sspte = (union split_spte)spte; 387 388 ssptep->spte_high = sspte.spte_high; 389 390 /* 391 * If we map the spte from nonpresent to present, We should store 392 * the high bits firstly, then set present bit, so cpu can not 393 * fetch this spte while we are setting the spte. 394 */ 395 smp_wmb(); 396 397 ssptep->spte_low = sspte.spte_low; 398} 399 400static void __update_clear_spte_fast(u64 *sptep, u64 spte) 401{ 402 union split_spte *ssptep, sspte; 403 404 ssptep = (union split_spte *)sptep; 405 sspte = (union split_spte)spte; 406 407 ssptep->spte_low = sspte.spte_low; 408 409 /* 410 * If we map the spte from present to nonpresent, we should clear 411 * present bit firstly to avoid vcpu fetch the old high bits. 412 */ 413 smp_wmb(); 414 415 ssptep->spte_high = sspte.spte_high; 416 count_spte_clear(sptep, spte); 417} 418 419static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) 420{ 421 union split_spte *ssptep, sspte, orig; 422 423 ssptep = (union split_spte *)sptep; 424 sspte = (union split_spte)spte; 425 426 /* xchg acts as a barrier before the setting of the high bits */ 427 orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low); 428 orig.spte_high = ssptep->spte_high; 429 ssptep->spte_high = sspte.spte_high; 430 count_spte_clear(sptep, spte); 431 432 return orig.spte; 433} 434 435/* 436 * The idea using the light way get the spte on x86_32 guest is from 437 * gup_get_pte(arch/x86/mm/gup.c). 438 * 439 * An spte tlb flush may be pending, because kvm_set_pte_rmapp 440 * coalesces them and we are running out of the MMU lock. Therefore 441 * we need to protect against in-progress updates of the spte. 442 * 443 * Reading the spte while an update is in progress may get the old value 444 * for the high part of the spte. The race is fine for a present->non-present 445 * change (because the high part of the spte is ignored for non-present spte), 446 * but for a present->present change we must reread the spte. 
447 * 448 * All such changes are done in two steps (present->non-present and 449 * non-present->present), hence it is enough to count the number of 450 * present->non-present updates: if it changed while reading the spte, 451 * we might have hit the race. This is done using clear_spte_count. 452 */ 453static u64 __get_spte_lockless(u64 *sptep) 454{ 455 struct kvm_mmu_page *sp = page_header(__pa(sptep)); 456 union split_spte spte, *orig = (union split_spte *)sptep; 457 int count; 458 459retry: 460 count = sp->clear_spte_count; 461 smp_rmb(); 462 463 spte.spte_low = orig->spte_low; 464 smp_rmb(); 465 466 spte.spte_high = orig->spte_high; 467 smp_rmb(); 468 469 if (unlikely(spte.spte_low != orig->spte_low || 470 count != sp->clear_spte_count)) 471 goto retry; 472 473 return spte.spte; 474} 475#endif 476 477static bool spte_is_locklessly_modifiable(u64 spte) 478{ 479 return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) == 480 (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE); 481} 482 483static bool spte_has_volatile_bits(u64 spte) 484{ 485 /* 486 * Always atomicly update spte if it can be updated 487 * out of mmu-lock, it can ensure dirty bit is not lost, 488 * also, it can help us to get a stable is_writable_pte() 489 * to ensure tlb flush is not missed. 490 */ 491 if (spte_is_locklessly_modifiable(spte)) 492 return true; 493 494 if (!shadow_accessed_mask) 495 return false; 496 497 if (!is_shadow_present_pte(spte)) 498 return false; 499 500 if ((spte & shadow_accessed_mask) && 501 (!is_writable_pte(spte) || (spte & shadow_dirty_mask))) 502 return false; 503 504 return true; 505} 506 507static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) 508{ 509 return (old_spte & bit_mask) && !(new_spte & bit_mask); 510} 511 512static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask) 513{ 514 return (old_spte & bit_mask) != (new_spte & bit_mask); 515} 516 517/* Rules for using mmu_spte_set: 518 * Set the sptep from nonpresent to present. 519 * Note: the sptep being assigned *must* be either not present 520 * or in a state where the hardware will not attempt to update 521 * the spte. 522 */ 523static void mmu_spte_set(u64 *sptep, u64 new_spte) 524{ 525 WARN_ON(is_shadow_present_pte(*sptep)); 526 __set_spte(sptep, new_spte); 527} 528 529/* Rules for using mmu_spte_update: 530 * Update the state bits, it means the mapped pfn is not changged. 531 * 532 * Whenever we overwrite a writable spte with a read-only one we 533 * should flush remote TLBs. Otherwise rmap_write_protect 534 * will find a read-only spte, even though the writable spte 535 * might be cached on a CPU's TLB, the return value indicates this 536 * case. 537 */ 538static bool mmu_spte_update(u64 *sptep, u64 new_spte) 539{ 540 u64 old_spte = *sptep; 541 bool ret = false; 542 543 WARN_ON(!is_rmap_spte(new_spte)); 544 545 if (!is_shadow_present_pte(old_spte)) { 546 mmu_spte_set(sptep, new_spte); 547 return ret; 548 } 549 550 if (!spte_has_volatile_bits(old_spte)) 551 __update_clear_spte_fast(sptep, new_spte); 552 else 553 old_spte = __update_clear_spte_slow(sptep, new_spte); 554 555 /* 556 * For the spte updated out of mmu-lock is safe, since 557 * we always atomicly update it, see the comments in 558 * spte_has_volatile_bits(). 
559 */ 560 if (spte_is_locklessly_modifiable(old_spte) && 561 !is_writable_pte(new_spte)) 562 ret = true; 563 564 if (!shadow_accessed_mask) 565 return ret; 566 567 /* 568 * Flush TLB when accessed/dirty bits are changed in the page tables, 569 * to guarantee consistency between TLB and page tables. 570 */ 571 if (spte_is_bit_changed(old_spte, new_spte, 572 shadow_accessed_mask | shadow_dirty_mask)) 573 ret = true; 574 575 if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) 576 kvm_set_pfn_accessed(spte_to_pfn(old_spte)); 577 if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) 578 kvm_set_pfn_dirty(spte_to_pfn(old_spte)); 579 580 return ret; 581} 582 583/* 584 * Rules for using mmu_spte_clear_track_bits: 585 * It sets the sptep from present to nonpresent, and track the 586 * state bits, it is used to clear the last level sptep. 587 */ 588static int mmu_spte_clear_track_bits(u64 *sptep) 589{ 590 pfn_t pfn; 591 u64 old_spte = *sptep; 592 593 if (!spte_has_volatile_bits(old_spte)) 594 __update_clear_spte_fast(sptep, 0ull); 595 else 596 old_spte = __update_clear_spte_slow(sptep, 0ull); 597 598 if (!is_rmap_spte(old_spte)) 599 return 0; 600 601 pfn = spte_to_pfn(old_spte); 602 603 /* 604 * KVM does not hold the refcount of the page used by 605 * kvm mmu, before reclaiming the page, we should 606 * unmap it from mmu first. 607 */ 608 WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn))); 609 610 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) 611 kvm_set_pfn_accessed(pfn); 612 if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) 613 kvm_set_pfn_dirty(pfn); 614 return 1; 615} 616 617/* 618 * Rules for using mmu_spte_clear_no_track: 619 * Directly clear spte without caring the state bits of sptep, 620 * it is used to set the upper level spte. 621 */ 622static void mmu_spte_clear_no_track(u64 *sptep) 623{ 624 __update_clear_spte_fast(sptep, 0ull); 625} 626 627static u64 mmu_spte_get_lockless(u64 *sptep) 628{ 629 return __get_spte_lockless(sptep); 630} 631 632static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) 633{ 634 /* 635 * Prevent page table teardown by making any free-er wait during 636 * kvm_flush_remote_tlbs() IPI to all active vcpus. 637 */ 638 local_irq_disable(); 639 vcpu->mode = READING_SHADOW_PAGE_TABLES; 640 /* 641 * Make sure a following spte read is not reordered ahead of the write 642 * to vcpu->mode. 643 */ 644 smp_mb(); 645} 646 647static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) 648{ 649 /* 650 * Make sure the write to vcpu->mode is not reordered in front of 651 * reads to sptes. If it does, kvm_commit_zap_page() can see us 652 * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. 
653 */ 654 smp_mb(); 655 vcpu->mode = OUTSIDE_GUEST_MODE; 656 local_irq_enable(); 657} 658 659static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 660 struct kmem_cache *base_cache, int min) 661{ 662 void *obj; 663 664 if (cache->nobjs >= min) 665 return 0; 666 while (cache->nobjs < ARRAY_SIZE(cache->objects)) { 667 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL); 668 if (!obj) 669 return -ENOMEM; 670 cache->objects[cache->nobjs++] = obj; 671 } 672 return 0; 673} 674 675static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache) 676{ 677 return cache->nobjs; 678} 679 680static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, 681 struct kmem_cache *cache) 682{ 683 while (mc->nobjs) 684 kmem_cache_free(cache, mc->objects[--mc->nobjs]); 685} 686 687static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, 688 int min) 689{ 690 void *page; 691 692 if (cache->nobjs >= min) 693 return 0; 694 while (cache->nobjs < ARRAY_SIZE(cache->objects)) { 695 page = (void *)__get_free_page(GFP_KERNEL); 696 if (!page) 697 return -ENOMEM; 698 cache->objects[cache->nobjs++] = page; 699 } 700 return 0; 701} 702 703static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) 704{ 705 while (mc->nobjs) 706 free_page((unsigned long)mc->objects[--mc->nobjs]); 707} 708 709static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) 710{ 711 int r; 712 713 r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, 714 pte_list_desc_cache, 8 + PTE_PREFETCH_NUM); 715 if (r) 716 goto out; 717 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); 718 if (r) 719 goto out; 720 r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, 721 mmu_page_header_cache, 4); 722out: 723 return r; 724} 725 726static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) 727{ 728 mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, 729 pte_list_desc_cache); 730 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); 731 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, 732 mmu_page_header_cache); 733} 734 735static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) 736{ 737 void *p; 738 739 BUG_ON(!mc->nobjs); 740 p = mc->objects[--mc->nobjs]; 741 return p; 742} 743 744static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) 745{ 746 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); 747} 748 749static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) 750{ 751 kmem_cache_free(pte_list_desc_cache, pte_list_desc); 752} 753 754static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) 755{ 756 if (!sp->role.direct) 757 return sp->gfns[index]; 758 759 return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS)); 760} 761 762static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) 763{ 764 if (sp->role.direct) 765 BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index)); 766 else 767 sp->gfns[index] = gfn; 768} 769 770/* 771 * Return the pointer to the large page information for a given gfn, 772 * handling slots that are not large page aligned. 
773 */ 774static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, 775 struct kvm_memory_slot *slot, 776 int level) 777{ 778 unsigned long idx; 779 780 idx = gfn_to_index(gfn, slot->base_gfn, level); 781 return &slot->arch.lpage_info[level - 2][idx]; 782} 783 784static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) 785{ 786 struct kvm_memslots *slots; 787 struct kvm_memory_slot *slot; 788 struct kvm_lpage_info *linfo; 789 gfn_t gfn; 790 int i; 791 792 gfn = sp->gfn; 793 slots = kvm_memslots_for_spte_role(kvm, sp->role); 794 slot = __gfn_to_memslot(slots, gfn); 795 for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { 796 linfo = lpage_info_slot(gfn, slot, i); 797 linfo->write_count += 1; 798 } 799 kvm->arch.indirect_shadow_pages++; 800} 801 802static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) 803{ 804 struct kvm_memslots *slots; 805 struct kvm_memory_slot *slot; 806 struct kvm_lpage_info *linfo; 807 gfn_t gfn; 808 int i; 809 810 gfn = sp->gfn; 811 slots = kvm_memslots_for_spte_role(kvm, sp->role); 812 slot = __gfn_to_memslot(slots, gfn); 813 for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { 814 linfo = lpage_info_slot(gfn, slot, i); 815 linfo->write_count -= 1; 816 WARN_ON(linfo->write_count < 0); 817 } 818 kvm->arch.indirect_shadow_pages--; 819} 820 821static int __has_wrprotected_page(gfn_t gfn, int level, 822 struct kvm_memory_slot *slot) 823{ 824 struct kvm_lpage_info *linfo; 825 826 if (slot) { 827 linfo = lpage_info_slot(gfn, slot, level); 828 return linfo->write_count; 829 } 830 831 return 1; 832} 833 834static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level) 835{ 836 struct kvm_memory_slot *slot; 837 838 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 839 return __has_wrprotected_page(gfn, level, slot); 840} 841 842static int host_mapping_level(struct kvm *kvm, gfn_t gfn) 843{ 844 unsigned long page_size; 845 int i, ret = 0; 846 847 page_size = kvm_host_page_size(kvm, gfn); 848 849 for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { 850 if (page_size >= KVM_HPAGE_SIZE(i)) 851 ret = i; 852 else 853 break; 854 } 855 856 return ret; 857} 858 859static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot, 860 bool no_dirty_log) 861{ 862 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 863 return false; 864 if (no_dirty_log && slot->dirty_bitmap) 865 return false; 866 867 return true; 868} 869 870static struct kvm_memory_slot * 871gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, 872 bool no_dirty_log) 873{ 874 struct kvm_memory_slot *slot; 875 876 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 877 if (!memslot_valid_for_gpte(slot, no_dirty_log)) 878 slot = NULL; 879 880 return slot; 881} 882 883static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, 884 bool *force_pt_level) 885{ 886 int host_level, level, max_level; 887 struct kvm_memory_slot *slot; 888 889 if (unlikely(*force_pt_level)) 890 return PT_PAGE_TABLE_LEVEL; 891 892 slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn); 893 *force_pt_level = !memslot_valid_for_gpte(slot, true); 894 if (unlikely(*force_pt_level)) 895 return PT_PAGE_TABLE_LEVEL; 896 897 host_level = host_mapping_level(vcpu->kvm, large_gfn); 898 899 if (host_level == PT_PAGE_TABLE_LEVEL) 900 return host_level; 901 902 max_level = min(kvm_x86_ops->get_lpage_level(), host_level); 903 904 for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) 905 if (__has_wrprotected_page(large_gfn, level, slot)) 906 break; 907 908 return level - 1; 909} 910 
911/* 912 * Pte mapping structures: 913 * 914 * If pte_list bit zero is zero, then pte_list point to the spte. 915 * 916 * If pte_list bit zero is one, (then pte_list & ~1) points to a struct 917 * pte_list_desc containing more mappings. 918 * 919 * Returns the number of pte entries before the spte was added or zero if 920 * the spte was not added. 921 * 922 */ 923static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, 924 unsigned long *pte_list) 925{ 926 struct pte_list_desc *desc; 927 int i, count = 0; 928 929 if (!*pte_list) { 930 rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte); 931 *pte_list = (unsigned long)spte; 932 } else if (!(*pte_list & 1)) { 933 rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte); 934 desc = mmu_alloc_pte_list_desc(vcpu); 935 desc->sptes[0] = (u64 *)*pte_list; 936 desc->sptes[1] = spte; 937 *pte_list = (unsigned long)desc | 1; 938 ++count; 939 } else { 940 rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte); 941 desc = (struct pte_list_desc *)(*pte_list & ~1ul); 942 while (desc->sptes[PTE_LIST_EXT-1] && desc->more) { 943 desc = desc->more; 944 count += PTE_LIST_EXT; 945 } 946 if (desc->sptes[PTE_LIST_EXT-1]) { 947 desc->more = mmu_alloc_pte_list_desc(vcpu); 948 desc = desc->more; 949 } 950 for (i = 0; desc->sptes[i]; ++i) 951 ++count; 952 desc->sptes[i] = spte; 953 } 954 return count; 955} 956 957static void 958pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc, 959 int i, struct pte_list_desc *prev_desc) 960{ 961 int j; 962 963 for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j) 964 ; 965 desc->sptes[i] = desc->sptes[j]; 966 desc->sptes[j] = NULL; 967 if (j != 0) 968 return; 969 if (!prev_desc && !desc->more) 970 *pte_list = (unsigned long)desc->sptes[0]; 971 else 972 if (prev_desc) 973 prev_desc->more = desc->more; 974 else 975 *pte_list = (unsigned long)desc->more | 1; 976 mmu_free_pte_list_desc(desc); 977} 978 979static void pte_list_remove(u64 *spte, unsigned long *pte_list) 980{ 981 struct pte_list_desc *desc; 982 struct pte_list_desc *prev_desc; 983 int i; 984 985 if (!*pte_list) { 986 printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte); 987 BUG(); 988 } else if (!(*pte_list & 1)) { 989 rmap_printk("pte_list_remove: %p 1->0\n", spte); 990 if ((u64 *)*pte_list != spte) { 991 printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte); 992 BUG(); 993 } 994 *pte_list = 0; 995 } else { 996 rmap_printk("pte_list_remove: %p many->many\n", spte); 997 desc = (struct pte_list_desc *)(*pte_list & ~1ul); 998 prev_desc = NULL; 999 while (desc) { 1000 for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) 1001 if (desc->sptes[i] == spte) { 1002 pte_list_desc_remove_entry(pte_list, 1003 desc, i, 1004 prev_desc); 1005 return; 1006 } 1007 prev_desc = desc; 1008 desc = desc->more; 1009 } 1010 pr_err("pte_list_remove: %p many->many\n", spte); 1011 BUG(); 1012 } 1013} 1014 1015typedef void (*pte_list_walk_fn) (u64 *spte); 1016static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn) 1017{ 1018 struct pte_list_desc *desc; 1019 int i; 1020 1021 if (!*pte_list) 1022 return; 1023 1024 if (!(*pte_list & 1)) 1025 return fn((u64 *)*pte_list); 1026 1027 desc = (struct pte_list_desc *)(*pte_list & ~1ul); 1028 while (desc) { 1029 for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) 1030 fn(desc->sptes[i]); 1031 desc = desc->more; 1032 } 1033} 1034 1035static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, 1036 struct kvm_memory_slot *slot) 1037{ 1038 unsigned long idx; 1039 1040 idx = gfn_to_index(gfn, 
slot->base_gfn, level); 1041 return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx]; 1042} 1043 1044/* 1045 * Take gfn and return the reverse mapping to it. 1046 */ 1047static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, struct kvm_mmu_page *sp) 1048{ 1049 struct kvm_memslots *slots; 1050 struct kvm_memory_slot *slot; 1051 1052 slots = kvm_memslots_for_spte_role(kvm, sp->role); 1053 slot = __gfn_to_memslot(slots, gfn); 1054 return __gfn_to_rmap(gfn, sp->role.level, slot); 1055} 1056 1057static bool rmap_can_add(struct kvm_vcpu *vcpu) 1058{ 1059 struct kvm_mmu_memory_cache *cache; 1060 1061 cache = &vcpu->arch.mmu_pte_list_desc_cache; 1062 return mmu_memory_cache_free_objects(cache); 1063} 1064 1065static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) 1066{ 1067 struct kvm_mmu_page *sp; 1068 unsigned long *rmapp; 1069 1070 sp = page_header(__pa(spte)); 1071 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); 1072 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp); 1073 return pte_list_add(vcpu, spte, rmapp); 1074} 1075 1076static void rmap_remove(struct kvm *kvm, u64 *spte) 1077{ 1078 struct kvm_mmu_page *sp; 1079 gfn_t gfn; 1080 unsigned long *rmapp; 1081 1082 sp = page_header(__pa(spte)); 1083 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); 1084 rmapp = gfn_to_rmap(kvm, gfn, sp); 1085 pte_list_remove(spte, rmapp); 1086} 1087 1088/* 1089 * Used by the following functions to iterate through the sptes linked by a 1090 * rmap. All fields are private and not assumed to be used outside. 1091 */ 1092struct rmap_iterator { 1093 /* private fields */ 1094 struct pte_list_desc *desc; /* holds the sptep if not NULL */ 1095 int pos; /* index of the sptep */ 1096}; 1097 1098/* 1099 * Iteration must be started by this function. This should also be used after 1100 * removing/dropping sptes from the rmap link because in such cases the 1101 * information in the itererator may not be valid. 1102 * 1103 * Returns sptep if found, NULL otherwise. 1104 */ 1105static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter) 1106{ 1107 if (!rmap) 1108 return NULL; 1109 1110 if (!(rmap & 1)) { 1111 iter->desc = NULL; 1112 return (u64 *)rmap; 1113 } 1114 1115 iter->desc = (struct pte_list_desc *)(rmap & ~1ul); 1116 iter->pos = 0; 1117 return iter->desc->sptes[iter->pos]; 1118} 1119 1120/* 1121 * Must be used with a valid iterator: e.g. after rmap_get_first(). 1122 * 1123 * Returns sptep if found, NULL otherwise. 
1124 */ 1125static u64 *rmap_get_next(struct rmap_iterator *iter) 1126{ 1127 if (iter->desc) { 1128 if (iter->pos < PTE_LIST_EXT - 1) { 1129 u64 *sptep; 1130 1131 ++iter->pos; 1132 sptep = iter->desc->sptes[iter->pos]; 1133 if (sptep) 1134 return sptep; 1135 } 1136 1137 iter->desc = iter->desc->more; 1138 1139 if (iter->desc) { 1140 iter->pos = 0; 1141 /* desc->sptes[0] cannot be NULL */ 1142 return iter->desc->sptes[iter->pos]; 1143 } 1144 } 1145 1146 return NULL; 1147} 1148 1149#define for_each_rmap_spte(_rmap_, _iter_, _spte_) \ 1150 for (_spte_ = rmap_get_first(*_rmap_, _iter_); \ 1151 _spte_ && ({BUG_ON(!is_shadow_present_pte(*_spte_)); 1;}); \ 1152 _spte_ = rmap_get_next(_iter_)) 1153 1154static void drop_spte(struct kvm *kvm, u64 *sptep) 1155{ 1156 if (mmu_spte_clear_track_bits(sptep)) 1157 rmap_remove(kvm, sptep); 1158} 1159 1160 1161static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) 1162{ 1163 if (is_large_pte(*sptep)) { 1164 WARN_ON(page_header(__pa(sptep))->role.level == 1165 PT_PAGE_TABLE_LEVEL); 1166 drop_spte(kvm, sptep); 1167 --kvm->stat.lpages; 1168 return true; 1169 } 1170 1171 return false; 1172} 1173 1174static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) 1175{ 1176 if (__drop_large_spte(vcpu->kvm, sptep)) 1177 kvm_flush_remote_tlbs(vcpu->kvm); 1178} 1179 1180/* 1181 * Write-protect on the specified @sptep, @pt_protect indicates whether 1182 * spte write-protection is caused by protecting shadow page table. 1183 * 1184 * Note: write protection is difference between dirty logging and spte 1185 * protection: 1186 * - for dirty logging, the spte can be set to writable at anytime if 1187 * its dirty bitmap is properly set. 1188 * - for spte protection, the spte can be writable only after unsync-ing 1189 * shadow page. 1190 * 1191 * Return true if tlb need be flushed. 
1192 */ 1193static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool pt_protect) 1194{ 1195 u64 spte = *sptep; 1196 1197 if (!is_writable_pte(spte) && 1198 !(pt_protect && spte_is_locklessly_modifiable(spte))) 1199 return false; 1200 1201 rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); 1202 1203 if (pt_protect) 1204 spte &= ~SPTE_MMU_WRITEABLE; 1205 spte = spte & ~PT_WRITABLE_MASK; 1206 1207 return mmu_spte_update(sptep, spte); 1208} 1209 1210static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, 1211 bool pt_protect) 1212{ 1213 u64 *sptep; 1214 struct rmap_iterator iter; 1215 bool flush = false; 1216 1217 for_each_rmap_spte(rmapp, &iter, sptep) 1218 flush |= spte_write_protect(kvm, sptep, pt_protect); 1219 1220 return flush; 1221} 1222 1223static bool spte_clear_dirty(struct kvm *kvm, u64 *sptep) 1224{ 1225 u64 spte = *sptep; 1226 1227 rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep); 1228 1229 spte &= ~shadow_dirty_mask; 1230 1231 return mmu_spte_update(sptep, spte); 1232} 1233 1234static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp) 1235{ 1236 u64 *sptep; 1237 struct rmap_iterator iter; 1238 bool flush = false; 1239 1240 for_each_rmap_spte(rmapp, &iter, sptep) 1241 flush |= spte_clear_dirty(kvm, sptep); 1242 1243 return flush; 1244} 1245 1246static bool spte_set_dirty(struct kvm *kvm, u64 *sptep) 1247{ 1248 u64 spte = *sptep; 1249 1250 rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep); 1251 1252 spte |= shadow_dirty_mask; 1253 1254 return mmu_spte_update(sptep, spte); 1255} 1256 1257static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp) 1258{ 1259 u64 *sptep; 1260 struct rmap_iterator iter; 1261 bool flush = false; 1262 1263 for_each_rmap_spte(rmapp, &iter, sptep) 1264 flush |= spte_set_dirty(kvm, sptep); 1265 1266 return flush; 1267} 1268 1269/** 1270 * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages 1271 * @kvm: kvm instance 1272 * @slot: slot to protect 1273 * @gfn_offset: start of the BITS_PER_LONG pages we care about 1274 * @mask: indicates which pages we should protect 1275 * 1276 * Used when we do not need to care about huge page mappings: e.g. during dirty 1277 * logging we do not have any such mappings. 1278 */ 1279static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, 1280 struct kvm_memory_slot *slot, 1281 gfn_t gfn_offset, unsigned long mask) 1282{ 1283 unsigned long *rmapp; 1284 1285 while (mask) { 1286 rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), 1287 PT_PAGE_TABLE_LEVEL, slot); 1288 __rmap_write_protect(kvm, rmapp, false); 1289 1290 /* clear the first set bit */ 1291 mask &= mask - 1; 1292 } 1293} 1294 1295/** 1296 * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages 1297 * @kvm: kvm instance 1298 * @slot: slot to clear D-bit 1299 * @gfn_offset: start of the BITS_PER_LONG pages we care about 1300 * @mask: indicates which pages we should clear D-bit 1301 * 1302 * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap. 
1303 */ 1304void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, 1305 struct kvm_memory_slot *slot, 1306 gfn_t gfn_offset, unsigned long mask) 1307{ 1308 unsigned long *rmapp; 1309 1310 while (mask) { 1311 rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), 1312 PT_PAGE_TABLE_LEVEL, slot); 1313 __rmap_clear_dirty(kvm, rmapp); 1314 1315 /* clear the first set bit */ 1316 mask &= mask - 1; 1317 } 1318} 1319EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked); 1320 1321/** 1322 * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected 1323 * PT level pages. 1324 * 1325 * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to 1326 * enable dirty logging for them. 1327 * 1328 * Used when we do not need to care about huge page mappings: e.g. during dirty 1329 * logging we do not have any such mappings. 1330 */ 1331void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, 1332 struct kvm_memory_slot *slot, 1333 gfn_t gfn_offset, unsigned long mask) 1334{ 1335 if (kvm_x86_ops->enable_log_dirty_pt_masked) 1336 kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset, 1337 mask); 1338 else 1339 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); 1340} 1341 1342static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) 1343{ 1344 struct kvm_memory_slot *slot; 1345 unsigned long *rmapp; 1346 int i; 1347 bool write_protected = false; 1348 1349 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1350 1351 for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { 1352 rmapp = __gfn_to_rmap(gfn, i, slot); 1353 write_protected |= __rmap_write_protect(vcpu->kvm, rmapp, true); 1354 } 1355 1356 return write_protected; 1357} 1358 1359static bool kvm_zap_rmapp(struct kvm *kvm, unsigned long *rmapp) 1360{ 1361 u64 *sptep; 1362 struct rmap_iterator iter; 1363 bool flush = false; 1364 1365 while ((sptep = rmap_get_first(*rmapp, &iter))) { 1366 BUG_ON(!(*sptep & PT_PRESENT_MASK)); 1367 rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep); 1368 1369 drop_spte(kvm, sptep); 1370 flush = true; 1371 } 1372 1373 return flush; 1374} 1375 1376static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, 1377 struct kvm_memory_slot *slot, gfn_t gfn, int level, 1378 unsigned long data) 1379{ 1380 return kvm_zap_rmapp(kvm, rmapp); 1381} 1382 1383static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, 1384 struct kvm_memory_slot *slot, gfn_t gfn, int level, 1385 unsigned long data) 1386{ 1387 u64 *sptep; 1388 struct rmap_iterator iter; 1389 int need_flush = 0; 1390 u64 new_spte; 1391 pte_t *ptep = (pte_t *)data; 1392 pfn_t new_pfn; 1393 1394 WARN_ON(pte_huge(*ptep)); 1395 new_pfn = pte_pfn(*ptep); 1396 1397restart: 1398 for_each_rmap_spte(rmapp, &iter, sptep) { 1399 rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n", 1400 sptep, *sptep, gfn, level); 1401 1402 need_flush = 1; 1403 1404 if (pte_write(*ptep)) { 1405 drop_spte(kvm, sptep); 1406 goto restart; 1407 } else { 1408 new_spte = *sptep & ~PT64_BASE_ADDR_MASK; 1409 new_spte |= (u64)new_pfn << PAGE_SHIFT; 1410 1411 new_spte &= ~PT_WRITABLE_MASK; 1412 new_spte &= ~SPTE_HOST_WRITEABLE; 1413 new_spte &= ~shadow_accessed_mask; 1414 1415 mmu_spte_clear_track_bits(sptep); 1416 mmu_spte_set(sptep, new_spte); 1417 } 1418 } 1419 1420 if (need_flush) 1421 kvm_flush_remote_tlbs(kvm); 1422 1423 return 0; 1424} 1425 1426struct slot_rmap_walk_iterator { 1427 /* input fields. 
*/ 1428 struct kvm_memory_slot *slot; 1429 gfn_t start_gfn; 1430 gfn_t end_gfn; 1431 int start_level; 1432 int end_level; 1433 1434 /* output fields. */ 1435 gfn_t gfn; 1436 unsigned long *rmap; 1437 int level; 1438 1439 /* private field. */ 1440 unsigned long *end_rmap; 1441}; 1442 1443static void 1444rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level) 1445{ 1446 iterator->level = level; 1447 iterator->gfn = iterator->start_gfn; 1448 iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot); 1449 iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level, 1450 iterator->slot); 1451} 1452 1453static void 1454slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator, 1455 struct kvm_memory_slot *slot, int start_level, 1456 int end_level, gfn_t start_gfn, gfn_t end_gfn) 1457{ 1458 iterator->slot = slot; 1459 iterator->start_level = start_level; 1460 iterator->end_level = end_level; 1461 iterator->start_gfn = start_gfn; 1462 iterator->end_gfn = end_gfn; 1463 1464 rmap_walk_init_level(iterator, iterator->start_level); 1465} 1466 1467static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator) 1468{ 1469 return !!iterator->rmap; 1470} 1471 1472static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) 1473{ 1474 if (++iterator->rmap <= iterator->end_rmap) { 1475 iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level)); 1476 return; 1477 } 1478 1479 if (++iterator->level > iterator->end_level) { 1480 iterator->rmap = NULL; 1481 return; 1482 } 1483 1484 rmap_walk_init_level(iterator, iterator->level); 1485} 1486 1487#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \ 1488 _start_gfn, _end_gfn, _iter_) \ 1489 for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \ 1490 _end_level_, _start_gfn, _end_gfn); \ 1491 slot_rmap_walk_okay(_iter_); \ 1492 slot_rmap_walk_next(_iter_)) 1493 1494static int kvm_handle_hva_range(struct kvm *kvm, 1495 unsigned long start, 1496 unsigned long end, 1497 unsigned long data, 1498 int (*handler)(struct kvm *kvm, 1499 unsigned long *rmapp, 1500 struct kvm_memory_slot *slot, 1501 gfn_t gfn, 1502 int level, 1503 unsigned long data)) 1504{ 1505 struct kvm_memslots *slots; 1506 struct kvm_memory_slot *memslot; 1507 struct slot_rmap_walk_iterator iterator; 1508 int ret = 0; 1509 int i; 1510 1511 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 1512 slots = __kvm_memslots(kvm, i); 1513 kvm_for_each_memslot(memslot, slots) { 1514 unsigned long hva_start, hva_end; 1515 gfn_t gfn_start, gfn_end; 1516 1517 hva_start = max(start, memslot->userspace_addr); 1518 hva_end = min(end, memslot->userspace_addr + 1519 (memslot->npages << PAGE_SHIFT)); 1520 if (hva_start >= hva_end) 1521 continue; 1522 /* 1523 * {gfn(page) | page intersects with [hva_start, hva_end)} = 1524 * {gfn_start, gfn_start+1, ..., gfn_end-1}. 
1525 */ 1526 gfn_start = hva_to_gfn_memslot(hva_start, memslot); 1527 gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); 1528 1529 for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL, 1530 PT_MAX_HUGEPAGE_LEVEL, 1531 gfn_start, gfn_end - 1, 1532 &iterator) 1533 ret |= handler(kvm, iterator.rmap, memslot, 1534 iterator.gfn, iterator.level, data); 1535 } 1536 } 1537 1538 return ret; 1539} 1540 1541static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, 1542 unsigned long data, 1543 int (*handler)(struct kvm *kvm, unsigned long *rmapp, 1544 struct kvm_memory_slot *slot, 1545 gfn_t gfn, int level, 1546 unsigned long data)) 1547{ 1548 return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler); 1549} 1550 1551int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) 1552{ 1553 return kvm_handle_hva(kvm, hva, 0, kvm_unmap_rmapp); 1554} 1555 1556int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) 1557{ 1558 return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); 1559} 1560 1561void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) 1562{ 1563 kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp); 1564} 1565 1566static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, 1567 struct kvm_memory_slot *slot, gfn_t gfn, int level, 1568 unsigned long data) 1569{ 1570 u64 *sptep; 1571 struct rmap_iterator uninitialized_var(iter); 1572 int young = 0; 1573 1574 BUG_ON(!shadow_accessed_mask); 1575 1576 for_each_rmap_spte(rmapp, &iter, sptep) 1577 if (*sptep & shadow_accessed_mask) { 1578 young = 1; 1579 clear_bit((ffs(shadow_accessed_mask) - 1), 1580 (unsigned long *)sptep); 1581 } 1582 1583 trace_kvm_age_page(gfn, level, slot, young); 1584 return young; 1585} 1586 1587static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, 1588 struct kvm_memory_slot *slot, gfn_t gfn, 1589 int level, unsigned long data) 1590{ 1591 u64 *sptep; 1592 struct rmap_iterator iter; 1593 int young = 0; 1594 1595 /* 1596 * If there's no access bit in the secondary pte set by the 1597 * hardware it's up to gup-fast/gup to set the access bit in 1598 * the primary pte or in the page structure. 1599 */ 1600 if (!shadow_accessed_mask) 1601 goto out; 1602 1603 for_each_rmap_spte(rmapp, &iter, sptep) 1604 if (*sptep & shadow_accessed_mask) { 1605 young = 1; 1606 break; 1607 } 1608out: 1609 return young; 1610} 1611 1612#define RMAP_RECYCLE_THRESHOLD 1000 1613 1614static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) 1615{ 1616 unsigned long *rmapp; 1617 struct kvm_mmu_page *sp; 1618 1619 sp = page_header(__pa(spte)); 1620 1621 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp); 1622 1623 kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, gfn, sp->role.level, 0); 1624 kvm_flush_remote_tlbs(vcpu->kvm); 1625} 1626 1627int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) 1628{ 1629 /* 1630 * In case of absence of EPT Access and Dirty Bits supports, 1631 * emulate the accessed bit for EPT, by checking if this page has 1632 * an EPT mapping, and clearing it if it does. On the next access, 1633 * a new EPT mapping will be established. 1634 * This has some overhead, but not as much as the cost of swapping 1635 * out actively used pages or breaking up actively used hugepages. 1636 */ 1637 if (!shadow_accessed_mask) { 1638 /* 1639 * We are holding the kvm->mmu_lock, and we are blowing up 1640 * shadow PTEs. MMU notifier consumers need to be kept at bay. 
1641 * This is correct as long as we don't decouple the mmu_lock 1642 * protected regions (like invalidate_range_start|end does). 1643 */ 1644 kvm->mmu_notifier_seq++; 1645 return kvm_handle_hva_range(kvm, start, end, 0, 1646 kvm_unmap_rmapp); 1647 } 1648 1649 return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); 1650} 1651 1652int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) 1653{ 1654 return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); 1655} 1656 1657#ifdef MMU_DEBUG 1658static int is_empty_shadow_page(u64 *spt) 1659{ 1660 u64 *pos; 1661 u64 *end; 1662 1663 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) 1664 if (is_shadow_present_pte(*pos)) { 1665 printk(KERN_ERR "%s: %p %llx\n", __func__, 1666 pos, *pos); 1667 return 0; 1668 } 1669 return 1; 1670} 1671#endif 1672 1673/* 1674 * This value is the sum of all of the kvm instances's 1675 * kvm->arch.n_used_mmu_pages values. We need a global, 1676 * aggregate version in order to make the slab shrinker 1677 * faster 1678 */ 1679static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) 1680{ 1681 kvm->arch.n_used_mmu_pages += nr; 1682 percpu_counter_add(&kvm_total_used_mmu_pages, nr); 1683} 1684 1685static void kvm_mmu_free_page(struct kvm_mmu_page *sp) 1686{ 1687 MMU_WARN_ON(!is_empty_shadow_page(sp->spt)); 1688 hlist_del(&sp->hash_link); 1689 list_del(&sp->link); 1690 free_page((unsigned long)sp->spt); 1691 if (!sp->role.direct) 1692 free_page((unsigned long)sp->gfns); 1693 kmem_cache_free(mmu_page_header_cache, sp); 1694} 1695 1696static unsigned kvm_page_table_hashfn(gfn_t gfn) 1697{ 1698 return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); 1699} 1700 1701static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu, 1702 struct kvm_mmu_page *sp, u64 *parent_pte) 1703{ 1704 if (!parent_pte) 1705 return; 1706 1707 pte_list_add(vcpu, parent_pte, &sp->parent_ptes); 1708} 1709 1710static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, 1711 u64 *parent_pte) 1712{ 1713 pte_list_remove(parent_pte, &sp->parent_ptes); 1714} 1715 1716static void drop_parent_pte(struct kvm_mmu_page *sp, 1717 u64 *parent_pte) 1718{ 1719 mmu_page_remove_parent_pte(sp, parent_pte); 1720 mmu_spte_clear_no_track(parent_pte); 1721} 1722 1723static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, 1724 u64 *parent_pte, int direct) 1725{ 1726 struct kvm_mmu_page *sp; 1727 1728 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); 1729 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); 1730 if (!direct) 1731 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); 1732 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 1733 1734 /* 1735 * The active_mmu_pages list is the FIFO list, do not move the 1736 * page until it is zapped. kvm_zap_obsolete_pages depends on 1737 * this feature. See the comments in kvm_zap_obsolete_pages(). 
1738 */ 1739 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 1740 sp->parent_ptes = 0; 1741 mmu_page_add_parent_pte(vcpu, sp, parent_pte); 1742 kvm_mod_used_mmu_pages(vcpu->kvm, +1); 1743 return sp; 1744} 1745 1746static void mark_unsync(u64 *spte); 1747static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) 1748{ 1749 pte_list_walk(&sp->parent_ptes, mark_unsync); 1750} 1751 1752static void mark_unsync(u64 *spte) 1753{ 1754 struct kvm_mmu_page *sp; 1755 unsigned int index; 1756 1757 sp = page_header(__pa(spte)); 1758 index = spte - sp->spt; 1759 if (__test_and_set_bit(index, sp->unsync_child_bitmap)) 1760 return; 1761 if (sp->unsync_children++) 1762 return; 1763 kvm_mmu_mark_parents_unsync(sp); 1764} 1765 1766static int nonpaging_sync_page(struct kvm_vcpu *vcpu, 1767 struct kvm_mmu_page *sp) 1768{ 1769 return 1; 1770} 1771 1772static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) 1773{ 1774} 1775 1776static void nonpaging_update_pte(struct kvm_vcpu *vcpu, 1777 struct kvm_mmu_page *sp, u64 *spte, 1778 const void *pte) 1779{ 1780 WARN_ON(1); 1781} 1782 1783#define KVM_PAGE_ARRAY_NR 16 1784 1785struct kvm_mmu_pages { 1786 struct mmu_page_and_offset { 1787 struct kvm_mmu_page *sp; 1788 unsigned int idx; 1789 } page[KVM_PAGE_ARRAY_NR]; 1790 unsigned int nr; 1791}; 1792 1793static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, 1794 int idx) 1795{ 1796 int i; 1797 1798 if (sp->unsync) 1799 for (i=0; i < pvec->nr; i++) 1800 if (pvec->page[i].sp == sp) 1801 return 0; 1802 1803 pvec->page[pvec->nr].sp = sp; 1804 pvec->page[pvec->nr].idx = idx; 1805 pvec->nr++; 1806 return (pvec->nr == KVM_PAGE_ARRAY_NR); 1807} 1808 1809static int __mmu_unsync_walk(struct kvm_mmu_page *sp, 1810 struct kvm_mmu_pages *pvec) 1811{ 1812 int i, ret, nr_unsync_leaf = 0; 1813 1814 for_each_set_bit(i, sp->unsync_child_bitmap, 512) { 1815 struct kvm_mmu_page *child; 1816 u64 ent = sp->spt[i]; 1817 1818 if (!is_shadow_present_pte(ent) || is_large_pte(ent)) 1819 goto clear_child_bitmap; 1820 1821 child = page_header(ent & PT64_BASE_ADDR_MASK); 1822 1823 if (child->unsync_children) { 1824 if (mmu_pages_add(pvec, child, i)) 1825 return -ENOSPC; 1826 1827 ret = __mmu_unsync_walk(child, pvec); 1828 if (!ret) 1829 goto clear_child_bitmap; 1830 else if (ret > 0) 1831 nr_unsync_leaf += ret; 1832 else 1833 return ret; 1834 } else if (child->unsync) { 1835 nr_unsync_leaf++; 1836 if (mmu_pages_add(pvec, child, i)) 1837 return -ENOSPC; 1838 } else 1839 goto clear_child_bitmap; 1840 1841 continue; 1842 1843clear_child_bitmap: 1844 __clear_bit(i, sp->unsync_child_bitmap); 1845 sp->unsync_children--; 1846 WARN_ON((int)sp->unsync_children < 0); 1847 } 1848 1849 1850 return nr_unsync_leaf; 1851} 1852 1853static int mmu_unsync_walk(struct kvm_mmu_page *sp, 1854 struct kvm_mmu_pages *pvec) 1855{ 1856 if (!sp->unsync_children) 1857 return 0; 1858 1859 mmu_pages_add(pvec, sp, 0); 1860 return __mmu_unsync_walk(sp, pvec); 1861} 1862 1863static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1864{ 1865 WARN_ON(!sp->unsync); 1866 trace_kvm_mmu_sync_page(sp); 1867 sp->unsync = 0; 1868 --kvm->stat.mmu_unsync; 1869} 1870 1871static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, 1872 struct list_head *invalid_list); 1873static void kvm_mmu_commit_zap_page(struct kvm *kvm, 1874 struct list_head *invalid_list); 1875 1876/* 1877 * NOTE: we should pay more attention on the zapped-obsolete page 1878 * (is_obsolete_sp(sp) && sp->role.invalid) when you do hash list 
walk 1879 * since it has been deleted from active_mmu_pages but still can be found 1880 * at hast list. 1881 * 1882 * for_each_gfn_indirect_valid_sp has skipped that kind of page and 1883 * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), has skipped 1884 * all the obsolete pages. 1885 */ 1886#define for_each_gfn_sp(_kvm, _sp, _gfn) \ 1887 hlist_for_each_entry(_sp, \ 1888 &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ 1889 if ((_sp)->gfn != (_gfn)) {} else 1890 1891#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ 1892 for_each_gfn_sp(_kvm, _sp, _gfn) \ 1893 if ((_sp)->role.direct || (_sp)->role.invalid) {} else 1894 1895/* @sp->gfn should be write-protected at the call site */ 1896static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 1897 struct list_head *invalid_list, bool clear_unsync) 1898{ 1899 if (sp->role.cr4_pae != !!is_pae(vcpu)) { 1900 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); 1901 return 1; 1902 } 1903 1904 if (clear_unsync) 1905 kvm_unlink_unsync_page(vcpu->kvm, sp); 1906 1907 if (vcpu->arch.mmu.sync_page(vcpu, sp)) { 1908 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); 1909 return 1; 1910 } 1911 1912 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 1913 return 0; 1914} 1915 1916static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, 1917 struct kvm_mmu_page *sp) 1918{ 1919 LIST_HEAD(invalid_list); 1920 int ret; 1921 1922 ret = __kvm_sync_page(vcpu, sp, &invalid_list, false); 1923 if (ret) 1924 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 1925 1926 return ret; 1927} 1928 1929#ifdef CONFIG_KVM_MMU_AUDIT 1930#include "mmu_audit.c" 1931#else 1932static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { } 1933static void mmu_audit_disable(void) { } 1934#endif 1935 1936static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 1937 struct list_head *invalid_list) 1938{ 1939 return __kvm_sync_page(vcpu, sp, invalid_list, true); 1940} 1941 1942/* @gfn should be write-protected at the call site */ 1943static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) 1944{ 1945 struct kvm_mmu_page *s; 1946 LIST_HEAD(invalid_list); 1947 bool flush = false; 1948 1949 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { 1950 if (!s->unsync) 1951 continue; 1952 1953 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); 1954 kvm_unlink_unsync_page(vcpu->kvm, s); 1955 if ((s->role.cr4_pae != !!is_pae(vcpu)) || 1956 (vcpu->arch.mmu.sync_page(vcpu, s))) { 1957 kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); 1958 continue; 1959 } 1960 flush = true; 1961 } 1962 1963 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 1964 if (flush) 1965 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 1966} 1967 1968struct mmu_page_path { 1969 struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; 1970 unsigned int idx[PT64_ROOT_LEVEL-1]; 1971}; 1972 1973#define for_each_sp(pvec, sp, parents, i) \ 1974 for (i = mmu_pages_next(&pvec, &parents, -1), \ 1975 sp = pvec.page[i].sp; \ 1976 i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \ 1977 i = mmu_pages_next(&pvec, &parents, i)) 1978 1979static int mmu_pages_next(struct kvm_mmu_pages *pvec, 1980 struct mmu_page_path *parents, 1981 int i) 1982{ 1983 int n; 1984 1985 for (n = i+1; n < pvec->nr; n++) { 1986 struct kvm_mmu_page *sp = pvec->page[n].sp; 1987 1988 if (sp->role.level == PT_PAGE_TABLE_LEVEL) { 1989 parents->idx[0] = pvec->page[n].idx; 1990 return n; 1991 } 1992 1993 parents->parent[sp->role.level-2] = sp; 1994 parents->idx[sp->role.level-1] = pvec->page[n].idx; 1995 } 1996 1997 
return n; 1998} 1999 2000static void mmu_pages_clear_parents(struct mmu_page_path *parents) 2001{ 2002 struct kvm_mmu_page *sp; 2003 unsigned int level = 0; 2004 2005 do { 2006 unsigned int idx = parents->idx[level]; 2007 2008 sp = parents->parent[level]; 2009 if (!sp) 2010 return; 2011 2012 --sp->unsync_children; 2013 WARN_ON((int)sp->unsync_children < 0); 2014 __clear_bit(idx, sp->unsync_child_bitmap); 2015 level++; 2016 } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children); 2017} 2018 2019static void kvm_mmu_pages_init(struct kvm_mmu_page *parent, 2020 struct mmu_page_path *parents, 2021 struct kvm_mmu_pages *pvec) 2022{ 2023 parents->parent[parent->role.level-1] = NULL; 2024 pvec->nr = 0; 2025} 2026 2027static void mmu_sync_children(struct kvm_vcpu *vcpu, 2028 struct kvm_mmu_page *parent) 2029{ 2030 int i; 2031 struct kvm_mmu_page *sp; 2032 struct mmu_page_path parents; 2033 struct kvm_mmu_pages pages; 2034 LIST_HEAD(invalid_list); 2035 2036 kvm_mmu_pages_init(parent, &parents, &pages); 2037 while (mmu_unsync_walk(parent, &pages)) { 2038 bool protected = false; 2039 2040 for_each_sp(pages, sp, parents, i) 2041 protected |= rmap_write_protect(vcpu, sp->gfn); 2042 2043 if (protected) 2044 kvm_flush_remote_tlbs(vcpu->kvm); 2045 2046 for_each_sp(pages, sp, parents, i) { 2047 kvm_sync_page(vcpu, sp, &invalid_list); 2048 mmu_pages_clear_parents(&parents); 2049 } 2050 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 2051 cond_resched_lock(&vcpu->kvm->mmu_lock); 2052 kvm_mmu_pages_init(parent, &parents, &pages); 2053 } 2054} 2055 2056static void init_shadow_page_table(struct kvm_mmu_page *sp) 2057{ 2058 int i; 2059 2060 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 2061 sp->spt[i] = 0ull; 2062} 2063 2064static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) 2065{ 2066 sp->write_flooding_count = 0; 2067} 2068 2069static void clear_sp_write_flooding_count(u64 *spte) 2070{ 2071 struct kvm_mmu_page *sp = page_header(__pa(spte)); 2072 2073 __clear_sp_write_flooding_count(sp); 2074} 2075 2076static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) 2077{ 2078 return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); 2079} 2080 2081static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, 2082 gfn_t gfn, 2083 gva_t gaddr, 2084 unsigned level, 2085 int direct, 2086 unsigned access, 2087 u64 *parent_pte) 2088{ 2089 union kvm_mmu_page_role role; 2090 unsigned quadrant; 2091 struct kvm_mmu_page *sp; 2092 bool need_sync = false; 2093 2094 role = vcpu->arch.mmu.base_role; 2095 role.level = level; 2096 role.direct = direct; 2097 if (role.direct) 2098 role.cr4_pae = 0; 2099 role.access = access; 2100 if (!vcpu->arch.mmu.direct_map 2101 && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { 2102 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); 2103 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; 2104 role.quadrant = quadrant; 2105 } 2106 for_each_gfn_sp(vcpu->kvm, sp, gfn) { 2107 if (is_obsolete_sp(vcpu->kvm, sp)) 2108 continue; 2109 2110 if (!need_sync && sp->unsync) 2111 need_sync = true; 2112 2113 if (sp->role.word != role.word) 2114 continue; 2115 2116 if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) 2117 break; 2118 2119 mmu_page_add_parent_pte(vcpu, sp, parent_pte); 2120 if (sp->unsync_children) { 2121 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); 2122 kvm_mmu_mark_parents_unsync(sp); 2123 } else if (sp->unsync) 2124 kvm_mmu_mark_parents_unsync(sp); 2125 2126 __clear_sp_write_flooding_count(sp); 2127 trace_kvm_mmu_get_page(sp, false); 2128 
return sp; 2129 } 2130 ++vcpu->kvm->stat.mmu_cache_miss; 2131 sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct); 2132 if (!sp) 2133 return sp; 2134 sp->gfn = gfn; 2135 sp->role = role; 2136 hlist_add_head(&sp->hash_link, 2137 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); 2138 if (!direct) { 2139 if (rmap_write_protect(vcpu, gfn)) 2140 kvm_flush_remote_tlbs(vcpu->kvm); 2141 if (level > PT_PAGE_TABLE_LEVEL && need_sync) 2142 kvm_sync_pages(vcpu, gfn); 2143 2144 account_shadowed(vcpu->kvm, sp); 2145 } 2146 sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; 2147 init_shadow_page_table(sp); 2148 trace_kvm_mmu_get_page(sp, true); 2149 return sp; 2150} 2151 2152static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, 2153 struct kvm_vcpu *vcpu, u64 addr) 2154{ 2155 iterator->addr = addr; 2156 iterator->shadow_addr = vcpu->arch.mmu.root_hpa; 2157 iterator->level = vcpu->arch.mmu.shadow_root_level; 2158 2159 if (iterator->level == PT64_ROOT_LEVEL && 2160 vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL && 2161 !vcpu->arch.mmu.direct_map) 2162 --iterator->level; 2163 2164 if (iterator->level == PT32E_ROOT_LEVEL) { 2165 iterator->shadow_addr 2166 = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; 2167 iterator->shadow_addr &= PT64_BASE_ADDR_MASK; 2168 --iterator->level; 2169 if (!iterator->shadow_addr) 2170 iterator->level = 0; 2171 } 2172} 2173 2174static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) 2175{ 2176 if (iterator->level < PT_PAGE_TABLE_LEVEL) 2177 return false; 2178 2179 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); 2180 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index; 2181 return true; 2182} 2183 2184static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator, 2185 u64 spte) 2186{ 2187 if (is_last_spte(spte, iterator->level)) { 2188 iterator->level = 0; 2189 return; 2190 } 2191 2192 iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK; 2193 --iterator->level; 2194} 2195 2196static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator) 2197{ 2198 return __shadow_walk_next(iterator, *iterator->sptep); 2199} 2200 2201static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp, bool accessed) 2202{ 2203 u64 spte; 2204 2205 BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK || 2206 VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); 2207 2208 spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK | 2209 shadow_user_mask | shadow_x_mask; 2210 2211 if (accessed) 2212 spte |= shadow_accessed_mask; 2213 2214 mmu_spte_set(sptep, spte); 2215} 2216 2217static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, 2218 unsigned direct_access) 2219{ 2220 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { 2221 struct kvm_mmu_page *child; 2222 2223 /* 2224 * For the direct sp, if the guest pte's dirty bit 2225 * changed form clean to dirty, it will corrupt the 2226 * sp's access: allow writable in the read-only sp, 2227 * so we should update the spte at this point to get 2228 * a new sp with the correct access. 
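		 *
		 * [Added illustrative example, not part of the original
		 *  comment: suppose the direct sp below was built while the
		 *  guest pte was clean, so child->role.access lacks
		 *  ACC_WRITE_MASK.  Once the guest pte is dirtied,
		 *  direct_access includes ACC_WRITE_MASK, the comparison
		 *  fails, and drop_parent_pte() detaches the stale child so
		 *  that a correctly-permissioned sp is built on the next
		 *  fault.]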
2229 */ 2230 child = page_header(*sptep & PT64_BASE_ADDR_MASK); 2231 if (child->role.access == direct_access) 2232 return; 2233 2234 drop_parent_pte(child, sptep); 2235 kvm_flush_remote_tlbs(vcpu->kvm); 2236 } 2237} 2238 2239static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, 2240 u64 *spte) 2241{ 2242 u64 pte; 2243 struct kvm_mmu_page *child; 2244 2245 pte = *spte; 2246 if (is_shadow_present_pte(pte)) { 2247 if (is_last_spte(pte, sp->role.level)) { 2248 drop_spte(kvm, spte); 2249 if (is_large_pte(pte)) 2250 --kvm->stat.lpages; 2251 } else { 2252 child = page_header(pte & PT64_BASE_ADDR_MASK); 2253 drop_parent_pte(child, spte); 2254 } 2255 return true; 2256 } 2257 2258 if (is_mmio_spte(pte)) 2259 mmu_spte_clear_no_track(spte); 2260 2261 return false; 2262} 2263 2264static void kvm_mmu_page_unlink_children(struct kvm *kvm, 2265 struct kvm_mmu_page *sp) 2266{ 2267 unsigned i; 2268 2269 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 2270 mmu_page_zap_pte(kvm, sp, sp->spt + i); 2271} 2272 2273static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) 2274{ 2275 mmu_page_remove_parent_pte(sp, parent_pte); 2276} 2277 2278static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) 2279{ 2280 u64 *sptep; 2281 struct rmap_iterator iter; 2282 2283 while ((sptep = rmap_get_first(sp->parent_ptes, &iter))) 2284 drop_parent_pte(sp, sptep); 2285} 2286 2287static int mmu_zap_unsync_children(struct kvm *kvm, 2288 struct kvm_mmu_page *parent, 2289 struct list_head *invalid_list) 2290{ 2291 int i, zapped = 0; 2292 struct mmu_page_path parents; 2293 struct kvm_mmu_pages pages; 2294 2295 if (parent->role.level == PT_PAGE_TABLE_LEVEL) 2296 return 0; 2297 2298 kvm_mmu_pages_init(parent, &parents, &pages); 2299 while (mmu_unsync_walk(parent, &pages)) { 2300 struct kvm_mmu_page *sp; 2301 2302 for_each_sp(pages, sp, parents, i) { 2303 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); 2304 mmu_pages_clear_parents(&parents); 2305 zapped++; 2306 } 2307 kvm_mmu_pages_init(parent, &parents, &pages); 2308 } 2309 2310 return zapped; 2311} 2312 2313static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, 2314 struct list_head *invalid_list) 2315{ 2316 int ret; 2317 2318 trace_kvm_mmu_prepare_zap_page(sp); 2319 ++kvm->stat.mmu_shadow_zapped; 2320 ret = mmu_zap_unsync_children(kvm, sp, invalid_list); 2321 kvm_mmu_page_unlink_children(kvm, sp); 2322 kvm_mmu_unlink_parents(kvm, sp); 2323 2324 if (!sp->role.invalid && !sp->role.direct) 2325 unaccount_shadowed(kvm, sp); 2326 2327 if (sp->unsync) 2328 kvm_unlink_unsync_page(kvm, sp); 2329 if (!sp->root_count) { 2330 /* Count self */ 2331 ret++; 2332 list_move(&sp->link, invalid_list); 2333 kvm_mod_used_mmu_pages(kvm, -1); 2334 } else { 2335 list_move(&sp->link, &kvm->arch.active_mmu_pages); 2336 2337 /* 2338 * The obsolete pages can not be used on any vcpus. 2339 * See the comments in kvm_mmu_invalidate_zap_all_pages(). 
2340 */ 2341 if (!sp->role.invalid && !is_obsolete_sp(kvm, sp)) 2342 kvm_reload_remote_mmus(kvm); 2343 } 2344 2345 sp->role.invalid = 1; 2346 return ret; 2347} 2348 2349static void kvm_mmu_commit_zap_page(struct kvm *kvm, 2350 struct list_head *invalid_list) 2351{ 2352 struct kvm_mmu_page *sp, *nsp; 2353 2354 if (list_empty(invalid_list)) 2355 return; 2356 2357 /* 2358 * wmb: make sure everyone sees our modifications to the page tables 2359 * rmb: make sure we see changes to vcpu->mode 2360 */ 2361 smp_mb(); 2362 2363 /* 2364 * Wait for all vcpus to exit guest mode and/or lockless shadow 2365 * page table walks. 2366 */ 2367 kvm_flush_remote_tlbs(kvm); 2368 2369 list_for_each_entry_safe(sp, nsp, invalid_list, link) { 2370 WARN_ON(!sp->role.invalid || sp->root_count); 2371 kvm_mmu_free_page(sp); 2372 } 2373} 2374 2375static bool prepare_zap_oldest_mmu_page(struct kvm *kvm, 2376 struct list_head *invalid_list) 2377{ 2378 struct kvm_mmu_page *sp; 2379 2380 if (list_empty(&kvm->arch.active_mmu_pages)) 2381 return false; 2382 2383 sp = list_entry(kvm->arch.active_mmu_pages.prev, 2384 struct kvm_mmu_page, link); 2385 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); 2386 2387 return true; 2388} 2389 2390/* 2391 * Changing the number of mmu pages allocated to the vm 2392 * Note: if goal_nr_mmu_pages is too small, you will get dead lock 2393 */ 2394void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) 2395{ 2396 LIST_HEAD(invalid_list); 2397 2398 spin_lock(&kvm->mmu_lock); 2399 2400 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { 2401 /* Need to free some mmu pages to achieve the goal. */ 2402 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) 2403 if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list)) 2404 break; 2405 2406 kvm_mmu_commit_zap_page(kvm, &invalid_list); 2407 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; 2408 } 2409 2410 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; 2411 2412 spin_unlock(&kvm->mmu_lock); 2413} 2414 2415int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) 2416{ 2417 struct kvm_mmu_page *sp; 2418 LIST_HEAD(invalid_list); 2419 int r; 2420 2421 pgprintk("%s: looking for gfn %llx\n", __func__, gfn); 2422 r = 0; 2423 spin_lock(&kvm->mmu_lock); 2424 for_each_gfn_indirect_valid_sp(kvm, sp, gfn) { 2425 pgprintk("%s: gfn %llx role %x\n", __func__, gfn, 2426 sp->role.word); 2427 r = 1; 2428 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 2429 } 2430 kvm_mmu_commit_zap_page(kvm, &invalid_list); 2431 spin_unlock(&kvm->mmu_lock); 2432 2433 return r; 2434} 2435EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); 2436 2437static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 2438{ 2439 trace_kvm_mmu_unsync_page(sp); 2440 ++vcpu->kvm->stat.mmu_unsync; 2441 sp->unsync = 1; 2442 2443 kvm_mmu_mark_parents_unsync(sp); 2444} 2445 2446static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) 2447{ 2448 struct kvm_mmu_page *s; 2449 2450 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { 2451 if (s->unsync) 2452 continue; 2453 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); 2454 __kvm_unsync_page(vcpu, s); 2455 } 2456} 2457 2458static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, 2459 bool can_unsync) 2460{ 2461 struct kvm_mmu_page *s; 2462 bool need_unsync = false; 2463 2464 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) { 2465 if (!can_unsync) 2466 return 1; 2467 2468 if (s->role.level != PT_PAGE_TABLE_LEVEL) 2469 return 1; 2470 2471 if (!s->unsync) 2472 need_unsync = true; 2473 } 2474 if (need_unsync) 2475 
		kvm_unsync_pages(vcpu, gfn);
	return 0;
}

static bool kvm_is_mmio_pfn(pfn_t pfn)
{
	if (pfn_valid(pfn))
		return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn));

	return true;
}

static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
		    unsigned pte_access, int level,
		    gfn_t gfn, pfn_t pfn, bool speculative,
		    bool can_unsync, bool host_writable)
{
	u64 spte;
	int ret = 0;

	if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
		return 0;

	spte = PT_PRESENT_MASK;
	if (!speculative)
		spte |= shadow_accessed_mask;

	if (pte_access & ACC_EXEC_MASK)
		spte |= shadow_x_mask;
	else
		spte |= shadow_nx_mask;

	if (pte_access & ACC_USER_MASK)
		spte |= shadow_user_mask;

	if (level > PT_PAGE_TABLE_LEVEL)
		spte |= PT_PAGE_SIZE_MASK;
	if (tdp_enabled)
		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
			kvm_is_mmio_pfn(pfn));

	if (host_writable)
		spte |= SPTE_HOST_WRITEABLE;
	else
		pte_access &= ~ACC_WRITE_MASK;

	spte |= (u64)pfn << PAGE_SHIFT;

	if (pte_access & ACC_WRITE_MASK) {

		/*
		 * Another vcpu may have created a new sp in the window
		 * between mapping_level() and acquiring the mmu-lock. We can
		 * allow the guest to retry the access; the mapping can be
		 * fixed when the guest refaults.
		 */
		if (level > PT_PAGE_TABLE_LEVEL &&
		    has_wrprotected_page(vcpu, gfn, level))
			goto done;

		spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;

		/*
		 * Optimization: for pte sync, if the spte was writable the
		 * hash lookup is unnecessary (and expensive). Write
		 * protection is the responsibility of mmu_get_page /
		 * kvm_sync_page. The same reasoning can be applied to dirty
		 * page accounting.
		 */
		if (!can_unsync && is_writable_pte(*sptep))
			goto set_pte;

		if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
			pgprintk("%s: found shadow page for %llx, marking ro\n",
				 __func__, gfn);
			ret = 1;
			pte_access &= ~ACC_WRITE_MASK;
			spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
		}
	}

	if (pte_access & ACC_WRITE_MASK) {
		kvm_vcpu_mark_page_dirty(vcpu, gfn);
		spte |= shadow_dirty_mask;
	}

set_pte:
	if (mmu_spte_update(sptep, spte))
		kvm_flush_remote_tlbs(vcpu->kvm);
done:
	return ret;
}

static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
			 unsigned pte_access, int write_fault, int *emulate,
			 int level, gfn_t gfn, pfn_t pfn, bool speculative,
			 bool host_writable)
{
	int was_rmapped = 0;
	int rmap_count;

	pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
		 *sptep, write_fault, gfn);

	if (is_rmap_spte(*sptep)) {
		/*
		 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
		 * the parent of the now unreachable PTE.
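		 *
		 * [Added illustrative example, not part of the original
		 *  comment: *sptep currently points at a level-1 shadow page
		 *  table, but this fault installs a large (2MB) mapping at
		 *  level 2, so the old child sp becomes unreachable through
		 *  this entry; drop_parent_pte() below detaches it before
		 *  the new spte is written.]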
2582 */ 2583 if (level > PT_PAGE_TABLE_LEVEL && 2584 !is_large_pte(*sptep)) { 2585 struct kvm_mmu_page *child; 2586 u64 pte = *sptep; 2587 2588 child = page_header(pte & PT64_BASE_ADDR_MASK); 2589 drop_parent_pte(child, sptep); 2590 kvm_flush_remote_tlbs(vcpu->kvm); 2591 } else if (pfn != spte_to_pfn(*sptep)) { 2592 pgprintk("hfn old %llx new %llx\n", 2593 spte_to_pfn(*sptep), pfn); 2594 drop_spte(vcpu->kvm, sptep); 2595 kvm_flush_remote_tlbs(vcpu->kvm); 2596 } else 2597 was_rmapped = 1; 2598 } 2599 2600 if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative, 2601 true, host_writable)) { 2602 if (write_fault) 2603 *emulate = 1; 2604 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 2605 } 2606 2607 if (unlikely(is_mmio_spte(*sptep) && emulate)) 2608 *emulate = 1; 2609 2610 pgprintk("%s: setting spte %llx\n", __func__, *sptep); 2611 pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", 2612 is_large_pte(*sptep)? "2MB" : "4kB", 2613 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, 2614 *sptep, sptep); 2615 if (!was_rmapped && is_large_pte(*sptep)) 2616 ++vcpu->kvm->stat.lpages; 2617 2618 if (is_shadow_present_pte(*sptep)) { 2619 if (!was_rmapped) { 2620 rmap_count = rmap_add(vcpu, sptep, gfn); 2621 if (rmap_count > RMAP_RECYCLE_THRESHOLD) 2622 rmap_recycle(vcpu, sptep, gfn); 2623 } 2624 } 2625 2626 kvm_release_pfn_clean(pfn); 2627} 2628 2629static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, 2630 bool no_dirty_log) 2631{ 2632 struct kvm_memory_slot *slot; 2633 2634 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); 2635 if (!slot) 2636 return KVM_PFN_ERR_FAULT; 2637 2638 return gfn_to_pfn_memslot_atomic(slot, gfn); 2639} 2640 2641static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, 2642 struct kvm_mmu_page *sp, 2643 u64 *start, u64 *end) 2644{ 2645 struct page *pages[PTE_PREFETCH_NUM]; 2646 struct kvm_memory_slot *slot; 2647 unsigned access = sp->role.access; 2648 int i, ret; 2649 gfn_t gfn; 2650 2651 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); 2652 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK); 2653 if (!slot) 2654 return -1; 2655 2656 ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start); 2657 if (ret <= 0) 2658 return -1; 2659 2660 for (i = 0; i < ret; i++, gfn++, start++) 2661 mmu_set_spte(vcpu, start, access, 0, NULL, 2662 sp->role.level, gfn, page_to_pfn(pages[i]), 2663 true, true); 2664 2665 return 0; 2666} 2667 2668static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, 2669 struct kvm_mmu_page *sp, u64 *sptep) 2670{ 2671 u64 *spte, *start = NULL; 2672 int i; 2673 2674 WARN_ON(!sp->role.direct); 2675 2676 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); 2677 spte = sp->spt + i; 2678 2679 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { 2680 if (is_shadow_present_pte(*spte) || spte == sptep) { 2681 if (!start) 2682 continue; 2683 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) 2684 break; 2685 start = NULL; 2686 } else if (!start) 2687 start = spte; 2688 } 2689} 2690 2691static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) 2692{ 2693 struct kvm_mmu_page *sp; 2694 2695 /* 2696 * Since it's no accessed bit on EPT, it's no way to 2697 * distinguish between actually accessed translations 2698 * and prefetched, so disable pte prefetch if EPT is 2699 * enabled. 
2700 */ 2701 if (!shadow_accessed_mask) 2702 return; 2703 2704 sp = page_header(__pa(sptep)); 2705 if (sp->role.level > PT_PAGE_TABLE_LEVEL) 2706 return; 2707 2708 __direct_pte_prefetch(vcpu, sp, sptep); 2709} 2710 2711static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, 2712 int map_writable, int level, gfn_t gfn, pfn_t pfn, 2713 bool prefault) 2714{ 2715 struct kvm_shadow_walk_iterator iterator; 2716 struct kvm_mmu_page *sp; 2717 int emulate = 0; 2718 gfn_t pseudo_gfn; 2719 2720 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2721 return 0; 2722 2723 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { 2724 if (iterator.level == level) { 2725 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, 2726 write, &emulate, level, gfn, pfn, 2727 prefault, map_writable); 2728 direct_pte_prefetch(vcpu, iterator.sptep); 2729 ++vcpu->stat.pf_fixed; 2730 break; 2731 } 2732 2733 drop_large_spte(vcpu, iterator.sptep); 2734 if (!is_shadow_present_pte(*iterator.sptep)) { 2735 u64 base_addr = iterator.addr; 2736 2737 base_addr &= PT64_LVL_ADDR_MASK(iterator.level); 2738 pseudo_gfn = base_addr >> PAGE_SHIFT; 2739 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, 2740 iterator.level - 1, 2741 1, ACC_ALL, iterator.sptep); 2742 2743 link_shadow_page(iterator.sptep, sp, true); 2744 } 2745 } 2746 return emulate; 2747} 2748 2749static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) 2750{ 2751 siginfo_t info; 2752 2753 info.si_signo = SIGBUS; 2754 info.si_errno = 0; 2755 info.si_code = BUS_MCEERR_AR; 2756 info.si_addr = (void __user *)address; 2757 info.si_addr_lsb = PAGE_SHIFT; 2758 2759 send_sig_info(SIGBUS, &info, tsk); 2760} 2761 2762static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) 2763{ 2764 /* 2765 * Do not cache the mmio info caused by writing the readonly gfn 2766 * into the spte otherwise read access on readonly gfn also can 2767 * caused mmio page fault and treat it as mmio access. 2768 * Return 1 to tell kvm to emulate it. 2769 */ 2770 if (pfn == KVM_PFN_ERR_RO_FAULT) 2771 return 1; 2772 2773 if (pfn == KVM_PFN_ERR_HWPOISON) { 2774 kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current); 2775 return 0; 2776 } 2777 2778 return -EFAULT; 2779} 2780 2781static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, 2782 gfn_t *gfnp, pfn_t *pfnp, int *levelp) 2783{ 2784 pfn_t pfn = *pfnp; 2785 gfn_t gfn = *gfnp; 2786 int level = *levelp; 2787 2788 /* 2789 * Check if it's a transparent hugepage. If this would be an 2790 * hugetlbfs page, level wouldn't be set to 2791 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done 2792 * here. 2793 */ 2794 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && 2795 level == PT_PAGE_TABLE_LEVEL && 2796 PageTransCompound(pfn_to_page(pfn)) && 2797 !has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) { 2798 unsigned long mask; 2799 /* 2800 * mmu_notifier_retry was successful and we hold the 2801 * mmu_lock here, so the pmd can't become splitting 2802 * from under us, and in turn 2803 * __split_huge_page_refcount() can't run from under 2804 * us and we can safely transfer the refcount from 2805 * PG_tail to PG_head as we switch the pfn to tail to 2806 * head. 
2807 */ 2808 *levelp = level = PT_DIRECTORY_LEVEL; 2809 mask = KVM_PAGES_PER_HPAGE(level) - 1; 2810 VM_BUG_ON((gfn & mask) != (pfn & mask)); 2811 if (pfn & mask) { 2812 gfn &= ~mask; 2813 *gfnp = gfn; 2814 kvm_release_pfn_clean(pfn); 2815 pfn &= ~mask; 2816 kvm_get_pfn(pfn); 2817 *pfnp = pfn; 2818 } 2819 } 2820} 2821 2822static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, 2823 pfn_t pfn, unsigned access, int *ret_val) 2824{ 2825 bool ret = true; 2826 2827 /* The pfn is invalid, report the error! */ 2828 if (unlikely(is_error_pfn(pfn))) { 2829 *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn); 2830 goto exit; 2831 } 2832 2833 if (unlikely(is_noslot_pfn(pfn))) 2834 vcpu_cache_mmio_info(vcpu, gva, gfn, access); 2835 2836 ret = false; 2837exit: 2838 return ret; 2839} 2840 2841static bool page_fault_can_be_fast(u32 error_code) 2842{ 2843 /* 2844 * Do not fix the mmio spte with invalid generation number which 2845 * need to be updated by slow page fault path. 2846 */ 2847 if (unlikely(error_code & PFERR_RSVD_MASK)) 2848 return false; 2849 2850 /* 2851 * #PF can be fast only if the shadow page table is present and it 2852 * is caused by write-protect, that means we just need change the 2853 * W bit of the spte which can be done out of mmu-lock. 2854 */ 2855 if (!(error_code & PFERR_PRESENT_MASK) || 2856 !(error_code & PFERR_WRITE_MASK)) 2857 return false; 2858 2859 return true; 2860} 2861 2862static bool 2863fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 2864 u64 *sptep, u64 spte) 2865{ 2866 gfn_t gfn; 2867 2868 WARN_ON(!sp->role.direct); 2869 2870 /* 2871 * The gfn of direct spte is stable since it is calculated 2872 * by sp->gfn. 2873 */ 2874 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); 2875 2876 /* 2877 * Theoretically we could also set dirty bit (and flush TLB) here in 2878 * order to eliminate unnecessary PML logging. See comments in 2879 * set_spte. But fast_page_fault is very unlikely to happen with PML 2880 * enabled, so we do not do this. This might result in the same GPA 2881 * to be logged in PML buffer again when the write really happens, and 2882 * eventually to be called by mark_page_dirty twice. But it's also no 2883 * harm. This also avoids the TLB flush needed after setting dirty bit 2884 * so non-PML cases won't be impacted. 2885 * 2886 * Compare with set_spte where instead shadow_dirty_mask is set. 2887 */ 2888 if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) 2889 kvm_vcpu_mark_page_dirty(vcpu, gfn); 2890 2891 return true; 2892} 2893 2894/* 2895 * Return value: 2896 * - true: let the vcpu to access on the same address again. 2897 * - false: let the real page fault path to fix it. 2898 */ 2899static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, 2900 u32 error_code) 2901{ 2902 struct kvm_shadow_walk_iterator iterator; 2903 struct kvm_mmu_page *sp; 2904 bool ret = false; 2905 u64 spte = 0ull; 2906 2907 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2908 return false; 2909 2910 if (!page_fault_can_be_fast(error_code)) 2911 return false; 2912 2913 walk_shadow_page_lockless_begin(vcpu); 2914 for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) 2915 if (!is_shadow_present_pte(spte) || iterator.level < level) 2916 break; 2917 2918 /* 2919 * If the mapping has been changed, let the vcpu fault on the 2920 * same address again. 
2921 */ 2922 if (!is_rmap_spte(spte)) { 2923 ret = true; 2924 goto exit; 2925 } 2926 2927 sp = page_header(__pa(iterator.sptep)); 2928 if (!is_last_spte(spte, sp->role.level)) 2929 goto exit; 2930 2931 /* 2932 * Check if it is a spurious fault caused by TLB lazily flushed. 2933 * 2934 * Need not check the access of upper level table entries since 2935 * they are always ACC_ALL. 2936 */ 2937 if (is_writable_pte(spte)) { 2938 ret = true; 2939 goto exit; 2940 } 2941 2942 /* 2943 * Currently, to simplify the code, only the spte write-protected 2944 * by dirty-log can be fast fixed. 2945 */ 2946 if (!spte_is_locklessly_modifiable(spte)) 2947 goto exit; 2948 2949 /* 2950 * Do not fix write-permission on the large spte since we only dirty 2951 * the first page into the dirty-bitmap in fast_pf_fix_direct_spte() 2952 * that means other pages are missed if its slot is dirty-logged. 2953 * 2954 * Instead, we let the slow page fault path create a normal spte to 2955 * fix the access. 2956 * 2957 * See the comments in kvm_arch_commit_memory_region(). 2958 */ 2959 if (sp->role.level > PT_PAGE_TABLE_LEVEL) 2960 goto exit; 2961 2962 /* 2963 * Currently, fast page fault only works for direct mapping since 2964 * the gfn is not stable for indirect shadow page. 2965 * See Documentation/virtual/kvm/locking.txt to get more detail. 2966 */ 2967 ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte); 2968exit: 2969 trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, 2970 spte, ret); 2971 walk_shadow_page_lockless_end(vcpu); 2972 2973 return ret; 2974} 2975 2976static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 2977 gva_t gva, pfn_t *pfn, bool write, bool *writable); 2978static void make_mmu_pages_available(struct kvm_vcpu *vcpu); 2979 2980static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, 2981 gfn_t gfn, bool prefault) 2982{ 2983 int r; 2984 int level; 2985 bool force_pt_level = false; 2986 pfn_t pfn; 2987 unsigned long mmu_seq; 2988 bool map_writable, write = error_code & PFERR_WRITE_MASK; 2989 2990 level = mapping_level(vcpu, gfn, &force_pt_level); 2991 if (likely(!force_pt_level)) { 2992 /* 2993 * This path builds a PAE pagetable - so we can map 2994 * 2mb pages at maximum. Therefore check if the level 2995 * is larger than that. 
2996 */ 2997 if (level > PT_DIRECTORY_LEVEL) 2998 level = PT_DIRECTORY_LEVEL; 2999 3000 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 3001 } 3002 3003 if (fast_page_fault(vcpu, v, level, error_code)) 3004 return 0; 3005 3006 mmu_seq = vcpu->kvm->mmu_notifier_seq; 3007 smp_rmb(); 3008 3009 if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable)) 3010 return 0; 3011 3012 if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r)) 3013 return r; 3014 3015 spin_lock(&vcpu->kvm->mmu_lock); 3016 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 3017 goto out_unlock; 3018 make_mmu_pages_available(vcpu); 3019 if (likely(!force_pt_level)) 3020 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); 3021 r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn, 3022 prefault); 3023 spin_unlock(&vcpu->kvm->mmu_lock); 3024 3025 3026 return r; 3027 3028out_unlock: 3029 spin_unlock(&vcpu->kvm->mmu_lock); 3030 kvm_release_pfn_clean(pfn); 3031 return 0; 3032} 3033 3034 3035static void mmu_free_roots(struct kvm_vcpu *vcpu) 3036{ 3037 int i; 3038 struct kvm_mmu_page *sp; 3039 LIST_HEAD(invalid_list); 3040 3041 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 3042 return; 3043 3044 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL && 3045 (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL || 3046 vcpu->arch.mmu.direct_map)) { 3047 hpa_t root = vcpu->arch.mmu.root_hpa; 3048 3049 spin_lock(&vcpu->kvm->mmu_lock); 3050 sp = page_header(root); 3051 --sp->root_count; 3052 if (!sp->root_count && sp->role.invalid) { 3053 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); 3054 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 3055 } 3056 spin_unlock(&vcpu->kvm->mmu_lock); 3057 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 3058 return; 3059 } 3060 3061 spin_lock(&vcpu->kvm->mmu_lock); 3062 for (i = 0; i < 4; ++i) { 3063 hpa_t root = vcpu->arch.mmu.pae_root[i]; 3064 3065 if (root) { 3066 root &= PT64_BASE_ADDR_MASK; 3067 sp = page_header(root); 3068 --sp->root_count; 3069 if (!sp->root_count && sp->role.invalid) 3070 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, 3071 &invalid_list); 3072 } 3073 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; 3074 } 3075 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 3076 spin_unlock(&vcpu->kvm->mmu_lock); 3077 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 3078} 3079 3080static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) 3081{ 3082 int ret = 0; 3083 3084 if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { 3085 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 3086 ret = 1; 3087 } 3088 3089 return ret; 3090} 3091 3092static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) 3093{ 3094 struct kvm_mmu_page *sp; 3095 unsigned i; 3096 3097 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 3098 spin_lock(&vcpu->kvm->mmu_lock); 3099 make_mmu_pages_available(vcpu); 3100 sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, 3101 1, ACC_ALL, NULL); 3102 ++sp->root_count; 3103 spin_unlock(&vcpu->kvm->mmu_lock); 3104 vcpu->arch.mmu.root_hpa = __pa(sp->spt); 3105 } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) { 3106 for (i = 0; i < 4; ++i) { 3107 hpa_t root = vcpu->arch.mmu.pae_root[i]; 3108 3109 MMU_WARN_ON(VALID_PAGE(root)); 3110 spin_lock(&vcpu->kvm->mmu_lock); 3111 make_mmu_pages_available(vcpu); 3112 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), 3113 i << 30, 3114 PT32_ROOT_LEVEL, 1, ACC_ALL, 3115 NULL); 3116 root = __pa(sp->spt); 3117 ++sp->root_count; 3118 spin_unlock(&vcpu->kvm->mmu_lock); 3119 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; 3120 } 3121 
vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); 3122 } else 3123 BUG(); 3124 3125 return 0; 3126} 3127 3128static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) 3129{ 3130 struct kvm_mmu_page *sp; 3131 u64 pdptr, pm_mask; 3132 gfn_t root_gfn; 3133 int i; 3134 3135 root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT; 3136 3137 if (mmu_check_root(vcpu, root_gfn)) 3138 return 1; 3139 3140 /* 3141 * Do we shadow a long mode page table? If so we need to 3142 * write-protect the guests page table root. 3143 */ 3144 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { 3145 hpa_t root = vcpu->arch.mmu.root_hpa; 3146 3147 MMU_WARN_ON(VALID_PAGE(root)); 3148 3149 spin_lock(&vcpu->kvm->mmu_lock); 3150 make_mmu_pages_available(vcpu); 3151 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, 3152 0, ACC_ALL, NULL); 3153 root = __pa(sp->spt); 3154 ++sp->root_count; 3155 spin_unlock(&vcpu->kvm->mmu_lock); 3156 vcpu->arch.mmu.root_hpa = root; 3157 return 0; 3158 } 3159 3160 /* 3161 * We shadow a 32 bit page table. This may be a legacy 2-level 3162 * or a PAE 3-level page table. In either case we need to be aware that 3163 * the shadow page table may be a PAE or a long mode page table. 3164 */ 3165 pm_mask = PT_PRESENT_MASK; 3166 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) 3167 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; 3168 3169 for (i = 0; i < 4; ++i) { 3170 hpa_t root = vcpu->arch.mmu.pae_root[i]; 3171 3172 MMU_WARN_ON(VALID_PAGE(root)); 3173 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { 3174 pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i); 3175 if (!is_present_gpte(pdptr)) { 3176 vcpu->arch.mmu.pae_root[i] = 0; 3177 continue; 3178 } 3179 root_gfn = pdptr >> PAGE_SHIFT; 3180 if (mmu_check_root(vcpu, root_gfn)) 3181 return 1; 3182 } 3183 spin_lock(&vcpu->kvm->mmu_lock); 3184 make_mmu_pages_available(vcpu); 3185 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 3186 PT32_ROOT_LEVEL, 0, 3187 ACC_ALL, NULL); 3188 root = __pa(sp->spt); 3189 ++sp->root_count; 3190 spin_unlock(&vcpu->kvm->mmu_lock); 3191 3192 vcpu->arch.mmu.pae_root[i] = root | pm_mask; 3193 } 3194 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); 3195 3196 /* 3197 * If we shadow a 32 bit page table with a long mode page 3198 * table we enter this path. 3199 */ 3200 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 3201 if (vcpu->arch.mmu.lm_root == NULL) { 3202 /* 3203 * The additional page necessary for this is only 3204 * allocated on demand. 
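			 *
			 * [Added note, not in the original comment: lm_root
			 *  serves as a minimal top-level table whose entry 0
			 *  points at pae_root (see the assignment below), so
			 *  a PAE-style set of roots can be hung off a
			 *  4-level shadow hierarchy; only that one entry is
			 *  ever used, which is why a single zeroed page
			 *  suffices.]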
3205 */ 3206 3207 u64 *lm_root; 3208 3209 lm_root = (void*)get_zeroed_page(GFP_KERNEL); 3210 if (lm_root == NULL) 3211 return 1; 3212 3213 lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask; 3214 3215 vcpu->arch.mmu.lm_root = lm_root; 3216 } 3217 3218 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root); 3219 } 3220 3221 return 0; 3222} 3223 3224static int mmu_alloc_roots(struct kvm_vcpu *vcpu) 3225{ 3226 if (vcpu->arch.mmu.direct_map) 3227 return mmu_alloc_direct_roots(vcpu); 3228 else 3229 return mmu_alloc_shadow_roots(vcpu); 3230} 3231 3232static void mmu_sync_roots(struct kvm_vcpu *vcpu) 3233{ 3234 int i; 3235 struct kvm_mmu_page *sp; 3236 3237 if (vcpu->arch.mmu.direct_map) 3238 return; 3239 3240 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 3241 return; 3242 3243 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); 3244 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); 3245 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { 3246 hpa_t root = vcpu->arch.mmu.root_hpa; 3247 sp = page_header(root); 3248 mmu_sync_children(vcpu, sp); 3249 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); 3250 return; 3251 } 3252 for (i = 0; i < 4; ++i) { 3253 hpa_t root = vcpu->arch.mmu.pae_root[i]; 3254 3255 if (root && VALID_PAGE(root)) { 3256 root &= PT64_BASE_ADDR_MASK; 3257 sp = page_header(root); 3258 mmu_sync_children(vcpu, sp); 3259 } 3260 } 3261 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); 3262} 3263 3264void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) 3265{ 3266 spin_lock(&vcpu->kvm->mmu_lock); 3267 mmu_sync_roots(vcpu); 3268 spin_unlock(&vcpu->kvm->mmu_lock); 3269} 3270EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots); 3271 3272static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, 3273 u32 access, struct x86_exception *exception) 3274{ 3275 if (exception) 3276 exception->error_code = 0; 3277 return vaddr; 3278} 3279 3280static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, 3281 u32 access, 3282 struct x86_exception *exception) 3283{ 3284 if (exception) 3285 exception->error_code = 0; 3286 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception); 3287} 3288 3289static bool 3290__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level) 3291{ 3292 int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f; 3293 3294 return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) | 3295 ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0); 3296} 3297 3298static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) 3299{ 3300 return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level); 3301} 3302 3303static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level) 3304{ 3305 return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level); 3306} 3307 3308static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct) 3309{ 3310 if (direct) 3311 return vcpu_match_mmio_gpa(vcpu, addr); 3312 3313 return vcpu_match_mmio_gva(vcpu, addr); 3314} 3315 3316/* return true if reserved bit is detected on spte. 
*/ 3317static bool 3318walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) 3319{ 3320 struct kvm_shadow_walk_iterator iterator; 3321 u64 sptes[PT64_ROOT_LEVEL], spte = 0ull; 3322 int root, leaf; 3323 bool reserved = false; 3324 3325 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 3326 goto exit; 3327 3328 walk_shadow_page_lockless_begin(vcpu); 3329 3330 for (shadow_walk_init(&iterator, vcpu, addr), 3331 leaf = root = iterator.level; 3332 shadow_walk_okay(&iterator); 3333 __shadow_walk_next(&iterator, spte)) { 3334 spte = mmu_spte_get_lockless(iterator.sptep); 3335 3336 sptes[leaf - 1] = spte; 3337 leaf--; 3338 3339 if (!is_shadow_present_pte(spte)) 3340 break; 3341 3342 reserved |= is_shadow_zero_bits_set(&vcpu->arch.mmu, spte, 3343 iterator.level); 3344 } 3345 3346 walk_shadow_page_lockless_end(vcpu); 3347 3348 if (reserved) { 3349 pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n", 3350 __func__, addr); 3351 while (root > leaf) { 3352 pr_err("------ spte 0x%llx level %d.\n", 3353 sptes[root - 1], root); 3354 root--; 3355 } 3356 } 3357exit: 3358 *sptep = spte; 3359 return reserved; 3360} 3361 3362int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) 3363{ 3364 u64 spte; 3365 bool reserved; 3366 3367 if (quickly_check_mmio_pf(vcpu, addr, direct)) 3368 return RET_MMIO_PF_EMULATE; 3369 3370 reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); 3371 if (WARN_ON(reserved)) 3372 return RET_MMIO_PF_BUG; 3373 3374 if (is_mmio_spte(spte)) { 3375 gfn_t gfn = get_mmio_spte_gfn(spte); 3376 unsigned access = get_mmio_spte_access(spte); 3377 3378 if (!check_mmio_spte(vcpu, spte)) 3379 return RET_MMIO_PF_INVALID; 3380 3381 if (direct) 3382 addr = 0; 3383 3384 trace_handle_mmio_page_fault(addr, gfn, access); 3385 vcpu_cache_mmio_info(vcpu, addr, gfn, access); 3386 return RET_MMIO_PF_EMULATE; 3387 } 3388 3389 /* 3390 * If the page table is zapped by other cpus, let CPU fault again on 3391 * the address. 
3392 */ 3393 return RET_MMIO_PF_RETRY; 3394} 3395EXPORT_SYMBOL_GPL(handle_mmio_page_fault); 3396 3397static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 3398 u32 error_code, bool prefault) 3399{ 3400 gfn_t gfn; 3401 int r; 3402 3403 pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); 3404 3405 if (unlikely(error_code & PFERR_RSVD_MASK)) { 3406 r = handle_mmio_page_fault(vcpu, gva, true); 3407 3408 if (likely(r != RET_MMIO_PF_INVALID)) 3409 return r; 3410 } 3411 3412 r = mmu_topup_memory_caches(vcpu); 3413 if (r) 3414 return r; 3415 3416 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3417 3418 gfn = gva >> PAGE_SHIFT; 3419 3420 return nonpaging_map(vcpu, gva & PAGE_MASK, 3421 error_code, gfn, prefault); 3422} 3423 3424static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) 3425{ 3426 struct kvm_arch_async_pf arch; 3427 3428 arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; 3429 arch.gfn = gfn; 3430 arch.direct_map = vcpu->arch.mmu.direct_map; 3431 arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); 3432 3433 return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); 3434} 3435 3436static bool can_do_async_pf(struct kvm_vcpu *vcpu) 3437{ 3438 if (unlikely(!lapic_in_kernel(vcpu) || 3439 kvm_event_needs_reinjection(vcpu))) 3440 return false; 3441 3442 return kvm_x86_ops->interrupt_allowed(vcpu); 3443} 3444 3445static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, 3446 gva_t gva, pfn_t *pfn, bool write, bool *writable) 3447{ 3448 struct kvm_memory_slot *slot; 3449 bool async; 3450 3451 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 3452 async = false; 3453 *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); 3454 if (!async) 3455 return false; /* *pfn has correct page already */ 3456 3457 if (!prefault && can_do_async_pf(vcpu)) { 3458 trace_kvm_try_async_get_page(gva, gfn); 3459 if (kvm_find_async_pf_gfn(vcpu, gfn)) { 3460 trace_kvm_async_pf_doublefault(gva, gfn); 3461 kvm_make_request(KVM_REQ_APF_HALT, vcpu); 3462 return true; 3463 } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn)) 3464 return true; 3465 } 3466 3467 *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable); 3468 return false; 3469} 3470 3471static bool 3472check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level) 3473{ 3474 int page_num = KVM_PAGES_PER_HPAGE(level); 3475 3476 gfn &= ~(page_num - 1); 3477 3478 return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num); 3479} 3480 3481static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, 3482 bool prefault) 3483{ 3484 pfn_t pfn; 3485 int r; 3486 int level; 3487 bool force_pt_level; 3488 gfn_t gfn = gpa >> PAGE_SHIFT; 3489 unsigned long mmu_seq; 3490 int write = error_code & PFERR_WRITE_MASK; 3491 bool map_writable; 3492 3493 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3494 3495 if (unlikely(error_code & PFERR_RSVD_MASK)) { 3496 r = handle_mmio_page_fault(vcpu, gpa, true); 3497 3498 if (likely(r != RET_MMIO_PF_INVALID)) 3499 return r; 3500 } 3501 3502 r = mmu_topup_memory_caches(vcpu); 3503 if (r) 3504 return r; 3505 3506 force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, 3507 PT_DIRECTORY_LEVEL); 3508 level = mapping_level(vcpu, gfn, &force_pt_level); 3509 if (likely(!force_pt_level)) { 3510 if (level > PT_DIRECTORY_LEVEL && 3511 !check_hugepage_cache_consistency(vcpu, gfn, level)) 3512 level = PT_DIRECTORY_LEVEL; 3513 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 3514 } 3515 3516 if 
(fast_page_fault(vcpu, gpa, level, error_code)) 3517 return 0; 3518 3519 mmu_seq = vcpu->kvm->mmu_notifier_seq; 3520 smp_rmb(); 3521 3522 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable)) 3523 return 0; 3524 3525 if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r)) 3526 return r; 3527 3528 spin_lock(&vcpu->kvm->mmu_lock); 3529 if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 3530 goto out_unlock; 3531 make_mmu_pages_available(vcpu); 3532 if (likely(!force_pt_level)) 3533 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level); 3534 r = __direct_map(vcpu, gpa, write, map_writable, 3535 level, gfn, pfn, prefault); 3536 spin_unlock(&vcpu->kvm->mmu_lock); 3537 3538 return r; 3539 3540out_unlock: 3541 spin_unlock(&vcpu->kvm->mmu_lock); 3542 kvm_release_pfn_clean(pfn); 3543 return 0; 3544} 3545 3546static void nonpaging_init_context(struct kvm_vcpu *vcpu, 3547 struct kvm_mmu *context) 3548{ 3549 context->page_fault = nonpaging_page_fault; 3550 context->gva_to_gpa = nonpaging_gva_to_gpa; 3551 context->sync_page = nonpaging_sync_page; 3552 context->invlpg = nonpaging_invlpg; 3553 context->update_pte = nonpaging_update_pte; 3554 context->root_level = 0; 3555 context->shadow_root_level = PT32E_ROOT_LEVEL; 3556 context->root_hpa = INVALID_PAGE; 3557 context->direct_map = true; 3558 context->nx = false; 3559} 3560 3561void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu) 3562{ 3563 mmu_free_roots(vcpu); 3564} 3565 3566static unsigned long get_cr3(struct kvm_vcpu *vcpu) 3567{ 3568 return kvm_read_cr3(vcpu); 3569} 3570 3571static void inject_page_fault(struct kvm_vcpu *vcpu, 3572 struct x86_exception *fault) 3573{ 3574 vcpu->arch.mmu.inject_page_fault(vcpu, fault); 3575} 3576 3577static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, 3578 unsigned access, int *nr_present) 3579{ 3580 if (unlikely(is_mmio_spte(*sptep))) { 3581 if (gfn != get_mmio_spte_gfn(*sptep)) { 3582 mmu_spte_clear_no_track(sptep); 3583 return true; 3584 } 3585 3586 (*nr_present)++; 3587 mark_mmio_spte(vcpu, sptep, gfn, access); 3588 return true; 3589 } 3590 3591 return false; 3592} 3593 3594static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) 3595{ 3596 unsigned index; 3597 3598 index = level - 1; 3599 index |= (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2); 3600 return mmu->last_pte_bitmap & (1 << index); 3601} 3602 3603#define PTTYPE_EPT 18 /* arbitrary */ 3604#define PTTYPE PTTYPE_EPT 3605#include "paging_tmpl.h" 3606#undef PTTYPE 3607 3608#define PTTYPE 64 3609#include "paging_tmpl.h" 3610#undef PTTYPE 3611 3612#define PTTYPE 32 3613#include "paging_tmpl.h" 3614#undef PTTYPE 3615 3616static void 3617__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, 3618 struct rsvd_bits_validate *rsvd_check, 3619 int maxphyaddr, int level, bool nx, bool gbpages, 3620 bool pse, bool amd) 3621{ 3622 u64 exb_bit_rsvd = 0; 3623 u64 gbpages_bit_rsvd = 0; 3624 u64 nonleaf_bit8_rsvd = 0; 3625 3626 rsvd_check->bad_mt_xwr = 0; 3627 3628 if (!nx) 3629 exb_bit_rsvd = rsvd_bits(63, 63); 3630 if (!gbpages) 3631 gbpages_bit_rsvd = rsvd_bits(7, 7); 3632 3633 /* 3634 * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for 3635 * leaf entries) on AMD CPUs only. 
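	 *
	 * [Added worked example, not part of the original comment:
	 *  rsvd_bits(8, 8) is just bit 8 (0x100).  With, say,
	 *  maxphyaddr = 40, the non-leaf PML4E mask assembled below for
	 *  the 4-level case becomes exb_bit_rsvd | bit 7 | bit 8 |
	 *  rsvd_bits(40, 51), so a guest PML4E with any of those bits set
	 *  fails the reserved-bit check.]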
3636 */ 3637 if (amd) 3638 nonleaf_bit8_rsvd = rsvd_bits(8, 8); 3639 3640 switch (level) { 3641 case PT32_ROOT_LEVEL: 3642 /* no rsvd bits for 2 level 4K page table entries */ 3643 rsvd_check->rsvd_bits_mask[0][1] = 0; 3644 rsvd_check->rsvd_bits_mask[0][0] = 0; 3645 rsvd_check->rsvd_bits_mask[1][0] = 3646 rsvd_check->rsvd_bits_mask[0][0]; 3647 3648 if (!pse) { 3649 rsvd_check->rsvd_bits_mask[1][1] = 0; 3650 break; 3651 } 3652 3653 if (is_cpuid_PSE36()) 3654 /* 36bits PSE 4MB page */ 3655 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); 3656 else 3657 /* 32 bits PSE 4MB page */ 3658 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); 3659 break; 3660 case PT32E_ROOT_LEVEL: 3661 rsvd_check->rsvd_bits_mask[0][2] = 3662 rsvd_bits(maxphyaddr, 63) | 3663 rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */ 3664 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | 3665 rsvd_bits(maxphyaddr, 62); /* PDE */ 3666 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd | 3667 rsvd_bits(maxphyaddr, 62); /* PTE */ 3668 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | 3669 rsvd_bits(maxphyaddr, 62) | 3670 rsvd_bits(13, 20); /* large page */ 3671 rsvd_check->rsvd_bits_mask[1][0] = 3672 rsvd_check->rsvd_bits_mask[0][0]; 3673 break; 3674 case PT64_ROOT_LEVEL: 3675 rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd | 3676 nonleaf_bit8_rsvd | rsvd_bits(7, 7) | 3677 rsvd_bits(maxphyaddr, 51); 3678 rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd | 3679 nonleaf_bit8_rsvd | gbpages_bit_rsvd | 3680 rsvd_bits(maxphyaddr, 51); 3681 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | 3682 rsvd_bits(maxphyaddr, 51); 3683 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd | 3684 rsvd_bits(maxphyaddr, 51); 3685 rsvd_check->rsvd_bits_mask[1][3] = 3686 rsvd_check->rsvd_bits_mask[0][3]; 3687 rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd | 3688 gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) | 3689 rsvd_bits(13, 29); 3690 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | 3691 rsvd_bits(maxphyaddr, 51) | 3692 rsvd_bits(13, 20); /* large page */ 3693 rsvd_check->rsvd_bits_mask[1][0] = 3694 rsvd_check->rsvd_bits_mask[0][0]; 3695 break; 3696 } 3697} 3698 3699static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, 3700 struct kvm_mmu *context) 3701{ 3702 __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check, 3703 cpuid_maxphyaddr(vcpu), context->root_level, 3704 context->nx, guest_cpuid_has_gbpages(vcpu), 3705 is_pse(vcpu), guest_cpuid_is_amd(vcpu)); 3706} 3707 3708static void 3709__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, 3710 int maxphyaddr, bool execonly) 3711{ 3712 u64 bad_mt_xwr; 3713 3714 rsvd_check->rsvd_bits_mask[0][3] = 3715 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); 3716 rsvd_check->rsvd_bits_mask[0][2] = 3717 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); 3718 rsvd_check->rsvd_bits_mask[0][1] = 3719 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); 3720 rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); 3721 3722 /* large page */ 3723 rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; 3724 rsvd_check->rsvd_bits_mask[1][2] = 3725 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29); 3726 rsvd_check->rsvd_bits_mask[1][1] = 3727 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); 3728 rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; 3729 3730 bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ 3731 bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */ 3732 bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */ 3733 bad_mt_xwr |= REPEAT_BYTE(1ull << 
2); /* bits 0..2 must not be 010 */ 3734 bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */ 3735 if (!execonly) { 3736 /* bits 0..2 must not be 100 unless VMX capabilities allow it */ 3737 bad_mt_xwr |= REPEAT_BYTE(1ull << 4); 3738 } 3739 rsvd_check->bad_mt_xwr = bad_mt_xwr; 3740} 3741 3742static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, 3743 struct kvm_mmu *context, bool execonly) 3744{ 3745 __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check, 3746 cpuid_maxphyaddr(vcpu), execonly); 3747} 3748 3749/* 3750 * the page table on host is the shadow page table for the page 3751 * table in guest or amd nested guest, its mmu features completely 3752 * follow the features in guest. 3753 */ 3754void 3755reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) 3756{ 3757 bool uses_nx = context->nx || context->base_role.smep_andnot_wp; 3758 3759 /* 3760 * Passing "true" to the last argument is okay; it adds a check 3761 * on bit 8 of the SPTEs which KVM doesn't use anyway. 3762 */ 3763 __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, 3764 boot_cpu_data.x86_phys_bits, 3765 context->shadow_root_level, uses_nx, 3766 guest_cpuid_has_gbpages(vcpu), is_pse(vcpu), 3767 true); 3768} 3769EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask); 3770 3771static inline bool boot_cpu_is_amd(void) 3772{ 3773 WARN_ON_ONCE(!tdp_enabled); 3774 return shadow_x_mask == 0; 3775} 3776 3777/* 3778 * the direct page table on host, use as much mmu features as 3779 * possible, however, kvm currently does not do execution-protection. 3780 */ 3781static void 3782reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, 3783 struct kvm_mmu *context) 3784{ 3785 if (boot_cpu_is_amd()) 3786 __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, 3787 boot_cpu_data.x86_phys_bits, 3788 context->shadow_root_level, false, 3789 cpu_has_gbpages, true, true); 3790 else 3791 __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, 3792 boot_cpu_data.x86_phys_bits, 3793 false); 3794 3795} 3796 3797/* 3798 * as the comments in reset_shadow_zero_bits_mask() except it 3799 * is the shadow page table for intel nested guest. 3800 */ 3801static void 3802reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, 3803 struct kvm_mmu *context, bool execonly) 3804{ 3805 __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, 3806 boot_cpu_data.x86_phys_bits, execonly); 3807} 3808 3809static void update_permission_bitmask(struct kvm_vcpu *vcpu, 3810 struct kvm_mmu *mmu, bool ept) 3811{ 3812 unsigned bit, byte, pfec; 3813 u8 map; 3814 bool fault, x, w, u, wf, uf, ff, smapf, cr4_smap, cr4_smep, smap = 0; 3815 3816 cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); 3817 cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); 3818 for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { 3819 pfec = byte << 1; 3820 map = 0; 3821 wf = pfec & PFERR_WRITE_MASK; 3822 uf = pfec & PFERR_USER_MASK; 3823 ff = pfec & PFERR_FETCH_MASK; 3824 /* 3825 * PFERR_RSVD_MASK bit is set in PFEC if the access is not 3826 * subject to SMAP restrictions, and cleared otherwise. The 3827 * bit is only meaningful if the SMAP bit is set in CR4. 
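		 *
		 * [Added note, not part of the original comment: smapf is
		 *  derived from that bit just below, so a caller that sets
		 *  PFERR_RSVD_MASK in pfec effectively disables the
		 *  (smapf && smap) term of the fault computation for this
		 *  byte, while a caller that leaves it clear lets the
		 *  per-entry smap flag computed in the loop decide.]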
3828 */ 3829 smapf = !(pfec & PFERR_RSVD_MASK); 3830 for (bit = 0; bit < 8; ++bit) { 3831 x = bit & ACC_EXEC_MASK; 3832 w = bit & ACC_WRITE_MASK; 3833 u = bit & ACC_USER_MASK; 3834 3835 if (!ept) { 3836 /* Not really needed: !nx will cause pte.nx to fault */ 3837 x |= !mmu->nx; 3838 /* Allow supervisor writes if !cr0.wp */ 3839 w |= !is_write_protection(vcpu) && !uf; 3840 /* Disallow supervisor fetches of user code if cr4.smep */ 3841 x &= !(cr4_smep && u && !uf); 3842 3843 /* 3844 * SMAP:kernel-mode data accesses from user-mode 3845 * mappings should fault. A fault is considered 3846 * as a SMAP violation if all of the following 3847 * conditions are ture: 3848 * - X86_CR4_SMAP is set in CR4 3849 * - An user page is accessed 3850 * - Page fault in kernel mode 3851 * - if CPL = 3 or X86_EFLAGS_AC is clear 3852 * 3853 * Here, we cover the first three conditions. 3854 * The fourth is computed dynamically in 3855 * permission_fault() and is in smapf. 3856 * 3857 * Also, SMAP does not affect instruction 3858 * fetches, add the !ff check here to make it 3859 * clearer. 3860 */ 3861 smap = cr4_smap && u && !uf && !ff; 3862 } else 3863 /* Not really needed: no U/S accesses on ept */ 3864 u = 1; 3865 3866 fault = (ff && !x) || (uf && !u) || (wf && !w) || 3867 (smapf && smap); 3868 map |= fault << bit; 3869 } 3870 mmu->permissions[byte] = map; 3871 } 3872} 3873 3874static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) 3875{ 3876 u8 map; 3877 unsigned level, root_level = mmu->root_level; 3878 const unsigned ps_set_index = 1 << 2; /* bit 2 of index: ps */ 3879 3880 if (root_level == PT32E_ROOT_LEVEL) 3881 --root_level; 3882 /* PT_PAGE_TABLE_LEVEL always terminates */ 3883 map = 1 | (1 << ps_set_index); 3884 for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level) { 3885 if (level <= PT_PDPE_LEVEL 3886 && (mmu->root_level >= PT32E_ROOT_LEVEL || is_pse(vcpu))) 3887 map |= 1 << (ps_set_index | (level - 1)); 3888 } 3889 mmu->last_pte_bitmap = map; 3890} 3891 3892static void paging64_init_context_common(struct kvm_vcpu *vcpu, 3893 struct kvm_mmu *context, 3894 int level) 3895{ 3896 context->nx = is_nx(vcpu); 3897 context->root_level = level; 3898 3899 reset_rsvds_bits_mask(vcpu, context); 3900 update_permission_bitmask(vcpu, context, false); 3901 update_last_pte_bitmap(vcpu, context); 3902 3903 MMU_WARN_ON(!is_pae(vcpu)); 3904 context->page_fault = paging64_page_fault; 3905 context->gva_to_gpa = paging64_gva_to_gpa; 3906 context->sync_page = paging64_sync_page; 3907 context->invlpg = paging64_invlpg; 3908 context->update_pte = paging64_update_pte; 3909 context->shadow_root_level = level; 3910 context->root_hpa = INVALID_PAGE; 3911 context->direct_map = false; 3912} 3913 3914static void paging64_init_context(struct kvm_vcpu *vcpu, 3915 struct kvm_mmu *context) 3916{ 3917 paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL); 3918} 3919 3920static void paging32_init_context(struct kvm_vcpu *vcpu, 3921 struct kvm_mmu *context) 3922{ 3923 context->nx = false; 3924 context->root_level = PT32_ROOT_LEVEL; 3925 3926 reset_rsvds_bits_mask(vcpu, context); 3927 update_permission_bitmask(vcpu, context, false); 3928 update_last_pte_bitmap(vcpu, context); 3929 3930 context->page_fault = paging32_page_fault; 3931 context->gva_to_gpa = paging32_gva_to_gpa; 3932 context->sync_page = paging32_sync_page; 3933 context->invlpg = paging32_invlpg; 3934 context->update_pte = paging32_update_pte; 3935 context->shadow_root_level = PT32E_ROOT_LEVEL; 3936 context->root_hpa = INVALID_PAGE; 3937 
context->direct_map = false; 3938} 3939 3940static void paging32E_init_context(struct kvm_vcpu *vcpu, 3941 struct kvm_mmu *context) 3942{ 3943 paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); 3944} 3945 3946static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) 3947{ 3948 struct kvm_mmu *context = &vcpu->arch.mmu; 3949 3950 context->base_role.word = 0; 3951 context->base_role.smm = is_smm(vcpu); 3952 context->page_fault = tdp_page_fault; 3953 context->sync_page = nonpaging_sync_page; 3954 context->invlpg = nonpaging_invlpg; 3955 context->update_pte = nonpaging_update_pte; 3956 context->shadow_root_level = kvm_x86_ops->get_tdp_level(); 3957 context->root_hpa = INVALID_PAGE; 3958 context->direct_map = true; 3959 context->set_cr3 = kvm_x86_ops->set_tdp_cr3; 3960 context->get_cr3 = get_cr3; 3961 context->get_pdptr = kvm_pdptr_read; 3962 context->inject_page_fault = kvm_inject_page_fault; 3963 3964 if (!is_paging(vcpu)) { 3965 context->nx = false; 3966 context->gva_to_gpa = nonpaging_gva_to_gpa; 3967 context->root_level = 0; 3968 } else if (is_long_mode(vcpu)) { 3969 context->nx = is_nx(vcpu); 3970 context->root_level = PT64_ROOT_LEVEL; 3971 reset_rsvds_bits_mask(vcpu, context); 3972 context->gva_to_gpa = paging64_gva_to_gpa; 3973 } else if (is_pae(vcpu)) { 3974 context->nx = is_nx(vcpu); 3975 context->root_level = PT32E_ROOT_LEVEL; 3976 reset_rsvds_bits_mask(vcpu, context); 3977 context->gva_to_gpa = paging64_gva_to_gpa; 3978 } else { 3979 context->nx = false; 3980 context->root_level = PT32_ROOT_LEVEL; 3981 reset_rsvds_bits_mask(vcpu, context); 3982 context->gva_to_gpa = paging32_gva_to_gpa; 3983 } 3984 3985 update_permission_bitmask(vcpu, context, false); 3986 update_last_pte_bitmap(vcpu, context); 3987 reset_tdp_shadow_zero_bits_mask(vcpu, context); 3988} 3989 3990void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) 3991{ 3992 bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); 3993 bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); 3994 struct kvm_mmu *context = &vcpu->arch.mmu; 3995 3996 MMU_WARN_ON(VALID_PAGE(context->root_hpa)); 3997 3998 if (!is_paging(vcpu)) 3999 nonpaging_init_context(vcpu, context); 4000 else if (is_long_mode(vcpu)) 4001 paging64_init_context(vcpu, context); 4002 else if (is_pae(vcpu)) 4003 paging32E_init_context(vcpu, context); 4004 else 4005 paging32_init_context(vcpu, context); 4006 4007 context->base_role.nxe = is_nx(vcpu); 4008 context->base_role.cr4_pae = !!is_pae(vcpu); 4009 context->base_role.cr0_wp = is_write_protection(vcpu); 4010 context->base_role.smep_andnot_wp 4011 = smep && !is_write_protection(vcpu); 4012 context->base_role.smap_andnot_wp 4013 = smap && !is_write_protection(vcpu); 4014 context->base_role.smm = is_smm(vcpu); 4015 reset_shadow_zero_bits_mask(vcpu, context); 4016} 4017EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); 4018 4019void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) 4020{ 4021 struct kvm_mmu *context = &vcpu->arch.mmu; 4022 4023 MMU_WARN_ON(VALID_PAGE(context->root_hpa)); 4024 4025 context->shadow_root_level = kvm_x86_ops->get_tdp_level(); 4026 4027 context->nx = true; 4028 context->page_fault = ept_page_fault; 4029 context->gva_to_gpa = ept_gva_to_gpa; 4030 context->sync_page = ept_sync_page; 4031 context->invlpg = ept_invlpg; 4032 context->update_pte = ept_update_pte; 4033 context->root_level = context->shadow_root_level; 4034 context->root_hpa = INVALID_PAGE; 4035 context->direct_map = false; 4036 4037 update_permission_bitmask(vcpu, context, true); 4038 reset_rsvds_bits_mask_ept(vcpu, context, execonly); 4039 
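	/*
	 * [Added note, not in the original source: the call above validates
	 *  the guest's own EPT entries against the guest's maxphyaddr
	 *  (reset_rsvds_bits_mask_ept uses cpuid_maxphyaddr), while the call
	 *  below sets up the zero-bit checks for the shadow EPT entries KVM
	 *  itself writes, based on the host's boot_cpu_data.x86_phys_bits.]
	 */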
reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); 4040} 4041EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); 4042 4043static void init_kvm_softmmu(struct kvm_vcpu *vcpu) 4044{ 4045 struct kvm_mmu *context = &vcpu->arch.mmu; 4046 4047 kvm_init_shadow_mmu(vcpu); 4048 context->set_cr3 = kvm_x86_ops->set_cr3; 4049 context->get_cr3 = get_cr3; 4050 context->get_pdptr = kvm_pdptr_read; 4051 context->inject_page_fault = kvm_inject_page_fault; 4052} 4053 4054static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) 4055{ 4056 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; 4057 4058 g_context->get_cr3 = get_cr3; 4059 g_context->get_pdptr = kvm_pdptr_read; 4060 g_context->inject_page_fault = kvm_inject_page_fault; 4061 4062 /* 4063 * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The 4064 * translation of l2_gpa to l1_gpa addresses is done using the 4065 * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa 4066 * functions between mmu and nested_mmu are swapped. 4067 */ 4068 if (!is_paging(vcpu)) { 4069 g_context->nx = false; 4070 g_context->root_level = 0; 4071 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; 4072 } else if (is_long_mode(vcpu)) { 4073 g_context->nx = is_nx(vcpu); 4074 g_context->root_level = PT64_ROOT_LEVEL; 4075 reset_rsvds_bits_mask(vcpu, g_context); 4076 g_context->gva_to_gpa = paging64_gva_to_gpa_nested; 4077 } else if (is_pae(vcpu)) { 4078 g_context->nx = is_nx(vcpu); 4079 g_context->root_level = PT32E_ROOT_LEVEL; 4080 reset_rsvds_bits_mask(vcpu, g_context); 4081 g_context->gva_to_gpa = paging64_gva_to_gpa_nested; 4082 } else { 4083 g_context->nx = false; 4084 g_context->root_level = PT32_ROOT_LEVEL; 4085 reset_rsvds_bits_mask(vcpu, g_context); 4086 g_context->gva_to_gpa = paging32_gva_to_gpa_nested; 4087 } 4088 4089 update_permission_bitmask(vcpu, g_context, false); 4090 update_last_pte_bitmap(vcpu, g_context); 4091} 4092 4093static void init_kvm_mmu(struct kvm_vcpu *vcpu) 4094{ 4095 if (mmu_is_nested(vcpu)) 4096 init_kvm_nested_mmu(vcpu); 4097 else if (tdp_enabled) 4098 init_kvm_tdp_mmu(vcpu); 4099 else 4100 init_kvm_softmmu(vcpu); 4101} 4102 4103void kvm_mmu_reset_context(struct kvm_vcpu *vcpu) 4104{ 4105 kvm_mmu_unload(vcpu); 4106 init_kvm_mmu(vcpu); 4107} 4108EXPORT_SYMBOL_GPL(kvm_mmu_reset_context); 4109 4110int kvm_mmu_load(struct kvm_vcpu *vcpu) 4111{ 4112 int r; 4113 4114 r = mmu_topup_memory_caches(vcpu); 4115 if (r) 4116 goto out; 4117 r = mmu_alloc_roots(vcpu); 4118 kvm_mmu_sync_roots(vcpu); 4119 if (r) 4120 goto out; 4121 /* set_cr3() should ensure TLB has been flushed */ 4122 vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa); 4123out: 4124 return r; 4125} 4126EXPORT_SYMBOL_GPL(kvm_mmu_load); 4127 4128void kvm_mmu_unload(struct kvm_vcpu *vcpu) 4129{ 4130 mmu_free_roots(vcpu); 4131 WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 4132} 4133EXPORT_SYMBOL_GPL(kvm_mmu_unload); 4134 4135static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, 4136 struct kvm_mmu_page *sp, u64 *spte, 4137 const void *new) 4138{ 4139 if (sp->role.level != PT_PAGE_TABLE_LEVEL) { 4140 ++vcpu->kvm->stat.mmu_pde_zapped; 4141 return; 4142 } 4143 4144 ++vcpu->kvm->stat.mmu_pte_updated; 4145 vcpu->arch.mmu.update_pte(vcpu, sp, spte, new); 4146} 4147 4148static bool need_remote_flush(u64 old, u64 new) 4149{ 4150 if (!is_shadow_present_pte(old)) 4151 return false; 4152 if (!is_shadow_present_pte(new)) 4153 return true; 4154 if ((old ^ new) & PT64_BASE_ADDR_MASK) 4155 return true; 4156 old ^= shadow_nx_mask; 4157 new ^= shadow_nx_mask; 4158 return (old & ~new & 
PT64_PERM_MASK) != 0; 4159} 4160 4161static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page, 4162 bool remote_flush, bool local_flush) 4163{ 4164 if (zap_page) 4165 return; 4166 4167 if (remote_flush) 4168 kvm_flush_remote_tlbs(vcpu->kvm); 4169 else if (local_flush) 4170 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 4171} 4172 4173static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, 4174 const u8 *new, int *bytes) 4175{ 4176 u64 gentry; 4177 int r; 4178 4179 /* 4180 * Assume that the pte write is on a page table of the same type 4181 * as the current vcpu paging mode, since we update the sptes only 4182 * when they have the same mode. 4183 */ 4184 if (is_pae(vcpu) && *bytes == 4) { 4185 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ 4186 *gpa &= ~(gpa_t)7; 4187 *bytes = 8; 4188 r = kvm_vcpu_read_guest(vcpu, *gpa, &gentry, 8); 4189 if (r) 4190 gentry = 0; 4191 new = (const u8 *)&gentry; 4192 } 4193 4194 switch (*bytes) { 4195 case 4: 4196 gentry = *(const u32 *)new; 4197 break; 4198 case 8: 4199 gentry = *(const u64 *)new; 4200 break; 4201 default: 4202 gentry = 0; 4203 break; 4204 } 4205 4206 return gentry; 4207} 4208 4209/* 4210 * If we're seeing too many writes to a page, it may no longer be a page table, 4211 * or we may be forking, in which case it is better to unmap the page. 4212 */ 4213static bool detect_write_flooding(struct kvm_mmu_page *sp) 4214{ 4215 /* 4216 * Skip write-flooding detection for an sp whose level is 1, because 4217 * it can become unsync and then the guest page is not write-protected. 4218 */ 4219 if (sp->role.level == PT_PAGE_TABLE_LEVEL) 4220 return false; 4221 4222 return ++sp->write_flooding_count >= 3; 4223} 4224 4225/* 4226 * Misaligned accesses are too much trouble to fix up; also, they usually 4227 * indicate a page is not used as a page table. 4228 */ 4229static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa, 4230 int bytes) 4231{ 4232 unsigned offset, pte_size, misaligned; 4233 4234 pgprintk("misaligned: gpa %llx bytes %d role %x\n", 4235 gpa, bytes, sp->role.word); 4236 4237 offset = offset_in_page(gpa); 4238 pte_size = sp->role.cr4_pae ? 8 : 4; 4239 4240 /* 4241 * Sometimes, the OS writes only the last byte to update status 4242 * bits; for example, Linux uses the andb instruction in clear_bit(). 4243 */ 4244 if (!(offset & (pte_size - 1)) && bytes == 1) 4245 return false; 4246 4247 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); 4248 misaligned |= bytes < 4; 4249 4250 return misaligned; 4251} 4252 4253static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) 4254{ 4255 unsigned page_offset, quadrant; 4256 u64 *spte; 4257 int level; 4258 4259 page_offset = offset_in_page(gpa); 4260 level = sp->role.level; 4261 *nspte = 1; 4262 if (!sp->role.cr4_pae) { 4263 page_offset <<= 1; /* 32->64 */ 4264 /* 4265 * A 32-bit pde maps 4MB while the shadow pdes map 4266 * only 2MB. So we need to double the offset again 4267 * and zap two pdes instead of one. 
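 *
 * For instance, a 4-byte write at offset 0x7fc of a 32-bit page
 * directory is doubled to 0xff8 above; for a root sp the code below
 * rounds and doubles it again to 0x1ff0, i.e. quadrant 1 with an
 * in-page offset of 0xff0, so two consecutive shadow pdes (indices
 * 510 and 511) are returned for zapping.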
4268 */ 4269 if (level == PT32_ROOT_LEVEL) { 4270 page_offset &= ~7; /* kill rounding error */ 4271 page_offset <<= 1; 4272 *nspte = 2; 4273 } 4274 quadrant = page_offset >> PAGE_SHIFT; 4275 page_offset &= ~PAGE_MASK; 4276 if (quadrant != sp->role.quadrant) 4277 return NULL; 4278 } 4279 4280 spte = &sp->spt[page_offset / sizeof(*spte)]; 4281 return spte; 4282} 4283 4284void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 4285 const u8 *new, int bytes) 4286{ 4287 gfn_t gfn = gpa >> PAGE_SHIFT; 4288 struct kvm_mmu_page *sp; 4289 LIST_HEAD(invalid_list); 4290 u64 entry, gentry, *spte; 4291 int npte; 4292 bool remote_flush, local_flush, zap_page; 4293 union kvm_mmu_page_role mask = { }; 4294 4295 mask.cr0_wp = 1; 4296 mask.cr4_pae = 1; 4297 mask.nxe = 1; 4298 mask.smep_andnot_wp = 1; 4299 mask.smap_andnot_wp = 1; 4300 mask.smm = 1; 4301 4302 /* 4303 * If we don't have indirect shadow pages, it means no page is 4304 * write-protected, so we can exit simply. 4305 */ 4306 if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages)) 4307 return; 4308 4309 zap_page = remote_flush = local_flush = false; 4310 4311 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 4312 4313 gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, new, &bytes); 4314 4315 /* 4316 * No need to care whether the memory allocation is successful 4317 * or not, since pte prefetch is skipped if there are not 4318 * enough objects in the cache. 4319 */ 4320 mmu_topup_memory_caches(vcpu); 4321 4322 spin_lock(&vcpu->kvm->mmu_lock); 4323 ++vcpu->kvm->stat.mmu_pte_write; 4324 kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); 4325 4326 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { 4327 if (detect_write_misaligned(sp, gpa, bytes) || 4328 detect_write_flooding(sp)) { 4329 zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, 4330 &invalid_list); 4331 ++vcpu->kvm->stat.mmu_flooded; 4332 continue; 4333 } 4334 4335 spte = get_written_sptes(sp, gpa, &npte); 4336 if (!spte) 4337 continue; 4338 4339 local_flush = true; 4340 while (npte--) { 4341 entry = *spte; 4342 mmu_page_zap_pte(vcpu->kvm, sp, spte); 4343 if (gentry && 4344 !((sp->role.word ^ vcpu->arch.mmu.base_role.word) 4345 & mask.word) && rmap_can_add(vcpu)) 4346 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); 4347 if (need_remote_flush(entry, *spte)) 4348 remote_flush = true; 4349 ++spte; 4350 } 4351 } 4352 mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); 4353 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 4354 kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); 4355 spin_unlock(&vcpu->kvm->mmu_lock); 4356} 4357 4358int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) 4359{ 4360 gpa_t gpa; 4361 int r; 4362 4363 if (vcpu->arch.mmu.direct_map) 4364 return 0; 4365 4366 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); 4367 4368 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); 4369 4370 return r; 4371} 4372EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); 4373 4374static void make_mmu_pages_available(struct kvm_vcpu *vcpu) 4375{ 4376 LIST_HEAD(invalid_list); 4377 4378 if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES)) 4379 return; 4380 4381 while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) { 4382 if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list)) 4383 break; 4384 4385 ++vcpu->kvm->stat.mmu_recycled; 4386 } 4387 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 4388} 4389 4390static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr) 4391{ 4392 if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu)) 4393 return 
vcpu_match_mmio_gpa(vcpu, addr); 4394 4395 return vcpu_match_mmio_gva(vcpu, addr); 4396} 4397 4398int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, 4399 void *insn, int insn_len) 4400{ 4401 int r, emulation_type = EMULTYPE_RETRY; 4402 enum emulation_result er; 4403 4404 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); 4405 if (r < 0) 4406 goto out; 4407 4408 if (!r) { 4409 r = 1; 4410 goto out; 4411 } 4412 4413 if (is_mmio_page_fault(vcpu, cr2)) 4414 emulation_type = 0; 4415 4416 er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len); 4417 4418 switch (er) { 4419 case EMULATE_DONE: 4420 return 1; 4421 case EMULATE_USER_EXIT: 4422 ++vcpu->stat.mmio_exits; 4423 /* fall through */ 4424 case EMULATE_FAIL: 4425 return 0; 4426 default: 4427 BUG(); 4428 } 4429out: 4430 return r; 4431} 4432EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); 4433 4434void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) 4435{ 4436 vcpu->arch.mmu.invlpg(vcpu, gva); 4437 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 4438 ++vcpu->stat.invlpg; 4439} 4440EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); 4441 4442void kvm_enable_tdp(void) 4443{ 4444 tdp_enabled = true; 4445} 4446EXPORT_SYMBOL_GPL(kvm_enable_tdp); 4447 4448void kvm_disable_tdp(void) 4449{ 4450 tdp_enabled = false; 4451} 4452EXPORT_SYMBOL_GPL(kvm_disable_tdp); 4453 4454static void free_mmu_pages(struct kvm_vcpu *vcpu) 4455{ 4456 free_page((unsigned long)vcpu->arch.mmu.pae_root); 4457 if (vcpu->arch.mmu.lm_root != NULL) 4458 free_page((unsigned long)vcpu->arch.mmu.lm_root); 4459} 4460 4461static int alloc_mmu_pages(struct kvm_vcpu *vcpu) 4462{ 4463 struct page *page; 4464 int i; 4465 4466 /* 4467 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. 4468 * Therefore we need to allocate shadow page tables in the first 4469 * 4GB of memory, which happens to fit the DMA32 zone. 4470 */ 4471 page = alloc_page(GFP_KERNEL | __GFP_DMA32); 4472 if (!page) 4473 return -ENOMEM; 4474 4475 vcpu->arch.mmu.pae_root = page_address(page); 4476 for (i = 0; i < 4; ++i) 4477 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; 4478 4479 return 0; 4480} 4481 4482int kvm_mmu_create(struct kvm_vcpu *vcpu) 4483{ 4484 vcpu->arch.walk_mmu = &vcpu->arch.mmu; 4485 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 4486 vcpu->arch.mmu.translate_gpa = translate_gpa; 4487 vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; 4488 4489 return alloc_mmu_pages(vcpu); 4490} 4491 4492void kvm_mmu_setup(struct kvm_vcpu *vcpu) 4493{ 4494 MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 4495 4496 init_kvm_mmu(vcpu); 4497} 4498 4499/* The return value indicates if tlb flush on all vcpus is needed. */ 4500typedef bool (*slot_level_handler) (struct kvm *kvm, unsigned long *rmap); 4501 4502/* The caller should hold mmu-lock before calling this function. 
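 * Note that slot_handle_level_range() may temporarily drop and re-acquire
 * mmu_lock via cond_resched_lock(); when lock_flush_tlb is set it flushes
 * remote TLBs before dropping the lock, so the caller must not assume the
 * lock is held continuously for the whole walk.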
*/ 4503static bool 4504slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, 4505 slot_level_handler fn, int start_level, int end_level, 4506 gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb) 4507{ 4508 struct slot_rmap_walk_iterator iterator; 4509 bool flush = false; 4510 4511 for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, 4512 end_gfn, &iterator) { 4513 if (iterator.rmap) 4514 flush |= fn(kvm, iterator.rmap); 4515 4516 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { 4517 if (flush && lock_flush_tlb) { 4518 kvm_flush_remote_tlbs(kvm); 4519 flush = false; 4520 } 4521 cond_resched_lock(&kvm->mmu_lock); 4522 } 4523 } 4524 4525 if (flush && lock_flush_tlb) { 4526 kvm_flush_remote_tlbs(kvm); 4527 flush = false; 4528 } 4529 4530 return flush; 4531} 4532 4533static bool 4534slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, 4535 slot_level_handler fn, int start_level, int end_level, 4536 bool lock_flush_tlb) 4537{ 4538 return slot_handle_level_range(kvm, memslot, fn, start_level, 4539 end_level, memslot->base_gfn, 4540 memslot->base_gfn + memslot->npages - 1, 4541 lock_flush_tlb); 4542} 4543 4544static bool 4545slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, 4546 slot_level_handler fn, bool lock_flush_tlb) 4547{ 4548 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, 4549 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); 4550} 4551 4552static bool 4553slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, 4554 slot_level_handler fn, bool lock_flush_tlb) 4555{ 4556 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1, 4557 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); 4558} 4559 4560static bool 4561slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, 4562 slot_level_handler fn, bool lock_flush_tlb) 4563{ 4564 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, 4565 PT_PAGE_TABLE_LEVEL, lock_flush_tlb); 4566} 4567 4568void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) 4569{ 4570 struct kvm_memslots *slots; 4571 struct kvm_memory_slot *memslot; 4572 int i; 4573 4574 spin_lock(&kvm->mmu_lock); 4575 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 4576 slots = __kvm_memslots(kvm, i); 4577 kvm_for_each_memslot(memslot, slots) { 4578 gfn_t start, end; 4579 4580 start = max(gfn_start, memslot->base_gfn); 4581 end = min(gfn_end, memslot->base_gfn + memslot->npages); 4582 if (start >= end) 4583 continue; 4584 4585 slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, 4586 PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL, 4587 start, end - 1, true); 4588 } 4589 } 4590 4591 spin_unlock(&kvm->mmu_lock); 4592} 4593 4594static bool slot_rmap_write_protect(struct kvm *kvm, unsigned long *rmapp) 4595{ 4596 return __rmap_write_protect(kvm, rmapp, false); 4597} 4598 4599void kvm_mmu_slot_remove_write_access(struct kvm *kvm, 4600 struct kvm_memory_slot *memslot) 4601{ 4602 bool flush; 4603 4604 spin_lock(&kvm->mmu_lock); 4605 flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect, 4606 false); 4607 spin_unlock(&kvm->mmu_lock); 4608 4609 /* 4610 * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log() 4611 * which do tlb flush out of mmu-lock should be serialized by 4612 * kvm->slots_lock otherwise tlb flush would be missed. 
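 * The lockdep_assert_held() below documents that requirement and, with
 * lockdep enabled, checks it at runtime.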
4613 */ 4614 lockdep_assert_held(&kvm->slots_lock); 4615 4616 /* 4617 * We can flush all the TLBs out of the mmu lock without TLB 4618 * corruption, since we just change the spte from writable to 4619 * read-only, so we only need to care about the case of changing 4620 * a spte from present to present (changing a spte from present 4621 * to nonpresent will flush all the TLBs immediately). In other 4622 * words, the only case we care about is mmu_spte_update(), where we 4623 * have checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE 4624 * instead of PT_WRITABLE_MASK, which means it does not depend 4625 * on PT_WRITABLE_MASK anymore. 4626 */ 4627 if (flush) 4628 kvm_flush_remote_tlbs(kvm); 4629} 4630 4631static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, 4632 unsigned long *rmapp) 4633{ 4634 u64 *sptep; 4635 struct rmap_iterator iter; 4636 int need_tlb_flush = 0; 4637 pfn_t pfn; 4638 struct kvm_mmu_page *sp; 4639 4640restart: 4641 for_each_rmap_spte(rmapp, &iter, sptep) { 4642 sp = page_header(__pa(sptep)); 4643 pfn = spte_to_pfn(*sptep); 4644 4645 /* 4646 * We cannot do huge page mapping for indirect shadow pages, 4647 * which are found on the last rmap (level = 1) when not using 4648 * tdp; such shadow pages are synced with the page table in 4649 * the guest, and the guest page table is using 4K page size 4650 * mapping if the indirect sp has level = 1. 4651 */ 4652 if (sp->role.direct && 4653 !kvm_is_reserved_pfn(pfn) && 4654 PageTransCompound(pfn_to_page(pfn))) { 4655 drop_spte(kvm, sptep); 4656 need_tlb_flush = 1; 4657 goto restart; 4658 } 4659 } 4660 4661 return need_tlb_flush; 4662} 4663 4664void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, 4665 const struct kvm_memory_slot *memslot) 4666{ 4667 /* FIXME: const-ify all uses of struct kvm_memory_slot. */ 4668 spin_lock(&kvm->mmu_lock); 4669 slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot, 4670 kvm_mmu_zap_collapsible_spte, true); 4671 spin_unlock(&kvm->mmu_lock); 4672} 4673 4674void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, 4675 struct kvm_memory_slot *memslot) 4676{ 4677 bool flush; 4678 4679 spin_lock(&kvm->mmu_lock); 4680 flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false); 4681 spin_unlock(&kvm->mmu_lock); 4682 4683 lockdep_assert_held(&kvm->slots_lock); 4684 4685 /* 4686 * It's also safe to flush TLBs out of mmu lock here as currently this 4687 * function is only used for dirty logging, in which case flushing TLB 4688 * out of mmu lock also guarantees no dirty pages will be lost in 4689 * dirty_bitmap. 
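 * (See also the comment in kvm_mmu_slot_remove_write_access() above, which
 * makes a similar argument for flushing outside of mmu_lock.)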
4690 */ 4691 if (flush) 4692 kvm_flush_remote_tlbs(kvm); 4693} 4694EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty); 4695 4696void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, 4697 struct kvm_memory_slot *memslot) 4698{ 4699 bool flush; 4700 4701 spin_lock(&kvm->mmu_lock); 4702 flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect, 4703 false); 4704 spin_unlock(&kvm->mmu_lock); 4705 4706 /* see kvm_mmu_slot_remove_write_access */ 4707 lockdep_assert_held(&kvm->slots_lock); 4708 4709 if (flush) 4710 kvm_flush_remote_tlbs(kvm); 4711} 4712EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access); 4713 4714void kvm_mmu_slot_set_dirty(struct kvm *kvm, 4715 struct kvm_memory_slot *memslot) 4716{ 4717 bool flush; 4718 4719 spin_lock(&kvm->mmu_lock); 4720 flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false); 4721 spin_unlock(&kvm->mmu_lock); 4722 4723 lockdep_assert_held(&kvm->slots_lock); 4724 4725 /* see kvm_mmu_slot_leaf_clear_dirty */ 4726 if (flush) 4727 kvm_flush_remote_tlbs(kvm); 4728} 4729EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); 4730 4731#define BATCH_ZAP_PAGES 10 4732static void kvm_zap_obsolete_pages(struct kvm *kvm) 4733{ 4734 struct kvm_mmu_page *sp, *node; 4735 int batch = 0; 4736 4737restart: 4738 list_for_each_entry_safe_reverse(sp, node, 4739 &kvm->arch.active_mmu_pages, link) { 4740 int ret; 4741 4742 /* 4743 * No obsolete page exists before a newly created page, since 4744 * active_mmu_pages is a FIFO list. 4745 */ 4746 if (!is_obsolete_sp(kvm, sp)) 4747 break; 4748 4749 /* 4750 * Since we are walking the list in reverse and invalid 4751 * pages will be moved to the head, skipping the invalid 4752 * pages helps us avoid walking the list forever. 4753 */ 4754 if (sp->role.invalid) 4755 continue; 4756 4757 /* 4758 * There is no need to flush the tlb, since we only zap sps 4759 * with an invalid generation number. 4760 */ 4761 if (batch >= BATCH_ZAP_PAGES && 4762 cond_resched_lock(&kvm->mmu_lock)) { 4763 batch = 0; 4764 goto restart; 4765 } 4766 4767 ret = kvm_mmu_prepare_zap_page(kvm, sp, 4768 &kvm->arch.zapped_obsolete_pages); 4769 batch += ret; 4770 4771 if (ret) 4772 goto restart; 4773 } 4774 4775 /* 4776 * We should flush the tlb before freeing page tables, since 4777 * lockless walking may still use the pages. 4778 */ 4779 kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); 4780} 4781 4782/* 4783 * Fast-invalidate all shadow pages and use a lock-break technique 4784 * to zap obsolete pages. 4785 * 4786 * This is required when a memslot is being deleted or the VM is being 4787 * destroyed; in these cases, we must ensure that the KVM MMU does 4788 * not use any resource of the slot being deleted (or, on VM 4789 * destruction, of any slot) after this function returns. 4790 */ 4791void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm) 4792{ 4793 spin_lock(&kvm->mmu_lock); 4794 trace_kvm_mmu_invalidate_zap_all_pages(kvm); 4795 kvm->arch.mmu_valid_gen++; 4796 4797 /* 4798 * Notify all vcpus to reload their shadow page tables 4799 * and flush their TLBs. All vcpus will then switch to a new 4800 * shadow page table with the new mmu_valid_gen. 4801 * 4802 * Note: we should do this under the protection of 4803 * mmu-lock; otherwise, a vcpu could purge shadow pages 4804 * but miss the tlb flush. 
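 *
 * The obsolete pages themselves are then zapped below by
 * kvm_zap_obsolete_pages() using the lock-break technique described
 * in the comment above this function.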
4805 */ 4806 kvm_reload_remote_mmus(kvm); 4807 4808 kvm_zap_obsolete_pages(kvm); 4809 spin_unlock(&kvm->mmu_lock); 4810} 4811 4812static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) 4813{ 4814 return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); 4815} 4816 4817void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots) 4818{ 4819 /* 4820 * The very rare case: if the generation number wraps around, 4821 * zap all shadow pages. 4822 */ 4823 if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) { 4824 printk_ratelimited(KERN_DEBUG "kvm: zapping shadow pages for mmio generation wraparound\n"); 4825 kvm_mmu_invalidate_zap_all_pages(kvm); 4826 } 4827} 4828 4829static unsigned long 4830mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) 4831{ 4832 struct kvm *kvm; 4833 int nr_to_scan = sc->nr_to_scan; 4834 unsigned long freed = 0; 4835 4836 spin_lock(&kvm_lock); 4837 4838 list_for_each_entry(kvm, &vm_list, vm_list) { 4839 int idx; 4840 LIST_HEAD(invalid_list); 4841 4842 /* 4843 * Never scan more than sc->nr_to_scan VM instances. 4844 * In practice we will not hit this condition, since we do not 4845 * try to shrink more than one VM and it is very unlikely to 4846 * see !n_used_mmu_pages so many times. 4847 */ 4848 if (!nr_to_scan--) 4849 break; 4850 /* 4851 * n_used_mmu_pages is accessed without holding kvm->mmu_lock 4852 * here. We may skip a VM instance erroneously, but we do not 4853 * want to shrink a VM that has only started to populate its 4854 * MMU anyway. 4855 */ 4856 if (!kvm->arch.n_used_mmu_pages && 4857 !kvm_has_zapped_obsolete_pages(kvm)) 4858 continue; 4859 4860 idx = srcu_read_lock(&kvm->srcu); 4861 spin_lock(&kvm->mmu_lock); 4862 4863 if (kvm_has_zapped_obsolete_pages(kvm)) { 4864 kvm_mmu_commit_zap_page(kvm, 4865 &kvm->arch.zapped_obsolete_pages); 4866 goto unlock; 4867 } 4868 4869 if (prepare_zap_oldest_mmu_page(kvm, &invalid_list)) 4870 freed++; 4871 kvm_mmu_commit_zap_page(kvm, &invalid_list); 4872 4873unlock: 4874 spin_unlock(&kvm->mmu_lock); 4875 srcu_read_unlock(&kvm->srcu, idx); 4876 4877 /* 4878 * unfair on small ones 4879 * per-vm shrinkers cry out 4880 * sadness comes quickly 4881 */ 4882 list_move_tail(&kvm->vm_list, &vm_list); 4883 break; 4884 } 4885 4886 spin_unlock(&kvm_lock); 4887 return freed; 4888} 4889 4890static unsigned long 4891mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) 4892{ 4893 return percpu_counter_read_positive(&kvm_total_used_mmu_pages); 4894} 4895 4896static struct shrinker mmu_shrinker = { 4897 .count_objects = mmu_shrink_count, 4898 .scan_objects = mmu_shrink_scan, 4899 .seeks = DEFAULT_SEEKS * 10, 4900}; 4901 4902static void mmu_destroy_caches(void) 4903{ 4904 if (pte_list_desc_cache) 4905 kmem_cache_destroy(pte_list_desc_cache); 4906 if (mmu_page_header_cache) 4907 kmem_cache_destroy(mmu_page_header_cache); 4908} 4909 4910int kvm_mmu_module_init(void) 4911{ 4912 pte_list_desc_cache = kmem_cache_create("pte_list_desc", 4913 sizeof(struct pte_list_desc), 4914 0, 0, NULL); 4915 if (!pte_list_desc_cache) 4916 goto nomem; 4917 4918 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header", 4919 sizeof(struct kvm_mmu_page), 4920 0, 0, NULL); 4921 if (!mmu_page_header_cache) 4922 goto nomem; 4923 4924 if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) 4925 goto nomem; 4926 4927 register_shrinker(&mmu_shrinker); 4928 4929 return 0; 4930 4931nomem: 4932 mmu_destroy_caches(); 4933 return -ENOMEM; 4934} 4935 4936/* 4937 * Calculate mmu pages needed for kvm. 
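 *
 * As a rough illustration (assuming KVM_PERMILLE_MMU_PAGES is 20 and
 * KVM_MIN_ALLOC_MMU_PAGES is 64): a guest with 1,048,576 pages across
 * its memslots gets a limit of about 20,971 shadow pages, while a very
 * small guest is raised to the 64-page minimum.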
4938 */ 4939unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) 4940{ 4941 unsigned int nr_mmu_pages; 4942 unsigned int nr_pages = 0; 4943 struct kvm_memslots *slots; 4944 struct kvm_memory_slot *memslot; 4945 int i; 4946 4947 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 4948 slots = __kvm_memslots(kvm, i); 4949 4950 kvm_for_each_memslot(memslot, slots) 4951 nr_pages += memslot->npages; 4952 } 4953 4954 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; 4955 nr_mmu_pages = max(nr_mmu_pages, 4956 (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); 4957 4958 return nr_mmu_pages; 4959} 4960 4961void kvm_mmu_destroy(struct kvm_vcpu *vcpu) 4962{ 4963 kvm_mmu_unload(vcpu); 4964 free_mmu_pages(vcpu); 4965 mmu_free_memory_caches(vcpu); 4966} 4967 4968void kvm_mmu_module_exit(void) 4969{ 4970 mmu_destroy_caches(); 4971 percpu_counter_destroy(&kvm_total_used_mmu_pages); 4972 unregister_shrinker(&mmu_shrinker); 4973 mmu_audit_disable(); 4974} 4975