This source file includes following definitions.
- kvm_available_flush_tlb_with_range
- kvm_flush_remote_tlbs_with_range
- kvm_flush_remote_tlbs_with_address
- kvm_mmu_set_mmio_spte_mask
- is_mmio_spte
- sp_ad_disabled
- kvm_vcpu_ad_need_write_protect
- spte_ad_enabled
- spte_ad_need_write_protect
- is_nx_huge_page_enabled
- spte_shadow_accessed_mask
- spte_shadow_dirty_mask
- is_access_track_spte
- generation_mmio_spte_mask
- get_mmio_spte_generation
- mark_mmio_spte
- get_mmio_spte_gfn
- get_mmio_spte_access
- set_mmio_spte
- check_mmio_spte
- kvm_mmu_set_mask_ptes
- kvm_get_shadow_phys_bits
- kvm_mmu_reset_all_pte_masks
- is_cpuid_PSE36
- is_nx
- is_shadow_present_pte
- is_large_pte
- is_last_spte
- is_executable_pte
- spte_to_pfn
- pse36_gfn_delta
- __set_spte
- __update_clear_spte_fast
- __update_clear_spte_slow
- __get_spte_lockless
- count_spte_clear
- __set_spte
- __update_clear_spte_fast
- __update_clear_spte_slow
- __get_spte_lockless
- spte_can_locklessly_be_made_writable
- spte_has_volatile_bits
- is_accessed_spte
- is_dirty_spte
- mmu_spte_set
- mmu_spte_update_no_track
- mmu_spte_update
- mmu_spte_clear_track_bits
- mmu_spte_clear_no_track
- mmu_spte_get_lockless
- mark_spte_for_access_track
- restore_acc_track_spte
- mmu_spte_age
- walk_shadow_page_lockless_begin
- walk_shadow_page_lockless_end
- mmu_topup_memory_cache
- mmu_memory_cache_free_objects
- mmu_free_memory_cache
- mmu_topup_memory_cache_page
- mmu_free_memory_cache_page
- mmu_topup_memory_caches
- mmu_free_memory_caches
- mmu_memory_cache_alloc
- mmu_alloc_pte_list_desc
- mmu_free_pte_list_desc
- kvm_mmu_page_get_gfn
- kvm_mmu_page_set_gfn
- lpage_info_slot
- update_gfn_disallow_lpage_count
- kvm_mmu_gfn_disallow_lpage
- kvm_mmu_gfn_allow_lpage
- account_shadowed
- account_huge_nx_page
- unaccount_shadowed
- unaccount_huge_nx_page
- __mmu_gfn_lpage_is_disallowed
- mmu_gfn_lpage_is_disallowed
- host_mapping_level
- memslot_valid_for_gpte
- gfn_to_memslot_dirty_bitmap
- mapping_level
- pte_list_add
- pte_list_desc_remove_entry
- __pte_list_remove
- pte_list_remove
- __gfn_to_rmap
- gfn_to_rmap
- rmap_can_add
- rmap_add
- rmap_remove
- rmap_get_first
- rmap_get_next
- drop_spte
- __drop_large_spte
- drop_large_spte
- spte_write_protect
- __rmap_write_protect
- spte_clear_dirty
- spte_wrprot_for_clear_dirty
- __rmap_clear_dirty
- spte_set_dirty
- __rmap_set_dirty
- kvm_mmu_write_protect_pt_masked
- kvm_mmu_clear_dirty_pt_masked
- kvm_arch_mmu_enable_log_dirty_pt_masked
- kvm_arch_write_log_dirty
- kvm_mmu_slot_gfn_write_protect
- rmap_write_protect
- kvm_zap_rmapp
- kvm_unmap_rmapp
- kvm_set_pte_rmapp
- rmap_walk_init_level
- slot_rmap_walk_init
- slot_rmap_walk_okay
- slot_rmap_walk_next
- kvm_handle_hva_range
- kvm_handle_hva
- kvm_unmap_hva_range
- kvm_set_spte_hva
- kvm_age_rmapp
- kvm_test_age_rmapp
- rmap_recycle
- kvm_age_hva
- kvm_test_age_hva
- is_empty_shadow_page
- kvm_mod_used_mmu_pages
- kvm_mmu_free_page
- kvm_page_table_hashfn
- mmu_page_add_parent_pte
- mmu_page_remove_parent_pte
- drop_parent_pte
- kvm_mmu_alloc_page
- kvm_mmu_mark_parents_unsync
- mark_unsync
- nonpaging_sync_page
- nonpaging_invlpg
- nonpaging_update_pte
- mmu_pages_add
- clear_unsync_child_bit
- __mmu_unsync_walk
- mmu_unsync_walk
- kvm_unlink_unsync_page
- is_ept_sp
- __kvm_sync_page
- kvm_mmu_remote_flush_or_zap
- kvm_mmu_flush_or_zap
- kvm_mmu_audit
- mmu_audit_disable
- is_obsolete_sp
- kvm_sync_page
- kvm_sync_pages
- mmu_pages_next
- mmu_pages_first
- mmu_pages_clear_parents
- mmu_sync_children
- __clear_sp_write_flooding_count
- clear_sp_write_flooding_count
- kvm_mmu_get_page
- shadow_walk_init_using_root
- shadow_walk_init
- shadow_walk_okay
- __shadow_walk_next
- shadow_walk_next
- link_shadow_page
- validate_direct_spte
- mmu_page_zap_pte
- kvm_mmu_page_unlink_children
- kvm_mmu_unlink_parents
- mmu_zap_unsync_children
- __kvm_mmu_prepare_zap_page
- kvm_mmu_prepare_zap_page
- kvm_mmu_commit_zap_page
- prepare_zap_oldest_mmu_page
- kvm_mmu_change_mmu_pages
- kvm_mmu_unprotect_page
- kvm_unsync_page
- mmu_need_write_protect
- kvm_is_mmio_pfn
- set_spte
- mmu_set_spte
- pte_prefetch_gfn_to_pfn
- direct_pte_prefetch_many
- __direct_pte_prefetch
- direct_pte_prefetch
- disallowed_hugepage_adjust
- __direct_map
- kvm_send_hwpoison_signal
- kvm_handle_bad_page
- transparent_hugepage_adjust
- handle_abnormal_pfn
- page_fault_can_be_fast
- fast_pf_fix_direct_spte
- is_access_allowed
- fast_page_fault
- nonpaging_map
- mmu_free_root_page
- kvm_mmu_free_roots
- mmu_check_root
- mmu_alloc_direct_roots
- mmu_alloc_shadow_roots
- mmu_alloc_roots
- kvm_mmu_sync_roots
- nonpaging_gva_to_gpa
- nonpaging_gva_to_gpa_nested
- __is_rsvd_bits_set
- is_rsvd_bits_set
- is_shadow_zero_bits_set
- mmio_info_in_cache
- walk_shadow_page_get_mmio_spte
- handle_mmio_page_fault
- page_fault_handle_page_track
- shadow_page_table_clear_flood
- nonpaging_page_fault
- kvm_arch_setup_async_pf
- try_async_pf
- kvm_handle_page_fault
- check_hugepage_cache_consistency
- tdp_page_fault
- nonpaging_init_context
- cached_root_available
- fast_cr3_switch
- __kvm_mmu_new_cr3
- kvm_mmu_new_cr3
- get_cr3
- inject_page_fault
- sync_mmio_spte
- is_last_gpte
- __reset_rsvds_bits_mask
- reset_rsvds_bits_mask
- __reset_rsvds_bits_mask_ept
- reset_rsvds_bits_mask_ept
- reset_shadow_zero_bits_mask
- boot_cpu_is_amd
- reset_tdp_shadow_zero_bits_mask
- reset_ept_shadow_zero_bits_mask
- update_permission_bitmask
- update_pkru_bitmask
- update_last_nonleaf_level
- paging64_init_context_common
- paging64_init_context
- paging32_init_context
- paging32E_init_context
- kvm_calc_mmu_role_ext
- kvm_calc_mmu_role_common
- kvm_calc_tdp_mmu_root_page_role
- init_kvm_tdp_mmu
- kvm_calc_shadow_mmu_root_page_role
- kvm_init_shadow_mmu
- kvm_calc_shadow_ept_root_page_role
- kvm_init_shadow_ept_mmu
- init_kvm_softmmu
- init_kvm_nested_mmu
- kvm_init_mmu
- kvm_mmu_calc_root_page_role
- kvm_mmu_reset_context
- kvm_mmu_load
- kvm_mmu_unload
- mmu_pte_write_new_pte
- need_remote_flush
- mmu_pte_write_fetch_gpte
- detect_write_flooding
- detect_write_misaligned
- get_written_sptes
- kvm_mmu_pte_write
- kvm_mmu_unprotect_page_virt
- make_mmu_pages_available
- kvm_mmu_page_fault
- kvm_mmu_invlpg
- kvm_mmu_invpcid_gva
- kvm_enable_tdp
- kvm_disable_tdp
- slot_handle_level_range
- slot_handle_level
- slot_handle_all_level
- slot_handle_large_level
- slot_handle_leaf
- free_mmu_pages
- alloc_mmu_pages
- kvm_mmu_create
- kvm_zap_obsolete_pages
- kvm_mmu_zap_all_fast
- kvm_has_zapped_obsolete_pages
- kvm_mmu_invalidate_zap_pages_in_memslot
- kvm_mmu_init_vm
- kvm_mmu_uninit_vm
- kvm_zap_gfn_range
- slot_rmap_write_protect
- kvm_mmu_slot_remove_write_access
- kvm_mmu_zap_collapsible_spte
- kvm_mmu_zap_collapsible_sptes
- kvm_mmu_slot_leaf_clear_dirty
- kvm_mmu_slot_largepage_remove_write_access
- kvm_mmu_slot_set_dirty
- kvm_mmu_zap_all
- kvm_mmu_invalidate_mmio_sptes
- mmu_shrink_scan
- mmu_shrink_count
- mmu_destroy_caches
- kvm_set_mmio_spte_mask
- get_nx_auto_mode
- __set_nx_huge_pages
- set_nx_huge_pages
- kvm_mmu_module_init
- kvm_mmu_calculate_default_mmu_pages
- kvm_mmu_destroy
- kvm_mmu_module_exit
- set_nx_huge_pages_recovery_ratio
- kvm_recover_nx_lpages
- get_nx_lpage_recovery_timeout
- kvm_nx_lpage_recovery_worker
- kvm_mmu_post_init_vm
- kvm_mmu_pre_destroy_vm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 #include "irq.h"
19 #include "mmu.h"
20 #include "x86.h"
21 #include "kvm_cache_regs.h"
22 #include "cpuid.h"
23
24 #include <linux/kvm_host.h>
25 #include <linux/types.h>
26 #include <linux/string.h>
27 #include <linux/mm.h>
28 #include <linux/highmem.h>
29 #include <linux/moduleparam.h>
30 #include <linux/export.h>
31 #include <linux/swap.h>
32 #include <linux/hugetlb.h>
33 #include <linux/compiler.h>
34 #include <linux/srcu.h>
35 #include <linux/slab.h>
36 #include <linux/sched/signal.h>
37 #include <linux/uaccess.h>
38 #include <linux/hash.h>
39 #include <linux/kern_levels.h>
40 #include <linux/kthread.h>
41
42 #include <asm/page.h>
43 #include <asm/pat.h>
44 #include <asm/cmpxchg.h>
45 #include <asm/e820/api.h>
46 #include <asm/io.h>
47 #include <asm/vmx.h>
48 #include <asm/kvm_page_track.h>
49 #include "trace.h"
50
51 extern bool itlb_multihit_kvm_mitigation;
52
53 static int __read_mostly nx_huge_pages = -1;
54 #ifdef CONFIG_PREEMPT_RT
55
56 static uint __read_mostly nx_huge_pages_recovery_ratio = 0;
57 #else
58 static uint __read_mostly nx_huge_pages_recovery_ratio = 60;
59 #endif
60
61 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
62 static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp);
63
64 static struct kernel_param_ops nx_huge_pages_ops = {
65 .set = set_nx_huge_pages,
66 .get = param_get_bool,
67 };
68
69 static struct kernel_param_ops nx_huge_pages_recovery_ratio_ops = {
70 .set = set_nx_huge_pages_recovery_ratio,
71 .get = param_get_uint,
72 };
73
74 module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, 0644);
75 __MODULE_PARM_TYPE(nx_huge_pages, "bool");
76 module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_ratio_ops,
77 &nx_huge_pages_recovery_ratio, 0644);
78 __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
79
80
81
82
83
84
85
86
87 bool tdp_enabled = false;
88
89 enum {
90 AUDIT_PRE_PAGE_FAULT,
91 AUDIT_POST_PAGE_FAULT,
92 AUDIT_PRE_PTE_WRITE,
93 AUDIT_POST_PTE_WRITE,
94 AUDIT_PRE_SYNC,
95 AUDIT_POST_SYNC
96 };
97
98 #undef MMU_DEBUG
99
100 #ifdef MMU_DEBUG
101 static bool dbg = 0;
102 module_param(dbg, bool, 0644);
103
104 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
105 #define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
106 #define MMU_WARN_ON(x) WARN_ON(x)
107 #else
108 #define pgprintk(x...) do { } while (0)
109 #define rmap_printk(x...) do { } while (0)
110 #define MMU_WARN_ON(x) do { } while (0)
111 #endif
112
113 #define PTE_PREFETCH_NUM 8
114
115 #define PT_FIRST_AVAIL_BITS_SHIFT 10
116 #define PT64_SECOND_AVAIL_BITS_SHIFT 54
117
118
119
120
121
122 #define SPTE_SPECIAL_MASK (3ULL << 52)
123 #define SPTE_AD_ENABLED_MASK (0ULL << 52)
124 #define SPTE_AD_DISABLED_MASK (1ULL << 52)
125 #define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
126 #define SPTE_MMIO_MASK (3ULL << 52)
127
128 #define PT64_LEVEL_BITS 9
129
130 #define PT64_LEVEL_SHIFT(level) \
131 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
132
133 #define PT64_INDEX(address, level)\
134 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
135
136
137 #define PT32_LEVEL_BITS 10
138
139 #define PT32_LEVEL_SHIFT(level) \
140 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
141
142 #define PT32_LVL_OFFSET_MASK(level) \
143 (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
144 * PT32_LEVEL_BITS))) - 1))
145
146 #define PT32_INDEX(address, level)\
147 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
148
149
150 #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
151 #define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
152 #else
153 #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
154 #endif
155 #define PT64_LVL_ADDR_MASK(level) \
156 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
157 * PT64_LEVEL_BITS))) - 1))
158 #define PT64_LVL_OFFSET_MASK(level) \
159 (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
160 * PT64_LEVEL_BITS))) - 1))
161
162 #define PT32_BASE_ADDR_MASK PAGE_MASK
163 #define PT32_DIR_BASE_ADDR_MASK \
164 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
165 #define PT32_LVL_ADDR_MASK(level) \
166 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
167 * PT32_LEVEL_BITS))) - 1))
168
169 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
170 | shadow_x_mask | shadow_nx_mask | shadow_me_mask)
171
172 #define ACC_EXEC_MASK 1
173 #define ACC_WRITE_MASK PT_WRITABLE_MASK
174 #define ACC_USER_MASK PT_USER_MASK
175 #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
176
177
178 #define PT64_EPT_READABLE_MASK 0x1ull
179 #define PT64_EPT_EXECUTABLE_MASK 0x4ull
180
181 #include <trace/events/kvm.h>
182
183 #define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
184 #define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
185
186 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
187
188
189 #define PTE_LIST_EXT 3
190
191
192
193
194
195
196
197
198
199 enum {
200 RET_PF_RETRY = 0,
201 RET_PF_EMULATE = 1,
202 RET_PF_INVALID = 2,
203 };
204
205 struct pte_list_desc {
206 u64 *sptes[PTE_LIST_EXT];
207 struct pte_list_desc *more;
208 };
209
210 struct kvm_shadow_walk_iterator {
211 u64 addr;
212 hpa_t shadow_addr;
213 u64 *sptep;
214 int level;
215 unsigned index;
216 };
217
218 static const union kvm_mmu_page_role mmu_base_role_mask = {
219 .cr0_wp = 1,
220 .gpte_is_8_bytes = 1,
221 .nxe = 1,
222 .smep_andnot_wp = 1,
223 .smap_andnot_wp = 1,
224 .smm = 1,
225 .guest_mode = 1,
226 .ad_disabled = 1,
227 };
228
229 #define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker) \
230 for (shadow_walk_init_using_root(&(_walker), (_vcpu), \
231 (_root), (_addr)); \
232 shadow_walk_okay(&(_walker)); \
233 shadow_walk_next(&(_walker)))
234
235 #define for_each_shadow_entry(_vcpu, _addr, _walker) \
236 for (shadow_walk_init(&(_walker), _vcpu, _addr); \
237 shadow_walk_okay(&(_walker)); \
238 shadow_walk_next(&(_walker)))
239
240 #define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \
241 for (shadow_walk_init(&(_walker), _vcpu, _addr); \
242 shadow_walk_okay(&(_walker)) && \
243 ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \
244 __shadow_walk_next(&(_walker), spte))
245
246 static struct kmem_cache *pte_list_desc_cache;
247 static struct kmem_cache *mmu_page_header_cache;
248 static struct percpu_counter kvm_total_used_mmu_pages;
249
250 static u64 __read_mostly shadow_nx_mask;
251 static u64 __read_mostly shadow_x_mask;
252 static u64 __read_mostly shadow_user_mask;
253 static u64 __read_mostly shadow_accessed_mask;
254 static u64 __read_mostly shadow_dirty_mask;
255 static u64 __read_mostly shadow_mmio_mask;
256 static u64 __read_mostly shadow_mmio_value;
257 static u64 __read_mostly shadow_mmio_access_mask;
258 static u64 __read_mostly shadow_present_mask;
259 static u64 __read_mostly shadow_me_mask;
260
261
262
263
264
265
266 static u64 __read_mostly shadow_acc_track_mask;
267
268
269
270
271
272
273
274 static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
275 PT64_EPT_EXECUTABLE_MASK;
276 static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
277
278
279
280
281
282 static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
283
284
285
286
287 static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
288
289
290
291
292
293
294
295
296
297 static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
298
299
300
301
302
303 static u8 __read_mostly shadow_phys_bits;
304
305 static void mmu_spte_set(u64 *sptep, u64 spte);
306 static bool is_executable_pte(u64 spte);
307 static union kvm_mmu_page_role
308 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
309
310 #define CREATE_TRACE_POINTS
311 #include "mmutrace.h"
312
313
314 static inline bool kvm_available_flush_tlb_with_range(void)
315 {
316 return kvm_x86_ops->tlb_remote_flush_with_range;
317 }
318
319 static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
320 struct kvm_tlb_range *range)
321 {
322 int ret = -ENOTSUPP;
323
324 if (range && kvm_x86_ops->tlb_remote_flush_with_range)
325 ret = kvm_x86_ops->tlb_remote_flush_with_range(kvm, range);
326
327 if (ret)
328 kvm_flush_remote_tlbs(kvm);
329 }
330
331 static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
332 u64 start_gfn, u64 pages)
333 {
334 struct kvm_tlb_range range;
335
336 range.start_gfn = start_gfn;
337 range.pages = pages;
338
339 kvm_flush_remote_tlbs_with_range(kvm, &range);
340 }
341
342 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask)
343 {
344 BUG_ON((u64)(unsigned)access_mask != access_mask);
345 BUG_ON((mmio_mask & mmio_value) != mmio_value);
346 WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len));
347 WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
348 shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
349 shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
350 shadow_mmio_access_mask = access_mask;
351 }
352 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
353
354 static bool is_mmio_spte(u64 spte)
355 {
356 return (spte & shadow_mmio_mask) == shadow_mmio_value;
357 }
358
359 static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
360 {
361 return sp->role.ad_disabled;
362 }
363
364 static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
365 {
366
367
368
369
370
371
372 return vcpu->arch.mmu == &vcpu->arch.guest_mmu;
373 }
374
375 static inline bool spte_ad_enabled(u64 spte)
376 {
377 MMU_WARN_ON(is_mmio_spte(spte));
378 return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK;
379 }
380
381 static inline bool spte_ad_need_write_protect(u64 spte)
382 {
383 MMU_WARN_ON(is_mmio_spte(spte));
384 return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
385 }
386
387 static bool is_nx_huge_page_enabled(void)
388 {
389 return READ_ONCE(nx_huge_pages);
390 }
391
392 static inline u64 spte_shadow_accessed_mask(u64 spte)
393 {
394 MMU_WARN_ON(is_mmio_spte(spte));
395 return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
396 }
397
398 static inline u64 spte_shadow_dirty_mask(u64 spte)
399 {
400 MMU_WARN_ON(is_mmio_spte(spte));
401 return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
402 }
403
404 static inline bool is_access_track_spte(u64 spte)
405 {
406 return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
407 }
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423 #define MMIO_SPTE_GEN_MASK GENMASK_ULL(17, 0)
424
425 #define MMIO_SPTE_GEN_LOW_START 3
426 #define MMIO_SPTE_GEN_LOW_END 11
427 #define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
428 MMIO_SPTE_GEN_LOW_START)
429
430 #define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT
431 #define MMIO_SPTE_GEN_HIGH_END 62
432 #define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
433 MMIO_SPTE_GEN_HIGH_START)
434
435 static u64 generation_mmio_spte_mask(u64 gen)
436 {
437 u64 mask;
438
439 WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
440 BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);
441
442 mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
443 mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
444 return mask;
445 }
446
447 static u64 get_mmio_spte_generation(u64 spte)
448 {
449 u64 gen;
450
451 gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
452 gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
453 return gen;
454 }
455
456 static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
457 unsigned access)
458 {
459 u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
460 u64 mask = generation_mmio_spte_mask(gen);
461 u64 gpa = gfn << PAGE_SHIFT;
462
463 access &= shadow_mmio_access_mask;
464 mask |= shadow_mmio_value | access;
465 mask |= gpa | shadow_nonpresent_or_rsvd_mask;
466 mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
467 << shadow_nonpresent_or_rsvd_mask_len;
468
469 trace_mark_mmio_spte(sptep, gfn, access, gen);
470 mmu_spte_set(sptep, mask);
471 }
472
473 static gfn_t get_mmio_spte_gfn(u64 spte)
474 {
475 u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
476
477 gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
478 & shadow_nonpresent_or_rsvd_mask;
479
480 return gpa >> PAGE_SHIFT;
481 }
482
483 static unsigned get_mmio_spte_access(u64 spte)
484 {
485 return spte & shadow_mmio_access_mask;
486 }
487
488 static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
489 kvm_pfn_t pfn, unsigned access)
490 {
491 if (unlikely(is_noslot_pfn(pfn))) {
492 mark_mmio_spte(vcpu, sptep, gfn, access);
493 return true;
494 }
495
496 return false;
497 }
498
499 static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
500 {
501 u64 kvm_gen, spte_gen, gen;
502
503 gen = kvm_vcpu_memslots(vcpu)->generation;
504 if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
505 return false;
506
507 kvm_gen = gen & MMIO_SPTE_GEN_MASK;
508 spte_gen = get_mmio_spte_generation(spte);
509
510 trace_check_mmio_spte(spte, kvm_gen, spte_gen);
511 return likely(kvm_gen == spte_gen);
512 }
513
514
515
516
517
518
519
520
521 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
522 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
523 u64 acc_track_mask, u64 me_mask)
524 {
525 BUG_ON(!dirty_mask != !accessed_mask);
526 BUG_ON(!accessed_mask && !acc_track_mask);
527 BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK);
528
529 shadow_user_mask = user_mask;
530 shadow_accessed_mask = accessed_mask;
531 shadow_dirty_mask = dirty_mask;
532 shadow_nx_mask = nx_mask;
533 shadow_x_mask = x_mask;
534 shadow_present_mask = p_mask;
535 shadow_acc_track_mask = acc_track_mask;
536 shadow_me_mask = me_mask;
537 }
538 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
539
540 static u8 kvm_get_shadow_phys_bits(void)
541 {
542
543
544
545
546
547
548 if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
549 return cpuid_eax(0x80000008) & 0xff;
550
551
552
553
554
555
556 return boot_cpu_data.x86_phys_bits;
557 }
558
559 static void kvm_mmu_reset_all_pte_masks(void)
560 {
561 u8 low_phys_bits;
562
563 shadow_user_mask = 0;
564 shadow_accessed_mask = 0;
565 shadow_dirty_mask = 0;
566 shadow_nx_mask = 0;
567 shadow_x_mask = 0;
568 shadow_mmio_mask = 0;
569 shadow_present_mask = 0;
570 shadow_acc_track_mask = 0;
571
572 shadow_phys_bits = kvm_get_shadow_phys_bits();
573
574
575
576
577
578
579
580
581
582
583
584 shadow_nonpresent_or_rsvd_mask = 0;
585 low_phys_bits = boot_cpu_data.x86_phys_bits;
586 if (boot_cpu_has_bug(X86_BUG_L1TF) &&
587 !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
588 52 - shadow_nonpresent_or_rsvd_mask_len)) {
589 low_phys_bits = boot_cpu_data.x86_cache_bits
590 - shadow_nonpresent_or_rsvd_mask_len;
591 shadow_nonpresent_or_rsvd_mask =
592 rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
593 }
594
595 shadow_nonpresent_or_rsvd_lower_gfn_mask =
596 GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
597 }
598
599 static int is_cpuid_PSE36(void)
600 {
601 return 1;
602 }
603
604 static int is_nx(struct kvm_vcpu *vcpu)
605 {
606 return vcpu->arch.efer & EFER_NX;
607 }
608
609 static int is_shadow_present_pte(u64 pte)
610 {
611 return (pte != 0) && !is_mmio_spte(pte);
612 }
613
614 static int is_large_pte(u64 pte)
615 {
616 return pte & PT_PAGE_SIZE_MASK;
617 }
618
619 static int is_last_spte(u64 pte, int level)
620 {
621 if (level == PT_PAGE_TABLE_LEVEL)
622 return 1;
623 if (is_large_pte(pte))
624 return 1;
625 return 0;
626 }
627
628 static bool is_executable_pte(u64 spte)
629 {
630 return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
631 }
632
633 static kvm_pfn_t spte_to_pfn(u64 pte)
634 {
635 return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
636 }
637
638 static gfn_t pse36_gfn_delta(u32 gpte)
639 {
640 int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
641
642 return (gpte & PT32_DIR_PSE36_MASK) << shift;
643 }
644
645 #ifdef CONFIG_X86_64
646 static void __set_spte(u64 *sptep, u64 spte)
647 {
648 WRITE_ONCE(*sptep, spte);
649 }
650
651 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
652 {
653 WRITE_ONCE(*sptep, spte);
654 }
655
656 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
657 {
658 return xchg(sptep, spte);
659 }
660
661 static u64 __get_spte_lockless(u64 *sptep)
662 {
663 return READ_ONCE(*sptep);
664 }
665 #else
666 union split_spte {
667 struct {
668 u32 spte_low;
669 u32 spte_high;
670 };
671 u64 spte;
672 };
673
674 static void count_spte_clear(u64 *sptep, u64 spte)
675 {
676 struct kvm_mmu_page *sp = page_header(__pa(sptep));
677
678 if (is_shadow_present_pte(spte))
679 return;
680
681
682 smp_wmb();
683 sp->clear_spte_count++;
684 }
685
686 static void __set_spte(u64 *sptep, u64 spte)
687 {
688 union split_spte *ssptep, sspte;
689
690 ssptep = (union split_spte *)sptep;
691 sspte = (union split_spte)spte;
692
693 ssptep->spte_high = sspte.spte_high;
694
695
696
697
698
699
700 smp_wmb();
701
702 WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
703 }
704
705 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
706 {
707 union split_spte *ssptep, sspte;
708
709 ssptep = (union split_spte *)sptep;
710 sspte = (union split_spte)spte;
711
712 WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
713
714
715
716
717
718 smp_wmb();
719
720 ssptep->spte_high = sspte.spte_high;
721 count_spte_clear(sptep, spte);
722 }
723
724 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
725 {
726 union split_spte *ssptep, sspte, orig;
727
728 ssptep = (union split_spte *)sptep;
729 sspte = (union split_spte)spte;
730
731
732 orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
733 orig.spte_high = ssptep->spte_high;
734 ssptep->spte_high = sspte.spte_high;
735 count_spte_clear(sptep, spte);
736
737 return orig.spte;
738 }
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758 static u64 __get_spte_lockless(u64 *sptep)
759 {
760 struct kvm_mmu_page *sp = page_header(__pa(sptep));
761 union split_spte spte, *orig = (union split_spte *)sptep;
762 int count;
763
764 retry:
765 count = sp->clear_spte_count;
766 smp_rmb();
767
768 spte.spte_low = orig->spte_low;
769 smp_rmb();
770
771 spte.spte_high = orig->spte_high;
772 smp_rmb();
773
774 if (unlikely(spte.spte_low != orig->spte_low ||
775 count != sp->clear_spte_count))
776 goto retry;
777
778 return spte.spte;
779 }
780 #endif
781
782 static bool spte_can_locklessly_be_made_writable(u64 spte)
783 {
784 return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
785 (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
786 }
787
788 static bool spte_has_volatile_bits(u64 spte)
789 {
790 if (!is_shadow_present_pte(spte))
791 return false;
792
793
794
795
796
797
798
799 if (spte_can_locklessly_be_made_writable(spte) ||
800 is_access_track_spte(spte))
801 return true;
802
803 if (spte_ad_enabled(spte)) {
804 if ((spte & shadow_accessed_mask) == 0 ||
805 (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
806 return true;
807 }
808
809 return false;
810 }
811
812 static bool is_accessed_spte(u64 spte)
813 {
814 u64 accessed_mask = spte_shadow_accessed_mask(spte);
815
816 return accessed_mask ? spte & accessed_mask
817 : !is_access_track_spte(spte);
818 }
819
820 static bool is_dirty_spte(u64 spte)
821 {
822 u64 dirty_mask = spte_shadow_dirty_mask(spte);
823
824 return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
825 }
826
827
828
829
830
831
832
833 static void mmu_spte_set(u64 *sptep, u64 new_spte)
834 {
835 WARN_ON(is_shadow_present_pte(*sptep));
836 __set_spte(sptep, new_spte);
837 }
838
839
840
841
842
843 static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
844 {
845 u64 old_spte = *sptep;
846
847 WARN_ON(!is_shadow_present_pte(new_spte));
848
849 if (!is_shadow_present_pte(old_spte)) {
850 mmu_spte_set(sptep, new_spte);
851 return old_spte;
852 }
853
854 if (!spte_has_volatile_bits(old_spte))
855 __update_clear_spte_fast(sptep, new_spte);
856 else
857 old_spte = __update_clear_spte_slow(sptep, new_spte);
858
859 WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
860
861 return old_spte;
862 }
863
864
865
866
867
868
869
870
871
872
873
874
875 static bool mmu_spte_update(u64 *sptep, u64 new_spte)
876 {
877 bool flush = false;
878 u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
879
880 if (!is_shadow_present_pte(old_spte))
881 return false;
882
883
884
885
886
887
888 if (spte_can_locklessly_be_made_writable(old_spte) &&
889 !is_writable_pte(new_spte))
890 flush = true;
891
892
893
894
895
896
897 if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
898 flush = true;
899 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
900 }
901
902 if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
903 flush = true;
904 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
905 }
906
907 return flush;
908 }
909
910
911
912
913
914
915
916 static int mmu_spte_clear_track_bits(u64 *sptep)
917 {
918 kvm_pfn_t pfn;
919 u64 old_spte = *sptep;
920
921 if (!spte_has_volatile_bits(old_spte))
922 __update_clear_spte_fast(sptep, 0ull);
923 else
924 old_spte = __update_clear_spte_slow(sptep, 0ull);
925
926 if (!is_shadow_present_pte(old_spte))
927 return 0;
928
929 pfn = spte_to_pfn(old_spte);
930
931
932
933
934
935
936 WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
937
938 if (is_accessed_spte(old_spte))
939 kvm_set_pfn_accessed(pfn);
940
941 if (is_dirty_spte(old_spte))
942 kvm_set_pfn_dirty(pfn);
943
944 return 1;
945 }
946
947
948
949
950
951
952 static void mmu_spte_clear_no_track(u64 *sptep)
953 {
954 __update_clear_spte_fast(sptep, 0ull);
955 }
956
957 static u64 mmu_spte_get_lockless(u64 *sptep)
958 {
959 return __get_spte_lockless(sptep);
960 }
961
962 static u64 mark_spte_for_access_track(u64 spte)
963 {
964 if (spte_ad_enabled(spte))
965 return spte & ~shadow_accessed_mask;
966
967 if (is_access_track_spte(spte))
968 return spte;
969
970
971
972
973
974
975 WARN_ONCE((spte & PT_WRITABLE_MASK) &&
976 !spte_can_locklessly_be_made_writable(spte),
977 "kvm: Writable SPTE is not locklessly dirty-trackable\n");
978
979 WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
980 shadow_acc_track_saved_bits_shift),
981 "kvm: Access Tracking saved bit locations are not zero\n");
982
983 spte |= (spte & shadow_acc_track_saved_bits_mask) <<
984 shadow_acc_track_saved_bits_shift;
985 spte &= ~shadow_acc_track_mask;
986
987 return spte;
988 }
989
990
991 static u64 restore_acc_track_spte(u64 spte)
992 {
993 u64 new_spte = spte;
994 u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
995 & shadow_acc_track_saved_bits_mask;
996
997 WARN_ON_ONCE(spte_ad_enabled(spte));
998 WARN_ON_ONCE(!is_access_track_spte(spte));
999
1000 new_spte &= ~shadow_acc_track_mask;
1001 new_spte &= ~(shadow_acc_track_saved_bits_mask <<
1002 shadow_acc_track_saved_bits_shift);
1003 new_spte |= saved_bits;
1004
1005 return new_spte;
1006 }
1007
1008
1009 static bool mmu_spte_age(u64 *sptep)
1010 {
1011 u64 spte = mmu_spte_get_lockless(sptep);
1012
1013 if (!is_accessed_spte(spte))
1014 return false;
1015
1016 if (spte_ad_enabled(spte)) {
1017 clear_bit((ffs(shadow_accessed_mask) - 1),
1018 (unsigned long *)sptep);
1019 } else {
1020
1021
1022
1023
1024 if (is_writable_pte(spte))
1025 kvm_set_pfn_dirty(spte_to_pfn(spte));
1026
1027 spte = mark_spte_for_access_track(spte);
1028 mmu_spte_update_no_track(sptep, spte);
1029 }
1030
1031 return true;
1032 }
1033
1034 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
1035 {
1036
1037
1038
1039
1040 local_irq_disable();
1041
1042
1043
1044
1045
1046 smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
1047 }
1048
1049 static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
1050 {
1051
1052
1053
1054
1055
1056 smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
1057 local_irq_enable();
1058 }
1059
1060 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
1061 struct kmem_cache *base_cache, int min)
1062 {
1063 void *obj;
1064
1065 if (cache->nobjs >= min)
1066 return 0;
1067 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
1068 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
1069 if (!obj)
1070 return cache->nobjs >= min ? 0 : -ENOMEM;
1071 cache->objects[cache->nobjs++] = obj;
1072 }
1073 return 0;
1074 }
1075
1076 static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
1077 {
1078 return cache->nobjs;
1079 }
1080
1081 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
1082 struct kmem_cache *cache)
1083 {
1084 while (mc->nobjs)
1085 kmem_cache_free(cache, mc->objects[--mc->nobjs]);
1086 }
1087
1088 static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
1089 int min)
1090 {
1091 void *page;
1092
1093 if (cache->nobjs >= min)
1094 return 0;
1095 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
1096 page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
1097 if (!page)
1098 return cache->nobjs >= min ? 0 : -ENOMEM;
1099 cache->objects[cache->nobjs++] = page;
1100 }
1101 return 0;
1102 }
1103
1104 static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
1105 {
1106 while (mc->nobjs)
1107 free_page((unsigned long)mc->objects[--mc->nobjs]);
1108 }
1109
1110 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
1111 {
1112 int r;
1113
1114 r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
1115 pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
1116 if (r)
1117 goto out;
1118 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
1119 if (r)
1120 goto out;
1121 r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
1122 mmu_page_header_cache, 4);
1123 out:
1124 return r;
1125 }
1126
1127 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
1128 {
1129 mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
1130 pte_list_desc_cache);
1131 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
1132 mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
1133 mmu_page_header_cache);
1134 }
1135
1136 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
1137 {
1138 void *p;
1139
1140 BUG_ON(!mc->nobjs);
1141 p = mc->objects[--mc->nobjs];
1142 return p;
1143 }
1144
1145 static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
1146 {
1147 return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
1148 }
1149
1150 static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
1151 {
1152 kmem_cache_free(pte_list_desc_cache, pte_list_desc);
1153 }
1154
1155 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
1156 {
1157 if (!sp->role.direct)
1158 return sp->gfns[index];
1159
1160 return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
1161 }
1162
1163 static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
1164 {
1165 if (!sp->role.direct) {
1166 sp->gfns[index] = gfn;
1167 return;
1168 }
1169
1170 if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
1171 pr_err_ratelimited("gfn mismatch under direct page %llx "
1172 "(expected %llx, got %llx)\n",
1173 sp->gfn,
1174 kvm_mmu_page_get_gfn(sp, index), gfn);
1175 }
1176
1177
1178
1179
1180
1181 static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
1182 struct kvm_memory_slot *slot,
1183 int level)
1184 {
1185 unsigned long idx;
1186
1187 idx = gfn_to_index(gfn, slot->base_gfn, level);
1188 return &slot->arch.lpage_info[level - 2][idx];
1189 }
1190
1191 static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
1192 gfn_t gfn, int count)
1193 {
1194 struct kvm_lpage_info *linfo;
1195 int i;
1196
1197 for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1198 linfo = lpage_info_slot(gfn, slot, i);
1199 linfo->disallow_lpage += count;
1200 WARN_ON(linfo->disallow_lpage < 0);
1201 }
1202 }
1203
1204 void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
1205 {
1206 update_gfn_disallow_lpage_count(slot, gfn, 1);
1207 }
1208
1209 void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
1210 {
1211 update_gfn_disallow_lpage_count(slot, gfn, -1);
1212 }
1213
1214 static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
1215 {
1216 struct kvm_memslots *slots;
1217 struct kvm_memory_slot *slot;
1218 gfn_t gfn;
1219
1220 kvm->arch.indirect_shadow_pages++;
1221 gfn = sp->gfn;
1222 slots = kvm_memslots_for_spte_role(kvm, sp->role);
1223 slot = __gfn_to_memslot(slots, gfn);
1224
1225
1226 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
1227 return kvm_slot_page_track_add_page(kvm, slot, gfn,
1228 KVM_PAGE_TRACK_WRITE);
1229
1230 kvm_mmu_gfn_disallow_lpage(slot, gfn);
1231 }
1232
1233 static void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1234 {
1235 if (sp->lpage_disallowed)
1236 return;
1237
1238 ++kvm->stat.nx_lpage_splits;
1239 list_add_tail(&sp->lpage_disallowed_link,
1240 &kvm->arch.lpage_disallowed_mmu_pages);
1241 sp->lpage_disallowed = true;
1242 }
1243
1244 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
1245 {
1246 struct kvm_memslots *slots;
1247 struct kvm_memory_slot *slot;
1248 gfn_t gfn;
1249
1250 kvm->arch.indirect_shadow_pages--;
1251 gfn = sp->gfn;
1252 slots = kvm_memslots_for_spte_role(kvm, sp->role);
1253 slot = __gfn_to_memslot(slots, gfn);
1254 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
1255 return kvm_slot_page_track_remove_page(kvm, slot, gfn,
1256 KVM_PAGE_TRACK_WRITE);
1257
1258 kvm_mmu_gfn_allow_lpage(slot, gfn);
1259 }
1260
1261 static void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1262 {
1263 --kvm->stat.nx_lpage_splits;
1264 sp->lpage_disallowed = false;
1265 list_del(&sp->lpage_disallowed_link);
1266 }
1267
1268 static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
1269 struct kvm_memory_slot *slot)
1270 {
1271 struct kvm_lpage_info *linfo;
1272
1273 if (slot) {
1274 linfo = lpage_info_slot(gfn, slot, level);
1275 return !!linfo->disallow_lpage;
1276 }
1277
1278 return true;
1279 }
1280
1281 static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
1282 int level)
1283 {
1284 struct kvm_memory_slot *slot;
1285
1286 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1287 return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
1288 }
1289
1290 static int host_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn)
1291 {
1292 unsigned long page_size;
1293 int i, ret = 0;
1294
1295 page_size = kvm_host_page_size(vcpu, gfn);
1296
1297 for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1298 if (page_size >= KVM_HPAGE_SIZE(i))
1299 ret = i;
1300 else
1301 break;
1302 }
1303
1304 return ret;
1305 }
1306
1307 static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
1308 bool no_dirty_log)
1309 {
1310 if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
1311 return false;
1312 if (no_dirty_log && slot->dirty_bitmap)
1313 return false;
1314
1315 return true;
1316 }
1317
1318 static struct kvm_memory_slot *
1319 gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
1320 bool no_dirty_log)
1321 {
1322 struct kvm_memory_slot *slot;
1323
1324 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1325 if (!memslot_valid_for_gpte(slot, no_dirty_log))
1326 slot = NULL;
1327
1328 return slot;
1329 }
1330
1331 static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
1332 bool *force_pt_level)
1333 {
1334 int host_level, level, max_level;
1335 struct kvm_memory_slot *slot;
1336
1337 if (unlikely(*force_pt_level))
1338 return PT_PAGE_TABLE_LEVEL;
1339
1340 slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
1341 *force_pt_level = !memslot_valid_for_gpte(slot, true);
1342 if (unlikely(*force_pt_level))
1343 return PT_PAGE_TABLE_LEVEL;
1344
1345 host_level = host_mapping_level(vcpu, large_gfn);
1346
1347 if (host_level == PT_PAGE_TABLE_LEVEL)
1348 return host_level;
1349
1350 max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
1351
1352 for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
1353 if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
1354 break;
1355
1356 return level - 1;
1357 }
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370 static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
1371 struct kvm_rmap_head *rmap_head)
1372 {
1373 struct pte_list_desc *desc;
1374 int i, count = 0;
1375
1376 if (!rmap_head->val) {
1377 rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
1378 rmap_head->val = (unsigned long)spte;
1379 } else if (!(rmap_head->val & 1)) {
1380 rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
1381 desc = mmu_alloc_pte_list_desc(vcpu);
1382 desc->sptes[0] = (u64 *)rmap_head->val;
1383 desc->sptes[1] = spte;
1384 rmap_head->val = (unsigned long)desc | 1;
1385 ++count;
1386 } else {
1387 rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
1388 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1389 while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
1390 desc = desc->more;
1391 count += PTE_LIST_EXT;
1392 }
1393 if (desc->sptes[PTE_LIST_EXT-1]) {
1394 desc->more = mmu_alloc_pte_list_desc(vcpu);
1395 desc = desc->more;
1396 }
1397 for (i = 0; desc->sptes[i]; ++i)
1398 ++count;
1399 desc->sptes[i] = spte;
1400 }
1401 return count;
1402 }
1403
1404 static void
1405 pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
1406 struct pte_list_desc *desc, int i,
1407 struct pte_list_desc *prev_desc)
1408 {
1409 int j;
1410
1411 for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
1412 ;
1413 desc->sptes[i] = desc->sptes[j];
1414 desc->sptes[j] = NULL;
1415 if (j != 0)
1416 return;
1417 if (!prev_desc && !desc->more)
1418 rmap_head->val = (unsigned long)desc->sptes[0];
1419 else
1420 if (prev_desc)
1421 prev_desc->more = desc->more;
1422 else
1423 rmap_head->val = (unsigned long)desc->more | 1;
1424 mmu_free_pte_list_desc(desc);
1425 }
1426
1427 static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
1428 {
1429 struct pte_list_desc *desc;
1430 struct pte_list_desc *prev_desc;
1431 int i;
1432
1433 if (!rmap_head->val) {
1434 pr_err("%s: %p 0->BUG\n", __func__, spte);
1435 BUG();
1436 } else if (!(rmap_head->val & 1)) {
1437 rmap_printk("%s: %p 1->0\n", __func__, spte);
1438 if ((u64 *)rmap_head->val != spte) {
1439 pr_err("%s: %p 1->BUG\n", __func__, spte);
1440 BUG();
1441 }
1442 rmap_head->val = 0;
1443 } else {
1444 rmap_printk("%s: %p many->many\n", __func__, spte);
1445 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1446 prev_desc = NULL;
1447 while (desc) {
1448 for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
1449 if (desc->sptes[i] == spte) {
1450 pte_list_desc_remove_entry(rmap_head,
1451 desc, i, prev_desc);
1452 return;
1453 }
1454 }
1455 prev_desc = desc;
1456 desc = desc->more;
1457 }
1458 pr_err("%s: %p many->many\n", __func__, spte);
1459 BUG();
1460 }
1461 }
1462
1463 static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep)
1464 {
1465 mmu_spte_clear_track_bits(sptep);
1466 __pte_list_remove(sptep, rmap_head);
1467 }
1468
1469 static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
1470 struct kvm_memory_slot *slot)
1471 {
1472 unsigned long idx;
1473
1474 idx = gfn_to_index(gfn, slot->base_gfn, level);
1475 return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
1476 }
1477
1478 static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
1479 struct kvm_mmu_page *sp)
1480 {
1481 struct kvm_memslots *slots;
1482 struct kvm_memory_slot *slot;
1483
1484 slots = kvm_memslots_for_spte_role(kvm, sp->role);
1485 slot = __gfn_to_memslot(slots, gfn);
1486 return __gfn_to_rmap(gfn, sp->role.level, slot);
1487 }
1488
1489 static bool rmap_can_add(struct kvm_vcpu *vcpu)
1490 {
1491 struct kvm_mmu_memory_cache *cache;
1492
1493 cache = &vcpu->arch.mmu_pte_list_desc_cache;
1494 return mmu_memory_cache_free_objects(cache);
1495 }
1496
1497 static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
1498 {
1499 struct kvm_mmu_page *sp;
1500 struct kvm_rmap_head *rmap_head;
1501
1502 sp = page_header(__pa(spte));
1503 kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
1504 rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
1505 return pte_list_add(vcpu, spte, rmap_head);
1506 }
1507
1508 static void rmap_remove(struct kvm *kvm, u64 *spte)
1509 {
1510 struct kvm_mmu_page *sp;
1511 gfn_t gfn;
1512 struct kvm_rmap_head *rmap_head;
1513
1514 sp = page_header(__pa(spte));
1515 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
1516 rmap_head = gfn_to_rmap(kvm, gfn, sp);
1517 __pte_list_remove(spte, rmap_head);
1518 }
1519
1520
1521
1522
1523
1524 struct rmap_iterator {
1525
1526 struct pte_list_desc *desc;
1527 int pos;
1528 };
1529
1530
1531
1532
1533
1534
1535
1536
1537 static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
1538 struct rmap_iterator *iter)
1539 {
1540 u64 *sptep;
1541
1542 if (!rmap_head->val)
1543 return NULL;
1544
1545 if (!(rmap_head->val & 1)) {
1546 iter->desc = NULL;
1547 sptep = (u64 *)rmap_head->val;
1548 goto out;
1549 }
1550
1551 iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1552 iter->pos = 0;
1553 sptep = iter->desc->sptes[iter->pos];
1554 out:
1555 BUG_ON(!is_shadow_present_pte(*sptep));
1556 return sptep;
1557 }
1558
1559
1560
1561
1562
1563
1564 static u64 *rmap_get_next(struct rmap_iterator *iter)
1565 {
1566 u64 *sptep;
1567
1568 if (iter->desc) {
1569 if (iter->pos < PTE_LIST_EXT - 1) {
1570 ++iter->pos;
1571 sptep = iter->desc->sptes[iter->pos];
1572 if (sptep)
1573 goto out;
1574 }
1575
1576 iter->desc = iter->desc->more;
1577
1578 if (iter->desc) {
1579 iter->pos = 0;
1580
1581 sptep = iter->desc->sptes[iter->pos];
1582 goto out;
1583 }
1584 }
1585
1586 return NULL;
1587 out:
1588 BUG_ON(!is_shadow_present_pte(*sptep));
1589 return sptep;
1590 }
1591
1592 #define for_each_rmap_spte(_rmap_head_, _iter_, _spte_) \
1593 for (_spte_ = rmap_get_first(_rmap_head_, _iter_); \
1594 _spte_; _spte_ = rmap_get_next(_iter_))
1595
1596 static void drop_spte(struct kvm *kvm, u64 *sptep)
1597 {
1598 if (mmu_spte_clear_track_bits(sptep))
1599 rmap_remove(kvm, sptep);
1600 }
1601
1602
1603 static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
1604 {
1605 if (is_large_pte(*sptep)) {
1606 WARN_ON(page_header(__pa(sptep))->role.level ==
1607 PT_PAGE_TABLE_LEVEL);
1608 drop_spte(kvm, sptep);
1609 --kvm->stat.lpages;
1610 return true;
1611 }
1612
1613 return false;
1614 }
1615
1616 static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1617 {
1618 if (__drop_large_spte(vcpu->kvm, sptep)) {
1619 struct kvm_mmu_page *sp = page_header(__pa(sptep));
1620
1621 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1622 KVM_PAGES_PER_HPAGE(sp->role.level));
1623 }
1624 }
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639 static bool spte_write_protect(u64 *sptep, bool pt_protect)
1640 {
1641 u64 spte = *sptep;
1642
1643 if (!is_writable_pte(spte) &&
1644 !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
1645 return false;
1646
1647 rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
1648
1649 if (pt_protect)
1650 spte &= ~SPTE_MMU_WRITEABLE;
1651 spte = spte & ~PT_WRITABLE_MASK;
1652
1653 return mmu_spte_update(sptep, spte);
1654 }
1655
1656 static bool __rmap_write_protect(struct kvm *kvm,
1657 struct kvm_rmap_head *rmap_head,
1658 bool pt_protect)
1659 {
1660 u64 *sptep;
1661 struct rmap_iterator iter;
1662 bool flush = false;
1663
1664 for_each_rmap_spte(rmap_head, &iter, sptep)
1665 flush |= spte_write_protect(sptep, pt_protect);
1666
1667 return flush;
1668 }
1669
1670 static bool spte_clear_dirty(u64 *sptep)
1671 {
1672 u64 spte = *sptep;
1673
1674 rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
1675
1676 MMU_WARN_ON(!spte_ad_enabled(spte));
1677 spte &= ~shadow_dirty_mask;
1678 return mmu_spte_update(sptep, spte);
1679 }
1680
1681 static bool spte_wrprot_for_clear_dirty(u64 *sptep)
1682 {
1683 bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
1684 (unsigned long *)sptep);
1685 if (was_writable && !spte_ad_enabled(*sptep))
1686 kvm_set_pfn_dirty(spte_to_pfn(*sptep));
1687
1688 return was_writable;
1689 }
1690
1691
1692
1693
1694
1695
1696
1697 static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1698 {
1699 u64 *sptep;
1700 struct rmap_iterator iter;
1701 bool flush = false;
1702
1703 for_each_rmap_spte(rmap_head, &iter, sptep)
1704 if (spte_ad_need_write_protect(*sptep))
1705 flush |= spte_wrprot_for_clear_dirty(sptep);
1706 else
1707 flush |= spte_clear_dirty(sptep);
1708
1709 return flush;
1710 }
1711
1712 static bool spte_set_dirty(u64 *sptep)
1713 {
1714 u64 spte = *sptep;
1715
1716 rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
1717
1718
1719
1720
1721
1722
1723 spte |= shadow_dirty_mask;
1724
1725 return mmu_spte_update(sptep, spte);
1726 }
1727
1728 static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1729 {
1730 u64 *sptep;
1731 struct rmap_iterator iter;
1732 bool flush = false;
1733
1734 for_each_rmap_spte(rmap_head, &iter, sptep)
1735 if (spte_ad_enabled(*sptep))
1736 flush |= spte_set_dirty(sptep);
1737
1738 return flush;
1739 }
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1752 struct kvm_memory_slot *slot,
1753 gfn_t gfn_offset, unsigned long mask)
1754 {
1755 struct kvm_rmap_head *rmap_head;
1756
1757 while (mask) {
1758 rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1759 PT_PAGE_TABLE_LEVEL, slot);
1760 __rmap_write_protect(kvm, rmap_head, false);
1761
1762
1763 mask &= mask - 1;
1764 }
1765 }
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777 void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1778 struct kvm_memory_slot *slot,
1779 gfn_t gfn_offset, unsigned long mask)
1780 {
1781 struct kvm_rmap_head *rmap_head;
1782
1783 while (mask) {
1784 rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1785 PT_PAGE_TABLE_LEVEL, slot);
1786 __rmap_clear_dirty(kvm, rmap_head);
1787
1788
1789 mask &= mask - 1;
1790 }
1791 }
1792 EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1805 struct kvm_memory_slot *slot,
1806 gfn_t gfn_offset, unsigned long mask)
1807 {
1808 if (kvm_x86_ops->enable_log_dirty_pt_masked)
1809 kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
1810 mask);
1811 else
1812 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1813 }
1814
1815
1816
1817
1818
1819
1820
1821
1822 int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu)
1823 {
1824 if (kvm_x86_ops->write_log_dirty)
1825 return kvm_x86_ops->write_log_dirty(vcpu);
1826
1827 return 0;
1828 }
1829
1830 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
1831 struct kvm_memory_slot *slot, u64 gfn)
1832 {
1833 struct kvm_rmap_head *rmap_head;
1834 int i;
1835 bool write_protected = false;
1836
1837 for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1838 rmap_head = __gfn_to_rmap(gfn, i, slot);
1839 write_protected |= __rmap_write_protect(kvm, rmap_head, true);
1840 }
1841
1842 return write_protected;
1843 }
1844
1845 static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
1846 {
1847 struct kvm_memory_slot *slot;
1848
1849 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1850 return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
1851 }
1852
1853 static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1854 {
1855 u64 *sptep;
1856 struct rmap_iterator iter;
1857 bool flush = false;
1858
1859 while ((sptep = rmap_get_first(rmap_head, &iter))) {
1860 rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
1861
1862 pte_list_remove(rmap_head, sptep);
1863 flush = true;
1864 }
1865
1866 return flush;
1867 }
1868
1869 static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1870 struct kvm_memory_slot *slot, gfn_t gfn, int level,
1871 unsigned long data)
1872 {
1873 return kvm_zap_rmapp(kvm, rmap_head);
1874 }
1875
1876 static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1877 struct kvm_memory_slot *slot, gfn_t gfn, int level,
1878 unsigned long data)
1879 {
1880 u64 *sptep;
1881 struct rmap_iterator iter;
1882 int need_flush = 0;
1883 u64 new_spte;
1884 pte_t *ptep = (pte_t *)data;
1885 kvm_pfn_t new_pfn;
1886
1887 WARN_ON(pte_huge(*ptep));
1888 new_pfn = pte_pfn(*ptep);
1889
1890 restart:
1891 for_each_rmap_spte(rmap_head, &iter, sptep) {
1892 rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
1893 sptep, *sptep, gfn, level);
1894
1895 need_flush = 1;
1896
1897 if (pte_write(*ptep)) {
1898 pte_list_remove(rmap_head, sptep);
1899 goto restart;
1900 } else {
1901 new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
1902 new_spte |= (u64)new_pfn << PAGE_SHIFT;
1903
1904 new_spte &= ~PT_WRITABLE_MASK;
1905 new_spte &= ~SPTE_HOST_WRITEABLE;
1906
1907 new_spte = mark_spte_for_access_track(new_spte);
1908
1909 mmu_spte_clear_track_bits(sptep);
1910 mmu_spte_set(sptep, new_spte);
1911 }
1912 }
1913
1914 if (need_flush && kvm_available_flush_tlb_with_range()) {
1915 kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1916 return 0;
1917 }
1918
1919 return need_flush;
1920 }
1921
1922 struct slot_rmap_walk_iterator {
1923
1924 struct kvm_memory_slot *slot;
1925 gfn_t start_gfn;
1926 gfn_t end_gfn;
1927 int start_level;
1928 int end_level;
1929
1930
1931 gfn_t gfn;
1932 struct kvm_rmap_head *rmap;
1933 int level;
1934
1935
1936 struct kvm_rmap_head *end_rmap;
1937 };
1938
1939 static void
1940 rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
1941 {
1942 iterator->level = level;
1943 iterator->gfn = iterator->start_gfn;
1944 iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
1945 iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
1946 iterator->slot);
1947 }
1948
1949 static void
1950 slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
1951 struct kvm_memory_slot *slot, int start_level,
1952 int end_level, gfn_t start_gfn, gfn_t end_gfn)
1953 {
1954 iterator->slot = slot;
1955 iterator->start_level = start_level;
1956 iterator->end_level = end_level;
1957 iterator->start_gfn = start_gfn;
1958 iterator->end_gfn = end_gfn;
1959
1960 rmap_walk_init_level(iterator, iterator->start_level);
1961 }
1962
1963 static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
1964 {
1965 return !!iterator->rmap;
1966 }
1967
1968 static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
1969 {
1970 if (++iterator->rmap <= iterator->end_rmap) {
1971 iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
1972 return;
1973 }
1974
1975 if (++iterator->level > iterator->end_level) {
1976 iterator->rmap = NULL;
1977 return;
1978 }
1979
1980 rmap_walk_init_level(iterator, iterator->level);
1981 }
1982
1983 #define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \
1984 _start_gfn, _end_gfn, _iter_) \
1985 for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \
1986 _end_level_, _start_gfn, _end_gfn); \
1987 slot_rmap_walk_okay(_iter_); \
1988 slot_rmap_walk_next(_iter_))
1989
1990 static int kvm_handle_hva_range(struct kvm *kvm,
1991 unsigned long start,
1992 unsigned long end,
1993 unsigned long data,
1994 int (*handler)(struct kvm *kvm,
1995 struct kvm_rmap_head *rmap_head,
1996 struct kvm_memory_slot *slot,
1997 gfn_t gfn,
1998 int level,
1999 unsigned long data))
2000 {
2001 struct kvm_memslots *slots;
2002 struct kvm_memory_slot *memslot;
2003 struct slot_rmap_walk_iterator iterator;
2004 int ret = 0;
2005 int i;
2006
2007 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
2008 slots = __kvm_memslots(kvm, i);
2009 kvm_for_each_memslot(memslot, slots) {
2010 unsigned long hva_start, hva_end;
2011 gfn_t gfn_start, gfn_end;
2012
2013 hva_start = max(start, memslot->userspace_addr);
2014 hva_end = min(end, memslot->userspace_addr +
2015 (memslot->npages << PAGE_SHIFT));
2016 if (hva_start >= hva_end)
2017 continue;
2018
2019
2020
2021
2022 gfn_start = hva_to_gfn_memslot(hva_start, memslot);
2023 gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
2024
2025 for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
2026 PT_MAX_HUGEPAGE_LEVEL,
2027 gfn_start, gfn_end - 1,
2028 &iterator)
2029 ret |= handler(kvm, iterator.rmap, memslot,
2030 iterator.gfn, iterator.level, data);
2031 }
2032 }
2033
2034 return ret;
2035 }
2036
2037 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
2038 unsigned long data,
2039 int (*handler)(struct kvm *kvm,
2040 struct kvm_rmap_head *rmap_head,
2041 struct kvm_memory_slot *slot,
2042 gfn_t gfn, int level,
2043 unsigned long data))
2044 {
2045 return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
2046 }
2047
2048 int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
2049 {
2050 return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
2051 }
2052
2053 int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
2054 {
2055 return kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
2056 }
2057
2058 static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
2059 struct kvm_memory_slot *slot, gfn_t gfn, int level,
2060 unsigned long data)
2061 {
2062 u64 *sptep;
2063 struct rmap_iterator uninitialized_var(iter);
2064 int young = 0;
2065
2066 for_each_rmap_spte(rmap_head, &iter, sptep)
2067 young |= mmu_spte_age(sptep);
2068
2069 trace_kvm_age_page(gfn, level, slot, young);
2070 return young;
2071 }
2072
2073 static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
2074 struct kvm_memory_slot *slot, gfn_t gfn,
2075 int level, unsigned long data)
2076 {
2077 u64 *sptep;
2078 struct rmap_iterator iter;
2079
2080 for_each_rmap_spte(rmap_head, &iter, sptep)
2081 if (is_accessed_spte(*sptep))
2082 return 1;
2083 return 0;
2084 }
2085
2086 #define RMAP_RECYCLE_THRESHOLD 1000
2087
2088 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
2089 {
2090 struct kvm_rmap_head *rmap_head;
2091 struct kvm_mmu_page *sp;
2092
2093 sp = page_header(__pa(spte));
2094
2095 rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
2096
2097 kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
2098 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
2099 KVM_PAGES_PER_HPAGE(sp->role.level));
2100 }
2101
2102 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
2103 {
2104 return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
2105 }
2106
2107 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
2108 {
2109 return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
2110 }
2111
2112 #ifdef MMU_DEBUG
2113 static int is_empty_shadow_page(u64 *spt)
2114 {
2115 u64 *pos;
2116 u64 *end;
2117
2118 for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
2119 if (is_shadow_present_pte(*pos)) {
2120 printk(KERN_ERR "%s: %p %llx\n", __func__,
2121 pos, *pos);
2122 return 0;
2123 }
2124 return 1;
2125 }
2126 #endif
2127
2128
2129
2130
2131
2132
2133
2134 static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr)
2135 {
2136 kvm->arch.n_used_mmu_pages += nr;
2137 percpu_counter_add(&kvm_total_used_mmu_pages, nr);
2138 }
2139
2140 static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
2141 {
2142 MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
2143 hlist_del(&sp->hash_link);
2144 list_del(&sp->link);
2145 free_page((unsigned long)sp->spt);
2146 if (!sp->role.direct)
2147 free_page((unsigned long)sp->gfns);
2148 kmem_cache_free(mmu_page_header_cache, sp);
2149 }
2150
2151 static unsigned kvm_page_table_hashfn(gfn_t gfn)
2152 {
2153 return hash_64(gfn, KVM_MMU_HASH_SHIFT);
2154 }
2155
2156 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
2157 struct kvm_mmu_page *sp, u64 *parent_pte)
2158 {
2159 if (!parent_pte)
2160 return;
2161
2162 pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
2163 }
2164
2165 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
2166 u64 *parent_pte)
2167 {
2168 __pte_list_remove(parent_pte, &sp->parent_ptes);
2169 }
2170
2171 static void drop_parent_pte(struct kvm_mmu_page *sp,
2172 u64 *parent_pte)
2173 {
2174 mmu_page_remove_parent_pte(sp, parent_pte);
2175 mmu_spte_clear_no_track(parent_pte);
2176 }
2177
2178 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
2179 {
2180 struct kvm_mmu_page *sp;
2181
2182 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
2183 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
2184 if (!direct)
2185 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
2186 set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
2187
2188
2189
2190
2191
2192
2193 sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
2194 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
2195 kvm_mod_used_mmu_pages(vcpu->kvm, +1);
2196 return sp;
2197 }
2198
2199 static void mark_unsync(u64 *spte);
2200 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
2201 {
2202 u64 *sptep;
2203 struct rmap_iterator iter;
2204
2205 for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
2206 mark_unsync(sptep);
2207 }
2208 }
2209
2210 static void mark_unsync(u64 *spte)
2211 {
2212 struct kvm_mmu_page *sp;
2213 unsigned int index;
2214
2215 sp = page_header(__pa(spte));
2216 index = spte - sp->spt;
2217 if (__test_and_set_bit(index, sp->unsync_child_bitmap))
2218 return;
2219 if (sp->unsync_children++)
2220 return;
2221 kvm_mmu_mark_parents_unsync(sp);
2222 }
2223
2224 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
2225 struct kvm_mmu_page *sp)
2226 {
2227 return 0;
2228 }
2229
2230 static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root)
2231 {
2232 }
2233
2234 static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
2235 struct kvm_mmu_page *sp, u64 *spte,
2236 const void *pte)
2237 {
2238 WARN_ON(1);
2239 }
2240
2241 #define KVM_PAGE_ARRAY_NR 16
2242
2243 struct kvm_mmu_pages {
2244 struct mmu_page_and_offset {
2245 struct kvm_mmu_page *sp;
2246 unsigned int idx;
2247 } page[KVM_PAGE_ARRAY_NR];
2248 unsigned int nr;
2249 };
2250
2251 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
2252 int idx)
2253 {
2254 int i;
2255
2256 if (sp->unsync)
2257 for (i=0; i < pvec->nr; i++)
2258 if (pvec->page[i].sp == sp)
2259 return 0;
2260
2261 pvec->page[pvec->nr].sp = sp;
2262 pvec->page[pvec->nr].idx = idx;
2263 pvec->nr++;
2264 return (pvec->nr == KVM_PAGE_ARRAY_NR);
2265 }
2266
2267 static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
2268 {
2269 --sp->unsync_children;
2270 WARN_ON((int)sp->unsync_children < 0);
2271 __clear_bit(idx, sp->unsync_child_bitmap);
2272 }
2273
2274 static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
2275 struct kvm_mmu_pages *pvec)
2276 {
2277 int i, ret, nr_unsync_leaf = 0;
2278
2279 for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
2280 struct kvm_mmu_page *child;
2281 u64 ent = sp->spt[i];
2282
2283 if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
2284 clear_unsync_child_bit(sp, i);
2285 continue;
2286 }
2287
2288 child = page_header(ent & PT64_BASE_ADDR_MASK);
2289
2290 if (child->unsync_children) {
2291 if (mmu_pages_add(pvec, child, i))
2292 return -ENOSPC;
2293
2294 ret = __mmu_unsync_walk(child, pvec);
2295 if (!ret) {
2296 clear_unsync_child_bit(sp, i);
2297 continue;
2298 } else if (ret > 0) {
2299 nr_unsync_leaf += ret;
2300 } else
2301 return ret;
2302 } else if (child->unsync) {
2303 nr_unsync_leaf++;
2304 if (mmu_pages_add(pvec, child, i))
2305 return -ENOSPC;
2306 } else
2307 clear_unsync_child_bit(sp, i);
2308 }
2309
2310 return nr_unsync_leaf;
2311 }
2312
2313 #define INVALID_INDEX (-1)
2314
2315 static int mmu_unsync_walk(struct kvm_mmu_page *sp,
2316 struct kvm_mmu_pages *pvec)
2317 {
2318 pvec->nr = 0;
2319 if (!sp->unsync_children)
2320 return 0;
2321
2322 mmu_pages_add(pvec, sp, INVALID_INDEX);
2323 return __mmu_unsync_walk(sp, pvec);
2324 }
2325
2326 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
2327 {
2328 WARN_ON(!sp->unsync);
2329 trace_kvm_mmu_sync_page(sp);
2330 sp->unsync = 0;
2331 --kvm->stat.mmu_unsync;
2332 }
2333
2334 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2335 struct list_head *invalid_list);
2336 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2337 struct list_head *invalid_list);
2338
2339
2340 #define for_each_valid_sp(_kvm, _sp, _gfn) \
2341 hlist_for_each_entry(_sp, \
2342 &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
2343 if (is_obsolete_sp((_kvm), (_sp))) { \
2344 } else
2345
2346 #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
2347 for_each_valid_sp(_kvm, _sp, _gfn) \
2348 if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
2349
2350 static inline bool is_ept_sp(struct kvm_mmu_page *sp)
2351 {
2352 return sp->role.cr0_wp && sp->role.smap_andnot_wp;
2353 }
2354
2355
2356 static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2357 struct list_head *invalid_list)
2358 {
2359 if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) ||
2360 vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
2361 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
2362 return false;
2363 }
2364
2365 return true;
2366 }
2367
2368 static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
2369 struct list_head *invalid_list,
2370 bool remote_flush)
2371 {
2372 if (!remote_flush && list_empty(invalid_list))
2373 return false;
2374
2375 if (!list_empty(invalid_list))
2376 kvm_mmu_commit_zap_page(kvm, invalid_list);
2377 else
2378 kvm_flush_remote_tlbs(kvm);
2379 return true;
2380 }
2381
2382 static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
2383 struct list_head *invalid_list,
2384 bool remote_flush, bool local_flush)
2385 {
2386 if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush))
2387 return;
2388
2389 if (local_flush)
2390 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2391 }
2392
2393 #ifdef CONFIG_KVM_MMU_AUDIT
2394 #include "mmu_audit.c"
2395 #else
2396 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
2397 static void mmu_audit_disable(void) { }
2398 #endif
2399
2400 static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
2401 {
2402 return sp->role.invalid ||
2403 unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
2404 }
2405
2406 static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2407 struct list_head *invalid_list)
2408 {
2409 kvm_unlink_unsync_page(vcpu->kvm, sp);
2410 return __kvm_sync_page(vcpu, sp, invalid_list);
2411 }
2412
2413
2414 static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
2415 struct list_head *invalid_list)
2416 {
2417 struct kvm_mmu_page *s;
2418 bool ret = false;
2419
2420 for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
2421 if (!s->unsync)
2422 continue;
2423
2424 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
2425 ret |= kvm_sync_page(vcpu, s, invalid_list);
2426 }
2427
2428 return ret;
2429 }
2430
2431 struct mmu_page_path {
2432 struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
2433 unsigned int idx[PT64_ROOT_MAX_LEVEL];
2434 };
2435
2436 #define for_each_sp(pvec, sp, parents, i) \
2437 for (i = mmu_pages_first(&pvec, &parents); \
2438 i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
2439 i = mmu_pages_next(&pvec, &parents, i))
2440
2441 static int mmu_pages_next(struct kvm_mmu_pages *pvec,
2442 struct mmu_page_path *parents,
2443 int i)
2444 {
2445 int n;
2446
2447 for (n = i+1; n < pvec->nr; n++) {
2448 struct kvm_mmu_page *sp = pvec->page[n].sp;
2449 unsigned idx = pvec->page[n].idx;
2450 int level = sp->role.level;
2451
2452 parents->idx[level-1] = idx;
2453 if (level == PT_PAGE_TABLE_LEVEL)
2454 break;
2455
2456 parents->parent[level-2] = sp;
2457 }
2458
2459 return n;
2460 }
2461
2462 static int mmu_pages_first(struct kvm_mmu_pages *pvec,
2463 struct mmu_page_path *parents)
2464 {
2465 struct kvm_mmu_page *sp;
2466 int level;
2467
2468 if (pvec->nr == 0)
2469 return 0;
2470
2471 WARN_ON(pvec->page[0].idx != INVALID_INDEX);
2472
2473 sp = pvec->page[0].sp;
2474 level = sp->role.level;
2475 WARN_ON(level == PT_PAGE_TABLE_LEVEL);
2476
2477 parents->parent[level-2] = sp;
2478
2479
2480
2481
2482 parents->parent[level-1] = NULL;
2483 return mmu_pages_next(pvec, parents, 0);
2484 }
2485
2486 static void mmu_pages_clear_parents(struct mmu_page_path *parents)
2487 {
2488 struct kvm_mmu_page *sp;
2489 unsigned int level = 0;
2490
2491 do {
2492 unsigned int idx = parents->idx[level];
2493 sp = parents->parent[level];
2494 if (!sp)
2495 return;
2496
2497 WARN_ON(idx == INVALID_INDEX);
2498 clear_unsync_child_bit(sp, idx);
2499 level++;
2500 } while (!sp->unsync_children);
2501 }
2502
2503 static void mmu_sync_children(struct kvm_vcpu *vcpu,
2504 struct kvm_mmu_page *parent)
2505 {
2506 int i;
2507 struct kvm_mmu_page *sp;
2508 struct mmu_page_path parents;
2509 struct kvm_mmu_pages pages;
2510 LIST_HEAD(invalid_list);
2511 bool flush = false;
2512
2513 while (mmu_unsync_walk(parent, &pages)) {
2514 bool protected = false;
2515
2516 for_each_sp(pages, sp, parents, i)
2517 protected |= rmap_write_protect(vcpu, sp->gfn);
2518
2519 if (protected) {
2520 kvm_flush_remote_tlbs(vcpu->kvm);
2521 flush = false;
2522 }
2523
2524 for_each_sp(pages, sp, parents, i) {
2525 flush |= kvm_sync_page(vcpu, sp, &invalid_list);
2526 mmu_pages_clear_parents(&parents);
2527 }
2528 if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
2529 kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2530 cond_resched_lock(&vcpu->kvm->mmu_lock);
2531 flush = false;
2532 }
2533 }
2534
2535 kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2536 }
2537
2538 static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
2539 {
2540 atomic_set(&sp->write_flooding_count, 0);
2541 }
2542
2543 static void clear_sp_write_flooding_count(u64 *spte)
2544 {
2545 struct kvm_mmu_page *sp = page_header(__pa(spte));
2546
2547 __clear_sp_write_flooding_count(sp);
2548 }
2549
2550 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
2551 gfn_t gfn,
2552 gva_t gaddr,
2553 unsigned level,
2554 int direct,
2555 unsigned access)
2556 {
2557 union kvm_mmu_page_role role;
2558 unsigned quadrant;
2559 struct kvm_mmu_page *sp;
2560 bool need_sync = false;
2561 bool flush = false;
2562 int collisions = 0;
2563 LIST_HEAD(invalid_list);
2564
2565 role = vcpu->arch.mmu->mmu_role.base;
2566 role.level = level;
2567 role.direct = direct;
2568 if (role.direct)
2569 role.gpte_is_8_bytes = true;
2570 role.access = access;
2571 if (!vcpu->arch.mmu->direct_map
2572 && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
2573 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
2574 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
2575 role.quadrant = quadrant;
2576 }
2577 for_each_valid_sp(vcpu->kvm, sp, gfn) {
2578 if (sp->gfn != gfn) {
2579 collisions++;
2580 continue;
2581 }
2582
2583 if (!need_sync && sp->unsync)
2584 need_sync = true;
2585
2586 if (sp->role.word != role.word)
2587 continue;
2588
2589 if (sp->unsync) {
2590
2591
2592
2593 if (!__kvm_sync_page(vcpu, sp, &invalid_list))
2594 break;
2595
2596 WARN_ON(!list_empty(&invalid_list));
2597 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2598 }
2599
2600 if (sp->unsync_children)
2601 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
2602
2603 __clear_sp_write_flooding_count(sp);
2604 trace_kvm_mmu_get_page(sp, false);
2605 goto out;
2606 }
2607
2608 ++vcpu->kvm->stat.mmu_cache_miss;
2609
2610 sp = kvm_mmu_alloc_page(vcpu, direct);
2611
2612 sp->gfn = gfn;
2613 sp->role = role;
2614 hlist_add_head(&sp->hash_link,
2615 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
2616 if (!direct) {
2617
2618
2619
2620
2621
2622 account_shadowed(vcpu->kvm, sp);
2623 if (level == PT_PAGE_TABLE_LEVEL &&
2624 rmap_write_protect(vcpu, gfn))
2625 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
2626
2627 if (level > PT_PAGE_TABLE_LEVEL && need_sync)
2628 flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
2629 }
2630 clear_page(sp->spt);
2631 trace_kvm_mmu_get_page(sp, true);
2632
2633 kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2634 out:
2635 if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
2636 vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
2637 return sp;
2638 }
2639
2640 static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
2641 struct kvm_vcpu *vcpu, hpa_t root,
2642 u64 addr)
2643 {
2644 iterator->addr = addr;
2645 iterator->shadow_addr = root;
2646 iterator->level = vcpu->arch.mmu->shadow_root_level;
2647
2648 if (iterator->level == PT64_ROOT_4LEVEL &&
2649 vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
2650 !vcpu->arch.mmu->direct_map)
2651 --iterator->level;
2652
2653 if (iterator->level == PT32E_ROOT_LEVEL) {
2654
2655
2656
2657
2658 BUG_ON(root != vcpu->arch.mmu->root_hpa);
2659
2660 iterator->shadow_addr
2661 = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
2662 iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
2663 --iterator->level;
2664 if (!iterator->shadow_addr)
2665 iterator->level = 0;
2666 }
2667 }
2668
2669 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
2670 struct kvm_vcpu *vcpu, u64 addr)
2671 {
2672 shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa,
2673 addr);
2674 }
2675
2676 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
2677 {
2678 if (iterator->level < PT_PAGE_TABLE_LEVEL)
2679 return false;
2680
2681 iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
2682 iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
2683 return true;
2684 }
2685
2686 static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
2687 u64 spte)
2688 {
2689 if (is_last_spte(spte, iterator->level)) {
2690 iterator->level = 0;
2691 return;
2692 }
2693
2694 iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
2695 --iterator->level;
2696 }
2697
2698 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2699 {
2700 __shadow_walk_next(iterator, *iterator->sptep);
2701 }
2702
2703 static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
2704 struct kvm_mmu_page *sp)
2705 {
2706 u64 spte;
2707
2708 BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2709
2710 spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
2711 shadow_user_mask | shadow_x_mask | shadow_me_mask;
2712
2713 if (sp_ad_disabled(sp))
2714 spte |= SPTE_AD_DISABLED_MASK;
2715 else
2716 spte |= shadow_accessed_mask;
2717
2718 mmu_spte_set(sptep, spte);
2719
2720 mmu_page_add_parent_pte(vcpu, sp, sptep);
2721
2722 if (sp->unsync_children || sp->unsync)
2723 mark_unsync(sptep);
2724 }
2725
2726 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2727 unsigned direct_access)
2728 {
2729 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
2730 struct kvm_mmu_page *child;
2731
2732
2733
2734
2735
2736
2737
2738
2739 child = page_header(*sptep & PT64_BASE_ADDR_MASK);
2740 if (child->role.access == direct_access)
2741 return;
2742
2743 drop_parent_pte(child, sptep);
2744 kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
2745 }
2746 }
2747
2748 static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
2749 u64 *spte)
2750 {
2751 u64 pte;
2752 struct kvm_mmu_page *child;
2753
2754 pte = *spte;
2755 if (is_shadow_present_pte(pte)) {
2756 if (is_last_spte(pte, sp->role.level)) {
2757 drop_spte(kvm, spte);
2758 if (is_large_pte(pte))
2759 --kvm->stat.lpages;
2760 } else {
2761 child = page_header(pte & PT64_BASE_ADDR_MASK);
2762 drop_parent_pte(child, spte);
2763 }
2764 return true;
2765 }
2766
2767 if (is_mmio_spte(pte))
2768 mmu_spte_clear_no_track(spte);
2769
2770 return false;
2771 }
2772
2773 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
2774 struct kvm_mmu_page *sp)
2775 {
2776 unsigned i;
2777
2778 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2779 mmu_page_zap_pte(kvm, sp, sp->spt + i);
2780 }
2781
2782 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
2783 {
2784 u64 *sptep;
2785 struct rmap_iterator iter;
2786
2787 while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
2788 drop_parent_pte(sp, sptep);
2789 }
2790
2791 static int mmu_zap_unsync_children(struct kvm *kvm,
2792 struct kvm_mmu_page *parent,
2793 struct list_head *invalid_list)
2794 {
2795 int i, zapped = 0;
2796 struct mmu_page_path parents;
2797 struct kvm_mmu_pages pages;
2798
2799 if (parent->role.level == PT_PAGE_TABLE_LEVEL)
2800 return 0;
2801
2802 while (mmu_unsync_walk(parent, &pages)) {
2803 struct kvm_mmu_page *sp;
2804
2805 for_each_sp(pages, sp, parents, i) {
2806 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2807 mmu_pages_clear_parents(&parents);
2808 zapped++;
2809 }
2810 }
2811
2812 return zapped;
2813 }
2814
2815 static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
2816 struct kvm_mmu_page *sp,
2817 struct list_head *invalid_list,
2818 int *nr_zapped)
2819 {
2820 bool list_unstable;
2821
2822 trace_kvm_mmu_prepare_zap_page(sp);
2823 ++kvm->stat.mmu_shadow_zapped;
2824 *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
2825 kvm_mmu_page_unlink_children(kvm, sp);
2826 kvm_mmu_unlink_parents(kvm, sp);
2827
2828
2829 list_unstable = *nr_zapped;
2830
2831 if (!sp->role.invalid && !sp->role.direct)
2832 unaccount_shadowed(kvm, sp);
2833
2834 if (sp->unsync)
2835 kvm_unlink_unsync_page(kvm, sp);
2836 if (!sp->root_count) {
2837
2838 (*nr_zapped)++;
2839 list_move(&sp->link, invalid_list);
2840 kvm_mod_used_mmu_pages(kvm, -1);
2841 } else {
2842 list_move(&sp->link, &kvm->arch.active_mmu_pages);
2843
2844
2845
2846
2847
2848
2849 if (!is_obsolete_sp(kvm, sp))
2850 kvm_reload_remote_mmus(kvm);
2851 }
2852
2853 if (sp->lpage_disallowed)
2854 unaccount_huge_nx_page(kvm, sp);
2855
2856 sp->role.invalid = 1;
2857 return list_unstable;
2858 }
2859
2860 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2861 struct list_head *invalid_list)
2862 {
2863 int nr_zapped;
2864
2865 __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
2866 return nr_zapped;
2867 }
2868
2869 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2870 struct list_head *invalid_list)
2871 {
2872 struct kvm_mmu_page *sp, *nsp;
2873
2874 if (list_empty(invalid_list))
2875 return;
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886 kvm_flush_remote_tlbs(kvm);
2887
2888 list_for_each_entry_safe(sp, nsp, invalid_list, link) {
2889 WARN_ON(!sp->role.invalid || sp->root_count);
2890 kvm_mmu_free_page(sp);
2891 }
2892 }
2893
2894 static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
2895 struct list_head *invalid_list)
2896 {
2897 struct kvm_mmu_page *sp;
2898
2899 if (list_empty(&kvm->arch.active_mmu_pages))
2900 return false;
2901
2902 sp = list_last_entry(&kvm->arch.active_mmu_pages,
2903 struct kvm_mmu_page, link);
2904 return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2905 }
2906
2907
2908
2909
2910
2911 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
2912 {
2913 LIST_HEAD(invalid_list);
2914
2915 spin_lock(&kvm->mmu_lock);
2916
2917 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
2918
2919 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
2920 if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
2921 break;
2922
2923 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2924 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
2925 }
2926
2927 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
2928
2929 spin_unlock(&kvm->mmu_lock);
2930 }
2931
2932 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
2933 {
2934 struct kvm_mmu_page *sp;
2935 LIST_HEAD(invalid_list);
2936 int r;
2937
2938 pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
2939 r = 0;
2940 spin_lock(&kvm->mmu_lock);
2941 for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
2942 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
2943 sp->role.word);
2944 r = 1;
2945 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
2946 }
2947 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2948 spin_unlock(&kvm->mmu_lock);
2949
2950 return r;
2951 }
2952 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
2953
2954 static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
2955 {
2956 trace_kvm_mmu_unsync_page(sp);
2957 ++vcpu->kvm->stat.mmu_unsync;
2958 sp->unsync = 1;
2959
2960 kvm_mmu_mark_parents_unsync(sp);
2961 }
2962
2963 static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
2964 bool can_unsync)
2965 {
2966 struct kvm_mmu_page *sp;
2967
2968 if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
2969 return true;
2970
2971 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
2972 if (!can_unsync)
2973 return true;
2974
2975 if (sp->unsync)
2976 continue;
2977
2978 WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
2979 kvm_unsync_page(vcpu, sp);
2980 }
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019 smp_wmb();
3020
3021 return false;
3022 }
3023
3024 static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
3025 {
3026 if (pfn_valid(pfn))
3027 return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038 (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
3039
3040 return !e820__mapped_raw_any(pfn_to_hpa(pfn),
3041 pfn_to_hpa(pfn + 1) - 1,
3042 E820_TYPE_RAM);
3043 }
3044
3045
3046 #define SET_SPTE_WRITE_PROTECTED_PT BIT(0)
3047 #define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
3048
3049 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
3050 unsigned pte_access, int level,
3051 gfn_t gfn, kvm_pfn_t pfn, bool speculative,
3052 bool can_unsync, bool host_writable)
3053 {
3054 u64 spte = 0;
3055 int ret = 0;
3056 struct kvm_mmu_page *sp;
3057
3058 if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
3059 return 0;
3060
3061 sp = page_header(__pa(sptep));
3062 if (sp_ad_disabled(sp))
3063 spte |= SPTE_AD_DISABLED_MASK;
3064 else if (kvm_vcpu_ad_need_write_protect(vcpu))
3065 spte |= SPTE_AD_WRPROT_ONLY_MASK;
3066
3067
3068
3069
3070
3071
3072
3073 spte |= shadow_present_mask;
3074 if (!speculative)
3075 spte |= spte_shadow_accessed_mask(spte);
3076
3077 if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) &&
3078 is_nx_huge_page_enabled()) {
3079 pte_access &= ~ACC_EXEC_MASK;
3080 }
3081
3082 if (pte_access & ACC_EXEC_MASK)
3083 spte |= shadow_x_mask;
3084 else
3085 spte |= shadow_nx_mask;
3086
3087 if (pte_access & ACC_USER_MASK)
3088 spte |= shadow_user_mask;
3089
3090 if (level > PT_PAGE_TABLE_LEVEL)
3091 spte |= PT_PAGE_SIZE_MASK;
3092 if (tdp_enabled)
3093 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
3094 kvm_is_mmio_pfn(pfn));
3095
3096 if (host_writable)
3097 spte |= SPTE_HOST_WRITEABLE;
3098 else
3099 pte_access &= ~ACC_WRITE_MASK;
3100
3101 if (!kvm_is_mmio_pfn(pfn))
3102 spte |= shadow_me_mask;
3103
3104 spte |= (u64)pfn << PAGE_SHIFT;
3105
3106 if (pte_access & ACC_WRITE_MASK) {
3107
3108
3109
3110
3111
3112
3113
3114 if (level > PT_PAGE_TABLE_LEVEL &&
3115 mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
3116 goto done;
3117
3118 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
3119
3120
3121
3122
3123
3124
3125
3126 if (!can_unsync && is_writable_pte(*sptep))
3127 goto set_pte;
3128
3129 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
3130 pgprintk("%s: found shadow page for %llx, marking ro\n",
3131 __func__, gfn);
3132 ret |= SET_SPTE_WRITE_PROTECTED_PT;
3133 pte_access &= ~ACC_WRITE_MASK;
3134 spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
3135 }
3136 }
3137
3138 if (pte_access & ACC_WRITE_MASK) {
3139 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3140 spte |= spte_shadow_dirty_mask(spte);
3141 }
3142
3143 if (speculative)
3144 spte = mark_spte_for_access_track(spte);
3145
3146 set_pte:
3147 if (mmu_spte_update(sptep, spte))
3148 ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
3149 done:
3150 return ret;
3151 }
3152
3153 static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
3154 int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
3155 bool speculative, bool host_writable)
3156 {
3157 int was_rmapped = 0;
3158 int rmap_count;
3159 int set_spte_ret;
3160 int ret = RET_PF_RETRY;
3161 bool flush = false;
3162
3163 pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
3164 *sptep, write_fault, gfn);
3165
3166 if (is_shadow_present_pte(*sptep)) {
3167
3168
3169
3170
3171 if (level > PT_PAGE_TABLE_LEVEL &&
3172 !is_large_pte(*sptep)) {
3173 struct kvm_mmu_page *child;
3174 u64 pte = *sptep;
3175
3176 child = page_header(pte & PT64_BASE_ADDR_MASK);
3177 drop_parent_pte(child, sptep);
3178 flush = true;
3179 } else if (pfn != spte_to_pfn(*sptep)) {
3180 pgprintk("hfn old %llx new %llx\n",
3181 spte_to_pfn(*sptep), pfn);
3182 drop_spte(vcpu->kvm, sptep);
3183 flush = true;
3184 } else
3185 was_rmapped = 1;
3186 }
3187
3188 set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
3189 speculative, true, host_writable);
3190 if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
3191 if (write_fault)
3192 ret = RET_PF_EMULATE;
3193 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3194 }
3195
3196 if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
3197 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
3198 KVM_PAGES_PER_HPAGE(level));
3199
3200 if (unlikely(is_mmio_spte(*sptep)))
3201 ret = RET_PF_EMULATE;
3202
3203 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
3204 trace_kvm_mmu_set_spte(level, gfn, sptep);
3205 if (!was_rmapped && is_large_pte(*sptep))
3206 ++vcpu->kvm->stat.lpages;
3207
3208 if (is_shadow_present_pte(*sptep)) {
3209 if (!was_rmapped) {
3210 rmap_count = rmap_add(vcpu, sptep, gfn);
3211 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
3212 rmap_recycle(vcpu, sptep, gfn);
3213 }
3214 }
3215
3216 return ret;
3217 }
3218
3219 static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
3220 bool no_dirty_log)
3221 {
3222 struct kvm_memory_slot *slot;
3223
3224 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
3225 if (!slot)
3226 return KVM_PFN_ERR_FAULT;
3227
3228 return gfn_to_pfn_memslot_atomic(slot, gfn);
3229 }
3230
3231 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
3232 struct kvm_mmu_page *sp,
3233 u64 *start, u64 *end)
3234 {
3235 struct page *pages[PTE_PREFETCH_NUM];
3236 struct kvm_memory_slot *slot;
3237 unsigned access = sp->role.access;
3238 int i, ret;
3239 gfn_t gfn;
3240
3241 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
3242 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
3243 if (!slot)
3244 return -1;
3245
3246 ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
3247 if (ret <= 0)
3248 return -1;
3249
3250 for (i = 0; i < ret; i++, gfn++, start++) {
3251 mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
3252 page_to_pfn(pages[i]), true, true);
3253 put_page(pages[i]);
3254 }
3255
3256 return 0;
3257 }
3258
3259 static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
3260 struct kvm_mmu_page *sp, u64 *sptep)
3261 {
3262 u64 *spte, *start = NULL;
3263 int i;
3264
3265 WARN_ON(!sp->role.direct);
3266
3267 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
3268 spte = sp->spt + i;
3269
3270 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
3271 if (is_shadow_present_pte(*spte) || spte == sptep) {
3272 if (!start)
3273 continue;
3274 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
3275 break;
3276 start = NULL;
3277 } else if (!start)
3278 start = spte;
3279 }
3280 }
3281
3282 static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
3283 {
3284 struct kvm_mmu_page *sp;
3285
3286 sp = page_header(__pa(sptep));
3287
3288
3289
3290
3291
3292
3293 if (sp_ad_disabled(sp))
3294 return;
3295
3296 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
3297 return;
3298
3299 __direct_pte_prefetch(vcpu, sp, sptep);
3300 }
3301
3302 static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
3303 gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
3304 {
3305 int level = *levelp;
3306 u64 spte = *it.sptep;
3307
3308 if (it.level == level && level > PT_PAGE_TABLE_LEVEL &&
3309 is_nx_huge_page_enabled() &&
3310 is_shadow_present_pte(spte) &&
3311 !is_large_pte(spte)) {
3312
3313
3314
3315
3316
3317
3318
3319 u64 page_mask = KVM_PAGES_PER_HPAGE(level) - KVM_PAGES_PER_HPAGE(level - 1);
3320 *pfnp |= gfn & page_mask;
3321 (*levelp)--;
3322 }
3323 }
3324
3325 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
3326 int map_writable, int level, kvm_pfn_t pfn,
3327 bool prefault, bool lpage_disallowed)
3328 {
3329 struct kvm_shadow_walk_iterator it;
3330 struct kvm_mmu_page *sp;
3331 int ret;
3332 gfn_t gfn = gpa >> PAGE_SHIFT;
3333 gfn_t base_gfn = gfn;
3334
3335 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3336 return RET_PF_RETRY;
3337
3338 trace_kvm_mmu_spte_requested(gpa, level, pfn);
3339 for_each_shadow_entry(vcpu, gpa, it) {
3340
3341
3342
3343
3344 disallowed_hugepage_adjust(it, gfn, &pfn, &level);
3345
3346 base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
3347 if (it.level == level)
3348 break;
3349
3350 drop_large_spte(vcpu, it.sptep);
3351 if (!is_shadow_present_pte(*it.sptep)) {
3352 sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
3353 it.level - 1, true, ACC_ALL);
3354
3355 link_shadow_page(vcpu, it.sptep, sp);
3356 if (lpage_disallowed)
3357 account_huge_nx_page(vcpu->kvm, sp);
3358 }
3359 }
3360
3361 ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
3362 write, level, base_gfn, pfn, prefault,
3363 map_writable);
3364 direct_pte_prefetch(vcpu, it.sptep);
3365 ++vcpu->stat.pf_fixed;
3366 return ret;
3367 }
3368
3369 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
3370 {
3371 send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
3372 }
3373
3374 static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
3375 {
3376
3377
3378
3379
3380
3381 if (pfn == KVM_PFN_ERR_RO_FAULT)
3382 return RET_PF_EMULATE;
3383
3384 if (pfn == KVM_PFN_ERR_HWPOISON) {
3385 kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
3386 return RET_PF_RETRY;
3387 }
3388
3389 return -EFAULT;
3390 }
3391
3392 static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
3393 gfn_t gfn, kvm_pfn_t *pfnp,
3394 int *levelp)
3395 {
3396 kvm_pfn_t pfn = *pfnp;
3397 int level = *levelp;
3398
3399
3400
3401
3402
3403
3404
3405 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
3406 !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
3407 PageTransCompoundMap(pfn_to_page(pfn)) &&
3408 !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
3409 unsigned long mask;
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419 *levelp = level = PT_DIRECTORY_LEVEL;
3420 mask = KVM_PAGES_PER_HPAGE(level) - 1;
3421 VM_BUG_ON((gfn & mask) != (pfn & mask));
3422 if (pfn & mask) {
3423 kvm_release_pfn_clean(pfn);
3424 pfn &= ~mask;
3425 kvm_get_pfn(pfn);
3426 *pfnp = pfn;
3427 }
3428 }
3429 }
3430
3431 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
3432 kvm_pfn_t pfn, unsigned access, int *ret_val)
3433 {
3434
3435 if (unlikely(is_error_pfn(pfn))) {
3436 *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
3437 return true;
3438 }
3439
3440 if (unlikely(is_noslot_pfn(pfn)))
3441 vcpu_cache_mmio_info(vcpu, gva, gfn,
3442 access & shadow_mmio_access_mask);
3443
3444 return false;
3445 }
3446
3447 static bool page_fault_can_be_fast(u32 error_code)
3448 {
3449
3450
3451
3452
3453 if (unlikely(error_code & PFERR_RSVD_MASK))
3454 return false;
3455
3456
3457 if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
3458 == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
3459 return false;
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475 return shadow_acc_track_mask != 0 ||
3476 ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
3477 == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
3478 }
3479
3480
3481
3482
3483
3484 static bool
3485 fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
3486 u64 *sptep, u64 old_spte, u64 new_spte)
3487 {
3488 gfn_t gfn;
3489
3490 WARN_ON(!sp->role.direct);
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504 if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
3505 return false;
3506
3507 if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
3508
3509
3510
3511
3512 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
3513 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3514 }
3515
3516 return true;
3517 }
3518
3519 static bool is_access_allowed(u32 fault_err_code, u64 spte)
3520 {
3521 if (fault_err_code & PFERR_FETCH_MASK)
3522 return is_executable_pte(spte);
3523
3524 if (fault_err_code & PFERR_WRITE_MASK)
3525 return is_writable_pte(spte);
3526
3527
3528 return spte & PT_PRESENT_MASK;
3529 }
3530
3531
3532
3533
3534
3535
3536 static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int level,
3537 u32 error_code)
3538 {
3539 struct kvm_shadow_walk_iterator iterator;
3540 struct kvm_mmu_page *sp;
3541 bool fault_handled = false;
3542 u64 spte = 0ull;
3543 uint retry_count = 0;
3544
3545 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3546 return false;
3547
3548 if (!page_fault_can_be_fast(error_code))
3549 return false;
3550
3551 walk_shadow_page_lockless_begin(vcpu);
3552
3553 do {
3554 u64 new_spte;
3555
3556 for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte)
3557 if (!is_shadow_present_pte(spte) ||
3558 iterator.level < level)
3559 break;
3560
3561 sp = page_header(__pa(iterator.sptep));
3562 if (!is_last_spte(spte, sp->role.level))
3563 break;
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575 if (is_access_allowed(error_code, spte)) {
3576 fault_handled = true;
3577 break;
3578 }
3579
3580 new_spte = spte;
3581
3582 if (is_access_track_spte(spte))
3583 new_spte = restore_acc_track_spte(new_spte);
3584
3585
3586
3587
3588
3589
3590 if ((error_code & PFERR_WRITE_MASK) &&
3591 spte_can_locklessly_be_made_writable(spte))
3592 {
3593 new_spte |= PT_WRITABLE_MASK;
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
3607 break;
3608 }
3609
3610
3611 if (new_spte == spte ||
3612 !is_access_allowed(error_code, new_spte))
3613 break;
3614
3615
3616
3617
3618
3619
3620 fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
3621 iterator.sptep, spte,
3622 new_spte);
3623 if (fault_handled)
3624 break;
3625
3626 if (++retry_count > 4) {
3627 printk_once(KERN_WARNING
3628 "kvm: Fast #PF retrying more than 4 times.\n");
3629 break;
3630 }
3631
3632 } while (true);
3633
3634 trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep,
3635 spte, fault_handled);
3636 walk_shadow_page_lockless_end(vcpu);
3637
3638 return fault_handled;
3639 }
3640
3641 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
3642 gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
3643 bool *writable);
3644 static int make_mmu_pages_available(struct kvm_vcpu *vcpu);
3645
3646 static int nonpaging_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
3647 gfn_t gfn, bool prefault)
3648 {
3649 int r;
3650 int level;
3651 bool force_pt_level;
3652 kvm_pfn_t pfn;
3653 unsigned long mmu_seq;
3654 bool map_writable, write = error_code & PFERR_WRITE_MASK;
3655 bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
3656 is_nx_huge_page_enabled();
3657
3658 force_pt_level = lpage_disallowed;
3659 level = mapping_level(vcpu, gfn, &force_pt_level);
3660 if (likely(!force_pt_level)) {
3661
3662
3663
3664
3665
3666 if (level > PT_DIRECTORY_LEVEL)
3667 level = PT_DIRECTORY_LEVEL;
3668
3669 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
3670 }
3671
3672 if (fast_page_fault(vcpu, gpa, level, error_code))
3673 return RET_PF_RETRY;
3674
3675 mmu_seq = vcpu->kvm->mmu_notifier_seq;
3676 smp_rmb();
3677
3678 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
3679 return RET_PF_RETRY;
3680
3681 if (handle_abnormal_pfn(vcpu, gpa, gfn, pfn, ACC_ALL, &r))
3682 return r;
3683
3684 r = RET_PF_RETRY;
3685 spin_lock(&vcpu->kvm->mmu_lock);
3686 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
3687 goto out_unlock;
3688 if (make_mmu_pages_available(vcpu) < 0)
3689 goto out_unlock;
3690 if (likely(!force_pt_level))
3691 transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
3692 r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
3693 prefault, false);
3694 out_unlock:
3695 spin_unlock(&vcpu->kvm->mmu_lock);
3696 kvm_release_pfn_clean(pfn);
3697 return r;
3698 }
3699
3700 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
3701 struct list_head *invalid_list)
3702 {
3703 struct kvm_mmu_page *sp;
3704
3705 if (!VALID_PAGE(*root_hpa))
3706 return;
3707
3708 sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK);
3709 --sp->root_count;
3710 if (!sp->root_count && sp->role.invalid)
3711 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
3712
3713 *root_hpa = INVALID_PAGE;
3714 }
3715
3716
3717 void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
3718 ulong roots_to_free)
3719 {
3720 int i;
3721 LIST_HEAD(invalid_list);
3722 bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
3723
3724 BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
3725
3726
3727 if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) {
3728 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3729 if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
3730 VALID_PAGE(mmu->prev_roots[i].hpa))
3731 break;
3732
3733 if (i == KVM_MMU_NUM_PREV_ROOTS)
3734 return;
3735 }
3736
3737 spin_lock(&vcpu->kvm->mmu_lock);
3738
3739 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3740 if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
3741 mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa,
3742 &invalid_list);
3743
3744 if (free_active_root) {
3745 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
3746 (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
3747 mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
3748 &invalid_list);
3749 } else {
3750 for (i = 0; i < 4; ++i)
3751 if (mmu->pae_root[i] != 0)
3752 mmu_free_root_page(vcpu->kvm,
3753 &mmu->pae_root[i],
3754 &invalid_list);
3755 mmu->root_hpa = INVALID_PAGE;
3756 }
3757 mmu->root_cr3 = 0;
3758 }
3759
3760 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3761 spin_unlock(&vcpu->kvm->mmu_lock);
3762 }
3763 EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
3764
3765 static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
3766 {
3767 int ret = 0;
3768
3769 if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
3770 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3771 ret = 1;
3772 }
3773
3774 return ret;
3775 }
3776
3777 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3778 {
3779 struct kvm_mmu_page *sp;
3780 unsigned i;
3781
3782 if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
3783 spin_lock(&vcpu->kvm->mmu_lock);
3784 if(make_mmu_pages_available(vcpu) < 0) {
3785 spin_unlock(&vcpu->kvm->mmu_lock);
3786 return -ENOSPC;
3787 }
3788 sp = kvm_mmu_get_page(vcpu, 0, 0,
3789 vcpu->arch.mmu->shadow_root_level, 1, ACC_ALL);
3790 ++sp->root_count;
3791 spin_unlock(&vcpu->kvm->mmu_lock);
3792 vcpu->arch.mmu->root_hpa = __pa(sp->spt);
3793 } else if (vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL) {
3794 for (i = 0; i < 4; ++i) {
3795 hpa_t root = vcpu->arch.mmu->pae_root[i];
3796
3797 MMU_WARN_ON(VALID_PAGE(root));
3798 spin_lock(&vcpu->kvm->mmu_lock);
3799 if (make_mmu_pages_available(vcpu) < 0) {
3800 spin_unlock(&vcpu->kvm->mmu_lock);
3801 return -ENOSPC;
3802 }
3803 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
3804 i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
3805 root = __pa(sp->spt);
3806 ++sp->root_count;
3807 spin_unlock(&vcpu->kvm->mmu_lock);
3808 vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
3809 }
3810 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
3811 } else
3812 BUG();
3813 vcpu->arch.mmu->root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
3814
3815 return 0;
3816 }
3817
3818 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3819 {
3820 struct kvm_mmu_page *sp;
3821 u64 pdptr, pm_mask;
3822 gfn_t root_gfn, root_cr3;
3823 int i;
3824
3825 root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
3826 root_gfn = root_cr3 >> PAGE_SHIFT;
3827
3828 if (mmu_check_root(vcpu, root_gfn))
3829 return 1;
3830
3831
3832
3833
3834
3835 if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3836 hpa_t root = vcpu->arch.mmu->root_hpa;
3837
3838 MMU_WARN_ON(VALID_PAGE(root));
3839
3840 spin_lock(&vcpu->kvm->mmu_lock);
3841 if (make_mmu_pages_available(vcpu) < 0) {
3842 spin_unlock(&vcpu->kvm->mmu_lock);
3843 return -ENOSPC;
3844 }
3845 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
3846 vcpu->arch.mmu->shadow_root_level, 0, ACC_ALL);
3847 root = __pa(sp->spt);
3848 ++sp->root_count;
3849 spin_unlock(&vcpu->kvm->mmu_lock);
3850 vcpu->arch.mmu->root_hpa = root;
3851 goto set_root_cr3;
3852 }
3853
3854
3855
3856
3857
3858
3859 pm_mask = PT_PRESENT_MASK;
3860 if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL)
3861 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
3862
3863 for (i = 0; i < 4; ++i) {
3864 hpa_t root = vcpu->arch.mmu->pae_root[i];
3865
3866 MMU_WARN_ON(VALID_PAGE(root));
3867 if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
3868 pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
3869 if (!(pdptr & PT_PRESENT_MASK)) {
3870 vcpu->arch.mmu->pae_root[i] = 0;
3871 continue;
3872 }
3873 root_gfn = pdptr >> PAGE_SHIFT;
3874 if (mmu_check_root(vcpu, root_gfn))
3875 return 1;
3876 }
3877 spin_lock(&vcpu->kvm->mmu_lock);
3878 if (make_mmu_pages_available(vcpu) < 0) {
3879 spin_unlock(&vcpu->kvm->mmu_lock);
3880 return -ENOSPC;
3881 }
3882 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
3883 0, ACC_ALL);
3884 root = __pa(sp->spt);
3885 ++sp->root_count;
3886 spin_unlock(&vcpu->kvm->mmu_lock);
3887
3888 vcpu->arch.mmu->pae_root[i] = root | pm_mask;
3889 }
3890 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
3891
3892
3893
3894
3895
3896 if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
3897 if (vcpu->arch.mmu->lm_root == NULL) {
3898
3899
3900
3901
3902
3903 u64 *lm_root;
3904
3905 lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3906 if (lm_root == NULL)
3907 return 1;
3908
3909 lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask;
3910
3911 vcpu->arch.mmu->lm_root = lm_root;
3912 }
3913
3914 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
3915 }
3916
3917 set_root_cr3:
3918 vcpu->arch.mmu->root_cr3 = root_cr3;
3919
3920 return 0;
3921 }
3922
3923 static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
3924 {
3925 if (vcpu->arch.mmu->direct_map)
3926 return mmu_alloc_direct_roots(vcpu);
3927 else
3928 return mmu_alloc_shadow_roots(vcpu);
3929 }
3930
3931 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3932 {
3933 int i;
3934 struct kvm_mmu_page *sp;
3935
3936 if (vcpu->arch.mmu->direct_map)
3937 return;
3938
3939 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3940 return;
3941
3942 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
3943
3944 if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3945 hpa_t root = vcpu->arch.mmu->root_hpa;
3946 sp = page_header(root);
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958 if (!smp_load_acquire(&sp->unsync) &&
3959 !smp_load_acquire(&sp->unsync_children))
3960 return;
3961
3962 spin_lock(&vcpu->kvm->mmu_lock);
3963 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3964
3965 mmu_sync_children(vcpu, sp);
3966
3967 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3968 spin_unlock(&vcpu->kvm->mmu_lock);
3969 return;
3970 }
3971
3972 spin_lock(&vcpu->kvm->mmu_lock);
3973 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3974
3975 for (i = 0; i < 4; ++i) {
3976 hpa_t root = vcpu->arch.mmu->pae_root[i];
3977
3978 if (root && VALID_PAGE(root)) {
3979 root &= PT64_BASE_ADDR_MASK;
3980 sp = page_header(root);
3981 mmu_sync_children(vcpu, sp);
3982 }
3983 }
3984
3985 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3986 spin_unlock(&vcpu->kvm->mmu_lock);
3987 }
3988 EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
3989
3990 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr,
3991 u32 access, struct x86_exception *exception)
3992 {
3993 if (exception)
3994 exception->error_code = 0;
3995 return vaddr;
3996 }
3997
3998 static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr,
3999 u32 access,
4000 struct x86_exception *exception)
4001 {
4002 if (exception)
4003 exception->error_code = 0;
4004 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
4005 }
4006
4007 static bool
4008 __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
4009 {
4010 int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f;
4011
4012 return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) |
4013 ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0);
4014 }
4015
4016 static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
4017 {
4018 return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level);
4019 }
4020
4021 static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
4022 {
4023 return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
4024 }
4025
4026 static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
4027 {
4028
4029
4030
4031
4032 if (mmu_is_nested(vcpu))
4033 return false;
4034
4035 if (direct)
4036 return vcpu_match_mmio_gpa(vcpu, addr);
4037
4038 return vcpu_match_mmio_gva(vcpu, addr);
4039 }
4040
4041
4042 static bool
4043 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
4044 {
4045 struct kvm_shadow_walk_iterator iterator;
4046 u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
4047 int root, leaf;
4048 bool reserved = false;
4049
4050 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
4051 goto exit;
4052
4053 walk_shadow_page_lockless_begin(vcpu);
4054
4055 for (shadow_walk_init(&iterator, vcpu, addr),
4056 leaf = root = iterator.level;
4057 shadow_walk_okay(&iterator);
4058 __shadow_walk_next(&iterator, spte)) {
4059 spte = mmu_spte_get_lockless(iterator.sptep);
4060
4061 sptes[leaf - 1] = spte;
4062 leaf--;
4063
4064 if (!is_shadow_present_pte(spte))
4065 break;
4066
4067 reserved |= is_shadow_zero_bits_set(vcpu->arch.mmu, spte,
4068 iterator.level);
4069 }
4070
4071 walk_shadow_page_lockless_end(vcpu);
4072
4073 if (reserved) {
4074 pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
4075 __func__, addr);
4076 while (root > leaf) {
4077 pr_err("------ spte 0x%llx level %d.\n",
4078 sptes[root - 1], root);
4079 root--;
4080 }
4081 }
4082 exit:
4083 *sptep = spte;
4084 return reserved;
4085 }
4086
4087 static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
4088 {
4089 u64 spte;
4090 bool reserved;
4091
4092 if (mmio_info_in_cache(vcpu, addr, direct))
4093 return RET_PF_EMULATE;
4094
4095 reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
4096 if (WARN_ON(reserved))
4097 return -EINVAL;
4098
4099 if (is_mmio_spte(spte)) {
4100 gfn_t gfn = get_mmio_spte_gfn(spte);
4101 unsigned access = get_mmio_spte_access(spte);
4102
4103 if (!check_mmio_spte(vcpu, spte))
4104 return RET_PF_INVALID;
4105
4106 if (direct)
4107 addr = 0;
4108
4109 trace_handle_mmio_page_fault(addr, gfn, access);
4110 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
4111 return RET_PF_EMULATE;
4112 }
4113
4114
4115
4116
4117
4118 return RET_PF_RETRY;
4119 }
4120
4121 static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
4122 u32 error_code, gfn_t gfn)
4123 {
4124 if (unlikely(error_code & PFERR_RSVD_MASK))
4125 return false;
4126
4127 if (!(error_code & PFERR_PRESENT_MASK) ||
4128 !(error_code & PFERR_WRITE_MASK))
4129 return false;
4130
4131
4132
4133
4134
4135 if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
4136 return true;
4137
4138 return false;
4139 }
4140
4141 static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
4142 {
4143 struct kvm_shadow_walk_iterator iterator;
4144 u64 spte;
4145
4146 if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
4147 return;
4148
4149 walk_shadow_page_lockless_begin(vcpu);
4150 for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
4151 clear_sp_write_flooding_count(iterator.sptep);
4152 if (!is_shadow_present_pte(spte))
4153 break;
4154 }
4155 walk_shadow_page_lockless_end(vcpu);
4156 }
4157
4158 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
4159 u32 error_code, bool prefault)
4160 {
4161 gfn_t gfn = gpa >> PAGE_SHIFT;
4162 int r;
4163
4164
4165 pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code);
4166
4167 if (page_fault_handle_page_track(vcpu, error_code, gfn))
4168 return RET_PF_EMULATE;
4169
4170 r = mmu_topup_memory_caches(vcpu);
4171 if (r)
4172 return r;
4173
4174 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
4175
4176
4177 return nonpaging_map(vcpu, gpa & PAGE_MASK,
4178 error_code, gfn, prefault);
4179 }
4180
4181 static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
4182 gfn_t gfn)
4183 {
4184 struct kvm_arch_async_pf arch;
4185
4186 arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
4187 arch.gfn = gfn;
4188 arch.direct_map = vcpu->arch.mmu->direct_map;
4189 arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu);
4190
4191 return kvm_setup_async_pf(vcpu, cr2_or_gpa,
4192 kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
4193 }
4194
4195 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
4196 gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
4197 bool *writable)
4198 {
4199 struct kvm_memory_slot *slot;
4200 bool async;
4201
4202
4203
4204
4205 if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) {
4206 *pfn = KVM_PFN_NOSLOT;
4207 return false;
4208 }
4209
4210 slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
4211 async = false;
4212 *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
4213 if (!async)
4214 return false;
4215
4216 if (!prefault && kvm_can_do_async_pf(vcpu)) {
4217 trace_kvm_try_async_get_page(cr2_or_gpa, gfn);
4218 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
4219 trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
4220 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
4221 return true;
4222 } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
4223 return true;
4224 }
4225
4226 *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
4227 return false;
4228 }
4229
4230 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
4231 u64 fault_address, char *insn, int insn_len)
4232 {
4233 int r = 1;
4234
4235 #ifndef CONFIG_X86_64
4236
4237 if (WARN_ON_ONCE(fault_address >> 32))
4238 return -EFAULT;
4239 #endif
4240
4241 vcpu->arch.l1tf_flush_l1d = true;
4242 switch (vcpu->arch.apf.host_apf_reason) {
4243 default:
4244 trace_kvm_page_fault(fault_address, error_code);
4245
4246 if (kvm_event_needs_reinjection(vcpu))
4247 kvm_mmu_unprotect_page_virt(vcpu, fault_address);
4248 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
4249 insn_len);
4250 break;
4251 case KVM_PV_REASON_PAGE_NOT_PRESENT:
4252 vcpu->arch.apf.host_apf_reason = 0;
4253 local_irq_disable();
4254 kvm_async_pf_task_wait(fault_address, 0);
4255 local_irq_enable();
4256 break;
4257 case KVM_PV_REASON_PAGE_READY:
4258 vcpu->arch.apf.host_apf_reason = 0;
4259 local_irq_disable();
4260 kvm_async_pf_task_wake(fault_address);
4261 local_irq_enable();
4262 break;
4263 }
4264 return r;
4265 }
4266 EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
4267
4268 static bool
4269 check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
4270 {
4271 int page_num = KVM_PAGES_PER_HPAGE(level);
4272
4273 gfn &= ~(page_num - 1);
4274
4275 return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
4276 }
4277
4278 static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
4279 bool prefault)
4280 {
4281 kvm_pfn_t pfn;
4282 int r;
4283 int level;
4284 bool force_pt_level;
4285 gfn_t gfn = gpa >> PAGE_SHIFT;
4286 unsigned long mmu_seq;
4287 int write = error_code & PFERR_WRITE_MASK;
4288 bool map_writable;
4289 bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
4290 is_nx_huge_page_enabled();
4291
4292 MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
4293
4294 if (page_fault_handle_page_track(vcpu, error_code, gfn))
4295 return RET_PF_EMULATE;
4296
4297 r = mmu_topup_memory_caches(vcpu);
4298 if (r)
4299 return r;
4300
4301 force_pt_level =
4302 lpage_disallowed ||
4303 !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
4304 level = mapping_level(vcpu, gfn, &force_pt_level);
4305 if (likely(!force_pt_level)) {
4306 if (level > PT_DIRECTORY_LEVEL &&
4307 !check_hugepage_cache_consistency(vcpu, gfn, level))
4308 level = PT_DIRECTORY_LEVEL;
4309 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
4310 }
4311
4312 if (fast_page_fault(vcpu, gpa, level, error_code))
4313 return RET_PF_RETRY;
4314
4315 mmu_seq = vcpu->kvm->mmu_notifier_seq;
4316 smp_rmb();
4317
4318 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
4319 return RET_PF_RETRY;
4320
4321 if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
4322 return r;
4323
4324 r = RET_PF_RETRY;
4325 spin_lock(&vcpu->kvm->mmu_lock);
4326 if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
4327 goto out_unlock;
4328 if (make_mmu_pages_available(vcpu) < 0)
4329 goto out_unlock;
4330 if (likely(!force_pt_level))
4331 transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
4332 r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
4333 prefault, lpage_disallowed);
4334 out_unlock:
4335 spin_unlock(&vcpu->kvm->mmu_lock);
4336 kvm_release_pfn_clean(pfn);
4337 return r;
4338 }
4339
4340 static void nonpaging_init_context(struct kvm_vcpu *vcpu,
4341 struct kvm_mmu *context)
4342 {
4343 context->page_fault = nonpaging_page_fault;
4344 context->gva_to_gpa = nonpaging_gva_to_gpa;
4345 context->sync_page = nonpaging_sync_page;
4346 context->invlpg = nonpaging_invlpg;
4347 context->update_pte = nonpaging_update_pte;
4348 context->root_level = 0;
4349 context->shadow_root_level = PT32E_ROOT_LEVEL;
4350 context->direct_map = true;
4351 context->nx = false;
4352 }
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362 static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4363 union kvm_mmu_page_role new_role)
4364 {
4365 uint i;
4366 struct kvm_mmu_root_info root;
4367 struct kvm_mmu *mmu = vcpu->arch.mmu;
4368
4369 root.cr3 = mmu->root_cr3;
4370 root.hpa = mmu->root_hpa;
4371
4372 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
4373 swap(root, mmu->prev_roots[i]);
4374
4375 if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) &&
4376 page_header(root.hpa) != NULL &&
4377 new_role.word == page_header(root.hpa)->role.word)
4378 break;
4379 }
4380
4381 mmu->root_hpa = root.hpa;
4382 mmu->root_cr3 = root.cr3;
4383
4384 return i < KVM_MMU_NUM_PREV_ROOTS;
4385 }
4386
4387 static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4388 union kvm_mmu_page_role new_role,
4389 bool skip_tlb_flush)
4390 {
4391 struct kvm_mmu *mmu = vcpu->arch.mmu;
4392
4393
4394
4395
4396
4397
4398 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
4399 mmu->root_level >= PT64_ROOT_4LEVEL) {
4400 if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
4401 return false;
4402
4403 if (cached_root_available(vcpu, new_cr3, new_role)) {
4404
4405
4406
4407
4408
4409
4410
4411 kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
4412 if (!skip_tlb_flush) {
4413 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
4414 kvm_x86_ops->tlb_flush(vcpu, true);
4415 }
4416
4417
4418
4419
4420
4421
4422
4423
4424 vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
4425
4426 __clear_sp_write_flooding_count(
4427 page_header(mmu->root_hpa));
4428
4429 return true;
4430 }
4431 }
4432
4433 return false;
4434 }
4435
4436 static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4437 union kvm_mmu_page_role new_role,
4438 bool skip_tlb_flush)
4439 {
4440 if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush))
4441 kvm_mmu_free_roots(vcpu, vcpu->arch.mmu,
4442 KVM_MMU_ROOT_CURRENT);
4443 }
4444
4445 void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
4446 {
4447 __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu),
4448 skip_tlb_flush);
4449 }
4450 EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3);
4451
4452 static unsigned long get_cr3(struct kvm_vcpu *vcpu)
4453 {
4454 return kvm_read_cr3(vcpu);
4455 }
4456
4457 static void inject_page_fault(struct kvm_vcpu *vcpu,
4458 struct x86_exception *fault)
4459 {
4460 vcpu->arch.mmu->inject_page_fault(vcpu, fault);
4461 }
4462
4463 static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
4464 unsigned access, int *nr_present)
4465 {
4466 if (unlikely(is_mmio_spte(*sptep))) {
4467 if (gfn != get_mmio_spte_gfn(*sptep)) {
4468 mmu_spte_clear_no_track(sptep);
4469 return true;
4470 }
4471
4472 (*nr_present)++;
4473 mark_mmio_spte(vcpu, sptep, gfn, access);
4474 return true;
4475 }
4476
4477 return false;
4478 }
4479
4480 static inline bool is_last_gpte(struct kvm_mmu *mmu,
4481 unsigned level, unsigned gpte)
4482 {
4483
4484
4485
4486
4487
4488 gpte &= level - mmu->last_nonleaf_level;
4489
4490
4491
4492
4493
4494
4495 gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
4496
4497 return gpte & PT_PAGE_SIZE_MASK;
4498 }
4499
4500 #define PTTYPE_EPT 18
4501 #define PTTYPE PTTYPE_EPT
4502 #include "paging_tmpl.h"
4503 #undef PTTYPE
4504
4505 #define PTTYPE 64
4506 #include "paging_tmpl.h"
4507 #undef PTTYPE
4508
4509 #define PTTYPE 32
4510 #include "paging_tmpl.h"
4511 #undef PTTYPE
4512
4513 static void
4514 __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4515 struct rsvd_bits_validate *rsvd_check,
4516 int maxphyaddr, int level, bool nx, bool gbpages,
4517 bool pse, bool amd)
4518 {
4519 u64 exb_bit_rsvd = 0;
4520 u64 gbpages_bit_rsvd = 0;
4521 u64 nonleaf_bit8_rsvd = 0;
4522
4523 rsvd_check->bad_mt_xwr = 0;
4524
4525 if (!nx)
4526 exb_bit_rsvd = rsvd_bits(63, 63);
4527 if (!gbpages)
4528 gbpages_bit_rsvd = rsvd_bits(7, 7);
4529
4530
4531
4532
4533
4534 if (amd)
4535 nonleaf_bit8_rsvd = rsvd_bits(8, 8);
4536
4537 switch (level) {
4538 case PT32_ROOT_LEVEL:
4539
4540 rsvd_check->rsvd_bits_mask[0][1] = 0;
4541 rsvd_check->rsvd_bits_mask[0][0] = 0;
4542 rsvd_check->rsvd_bits_mask[1][0] =
4543 rsvd_check->rsvd_bits_mask[0][0];
4544
4545 if (!pse) {
4546 rsvd_check->rsvd_bits_mask[1][1] = 0;
4547 break;
4548 }
4549
4550 if (is_cpuid_PSE36())
4551
4552 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
4553 else
4554
4555 rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
4556 break;
4557 case PT32E_ROOT_LEVEL:
4558 rsvd_check->rsvd_bits_mask[0][2] =
4559 rsvd_bits(maxphyaddr, 63) |
4560 rsvd_bits(5, 8) | rsvd_bits(1, 2);
4561 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
4562 rsvd_bits(maxphyaddr, 62);
4563 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
4564 rsvd_bits(maxphyaddr, 62);
4565 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
4566 rsvd_bits(maxphyaddr, 62) |
4567 rsvd_bits(13, 20);
4568 rsvd_check->rsvd_bits_mask[1][0] =
4569 rsvd_check->rsvd_bits_mask[0][0];
4570 break;
4571 case PT64_ROOT_5LEVEL:
4572 rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
4573 nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4574 rsvd_bits(maxphyaddr, 51);
4575 rsvd_check->rsvd_bits_mask[1][4] =
4576 rsvd_check->rsvd_bits_mask[0][4];
4577
4578 case PT64_ROOT_4LEVEL:
4579 rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
4580 nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4581 rsvd_bits(maxphyaddr, 51);
4582 rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
4583 nonleaf_bit8_rsvd | gbpages_bit_rsvd |
4584 rsvd_bits(maxphyaddr, 51);
4585 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
4586 rsvd_bits(maxphyaddr, 51);
4587 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
4588 rsvd_bits(maxphyaddr, 51);
4589 rsvd_check->rsvd_bits_mask[1][3] =
4590 rsvd_check->rsvd_bits_mask[0][3];
4591 rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd |
4592 gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
4593 rsvd_bits(13, 29);
4594 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
4595 rsvd_bits(maxphyaddr, 51) |
4596 rsvd_bits(13, 20);
4597 rsvd_check->rsvd_bits_mask[1][0] =
4598 rsvd_check->rsvd_bits_mask[0][0];
4599 break;
4600 }
4601 }
4602
4603 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4604 struct kvm_mmu *context)
4605 {
4606 __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
4607 cpuid_maxphyaddr(vcpu), context->root_level,
4608 context->nx,
4609 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4610 is_pse(vcpu), guest_cpuid_is_amd(vcpu));
4611 }
4612
4613 static void
4614 __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
4615 int maxphyaddr, bool execonly)
4616 {
4617 u64 bad_mt_xwr;
4618
4619 rsvd_check->rsvd_bits_mask[0][4] =
4620 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
4621 rsvd_check->rsvd_bits_mask[0][3] =
4622 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
4623 rsvd_check->rsvd_bits_mask[0][2] =
4624 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
4625 rsvd_check->rsvd_bits_mask[0][1] =
4626 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
4627 rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
4628
4629
4630 rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
4631 rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
4632 rsvd_check->rsvd_bits_mask[1][2] =
4633 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
4634 rsvd_check->rsvd_bits_mask[1][1] =
4635 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
4636 rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
4637
4638 bad_mt_xwr = 0xFFull << (2 * 8);
4639 bad_mt_xwr |= 0xFFull << (3 * 8);
4640 bad_mt_xwr |= 0xFFull << (7 * 8);
4641 bad_mt_xwr |= REPEAT_BYTE(1ull << 2);
4642 bad_mt_xwr |= REPEAT_BYTE(1ull << 6);
4643 if (!execonly) {
4644
4645 bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
4646 }
4647 rsvd_check->bad_mt_xwr = bad_mt_xwr;
4648 }
4649
4650 static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
4651 struct kvm_mmu *context, bool execonly)
4652 {
4653 __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
4654 cpuid_maxphyaddr(vcpu), execonly);
4655 }
4656
4657
4658
4659
4660
4661
4662 void
4663 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
4664 {
4665 bool uses_nx = context->nx ||
4666 context->mmu_role.base.smep_andnot_wp;
4667 struct rsvd_bits_validate *shadow_zero_check;
4668 int i;
4669
4670
4671
4672
4673
4674 shadow_zero_check = &context->shadow_zero_check;
4675 __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4676 shadow_phys_bits,
4677 context->shadow_root_level, uses_nx,
4678 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4679 is_pse(vcpu), true);
4680
4681 if (!shadow_me_mask)
4682 return;
4683
4684 for (i = context->shadow_root_level; --i >= 0;) {
4685 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4686 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4687 }
4688
4689 }
4690 EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
4691
4692 static inline bool boot_cpu_is_amd(void)
4693 {
4694 WARN_ON_ONCE(!tdp_enabled);
4695 return shadow_x_mask == 0;
4696 }
4697
4698
4699
4700
4701
4702 static void
4703 reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4704 struct kvm_mmu *context)
4705 {
4706 struct rsvd_bits_validate *shadow_zero_check;
4707 int i;
4708
4709 shadow_zero_check = &context->shadow_zero_check;
4710
4711 if (boot_cpu_is_amd())
4712 __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4713 shadow_phys_bits,
4714 context->shadow_root_level, false,
4715 boot_cpu_has(X86_FEATURE_GBPAGES),
4716 true, true);
4717 else
4718 __reset_rsvds_bits_mask_ept(shadow_zero_check,
4719 shadow_phys_bits,
4720 false);
4721
4722 if (!shadow_me_mask)
4723 return;
4724
4725 for (i = context->shadow_root_level; --i >= 0;) {
4726 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4727 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4728 }
4729 }
4730
4731
4732
4733
4734
4735 static void
4736 reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4737 struct kvm_mmu *context, bool execonly)
4738 {
4739 __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
4740 shadow_phys_bits, execonly);
4741 }
4742
4743 #define BYTE_MASK(access) \
4744 ((1 & (access) ? 2 : 0) | \
4745 (2 & (access) ? 4 : 0) | \
4746 (3 & (access) ? 8 : 0) | \
4747 (4 & (access) ? 16 : 0) | \
4748 (5 & (access) ? 32 : 0) | \
4749 (6 & (access) ? 64 : 0) | \
4750 (7 & (access) ? 128 : 0))
4751
4752
4753 static void update_permission_bitmask(struct kvm_vcpu *vcpu,
4754 struct kvm_mmu *mmu, bool ept)
4755 {
4756 unsigned byte;
4757
4758 const u8 x = BYTE_MASK(ACC_EXEC_MASK);
4759 const u8 w = BYTE_MASK(ACC_WRITE_MASK);
4760 const u8 u = BYTE_MASK(ACC_USER_MASK);
4761
4762 bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0;
4763 bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0;
4764 bool cr0_wp = is_write_protection(vcpu);
4765
4766 for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
4767 unsigned pfec = byte << 1;
4768
4769
4770
4771
4772
4773
4774
4775 u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
4776
4777 u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
4778
4779 u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
4780
4781 u8 smepf = 0;
4782
4783 u8 smapf = 0;
4784
4785 if (!ept) {
4786
4787 u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
4788
4789
4790 if (!mmu->nx)
4791 ff = 0;
4792
4793
4794 if (!cr0_wp)
4795 wf = (pfec & PFERR_USER_MASK) ? wf : 0;
4796
4797
4798 if (cr4_smep)
4799 smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817 if (cr4_smap)
4818 smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
4819 }
4820
4821 mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
4822 }
4823 }
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849 static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
4850 bool ept)
4851 {
4852 unsigned bit;
4853 bool wp;
4854
4855 if (ept) {
4856 mmu->pkru_mask = 0;
4857 return;
4858 }
4859
4860
4861 if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) {
4862 mmu->pkru_mask = 0;
4863 return;
4864 }
4865
4866 wp = is_write_protection(vcpu);
4867
4868 for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
4869 unsigned pfec, pkey_bits;
4870 bool check_pkey, check_write, ff, uf, wf, pte_user;
4871
4872 pfec = bit << 1;
4873 ff = pfec & PFERR_FETCH_MASK;
4874 uf = pfec & PFERR_USER_MASK;
4875 wf = pfec & PFERR_WRITE_MASK;
4876
4877
4878 pte_user = pfec & PFERR_RSVD_MASK;
4879
4880
4881
4882
4883
4884 check_pkey = (!ff && pte_user);
4885
4886
4887
4888
4889 check_write = check_pkey && wf && (uf || wp);
4890
4891
4892 pkey_bits = !!check_pkey;
4893
4894 pkey_bits |= (!!check_write) << 1;
4895
4896 mmu->pkru_mask |= (pkey_bits & 3) << pfec;
4897 }
4898 }
4899
4900 static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
4901 {
4902 unsigned root_level = mmu->root_level;
4903
4904 mmu->last_nonleaf_level = root_level;
4905 if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
4906 mmu->last_nonleaf_level++;
4907 }
4908
4909 static void paging64_init_context_common(struct kvm_vcpu *vcpu,
4910 struct kvm_mmu *context,
4911 int level)
4912 {
4913 context->nx = is_nx(vcpu);
4914 context->root_level = level;
4915
4916 reset_rsvds_bits_mask(vcpu, context);
4917 update_permission_bitmask(vcpu, context, false);
4918 update_pkru_bitmask(vcpu, context, false);
4919 update_last_nonleaf_level(vcpu, context);
4920
4921 MMU_WARN_ON(!is_pae(vcpu));
4922 context->page_fault = paging64_page_fault;
4923 context->gva_to_gpa = paging64_gva_to_gpa;
4924 context->sync_page = paging64_sync_page;
4925 context->invlpg = paging64_invlpg;
4926 context->update_pte = paging64_update_pte;
4927 context->shadow_root_level = level;
4928 context->direct_map = false;
4929 }
4930
4931 static void paging64_init_context(struct kvm_vcpu *vcpu,
4932 struct kvm_mmu *context)
4933 {
4934 int root_level = is_la57_mode(vcpu) ?
4935 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
4936
4937 paging64_init_context_common(vcpu, context, root_level);
4938 }
4939
4940 static void paging32_init_context(struct kvm_vcpu *vcpu,
4941 struct kvm_mmu *context)
4942 {
4943 context->nx = false;
4944 context->root_level = PT32_ROOT_LEVEL;
4945
4946 reset_rsvds_bits_mask(vcpu, context);
4947 update_permission_bitmask(vcpu, context, false);
4948 update_pkru_bitmask(vcpu, context, false);
4949 update_last_nonleaf_level(vcpu, context);
4950
4951 context->page_fault = paging32_page_fault;
4952 context->gva_to_gpa = paging32_gva_to_gpa;
4953 context->sync_page = paging32_sync_page;
4954 context->invlpg = paging32_invlpg;
4955 context->update_pte = paging32_update_pte;
4956 context->shadow_root_level = PT32E_ROOT_LEVEL;
4957 context->direct_map = false;
4958 }
4959
4960 static void paging32E_init_context(struct kvm_vcpu *vcpu,
4961 struct kvm_mmu *context)
4962 {
4963 paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
4964 }
4965
4966 static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
4967 {
4968 union kvm_mmu_extended_role ext = {0};
4969
4970 ext.cr0_pg = !!is_paging(vcpu);
4971 ext.cr4_pae = !!is_pae(vcpu);
4972 ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
4973 ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
4974 ext.cr4_pse = !!is_pse(vcpu);
4975 ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
4976 ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
4977 ext.maxphyaddr = cpuid_maxphyaddr(vcpu);
4978
4979 ext.valid = 1;
4980
4981 return ext;
4982 }
4983
4984 static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
4985 bool base_only)
4986 {
4987 union kvm_mmu_role role = {0};
4988
4989 role.base.access = ACC_ALL;
4990 role.base.nxe = !!is_nx(vcpu);
4991 role.base.cr0_wp = is_write_protection(vcpu);
4992 role.base.smm = is_smm(vcpu);
4993 role.base.guest_mode = is_guest_mode(vcpu);
4994
4995 if (base_only)
4996 return role;
4997
4998 role.ext = kvm_calc_mmu_role_ext(vcpu);
4999
5000 return role;
5001 }
5002
5003 static union kvm_mmu_role
5004 kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
5005 {
5006 union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
5007
5008 role.base.ad_disabled = (shadow_accessed_mask == 0);
5009 role.base.level = kvm_x86_ops->get_tdp_level(vcpu);
5010 role.base.direct = true;
5011 role.base.gpte_is_8_bytes = true;
5012
5013 return role;
5014 }
5015
5016 static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
5017 {
5018 struct kvm_mmu *context = vcpu->arch.mmu;
5019 union kvm_mmu_role new_role =
5020 kvm_calc_tdp_mmu_root_page_role(vcpu, false);
5021
5022 new_role.base.word &= mmu_base_role_mask.word;
5023 if (new_role.as_u64 == context->mmu_role.as_u64)
5024 return;
5025
5026 context->mmu_role.as_u64 = new_role.as_u64;
5027 context->page_fault = tdp_page_fault;
5028 context->sync_page = nonpaging_sync_page;
5029 context->invlpg = nonpaging_invlpg;
5030 context->update_pte = nonpaging_update_pte;
5031 context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
5032 context->direct_map = true;
5033 context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
5034 context->get_cr3 = get_cr3;
5035 context->get_pdptr = kvm_pdptr_read;
5036 context->inject_page_fault = kvm_inject_page_fault;
5037
5038 if (!is_paging(vcpu)) {
5039 context->nx = false;
5040 context->gva_to_gpa = nonpaging_gva_to_gpa;
5041 context->root_level = 0;
5042 } else if (is_long_mode(vcpu)) {
5043 context->nx = is_nx(vcpu);
5044 context->root_level = is_la57_mode(vcpu) ?
5045 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
5046 reset_rsvds_bits_mask(vcpu, context);
5047 context->gva_to_gpa = paging64_gva_to_gpa;
5048 } else if (is_pae(vcpu)) {
5049 context->nx = is_nx(vcpu);
5050 context->root_level = PT32E_ROOT_LEVEL;
5051 reset_rsvds_bits_mask(vcpu, context);
5052 context->gva_to_gpa = paging64_gva_to_gpa;
5053 } else {
5054 context->nx = false;
5055 context->root_level = PT32_ROOT_LEVEL;
5056 reset_rsvds_bits_mask(vcpu, context);
5057 context->gva_to_gpa = paging32_gva_to_gpa;
5058 }
5059
5060 update_permission_bitmask(vcpu, context, false);
5061 update_pkru_bitmask(vcpu, context, false);
5062 update_last_nonleaf_level(vcpu, context);
5063 reset_tdp_shadow_zero_bits_mask(vcpu, context);
5064 }
5065
5066 static union kvm_mmu_role
5067 kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
5068 {
5069 union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
5070
5071 role.base.smep_andnot_wp = role.ext.cr4_smep &&
5072 !is_write_protection(vcpu);
5073 role.base.smap_andnot_wp = role.ext.cr4_smap &&
5074 !is_write_protection(vcpu);
5075 role.base.direct = !is_paging(vcpu);
5076 role.base.gpte_is_8_bytes = !!is_pae(vcpu);
5077
5078 if (!is_long_mode(vcpu))
5079 role.base.level = PT32E_ROOT_LEVEL;
5080 else if (is_la57_mode(vcpu))
5081 role.base.level = PT64_ROOT_5LEVEL;
5082 else
5083 role.base.level = PT64_ROOT_4LEVEL;
5084
5085 return role;
5086 }
5087
5088 void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
5089 {
5090 struct kvm_mmu *context = vcpu->arch.mmu;
5091 union kvm_mmu_role new_role =
5092 kvm_calc_shadow_mmu_root_page_role(vcpu, false);
5093
5094 new_role.base.word &= mmu_base_role_mask.word;
5095 if (new_role.as_u64 == context->mmu_role.as_u64)
5096 return;
5097
5098 if (!is_paging(vcpu))
5099 nonpaging_init_context(vcpu, context);
5100 else if (is_long_mode(vcpu))
5101 paging64_init_context(vcpu, context);
5102 else if (is_pae(vcpu))
5103 paging32E_init_context(vcpu, context);
5104 else
5105 paging32_init_context(vcpu, context);
5106
5107 context->mmu_role.as_u64 = new_role.as_u64;
5108 reset_shadow_zero_bits_mask(vcpu, context);
5109 }
5110 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
5111
5112 static union kvm_mmu_role
5113 kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
5114 bool execonly)
5115 {
5116 union kvm_mmu_role role = {0};
5117
5118
5119 role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
5120
5121 role.base.level = PT64_ROOT_4LEVEL;
5122 role.base.gpte_is_8_bytes = true;
5123 role.base.direct = false;
5124 role.base.ad_disabled = !accessed_dirty;
5125 role.base.guest_mode = true;
5126 role.base.access = ACC_ALL;
5127
5128
5129
5130
5131
5132 role.base.cr0_wp = true;
5133 role.base.smap_andnot_wp = true;
5134
5135 role.ext = kvm_calc_mmu_role_ext(vcpu);
5136 role.ext.execonly = execonly;
5137
5138 return role;
5139 }
5140
5141 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
5142 bool accessed_dirty, gpa_t new_eptp)
5143 {
5144 struct kvm_mmu *context = vcpu->arch.mmu;
5145 union kvm_mmu_role new_role =
5146 kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
5147 execonly);
5148
5149 __kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false);
5150
5151 new_role.base.word &= mmu_base_role_mask.word;
5152 if (new_role.as_u64 == context->mmu_role.as_u64)
5153 return;
5154
5155 context->shadow_root_level = PT64_ROOT_4LEVEL;
5156
5157 context->nx = true;
5158 context->ept_ad = accessed_dirty;
5159 context->page_fault = ept_page_fault;
5160 context->gva_to_gpa = ept_gva_to_gpa;
5161 context->sync_page = ept_sync_page;
5162 context->invlpg = ept_invlpg;
5163 context->update_pte = ept_update_pte;
5164 context->root_level = PT64_ROOT_4LEVEL;
5165 context->direct_map = false;
5166 context->mmu_role.as_u64 = new_role.as_u64;
5167
5168 update_permission_bitmask(vcpu, context, true);
5169 update_pkru_bitmask(vcpu, context, true);
5170 update_last_nonleaf_level(vcpu, context);
5171 reset_rsvds_bits_mask_ept(vcpu, context, execonly);
5172 reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
5173 }
5174 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
5175
5176 static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
5177 {
5178 struct kvm_mmu *context = vcpu->arch.mmu;
5179
5180 kvm_init_shadow_mmu(vcpu);
5181 context->set_cr3 = kvm_x86_ops->set_cr3;
5182 context->get_cr3 = get_cr3;
5183 context->get_pdptr = kvm_pdptr_read;
5184 context->inject_page_fault = kvm_inject_page_fault;
5185 }
5186
5187 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
5188 {
5189 union kvm_mmu_role new_role = kvm_calc_mmu_role_common(vcpu, false);
5190 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
5191
5192 new_role.base.word &= mmu_base_role_mask.word;
5193 if (new_role.as_u64 == g_context->mmu_role.as_u64)
5194 return;
5195
5196 g_context->mmu_role.as_u64 = new_role.as_u64;
5197 g_context->get_cr3 = get_cr3;
5198 g_context->get_pdptr = kvm_pdptr_read;
5199 g_context->inject_page_fault = kvm_inject_page_fault;
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209 if (!is_paging(vcpu)) {
5210 g_context->nx = false;
5211 g_context->root_level = 0;
5212 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
5213 } else if (is_long_mode(vcpu)) {
5214 g_context->nx = is_nx(vcpu);
5215 g_context->root_level = is_la57_mode(vcpu) ?
5216 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
5217 reset_rsvds_bits_mask(vcpu, g_context);
5218 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
5219 } else if (is_pae(vcpu)) {
5220 g_context->nx = is_nx(vcpu);
5221 g_context->root_level = PT32E_ROOT_LEVEL;
5222 reset_rsvds_bits_mask(vcpu, g_context);
5223 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
5224 } else {
5225 g_context->nx = false;
5226 g_context->root_level = PT32_ROOT_LEVEL;
5227 reset_rsvds_bits_mask(vcpu, g_context);
5228 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
5229 }
5230
5231 update_permission_bitmask(vcpu, g_context, false);
5232 update_pkru_bitmask(vcpu, g_context, false);
5233 update_last_nonleaf_level(vcpu, g_context);
5234 }
5235
5236 void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
5237 {
5238 if (reset_roots) {
5239 uint i;
5240
5241 vcpu->arch.mmu->root_hpa = INVALID_PAGE;
5242
5243 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5244 vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5245 }
5246
5247 if (mmu_is_nested(vcpu))
5248 init_kvm_nested_mmu(vcpu);
5249 else if (tdp_enabled)
5250 init_kvm_tdp_mmu(vcpu);
5251 else
5252 init_kvm_softmmu(vcpu);
5253 }
5254 EXPORT_SYMBOL_GPL(kvm_init_mmu);
5255
5256 static union kvm_mmu_page_role
5257 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
5258 {
5259 union kvm_mmu_role role;
5260
5261 if (tdp_enabled)
5262 role = kvm_calc_tdp_mmu_root_page_role(vcpu, true);
5263 else
5264 role = kvm_calc_shadow_mmu_root_page_role(vcpu, true);
5265
5266 return role.base;
5267 }
5268
5269 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
5270 {
5271 kvm_mmu_unload(vcpu);
5272 kvm_init_mmu(vcpu, true);
5273 }
5274 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
5275
5276 int kvm_mmu_load(struct kvm_vcpu *vcpu)
5277 {
5278 int r;
5279
5280 r = mmu_topup_memory_caches(vcpu);
5281 if (r)
5282 goto out;
5283 r = mmu_alloc_roots(vcpu);
5284 kvm_mmu_sync_roots(vcpu);
5285 if (r)
5286 goto out;
5287 kvm_mmu_load_cr3(vcpu);
5288 kvm_x86_ops->tlb_flush(vcpu, true);
5289 out:
5290 return r;
5291 }
5292 EXPORT_SYMBOL_GPL(kvm_mmu_load);
5293
5294 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
5295 {
5296 kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
5297 WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa));
5298 kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
5299 WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
5300 }
5301 EXPORT_SYMBOL_GPL(kvm_mmu_unload);
5302
5303 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
5304 struct kvm_mmu_page *sp, u64 *spte,
5305 const void *new)
5306 {
5307 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
5308 ++vcpu->kvm->stat.mmu_pde_zapped;
5309 return;
5310 }
5311
5312 ++vcpu->kvm->stat.mmu_pte_updated;
5313 vcpu->arch.mmu->update_pte(vcpu, sp, spte, new);
5314 }
5315
5316 static bool need_remote_flush(u64 old, u64 new)
5317 {
5318 if (!is_shadow_present_pte(old))
5319 return false;
5320 if (!is_shadow_present_pte(new))
5321 return true;
5322 if ((old ^ new) & PT64_BASE_ADDR_MASK)
5323 return true;
5324 old ^= shadow_nx_mask;
5325 new ^= shadow_nx_mask;
5326 return (old & ~new & PT64_PERM_MASK) != 0;
5327 }
5328
5329 static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
5330 int *bytes)
5331 {
5332 u64 gentry = 0;
5333 int r;
5334
5335
5336
5337
5338
5339
5340 if (is_pae(vcpu) && *bytes == 4) {
5341
5342 *gpa &= ~(gpa_t)7;
5343 *bytes = 8;
5344 }
5345
5346 if (*bytes == 4 || *bytes == 8) {
5347 r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
5348 if (r)
5349 gentry = 0;
5350 }
5351
5352 return gentry;
5353 }
5354
5355
5356
5357
5358
5359 static bool detect_write_flooding(struct kvm_mmu_page *sp)
5360 {
5361
5362
5363
5364
5365 if (sp->role.level == PT_PAGE_TABLE_LEVEL)
5366 return false;
5367
5368 atomic_inc(&sp->write_flooding_count);
5369 return atomic_read(&sp->write_flooding_count) >= 3;
5370 }
5371
5372
5373
5374
5375
5376 static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
5377 int bytes)
5378 {
5379 unsigned offset, pte_size, misaligned;
5380
5381 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
5382 gpa, bytes, sp->role.word);
5383
5384 offset = offset_in_page(gpa);
5385 pte_size = sp->role.gpte_is_8_bytes ? 8 : 4;
5386
5387
5388
5389
5390
5391 if (!(offset & (pte_size - 1)) && bytes == 1)
5392 return false;
5393
5394 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
5395 misaligned |= bytes < 4;
5396
5397 return misaligned;
5398 }
5399
5400 static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
5401 {
5402 unsigned page_offset, quadrant;
5403 u64 *spte;
5404 int level;
5405
5406 page_offset = offset_in_page(gpa);
5407 level = sp->role.level;
5408 *nspte = 1;
5409 if (!sp->role.gpte_is_8_bytes) {
5410 page_offset <<= 1;
5411
5412
5413
5414
5415
5416 if (level == PT32_ROOT_LEVEL) {
5417 page_offset &= ~7;
5418 page_offset <<= 1;
5419 *nspte = 2;
5420 }
5421 quadrant = page_offset >> PAGE_SHIFT;
5422 page_offset &= ~PAGE_MASK;
5423 if (quadrant != sp->role.quadrant)
5424 return NULL;
5425 }
5426
5427 spte = &sp->spt[page_offset / sizeof(*spte)];
5428 return spte;
5429 }
5430
5431 static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
5432 const u8 *new, int bytes,
5433 struct kvm_page_track_notifier_node *node)
5434 {
5435 gfn_t gfn = gpa >> PAGE_SHIFT;
5436 struct kvm_mmu_page *sp;
5437 LIST_HEAD(invalid_list);
5438 u64 entry, gentry, *spte;
5439 int npte;
5440 bool remote_flush, local_flush;
5441
5442
5443
5444
5445
5446 if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
5447 return;
5448
5449 remote_flush = local_flush = false;
5450
5451 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
5452
5453
5454
5455
5456
5457
5458 mmu_topup_memory_caches(vcpu);
5459
5460 spin_lock(&vcpu->kvm->mmu_lock);
5461
5462 gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
5463
5464 ++vcpu->kvm->stat.mmu_pte_write;
5465 kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
5466
5467 for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
5468 if (detect_write_misaligned(sp, gpa, bytes) ||
5469 detect_write_flooding(sp)) {
5470 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
5471 ++vcpu->kvm->stat.mmu_flooded;
5472 continue;
5473 }
5474
5475 spte = get_written_sptes(sp, gpa, &npte);
5476 if (!spte)
5477 continue;
5478
5479 local_flush = true;
5480 while (npte--) {
5481 u32 base_role = vcpu->arch.mmu->mmu_role.base.word;
5482
5483 entry = *spte;
5484 mmu_page_zap_pte(vcpu->kvm, sp, spte);
5485 if (gentry &&
5486 !((sp->role.word ^ base_role)
5487 & mmu_base_role_mask.word) && rmap_can_add(vcpu))
5488 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
5489 if (need_remote_flush(entry, *spte))
5490 remote_flush = true;
5491 ++spte;
5492 }
5493 }
5494 kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
5495 kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
5496 spin_unlock(&vcpu->kvm->mmu_lock);
5497 }
5498
5499 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
5500 {
5501 gpa_t gpa;
5502 int r;
5503
5504 if (vcpu->arch.mmu->direct_map)
5505 return 0;
5506
5507 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
5508
5509 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
5510
5511 return r;
5512 }
5513 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
5514
5515 static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
5516 {
5517 LIST_HEAD(invalid_list);
5518
5519 if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
5520 return 0;
5521
5522 while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
5523 if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
5524 break;
5525
5526 ++vcpu->kvm->stat.mmu_recycled;
5527 }
5528 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
5529
5530 if (!kvm_mmu_available_pages(vcpu->kvm))
5531 return -ENOSPC;
5532 return 0;
5533 }
5534
5535 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
5536 void *insn, int insn_len)
5537 {
5538 int r, emulation_type = 0;
5539 bool direct = vcpu->arch.mmu->direct_map;
5540
5541
5542 if (vcpu->arch.mmu->direct_map) {
5543 vcpu->arch.gpa_available = true;
5544 vcpu->arch.gpa_val = cr2_or_gpa;
5545 }
5546
5547 r = RET_PF_INVALID;
5548 if (unlikely(error_code & PFERR_RSVD_MASK)) {
5549 r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
5550 if (r == RET_PF_EMULATE)
5551 goto emulate;
5552 }
5553
5554 if (r == RET_PF_INVALID) {
5555 r = vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa,
5556 lower_32_bits(error_code),
5557 false);
5558 WARN_ON(r == RET_PF_INVALID);
5559 }
5560
5561 if (r == RET_PF_RETRY)
5562 return 1;
5563 if (r < 0)
5564 return r;
5565
5566
5567
5568
5569
5570
5571
5572
5573 if (vcpu->arch.mmu->direct_map &&
5574 (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
5575 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
5576 return 1;
5577 }
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590 if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
5591 emulation_type = EMULTYPE_ALLOW_RETRY;
5592 emulate:
5593
5594
5595
5596
5597
5598
5599
5600 if (unlikely(insn && !insn_len)) {
5601 if (!kvm_x86_ops->need_emulation_on_page_fault(vcpu))
5602 return 1;
5603 }
5604
5605 return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
5606 insn_len);
5607 }
5608 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
5609
5610 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
5611 {
5612 struct kvm_mmu *mmu = vcpu->arch.mmu;
5613 int i;
5614
5615
5616 if (is_noncanonical_address(gva, vcpu))
5617 return;
5618
5619 mmu->invlpg(vcpu, gva, mmu->root_hpa);
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5633 if (VALID_PAGE(mmu->prev_roots[i].hpa))
5634 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5635
5636 kvm_x86_ops->tlb_flush_gva(vcpu, gva);
5637 ++vcpu->stat.invlpg;
5638 }
5639 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
5640
5641 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
5642 {
5643 struct kvm_mmu *mmu = vcpu->arch.mmu;
5644 bool tlb_flush = false;
5645 uint i;
5646
5647 if (pcid == kvm_get_active_pcid(vcpu)) {
5648 mmu->invlpg(vcpu, gva, mmu->root_hpa);
5649 tlb_flush = true;
5650 }
5651
5652 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5653 if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
5654 pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) {
5655 mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5656 tlb_flush = true;
5657 }
5658 }
5659
5660 if (tlb_flush)
5661 kvm_x86_ops->tlb_flush_gva(vcpu, gva);
5662
5663 ++vcpu->stat.invlpg;
5664
5665
5666
5667
5668
5669
5670 }
5671 EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
5672
5673 void kvm_enable_tdp(void)
5674 {
5675 tdp_enabled = true;
5676 }
5677 EXPORT_SYMBOL_GPL(kvm_enable_tdp);
5678
5679 void kvm_disable_tdp(void)
5680 {
5681 tdp_enabled = false;
5682 }
5683 EXPORT_SYMBOL_GPL(kvm_disable_tdp);
5684
5685
5686
5687 typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
5688
5689
5690 static __always_inline bool
5691 slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
5692 slot_level_handler fn, int start_level, int end_level,
5693 gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
5694 {
5695 struct slot_rmap_walk_iterator iterator;
5696 bool flush = false;
5697
5698 for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
5699 end_gfn, &iterator) {
5700 if (iterator.rmap)
5701 flush |= fn(kvm, iterator.rmap);
5702
5703 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
5704 if (flush && lock_flush_tlb) {
5705 kvm_flush_remote_tlbs_with_address(kvm,
5706 start_gfn,
5707 iterator.gfn - start_gfn + 1);
5708 flush = false;
5709 }
5710 cond_resched_lock(&kvm->mmu_lock);
5711 }
5712 }
5713
5714 if (flush && lock_flush_tlb) {
5715 kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
5716 end_gfn - start_gfn + 1);
5717 flush = false;
5718 }
5719
5720 return flush;
5721 }
5722
5723 static __always_inline bool
5724 slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5725 slot_level_handler fn, int start_level, int end_level,
5726 bool lock_flush_tlb)
5727 {
5728 return slot_handle_level_range(kvm, memslot, fn, start_level,
5729 end_level, memslot->base_gfn,
5730 memslot->base_gfn + memslot->npages - 1,
5731 lock_flush_tlb);
5732 }
5733
5734 static __always_inline bool
5735 slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5736 slot_level_handler fn, bool lock_flush_tlb)
5737 {
5738 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
5739 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
5740 }
5741
5742 static __always_inline bool
5743 slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5744 slot_level_handler fn, bool lock_flush_tlb)
5745 {
5746 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
5747 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
5748 }
5749
5750 static __always_inline bool
5751 slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
5752 slot_level_handler fn, bool lock_flush_tlb)
5753 {
5754 return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
5755 PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
5756 }
5757
5758 static void free_mmu_pages(struct kvm_mmu *mmu)
5759 {
5760 free_page((unsigned long)mmu->pae_root);
5761 free_page((unsigned long)mmu->lm_root);
5762 }
5763
5764 static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
5765 {
5766 struct page *page;
5767 int i;
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778 if (tdp_enabled && kvm_x86_ops->get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
5779 return 0;
5780
5781 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
5782 if (!page)
5783 return -ENOMEM;
5784
5785 mmu->pae_root = page_address(page);
5786 for (i = 0; i < 4; ++i)
5787 mmu->pae_root[i] = INVALID_PAGE;
5788
5789 return 0;
5790 }
5791
5792 int kvm_mmu_create(struct kvm_vcpu *vcpu)
5793 {
5794 uint i;
5795 int ret;
5796
5797 vcpu->arch.mmu = &vcpu->arch.root_mmu;
5798 vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
5799
5800 vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
5801 vcpu->arch.root_mmu.root_cr3 = 0;
5802 vcpu->arch.root_mmu.translate_gpa = translate_gpa;
5803 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5804 vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5805
5806 vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
5807 vcpu->arch.guest_mmu.root_cr3 = 0;
5808 vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
5809 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5810 vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5811
5812 vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
5813
5814 ret = alloc_mmu_pages(vcpu, &vcpu->arch.guest_mmu);
5815 if (ret)
5816 return ret;
5817
5818 ret = alloc_mmu_pages(vcpu, &vcpu->arch.root_mmu);
5819 if (ret)
5820 goto fail_allocate_root;
5821
5822 return ret;
5823 fail_allocate_root:
5824 free_mmu_pages(&vcpu->arch.guest_mmu);
5825 return ret;
5826 }
5827
5828 #define BATCH_ZAP_PAGES 10
5829 static void kvm_zap_obsolete_pages(struct kvm *kvm)
5830 {
5831 struct kvm_mmu_page *sp, *node;
5832 int nr_zapped, batch = 0;
5833
5834 restart:
5835 list_for_each_entry_safe_reverse(sp, node,
5836 &kvm->arch.active_mmu_pages, link) {
5837
5838
5839
5840
5841 if (!is_obsolete_sp(kvm, sp))
5842 break;
5843
5844
5845
5846
5847
5848
5849
5850 if (sp->role.invalid && sp->root_count)
5851 continue;
5852
5853
5854
5855
5856
5857
5858
5859 if (batch >= BATCH_ZAP_PAGES &&
5860 cond_resched_lock(&kvm->mmu_lock)) {
5861 batch = 0;
5862 goto restart;
5863 }
5864
5865 if (__kvm_mmu_prepare_zap_page(kvm, sp,
5866 &kvm->arch.zapped_obsolete_pages, &nr_zapped)) {
5867 batch += nr_zapped;
5868 goto restart;
5869 }
5870 }
5871
5872
5873
5874
5875
5876
5877 kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
5878 }
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889 static void kvm_mmu_zap_all_fast(struct kvm *kvm)
5890 {
5891 lockdep_assert_held(&kvm->slots_lock);
5892
5893 spin_lock(&kvm->mmu_lock);
5894 trace_kvm_mmu_zap_all_fast(kvm);
5895
5896
5897
5898
5899
5900
5901
5902
5903 kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913 kvm_reload_remote_mmus(kvm);
5914
5915 kvm_zap_obsolete_pages(kvm);
5916 spin_unlock(&kvm->mmu_lock);
5917 }
5918
5919 static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
5920 {
5921 return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
5922 }
5923
5924 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
5925 struct kvm_memory_slot *slot,
5926 struct kvm_page_track_notifier_node *node)
5927 {
5928 kvm_mmu_zap_all_fast(kvm);
5929 }
5930
5931 void kvm_mmu_init_vm(struct kvm *kvm)
5932 {
5933 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5934
5935 node->track_write = kvm_mmu_pte_write;
5936 node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
5937 kvm_page_track_register_notifier(kvm, node);
5938 }
5939
5940 void kvm_mmu_uninit_vm(struct kvm *kvm)
5941 {
5942 struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5943
5944 kvm_page_track_unregister_notifier(kvm, node);
5945 }
5946
5947 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
5948 {
5949 struct kvm_memslots *slots;
5950 struct kvm_memory_slot *memslot;
5951 int i;
5952
5953 spin_lock(&kvm->mmu_lock);
5954 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
5955 slots = __kvm_memslots(kvm, i);
5956 kvm_for_each_memslot(memslot, slots) {
5957 gfn_t start, end;
5958
5959 start = max(gfn_start, memslot->base_gfn);
5960 end = min(gfn_end, memslot->base_gfn + memslot->npages);
5961 if (start >= end)
5962 continue;
5963
5964 slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
5965 PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
5966 start, end - 1, true);
5967 }
5968 }
5969
5970 spin_unlock(&kvm->mmu_lock);
5971 }
5972
5973 static bool slot_rmap_write_protect(struct kvm *kvm,
5974 struct kvm_rmap_head *rmap_head)
5975 {
5976 return __rmap_write_protect(kvm, rmap_head, false);
5977 }
5978
5979 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
5980 struct kvm_memory_slot *memslot)
5981 {
5982 bool flush;
5983
5984 spin_lock(&kvm->mmu_lock);
5985 flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
5986 false);
5987 spin_unlock(&kvm->mmu_lock);
5988
5989
5990
5991
5992
5993
5994 lockdep_assert_held(&kvm->slots_lock);
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007 if (flush)
6008 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
6009 memslot->npages);
6010 }
6011
6012 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
6013 struct kvm_rmap_head *rmap_head)
6014 {
6015 u64 *sptep;
6016 struct rmap_iterator iter;
6017 int need_tlb_flush = 0;
6018 kvm_pfn_t pfn;
6019 struct kvm_mmu_page *sp;
6020
6021 restart:
6022 for_each_rmap_spte(rmap_head, &iter, sptep) {
6023 sp = page_header(__pa(sptep));
6024 pfn = spte_to_pfn(*sptep);
6025
6026
6027
6028
6029
6030
6031
6032
6033 if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
6034 !kvm_is_zone_device_pfn(pfn) &&
6035 PageTransCompoundMap(pfn_to_page(pfn))) {
6036 pte_list_remove(rmap_head, sptep);
6037
6038 if (kvm_available_flush_tlb_with_range())
6039 kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
6040 KVM_PAGES_PER_HPAGE(sp->role.level));
6041 else
6042 need_tlb_flush = 1;
6043
6044 goto restart;
6045 }
6046 }
6047
6048 return need_tlb_flush;
6049 }
6050
6051 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
6052 const struct kvm_memory_slot *memslot)
6053 {
6054
6055 spin_lock(&kvm->mmu_lock);
6056 slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
6057 kvm_mmu_zap_collapsible_spte, true);
6058 spin_unlock(&kvm->mmu_lock);
6059 }
6060
6061 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
6062 struct kvm_memory_slot *memslot)
6063 {
6064 bool flush;
6065
6066 spin_lock(&kvm->mmu_lock);
6067 flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
6068 spin_unlock(&kvm->mmu_lock);
6069
6070 lockdep_assert_held(&kvm->slots_lock);
6071
6072
6073
6074
6075
6076
6077
6078 if (flush)
6079 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
6080 memslot->npages);
6081 }
6082 EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
6083
6084 void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
6085 struct kvm_memory_slot *memslot)
6086 {
6087 bool flush;
6088
6089 spin_lock(&kvm->mmu_lock);
6090 flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
6091 false);
6092 spin_unlock(&kvm->mmu_lock);
6093
6094
6095 lockdep_assert_held(&kvm->slots_lock);
6096
6097 if (flush)
6098 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
6099 memslot->npages);
6100 }
6101 EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
6102
6103 void kvm_mmu_slot_set_dirty(struct kvm *kvm,
6104 struct kvm_memory_slot *memslot)
6105 {
6106 bool flush;
6107
6108 spin_lock(&kvm->mmu_lock);
6109 flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
6110 spin_unlock(&kvm->mmu_lock);
6111
6112 lockdep_assert_held(&kvm->slots_lock);
6113
6114
6115 if (flush)
6116 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
6117 memslot->npages);
6118 }
6119 EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
6120
6121 void kvm_mmu_zap_all(struct kvm *kvm)
6122 {
6123 struct kvm_mmu_page *sp, *node;
6124 LIST_HEAD(invalid_list);
6125 int ign;
6126
6127 spin_lock(&kvm->mmu_lock);
6128 restart:
6129 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
6130 if (sp->role.invalid && sp->root_count)
6131 continue;
6132 if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
6133 goto restart;
6134 if (cond_resched_lock(&kvm->mmu_lock))
6135 goto restart;
6136 }
6137
6138 kvm_mmu_commit_zap_page(kvm, &invalid_list);
6139 spin_unlock(&kvm->mmu_lock);
6140 }
6141
6142 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
6143 {
6144 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
6145
6146 gen &= MMIO_SPTE_GEN_MASK;
6147
6148
6149
6150
6151
6152
6153
6154
6155 gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
6156
6157
6158
6159
6160
6161 if (unlikely(gen == 0)) {
6162 kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
6163 kvm_mmu_zap_all_fast(kvm);
6164 }
6165 }
6166
6167 static unsigned long
6168 mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
6169 {
6170 struct kvm *kvm;
6171 int nr_to_scan = sc->nr_to_scan;
6172 unsigned long freed = 0;
6173
6174 mutex_lock(&kvm_lock);
6175
6176 list_for_each_entry(kvm, &vm_list, vm_list) {
6177 int idx;
6178 LIST_HEAD(invalid_list);
6179
6180
6181
6182
6183
6184
6185
6186 if (!nr_to_scan--)
6187 break;
6188
6189
6190
6191
6192
6193
6194 if (!kvm->arch.n_used_mmu_pages &&
6195 !kvm_has_zapped_obsolete_pages(kvm))
6196 continue;
6197
6198 idx = srcu_read_lock(&kvm->srcu);
6199 spin_lock(&kvm->mmu_lock);
6200
6201 if (kvm_has_zapped_obsolete_pages(kvm)) {
6202 kvm_mmu_commit_zap_page(kvm,
6203 &kvm->arch.zapped_obsolete_pages);
6204 goto unlock;
6205 }
6206
6207 if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
6208 freed++;
6209 kvm_mmu_commit_zap_page(kvm, &invalid_list);
6210
6211 unlock:
6212 spin_unlock(&kvm->mmu_lock);
6213 srcu_read_unlock(&kvm->srcu, idx);
6214
6215
6216
6217
6218
6219
6220 list_move_tail(&kvm->vm_list, &vm_list);
6221 break;
6222 }
6223
6224 mutex_unlock(&kvm_lock);
6225 return freed;
6226 }
6227
6228 static unsigned long
6229 mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
6230 {
6231 return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
6232 }
6233
6234 static struct shrinker mmu_shrinker = {
6235 .count_objects = mmu_shrink_count,
6236 .scan_objects = mmu_shrink_scan,
6237 .seeks = DEFAULT_SEEKS * 10,
6238 };
6239
6240 static void mmu_destroy_caches(void)
6241 {
6242 kmem_cache_destroy(pte_list_desc_cache);
6243 kmem_cache_destroy(mmu_page_header_cache);
6244 }
6245
6246 static void kvm_set_mmio_spte_mask(void)
6247 {
6248 u64 mask;
6249
6250
6251
6252
6253
6254
6255
6256
6257 if (shadow_phys_bits < 52)
6258 mask = BIT_ULL(51) | PT_PRESENT_MASK;
6259 else
6260 mask = 0;
6261
6262 kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
6263 }
6264
6265 static bool get_nx_auto_mode(void)
6266 {
6267
6268 return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
6269 }
6270
6271 static void __set_nx_huge_pages(bool val)
6272 {
6273 nx_huge_pages = itlb_multihit_kvm_mitigation = val;
6274 }
6275
6276 static int set_nx_huge_pages(const char *val, const struct kernel_param *kp)
6277 {
6278 bool old_val = nx_huge_pages;
6279 bool new_val;
6280
6281
6282 if (sysfs_streq(val, "off"))
6283 new_val = 0;
6284 else if (sysfs_streq(val, "force"))
6285 new_val = 1;
6286 else if (sysfs_streq(val, "auto"))
6287 new_val = get_nx_auto_mode();
6288 else if (strtobool(val, &new_val) < 0)
6289 return -EINVAL;
6290
6291 __set_nx_huge_pages(new_val);
6292
6293 if (new_val != old_val) {
6294 struct kvm *kvm;
6295
6296 mutex_lock(&kvm_lock);
6297
6298 list_for_each_entry(kvm, &vm_list, vm_list) {
6299 mutex_lock(&kvm->slots_lock);
6300 kvm_mmu_zap_all_fast(kvm);
6301 mutex_unlock(&kvm->slots_lock);
6302
6303 wake_up_process(kvm->arch.nx_lpage_recovery_thread);
6304 }
6305 mutex_unlock(&kvm_lock);
6306 }
6307
6308 return 0;
6309 }
6310
6311 int kvm_mmu_module_init(void)
6312 {
6313 int ret = -ENOMEM;
6314
6315 if (nx_huge_pages == -1)
6316 __set_nx_huge_pages(get_nx_auto_mode());
6317
6318
6319
6320
6321
6322
6323
6324 BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
6325 BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
6326 BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));
6327
6328 kvm_mmu_reset_all_pte_masks();
6329
6330 kvm_set_mmio_spte_mask();
6331
6332 pte_list_desc_cache = kmem_cache_create("pte_list_desc",
6333 sizeof(struct pte_list_desc),
6334 0, SLAB_ACCOUNT, NULL);
6335 if (!pte_list_desc_cache)
6336 goto out;
6337
6338 mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
6339 sizeof(struct kvm_mmu_page),
6340 0, SLAB_ACCOUNT, NULL);
6341 if (!mmu_page_header_cache)
6342 goto out;
6343
6344 if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
6345 goto out;
6346
6347 ret = register_shrinker(&mmu_shrinker);
6348 if (ret)
6349 goto out;
6350
6351 return 0;
6352
6353 out:
6354 mmu_destroy_caches();
6355 return ret;
6356 }
6357
6358
6359
6360
6361 unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
6362 {
6363 unsigned long nr_mmu_pages;
6364 unsigned long nr_pages = 0;
6365 struct kvm_memslots *slots;
6366 struct kvm_memory_slot *memslot;
6367 int i;
6368
6369 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
6370 slots = __kvm_memslots(kvm, i);
6371
6372 kvm_for_each_memslot(memslot, slots)
6373 nr_pages += memslot->npages;
6374 }
6375
6376 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
6377 nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
6378
6379 return nr_mmu_pages;
6380 }
6381
6382 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
6383 {
6384 kvm_mmu_unload(vcpu);
6385 free_mmu_pages(&vcpu->arch.root_mmu);
6386 free_mmu_pages(&vcpu->arch.guest_mmu);
6387 mmu_free_memory_caches(vcpu);
6388 }
6389
6390 void kvm_mmu_module_exit(void)
6391 {
6392 mmu_destroy_caches();
6393 percpu_counter_destroy(&kvm_total_used_mmu_pages);
6394 unregister_shrinker(&mmu_shrinker);
6395 mmu_audit_disable();
6396 }
6397
6398 static int set_nx_huge_pages_recovery_ratio(const char *val, const struct kernel_param *kp)
6399 {
6400 unsigned int old_val;
6401 int err;
6402
6403 old_val = nx_huge_pages_recovery_ratio;
6404 err = param_set_uint(val, kp);
6405 if (err)
6406 return err;
6407
6408 if (READ_ONCE(nx_huge_pages) &&
6409 !old_val && nx_huge_pages_recovery_ratio) {
6410 struct kvm *kvm;
6411
6412 mutex_lock(&kvm_lock);
6413
6414 list_for_each_entry(kvm, &vm_list, vm_list)
6415 wake_up_process(kvm->arch.nx_lpage_recovery_thread);
6416
6417 mutex_unlock(&kvm_lock);
6418 }
6419
6420 return err;
6421 }
6422
6423 static void kvm_recover_nx_lpages(struct kvm *kvm)
6424 {
6425 int rcu_idx;
6426 struct kvm_mmu_page *sp;
6427 unsigned int ratio;
6428 LIST_HEAD(invalid_list);
6429 ulong to_zap;
6430
6431 rcu_idx = srcu_read_lock(&kvm->srcu);
6432 spin_lock(&kvm->mmu_lock);
6433
6434 ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
6435 to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0;
6436 while (to_zap && !list_empty(&kvm->arch.lpage_disallowed_mmu_pages)) {
6437
6438
6439
6440
6441
6442 sp = list_first_entry(&kvm->arch.lpage_disallowed_mmu_pages,
6443 struct kvm_mmu_page,
6444 lpage_disallowed_link);
6445 WARN_ON_ONCE(!sp->lpage_disallowed);
6446 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
6447 WARN_ON_ONCE(sp->lpage_disallowed);
6448
6449 if (!--to_zap || need_resched() || spin_needbreak(&kvm->mmu_lock)) {
6450 kvm_mmu_commit_zap_page(kvm, &invalid_list);
6451 if (to_zap)
6452 cond_resched_lock(&kvm->mmu_lock);
6453 }
6454 }
6455
6456 spin_unlock(&kvm->mmu_lock);
6457 srcu_read_unlock(&kvm->srcu, rcu_idx);
6458 }
6459
6460 static long get_nx_lpage_recovery_timeout(u64 start_time)
6461 {
6462 return READ_ONCE(nx_huge_pages) && READ_ONCE(nx_huge_pages_recovery_ratio)
6463 ? start_time + 60 * HZ - get_jiffies_64()
6464 : MAX_SCHEDULE_TIMEOUT;
6465 }
6466
6467 static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)
6468 {
6469 u64 start_time;
6470 long remaining_time;
6471
6472 while (true) {
6473 start_time = get_jiffies_64();
6474 remaining_time = get_nx_lpage_recovery_timeout(start_time);
6475
6476 set_current_state(TASK_INTERRUPTIBLE);
6477 while (!kthread_should_stop() && remaining_time > 0) {
6478 schedule_timeout(remaining_time);
6479 remaining_time = get_nx_lpage_recovery_timeout(start_time);
6480 set_current_state(TASK_INTERRUPTIBLE);
6481 }
6482
6483 set_current_state(TASK_RUNNING);
6484
6485 if (kthread_should_stop())
6486 return 0;
6487
6488 kvm_recover_nx_lpages(kvm);
6489 }
6490 }
6491
6492 int kvm_mmu_post_init_vm(struct kvm *kvm)
6493 {
6494 int err;
6495
6496 err = kvm_vm_create_worker_thread(kvm, kvm_nx_lpage_recovery_worker, 0,
6497 "kvm-nx-lpage-recovery",
6498 &kvm->arch.nx_lpage_recovery_thread);
6499 if (!err)
6500 kthread_unpark(kvm->arch.nx_lpage_recovery_thread);
6501
6502 return err;
6503 }
6504
6505 void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
6506 {
6507 if (kvm->arch.nx_lpage_recovery_thread)
6508 kthread_stop(kvm->arch.nx_lpage_recovery_thread);
6509 }