/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity <avi@qumranet.com>
 *   Yaniv Kamay <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "irq.h"
#include "mmu.h"
#include "cpuid.h"

#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/moduleparam.h>
#include <linux/mod_devicetable.h>
#include <linux/ftrace_event.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/hrtimer.h>
#include "kvm_cache_regs.h"
#include "x86.h"

#include <asm/io.h>
#include <asm/desc.h>
#include <asm/vmx.h>
#include <asm/virtext.h>
#include <asm/mce.h>
#include <asm/i387.h>
#include <asm/xcr.h>
#include <asm/perf_event.h>
#include <asm/debugreg.h>
#include <asm/kexec.h>
#include <asm/apic.h>

#include "trace.h"

#define __ex(x) __kvm_handle_fault_on_reboot(x)
#define __ex_clear(x, reg) \
	____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

static const struct x86_cpu_id vmx_cpu_id[] = {
	X86_FEATURE_MATCH(X86_FEATURE_VMX),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);

static bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);

static bool __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);

static bool __read_mostly enable_ept = 1;
module_param_named(ept, enable_ept, bool, S_IRUGO);

static bool __read_mostly enable_unrestricted_guest = 1;
module_param_named(unrestricted_guest,
			enable_unrestricted_guest, bool, S_IRUGO);

static bool __read_mostly enable_ept_ad_bits = 1;
module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);

static bool __read_mostly emulate_invalid_guest_state = true;
module_param(emulate_invalid_guest_state, bool, S_IRUGO);

static bool __read_mostly vmm_exclusive = 1;
module_param(vmm_exclusive, bool, S_IRUGO);

static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, S_IRUGO);

static bool __read_mostly enable_apicv = 1;
module_param(enable_apicv, bool, S_IRUGO);

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
/*
 * If nested=1, nested virtualization is supported, i.e., guests may use
 * VMX and be a hypervisor for their own guests. If nested=0, guests may not
 * use VMX instructions.
 */
static bool __read_mostly nested = 0;
module_param(nested, bool, S_IRUGO);

static u64 __read_mostly host_xss;

static bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO);

#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
#define KVM_VM_CR0_ALWAYS_ON						\
	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
#define KVM_CR4_GUEST_OWNED_BITS				      \
	(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR      \
	 | X86_CR4_OSXMMEXCPT | X86_CR4_TSD)

#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)

#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

/*
 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
 * ple_gap:    upper bound on the amount of time between two successive
 *             executions of PAUSE in a loop. Also indicates whether PLE is
 *             enabled. According to test, this time is usually smaller than
 *             128 cycles.
 * ple_window: upper bound on the amount of time a guest is allowed to execute
 *             in a PAUSE loop. Tests indicate that most spinlocks are held for
 *             less than 2^12 cycles.
 * Time is measured based on a counter that runs at the same rate as the TSC,
 * refer to SDM volume 3b section 21.6.13 & 22.1.3.
 */
#define KVM_VMX_DEFAULT_PLE_GAP           128
#define KVM_VMX_DEFAULT_PLE_WINDOW        4096
#define KVM_VMX_DEFAULT_PLE_WINDOW_GROW   2
#define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0
#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX    \
		INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW

static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
module_param(ple_gap, int, S_IRUGO);

static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
module_param(ple_window, int, S_IRUGO);

/* Default doubles per-vcpu window every exit. */
static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW;
module_param(ple_window_grow, int, S_IRUGO);

/* Default resets per-vcpu window every exit to ple_window. */
static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK;
module_param(ple_window_shrink, int, S_IRUGO);

/* Default is to compute the maximum so we can never overflow. */
static int ple_window_actual_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
static int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
module_param(ple_window_max, int, S_IRUGO);

extern const ulong vmx_return;

#define NR_AUTOLOAD_MSRS 8
#define VMCS02_POOL_SIZE 1

struct vmcs {
	u32 revision_id;
	u32 abort;
	char data[0];
};

/*
 * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
 * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
 * loaded on this CPU (so we can clear them if the CPU goes down).
 */
struct loaded_vmcs {
	struct vmcs *vmcs;
	int cpu;
	int launched;
	struct list_head loaded_vmcss_on_cpu_link;
};

struct shared_msr_entry {
	unsigned index;
	u64 data;
	u64 mask;
};

/*
 * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
 * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
 * a VMCS structure, and vmcs12 is our emulated VMX's VMCS.
 * This structure is stored in guest memory specified by VMPTRLD, but is opaque
 * to the guest, which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
 * More than one of these structures may exist, if L1 runs multiple L2 guests.
 * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
 * underlying hardware which will be used to run L2.
 * This structure is packed to ensure that its layout is identical across
 * machines (necessary for live migration).
 * If there are changes in this struct, VMCS12_REVISION must be changed.
 */
typedef u64 natural_width;
struct __packed vmcs12 {
	/* According to the Intel spec, a VMCS region must start with the
	 * following two fields. Then follow implementation-specific data.
	 */
	u32 revision_id;
	u32 abort;

	u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
	u32 padding[7]; /* room for future expansion */

	u64 io_bitmap_a;
	u64 io_bitmap_b;
	u64 msr_bitmap;
	u64 vm_exit_msr_store_addr;
	u64 vm_exit_msr_load_addr;
	u64 vm_entry_msr_load_addr;
	u64 tsc_offset;
	u64 virtual_apic_page_addr;
	u64 apic_access_addr;
	u64 posted_intr_desc_addr;
	u64 ept_pointer;
	u64 eoi_exit_bitmap0;
	u64 eoi_exit_bitmap1;
	u64 eoi_exit_bitmap2;
	u64 eoi_exit_bitmap3;
	u64 xss_exit_bitmap;
	u64 guest_physical_address;
	u64 vmcs_link_pointer;
	u64 guest_ia32_debugctl;
	u64 guest_ia32_pat;
	u64 guest_ia32_efer;
	u64 guest_ia32_perf_global_ctrl;
	u64 guest_pdptr0;
	u64 guest_pdptr1;
	u64 guest_pdptr2;
	u64 guest_pdptr3;
	u64 guest_bndcfgs;
	u64 host_ia32_pat;
	u64 host_ia32_efer;
	u64 host_ia32_perf_global_ctrl;
	u64 padding64[8]; /* room for future expansion */
	/*
	 * To allow migration of L1 (complete with its L2 guests) between
	 * machines of different natural widths (32 or 64 bit), we cannot have
	 * unsigned long fields with no explicit size. We use u64 (aliased
	 * natural_width) instead. Luckily, x86 is little-endian.
	 */
	natural_width cr0_guest_host_mask;
	natural_width cr4_guest_host_mask;
	natural_width cr0_read_shadow;
	natural_width cr4_read_shadow;
	natural_width cr3_target_value0;
	natural_width cr3_target_value1;
	natural_width cr3_target_value2;
	natural_width cr3_target_value3;
	natural_width exit_qualification;
	natural_width guest_linear_address;
	natural_width guest_cr0;
	natural_width guest_cr3;
	natural_width guest_cr4;
	natural_width guest_es_base;
	natural_width guest_cs_base;
	natural_width guest_ss_base;
	natural_width guest_ds_base;
	natural_width guest_fs_base;
	natural_width guest_gs_base;
	natural_width guest_ldtr_base;
	natural_width guest_tr_base;
	natural_width guest_gdtr_base;
	natural_width guest_idtr_base;
	natural_width guest_dr7;
	natural_width guest_rsp;
	natural_width guest_rip;
	natural_width guest_rflags;
	natural_width guest_pending_dbg_exceptions;
	natural_width guest_sysenter_esp;
	natural_width guest_sysenter_eip;
	natural_width host_cr0;
	natural_width host_cr3;
	natural_width host_cr4;
	natural_width host_fs_base;
	natural_width host_gs_base;
	natural_width host_tr_base;
	natural_width host_gdtr_base;
	natural_width host_idtr_base;
	natural_width host_ia32_sysenter_esp;
	natural_width host_ia32_sysenter_eip;
	natural_width host_rsp;
	natural_width host_rip;
	natural_width paddingl[8]; /* room for future expansion */
	u32 pin_based_vm_exec_control;
	u32 cpu_based_vm_exec_control;
	u32 exception_bitmap;
	u32 page_fault_error_code_mask;
	u32 page_fault_error_code_match;
	u32 cr3_target_count;
	u32 vm_exit_controls;
	u32 vm_exit_msr_store_count;
	u32 vm_exit_msr_load_count;
	u32 vm_entry_controls;
	u32 vm_entry_msr_load_count;
	u32 vm_entry_intr_info_field;
	u32 vm_entry_exception_error_code;
	u32 vm_entry_instruction_len;
	u32 tpr_threshold;
	u32 secondary_vm_exec_control;
	u32 vm_instruction_error;
	u32 vm_exit_reason;
	u32 vm_exit_intr_info;
	u32 vm_exit_intr_error_code;
	u32 idt_vectoring_info_field;
	u32 idt_vectoring_error_code;
	u32 vm_exit_instruction_len;
	u32 vmx_instruction_info;
	u32 guest_es_limit;
	u32 guest_cs_limit;
	u32 guest_ss_limit;
	u32 guest_ds_limit;
	u32 guest_fs_limit;
	u32 guest_gs_limit;
	u32 guest_ldtr_limit;
	u32 guest_tr_limit;
	u32 guest_gdtr_limit;
	u32 guest_idtr_limit;
	u32 guest_es_ar_bytes;
	u32 guest_cs_ar_bytes;
	u32 guest_ss_ar_bytes;
	u32 guest_ds_ar_bytes;
	u32 guest_fs_ar_bytes;
	u32 guest_gs_ar_bytes;
	u32 guest_ldtr_ar_bytes;
	u32 guest_tr_ar_bytes;
	u32 guest_interruptibility_info;
	u32 guest_activity_state;
	u32 guest_sysenter_cs;
	u32 host_ia32_sysenter_cs;
	u32 vmx_preemption_timer_value;
	u32 padding32[7]; /* room for future expansion */
	u16 virtual_processor_id;
	u16 posted_intr_nv;
	u16 guest_es_selector;
	u16 guest_cs_selector;
	u16 guest_ss_selector;
	u16 guest_ds_selector;
	u16 guest_fs_selector;
	u16 guest_gs_selector;
	u16 guest_ldtr_selector;
	u16 guest_tr_selector;
	u16 guest_intr_status;
	u16 host_es_selector;
	u16 host_cs_selector;
	u16 host_ss_selector;
	u16 host_ds_selector;
	u16 host_fs_selector;
	u16 host_gs_selector;
	u16 host_tr_selector;
};

/*
 * VMCS12_REVISION is an arbitrary id that should be changed if the content or
 * layout of struct vmcs12 is changed.
 * MSR_IA32_VMX_BASIC returns this id, and
 * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
 */
#define VMCS12_REVISION 0x11e57ed0

/*
 * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
 * and any VMCS region. Although only sizeof(struct vmcs12) is used by the
 * current implementation, 4K are reserved to avoid future complications.
 */
#define VMCS12_SIZE 0x1000

/* Used to remember the last vmcs02 used for some recently used vmcs12s */
struct vmcs02_list {
	struct list_head list;
	gpa_t vmptr;
	struct loaded_vmcs vmcs02;
};

/*
 * The nested_vmx structure is part of vcpu_vmx, and holds information we need
 * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
 */
struct nested_vmx {
	/* Has the level 1 guest done vmxon? */
	bool vmxon;
	gpa_t vmxon_ptr;

	/* The guest-physical address of the current VMCS L1 keeps for L2 */
	gpa_t current_vmptr;
	/* The host-usable pointer to the above */
	struct page *current_vmcs12_page;
	struct vmcs12 *current_vmcs12;
	struct vmcs *current_shadow_vmcs;
	/*
	 * Indicates if the shadow vmcs must be updated with the
	 * data held by vmcs12
	 */
	bool sync_shadow_vmcs;

	/* vmcs02_list cache of VMCSs recently used to run L2 guests */
	struct list_head vmcs02_pool;
	int vmcs02_num;
	u64 vmcs01_tsc_offset;
	/* L2 must run next, and mustn't decide to exit to L1. */
	bool nested_run_pending;
	/*
	 * Guest pages referred to in vmcs02 with host-physical pointers, so
	 * we must keep them pinned while L2 runs.
	 */
	struct page *apic_access_page;
	struct page *virtual_apic_page;
	struct page *pi_desc_page;
	struct pi_desc *pi_desc;
	bool pi_pending;
	u16 posted_intr_nv;
	u64 msr_ia32_feature_control;

	struct hrtimer preemption_timer;
	bool preemption_timer_expired;

	/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
	u64 vmcs01_debugctl;

	u32 nested_vmx_procbased_ctls_low;
	u32 nested_vmx_procbased_ctls_high;
	u32 nested_vmx_true_procbased_ctls_low;
	u32 nested_vmx_secondary_ctls_low;
	u32 nested_vmx_secondary_ctls_high;
	u32 nested_vmx_pinbased_ctls_low;
	u32 nested_vmx_pinbased_ctls_high;
	u32 nested_vmx_exit_ctls_low;
	u32 nested_vmx_exit_ctls_high;
	u32 nested_vmx_true_exit_ctls_low;
	u32 nested_vmx_entry_ctls_low;
	u32 nested_vmx_entry_ctls_high;
	u32 nested_vmx_true_entry_ctls_low;
	u32 nested_vmx_misc_low;
	u32 nested_vmx_misc_high;
	u32 nested_vmx_ept_caps;
};

#define POSTED_INTR_ON 0
/* Posted-Interrupt Descriptor */
struct pi_desc {
	u32 pir[8];	/* Posted interrupt requested */
	u32 control;	/* bit 0 of control is outstanding notification bit */
	u32 rsvd[7];
} __aligned(64);

static bool pi_test_and_set_on(struct pi_desc *pi_desc)
{
	return test_and_set_bit(POSTED_INTR_ON,
			(unsigned long *)&pi_desc->control);
}

static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
{
	return test_and_clear_bit(POSTED_INTR_ON,
			(unsigned long *)&pi_desc->control);
}

static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
{
	return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
}
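/*
 * Usage sketch (illustrative, not taken from this file): a sender marks the
 * vector in pir[] via pi_test_and_set_pir(), then sets the
 * outstanding-notification bit via pi_test_and_set_on(); only when ON was
 * previously clear does it need to send the notification interrupt that
 * causes the pending bits to be synced into the virtual APIC.
 */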
struct vcpu_vmx {
	struct kvm_vcpu vcpu;
	unsigned long host_rsp;
	u8 fail;
	bool nmi_known_unmasked;
	u32 exit_intr_info;
	u32 idt_vectoring_info;
	ulong rflags;
	struct shared_msr_entry *guest_msrs;
	int nmsrs;
	int save_nmsrs;
	unsigned long host_idt_base;
#ifdef CONFIG_X86_64
	u64 msr_host_kernel_gs_base;
	u64 msr_guest_kernel_gs_base;
#endif
	u32 vm_entry_controls_shadow;
	u32 vm_exit_controls_shadow;
	/*
	 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
	 * non-nested (L1) guest, it always points to vmcs01. For a nested
	 * guest (L2), it points to a different VMCS.
	 */
	struct loaded_vmcs vmcs01;
	struct loaded_vmcs *loaded_vmcs;
	bool __launched; /* temporary, used in vmx_vcpu_run */
	struct msr_autoload {
		unsigned nr;
		struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
		struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
	} msr_autoload;
	struct {
		int loaded;
		u16 fs_sel, gs_sel, ldt_sel;
#ifdef CONFIG_X86_64
		u16 ds_sel, es_sel;
#endif
		int gs_ldt_reload_needed;
		int fs_reload_needed;
		u64 msr_host_bndcfgs;
		unsigned long vmcs_host_cr4;	/* May not match real cr4 */
	} host_state;
	struct {
		int vm86_active;
		ulong save_rflags;
		struct kvm_segment segs[8];
	} rmode;
	struct {
		u32 bitmask; /* 4 bits per segment (1 bit per field) */
		struct kvm_save_segment {
			u16 selector;
			unsigned long base;
			u32 limit;
			u32 ar;
		} seg[8];
	} segment_cache;
	int vpid;
	bool emulation_required;

	/* Support for vnmi-less CPUs */
	int soft_vnmi_blocked;
	ktime_t entry_time;
	s64 vnmi_blocked_time;
	u32 exit_reason;

	bool rdtscp_enabled;

	/* Posted interrupt descriptor */
	struct pi_desc pi_desc;

	/* Support for a guest hypervisor (nested VMX) */
	struct nested_vmx nested;

	/* Dynamic PLE window. */
	int ple_window;
	bool ple_window_dirty;

	/* Support for PML */
#define PML_ENTITY_NUM 512
	struct page *pml_pg;
};

enum segment_cache_field {
	SEG_FIELD_SEL = 0,
	SEG_FIELD_BASE = 1,
	SEG_FIELD_LIMIT = 2,
	SEG_FIELD_AR = 3,

	SEG_FIELD_NR = 4
};

static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
{
	return container_of(vcpu, struct vcpu_vmx, vcpu);
}

#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
#define FIELD(number, name)	[number] = VMCS12_OFFSET(name)
#define FIELD64(number, name)	[number] = VMCS12_OFFSET(name), \
				[number##_HIGH] = VMCS12_OFFSET(name)+4


static unsigned long shadow_read_only_fields[] = {
	/*
	 * We do NOT shadow fields that are modified when L0
	 * traps and emulates any vmx instruction (e.g. VMPTRLD,
	 * VMXON...) executed by L1.
	 * For example, VM_INSTRUCTION_ERROR is read
	 * by L1 if a vmx instruction fails (part of the error path).
	 * Note the code assumes this logic. If for some reason
	 * we start shadowing these fields then we need to
	 * force a shadow sync when L0 emulates vmx instructions
	 * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
	 * by nested_vmx_failValid)
	 */
	VM_EXIT_REASON,
	VM_EXIT_INTR_INFO,
	VM_EXIT_INSTRUCTION_LEN,
	IDT_VECTORING_INFO_FIELD,
	IDT_VECTORING_ERROR_CODE,
	VM_EXIT_INTR_ERROR_CODE,
	EXIT_QUALIFICATION,
	GUEST_LINEAR_ADDRESS,
	GUEST_PHYSICAL_ADDRESS
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static unsigned long shadow_read_write_fields[] = {
	TPR_THRESHOLD,
	GUEST_RIP,
	GUEST_RSP,
	GUEST_CR0,
	GUEST_CR3,
	GUEST_CR4,
	GUEST_INTERRUPTIBILITY_INFO,
	GUEST_RFLAGS,
	GUEST_CS_SELECTOR,
	GUEST_CS_AR_BYTES,
	GUEST_CS_LIMIT,
	GUEST_CS_BASE,
	GUEST_ES_BASE,
	GUEST_BNDCFGS,
	CR0_GUEST_HOST_MASK,
	CR0_READ_SHADOW,
	CR4_READ_SHADOW,
	TSC_OFFSET,
	EXCEPTION_BITMAP,
	CPU_BASED_VM_EXEC_CONTROL,
	VM_ENTRY_EXCEPTION_ERROR_CODE,
	VM_ENTRY_INTR_INFO_FIELD,
	VM_ENTRY_INSTRUCTION_LEN,
	HOST_FS_BASE,
	HOST_GS_BASE,
	HOST_FS_SELECTOR,
	HOST_GS_SELECTOR
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

static const unsigned short vmcs_field_to_offset_table[] = {
	FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
	FIELD(POSTED_INTR_NV, posted_intr_nv),
	FIELD(GUEST_ES_SELECTOR, guest_es_selector),
	FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
	FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
	FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
	FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
	FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
	FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
	FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
	FIELD(GUEST_INTR_STATUS, guest_intr_status),
	FIELD(HOST_ES_SELECTOR, host_es_selector),
	FIELD(HOST_CS_SELECTOR, host_cs_selector),
	FIELD(HOST_SS_SELECTOR, host_ss_selector),
	FIELD(HOST_DS_SELECTOR, host_ds_selector),
	FIELD(HOST_FS_SELECTOR, host_fs_selector),
	FIELD(HOST_GS_SELECTOR, host_gs_selector),
	FIELD(HOST_TR_SELECTOR, host_tr_selector),
	FIELD64(IO_BITMAP_A, io_bitmap_a),
	FIELD64(IO_BITMAP_B, io_bitmap_b),
	FIELD64(MSR_BITMAP, msr_bitmap),
	FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
	FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
	FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
	FIELD64(TSC_OFFSET, tsc_offset),
	FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
	FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
	FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
	FIELD64(EPT_POINTER, ept_pointer),
	FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
	FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
	FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
	FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
	FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
	FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
	FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
	FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
	FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
	FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
	FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
	FIELD64(GUEST_PDPTR0, guest_pdptr0),
	FIELD64(GUEST_PDPTR1, guest_pdptr1),
	FIELD64(GUEST_PDPTR2, guest_pdptr2),
	FIELD64(GUEST_PDPTR3, guest_pdptr3),
	FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
	FIELD64(HOST_IA32_PAT, host_ia32_pat),
	FIELD64(HOST_IA32_EFER, host_ia32_efer),
	FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
	FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
	FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
	FIELD(EXCEPTION_BITMAP, exception_bitmap),
	FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
	FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
	FIELD(CR3_TARGET_COUNT, cr3_target_count),
	FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
	FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
	FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
	FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
	FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
	FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
	FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
	FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
	FIELD(TPR_THRESHOLD, tpr_threshold),
	FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
	FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
	FIELD(VM_EXIT_REASON, vm_exit_reason),
	FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
	FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
	FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
	FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
	FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
	FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
	FIELD(GUEST_ES_LIMIT, guest_es_limit),
	FIELD(GUEST_CS_LIMIT, guest_cs_limit),
	FIELD(GUEST_SS_LIMIT, guest_ss_limit),
	FIELD(GUEST_DS_LIMIT, guest_ds_limit),
	FIELD(GUEST_FS_LIMIT, guest_fs_limit),
	FIELD(GUEST_GS_LIMIT, guest_gs_limit),
	FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
	FIELD(GUEST_TR_LIMIT, guest_tr_limit),
	FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
	FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
	FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
	FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
	FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
	FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
	FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
	FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
	FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
	FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
	FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
	FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
	FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
	FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
	FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
	FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
	FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
	FIELD(CR0_READ_SHADOW, cr0_read_shadow),
	FIELD(CR4_READ_SHADOW, cr4_read_shadow),
	FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
	FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
	FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
	FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
	FIELD(EXIT_QUALIFICATION, exit_qualification),
	FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
	FIELD(GUEST_CR0, guest_cr0),
	FIELD(GUEST_CR3, guest_cr3),
	FIELD(GUEST_CR4, guest_cr4),
	FIELD(GUEST_ES_BASE, guest_es_base),
	FIELD(GUEST_CS_BASE, guest_cs_base),
	FIELD(GUEST_SS_BASE, guest_ss_base),
	FIELD(GUEST_DS_BASE, guest_ds_base),
	FIELD(GUEST_FS_BASE, guest_fs_base),
	FIELD(GUEST_GS_BASE, guest_gs_base),
	FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
	FIELD(GUEST_TR_BASE, guest_tr_base),
	FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
	FIELD(GUEST_IDTR_BASE, guest_idtr_base),
	FIELD(GUEST_DR7, guest_dr7),
	FIELD(GUEST_RSP, guest_rsp),
	FIELD(GUEST_RIP, guest_rip),
	FIELD(GUEST_RFLAGS, guest_rflags),
	FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
	FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
	FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
	FIELD(HOST_CR0, host_cr0),
	FIELD(HOST_CR3, host_cr3),
	FIELD(HOST_CR4, host_cr4),
	FIELD(HOST_FS_BASE, host_fs_base),
	FIELD(HOST_GS_BASE, host_gs_base),
	FIELD(HOST_TR_BASE, host_tr_base),
	FIELD(HOST_GDTR_BASE, host_gdtr_base),
	FIELD(HOST_IDTR_BASE, host_idtr_base),
	FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
	FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
	FIELD(HOST_RSP, host_rsp),
	FIELD(HOST_RIP, host_rip),
};
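/*
 * Illustrative note: FIELD64() registers both the full-width field and its
 * _HIGH alias, so e.g. vmcs_field_to_offset(IO_BITMAP_A_HIGH) below resolves
 * to offsetof(struct vmcs12, io_bitmap_a) + 4, i.e. the upper 32 bits of the
 * same u64 member (x86 being little-endian).
 */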
static inline short vmcs_field_to_offset(unsigned long field)
{
	BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);

	if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) ||
	    vmcs_field_to_offset_table[field] == 0)
		return -ENOENT;

	return vmcs_field_to_offset_table[field];
}

static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
{
	return to_vmx(vcpu)->nested.current_vmcs12;
}

static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
{
	struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
	if (is_error_page(page))
		return NULL;

	return page;
}

static void nested_release_page(struct page *page)
{
	kvm_release_page_dirty(page);
}

static void nested_release_page_clean(struct page *page)
{
	kvm_release_page_clean(page);
}

static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
static u64 construct_eptp(unsigned long root_hpa);
static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void);
static bool vmx_mpx_supported(void);
static bool vmx_xsaves_supported(void);
static int vmx_vm_has_apicv(struct kvm *kvm);
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg);
static void vmx_get_segment(struct kvm_vcpu *vcpu,
			    struct kvm_segment *var, int seg);
static bool guest_state_valid(struct kvm_vcpu *vcpu);
static u32 vmx_segment_access_rights(struct kvm_segment *var);
static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
static int alloc_identity_pagetable(struct kvm *kvm);

static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
/*
 * We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
 * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
 */
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
static DEFINE_PER_CPU(struct desc_ptr, host_gdt);

static unsigned long *vmx_io_bitmap_a;
static unsigned long *vmx_io_bitmap_b;
static unsigned long *vmx_msr_bitmap_legacy;
static unsigned long *vmx_msr_bitmap_longmode;
static unsigned long *vmx_msr_bitmap_legacy_x2apic;
static unsigned long *vmx_msr_bitmap_longmode_x2apic;
static unsigned long *vmx_msr_bitmap_nested;
static unsigned long *vmx_vmread_bitmap;
static unsigned long *vmx_vmwrite_bitmap;

static bool cpu_has_load_ia32_efer;
static bool cpu_has_load_perf_global_ctrl;

static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);

static struct vmcs_config {
	int size;
	int order;
	u32 revision_id;
	u32 pin_based_exec_ctrl;
	u32 cpu_based_exec_ctrl;
	u32 cpu_based_2nd_exec_ctrl;
	u32 vmexit_ctrl;
	u32 vmentry_ctrl;
} vmcs_config;

static struct vmx_capability {
	u32 ept;
	u32 vpid;
} vmx_capability;

#define VMX_SEGMENT_FIELD(seg)					\
	[VCPU_SREG_##seg] = {					\
		.selector = GUEST_##seg##_SELECTOR,		\
		.base = GUEST_##seg##_BASE,			\
		.limit = GUEST_##seg##_LIMIT,			\
		.ar_bytes = GUEST_##seg##_AR_BYTES,		\
	}

static const struct kvm_vmx_segment_field {
	unsigned selector;
	unsigned base;
	unsigned limit;
	unsigned ar_bytes;
} kvm_vmx_segment_fields[] = {
	VMX_SEGMENT_FIELD(CS),
	VMX_SEGMENT_FIELD(DS),
	VMX_SEGMENT_FIELD(ES),
	VMX_SEGMENT_FIELD(FS),
	VMX_SEGMENT_FIELD(GS),
	VMX_SEGMENT_FIELD(SS),
	VMX_SEGMENT_FIELD(TR),
	VMX_SEGMENT_FIELD(LDTR),
};

static u64 host_efer;

static void ept_save_pdptrs(struct kvm_vcpu *vcpu);

/*
 * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
 * away by decrementing the array size.
 */
static const u32 vmx_msr_index[] = {
#ifdef CONFIG_X86_64
	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
	MSR_EFER, MSR_TSC_AUX, MSR_STAR,
};
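/*
 * The is_*() helpers below decode the VM-exit interruption-information
 * field: bits 7:0 hold the vector, bits 10:8 the event type and bit 31 the
 * valid bit. Illustrative example: a page fault that delivered an error
 * code is typically reported as intr_info == 0x80000b0e; masking with the
 * vector, type and valid bits leaves 0x8000030e, which equals
 * INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK.
 */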
static inline bool is_page_fault(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
			     INTR_INFO_VALID_MASK)) ==
		(INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
}

static inline bool is_no_device(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
			     INTR_INFO_VALID_MASK)) ==
		(INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
}

static inline bool is_invalid_opcode(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
			     INTR_INFO_VALID_MASK)) ==
		(INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
}

static inline bool is_external_interrupt(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
		== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
}

static inline bool is_machine_check(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
			     INTR_INFO_VALID_MASK)) ==
		(INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
}

static inline bool cpu_has_vmx_msr_bitmap(void)
{
	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
}

static inline bool cpu_has_vmx_tpr_shadow(void)
{
	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
}

static inline bool vm_need_tpr_shadow(struct kvm *kvm)
{
	return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
}

static inline bool cpu_has_secondary_exec_ctrls(void)
{
	return vmcs_config.cpu_based_exec_ctrl &
		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
}

static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
}

static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
}

static inline bool cpu_has_vmx_apic_register_virt(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_APIC_REGISTER_VIRT;
}

static inline bool cpu_has_vmx_virtual_intr_delivery(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
}

static inline bool cpu_has_vmx_posted_intr(void)
{
	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
}

static inline bool cpu_has_vmx_apicv(void)
{
	return cpu_has_vmx_apic_register_virt() &&
		cpu_has_vmx_virtual_intr_delivery() &&
		cpu_has_vmx_posted_intr();
}

static inline bool cpu_has_vmx_flexpriority(void)
{
	return cpu_has_vmx_tpr_shadow() &&
		cpu_has_vmx_virtualize_apic_accesses();
}

static inline bool cpu_has_vmx_ept_execute_only(void)
{
	return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
}

static inline bool cpu_has_vmx_ept_2m_page(void)
{
	return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
}

static inline bool cpu_has_vmx_ept_1g_page(void)
{
	return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
}

static inline bool cpu_has_vmx_ept_4levels(void)
{
	return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
}
static inline bool cpu_has_vmx_ept_ad_bits(void)
{
	return vmx_capability.ept & VMX_EPT_AD_BIT;
}

static inline bool cpu_has_vmx_invept_context(void)
{
	return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
}

static inline bool cpu_has_vmx_invept_global(void)
{
	return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
}

static inline bool cpu_has_vmx_invvpid_single(void)
{
	return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
}

static inline bool cpu_has_vmx_invvpid_global(void)
{
	return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
}

static inline bool cpu_has_vmx_ept(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_ENABLE_EPT;
}

static inline bool cpu_has_vmx_unrestricted_guest(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_UNRESTRICTED_GUEST;
}

static inline bool cpu_has_vmx_ple(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_PAUSE_LOOP_EXITING;
}

static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
{
	return flexpriority_enabled && irqchip_in_kernel(kvm);
}

static inline bool cpu_has_vmx_vpid(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_ENABLE_VPID;
}

static inline bool cpu_has_vmx_rdtscp(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_RDTSCP;
}

static inline bool cpu_has_vmx_invpcid(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_ENABLE_INVPCID;
}

static inline bool cpu_has_virtual_nmis(void)
{
	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
}

static inline bool cpu_has_vmx_wbinvd_exit(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_WBINVD_EXITING;
}

static inline bool cpu_has_vmx_shadow_vmcs(void)
{
	u64 vmx_msr;
	rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
	/* check if the cpu supports writing r/o exit information fields */
	if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
		return false;

	return vmcs_config.cpu_based_2nd_exec_ctrl &
		SECONDARY_EXEC_SHADOW_VMCS;
}

static inline bool cpu_has_vmx_pml(void)
{
	return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
}

static inline bool report_flexpriority(void)
{
	return flexpriority_enabled;
}

static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
{
	return vmcs12->cpu_based_vm_exec_control & bit;
}

static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
{
	return (vmcs12->cpu_based_vm_exec_control &
			CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
		(vmcs12->secondary_vm_exec_control & bit);
}

static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
{
	return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
}

static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
{
	return vmcs12->pin_based_vm_exec_control &
		PIN_BASED_VMX_PREEMPTION_TIMER;
}

static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
{
	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
}

static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
{
	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES) &&
		vmx_xsaves_supported();
}

static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
{
	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
}

static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
{
	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
}

static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
{
	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
}

static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
{
	return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
}

static inline bool is_exception(u32 intr_info)
{
	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
		== (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
}

static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
			      u32 exit_intr_info,
			      unsigned long exit_qualification);
static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
			struct vmcs12 *vmcs12,
			u32 reason, unsigned long qualification);

static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
{
	int i;

	for (i = 0; i < vmx->nmsrs; ++i)
		if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
			return i;
	return -1;
}

static inline void __invvpid(int ext, u16 vpid, gva_t gva)
{
	struct {
		u64 vpid : 16;
		u64 rsvd : 48;
		u64 gva;
	} operand = { vpid, 0, gva };

	asm volatile (__ex(ASM_VMX_INVVPID)
		  /* CF==1 or ZF==1 --> rc = -1 */
		  "; ja 1f ; ud2 ; 1:"
		  : : "a"(&operand), "c"(ext) : "cc", "memory");
}

static inline void __invept(int ext, u64 eptp, gpa_t gpa)
{
	struct {
		u64 eptp, gpa;
	} operand = {eptp, gpa};

	asm volatile (__ex(ASM_VMX_INVEPT)
			/* CF==1 or ZF==1 --> rc = -1 */
			"; ja 1f ; ud2 ; 1:\n"
			: : "a" (&operand), "c" (ext) : "cc", "memory");
}

static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
{
	int i;

	i = __find_msr_index(vmx, msr);
	if (i >= 0)
		return &vmx->guest_msrs[i];
	return NULL;
}

static void vmcs_clear(struct vmcs *vmcs)
{
	u64 phys_addr = __pa(vmcs);
	u8 error;

	asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
		      : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
		      : "cc", "memory");
	if (error)
		printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
		       vmcs, phys_addr);
}

static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
{
	vmcs_clear(loaded_vmcs->vmcs);
	loaded_vmcs->cpu = -1;
	loaded_vmcs->launched = 0;
}

static void vmcs_load(struct vmcs *vmcs)
{
	u64 phys_addr = __pa(vmcs);
	u8 error;

	asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
			: "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
			: "cc", "memory");
	if (error)
		printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
		       vmcs, phys_addr);
}

#ifdef CONFIG_KEXEC
/*
 * This bitmap is used to indicate whether the vmclear
 * operation is enabled on all cpus. All disabled by
 * default.
 */
static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;

static inline void crash_enable_local_vmclear(int cpu)
{
	cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
}

static inline void crash_disable_local_vmclear(int cpu)
{
	cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
}

static inline int crash_local_vmclear_enabled(int cpu)
{
	return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
}

static void crash_vmclear_local_loaded_vmcss(void)
{
	int cpu = raw_smp_processor_id();
	struct loaded_vmcs *v;

	if (!crash_local_vmclear_enabled(cpu))
		return;

	list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
			    loaded_vmcss_on_cpu_link)
		vmcs_clear(v->vmcs);
}
#else
static inline void crash_enable_local_vmclear(int cpu) { }
static inline void crash_disable_local_vmclear(int cpu) { }
#endif /* CONFIG_KEXEC */

static void __loaded_vmcs_clear(void *arg)
{
	struct loaded_vmcs *loaded_vmcs = arg;
	int cpu = raw_smp_processor_id();

	if (loaded_vmcs->cpu != cpu)
		return; /* vcpu migration can race with cpu offline */
	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
		per_cpu(current_vmcs, cpu) = NULL;
	crash_disable_local_vmclear(cpu);
	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);

	/*
	 * We should ensure that updating loaded_vmcs->loaded_vmcss_on_cpu_link
	 * happens before setting loaded_vmcs->cpu to -1, which is done in
	 * loaded_vmcs_init. Otherwise, another cpu could see cpu == -1 first
	 * and then add the vmcs into the percpu list before it is deleted.
	 */
	smp_wmb();

	loaded_vmcs_init(loaded_vmcs);
	crash_enable_local_vmclear(cpu);
}

static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
{
	int cpu = loaded_vmcs->cpu;

	if (cpu != -1)
		smp_call_function_single(cpu,
			 __loaded_vmcs_clear, loaded_vmcs, 1);
}

static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
{
	if (vmx->vpid == 0)
		return;

	if (cpu_has_vmx_invvpid_single())
		__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
}

static inline void vpid_sync_vcpu_global(void)
{
	if (cpu_has_vmx_invvpid_global())
		__invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
}

static inline void vpid_sync_context(struct vcpu_vmx *vmx)
{
	if (cpu_has_vmx_invvpid_single())
		vpid_sync_vcpu_single(vmx);
	else
		vpid_sync_vcpu_global();
}

static inline void ept_sync_global(void)
{
	if (cpu_has_vmx_invept_global())
		__invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
}

static inline void ept_sync_context(u64 eptp)
{
	if (enable_ept) {
		if (cpu_has_vmx_invept_context())
			__invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
		else
			ept_sync_global();
	}
}

static __always_inline unsigned long vmcs_readl(unsigned long field)
{
	unsigned long value;

	asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
		      : "=a"(value) : "d"(field) : "cc");
	return value;
}

static __always_inline u16 vmcs_read16(unsigned long field)
{
	return vmcs_readl(field);
}

static __always_inline u32 vmcs_read32(unsigned long field)
{
	return vmcs_readl(field);
}

static __always_inline u64 vmcs_read64(unsigned long field)
{
#ifdef CONFIG_X86_64
	return vmcs_readl(field);
#else
	return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
#endif
}

static noinline void vmwrite_error(unsigned long field, unsigned long value)
{
	printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
	       field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
	dump_stack();
}

static void vmcs_writel(unsigned long field, unsigned long value)
{
	u8 error;

	asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
		       : "=q"(error) : "a"(value), "d"(field) : "cc");
	if (unlikely(error))
		vmwrite_error(field, value);
}

static void vmcs_write16(unsigned long field, u16 value)
{
	vmcs_writel(field, value);
}

static void vmcs_write32(unsigned long field, u32 value)
{
	vmcs_writel(field, value);
}

static void vmcs_write64(unsigned long field, u64 value)
{
	vmcs_writel(field, value);
#ifndef CONFIG_X86_64
	asm volatile ("");
	vmcs_writel(field+1, value >> 32);
#endif
}

static void vmcs_clear_bits(unsigned long field, u32 mask)
{
	vmcs_writel(field, vmcs_readl(field) & ~mask);
}

static void vmcs_set_bits(unsigned long field, u32 mask)
{
	vmcs_writel(field, vmcs_readl(field) | mask);
}

static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
{
	vmcs_write32(VM_ENTRY_CONTROLS, val);
	vmx->vm_entry_controls_shadow = val;
}

static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
{
	if (vmx->vm_entry_controls_shadow != val)
		vm_entry_controls_init(vmx, val);
}

static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
{
	return vmx->vm_entry_controls_shadow;
}


static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
{
	vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
}

static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
{
	vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
}

static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
{
	vmcs_write32(VM_EXIT_CONTROLS, val);
	vmx->vm_exit_controls_shadow = val;
}

static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
{
	if (vmx->vm_exit_controls_shadow != val)
		vm_exit_controls_init(vmx, val);
}

static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
{
	return vmx->vm_exit_controls_shadow;
}


static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
{
	vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
}

static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
{
	vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
}

static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
{
	vmx->segment_cache.bitmask = 0;
}

static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
				       unsigned field)
{
	bool ret;
	u32 mask = 1 << (seg * SEG_FIELD_NR + field);

	if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
		vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
		vmx->segment_cache.bitmask = 0;
	}
	ret = vmx->segment_cache.bitmask & mask;
	vmx->segment_cache.bitmask |= mask;
	return ret;
}

static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
{
	u16 *p = &vmx->segment_cache.seg[seg].selector;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
		*p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
	return *p;
}

static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
{
	ulong *p = &vmx->segment_cache.seg[seg].base;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
		*p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
	return *p;
}

static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
{
	u32 *p = &vmx->segment_cache.seg[seg].limit;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
		*p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
	return *p;
}

static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
{
	u32 *p = &vmx->segment_cache.seg[seg].ar;

	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
		*p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
	return *p;
}
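/*
 * Note on the cache above: segment_cache.bitmask keeps SEG_FIELD_NR (4) bits
 * per segment, one per cached field. vmx_segment_cache_test_set() marks a
 * field as cached and reports whether it already was, so each segment field
 * is VMREAD at most once until the cache is invalidated (either via
 * vmx_segment_cache_clear() or because VCPU_EXREG_SEGMENTS was dropped from
 * regs_avail).
 */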
static void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
	u32 eb;

	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
	     (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR);
	if ((vcpu->guest_debug &
	     (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
	    (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
		eb |= 1u << BP_VECTOR;
	if (to_vmx(vcpu)->rmode.vm86_active)
		eb = ~0;
	if (enable_ept)
		eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
	if (vcpu->fpu_active)
		eb &= ~(1u << NM_VECTOR);

	/* When we are running a nested L2 guest and L1 specified for it a
	 * certain exception bitmap, we must trap the same exceptions and pass
	 * them to L1. When running L2, we will only handle the exceptions
	 * specified above if L1 did not want them.
	 */
	if (is_guest_mode(vcpu))
		eb |= get_vmcs12(vcpu)->exception_bitmap;

	vmcs_write32(EXCEPTION_BITMAP, eb);
}

static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
		unsigned long entry, unsigned long exit)
{
	vm_entry_controls_clearbit(vmx, entry);
	vm_exit_controls_clearbit(vmx, exit);
}

static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
{
	unsigned i;
	struct msr_autoload *m = &vmx->msr_autoload;

	switch (msr) {
	case MSR_EFER:
		if (cpu_has_load_ia32_efer) {
			clear_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_EFER,
					VM_EXIT_LOAD_IA32_EFER);
			return;
		}
		break;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (cpu_has_load_perf_global_ctrl) {
			clear_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
			return;
		}
		break;
	}

	for (i = 0; i < m->nr; ++i)
		if (m->guest[i].index == msr)
			break;

	if (i == m->nr)
		return;
	--m->nr;
	m->guest[i] = m->guest[m->nr];
	m->host[i] = m->host[m->nr];
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
}

static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
		unsigned long entry, unsigned long exit,
		unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
		u64 guest_val, u64 host_val)
{
	vmcs_write64(guest_val_vmcs, guest_val);
	vmcs_write64(host_val_vmcs, host_val);
	vm_entry_controls_setbit(vmx, entry);
	vm_exit_controls_setbit(vmx, exit);
}

static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
				  u64 guest_val, u64 host_val)
{
	unsigned i;
	struct msr_autoload *m = &vmx->msr_autoload;

	switch (msr) {
	case MSR_EFER:
		if (cpu_has_load_ia32_efer) {
			add_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_EFER,
					VM_EXIT_LOAD_IA32_EFER,
					GUEST_IA32_EFER,
					HOST_IA32_EFER,
					guest_val, host_val);
			return;
		}
		break;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (cpu_has_load_perf_global_ctrl) {
			add_atomic_switch_msr_special(vmx,
					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
					GUEST_IA32_PERF_GLOBAL_CTRL,
					HOST_IA32_PERF_GLOBAL_CTRL,
					guest_val, host_val);
			return;
		}
		break;
	case MSR_IA32_PEBS_ENABLE:
		/* PEBS needs a quiescent period after being disabled (to write
		 * a record). Disabling PEBS through VMX MSR swapping doesn't
		 * provide that period, so a CPU could write host's record into
		 * guest's memory.
		 */
		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
	}

	for (i = 0; i < m->nr; ++i)
		if (m->guest[i].index == msr)
			break;

	if (i == NR_AUTOLOAD_MSRS) {
		printk_once(KERN_WARNING "Not enough msr switch entries. "
				"Can't add msr %x\n", msr);
		return;
	} else if (i == m->nr) {
		++m->nr;
		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
	}

	m->guest[i].index = msr;
	m->guest[i].value = guest_val;
	m->host[i].index = msr;
	m->host[i].value = host_val;
}
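/*
 * Behavioural sketch of the autoload area (per the VMX architecture, not
 * specific to this file): the CPU loads every m->guest[] entry on VM-entry
 * and every m->host[] entry on VM-exit, so the two arrays are kept in
 * lockstep and slot i always describes the same MSR in both directions.
 * MSRs that have dedicated VM-entry/VM-exit controls (EFER, PERF_GLOBAL_CTRL)
 * bypass the arrays via the *_special() helpers above.
 */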
static void reload_tss(void)
{
	/*
	 * VT restores TR but not its size.  Useless.
	 */
	struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
	struct desc_struct *descs;

	descs = (void *)gdt->address;
	descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
	load_TR_desc();
}

static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
{
	u64 guest_efer = vmx->vcpu.arch.efer;
	u64 ignore_bits = 0;

	if (!enable_ept) {
		/*
		 * NX is needed to handle CR0.WP=1, CR4.SMEP=1.  Testing
		 * host CPUID is more efficient than testing guest CPUID
		 * or CR4.  Host SMEP is anyway a requirement for guest SMEP.
		 */
		if (boot_cpu_has(X86_FEATURE_SMEP))
			guest_efer |= EFER_NX;
		else if (!(guest_efer & EFER_NX))
			ignore_bits |= EFER_NX;
	}

	/*
	 * LMA and LME handled by hardware; SCE meaningless outside long mode.
	 */
	ignore_bits |= EFER_SCE;
#ifdef CONFIG_X86_64
	ignore_bits |= EFER_LMA | EFER_LME;
	/* SCE is meaningful only in long mode on Intel */
	if (guest_efer & EFER_LMA)
		ignore_bits &= ~(u64)EFER_SCE;
#endif

	clear_atomic_switch_msr(vmx, MSR_EFER);

	/*
	 * On EPT, we can't emulate NX, so we must switch EFER atomically.
	 * On CPUs that support "load IA32_EFER", always switch EFER
	 * atomically, since it's faster than switching it manually.
	 */
	if (cpu_has_load_ia32_efer ||
	    (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
		if (!(guest_efer & EFER_LMA))
			guest_efer &= ~EFER_LME;
		if (guest_efer != host_efer)
			add_atomic_switch_msr(vmx, MSR_EFER,
					      guest_efer, host_efer);
		return false;
	} else {
		guest_efer &= ~ignore_bits;
		guest_efer |= host_efer & ignore_bits;

		vmx->guest_msrs[efer_offset].data = guest_efer;
		vmx->guest_msrs[efer_offset].mask = ~ignore_bits;

		return true;
	}
}

static unsigned long segment_base(u16 selector)
{
	struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
	struct desc_struct *d;
	unsigned long table_base;
	unsigned long v;

	if (!(selector & ~3))
		return 0;

	table_base = gdt->address;

	if (selector & 4) {           /* from ldt */
		u16 ldt_selector = kvm_read_ldt();

		if (!(ldt_selector & ~3))
			return 0;

		table_base = segment_base(ldt_selector);
	}
	d = (struct desc_struct *)(table_base + (selector & ~7));
	v = get_desc_base(d);
#ifdef CONFIG_X86_64
	if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
		v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
#endif
	return v;
}

static inline unsigned long kvm_read_tr_base(void)
{
	u16 tr;
	asm("str %0" : "=g"(tr));
	return segment_base(tr);
}

static void vmx_save_host_state(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int i;

	if (vmx->host_state.loaded)
		return;

	vmx->host_state.loaded = 1;
	/*
	 * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
	 * allow segment selectors with cpl > 0 or ti == 1.
	 */
	vmx->host_state.ldt_sel = kvm_read_ldt();
	vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
	savesegment(fs, vmx->host_state.fs_sel);
	if (!(vmx->host_state.fs_sel & 7)) {
		vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
		vmx->host_state.fs_reload_needed = 0;
	} else {
		vmcs_write16(HOST_FS_SELECTOR, 0);
		vmx->host_state.fs_reload_needed = 1;
	}
	savesegment(gs, vmx->host_state.gs_sel);
	if (!(vmx->host_state.gs_sel & 7))
		vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
	else {
		vmcs_write16(HOST_GS_SELECTOR, 0);
		vmx->host_state.gs_ldt_reload_needed = 1;
	}

#ifdef CONFIG_X86_64
	savesegment(ds, vmx->host_state.ds_sel);
	savesegment(es, vmx->host_state.es_sel);
#endif

#ifdef CONFIG_X86_64
	vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
	vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
#else
	vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
	vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
#endif

#ifdef CONFIG_X86_64
	rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
	if (is_long_mode(&vmx->vcpu))
		wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
	if (boot_cpu_has(X86_FEATURE_MPX))
		rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
	for (i = 0; i < vmx->save_nmsrs; ++i)
		kvm_set_shared_msr(vmx->guest_msrs[i].index,
				   vmx->guest_msrs[i].data,
				   vmx->guest_msrs[i].mask);
}

static void __vmx_load_host_state(struct vcpu_vmx *vmx)
{
	if (!vmx->host_state.loaded)
		return;

	++vmx->vcpu.stat.host_state_reload;
	vmx->host_state.loaded = 0;
#ifdef CONFIG_X86_64
	if (is_long_mode(&vmx->vcpu))
		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
#endif
	if (vmx->host_state.gs_ldt_reload_needed) {
		kvm_load_ldt(vmx->host_state.ldt_sel);
#ifdef CONFIG_X86_64
		load_gs_index(vmx->host_state.gs_sel);
#else
		loadsegment(gs, vmx->host_state.gs_sel);
#endif
	}
	if (vmx->host_state.fs_reload_needed)
		loadsegment(fs, vmx->host_state.fs_sel);
#ifdef CONFIG_X86_64
	if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
		loadsegment(ds, vmx->host_state.ds_sel);
		loadsegment(es, vmx->host_state.es_sel);
	}
#endif
	reload_tss();
#ifdef CONFIG_X86_64
	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
#endif
	if (vmx->host_state.msr_host_bndcfgs)
		wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
	/*
	 * If the FPU is not active (through the host task or
	 * the guest vcpu), then restore the cr0.TS bit.
	 */
	if (!user_has_fpu() && !vmx->vcpu.guest_fpu_loaded)
		stts();
	load_gdt(this_cpu_ptr(&host_gdt));
}

static void vmx_load_host_state(struct vcpu_vmx *vmx)
{
	preempt_disable();
	__vmx_load_host_state(vmx);
	preempt_enable();
}

/*
 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
 * vcpu mutex is already taken.
1918 */ 1919static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1920{ 1921 struct vcpu_vmx *vmx = to_vmx(vcpu); 1922 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 1923 1924 if (!vmm_exclusive) 1925 kvm_cpu_vmxon(phys_addr); 1926 else if (vmx->loaded_vmcs->cpu != cpu) 1927 loaded_vmcs_clear(vmx->loaded_vmcs); 1928 1929 if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) { 1930 per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs; 1931 vmcs_load(vmx->loaded_vmcs->vmcs); 1932 } 1933 1934 if (vmx->loaded_vmcs->cpu != cpu) { 1935 struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); 1936 unsigned long sysenter_esp; 1937 1938 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 1939 local_irq_disable(); 1940 crash_disable_local_vmclear(cpu); 1941 1942 /* 1943 * Read loaded_vmcs->cpu should be before fetching 1944 * loaded_vmcs->loaded_vmcss_on_cpu_link. 1945 * See the comments in __loaded_vmcs_clear(). 1946 */ 1947 smp_rmb(); 1948 1949 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, 1950 &per_cpu(loaded_vmcss_on_cpu, cpu)); 1951 crash_enable_local_vmclear(cpu); 1952 local_irq_enable(); 1953 1954 /* 1955 * Linux uses per-cpu TSS and GDT, so set these when switching 1956 * processors. 1957 */ 1958 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ 1959 vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */ 1960 1961 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 1962 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ 1963 vmx->loaded_vmcs->cpu = cpu; 1964 } 1965} 1966 1967static void vmx_vcpu_put(struct kvm_vcpu *vcpu) 1968{ 1969 __vmx_load_host_state(to_vmx(vcpu)); 1970 if (!vmm_exclusive) { 1971 __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs); 1972 vcpu->cpu = -1; 1973 kvm_cpu_vmxoff(); 1974 } 1975} 1976 1977static void vmx_fpu_activate(struct kvm_vcpu *vcpu) 1978{ 1979 ulong cr0; 1980 1981 if (vcpu->fpu_active) 1982 return; 1983 vcpu->fpu_active = 1; 1984 cr0 = vmcs_readl(GUEST_CR0); 1985 cr0 &= ~(X86_CR0_TS | X86_CR0_MP); 1986 cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP); 1987 vmcs_writel(GUEST_CR0, cr0); 1988 update_exception_bitmap(vcpu); 1989 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; 1990 if (is_guest_mode(vcpu)) 1991 vcpu->arch.cr0_guest_owned_bits &= 1992 ~get_vmcs12(vcpu)->cr0_guest_host_mask; 1993 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 1994} 1995 1996static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); 1997 1998/* 1999 * Return the cr0 value that a nested guest would read. This is a combination 2000 * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by 2001 * its hypervisor (cr0_read_shadow). 2002 */ 2003static inline unsigned long nested_read_cr0(struct vmcs12 *fields) 2004{ 2005 return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) | 2006 (fields->cr0_read_shadow & fields->cr0_guest_host_mask); 2007} 2008static inline unsigned long nested_read_cr4(struct vmcs12 *fields) 2009{ 2010 return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) | 2011 (fields->cr4_read_shadow & fields->cr4_guest_host_mask); 2012} 2013 2014static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) 2015{ 2016 /* Note that there is no vcpu->fpu_active = 0 here. The caller must 2017 * set this *before* calling this function. 
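	 * Deactivation forces CR0.TS and CR0.MP on in GUEST_CR0 and removes
	 * TS from the guest-owned set, so a subsequent guest FPU access traps
	 * to KVM, which can then re-activate the FPU.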
2018 */ 2019 vmx_decache_cr0_guest_bits(vcpu); 2020 vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP); 2021 update_exception_bitmap(vcpu); 2022 vcpu->arch.cr0_guest_owned_bits = 0; 2023 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 2024 if (is_guest_mode(vcpu)) { 2025 /* 2026 * L1's specified read shadow might not contain the TS bit, 2027 * so now that we turned on shadowing of this bit, we need to 2028 * set this bit of the shadow. Like in nested_vmx_run we need 2029 * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet 2030 * up-to-date here because we just decached cr0.TS (and we'll 2031 * only update vmcs12->guest_cr0 on nested exit). 2032 */ 2033 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2034 vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) | 2035 (vcpu->arch.cr0 & X86_CR0_TS); 2036 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); 2037 } else 2038 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); 2039} 2040 2041static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 2042{ 2043 unsigned long rflags, save_rflags; 2044 2045 if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) { 2046 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); 2047 rflags = vmcs_readl(GUEST_RFLAGS); 2048 if (to_vmx(vcpu)->rmode.vm86_active) { 2049 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 2050 save_rflags = to_vmx(vcpu)->rmode.save_rflags; 2051 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 2052 } 2053 to_vmx(vcpu)->rflags = rflags; 2054 } 2055 return to_vmx(vcpu)->rflags; 2056} 2057 2058static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 2059{ 2060 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); 2061 to_vmx(vcpu)->rflags = rflags; 2062 if (to_vmx(vcpu)->rmode.vm86_active) { 2063 to_vmx(vcpu)->rmode.save_rflags = rflags; 2064 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 2065 } 2066 vmcs_writel(GUEST_RFLAGS, rflags); 2067} 2068 2069static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) 2070{ 2071 u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 2072 int ret = 0; 2073 2074 if (interruptibility & GUEST_INTR_STATE_STI) 2075 ret |= KVM_X86_SHADOW_INT_STI; 2076 if (interruptibility & GUEST_INTR_STATE_MOV_SS) 2077 ret |= KVM_X86_SHADOW_INT_MOV_SS; 2078 2079 return ret; 2080} 2081 2082static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) 2083{ 2084 u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 2085 u32 interruptibility = interruptibility_old; 2086 2087 interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS); 2088 2089 if (mask & KVM_X86_SHADOW_INT_MOV_SS) 2090 interruptibility |= GUEST_INTR_STATE_MOV_SS; 2091 else if (mask & KVM_X86_SHADOW_INT_STI) 2092 interruptibility |= GUEST_INTR_STATE_STI; 2093 2094 if ((interruptibility != interruptibility_old)) 2095 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility); 2096} 2097 2098static void skip_emulated_instruction(struct kvm_vcpu *vcpu) 2099{ 2100 unsigned long rip; 2101 2102 rip = kvm_rip_read(vcpu); 2103 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 2104 kvm_rip_write(vcpu, rip); 2105 2106 /* skipping an emulated instruction also counts */ 2107 vmx_set_interrupt_shadow(vcpu, 0); 2108} 2109 2110/* 2111 * KVM wants to inject page-faults which it got to the guest. This function 2112 * checks whether in a nested guest, we need to inject them to L1 or L2. 
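 * If the exception's vector is set in L1's exception bitmap in vmcs12, the
 * event is reflected as a nested VM-exit to L1 instead of being injected into
 * L2, and the caller skips the normal injection path.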
2113 */ 2114static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr) 2115{ 2116 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2117 2118 if (!(vmcs12->exception_bitmap & (1u << nr))) 2119 return 0; 2120 2121 nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, 2122 vmcs_read32(VM_EXIT_INTR_INFO), 2123 vmcs_readl(EXIT_QUALIFICATION)); 2124 return 1; 2125} 2126 2127static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 2128 bool has_error_code, u32 error_code, 2129 bool reinject) 2130{ 2131 struct vcpu_vmx *vmx = to_vmx(vcpu); 2132 u32 intr_info = nr | INTR_INFO_VALID_MASK; 2133 2134 if (!reinject && is_guest_mode(vcpu) && 2135 nested_vmx_check_exception(vcpu, nr)) 2136 return; 2137 2138 if (has_error_code) { 2139 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); 2140 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 2141 } 2142 2143 if (vmx->rmode.vm86_active) { 2144 int inc_eip = 0; 2145 if (kvm_exception_is_soft(nr)) 2146 inc_eip = vcpu->arch.event_exit_inst_len; 2147 if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE) 2148 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 2149 return; 2150 } 2151 2152 if (kvm_exception_is_soft(nr)) { 2153 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2154 vmx->vcpu.arch.event_exit_inst_len); 2155 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 2156 } else 2157 intr_info |= INTR_TYPE_HARD_EXCEPTION; 2158 2159 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 2160} 2161 2162static bool vmx_rdtscp_supported(void) 2163{ 2164 return cpu_has_vmx_rdtscp(); 2165} 2166 2167static bool vmx_invpcid_supported(void) 2168{ 2169 return cpu_has_vmx_invpcid() && enable_ept; 2170} 2171 2172/* 2173 * Swap MSR entry in host/guest MSR entry array. 2174 */ 2175static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) 2176{ 2177 struct shared_msr_entry tmp; 2178 2179 tmp = vmx->guest_msrs[to]; 2180 vmx->guest_msrs[to] = vmx->guest_msrs[from]; 2181 vmx->guest_msrs[from] = tmp; 2182} 2183 2184static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) 2185{ 2186 unsigned long *msr_bitmap; 2187 2188 if (is_guest_mode(vcpu)) 2189 msr_bitmap = vmx_msr_bitmap_nested; 2190 else if (irqchip_in_kernel(vcpu->kvm) && 2191 apic_x2apic_mode(vcpu->arch.apic)) { 2192 if (is_long_mode(vcpu)) 2193 msr_bitmap = vmx_msr_bitmap_longmode_x2apic; 2194 else 2195 msr_bitmap = vmx_msr_bitmap_legacy_x2apic; 2196 } else { 2197 if (is_long_mode(vcpu)) 2198 msr_bitmap = vmx_msr_bitmap_longmode; 2199 else 2200 msr_bitmap = vmx_msr_bitmap_legacy; 2201 } 2202 2203 vmcs_write64(MSR_BITMAP, __pa(msr_bitmap)); 2204} 2205 2206/* 2207 * Set up the vmcs to automatically save and restore system 2208 * msrs. Don't touch the 64-bit msrs if the guest is in legacy 2209 * mode, as fiddling with msrs is very expensive. 2210 */ 2211static void setup_msrs(struct vcpu_vmx *vmx) 2212{ 2213 int save_nmsrs, index; 2214 2215 save_nmsrs = 0; 2216#ifdef CONFIG_X86_64 2217 if (is_long_mode(&vmx->vcpu)) { 2218 index = __find_msr_index(vmx, MSR_SYSCALL_MASK); 2219 if (index >= 0) 2220 move_msr_up(vmx, index, save_nmsrs++); 2221 index = __find_msr_index(vmx, MSR_LSTAR); 2222 if (index >= 0) 2223 move_msr_up(vmx, index, save_nmsrs++); 2224 index = __find_msr_index(vmx, MSR_CSTAR); 2225 if (index >= 0) 2226 move_msr_up(vmx, index, save_nmsrs++); 2227 index = __find_msr_index(vmx, MSR_TSC_AUX); 2228 if (index >= 0 && vmx->rdtscp_enabled) 2229 move_msr_up(vmx, index, save_nmsrs++); 2230 /* 2231 * MSR_STAR is only needed on long mode guests, and only 2232 * if efer.sce is enabled. 
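		 * Every entry moved to the front of guest_msrs[] in this
		 * function is later switched via kvm_set_shared_msr() in
		 * vmx_save_host_state() rather than through the VMCS
		 * auto-load lists.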
2233 */ 2234 index = __find_msr_index(vmx, MSR_STAR); 2235 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE)) 2236 move_msr_up(vmx, index, save_nmsrs++); 2237 } 2238#endif 2239 index = __find_msr_index(vmx, MSR_EFER); 2240 if (index >= 0 && update_transition_efer(vmx, index)) 2241 move_msr_up(vmx, index, save_nmsrs++); 2242 2243 vmx->save_nmsrs = save_nmsrs; 2244 2245 if (cpu_has_vmx_msr_bitmap()) 2246 vmx_set_msr_bitmap(&vmx->vcpu); 2247} 2248 2249/* 2250 * reads and returns guest's timestamp counter "register" 2251 * guest_tsc = host_tsc + tsc_offset -- 21.3 2252 */ 2253static u64 guest_read_tsc(void) 2254{ 2255 u64 host_tsc, tsc_offset; 2256 2257 rdtscll(host_tsc); 2258 tsc_offset = vmcs_read64(TSC_OFFSET); 2259 return host_tsc + tsc_offset; 2260} 2261 2262/* 2263 * Like guest_read_tsc, but always returns L1's notion of the timestamp 2264 * counter, even if a nested guest (L2) is currently running. 2265 */ 2266static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) 2267{ 2268 u64 tsc_offset; 2269 2270 tsc_offset = is_guest_mode(vcpu) ? 2271 to_vmx(vcpu)->nested.vmcs01_tsc_offset : 2272 vmcs_read64(TSC_OFFSET); 2273 return host_tsc + tsc_offset; 2274} 2275 2276/* 2277 * Engage any workarounds for mis-matched TSC rates. Currently limited to 2278 * software catchup for faster rates on slower CPUs. 2279 */ 2280static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) 2281{ 2282 if (!scale) 2283 return; 2284 2285 if (user_tsc_khz > tsc_khz) { 2286 vcpu->arch.tsc_catchup = 1; 2287 vcpu->arch.tsc_always_catchup = 1; 2288 } else 2289 WARN(1, "user requested TSC rate below hardware speed\n"); 2290} 2291 2292static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu) 2293{ 2294 return vmcs_read64(TSC_OFFSET); 2295} 2296 2297/* 2298 * writes 'offset' into guest's timestamp counter offset register 2299 */ 2300static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) 2301{ 2302 if (is_guest_mode(vcpu)) { 2303 /* 2304 * We're here if L1 chose not to trap WRMSR to TSC. According 2305 * to the spec, this should set L1's TSC; The offset that L1 2306 * set for L2 remains unchanged, and still needs to be added 2307 * to the newly set TSC to get L2's TSC. 2308 */ 2309 struct vmcs12 *vmcs12; 2310 to_vmx(vcpu)->nested.vmcs01_tsc_offset = offset; 2311 /* recalculate vmcs02.TSC_OFFSET: */ 2312 vmcs12 = get_vmcs12(vcpu); 2313 vmcs_write64(TSC_OFFSET, offset + 2314 (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ? 
2315 vmcs12->tsc_offset : 0)); 2316 } else { 2317 trace_kvm_write_tsc_offset(vcpu->vcpu_id, 2318 vmcs_read64(TSC_OFFSET), offset); 2319 vmcs_write64(TSC_OFFSET, offset); 2320 } 2321} 2322 2323static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) 2324{ 2325 u64 offset = vmcs_read64(TSC_OFFSET); 2326 2327 vmcs_write64(TSC_OFFSET, offset + adjustment); 2328 if (is_guest_mode(vcpu)) { 2329 /* Even when running L2, the adjustment needs to apply to L1 */ 2330 to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment; 2331 } else 2332 trace_kvm_write_tsc_offset(vcpu->vcpu_id, offset, 2333 offset + adjustment); 2334} 2335 2336static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) 2337{ 2338 return target_tsc - native_read_tsc(); 2339} 2340 2341static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu) 2342{ 2343 struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0); 2344 return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31))); 2345} 2346 2347/* 2348 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX 2349 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for 2350 * all guests if the "nested" module option is off, and can also be disabled 2351 * for a single guest by disabling its VMX cpuid bit. 2352 */ 2353static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) 2354{ 2355 return nested && guest_cpuid_has_vmx(vcpu); 2356} 2357 2358/* 2359 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be 2360 * returned for the various VMX controls MSRs when nested VMX is enabled. 2361 * The same values should also be used to verify that vmcs12 control fields are 2362 * valid during nested entry from L1 to L2. 2363 * Each of these control msrs has a low and high 32-bit half: A low bit is on 2364 * if the corresponding bit in the (32-bit) control field *must* be on, and a 2365 * bit in the high half is on if the corresponding bit in the control field 2366 * may be on. See also vmx_control_verify(). 2367 */ 2368static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) 2369{ 2370 /* 2371 * Note that as a general rule, the high half of the MSRs (bits in 2372 * the control fields which may be 1) should be initialized by the 2373 * intersection of the underlying hardware's MSR (i.e., features which 2374 * can be supported) and the list of features we want to expose - 2375 * because they are known to be properly supported in our code. 2376 * Also, usually, the low half of the MSRs (bits which must be 1) can 2377 * be set to 0, meaning that L1 may turn off any of these bits. The 2378 * reason is that if one of these bits is necessary, it will appear 2379 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control 2380 * fields of vmcs01 and vmcs02, will turn these bits off - and 2381 * nested_vmx_exit_handled() will not pass related exits to L1. 2382 * These rules have exceptions below. 
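	 * For example, the pin-based low word is forced to contain
	 * PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR below, and each high word is
	 * masked down to the feature bits the nested code actually supports.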
2383 */ 2384 2385 /* pin-based controls */ 2386 rdmsr(MSR_IA32_VMX_PINBASED_CTLS, 2387 vmx->nested.nested_vmx_pinbased_ctls_low, 2388 vmx->nested.nested_vmx_pinbased_ctls_high); 2389 vmx->nested.nested_vmx_pinbased_ctls_low |= 2390 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 2391 vmx->nested.nested_vmx_pinbased_ctls_high &= 2392 PIN_BASED_EXT_INTR_MASK | 2393 PIN_BASED_NMI_EXITING | 2394 PIN_BASED_VIRTUAL_NMIS; 2395 vmx->nested.nested_vmx_pinbased_ctls_high |= 2396 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 2397 PIN_BASED_VMX_PREEMPTION_TIMER; 2398 if (vmx_vm_has_apicv(vmx->vcpu.kvm)) 2399 vmx->nested.nested_vmx_pinbased_ctls_high |= 2400 PIN_BASED_POSTED_INTR; 2401 2402 /* exit controls */ 2403 rdmsr(MSR_IA32_VMX_EXIT_CTLS, 2404 vmx->nested.nested_vmx_exit_ctls_low, 2405 vmx->nested.nested_vmx_exit_ctls_high); 2406 vmx->nested.nested_vmx_exit_ctls_low = 2407 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 2408 2409 vmx->nested.nested_vmx_exit_ctls_high &= 2410#ifdef CONFIG_X86_64 2411 VM_EXIT_HOST_ADDR_SPACE_SIZE | 2412#endif 2413 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT; 2414 vmx->nested.nested_vmx_exit_ctls_high |= 2415 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 2416 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 2417 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT; 2418 2419 if (vmx_mpx_supported()) 2420 vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; 2421 2422 /* We support free control of debug control saving. */ 2423 vmx->nested.nested_vmx_true_exit_ctls_low = 2424 vmx->nested.nested_vmx_exit_ctls_low & 2425 ~VM_EXIT_SAVE_DEBUG_CONTROLS; 2426 2427 /* entry controls */ 2428 rdmsr(MSR_IA32_VMX_ENTRY_CTLS, 2429 vmx->nested.nested_vmx_entry_ctls_low, 2430 vmx->nested.nested_vmx_entry_ctls_high); 2431 vmx->nested.nested_vmx_entry_ctls_low = 2432 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 2433 vmx->nested.nested_vmx_entry_ctls_high &= 2434#ifdef CONFIG_X86_64 2435 VM_ENTRY_IA32E_MODE | 2436#endif 2437 VM_ENTRY_LOAD_IA32_PAT; 2438 vmx->nested.nested_vmx_entry_ctls_high |= 2439 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER); 2440 if (vmx_mpx_supported()) 2441 vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; 2442 2443 /* We support free control of debug control loading. */ 2444 vmx->nested.nested_vmx_true_entry_ctls_low = 2445 vmx->nested.nested_vmx_entry_ctls_low & 2446 ~VM_ENTRY_LOAD_DEBUG_CONTROLS; 2447 2448 /* cpu-based controls */ 2449 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, 2450 vmx->nested.nested_vmx_procbased_ctls_low, 2451 vmx->nested.nested_vmx_procbased_ctls_high); 2452 vmx->nested.nested_vmx_procbased_ctls_low = 2453 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 2454 vmx->nested.nested_vmx_procbased_ctls_high &= 2455 CPU_BASED_VIRTUAL_INTR_PENDING | 2456 CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | 2457 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 2458 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | 2459 CPU_BASED_CR3_STORE_EXITING | 2460#ifdef CONFIG_X86_64 2461 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | 2462#endif 2463 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 2464 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING | 2465 CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING | 2466 CPU_BASED_PAUSE_EXITING | CPU_BASED_TPR_SHADOW | 2467 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 2468 /* 2469 * We can allow some features even when not supported by the 2470 * hardware. For example, L1 can specify an MSR bitmap - and we 2471 * can use it to avoid exits to L1 - even when L0 runs L2 2472 * without MSR bitmaps. 
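	 * That is why CPU_BASED_USE_MSR_BITMAPS is OR'ed into the high word
	 * just below instead of being taken only from the hardware MSR.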
2473 */ 2474 vmx->nested.nested_vmx_procbased_ctls_high |= 2475 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 2476 CPU_BASED_USE_MSR_BITMAPS; 2477 2478 /* We support free control of CR3 access interception. */ 2479 vmx->nested.nested_vmx_true_procbased_ctls_low = 2480 vmx->nested.nested_vmx_procbased_ctls_low & 2481 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 2482 2483 /* secondary cpu-based controls */ 2484 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, 2485 vmx->nested.nested_vmx_secondary_ctls_low, 2486 vmx->nested.nested_vmx_secondary_ctls_high); 2487 vmx->nested.nested_vmx_secondary_ctls_low = 0; 2488 vmx->nested.nested_vmx_secondary_ctls_high &= 2489 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2490 SECONDARY_EXEC_RDTSCP | 2491 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2492 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2493 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2494 SECONDARY_EXEC_WBINVD_EXITING | 2495 SECONDARY_EXEC_XSAVES; 2496 2497 if (enable_ept) { 2498 /* nested EPT: emulate EPT also to L1 */ 2499 vmx->nested.nested_vmx_secondary_ctls_high |= 2500 SECONDARY_EXEC_ENABLE_EPT; 2501 vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | 2502 VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | 2503 VMX_EPT_INVEPT_BIT; 2504 vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept; 2505 /* 2506 * For nested guests, we don't do anything specific 2507 * for single context invalidation. Hence, only advertise 2508 * support for global context invalidation. 2509 */ 2510 vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT; 2511 } else 2512 vmx->nested.nested_vmx_ept_caps = 0; 2513 2514 if (enable_unrestricted_guest) 2515 vmx->nested.nested_vmx_secondary_ctls_high |= 2516 SECONDARY_EXEC_UNRESTRICTED_GUEST; 2517 2518 /* miscellaneous data */ 2519 rdmsr(MSR_IA32_VMX_MISC, 2520 vmx->nested.nested_vmx_misc_low, 2521 vmx->nested.nested_vmx_misc_high); 2522 vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA; 2523 vmx->nested.nested_vmx_misc_low |= 2524 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 2525 VMX_MISC_ACTIVITY_HLT; 2526 vmx->nested.nested_vmx_misc_high = 0; 2527} 2528 2529static inline bool vmx_control_verify(u32 control, u32 low, u32 high) 2530{ 2531 /* 2532 * Bits 0 in high must be 0, and bits 1 in low must be 1. 2533 */ 2534 return ((control & high) | low) == control; 2535} 2536 2537static inline u64 vmx_control_msr(u32 low, u32 high) 2538{ 2539 return low | ((u64)high << 32); 2540} 2541 2542/* Returns 0 on success, non-0 otherwise. */ 2543static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 2544{ 2545 struct vcpu_vmx *vmx = to_vmx(vcpu); 2546 2547 switch (msr_index) { 2548 case MSR_IA32_VMX_BASIC: 2549 /* 2550 * This MSR reports some information about VMX support. We 2551 * should return information about the VMX we emulate for the 2552 * guest, and the VMCS structure we give it - not about the 2553 * VMX support of the underlying hardware. 
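		 * Hence the value below is composed from VMCS12_REVISION and
		 * VMCS12_SIZE instead of being read from the hardware MSR.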
2554 */ 2555 *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS | 2556 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | 2557 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); 2558 break; 2559 case MSR_IA32_VMX_TRUE_PINBASED_CTLS: 2560 case MSR_IA32_VMX_PINBASED_CTLS: 2561 *pdata = vmx_control_msr( 2562 vmx->nested.nested_vmx_pinbased_ctls_low, 2563 vmx->nested.nested_vmx_pinbased_ctls_high); 2564 break; 2565 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: 2566 *pdata = vmx_control_msr( 2567 vmx->nested.nested_vmx_true_procbased_ctls_low, 2568 vmx->nested.nested_vmx_procbased_ctls_high); 2569 break; 2570 case MSR_IA32_VMX_PROCBASED_CTLS: 2571 *pdata = vmx_control_msr( 2572 vmx->nested.nested_vmx_procbased_ctls_low, 2573 vmx->nested.nested_vmx_procbased_ctls_high); 2574 break; 2575 case MSR_IA32_VMX_TRUE_EXIT_CTLS: 2576 *pdata = vmx_control_msr( 2577 vmx->nested.nested_vmx_true_exit_ctls_low, 2578 vmx->nested.nested_vmx_exit_ctls_high); 2579 break; 2580 case MSR_IA32_VMX_EXIT_CTLS: 2581 *pdata = vmx_control_msr( 2582 vmx->nested.nested_vmx_exit_ctls_low, 2583 vmx->nested.nested_vmx_exit_ctls_high); 2584 break; 2585 case MSR_IA32_VMX_TRUE_ENTRY_CTLS: 2586 *pdata = vmx_control_msr( 2587 vmx->nested.nested_vmx_true_entry_ctls_low, 2588 vmx->nested.nested_vmx_entry_ctls_high); 2589 break; 2590 case MSR_IA32_VMX_ENTRY_CTLS: 2591 *pdata = vmx_control_msr( 2592 vmx->nested.nested_vmx_entry_ctls_low, 2593 vmx->nested.nested_vmx_entry_ctls_high); 2594 break; 2595 case MSR_IA32_VMX_MISC: 2596 *pdata = vmx_control_msr( 2597 vmx->nested.nested_vmx_misc_low, 2598 vmx->nested.nested_vmx_misc_high); 2599 break; 2600 /* 2601 * These MSRs specify bits which the guest must keep fixed (on or off) 2602 * while L1 is in VMXON mode (in L1's root mode, or running an L2). 2603 * We picked the standard core2 setting. 2604 */ 2605#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) 2606#define VMXON_CR4_ALWAYSON X86_CR4_VMXE 2607 case MSR_IA32_VMX_CR0_FIXED0: 2608 *pdata = VMXON_CR0_ALWAYSON; 2609 break; 2610 case MSR_IA32_VMX_CR0_FIXED1: 2611 *pdata = -1ULL; 2612 break; 2613 case MSR_IA32_VMX_CR4_FIXED0: 2614 *pdata = VMXON_CR4_ALWAYSON; 2615 break; 2616 case MSR_IA32_VMX_CR4_FIXED1: 2617 *pdata = -1ULL; 2618 break; 2619 case MSR_IA32_VMX_VMCS_ENUM: 2620 *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */ 2621 break; 2622 case MSR_IA32_VMX_PROCBASED_CTLS2: 2623 *pdata = vmx_control_msr( 2624 vmx->nested.nested_vmx_secondary_ctls_low, 2625 vmx->nested.nested_vmx_secondary_ctls_high); 2626 break; 2627 case MSR_IA32_VMX_EPT_VPID_CAP: 2628 /* Currently, no nested vpid support */ 2629 *pdata = vmx->nested.nested_vmx_ept_caps; 2630 break; 2631 default: 2632 return 1; 2633 } 2634 2635 return 0; 2636} 2637 2638/* 2639 * Reads an msr value (of 'msr_index') into 'pdata'. 2640 * Returns 0 on success, non-0 otherwise. 2641 * Assumes vcpu_load() was already called. 
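 * MSRs that are not handled explicitly below fall through to the shared-MSR
 * array and finally to kvm_get_msr_common().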
 */
static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
	u64 data;
	struct shared_msr_entry *msr;

	if (!pdata) {
		printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
		return -EINVAL;
	}

	switch (msr_index) {
#ifdef CONFIG_X86_64
	case MSR_FS_BASE:
		data = vmcs_readl(GUEST_FS_BASE);
		break;
	case MSR_GS_BASE:
		data = vmcs_readl(GUEST_GS_BASE);
		break;
	case MSR_KERNEL_GS_BASE:
		vmx_load_host_state(to_vmx(vcpu));
		data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
		break;
#endif
	case MSR_EFER:
		return kvm_get_msr_common(vcpu, msr_index, pdata);
	case MSR_IA32_TSC:
		data = guest_read_tsc();
		break;
	case MSR_IA32_SYSENTER_CS:
		data = vmcs_read32(GUEST_SYSENTER_CS);
		break;
	case MSR_IA32_SYSENTER_EIP:
		data = vmcs_readl(GUEST_SYSENTER_EIP);
		break;
	case MSR_IA32_SYSENTER_ESP:
		data = vmcs_readl(GUEST_SYSENTER_ESP);
		break;
	case MSR_IA32_BNDCFGS:
		if (!vmx_mpx_supported())
			return 1;
		data = vmcs_read64(GUEST_BNDCFGS);
		break;
	case MSR_IA32_FEATURE_CONTROL:
		if (!nested_vmx_allowed(vcpu))
			return 1;
		data = to_vmx(vcpu)->nested.msr_ia32_feature_control;
		break;
	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
		if (!nested_vmx_allowed(vcpu))
			return 1;
		return vmx_get_vmx_msr(vcpu, msr_index, pdata);
	case MSR_IA32_XSS:
		if (!vmx_xsaves_supported())
			return 1;
		data = vcpu->arch.ia32_xss;
		break;
	case MSR_TSC_AUX:
		if (!to_vmx(vcpu)->rdtscp_enabled)
			return 1;
		/* Otherwise falls through */
	default:
		msr = find_msr_entry(to_vmx(vcpu), msr_index);
		if (msr) {
			data = msr->data;
			break;
		}
		return kvm_get_msr_common(vcpu, msr_index, pdata);
	}

	*pdata = data;
	return 0;
}

static void vmx_leave_nested(struct kvm_vcpu *vcpu);

/*
 * Writes the msr value into the appropriate "register".
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
2722 */ 2723static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2724{ 2725 struct vcpu_vmx *vmx = to_vmx(vcpu); 2726 struct shared_msr_entry *msr; 2727 int ret = 0; 2728 u32 msr_index = msr_info->index; 2729 u64 data = msr_info->data; 2730 2731 switch (msr_index) { 2732 case MSR_EFER: 2733 ret = kvm_set_msr_common(vcpu, msr_info); 2734 break; 2735#ifdef CONFIG_X86_64 2736 case MSR_FS_BASE: 2737 vmx_segment_cache_clear(vmx); 2738 vmcs_writel(GUEST_FS_BASE, data); 2739 break; 2740 case MSR_GS_BASE: 2741 vmx_segment_cache_clear(vmx); 2742 vmcs_writel(GUEST_GS_BASE, data); 2743 break; 2744 case MSR_KERNEL_GS_BASE: 2745 vmx_load_host_state(vmx); 2746 vmx->msr_guest_kernel_gs_base = data; 2747 break; 2748#endif 2749 case MSR_IA32_SYSENTER_CS: 2750 vmcs_write32(GUEST_SYSENTER_CS, data); 2751 break; 2752 case MSR_IA32_SYSENTER_EIP: 2753 vmcs_writel(GUEST_SYSENTER_EIP, data); 2754 break; 2755 case MSR_IA32_SYSENTER_ESP: 2756 vmcs_writel(GUEST_SYSENTER_ESP, data); 2757 break; 2758 case MSR_IA32_BNDCFGS: 2759 if (!vmx_mpx_supported()) 2760 return 1; 2761 vmcs_write64(GUEST_BNDCFGS, data); 2762 break; 2763 case MSR_IA32_TSC: 2764 kvm_write_tsc(vcpu, msr_info); 2765 break; 2766 case MSR_IA32_CR_PAT: 2767 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2768 if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) 2769 return 1; 2770 vmcs_write64(GUEST_IA32_PAT, data); 2771 vcpu->arch.pat = data; 2772 break; 2773 } 2774 ret = kvm_set_msr_common(vcpu, msr_info); 2775 break; 2776 case MSR_IA32_TSC_ADJUST: 2777 ret = kvm_set_msr_common(vcpu, msr_info); 2778 break; 2779 case MSR_IA32_FEATURE_CONTROL: 2780 if (!nested_vmx_allowed(vcpu) || 2781 (to_vmx(vcpu)->nested.msr_ia32_feature_control & 2782 FEATURE_CONTROL_LOCKED && !msr_info->host_initiated)) 2783 return 1; 2784 vmx->nested.msr_ia32_feature_control = data; 2785 if (msr_info->host_initiated && data == 0) 2786 vmx_leave_nested(vcpu); 2787 break; 2788 case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: 2789 return 1; /* they are read-only */ 2790 case MSR_IA32_XSS: 2791 if (!vmx_xsaves_supported()) 2792 return 1; 2793 /* 2794 * The only supported bit as of Skylake is bit 8, but 2795 * it is not supported on KVM. 
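		 * Any non-zero value is therefore rejected just below; while
		 * guest XSS stays equal to host_xss, no atomic-switch entry
		 * is needed and the MSR_IA32_XSS slot is cleared again.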
2796 */ 2797 if (data != 0) 2798 return 1; 2799 vcpu->arch.ia32_xss = data; 2800 if (vcpu->arch.ia32_xss != host_xss) 2801 add_atomic_switch_msr(vmx, MSR_IA32_XSS, 2802 vcpu->arch.ia32_xss, host_xss); 2803 else 2804 clear_atomic_switch_msr(vmx, MSR_IA32_XSS); 2805 break; 2806 case MSR_TSC_AUX: 2807 if (!vmx->rdtscp_enabled) 2808 return 1; 2809 /* Check reserved bit, higher 32 bits should be zero */ 2810 if ((data >> 32) != 0) 2811 return 1; 2812 /* Otherwise falls through */ 2813 default: 2814 msr = find_msr_entry(vmx, msr_index); 2815 if (msr) { 2816 u64 old_msr_data = msr->data; 2817 msr->data = data; 2818 if (msr - vmx->guest_msrs < vmx->save_nmsrs) { 2819 preempt_disable(); 2820 ret = kvm_set_shared_msr(msr->index, msr->data, 2821 msr->mask); 2822 preempt_enable(); 2823 if (ret) 2824 msr->data = old_msr_data; 2825 } 2826 break; 2827 } 2828 ret = kvm_set_msr_common(vcpu, msr_info); 2829 } 2830 2831 return ret; 2832} 2833 2834static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) 2835{ 2836 __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); 2837 switch (reg) { 2838 case VCPU_REGS_RSP: 2839 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); 2840 break; 2841 case VCPU_REGS_RIP: 2842 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); 2843 break; 2844 case VCPU_EXREG_PDPTR: 2845 if (enable_ept) 2846 ept_save_pdptrs(vcpu); 2847 break; 2848 default: 2849 break; 2850 } 2851} 2852 2853static __init int cpu_has_kvm_support(void) 2854{ 2855 return cpu_has_vmx(); 2856} 2857 2858static __init int vmx_disabled_by_bios(void) 2859{ 2860 u64 msr; 2861 2862 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); 2863 if (msr & FEATURE_CONTROL_LOCKED) { 2864 /* launched w/ TXT and VMX disabled */ 2865 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) 2866 && tboot_enabled()) 2867 return 1; 2868 /* launched w/o TXT and VMX only enabled w/ TXT */ 2869 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) 2870 && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) 2871 && !tboot_enabled()) { 2872 printk(KERN_WARNING "kvm: disable TXT in the BIOS or " 2873 "activate TXT before enabling KVM\n"); 2874 return 1; 2875 } 2876 /* launched w/o TXT and VMX disabled */ 2877 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) 2878 && !tboot_enabled()) 2879 return 1; 2880 } 2881 2882 return 0; 2883} 2884 2885static void kvm_cpu_vmxon(u64 addr) 2886{ 2887 asm volatile (ASM_VMX_VMXON_RAX 2888 : : "a"(&addr), "m"(addr) 2889 : "memory", "cc"); 2890} 2891 2892static int hardware_enable(void) 2893{ 2894 int cpu = raw_smp_processor_id(); 2895 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 2896 u64 old, test_bits; 2897 2898 if (cr4_read_shadow() & X86_CR4_VMXE) 2899 return -EBUSY; 2900 2901 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); 2902 2903 /* 2904 * Now we can enable the vmclear operation in kdump 2905 * since the loaded_vmcss_on_cpu list on this cpu 2906 * has been initialized. 2907 * 2908 * Though the cpu is not in VMX operation now, there 2909 * is no problem to enable the vmclear operation 2910 * for the loaded_vmcss_on_cpu list is empty! 
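	 * In other words, enabling it before VMXON is harmless: the list is
	 * still empty, so a crash at this point has nothing to VMCLEAR.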
2911 */ 2912 crash_enable_local_vmclear(cpu); 2913 2914 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 2915 2916 test_bits = FEATURE_CONTROL_LOCKED; 2917 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; 2918 if (tboot_enabled()) 2919 test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX; 2920 2921 if ((old & test_bits) != test_bits) { 2922 /* enable and lock */ 2923 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); 2924 } 2925 cr4_set_bits(X86_CR4_VMXE); 2926 2927 if (vmm_exclusive) { 2928 kvm_cpu_vmxon(phys_addr); 2929 ept_sync_global(); 2930 } 2931 2932 native_store_gdt(this_cpu_ptr(&host_gdt)); 2933 2934 return 0; 2935} 2936 2937static void vmclear_local_loaded_vmcss(void) 2938{ 2939 int cpu = raw_smp_processor_id(); 2940 struct loaded_vmcs *v, *n; 2941 2942 list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu), 2943 loaded_vmcss_on_cpu_link) 2944 __loaded_vmcs_clear(v); 2945} 2946 2947 2948/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() 2949 * tricks. 2950 */ 2951static void kvm_cpu_vmxoff(void) 2952{ 2953 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); 2954} 2955 2956static void hardware_disable(void) 2957{ 2958 if (vmm_exclusive) { 2959 vmclear_local_loaded_vmcss(); 2960 kvm_cpu_vmxoff(); 2961 } 2962 cr4_clear_bits(X86_CR4_VMXE); 2963} 2964 2965static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, 2966 u32 msr, u32 *result) 2967{ 2968 u32 vmx_msr_low, vmx_msr_high; 2969 u32 ctl = ctl_min | ctl_opt; 2970 2971 rdmsr(msr, vmx_msr_low, vmx_msr_high); 2972 2973 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */ 2974 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */ 2975 2976 /* Ensure minimum (required) set of control bits are supported. */ 2977 if (ctl_min & ~ctl) 2978 return -EIO; 2979 2980 *result = ctl; 2981 return 0; 2982} 2983 2984static __init bool allow_1_setting(u32 msr, u32 ctl) 2985{ 2986 u32 vmx_msr_low, vmx_msr_high; 2987 2988 rdmsr(msr, vmx_msr_low, vmx_msr_high); 2989 return vmx_msr_high & ctl; 2990} 2991 2992static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) 2993{ 2994 u32 vmx_msr_low, vmx_msr_high; 2995 u32 min, opt, min2, opt2; 2996 u32 _pin_based_exec_control = 0; 2997 u32 _cpu_based_exec_control = 0; 2998 u32 _cpu_based_2nd_exec_control = 0; 2999 u32 _vmexit_control = 0; 3000 u32 _vmentry_control = 0; 3001 3002 min = CPU_BASED_HLT_EXITING | 3003#ifdef CONFIG_X86_64 3004 CPU_BASED_CR8_LOAD_EXITING | 3005 CPU_BASED_CR8_STORE_EXITING | 3006#endif 3007 CPU_BASED_CR3_LOAD_EXITING | 3008 CPU_BASED_CR3_STORE_EXITING | 3009 CPU_BASED_USE_IO_BITMAPS | 3010 CPU_BASED_MOV_DR_EXITING | 3011 CPU_BASED_USE_TSC_OFFSETING | 3012 CPU_BASED_MWAIT_EXITING | 3013 CPU_BASED_MONITOR_EXITING | 3014 CPU_BASED_INVLPG_EXITING | 3015 CPU_BASED_RDPMC_EXITING; 3016 3017 opt = CPU_BASED_TPR_SHADOW | 3018 CPU_BASED_USE_MSR_BITMAPS | 3019 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 3020 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, 3021 &_cpu_based_exec_control) < 0) 3022 return -EIO; 3023#ifdef CONFIG_X86_64 3024 if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) 3025 _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING & 3026 ~CPU_BASED_CR8_STORE_EXITING; 3027#endif 3028 if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { 3029 min2 = 0; 3030 opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 3031 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 3032 SECONDARY_EXEC_WBINVD_EXITING | 3033 SECONDARY_EXEC_ENABLE_VPID | 3034 SECONDARY_EXEC_ENABLE_EPT | 3035 SECONDARY_EXEC_UNRESTRICTED_GUEST | 
3036 SECONDARY_EXEC_PAUSE_LOOP_EXITING | 3037 SECONDARY_EXEC_RDTSCP | 3038 SECONDARY_EXEC_ENABLE_INVPCID | 3039 SECONDARY_EXEC_APIC_REGISTER_VIRT | 3040 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 3041 SECONDARY_EXEC_SHADOW_VMCS | 3042 SECONDARY_EXEC_XSAVES | 3043 SECONDARY_EXEC_ENABLE_PML; 3044 if (adjust_vmx_controls(min2, opt2, 3045 MSR_IA32_VMX_PROCBASED_CTLS2, 3046 &_cpu_based_2nd_exec_control) < 0) 3047 return -EIO; 3048 } 3049#ifndef CONFIG_X86_64 3050 if (!(_cpu_based_2nd_exec_control & 3051 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 3052 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; 3053#endif 3054 3055 if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)) 3056 _cpu_based_2nd_exec_control &= ~( 3057 SECONDARY_EXEC_APIC_REGISTER_VIRT | 3058 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 3059 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 3060 3061 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { 3062 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT 3063 enabled */ 3064 _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING | 3065 CPU_BASED_CR3_STORE_EXITING | 3066 CPU_BASED_INVLPG_EXITING); 3067 rdmsr(MSR_IA32_VMX_EPT_VPID_CAP, 3068 vmx_capability.ept, vmx_capability.vpid); 3069 } 3070 3071 min = VM_EXIT_SAVE_DEBUG_CONTROLS; 3072#ifdef CONFIG_X86_64 3073 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; 3074#endif 3075 opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT | 3076 VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS; 3077 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, 3078 &_vmexit_control) < 0) 3079 return -EIO; 3080 3081 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; 3082 opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR; 3083 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, 3084 &_pin_based_exec_control) < 0) 3085 return -EIO; 3086 3087 if (!(_cpu_based_2nd_exec_control & 3088 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) || 3089 !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) 3090 _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; 3091 3092 min = VM_ENTRY_LOAD_DEBUG_CONTROLS; 3093 opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; 3094 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, 3095 &_vmentry_control) < 0) 3096 return -EIO; 3097 3098 rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high); 3099 3100 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */ 3101 if ((vmx_msr_high & 0x1fff) > PAGE_SIZE) 3102 return -EIO; 3103 3104#ifdef CONFIG_X86_64 3105 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */ 3106 if (vmx_msr_high & (1u<<16)) 3107 return -EIO; 3108#endif 3109 3110 /* Require Write-Back (WB) memory type for VMCS accesses. 
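	 * (the memory type is reported in bits 53:50 of MSR_IA32_VMX_BASIC,
	 * i.e. bits 21:18 of the high dword tested below; 6 means write-back)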
*/ 3111 if (((vmx_msr_high >> 18) & 15) != 6) 3112 return -EIO; 3113 3114 vmcs_conf->size = vmx_msr_high & 0x1fff; 3115 vmcs_conf->order = get_order(vmcs_config.size); 3116 vmcs_conf->revision_id = vmx_msr_low; 3117 3118 vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control; 3119 vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control; 3120 vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control; 3121 vmcs_conf->vmexit_ctrl = _vmexit_control; 3122 vmcs_conf->vmentry_ctrl = _vmentry_control; 3123 3124 cpu_has_load_ia32_efer = 3125 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, 3126 VM_ENTRY_LOAD_IA32_EFER) 3127 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, 3128 VM_EXIT_LOAD_IA32_EFER); 3129 3130 cpu_has_load_perf_global_ctrl = 3131 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, 3132 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) 3133 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, 3134 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); 3135 3136 /* 3137 * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL 3138 * but due to arrata below it can't be used. Workaround is to use 3139 * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL. 3140 * 3141 * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32] 3142 * 3143 * AAK155 (model 26) 3144 * AAP115 (model 30) 3145 * AAT100 (model 37) 3146 * BC86,AAY89,BD102 (model 44) 3147 * BA97 (model 46) 3148 * 3149 */ 3150 if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) { 3151 switch (boot_cpu_data.x86_model) { 3152 case 26: 3153 case 30: 3154 case 37: 3155 case 44: 3156 case 46: 3157 cpu_has_load_perf_global_ctrl = false; 3158 printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " 3159 "does not work properly. Using workaround\n"); 3160 break; 3161 default: 3162 break; 3163 } 3164 } 3165 3166 if (cpu_has_xsaves) 3167 rdmsrl(MSR_IA32_XSS, host_xss); 3168 3169 return 0; 3170} 3171 3172static struct vmcs *alloc_vmcs_cpu(int cpu) 3173{ 3174 int node = cpu_to_node(cpu); 3175 struct page *pages; 3176 struct vmcs *vmcs; 3177 3178 pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order); 3179 if (!pages) 3180 return NULL; 3181 vmcs = page_address(pages); 3182 memset(vmcs, 0, vmcs_config.size); 3183 vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */ 3184 return vmcs; 3185} 3186 3187static struct vmcs *alloc_vmcs(void) 3188{ 3189 return alloc_vmcs_cpu(raw_smp_processor_id()); 3190} 3191 3192static void free_vmcs(struct vmcs *vmcs) 3193{ 3194 free_pages((unsigned long)vmcs, vmcs_config.order); 3195} 3196 3197/* 3198 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded 3199 */ 3200static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) 3201{ 3202 if (!loaded_vmcs->vmcs) 3203 return; 3204 loaded_vmcs_clear(loaded_vmcs); 3205 free_vmcs(loaded_vmcs->vmcs); 3206 loaded_vmcs->vmcs = NULL; 3207} 3208 3209static void free_kvm_area(void) 3210{ 3211 int cpu; 3212 3213 for_each_possible_cpu(cpu) { 3214 free_vmcs(per_cpu(vmxarea, cpu)); 3215 per_cpu(vmxarea, cpu) = NULL; 3216 } 3217} 3218 3219static void init_vmcs_shadow_fields(void) 3220{ 3221 int i, j; 3222 3223 /* No checks for read only fields yet */ 3224 3225 for (i = j = 0; i < max_shadow_read_write_fields; i++) { 3226 switch (shadow_read_write_fields[i]) { 3227 case GUEST_BNDCFGS: 3228 if (!vmx_mpx_supported()) 3229 continue; 3230 break; 3231 default: 3232 break; 3233 } 3234 3235 if (j < i) 3236 shadow_read_write_fields[j] = 3237 shadow_read_write_fields[i]; 3238 j++; 3239 } 3240 max_shadow_read_write_fields = j; 3241 3242 /* shadowed fields guest 
access without vmexit */
	for (i = 0; i < max_shadow_read_write_fields; i++) {
		clear_bit(shadow_read_write_fields[i],
			  vmx_vmwrite_bitmap);
		clear_bit(shadow_read_write_fields[i],
			  vmx_vmread_bitmap);
	}
	for (i = 0; i < max_shadow_read_only_fields; i++)
		clear_bit(shadow_read_only_fields[i],
			  vmx_vmread_bitmap);
}

static __init int alloc_kvm_area(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct vmcs *vmcs;

		vmcs = alloc_vmcs_cpu(cpu);
		if (!vmcs) {
			free_kvm_area();
			return -ENOMEM;
		}

		per_cpu(vmxarea, cpu) = vmcs;
	}
	return 0;
}

static bool emulation_required(struct kvm_vcpu *vcpu)
{
	return emulate_invalid_guest_state && !guest_state_valid(vcpu);
}

static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
		struct kvm_segment *save)
{
	if (!emulate_invalid_guest_state) {
		/*
		 * CS and SS RPL should be equal during guest entry according
		 * to VMX spec, but in reality it is not always so. Since vcpu
		 * is in the middle of the transition from real mode to
		 * protected mode it is safe to assume that RPL 0 is a good
		 * default value.
		 */
		if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
			save->selector &= ~SEGMENT_RPL_MASK;
		save->dpl = save->selector & SEGMENT_RPL_MASK;
		save->s = 1;
	}
	vmx_set_segment(vcpu, save, seg);
}

static void enter_pmode(struct kvm_vcpu *vcpu)
{
	unsigned long flags;
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * Update the real mode segment cache. It may not be up to date if the
	 * segment register was written while the vcpu was in guest mode.
	 */
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);

	vmx->rmode.vm86_active = 0;

	vmx_segment_cache_clear(vmx);

	vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);

	flags = vmcs_readl(GUEST_RFLAGS);
	flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
	flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
	vmcs_writel(GUEST_RFLAGS, flags);

	vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
			(vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));

	update_exception_bitmap(vcpu);

	fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
	fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
	fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
	fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
	fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
	fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
}

static void fix_rmode_seg(int seg, struct kvm_segment *save)
{
	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
	struct kvm_segment var = *save;

	var.dpl = 0x3;
	if (seg == VCPU_SREG_CS)
		var.type = 0x3;

	if (!emulate_invalid_guest_state) {
		var.selector = var.base >> 4;
		var.base = var.base & 0xffff0;
		var.limit
= 0xffff; 3349 var.g = 0; 3350 var.db = 0; 3351 var.present = 1; 3352 var.s = 1; 3353 var.l = 0; 3354 var.unusable = 0; 3355 var.type = 0x3; 3356 var.avl = 0; 3357 if (save->base & 0xf) 3358 printk_once(KERN_WARNING "kvm: segment base is not " 3359 "paragraph aligned when entering " 3360 "protected mode (seg=%d)", seg); 3361 } 3362 3363 vmcs_write16(sf->selector, var.selector); 3364 vmcs_write32(sf->base, var.base); 3365 vmcs_write32(sf->limit, var.limit); 3366 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var)); 3367} 3368 3369static void enter_rmode(struct kvm_vcpu *vcpu) 3370{ 3371 unsigned long flags; 3372 struct vcpu_vmx *vmx = to_vmx(vcpu); 3373 3374 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); 3375 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES); 3376 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS); 3377 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS); 3378 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS); 3379 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS); 3380 vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS); 3381 3382 vmx->rmode.vm86_active = 1; 3383 3384 /* 3385 * Very old userspace does not call KVM_SET_TSS_ADDR before entering 3386 * vcpu. Warn the user that an update is overdue. 3387 */ 3388 if (!vcpu->kvm->arch.tss_addr) 3389 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " 3390 "called before entering vcpu\n"); 3391 3392 vmx_segment_cache_clear(vmx); 3393 3394 vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr); 3395 vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1); 3396 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 3397 3398 flags = vmcs_readl(GUEST_RFLAGS); 3399 vmx->rmode.save_rflags = flags; 3400 3401 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 3402 3403 vmcs_writel(GUEST_RFLAGS, flags); 3404 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); 3405 update_exception_bitmap(vcpu); 3406 3407 fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]); 3408 fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]); 3409 fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]); 3410 fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); 3411 fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); 3412 fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); 3413 3414 kvm_mmu_reset_context(vcpu); 3415} 3416 3417static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 3418{ 3419 struct vcpu_vmx *vmx = to_vmx(vcpu); 3420 struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); 3421 3422 if (!msr) 3423 return; 3424 3425 /* 3426 * Force kernel_gs_base reloading before EFER changes, as control 3427 * of this msr depends on is_long_mode(). 3428 */ 3429 vmx_load_host_state(to_vmx(vcpu)); 3430 vcpu->arch.efer = efer; 3431 if (efer & EFER_LMA) { 3432 vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); 3433 msr->data = efer; 3434 } else { 3435 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); 3436 3437 msr->data = efer & ~EFER_LME; 3438 } 3439 setup_msrs(vmx); 3440} 3441 3442#ifdef CONFIG_X86_64 3443 3444static void enter_lmode(struct kvm_vcpu *vcpu) 3445{ 3446 u32 guest_tr_ar; 3447 3448 vmx_segment_cache_clear(to_vmx(vcpu)); 3449 3450 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); 3451 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { 3452 pr_debug_ratelimited("%s: tss fixup for long mode. 
\n", 3453 __func__); 3454 vmcs_write32(GUEST_TR_AR_BYTES, 3455 (guest_tr_ar & ~AR_TYPE_MASK) 3456 | AR_TYPE_BUSY_64_TSS); 3457 } 3458 vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA); 3459} 3460 3461static void exit_lmode(struct kvm_vcpu *vcpu) 3462{ 3463 vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE); 3464 vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA); 3465} 3466 3467#endif 3468 3469static void vmx_flush_tlb(struct kvm_vcpu *vcpu) 3470{ 3471 vpid_sync_context(to_vmx(vcpu)); 3472 if (enable_ept) { 3473 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 3474 return; 3475 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); 3476 } 3477} 3478 3479static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 3480{ 3481 ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; 3482 3483 vcpu->arch.cr0 &= ~cr0_guest_owned_bits; 3484 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; 3485} 3486 3487static void vmx_decache_cr3(struct kvm_vcpu *vcpu) 3488{ 3489 if (enable_ept && is_paging(vcpu)) 3490 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3491 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); 3492} 3493 3494static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 3495{ 3496 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; 3497 3498 vcpu->arch.cr4 &= ~cr4_guest_owned_bits; 3499 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; 3500} 3501 3502static void ept_load_pdptrs(struct kvm_vcpu *vcpu) 3503{ 3504 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3505 3506 if (!test_bit(VCPU_EXREG_PDPTR, 3507 (unsigned long *)&vcpu->arch.regs_dirty)) 3508 return; 3509 3510 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { 3511 vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]); 3512 vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]); 3513 vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]); 3514 vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]); 3515 } 3516} 3517 3518static void ept_save_pdptrs(struct kvm_vcpu *vcpu) 3519{ 3520 struct kvm_mmu *mmu = vcpu->arch.walk_mmu; 3521 3522 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { 3523 mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0); 3524 mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1); 3525 mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2); 3526 mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3); 3527 } 3528 3529 __set_bit(VCPU_EXREG_PDPTR, 3530 (unsigned long *)&vcpu->arch.regs_avail); 3531 __set_bit(VCPU_EXREG_PDPTR, 3532 (unsigned long *)&vcpu->arch.regs_dirty); 3533} 3534 3535static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); 3536 3537static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, 3538 unsigned long cr0, 3539 struct kvm_vcpu *vcpu) 3540{ 3541 if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) 3542 vmx_decache_cr3(vcpu); 3543 if (!(cr0 & X86_CR0_PG)) { 3544 /* From paging/starting to nonpaging */ 3545 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 3546 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) | 3547 (CPU_BASED_CR3_LOAD_EXITING | 3548 CPU_BASED_CR3_STORE_EXITING)); 3549 vcpu->arch.cr0 = cr0; 3550 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); 3551 } else if (!is_paging(vcpu)) { 3552 /* From nonpaging to paging */ 3553 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 3554 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & 3555 ~(CPU_BASED_CR3_LOAD_EXITING | 3556 CPU_BASED_CR3_STORE_EXITING)); 3557 vcpu->arch.cr0 = cr0; 3558 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); 3559 } 3560 3561 if (!(cr0 & X86_CR0_WP)) 3562 *hw_cr0 &= ~X86_CR0_WP; 3563} 3564 3565static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 3566{ 3567 
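	/*
	 * hw_cr0 below is what is actually written to GUEST_CR0: the guest
	 * value plus the always-on bits, with TS/MP forced on while the FPU
	 * is inactive. The unmodified guest value goes to CR0_READ_SHADOW.
	 */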
struct vcpu_vmx *vmx = to_vmx(vcpu); 3568 unsigned long hw_cr0; 3569 3570 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK); 3571 if (enable_unrestricted_guest) 3572 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; 3573 else { 3574 hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; 3575 3576 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) 3577 enter_pmode(vcpu); 3578 3579 if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE)) 3580 enter_rmode(vcpu); 3581 } 3582 3583#ifdef CONFIG_X86_64 3584 if (vcpu->arch.efer & EFER_LME) { 3585 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) 3586 enter_lmode(vcpu); 3587 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) 3588 exit_lmode(vcpu); 3589 } 3590#endif 3591 3592 if (enable_ept) 3593 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); 3594 3595 if (!vcpu->fpu_active) 3596 hw_cr0 |= X86_CR0_TS | X86_CR0_MP; 3597 3598 vmcs_writel(CR0_READ_SHADOW, cr0); 3599 vmcs_writel(GUEST_CR0, hw_cr0); 3600 vcpu->arch.cr0 = cr0; 3601 3602 /* depends on vcpu->arch.cr0 to be set to a new value */ 3603 vmx->emulation_required = emulation_required(vcpu); 3604} 3605 3606static u64 construct_eptp(unsigned long root_hpa) 3607{ 3608 u64 eptp; 3609 3610 /* TODO write the value reading from MSR */ 3611 eptp = VMX_EPT_DEFAULT_MT | 3612 VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; 3613 if (enable_ept_ad_bits) 3614 eptp |= VMX_EPT_AD_ENABLE_BIT; 3615 eptp |= (root_hpa & PAGE_MASK); 3616 3617 return eptp; 3618} 3619 3620static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 3621{ 3622 unsigned long guest_cr3; 3623 u64 eptp; 3624 3625 guest_cr3 = cr3; 3626 if (enable_ept) { 3627 eptp = construct_eptp(cr3); 3628 vmcs_write64(EPT_POINTER, eptp); 3629 if (is_paging(vcpu) || is_guest_mode(vcpu)) 3630 guest_cr3 = kvm_read_cr3(vcpu); 3631 else 3632 guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr; 3633 ept_load_pdptrs(vcpu); 3634 } 3635 3636 vmx_flush_tlb(vcpu); 3637 vmcs_writel(GUEST_CR3, guest_cr3); 3638} 3639 3640static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 3641{ 3642 /* 3643 * Pass through host's Machine Check Enable value to hw_cr4, which 3644 * is in force while we are in guest mode. Do not let guests control 3645 * this bit, even if host CR4.MCE == 0. 3646 */ 3647 unsigned long hw_cr4 = 3648 (cr4_read_shadow() & X86_CR4_MCE) | 3649 (cr4 & ~X86_CR4_MCE) | 3650 (to_vmx(vcpu)->rmode.vm86_active ? 3651 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); 3652 3653 if (cr4 & X86_CR4_VMXE) { 3654 /* 3655 * To use VMXON (and later other VMX instructions), a guest 3656 * must first be able to turn on cr4.VMXE (see handle_vmon()). 3657 * So basically the check on whether to allow nested VMX 3658 * is here. 3659 */ 3660 if (!nested_vmx_allowed(vcpu)) 3661 return 1; 3662 } 3663 if (to_vmx(vcpu)->nested.vmxon && 3664 ((cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) 3665 return 1; 3666 3667 vcpu->arch.cr4 = cr4; 3668 if (enable_ept) { 3669 if (!is_paging(vcpu)) { 3670 hw_cr4 &= ~X86_CR4_PAE; 3671 hw_cr4 |= X86_CR4_PSE; 3672 } else if (!(cr4 & X86_CR4_PAE)) { 3673 hw_cr4 &= ~X86_CR4_PAE; 3674 } 3675 } 3676 3677 if (!enable_unrestricted_guest && !is_paging(vcpu)) 3678 /* 3679 * SMEP/SMAP is disabled if CPU is in non-paging mode in 3680 * hardware. However KVM always uses paging mode without 3681 * unrestricted guest. 3682 * To emulate this behavior, SMEP/SMAP needs to be manually 3683 * disabled when guest switches to non-paging mode. 
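		 * (both bits are stripped from hw_cr4 just below, while the
		 * guest-visible value written to CR4_READ_SHADOW keeps them)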
3684 */ 3685 hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP); 3686 3687 vmcs_writel(CR4_READ_SHADOW, cr4); 3688 vmcs_writel(GUEST_CR4, hw_cr4); 3689 return 0; 3690} 3691 3692static void vmx_get_segment(struct kvm_vcpu *vcpu, 3693 struct kvm_segment *var, int seg) 3694{ 3695 struct vcpu_vmx *vmx = to_vmx(vcpu); 3696 u32 ar; 3697 3698 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3699 *var = vmx->rmode.segs[seg]; 3700 if (seg == VCPU_SREG_TR 3701 || var->selector == vmx_read_guest_seg_selector(vmx, seg)) 3702 return; 3703 var->base = vmx_read_guest_seg_base(vmx, seg); 3704 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3705 return; 3706 } 3707 var->base = vmx_read_guest_seg_base(vmx, seg); 3708 var->limit = vmx_read_guest_seg_limit(vmx, seg); 3709 var->selector = vmx_read_guest_seg_selector(vmx, seg); 3710 ar = vmx_read_guest_seg_ar(vmx, seg); 3711 var->unusable = (ar >> 16) & 1; 3712 var->type = ar & 15; 3713 var->s = (ar >> 4) & 1; 3714 var->dpl = (ar >> 5) & 3; 3715 /* 3716 * Some userspaces do not preserve unusable property. Since usable 3717 * segment has to be present according to VMX spec we can use present 3718 * property to amend userspace bug by making unusable segment always 3719 * nonpresent. vmx_segment_access_rights() already marks nonpresent 3720 * segment as unusable. 3721 */ 3722 var->present = !var->unusable; 3723 var->avl = (ar >> 12) & 1; 3724 var->l = (ar >> 13) & 1; 3725 var->db = (ar >> 14) & 1; 3726 var->g = (ar >> 15) & 1; 3727} 3728 3729static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) 3730{ 3731 struct kvm_segment s; 3732 3733 if (to_vmx(vcpu)->rmode.vm86_active) { 3734 vmx_get_segment(vcpu, &s, seg); 3735 return s.base; 3736 } 3737 return vmx_read_guest_seg_base(to_vmx(vcpu), seg); 3738} 3739 3740static int vmx_get_cpl(struct kvm_vcpu *vcpu) 3741{ 3742 struct vcpu_vmx *vmx = to_vmx(vcpu); 3743 3744 if (unlikely(vmx->rmode.vm86_active)) 3745 return 0; 3746 else { 3747 int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS); 3748 return AR_DPL(ar); 3749 } 3750} 3751 3752static u32 vmx_segment_access_rights(struct kvm_segment *var) 3753{ 3754 u32 ar; 3755 3756 if (var->unusable || !var->present) 3757 ar = 1 << 16; 3758 else { 3759 ar = var->type & 15; 3760 ar |= (var->s & 1) << 4; 3761 ar |= (var->dpl & 3) << 5; 3762 ar |= (var->present & 1) << 7; 3763 ar |= (var->avl & 1) << 12; 3764 ar |= (var->l & 1) << 13; 3765 ar |= (var->db & 1) << 14; 3766 ar |= (var->g & 1) << 15; 3767 } 3768 3769 return ar; 3770} 3771 3772static void vmx_set_segment(struct kvm_vcpu *vcpu, 3773 struct kvm_segment *var, int seg) 3774{ 3775 struct vcpu_vmx *vmx = to_vmx(vcpu); 3776 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 3777 3778 vmx_segment_cache_clear(vmx); 3779 3780 if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) { 3781 vmx->rmode.segs[seg] = *var; 3782 if (seg == VCPU_SREG_TR) 3783 vmcs_write16(sf->selector, var->selector); 3784 else if (var->s) 3785 fix_rmode_seg(seg, &vmx->rmode.segs[seg]); 3786 goto out; 3787 } 3788 3789 vmcs_writel(sf->base, var->base); 3790 vmcs_write32(sf->limit, var->limit); 3791 vmcs_write16(sf->selector, var->selector); 3792 3793 /* 3794 * Fix the "Accessed" bit in AR field of segment registers for older 3795 * qemu binaries. 3796 * IA32 arch specifies that at the time of processor reset the 3797 * "Accessed" bit in the AR field of segment registers is 1. And qemu 3798 * is setting it to 0 in the userland code. This causes invalid guest 3799 * state vmexit when "unrestricted guest" mode is turned on. 
3800 * Fix for this setup issue in cpu_reset is being pushed in the qemu 3801 * tree. Newer qemu binaries with that qemu fix would not need this 3802 * kvm hack. 3803 */ 3804 if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR)) 3805 var->type |= 0x1; /* Accessed */ 3806 3807 vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); 3808 3809out: 3810 vmx->emulation_required = emulation_required(vcpu); 3811} 3812 3813static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 3814{ 3815 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); 3816 3817 *db = (ar >> 14) & 1; 3818 *l = (ar >> 13) & 1; 3819} 3820 3821static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3822{ 3823 dt->size = vmcs_read32(GUEST_IDTR_LIMIT); 3824 dt->address = vmcs_readl(GUEST_IDTR_BASE); 3825} 3826 3827static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3828{ 3829 vmcs_write32(GUEST_IDTR_LIMIT, dt->size); 3830 vmcs_writel(GUEST_IDTR_BASE, dt->address); 3831} 3832 3833static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3834{ 3835 dt->size = vmcs_read32(GUEST_GDTR_LIMIT); 3836 dt->address = vmcs_readl(GUEST_GDTR_BASE); 3837} 3838 3839static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 3840{ 3841 vmcs_write32(GUEST_GDTR_LIMIT, dt->size); 3842 vmcs_writel(GUEST_GDTR_BASE, dt->address); 3843} 3844 3845static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) 3846{ 3847 struct kvm_segment var; 3848 u32 ar; 3849 3850 vmx_get_segment(vcpu, &var, seg); 3851 var.dpl = 0x3; 3852 if (seg == VCPU_SREG_CS) 3853 var.type = 0x3; 3854 ar = vmx_segment_access_rights(&var); 3855 3856 if (var.base != (var.selector << 4)) 3857 return false; 3858 if (var.limit != 0xffff) 3859 return false; 3860 if (ar != 0xf3) 3861 return false; 3862 3863 return true; 3864} 3865 3866static bool code_segment_valid(struct kvm_vcpu *vcpu) 3867{ 3868 struct kvm_segment cs; 3869 unsigned int cs_rpl; 3870 3871 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3872 cs_rpl = cs.selector & SEGMENT_RPL_MASK; 3873 3874 if (cs.unusable) 3875 return false; 3876 if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK)) 3877 return false; 3878 if (!cs.s) 3879 return false; 3880 if (cs.type & AR_TYPE_WRITEABLE_MASK) { 3881 if (cs.dpl > cs_rpl) 3882 return false; 3883 } else { 3884 if (cs.dpl != cs_rpl) 3885 return false; 3886 } 3887 if (!cs.present) 3888 return false; 3889 3890 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ 3891 return true; 3892} 3893 3894static bool stack_segment_valid(struct kvm_vcpu *vcpu) 3895{ 3896 struct kvm_segment ss; 3897 unsigned int ss_rpl; 3898 3899 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3900 ss_rpl = ss.selector & SEGMENT_RPL_MASK; 3901 3902 if (ss.unusable) 3903 return true; 3904 if (ss.type != 3 && ss.type != 7) 3905 return false; 3906 if (!ss.s) 3907 return false; 3908 if (ss.dpl != ss_rpl) /* DPL != RPL */ 3909 return false; 3910 if (!ss.present) 3911 return false; 3912 3913 return true; 3914} 3915 3916static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) 3917{ 3918 struct kvm_segment var; 3919 unsigned int rpl; 3920 3921 vmx_get_segment(vcpu, &var, seg); 3922 rpl = var.selector & SEGMENT_RPL_MASK; 3923 3924 if (var.unusable) 3925 return true; 3926 if (!var.s) 3927 return false; 3928 if (!var.present) 3929 return false; 3930 if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) { 3931 if (var.dpl < rpl) /* DPL < RPL */ 3932 return false; 3933 } 3934 3935 /* TODO: Add other 
members to kvm_segment_field to allow checking for other access 3936 * rights flags 3937 */ 3938 return true; 3939} 3940 3941static bool tr_valid(struct kvm_vcpu *vcpu) 3942{ 3943 struct kvm_segment tr; 3944 3945 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); 3946 3947 if (tr.unusable) 3948 return false; 3949 if (tr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3950 return false; 3951 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */ 3952 return false; 3953 if (!tr.present) 3954 return false; 3955 3956 return true; 3957} 3958 3959static bool ldtr_valid(struct kvm_vcpu *vcpu) 3960{ 3961 struct kvm_segment ldtr; 3962 3963 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); 3964 3965 if (ldtr.unusable) 3966 return true; 3967 if (ldtr.selector & SEGMENT_TI_MASK) /* TI = 1 */ 3968 return false; 3969 if (ldtr.type != 2) 3970 return false; 3971 if (!ldtr.present) 3972 return false; 3973 3974 return true; 3975} 3976 3977static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) 3978{ 3979 struct kvm_segment cs, ss; 3980 3981 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 3982 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); 3983 3984 return ((cs.selector & SEGMENT_RPL_MASK) == 3985 (ss.selector & SEGMENT_RPL_MASK)); 3986} 3987 3988/* 3989 * Check if guest state is valid. Returns true if valid, false if 3990 * not. 3991 * We assume that registers are always usable 3992 */ 3993static bool guest_state_valid(struct kvm_vcpu *vcpu) 3994{ 3995 if (enable_unrestricted_guest) 3996 return true; 3997 3998 /* real mode guest state checks */ 3999 if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { 4000 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 4001 return false; 4002 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) 4003 return false; 4004 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) 4005 return false; 4006 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) 4007 return false; 4008 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) 4009 return false; 4010 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) 4011 return false; 4012 } else { 4013 /* protected mode guest state checks */ 4014 if (!cs_ss_rpl_check(vcpu)) 4015 return false; 4016 if (!code_segment_valid(vcpu)) 4017 return false; 4018 if (!stack_segment_valid(vcpu)) 4019 return false; 4020 if (!data_segment_valid(vcpu, VCPU_SREG_DS)) 4021 return false; 4022 if (!data_segment_valid(vcpu, VCPU_SREG_ES)) 4023 return false; 4024 if (!data_segment_valid(vcpu, VCPU_SREG_FS)) 4025 return false; 4026 if (!data_segment_valid(vcpu, VCPU_SREG_GS)) 4027 return false; 4028 if (!tr_valid(vcpu)) 4029 return false; 4030 if (!ldtr_valid(vcpu)) 4031 return false; 4032 } 4033 /* TODO: 4034 * - Add checks on RIP 4035 * - Add checks on RFLAGS 4036 */ 4037 4038 return true; 4039} 4040 4041static int init_rmode_tss(struct kvm *kvm) 4042{ 4043 gfn_t fn; 4044 u16 data = 0; 4045 int idx, r; 4046 4047 idx = srcu_read_lock(&kvm->srcu); 4048 fn = kvm->arch.tss_addr >> PAGE_SHIFT; 4049 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); 4050 if (r < 0) 4051 goto out; 4052 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; 4053 r = kvm_write_guest_page(kvm, fn++, &data, 4054 TSS_IOPB_BASE_OFFSET, sizeof(u16)); 4055 if (r < 0) 4056 goto out; 4057 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); 4058 if (r < 0) 4059 goto out; 4060 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); 4061 if (r < 0) 4062 goto out; 4063 data = ~0; 4064 r = kvm_write_guest_page(kvm, fn, &data, 4065 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1, 4066 sizeof(u8)); 4067out: 4068 srcu_read_unlock(&kvm->srcu, idx); 4069 return r; 4070} 
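/*
 * Note: init_rmode_identity_map() below builds a one-page PSE page directory
 * at kvm->arch.ept_identity_map_addr.  Each of the 1024 entries maps the 4MB
 * region starting at (i << 22), so the table identity-maps the low 4GB for
 * guests running real-mode or non-paged code while EPT is enabled but
 * unrestricted guest support is not available.
 */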
4071 4072static int init_rmode_identity_map(struct kvm *kvm) 4073{ 4074 int i, idx, r = 0; 4075 pfn_t identity_map_pfn; 4076 u32 tmp; 4077 4078 if (!enable_ept) 4079 return 0; 4080 4081 /* Protect kvm->arch.ept_identity_pagetable_done. */ 4082 mutex_lock(&kvm->slots_lock); 4083 4084 if (likely(kvm->arch.ept_identity_pagetable_done)) 4085 goto out2; 4086 4087 identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; 4088 4089 r = alloc_identity_pagetable(kvm); 4090 if (r < 0) 4091 goto out2; 4092 4093 idx = srcu_read_lock(&kvm->srcu); 4094 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); 4095 if (r < 0) 4096 goto out; 4097 /* Set up identity-mapping pagetable for EPT in real mode */ 4098 for (i = 0; i < PT32_ENT_PER_PAGE; i++) { 4099 tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | 4100 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE); 4101 r = kvm_write_guest_page(kvm, identity_map_pfn, 4102 &tmp, i * sizeof(tmp), sizeof(tmp)); 4103 if (r < 0) 4104 goto out; 4105 } 4106 kvm->arch.ept_identity_pagetable_done = true; 4107 4108out: 4109 srcu_read_unlock(&kvm->srcu, idx); 4110 4111out2: 4112 mutex_unlock(&kvm->slots_lock); 4113 return r; 4114} 4115 4116static void seg_setup(int seg) 4117{ 4118 const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 4119 unsigned int ar; 4120 4121 vmcs_write16(sf->selector, 0); 4122 vmcs_writel(sf->base, 0); 4123 vmcs_write32(sf->limit, 0xffff); 4124 ar = 0x93; 4125 if (seg == VCPU_SREG_CS) 4126 ar |= 0x08; /* code segment */ 4127 4128 vmcs_write32(sf->ar_bytes, ar); 4129} 4130 4131static int alloc_apic_access_page(struct kvm *kvm) 4132{ 4133 struct page *page; 4134 struct kvm_userspace_memory_region kvm_userspace_mem; 4135 int r = 0; 4136 4137 mutex_lock(&kvm->slots_lock); 4138 if (kvm->arch.apic_access_page_done) 4139 goto out; 4140 kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; 4141 kvm_userspace_mem.flags = 0; 4142 kvm_userspace_mem.guest_phys_addr = APIC_DEFAULT_PHYS_BASE; 4143 kvm_userspace_mem.memory_size = PAGE_SIZE; 4144 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); 4145 if (r) 4146 goto out; 4147 4148 page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); 4149 if (is_error_page(page)) { 4150 r = -EFAULT; 4151 goto out; 4152 } 4153 4154 /* 4155 * Do not pin the page in memory, so that memory hot-unplug 4156 * is able to migrate it. 4157 */ 4158 put_page(page); 4159 kvm->arch.apic_access_page_done = true; 4160out: 4161 mutex_unlock(&kvm->slots_lock); 4162 return r; 4163} 4164 4165static int alloc_identity_pagetable(struct kvm *kvm) 4166{ 4167 /* Called with kvm->slots_lock held. 
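	   (Its caller, init_rmode_identity_map(), takes the lock before
	   calling us.)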
*/ 4168 4169 struct kvm_userspace_memory_region kvm_userspace_mem; 4170 int r = 0; 4171 4172 BUG_ON(kvm->arch.ept_identity_pagetable_done); 4173 4174 kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; 4175 kvm_userspace_mem.flags = 0; 4176 kvm_userspace_mem.guest_phys_addr = 4177 kvm->arch.ept_identity_map_addr; 4178 kvm_userspace_mem.memory_size = PAGE_SIZE; 4179 r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); 4180 4181 return r; 4182} 4183 4184static void allocate_vpid(struct vcpu_vmx *vmx) 4185{ 4186 int vpid; 4187 4188 vmx->vpid = 0; 4189 if (!enable_vpid) 4190 return; 4191 spin_lock(&vmx_vpid_lock); 4192 vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); 4193 if (vpid < VMX_NR_VPIDS) { 4194 vmx->vpid = vpid; 4195 __set_bit(vpid, vmx_vpid_bitmap); 4196 } 4197 spin_unlock(&vmx_vpid_lock); 4198} 4199 4200static void free_vpid(struct vcpu_vmx *vmx) 4201{ 4202 if (!enable_vpid) 4203 return; 4204 spin_lock(&vmx_vpid_lock); 4205 if (vmx->vpid != 0) 4206 __clear_bit(vmx->vpid, vmx_vpid_bitmap); 4207 spin_unlock(&vmx_vpid_lock); 4208} 4209 4210#define MSR_TYPE_R 1 4211#define MSR_TYPE_W 2 4212static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, 4213 u32 msr, int type) 4214{ 4215 int f = sizeof(unsigned long); 4216 4217 if (!cpu_has_vmx_msr_bitmap()) 4218 return; 4219 4220 /* 4221 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals 4222 * have the write-low and read-high bitmap offsets the wrong way round. 4223 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 4224 */ 4225 if (msr <= 0x1fff) { 4226 if (type & MSR_TYPE_R) 4227 /* read-low */ 4228 __clear_bit(msr, msr_bitmap + 0x000 / f); 4229 4230 if (type & MSR_TYPE_W) 4231 /* write-low */ 4232 __clear_bit(msr, msr_bitmap + 0x800 / f); 4233 4234 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 4235 msr &= 0x1fff; 4236 if (type & MSR_TYPE_R) 4237 /* read-high */ 4238 __clear_bit(msr, msr_bitmap + 0x400 / f); 4239 4240 if (type & MSR_TYPE_W) 4241 /* write-high */ 4242 __clear_bit(msr, msr_bitmap + 0xc00 / f); 4243 4244 } 4245} 4246 4247static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, 4248 u32 msr, int type) 4249{ 4250 int f = sizeof(unsigned long); 4251 4252 if (!cpu_has_vmx_msr_bitmap()) 4253 return; 4254 4255 /* 4256 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals 4257 * have the write-low and read-high bitmap offsets the wrong way round. 4258 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 4259 */ 4260 if (msr <= 0x1fff) { 4261 if (type & MSR_TYPE_R) 4262 /* read-low */ 4263 __set_bit(msr, msr_bitmap + 0x000 / f); 4264 4265 if (type & MSR_TYPE_W) 4266 /* write-low */ 4267 __set_bit(msr, msr_bitmap + 0x800 / f); 4268 4269 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 4270 msr &= 0x1fff; 4271 if (type & MSR_TYPE_R) 4272 /* read-high */ 4273 __set_bit(msr, msr_bitmap + 0x400 / f); 4274 4275 if (type & MSR_TYPE_W) 4276 /* write-high */ 4277 __set_bit(msr, msr_bitmap + 0xc00 / f); 4278 4279 } 4280} 4281 4282/* 4283 * If a msr is allowed by L0, we should check whether it is allowed by L1. 4284 * The corresponding bit will be cleared unless both of L0 and L1 allow it. 4285 */ 4286static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, 4287 unsigned long *msr_bitmap_nested, 4288 u32 msr, int type) 4289{ 4290 int f = sizeof(unsigned long); 4291 4292 if (!cpu_has_vmx_msr_bitmap()) { 4293 WARN_ON(1); 4294 return; 4295 } 4296 4297 /* 4298 * See Intel PRM Vol. 
3, 20.6.9 (MSR-Bitmap Address). Early manuals 4299 * have the write-low and read-high bitmap offsets the wrong way round. 4300 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 4301 */ 4302 if (msr <= 0x1fff) { 4303 if (type & MSR_TYPE_R && 4304 !test_bit(msr, msr_bitmap_l1 + 0x000 / f)) 4305 /* read-low */ 4306 __clear_bit(msr, msr_bitmap_nested + 0x000 / f); 4307 4308 if (type & MSR_TYPE_W && 4309 !test_bit(msr, msr_bitmap_l1 + 0x800 / f)) 4310 /* write-low */ 4311 __clear_bit(msr, msr_bitmap_nested + 0x800 / f); 4312 4313 } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { 4314 msr &= 0x1fff; 4315 if (type & MSR_TYPE_R && 4316 !test_bit(msr, msr_bitmap_l1 + 0x400 / f)) 4317 /* read-high */ 4318 __clear_bit(msr, msr_bitmap_nested + 0x400 / f); 4319 4320 if (type & MSR_TYPE_W && 4321 !test_bit(msr, msr_bitmap_l1 + 0xc00 / f)) 4322 /* write-high */ 4323 __clear_bit(msr, msr_bitmap_nested + 0xc00 / f); 4324 4325 } 4326} 4327 4328static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) 4329{ 4330 if (!longmode_only) 4331 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, 4332 msr, MSR_TYPE_R | MSR_TYPE_W); 4333 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, 4334 msr, MSR_TYPE_R | MSR_TYPE_W); 4335} 4336 4337static void vmx_enable_intercept_msr_read_x2apic(u32 msr) 4338{ 4339 __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, 4340 msr, MSR_TYPE_R); 4341 __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, 4342 msr, MSR_TYPE_R); 4343} 4344 4345static void vmx_disable_intercept_msr_read_x2apic(u32 msr) 4346{ 4347 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, 4348 msr, MSR_TYPE_R); 4349 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, 4350 msr, MSR_TYPE_R); 4351} 4352 4353static void vmx_disable_intercept_msr_write_x2apic(u32 msr) 4354{ 4355 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, 4356 msr, MSR_TYPE_W); 4357 __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, 4358 msr, MSR_TYPE_W); 4359} 4360 4361static int vmx_vm_has_apicv(struct kvm *kvm) 4362{ 4363 return enable_apicv && irqchip_in_kernel(kvm); 4364} 4365 4366static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 4367{ 4368 struct vcpu_vmx *vmx = to_vmx(vcpu); 4369 int max_irr; 4370 void *vapic_page; 4371 u16 status; 4372 4373 if (vmx->nested.pi_desc && 4374 vmx->nested.pi_pending) { 4375 vmx->nested.pi_pending = false; 4376 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 4377 return 0; 4378 4379 max_irr = find_last_bit( 4380 (unsigned long *)vmx->nested.pi_desc->pir, 256); 4381 4382 if (max_irr == 256) 4383 return 0; 4384 4385 vapic_page = kmap(vmx->nested.virtual_apic_page); 4386 if (!vapic_page) { 4387 WARN_ON(1); 4388 return -ENOMEM; 4389 } 4390 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); 4391 kunmap(vmx->nested.virtual_apic_page); 4392 4393 status = vmcs_read16(GUEST_INTR_STATUS); 4394 if ((u8)max_irr > ((u8)status & 0xff)) { 4395 status &= ~0xff; 4396 status |= (u8)max_irr; 4397 vmcs_write16(GUEST_INTR_STATUS, status); 4398 } 4399 } 4400 return 0; 4401} 4402 4403static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) 4404{ 4405#ifdef CONFIG_SMP 4406 if (vcpu->mode == IN_GUEST_MODE) { 4407 apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), 4408 POSTED_INTR_VECTOR); 4409 return true; 4410 } 4411#endif 4412 return false; 4413} 4414 4415static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, 4416 int vector) 4417{ 4418 struct vcpu_vmx *vmx = 
to_vmx(vcpu); 4419 4420 if (is_guest_mode(vcpu) && 4421 vector == vmx->nested.posted_intr_nv) { 4422 /* the PIR and ON have been set by L1. */ 4423 kvm_vcpu_trigger_posted_interrupt(vcpu); 4424 /* 4425 * If a posted intr is not recognized by hardware, 4426 * we will accomplish it in the next vmentry. 4427 */ 4428 vmx->nested.pi_pending = true; 4429 kvm_make_request(KVM_REQ_EVENT, vcpu); 4430 return 0; 4431 } 4432 return -1; 4433} 4434/* 4435 * Send interrupt to vcpu via posted interrupt way. 4436 * 1. If target vcpu is running(non-root mode), send posted interrupt 4437 * notification to vcpu and hardware will sync PIR to vIRR atomically. 4438 * 2. If target vcpu isn't running(root mode), kick it to pick up the 4439 * interrupt from PIR in next vmentry. 4440 */ 4441static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) 4442{ 4443 struct vcpu_vmx *vmx = to_vmx(vcpu); 4444 int r; 4445 4446 r = vmx_deliver_nested_posted_interrupt(vcpu, vector); 4447 if (!r) 4448 return; 4449 4450 if (pi_test_and_set_pir(vector, &vmx->pi_desc)) 4451 return; 4452 4453 r = pi_test_and_set_on(&vmx->pi_desc); 4454 kvm_make_request(KVM_REQ_EVENT, vcpu); 4455 if (r || !kvm_vcpu_trigger_posted_interrupt(vcpu)) 4456 kvm_vcpu_kick(vcpu); 4457} 4458 4459static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) 4460{ 4461 struct vcpu_vmx *vmx = to_vmx(vcpu); 4462 4463 if (!pi_test_and_clear_on(&vmx->pi_desc)) 4464 return; 4465 4466 kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); 4467} 4468 4469static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu) 4470{ 4471 return; 4472} 4473 4474/* 4475 * Set up the vmcs's constant host-state fields, i.e., host-state fields that 4476 * will not change in the lifetime of the guest. 4477 * Note that host-state that does change is set elsewhere. E.g., host-state 4478 * that is set differently for each CPU is set in vmx_vcpu_load(), not here. 4479 */ 4480static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) 4481{ 4482 u32 low32, high32; 4483 unsigned long tmpl; 4484 struct desc_ptr dt; 4485 unsigned long cr4; 4486 4487 vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS); /* 22.2.3 */ 4488 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ 4489 4490 /* Save the most likely value for this task's CR4 in the VMCS. */ 4491 cr4 = cr4_read_shadow(); 4492 vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */ 4493 vmx->host_state.vmcs_host_cr4 = cr4; 4494 4495 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 4496#ifdef CONFIG_X86_64 4497 /* 4498 * Load null selectors, so we can avoid reloading them in 4499 * __vmx_load_host_state(), in case userspace uses the null selectors 4500 * too (the expected case). 
4501 */ 4502 vmcs_write16(HOST_DS_SELECTOR, 0); 4503 vmcs_write16(HOST_ES_SELECTOR, 0); 4504#else 4505 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4506 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4507#endif 4508 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 4509 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 4510 4511 native_store_idt(&dt); 4512 vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ 4513 vmx->host_idt_base = dt.address; 4514 4515 vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */ 4516 4517 rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); 4518 vmcs_write32(HOST_IA32_SYSENTER_CS, low32); 4519 rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl); 4520 vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */ 4521 4522 if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { 4523 rdmsr(MSR_IA32_CR_PAT, low32, high32); 4524 vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32)); 4525 } 4526} 4527 4528static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) 4529{ 4530 vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS; 4531 if (enable_ept) 4532 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; 4533 if (is_guest_mode(&vmx->vcpu)) 4534 vmx->vcpu.arch.cr4_guest_owned_bits &= 4535 ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask; 4536 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); 4537} 4538 4539static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) 4540{ 4541 u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; 4542 4543 if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) 4544 pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; 4545 return pin_based_exec_ctrl; 4546} 4547 4548static u32 vmx_exec_control(struct vcpu_vmx *vmx) 4549{ 4550 u32 exec_control = vmcs_config.cpu_based_exec_ctrl; 4551 4552 if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT) 4553 exec_control &= ~CPU_BASED_MOV_DR_EXITING; 4554 4555 if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) { 4556 exec_control &= ~CPU_BASED_TPR_SHADOW; 4557#ifdef CONFIG_X86_64 4558 exec_control |= CPU_BASED_CR8_STORE_EXITING | 4559 CPU_BASED_CR8_LOAD_EXITING; 4560#endif 4561 } 4562 if (!enable_ept) 4563 exec_control |= CPU_BASED_CR3_STORE_EXITING | 4564 CPU_BASED_CR3_LOAD_EXITING | 4565 CPU_BASED_INVLPG_EXITING; 4566 return exec_control; 4567} 4568 4569static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) 4570{ 4571 u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; 4572 if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) 4573 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 4574 if (vmx->vpid == 0) 4575 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 4576 if (!enable_ept) { 4577 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 4578 enable_unrestricted_guest = 0; 4579 /* Enable INVPCID for non-ept guests may cause performance regression. */ 4580 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; 4581 } 4582 if (!enable_unrestricted_guest) 4583 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 4584 if (!ple_gap) 4585 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; 4586 if (!vmx_vm_has_apicv(vmx->vcpu.kvm)) 4587 exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | 4588 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); 4589 exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; 4590 /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD 4591 (handle_vmptrld). 
	   We cannot enable shadow_vmcs here because we do not yet have
	   a current VMCS12.
	 */
	exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
	/* PML is enabled/disabled when creating/destroying the vcpu */
	exec_control &= ~SECONDARY_EXEC_ENABLE_PML;

	return exec_control;
}

static void ept_set_mmio_spte_mask(void)
{
	/*
	 * EPT Misconfigurations can be generated if the value of bits 2:0
	 * of an EPT paging-structure entry is 110b (write/execute).
	 * Also, magic bits (0x3ull << 62) are set to quickly identify MMIO
	 * sptes.
	 */
	kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);
}

#define VMX_XSS_EXIT_BITMAP 0
/*
 * Sets up the vmcs for emulated real mode.
 */
static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
{
#ifdef CONFIG_X86_64
	unsigned long a;
#endif
	int i;

	/* I/O */
	vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
	vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));

	if (enable_shadow_vmcs) {
		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
	}
	if (cpu_has_vmx_msr_bitmap())
		vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));

	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */

	/* Control */
	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));

	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));

	if (cpu_has_secondary_exec_ctrls()) {
		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
				vmx_secondary_exec_control(vmx));
	}

	if (vmx_vm_has_apicv(vmx->vcpu.kvm)) {
		vmcs_write64(EOI_EXIT_BITMAP0, 0);
		vmcs_write64(EOI_EXIT_BITMAP1, 0);
		vmcs_write64(EOI_EXIT_BITMAP2, 0);
		vmcs_write64(EOI_EXIT_BITMAP3, 0);

		vmcs_write16(GUEST_INTR_STATUS, 0);

		vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
		vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
	}

	if (ple_gap) {
		vmcs_write32(PLE_GAP, ple_gap);
		vmx->ple_window = ple_window;
		vmx->ple_window_dirty = true;
	}

	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */

	vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
	vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
	vmx_set_constant_host_state(vmx);
#ifdef CONFIG_X86_64
	rdmsrl(MSR_FS_BASE, a);
	vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
	rdmsrl(MSR_GS_BASE, a);
	vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
#else
	vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
	vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
#endif

	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));

	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
		u32 msr_low, msr_high;
		u64 host_pat;
		rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
		host_pat = msr_low | ((u64) msr_high << 32);
		/* Write the default value, following the host PAT */
		vmcs_write64(GUEST_IA32_PAT, host_pat);
		/* Keep arch.pat in sync with GUEST_IA32_PAT */
		vmx->vcpu.arch.pat = host_pat;
	}

	for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
		u32 index =
vmx_msr_index[i]; 4701 u32 data_low, data_high; 4702 int j = vmx->nmsrs; 4703 4704 if (rdmsr_safe(index, &data_low, &data_high) < 0) 4705 continue; 4706 if (wrmsr_safe(index, data_low, data_high) < 0) 4707 continue; 4708 vmx->guest_msrs[j].index = i; 4709 vmx->guest_msrs[j].data = 0; 4710 vmx->guest_msrs[j].mask = -1ull; 4711 ++vmx->nmsrs; 4712 } 4713 4714 4715 vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl); 4716 4717 /* 22.2.1, 20.8.1 */ 4718 vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl); 4719 4720 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 4721 set_cr4_guest_host_mask(vmx); 4722 4723 if (vmx_xsaves_supported()) 4724 vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP); 4725 4726 return 0; 4727} 4728 4729static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) 4730{ 4731 struct vcpu_vmx *vmx = to_vmx(vcpu); 4732 struct msr_data apic_base_msr; 4733 4734 vmx->rmode.vm86_active = 0; 4735 4736 vmx->soft_vnmi_blocked = 0; 4737 4738 vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); 4739 kvm_set_cr8(&vmx->vcpu, 0); 4740 apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; 4741 if (kvm_vcpu_is_reset_bsp(&vmx->vcpu)) 4742 apic_base_msr.data |= MSR_IA32_APICBASE_BSP; 4743 apic_base_msr.host_initiated = true; 4744 kvm_set_apic_base(&vmx->vcpu, &apic_base_msr); 4745 4746 vmx_segment_cache_clear(vmx); 4747 4748 seg_setup(VCPU_SREG_CS); 4749 vmcs_write16(GUEST_CS_SELECTOR, 0xf000); 4750 vmcs_write32(GUEST_CS_BASE, 0xffff0000); 4751 4752 seg_setup(VCPU_SREG_DS); 4753 seg_setup(VCPU_SREG_ES); 4754 seg_setup(VCPU_SREG_FS); 4755 seg_setup(VCPU_SREG_GS); 4756 seg_setup(VCPU_SREG_SS); 4757 4758 vmcs_write16(GUEST_TR_SELECTOR, 0); 4759 vmcs_writel(GUEST_TR_BASE, 0); 4760 vmcs_write32(GUEST_TR_LIMIT, 0xffff); 4761 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 4762 4763 vmcs_write16(GUEST_LDTR_SELECTOR, 0); 4764 vmcs_writel(GUEST_LDTR_BASE, 0); 4765 vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); 4766 vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); 4767 4768 vmcs_write32(GUEST_SYSENTER_CS, 0); 4769 vmcs_writel(GUEST_SYSENTER_ESP, 0); 4770 vmcs_writel(GUEST_SYSENTER_EIP, 0); 4771 4772 vmcs_writel(GUEST_RFLAGS, 0x02); 4773 kvm_rip_write(vcpu, 0xfff0); 4774 4775 vmcs_writel(GUEST_GDTR_BASE, 0); 4776 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); 4777 4778 vmcs_writel(GUEST_IDTR_BASE, 0); 4779 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); 4780 4781 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); 4782 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 4783 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); 4784 4785 /* Special registers */ 4786 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4787 4788 setup_msrs(vmx); 4789 4790 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ 4791 4792 if (cpu_has_vmx_tpr_shadow()) { 4793 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); 4794 if (vm_need_tpr_shadow(vmx->vcpu.kvm)) 4795 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 4796 __pa(vmx->vcpu.arch.apic->regs)); 4797 vmcs_write32(TPR_THRESHOLD, 0); 4798 } 4799 4800 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4801 4802 if (vmx_vm_has_apicv(vcpu->kvm)) 4803 memset(&vmx->pi_desc, 0, sizeof(struct pi_desc)); 4804 4805 if (vmx->vpid != 0) 4806 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 4807 4808 vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 4809 vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */ 4810 vmx_set_cr4(&vmx->vcpu, 0); 4811 vmx_set_efer(&vmx->vcpu, 0); 4812 vmx_fpu_activate(&vmx->vcpu); 4813 update_exception_bitmap(&vmx->vcpu); 4814 4815 vpid_sync_context(vmx); 4816} 4817 4818/* 4819 * In nested 
virtualization, check if L1 asked to exit on external interrupts. 4820 * For most existing hypervisors, this will always return true. 4821 */ 4822static bool nested_exit_on_intr(struct kvm_vcpu *vcpu) 4823{ 4824 return get_vmcs12(vcpu)->pin_based_vm_exec_control & 4825 PIN_BASED_EXT_INTR_MASK; 4826} 4827 4828/* 4829 * In nested virtualization, check if L1 has set 4830 * VM_EXIT_ACK_INTR_ON_EXIT 4831 */ 4832static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu) 4833{ 4834 return get_vmcs12(vcpu)->vm_exit_controls & 4835 VM_EXIT_ACK_INTR_ON_EXIT; 4836} 4837 4838static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu) 4839{ 4840 return get_vmcs12(vcpu)->pin_based_vm_exec_control & 4841 PIN_BASED_NMI_EXITING; 4842} 4843 4844static void enable_irq_window(struct kvm_vcpu *vcpu) 4845{ 4846 u32 cpu_based_vm_exec_control; 4847 4848 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 4849 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; 4850 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 4851} 4852 4853static void enable_nmi_window(struct kvm_vcpu *vcpu) 4854{ 4855 u32 cpu_based_vm_exec_control; 4856 4857 if (!cpu_has_virtual_nmis() || 4858 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { 4859 enable_irq_window(vcpu); 4860 return; 4861 } 4862 4863 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 4864 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; 4865 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 4866} 4867 4868static void vmx_inject_irq(struct kvm_vcpu *vcpu) 4869{ 4870 struct vcpu_vmx *vmx = to_vmx(vcpu); 4871 uint32_t intr; 4872 int irq = vcpu->arch.interrupt.nr; 4873 4874 trace_kvm_inj_virq(irq); 4875 4876 ++vcpu->stat.irq_injections; 4877 if (vmx->rmode.vm86_active) { 4878 int inc_eip = 0; 4879 if (vcpu->arch.interrupt.soft) 4880 inc_eip = vcpu->arch.event_exit_inst_len; 4881 if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE) 4882 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 4883 return; 4884 } 4885 intr = irq | INTR_INFO_VALID_MASK; 4886 if (vcpu->arch.interrupt.soft) { 4887 intr |= INTR_TYPE_SOFT_INTR; 4888 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 4889 vmx->vcpu.arch.event_exit_inst_len); 4890 } else 4891 intr |= INTR_TYPE_EXT_INTR; 4892 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); 4893} 4894 4895static void vmx_inject_nmi(struct kvm_vcpu *vcpu) 4896{ 4897 struct vcpu_vmx *vmx = to_vmx(vcpu); 4898 4899 if (is_guest_mode(vcpu)) 4900 return; 4901 4902 if (!cpu_has_virtual_nmis()) { 4903 /* 4904 * Tracking the NMI-blocked state in software is built upon 4905 * finding the next open IRQ window. This, in turn, depends on 4906 * well-behaving guests: They have to keep IRQs disabled at 4907 * least as long as the NMI handler runs. Otherwise we may 4908 * cause NMI nesting, maybe breaking the guest. But as this is 4909 * highly unlikely, we can live with the residual risk. 
4910 */ 4911 vmx->soft_vnmi_blocked = 1; 4912 vmx->vnmi_blocked_time = 0; 4913 } 4914 4915 ++vcpu->stat.nmi_injections; 4916 vmx->nmi_known_unmasked = false; 4917 if (vmx->rmode.vm86_active) { 4918 if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) 4919 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 4920 return; 4921 } 4922 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 4923 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 4924} 4925 4926static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 4927{ 4928 if (!cpu_has_virtual_nmis()) 4929 return to_vmx(vcpu)->soft_vnmi_blocked; 4930 if (to_vmx(vcpu)->nmi_known_unmasked) 4931 return false; 4932 return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; 4933} 4934 4935static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) 4936{ 4937 struct vcpu_vmx *vmx = to_vmx(vcpu); 4938 4939 if (!cpu_has_virtual_nmis()) { 4940 if (vmx->soft_vnmi_blocked != masked) { 4941 vmx->soft_vnmi_blocked = masked; 4942 vmx->vnmi_blocked_time = 0; 4943 } 4944 } else { 4945 vmx->nmi_known_unmasked = !masked; 4946 if (masked) 4947 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 4948 GUEST_INTR_STATE_NMI); 4949 else 4950 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, 4951 GUEST_INTR_STATE_NMI); 4952 } 4953} 4954 4955static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) 4956{ 4957 if (to_vmx(vcpu)->nested.nested_run_pending) 4958 return 0; 4959 4960 if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) 4961 return 0; 4962 4963 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 4964 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI 4965 | GUEST_INTR_STATE_NMI)); 4966} 4967 4968static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) 4969{ 4970 return (!to_vmx(vcpu)->nested.nested_run_pending && 4971 vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && 4972 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 4973 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); 4974} 4975 4976static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) 4977{ 4978 int ret; 4979 struct kvm_userspace_memory_region tss_mem = { 4980 .slot = TSS_PRIVATE_MEMSLOT, 4981 .guest_phys_addr = addr, 4982 .memory_size = PAGE_SIZE * 3, 4983 .flags = 0, 4984 }; 4985 4986 ret = kvm_set_memory_region(kvm, &tss_mem); 4987 if (ret) 4988 return ret; 4989 kvm->arch.tss_addr = addr; 4990 return init_rmode_tss(kvm); 4991} 4992 4993static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) 4994{ 4995 switch (vec) { 4996 case BP_VECTOR: 4997 /* 4998 * Update instruction length as we may reinject the exception 4999 * from user space while in guest debugging mode. 5000 */ 5001 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = 5002 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 5003 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 5004 return false; 5005 /* fall through */ 5006 case DB_VECTOR: 5007 if (vcpu->guest_debug & 5008 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 5009 return false; 5010 /* fall through */ 5011 case DE_VECTOR: 5012 case OF_VECTOR: 5013 case BR_VECTOR: 5014 case UD_VECTOR: 5015 case DF_VECTOR: 5016 case SS_VECTOR: 5017 case GP_VECTOR: 5018 case MF_VECTOR: 5019 return true; 5020 break; 5021 } 5022 return false; 5023} 5024 5025static int handle_rmode_exception(struct kvm_vcpu *vcpu, 5026 int vec, u32 err_code) 5027{ 5028 /* 5029 * Instruction with address size override prefix opcode 0x67 5030 * Cause the #SS fault with 0 error code in VM86 mode. 
5031 */ 5032 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) { 5033 if (emulate_instruction(vcpu, 0) == EMULATE_DONE) { 5034 if (vcpu->arch.halt_request) { 5035 vcpu->arch.halt_request = 0; 5036 return kvm_vcpu_halt(vcpu); 5037 } 5038 return 1; 5039 } 5040 return 0; 5041 } 5042 5043 /* 5044 * Forward all other exceptions that are valid in real mode. 5045 * FIXME: Breaks guest debugging in real mode, needs to be fixed with 5046 * the required debugging infrastructure rework. 5047 */ 5048 kvm_queue_exception(vcpu, vec); 5049 return 1; 5050} 5051 5052/* 5053 * Trigger machine check on the host. We assume all the MSRs are already set up 5054 * by the CPU and that we still run on the same CPU as the MCE occurred on. 5055 * We pass a fake environment to the machine check handler because we want 5056 * the guest to be always treated like user space, no matter what context 5057 * it used internally. 5058 */ 5059static void kvm_machine_check(void) 5060{ 5061#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64) 5062 struct pt_regs regs = { 5063 .cs = 3, /* Fake ring 3 no matter what the guest ran on */ 5064 .flags = X86_EFLAGS_IF, 5065 }; 5066 5067 do_machine_check(®s, 0); 5068#endif 5069} 5070 5071static int handle_machine_check(struct kvm_vcpu *vcpu) 5072{ 5073 /* already handled by vcpu_run */ 5074 return 1; 5075} 5076 5077static int handle_exception(struct kvm_vcpu *vcpu) 5078{ 5079 struct vcpu_vmx *vmx = to_vmx(vcpu); 5080 struct kvm_run *kvm_run = vcpu->run; 5081 u32 intr_info, ex_no, error_code; 5082 unsigned long cr2, rip, dr6; 5083 u32 vect_info; 5084 enum emulation_result er; 5085 5086 vect_info = vmx->idt_vectoring_info; 5087 intr_info = vmx->exit_intr_info; 5088 5089 if (is_machine_check(intr_info)) 5090 return handle_machine_check(vcpu); 5091 5092 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) 5093 return 1; /* already handled by vmx_vcpu_run() */ 5094 5095 if (is_no_device(intr_info)) { 5096 vmx_fpu_activate(vcpu); 5097 return 1; 5098 } 5099 5100 if (is_invalid_opcode(intr_info)) { 5101 if (is_guest_mode(vcpu)) { 5102 kvm_queue_exception(vcpu, UD_VECTOR); 5103 return 1; 5104 } 5105 er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); 5106 if (er != EMULATE_DONE) 5107 kvm_queue_exception(vcpu, UD_VECTOR); 5108 return 1; 5109 } 5110 5111 error_code = 0; 5112 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 5113 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 5114 5115 /* 5116 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing 5117 * MMIO, it is better to report an internal error. 5118 * See the comments in vmx_handle_exit. 
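	 * (KVM tags MMIO sptes with reserved bits, so a #PF that has
	 * PFERR_RSVD_MASK set is treated as an MMIO access rather than an
	 * ordinary guest page fault.)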
5119 */ 5120 if ((vect_info & VECTORING_INFO_VALID_MASK) && 5121 !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { 5122 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5123 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; 5124 vcpu->run->internal.ndata = 3; 5125 vcpu->run->internal.data[0] = vect_info; 5126 vcpu->run->internal.data[1] = intr_info; 5127 vcpu->run->internal.data[2] = error_code; 5128 return 0; 5129 } 5130 5131 if (is_page_fault(intr_info)) { 5132 /* EPT won't cause page fault directly */ 5133 BUG_ON(enable_ept); 5134 cr2 = vmcs_readl(EXIT_QUALIFICATION); 5135 trace_kvm_page_fault(cr2, error_code); 5136 5137 if (kvm_event_needs_reinjection(vcpu)) 5138 kvm_mmu_unprotect_page_virt(vcpu, cr2); 5139 return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0); 5140 } 5141 5142 ex_no = intr_info & INTR_INFO_VECTOR_MASK; 5143 5144 if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no)) 5145 return handle_rmode_exception(vcpu, ex_no, error_code); 5146 5147 switch (ex_no) { 5148 case AC_VECTOR: 5149 kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); 5150 return 1; 5151 case DB_VECTOR: 5152 dr6 = vmcs_readl(EXIT_QUALIFICATION); 5153 if (!(vcpu->guest_debug & 5154 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { 5155 vcpu->arch.dr6 &= ~15; 5156 vcpu->arch.dr6 |= dr6 | DR6_RTM; 5157 if (!(dr6 & ~DR6_RESERVED)) /* icebp */ 5158 skip_emulated_instruction(vcpu); 5159 5160 kvm_queue_exception(vcpu, DB_VECTOR); 5161 return 1; 5162 } 5163 kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; 5164 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); 5165 /* fall through */ 5166 case BP_VECTOR: 5167 /* 5168 * Update instruction length as we may reinject #BP from 5169 * user space while in guest debugging mode. Reading it for 5170 * #DB as well causes no harm, it is not used in that case. 
5171 */ 5172 vmx->vcpu.arch.event_exit_inst_len = 5173 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 5174 kvm_run->exit_reason = KVM_EXIT_DEBUG; 5175 rip = kvm_rip_read(vcpu); 5176 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; 5177 kvm_run->debug.arch.exception = ex_no; 5178 break; 5179 default: 5180 kvm_run->exit_reason = KVM_EXIT_EXCEPTION; 5181 kvm_run->ex.exception = ex_no; 5182 kvm_run->ex.error_code = error_code; 5183 break; 5184 } 5185 return 0; 5186} 5187 5188static int handle_external_interrupt(struct kvm_vcpu *vcpu) 5189{ 5190 ++vcpu->stat.irq_exits; 5191 return 1; 5192} 5193 5194static int handle_triple_fault(struct kvm_vcpu *vcpu) 5195{ 5196 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 5197 return 0; 5198} 5199 5200static int handle_io(struct kvm_vcpu *vcpu) 5201{ 5202 unsigned long exit_qualification; 5203 int size, in, string; 5204 unsigned port; 5205 5206 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5207 string = (exit_qualification & 16) != 0; 5208 in = (exit_qualification & 8) != 0; 5209 5210 ++vcpu->stat.io_exits; 5211 5212 if (string || in) 5213 return emulate_instruction(vcpu, 0) == EMULATE_DONE; 5214 5215 port = exit_qualification >> 16; 5216 size = (exit_qualification & 7) + 1; 5217 skip_emulated_instruction(vcpu); 5218 5219 return kvm_fast_pio_out(vcpu, size, port); 5220} 5221 5222static void 5223vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) 5224{ 5225 /* 5226 * Patch in the VMCALL instruction: 5227 */ 5228 hypercall[0] = 0x0f; 5229 hypercall[1] = 0x01; 5230 hypercall[2] = 0xc1; 5231} 5232 5233static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) 5234{ 5235 unsigned long always_on = VMXON_CR0_ALWAYSON; 5236 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5237 5238 if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high & 5239 SECONDARY_EXEC_UNRESTRICTED_GUEST && 5240 nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 5241 always_on &= ~(X86_CR0_PE | X86_CR0_PG); 5242 return (val & always_on) == always_on; 5243} 5244 5245/* called to set cr0 as appropriate for a mov-to-cr0 exit. */ 5246static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) 5247{ 5248 if (is_guest_mode(vcpu)) { 5249 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5250 unsigned long orig_val = val; 5251 5252 /* 5253 * We get here when L2 changed cr0 in a way that did not change 5254 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr), 5255 * but did change L0 shadowed bits. So we first calculate the 5256 * effective cr0 value that L1 would like to write into the 5257 * hardware. It consists of the L2-owned bits from the new 5258 * value combined with the L1-owned bits from L1's guest_cr0. 
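	 * (Bits set in cr0_guest_host_mask are owned by L1 and come from
	 * vmcs12->guest_cr0; bits clear in the mask are owned by L2 and
	 * come from the value L2 just tried to write.)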
5259 */ 5260 val = (val & ~vmcs12->cr0_guest_host_mask) | 5261 (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); 5262 5263 if (!nested_cr0_valid(vcpu, val)) 5264 return 1; 5265 5266 if (kvm_set_cr0(vcpu, val)) 5267 return 1; 5268 vmcs_writel(CR0_READ_SHADOW, orig_val); 5269 return 0; 5270 } else { 5271 if (to_vmx(vcpu)->nested.vmxon && 5272 ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)) 5273 return 1; 5274 return kvm_set_cr0(vcpu, val); 5275 } 5276} 5277 5278static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) 5279{ 5280 if (is_guest_mode(vcpu)) { 5281 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5282 unsigned long orig_val = val; 5283 5284 /* analogously to handle_set_cr0 */ 5285 val = (val & ~vmcs12->cr4_guest_host_mask) | 5286 (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask); 5287 if (kvm_set_cr4(vcpu, val)) 5288 return 1; 5289 vmcs_writel(CR4_READ_SHADOW, orig_val); 5290 return 0; 5291 } else 5292 return kvm_set_cr4(vcpu, val); 5293} 5294 5295/* called to set cr0 as approriate for clts instruction exit. */ 5296static void handle_clts(struct kvm_vcpu *vcpu) 5297{ 5298 if (is_guest_mode(vcpu)) { 5299 /* 5300 * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS 5301 * but we did (!fpu_active). We need to keep GUEST_CR0.TS on, 5302 * just pretend it's off (also in arch.cr0 for fpu_activate). 5303 */ 5304 vmcs_writel(CR0_READ_SHADOW, 5305 vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS); 5306 vcpu->arch.cr0 &= ~X86_CR0_TS; 5307 } else 5308 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); 5309} 5310 5311static int handle_cr(struct kvm_vcpu *vcpu) 5312{ 5313 unsigned long exit_qualification, val; 5314 int cr; 5315 int reg; 5316 int err; 5317 5318 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5319 cr = exit_qualification & 15; 5320 reg = (exit_qualification >> 8) & 15; 5321 switch ((exit_qualification >> 4) & 3) { 5322 case 0: /* mov to cr */ 5323 val = kvm_register_readl(vcpu, reg); 5324 trace_kvm_cr_write(cr, val); 5325 switch (cr) { 5326 case 0: 5327 err = handle_set_cr0(vcpu, val); 5328 kvm_complete_insn_gp(vcpu, err); 5329 return 1; 5330 case 3: 5331 err = kvm_set_cr3(vcpu, val); 5332 kvm_complete_insn_gp(vcpu, err); 5333 return 1; 5334 case 4: 5335 err = handle_set_cr4(vcpu, val); 5336 kvm_complete_insn_gp(vcpu, err); 5337 return 1; 5338 case 8: { 5339 u8 cr8_prev = kvm_get_cr8(vcpu); 5340 u8 cr8 = (u8)val; 5341 err = kvm_set_cr8(vcpu, cr8); 5342 kvm_complete_insn_gp(vcpu, err); 5343 if (irqchip_in_kernel(vcpu->kvm)) 5344 return 1; 5345 if (cr8_prev <= cr8) 5346 return 1; 5347 vcpu->run->exit_reason = KVM_EXIT_SET_TPR; 5348 return 0; 5349 } 5350 } 5351 break; 5352 case 2: /* clts */ 5353 handle_clts(vcpu); 5354 trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); 5355 skip_emulated_instruction(vcpu); 5356 vmx_fpu_activate(vcpu); 5357 return 1; 5358 case 1: /*mov from cr*/ 5359 switch (cr) { 5360 case 3: 5361 val = kvm_read_cr3(vcpu); 5362 kvm_register_write(vcpu, reg, val); 5363 trace_kvm_cr_read(cr, val); 5364 skip_emulated_instruction(vcpu); 5365 return 1; 5366 case 8: 5367 val = kvm_get_cr8(vcpu); 5368 kvm_register_write(vcpu, reg, val); 5369 trace_kvm_cr_read(cr, val); 5370 skip_emulated_instruction(vcpu); 5371 return 1; 5372 } 5373 break; 5374 case 3: /* lmsw */ 5375 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 5376 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); 5377 kvm_lmsw(vcpu, val); 5378 5379 skip_emulated_instruction(vcpu); 5380 return 1; 5381 default: 5382 break; 5383 } 5384 vcpu->run->exit_reason = 0; 5385 
vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 5386 (int)(exit_qualification >> 4) & 3, cr); 5387 return 0; 5388} 5389 5390static int handle_dr(struct kvm_vcpu *vcpu) 5391{ 5392 unsigned long exit_qualification; 5393 int dr, dr7, reg; 5394 5395 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5396 dr = exit_qualification & DEBUG_REG_ACCESS_NUM; 5397 5398 /* First, if DR does not exist, trigger UD */ 5399 if (!kvm_require_dr(vcpu, dr)) 5400 return 1; 5401 5402 /* Do not handle if the CPL > 0, will trigger GP on re-entry */ 5403 if (!kvm_require_cpl(vcpu, 0)) 5404 return 1; 5405 dr7 = vmcs_readl(GUEST_DR7); 5406 if (dr7 & DR7_GD) { 5407 /* 5408 * As the vm-exit takes precedence over the debug trap, we 5409 * need to emulate the latter, either for the host or the 5410 * guest debugging itself. 5411 */ 5412 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 5413 vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; 5414 vcpu->run->debug.arch.dr7 = dr7; 5415 vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu); 5416 vcpu->run->debug.arch.exception = DB_VECTOR; 5417 vcpu->run->exit_reason = KVM_EXIT_DEBUG; 5418 return 0; 5419 } else { 5420 vcpu->arch.dr6 &= ~15; 5421 vcpu->arch.dr6 |= DR6_BD | DR6_RTM; 5422 kvm_queue_exception(vcpu, DB_VECTOR); 5423 return 1; 5424 } 5425 } 5426 5427 if (vcpu->guest_debug == 0) { 5428 u32 cpu_based_vm_exec_control; 5429 5430 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5431 cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING; 5432 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 5433 5434 /* 5435 * No more DR vmexits; force a reload of the debug registers 5436 * and reenter on this instruction. The next vmexit will 5437 * retrieve the full state of the debug registers. 5438 */ 5439 vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; 5440 return 1; 5441 } 5442 5443 reg = DEBUG_REG_ACCESS_REG(exit_qualification); 5444 if (exit_qualification & TYPE_MOV_FROM_DR) { 5445 unsigned long val; 5446 5447 if (kvm_get_dr(vcpu, dr, &val)) 5448 return 1; 5449 kvm_register_write(vcpu, reg, val); 5450 } else 5451 if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg))) 5452 return 1; 5453 5454 skip_emulated_instruction(vcpu); 5455 return 1; 5456} 5457 5458static u64 vmx_get_dr6(struct kvm_vcpu *vcpu) 5459{ 5460 return vcpu->arch.dr6; 5461} 5462 5463static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) 5464{ 5465} 5466 5467static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) 5468{ 5469 u32 cpu_based_vm_exec_control; 5470 5471 get_debugreg(vcpu->arch.db[0], 0); 5472 get_debugreg(vcpu->arch.db[1], 1); 5473 get_debugreg(vcpu->arch.db[2], 2); 5474 get_debugreg(vcpu->arch.db[3], 3); 5475 get_debugreg(vcpu->arch.dr6, 6); 5476 vcpu->arch.dr7 = vmcs_readl(GUEST_DR7); 5477 5478 vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; 5479 5480 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5481 cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING; 5482 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 5483} 5484 5485static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) 5486{ 5487 vmcs_writel(GUEST_DR7, val); 5488} 5489 5490static int handle_cpuid(struct kvm_vcpu *vcpu) 5491{ 5492 kvm_emulate_cpuid(vcpu); 5493 return 1; 5494} 5495 5496static int handle_rdmsr(struct kvm_vcpu *vcpu) 5497{ 5498 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 5499 u64 data; 5500 5501 if (vmx_get_msr(vcpu, ecx, &data)) { 5502 trace_kvm_msr_read_ex(ecx); 5503 kvm_inject_gp(vcpu, 0); 5504 return 1; 
5505 } 5506 5507 trace_kvm_msr_read(ecx, data); 5508 5509 /* FIXME: handling of bits 32:63 of rax, rdx */ 5510 vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; 5511 vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u; 5512 skip_emulated_instruction(vcpu); 5513 return 1; 5514} 5515 5516static int handle_wrmsr(struct kvm_vcpu *vcpu) 5517{ 5518 struct msr_data msr; 5519 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 5520 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) 5521 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); 5522 5523 msr.data = data; 5524 msr.index = ecx; 5525 msr.host_initiated = false; 5526 if (kvm_set_msr(vcpu, &msr) != 0) { 5527 trace_kvm_msr_write_ex(ecx, data); 5528 kvm_inject_gp(vcpu, 0); 5529 return 1; 5530 } 5531 5532 trace_kvm_msr_write(ecx, data); 5533 skip_emulated_instruction(vcpu); 5534 return 1; 5535} 5536 5537static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) 5538{ 5539 kvm_make_request(KVM_REQ_EVENT, vcpu); 5540 return 1; 5541} 5542 5543static int handle_interrupt_window(struct kvm_vcpu *vcpu) 5544{ 5545 u32 cpu_based_vm_exec_control; 5546 5547 /* clear pending irq */ 5548 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5549 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; 5550 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 5551 5552 kvm_make_request(KVM_REQ_EVENT, vcpu); 5553 5554 ++vcpu->stat.irq_window_exits; 5555 5556 /* 5557 * If the user space waits to inject interrupts, exit as soon as 5558 * possible 5559 */ 5560 if (!irqchip_in_kernel(vcpu->kvm) && 5561 vcpu->run->request_interrupt_window && 5562 !kvm_cpu_has_interrupt(vcpu)) { 5563 vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 5564 return 0; 5565 } 5566 return 1; 5567} 5568 5569static int handle_halt(struct kvm_vcpu *vcpu) 5570{ 5571 return kvm_emulate_halt(vcpu); 5572} 5573 5574static int handle_vmcall(struct kvm_vcpu *vcpu) 5575{ 5576 kvm_emulate_hypercall(vcpu); 5577 return 1; 5578} 5579 5580static int handle_invd(struct kvm_vcpu *vcpu) 5581{ 5582 return emulate_instruction(vcpu, 0) == EMULATE_DONE; 5583} 5584 5585static int handle_invlpg(struct kvm_vcpu *vcpu) 5586{ 5587 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5588 5589 kvm_mmu_invlpg(vcpu, exit_qualification); 5590 skip_emulated_instruction(vcpu); 5591 return 1; 5592} 5593 5594static int handle_rdpmc(struct kvm_vcpu *vcpu) 5595{ 5596 int err; 5597 5598 err = kvm_rdpmc(vcpu); 5599 kvm_complete_insn_gp(vcpu, err); 5600 5601 return 1; 5602} 5603 5604static int handle_wbinvd(struct kvm_vcpu *vcpu) 5605{ 5606 kvm_emulate_wbinvd(vcpu); 5607 return 1; 5608} 5609 5610static int handle_xsetbv(struct kvm_vcpu *vcpu) 5611{ 5612 u64 new_bv = kvm_read_edx_eax(vcpu); 5613 u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); 5614 5615 if (kvm_set_xcr(vcpu, index, new_bv) == 0) 5616 skip_emulated_instruction(vcpu); 5617 return 1; 5618} 5619 5620static int handle_xsaves(struct kvm_vcpu *vcpu) 5621{ 5622 skip_emulated_instruction(vcpu); 5623 WARN(1, "this should never happen\n"); 5624 return 1; 5625} 5626 5627static int handle_xrstors(struct kvm_vcpu *vcpu) 5628{ 5629 skip_emulated_instruction(vcpu); 5630 WARN(1, "this should never happen\n"); 5631 return 1; 5632} 5633 5634static int handle_apic_access(struct kvm_vcpu *vcpu) 5635{ 5636 if (likely(fasteoi)) { 5637 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5638 int access_type, offset; 5639 5640 access_type = exit_qualification & APIC_ACCESS_TYPE; 5641 offset = exit_qualification & 
APIC_ACCESS_OFFSET; 5642 /* 5643 * Sane guest uses MOV to write EOI, with written value 5644 * not cared. So make a short-circuit here by avoiding 5645 * heavy instruction emulation. 5646 */ 5647 if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && 5648 (offset == APIC_EOI)) { 5649 kvm_lapic_set_eoi(vcpu); 5650 skip_emulated_instruction(vcpu); 5651 return 1; 5652 } 5653 } 5654 return emulate_instruction(vcpu, 0) == EMULATE_DONE; 5655} 5656 5657static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) 5658{ 5659 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5660 int vector = exit_qualification & 0xff; 5661 5662 /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ 5663 kvm_apic_set_eoi_accelerated(vcpu, vector); 5664 return 1; 5665} 5666 5667static int handle_apic_write(struct kvm_vcpu *vcpu) 5668{ 5669 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5670 u32 offset = exit_qualification & 0xfff; 5671 5672 /* APIC-write VM exit is trap-like and thus no need to adjust IP */ 5673 kvm_apic_write_nodecode(vcpu, offset); 5674 return 1; 5675} 5676 5677static int handle_task_switch(struct kvm_vcpu *vcpu) 5678{ 5679 struct vcpu_vmx *vmx = to_vmx(vcpu); 5680 unsigned long exit_qualification; 5681 bool has_error_code = false; 5682 u32 error_code = 0; 5683 u16 tss_selector; 5684 int reason, type, idt_v, idt_index; 5685 5686 idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); 5687 idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); 5688 type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); 5689 5690 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5691 5692 reason = (u32)exit_qualification >> 30; 5693 if (reason == TASK_SWITCH_GATE && idt_v) { 5694 switch (type) { 5695 case INTR_TYPE_NMI_INTR: 5696 vcpu->arch.nmi_injected = false; 5697 vmx_set_nmi_mask(vcpu, true); 5698 break; 5699 case INTR_TYPE_EXT_INTR: 5700 case INTR_TYPE_SOFT_INTR: 5701 kvm_clear_interrupt_queue(vcpu); 5702 break; 5703 case INTR_TYPE_HARD_EXCEPTION: 5704 if (vmx->idt_vectoring_info & 5705 VECTORING_INFO_DELIVER_CODE_MASK) { 5706 has_error_code = true; 5707 error_code = 5708 vmcs_read32(IDT_VECTORING_ERROR_CODE); 5709 } 5710 /* fall through */ 5711 case INTR_TYPE_SOFT_EXCEPTION: 5712 kvm_clear_exception_queue(vcpu); 5713 break; 5714 default: 5715 break; 5716 } 5717 } 5718 tss_selector = exit_qualification; 5719 5720 if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION && 5721 type != INTR_TYPE_EXT_INTR && 5722 type != INTR_TYPE_NMI_INTR)) 5723 skip_emulated_instruction(vcpu); 5724 5725 if (kvm_task_switch(vcpu, tss_selector, 5726 type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason, 5727 has_error_code, error_code) == EMULATE_FAIL) { 5728 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5729 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 5730 vcpu->run->internal.ndata = 0; 5731 return 0; 5732 } 5733 5734 /* clear all local breakpoint enable flags */ 5735 vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x155); 5736 5737 /* 5738 * TODO: What about debug traps on tss switch? 5739 * Are we supposed to inject them and update dr6? 
5740 */ 5741 5742 return 1; 5743} 5744 5745static int handle_ept_violation(struct kvm_vcpu *vcpu) 5746{ 5747 unsigned long exit_qualification; 5748 gpa_t gpa; 5749 u32 error_code; 5750 int gla_validity; 5751 5752 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 5753 5754 gla_validity = (exit_qualification >> 7) & 0x3; 5755 if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { 5756 printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); 5757 printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", 5758 (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), 5759 vmcs_readl(GUEST_LINEAR_ADDRESS)); 5760 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", 5761 (long unsigned int)exit_qualification); 5762 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; 5763 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; 5764 return 0; 5765 } 5766 5767 /* 5768 * EPT violation happened while executing iret from NMI, 5769 * "blocked by NMI" bit has to be set before next VM entry. 5770 * There are errata that may cause this bit to not be set: 5771 * AAK134, BY25. 5772 */ 5773 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 5774 cpu_has_virtual_nmis() && 5775 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 5776 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); 5777 5778 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5779 trace_kvm_page_fault(gpa, exit_qualification); 5780 5781 /* It is a write fault? */ 5782 error_code = exit_qualification & PFERR_WRITE_MASK; 5783 /* It is a fetch fault? */ 5784 error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK; 5785 /* ept page table is present? */ 5786 error_code |= (exit_qualification >> 3) & PFERR_PRESENT_MASK; 5787 5788 vcpu->arch.exit_qualification = exit_qualification; 5789 5790 return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); 5791} 5792 5793static u64 ept_rsvd_mask(u64 spte, int level) 5794{ 5795 int i; 5796 u64 mask = 0; 5797 5798 for (i = 51; i > boot_cpu_data.x86_phys_bits; i--) 5799 mask |= (1ULL << i); 5800 5801 if (level == 4) 5802 /* bits 7:3 reserved */ 5803 mask |= 0xf8; 5804 else if (spte & (1ULL << 7)) 5805 /* 5806 * 1GB/2MB page, bits 29:12 or 20:12 reserved respectively, 5807 * level == 1 if the hypervisor is using the ignored bit 7. 
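		 * Worked example for the expression below: a 2MB page
		 * (level == 2) gives (PAGE_SIZE << 9) - PAGE_SIZE = 0x1ff000,
		 * i.e. bits 20:12, and a 1GB page (level == 3) gives
		 * (PAGE_SIZE << 18) - PAGE_SIZE = 0x3ffff000, i.e. bits 29:12.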
5808 */ 5809 mask |= (PAGE_SIZE << ((level - 1) * 9)) - PAGE_SIZE; 5810 else if (level > 1) 5811 /* bits 6:3 reserved */ 5812 mask |= 0x78; 5813 5814 return mask; 5815} 5816 5817static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte, 5818 int level) 5819{ 5820 printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level); 5821 5822 /* 010b (write-only) */ 5823 WARN_ON((spte & 0x7) == 0x2); 5824 5825 /* 110b (write/execute) */ 5826 WARN_ON((spte & 0x7) == 0x6); 5827 5828 /* 100b (execute-only) and value not supported by logical processor */ 5829 if (!cpu_has_vmx_ept_execute_only()) 5830 WARN_ON((spte & 0x7) == 0x4); 5831 5832 /* not 000b */ 5833 if ((spte & 0x7)) { 5834 u64 rsvd_bits = spte & ept_rsvd_mask(spte, level); 5835 5836 if (rsvd_bits != 0) { 5837 printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n", 5838 __func__, rsvd_bits); 5839 WARN_ON(1); 5840 } 5841 5842 /* bits 5:3 are _not_ reserved for large page or leaf page */ 5843 if ((rsvd_bits & 0x38) == 0) { 5844 u64 ept_mem_type = (spte & 0x38) >> 3; 5845 5846 if (ept_mem_type == 2 || ept_mem_type == 3 || 5847 ept_mem_type == 7) { 5848 printk(KERN_ERR "%s: ept_mem_type=0x%llx\n", 5849 __func__, ept_mem_type); 5850 WARN_ON(1); 5851 } 5852 } 5853 } 5854} 5855 5856static int handle_ept_misconfig(struct kvm_vcpu *vcpu) 5857{ 5858 u64 sptes[4]; 5859 int nr_sptes, i, ret; 5860 gpa_t gpa; 5861 5862 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 5863 if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { 5864 skip_emulated_instruction(vcpu); 5865 return 1; 5866 } 5867 5868 ret = handle_mmio_page_fault_common(vcpu, gpa, true); 5869 if (likely(ret == RET_MMIO_PF_EMULATE)) 5870 return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == 5871 EMULATE_DONE; 5872 5873 if (unlikely(ret == RET_MMIO_PF_INVALID)) 5874 return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0); 5875 5876 if (unlikely(ret == RET_MMIO_PF_RETRY)) 5877 return 1; 5878 5879 /* It is the real ept misconfig */ 5880 printk(KERN_ERR "EPT: Misconfiguration.\n"); 5881 printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa); 5882 5883 nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes); 5884 5885 for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) 5886 ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); 5887 5888 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; 5889 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; 5890 5891 return 0; 5892} 5893 5894static int handle_nmi_window(struct kvm_vcpu *vcpu) 5895{ 5896 u32 cpu_based_vm_exec_control; 5897 5898 /* clear pending NMI */ 5899 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5900 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; 5901 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 5902 ++vcpu->stat.nmi_window_exits; 5903 kvm_make_request(KVM_REQ_EVENT, vcpu); 5904 5905 return 1; 5906} 5907 5908static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) 5909{ 5910 struct vcpu_vmx *vmx = to_vmx(vcpu); 5911 enum emulation_result err = EMULATE_DONE; 5912 int ret = 1; 5913 u32 cpu_exec_ctrl; 5914 bool intr_window_requested; 5915 unsigned count = 130; 5916 5917 cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 5918 intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; 5919 5920 while (vmx->emulation_required && count-- != 0) { 5921 if (intr_window_requested && vmx_interrupt_allowed(vcpu)) 5922 return handle_interrupt_window(&vmx->vcpu); 5923 5924 if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) 5925 return 1; 5926 5927 err = emulate_instruction(vcpu, 
EMULTYPE_NO_REEXECUTE); 5928 5929 if (err == EMULATE_USER_EXIT) { 5930 ++vcpu->stat.mmio_exits; 5931 ret = 0; 5932 goto out; 5933 } 5934 5935 if (err != EMULATE_DONE) { 5936 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5937 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 5938 vcpu->run->internal.ndata = 0; 5939 return 0; 5940 } 5941 5942 if (vcpu->arch.halt_request) { 5943 vcpu->arch.halt_request = 0; 5944 ret = kvm_vcpu_halt(vcpu); 5945 goto out; 5946 } 5947 5948 if (signal_pending(current)) 5949 goto out; 5950 if (need_resched()) 5951 schedule(); 5952 } 5953 5954out: 5955 return ret; 5956} 5957 5958static int __grow_ple_window(int val) 5959{ 5960 if (ple_window_grow < 1) 5961 return ple_window; 5962 5963 val = min(val, ple_window_actual_max); 5964 5965 if (ple_window_grow < ple_window) 5966 val *= ple_window_grow; 5967 else 5968 val += ple_window_grow; 5969 5970 return val; 5971} 5972 5973static int __shrink_ple_window(int val, int modifier, int minimum) 5974{ 5975 if (modifier < 1) 5976 return ple_window; 5977 5978 if (modifier < ple_window) 5979 val /= modifier; 5980 else 5981 val -= modifier; 5982 5983 return max(val, minimum); 5984} 5985 5986static void grow_ple_window(struct kvm_vcpu *vcpu) 5987{ 5988 struct vcpu_vmx *vmx = to_vmx(vcpu); 5989 int old = vmx->ple_window; 5990 5991 vmx->ple_window = __grow_ple_window(old); 5992 5993 if (vmx->ple_window != old) 5994 vmx->ple_window_dirty = true; 5995 5996 trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old); 5997} 5998 5999static void shrink_ple_window(struct kvm_vcpu *vcpu) 6000{ 6001 struct vcpu_vmx *vmx = to_vmx(vcpu); 6002 int old = vmx->ple_window; 6003 6004 vmx->ple_window = __shrink_ple_window(old, 6005 ple_window_shrink, ple_window); 6006 6007 if (vmx->ple_window != old) 6008 vmx->ple_window_dirty = true; 6009 6010 trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old); 6011} 6012 6013/* 6014 * ple_window_actual_max is computed to be one grow_ple_window() below 6015 * ple_window_max. (See __grow_ple_window for the reason.) 6016 * This prevents overflows, because ple_window_max is int. 6017 * ple_window_max effectively rounded down to a multiple of ple_window_grow in 6018 * this process. 6019 * ple_window_max is also prevented from setting vmx->ple_window < ple_window. 
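 * With the defaults (ple_window_grow = 2, ple_window_max = INT_MAX / 2) this
 * works out to ple_window_actual_max = INT_MAX / 4: __grow_ple_window() first
 * clamps the old window to that value and then doubles it, so the result can
 * never exceed INT_MAX / 2.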
6020 */ 6021static void update_ple_window_actual_max(void) 6022{ 6023 ple_window_actual_max = 6024 __shrink_ple_window(max(ple_window_max, ple_window), 6025 ple_window_grow, INT_MIN); 6026} 6027 6028static __init int hardware_setup(void) 6029{ 6030 int r = -ENOMEM, i, msr; 6031 6032 rdmsrl_safe(MSR_EFER, &host_efer); 6033 6034 for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) 6035 kvm_define_shared_msr(i, vmx_msr_index[i]); 6036 6037 vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); 6038 if (!vmx_io_bitmap_a) 6039 return r; 6040 6041 vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); 6042 if (!vmx_io_bitmap_b) 6043 goto out; 6044 6045 vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); 6046 if (!vmx_msr_bitmap_legacy) 6047 goto out1; 6048 6049 vmx_msr_bitmap_legacy_x2apic = 6050 (unsigned long *)__get_free_page(GFP_KERNEL); 6051 if (!vmx_msr_bitmap_legacy_x2apic) 6052 goto out2; 6053 6054 vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); 6055 if (!vmx_msr_bitmap_longmode) 6056 goto out3; 6057 6058 vmx_msr_bitmap_longmode_x2apic = 6059 (unsigned long *)__get_free_page(GFP_KERNEL); 6060 if (!vmx_msr_bitmap_longmode_x2apic) 6061 goto out4; 6062 6063 if (nested) { 6064 vmx_msr_bitmap_nested = 6065 (unsigned long *)__get_free_page(GFP_KERNEL); 6066 if (!vmx_msr_bitmap_nested) 6067 goto out5; 6068 } 6069 6070 vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); 6071 if (!vmx_vmread_bitmap) 6072 goto out6; 6073 6074 vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); 6075 if (!vmx_vmwrite_bitmap) 6076 goto out7; 6077 6078 memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); 6079 memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); 6080 6081 /* 6082 * Allow direct access to the PC debug port (it is often used for I/O 6083 * delays, but the vmexits simply slow things down). 6084 */ 6085 memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE); 6086 clear_bit(0x80, vmx_io_bitmap_a); 6087 6088 memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); 6089 6090 memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); 6091 memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); 6092 if (nested) 6093 memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE); 6094 6095 if (setup_vmcs_config(&vmcs_config) < 0) { 6096 r = -EIO; 6097 goto out8; 6098 } 6099 6100 if (boot_cpu_has(X86_FEATURE_NX)) 6101 kvm_enable_efer_bits(EFER_NX); 6102 6103 if (!cpu_has_vmx_vpid()) 6104 enable_vpid = 0; 6105 if (!cpu_has_vmx_shadow_vmcs()) 6106 enable_shadow_vmcs = 0; 6107 if (enable_shadow_vmcs) 6108 init_vmcs_shadow_fields(); 6109 6110 if (!cpu_has_vmx_ept() || 6111 !cpu_has_vmx_ept_4levels()) { 6112 enable_ept = 0; 6113 enable_unrestricted_guest = 0; 6114 enable_ept_ad_bits = 0; 6115 } 6116 6117 if (!cpu_has_vmx_ept_ad_bits()) 6118 enable_ept_ad_bits = 0; 6119 6120 if (!cpu_has_vmx_unrestricted_guest()) 6121 enable_unrestricted_guest = 0; 6122 6123 if (!cpu_has_vmx_flexpriority()) 6124 flexpriority_enabled = 0; 6125 6126 /* 6127 * set_apic_access_page_addr() is used to reload apic access 6128 * page upon invalidation. No need to do anything if not 6129 * using the APIC_ACCESS_ADDR VMCS field. 
 */
	if (!flexpriority_enabled)
		kvm_x86_ops->set_apic_access_page_addr = NULL;

	if (!cpu_has_vmx_tpr_shadow())
		kvm_x86_ops->update_cr8_intercept = NULL;

	if (enable_ept && !cpu_has_vmx_ept_2m_page())
		kvm_disable_largepages();

	if (!cpu_has_vmx_ple())
		ple_gap = 0;

	if (!cpu_has_vmx_apicv())
		enable_apicv = 0;

	if (enable_apicv)
		kvm_x86_ops->update_cr8_intercept = NULL;
	else {
		kvm_x86_ops->hwapic_irr_update = NULL;
		kvm_x86_ops->hwapic_isr_update = NULL;
		kvm_x86_ops->deliver_posted_interrupt = NULL;
		kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
	}

	vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
	vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
	vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
	vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);

	memcpy(vmx_msr_bitmap_legacy_x2apic,
			vmx_msr_bitmap_legacy, PAGE_SIZE);
	memcpy(vmx_msr_bitmap_longmode_x2apic,
			vmx_msr_bitmap_longmode, PAGE_SIZE);

	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */

	if (enable_apicv) {
		for (msr = 0x800; msr <= 0x8ff; msr++)
			vmx_disable_intercept_msr_read_x2apic(msr);

		/*
		 * According to the SDM, in x2apic mode the whole APIC ID
		 * register is used, but KVM only uses its highest eight
		 * bits, so reads of the ID register must still be
		 * intercepted.
		 */
		vmx_enable_intercept_msr_read_x2apic(0x802);
		/* TMCCT */
		vmx_enable_intercept_msr_read_x2apic(0x839);
		/* TPR */
		vmx_disable_intercept_msr_write_x2apic(0x808);
		/* EOI */
		vmx_disable_intercept_msr_write_x2apic(0x80b);
		/* SELF-IPI */
		vmx_disable_intercept_msr_write_x2apic(0x83f);
	}

	if (enable_ept) {
		kvm_mmu_set_mask_ptes(0ull,
			(enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
			(enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
			0ull, VMX_EPT_EXECUTABLE_MASK);
		ept_set_mmio_spte_mask();
		kvm_enable_tdp();
	} else
		kvm_disable_tdp();

	update_ple_window_actual_max();

	/*
	 * Only enable PML when hardware supports the PML feature, and both
	 * EPT and EPT A/D bit features are enabled -- PML depends on them
	 * to work.
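	 * (PML only records a guest-physical address when the CPU sets the
	 * dirty bit in an EPT entry, so without EPT A/D support there would
	 * be nothing for it to log.)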
6203 */ 6204 if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml()) 6205 enable_pml = 0; 6206 6207 if (!enable_pml) { 6208 kvm_x86_ops->slot_enable_log_dirty = NULL; 6209 kvm_x86_ops->slot_disable_log_dirty = NULL; 6210 kvm_x86_ops->flush_log_dirty = NULL; 6211 kvm_x86_ops->enable_log_dirty_pt_masked = NULL; 6212 } 6213 6214 return alloc_kvm_area(); 6215 6216out8: 6217 free_page((unsigned long)vmx_vmwrite_bitmap); 6218out7: 6219 free_page((unsigned long)vmx_vmread_bitmap); 6220out6: 6221 if (nested) 6222 free_page((unsigned long)vmx_msr_bitmap_nested); 6223out5: 6224 free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); 6225out4: 6226 free_page((unsigned long)vmx_msr_bitmap_longmode); 6227out3: 6228 free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); 6229out2: 6230 free_page((unsigned long)vmx_msr_bitmap_legacy); 6231out1: 6232 free_page((unsigned long)vmx_io_bitmap_b); 6233out: 6234 free_page((unsigned long)vmx_io_bitmap_a); 6235 6236 return r; 6237} 6238 6239static __exit void hardware_unsetup(void) 6240{ 6241 free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); 6242 free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); 6243 free_page((unsigned long)vmx_msr_bitmap_legacy); 6244 free_page((unsigned long)vmx_msr_bitmap_longmode); 6245 free_page((unsigned long)vmx_io_bitmap_b); 6246 free_page((unsigned long)vmx_io_bitmap_a); 6247 free_page((unsigned long)vmx_vmwrite_bitmap); 6248 free_page((unsigned long)vmx_vmread_bitmap); 6249 if (nested) 6250 free_page((unsigned long)vmx_msr_bitmap_nested); 6251 6252 free_kvm_area(); 6253} 6254 6255/* 6256 * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE 6257 * exiting, so only get here on cpu with PAUSE-Loop-Exiting. 6258 */ 6259static int handle_pause(struct kvm_vcpu *vcpu) 6260{ 6261 if (ple_gap) 6262 grow_ple_window(vcpu); 6263 6264 skip_emulated_instruction(vcpu); 6265 kvm_vcpu_on_spin(vcpu); 6266 6267 return 1; 6268} 6269 6270static int handle_nop(struct kvm_vcpu *vcpu) 6271{ 6272 skip_emulated_instruction(vcpu); 6273 return 1; 6274} 6275 6276static int handle_mwait(struct kvm_vcpu *vcpu) 6277{ 6278 printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n"); 6279 return handle_nop(vcpu); 6280} 6281 6282static int handle_monitor(struct kvm_vcpu *vcpu) 6283{ 6284 printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n"); 6285 return handle_nop(vcpu); 6286} 6287 6288/* 6289 * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12. 6290 * We could reuse a single VMCS for all the L2 guests, but we also want the 6291 * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this 6292 * allows keeping them loaded on the processor, and in the future will allow 6293 * optimizations where prepare_vmcs02 doesn't need to set all the fields on 6294 * every entry if they never change. 6295 * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE 6296 * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first. 6297 * 6298 * The following functions allocate and free a vmcs02 in this pool. 6299 */ 6300 6301/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. 
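 * The pool is kept in most-recently-used order: a lookup hit is moved to the
 * head of the list, and once vmcs02_num reaches VMCS02_POOL_SIZE the entry at
 * the tail (the least recently used vmcs02) is recycled for the new vmcs12.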
*/ 6302static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) 6303{ 6304 struct vmcs02_list *item; 6305 list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) 6306 if (item->vmptr == vmx->nested.current_vmptr) { 6307 list_move(&item->list, &vmx->nested.vmcs02_pool); 6308 return &item->vmcs02; 6309 } 6310 6311 if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) { 6312 /* Recycle the least recently used VMCS. */ 6313 item = list_entry(vmx->nested.vmcs02_pool.prev, 6314 struct vmcs02_list, list); 6315 item->vmptr = vmx->nested.current_vmptr; 6316 list_move(&item->list, &vmx->nested.vmcs02_pool); 6317 return &item->vmcs02; 6318 } 6319 6320 /* Create a new VMCS */ 6321 item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL); 6322 if (!item) 6323 return NULL; 6324 item->vmcs02.vmcs = alloc_vmcs(); 6325 if (!item->vmcs02.vmcs) { 6326 kfree(item); 6327 return NULL; 6328 } 6329 loaded_vmcs_init(&item->vmcs02); 6330 item->vmptr = vmx->nested.current_vmptr; 6331 list_add(&(item->list), &(vmx->nested.vmcs02_pool)); 6332 vmx->nested.vmcs02_num++; 6333 return &item->vmcs02; 6334} 6335 6336/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */ 6337static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) 6338{ 6339 struct vmcs02_list *item; 6340 list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) 6341 if (item->vmptr == vmptr) { 6342 free_loaded_vmcs(&item->vmcs02); 6343 list_del(&item->list); 6344 kfree(item); 6345 vmx->nested.vmcs02_num--; 6346 return; 6347 } 6348} 6349 6350/* 6351 * Free all VMCSs saved for this vcpu, except the one pointed by 6352 * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs 6353 * must be &vmx->vmcs01. 6354 */ 6355static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) 6356{ 6357 struct vmcs02_list *item, *n; 6358 6359 WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01); 6360 list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) { 6361 /* 6362 * Something will leak if the above WARN triggers. Better than 6363 * a use-after-free. 6364 */ 6365 if (vmx->loaded_vmcs == &item->vmcs02) 6366 continue; 6367 6368 free_loaded_vmcs(&item->vmcs02); 6369 list_del(&item->list); 6370 kfree(item); 6371 vmx->nested.vmcs02_num--; 6372 } 6373} 6374 6375/* 6376 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), 6377 * set the success or error code of an emulated VMX instruction, as specified 6378 * by Vol 2B, VMX Instruction Reference, "Conventions". 6379 */ 6380static void nested_vmx_succeed(struct kvm_vcpu *vcpu) 6381{ 6382 vmx_set_rflags(vcpu, vmx_get_rflags(vcpu) 6383 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 6384 X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF)); 6385} 6386 6387static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu) 6388{ 6389 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 6390 & ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF | 6391 X86_EFLAGS_SF | X86_EFLAGS_OF)) 6392 | X86_EFLAGS_CF); 6393} 6394 6395static void nested_vmx_failValid(struct kvm_vcpu *vcpu, 6396 u32 vm_instruction_error) 6397{ 6398 if (to_vmx(vcpu)->nested.current_vmptr == -1ull) { 6399 /* 6400 * failValid writes the error number to the current VMCS, which 6401 * can't be done there isn't a current VMCS. 
6402 */ 6403 nested_vmx_failInvalid(vcpu); 6404 return; 6405 } 6406 vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) 6407 & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | 6408 X86_EFLAGS_SF | X86_EFLAGS_OF)) 6409 | X86_EFLAGS_ZF); 6410 get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error; 6411 /* 6412 * We don't need to force a shadow sync because 6413 * VM_INSTRUCTION_ERROR is not shadowed 6414 */ 6415} 6416 6417static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) 6418{ 6419 /* TODO: not to reset guest simply here. */ 6420 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 6421 pr_warn("kvm: nested vmx abort, indicator %d\n", indicator); 6422} 6423 6424static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 6425{ 6426 struct vcpu_vmx *vmx = 6427 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 6428 6429 vmx->nested.preemption_timer_expired = true; 6430 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 6431 kvm_vcpu_kick(&vmx->vcpu); 6432 6433 return HRTIMER_NORESTART; 6434} 6435 6436/* 6437 * Decode the memory-address operand of a vmx instruction, as recorded on an 6438 * exit caused by such an instruction (run by a guest hypervisor). 6439 * On success, returns 0. When the operand is invalid, returns 1 and throws 6440 * #UD or #GP. 6441 */ 6442static int get_vmx_mem_address(struct kvm_vcpu *vcpu, 6443 unsigned long exit_qualification, 6444 u32 vmx_instruction_info, gva_t *ret) 6445{ 6446 /* 6447 * According to Vol. 3B, "Information for VM Exits Due to Instruction 6448 * Execution", on an exit, vmx_instruction_info holds most of the 6449 * addressing components of the operand. Only the displacement part 6450 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 6451 * For how an actual address is calculated from all these components, 6452 * refer to Vol. 1, "Operand Addressing". 6453 */ 6454 int scaling = vmx_instruction_info & 3; 6455 int addr_size = (vmx_instruction_info >> 7) & 7; 6456 bool is_reg = vmx_instruction_info & (1u << 10); 6457 int seg_reg = (vmx_instruction_info >> 15) & 7; 6458 int index_reg = (vmx_instruction_info >> 18) & 0xf; 6459 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 6460 int base_reg = (vmx_instruction_info >> 23) & 0xf; 6461 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 6462 6463 if (is_reg) { 6464 kvm_queue_exception(vcpu, UD_VECTOR); 6465 return 1; 6466 } 6467 6468 /* Addr = segment_base + offset */ 6469 /* offset = base + [index * scale] + displacement */ 6470 *ret = vmx_get_segment_base(vcpu, seg_reg); 6471 if (base_is_valid) 6472 *ret += kvm_register_read(vcpu, base_reg); 6473 if (index_is_valid) 6474 *ret += kvm_register_read(vcpu, index_reg)<<scaling; 6475 *ret += exit_qualification; /* holds the displacement */ 6476 6477 if (addr_size == 1) /* 32 bit */ 6478 *ret &= 0xffffffff; 6479 6480 /* 6481 * TODO: throw #GP (and return 1) in various cases that the VM* 6482 * instructions require it - e.g., offset beyond segment limit, 6483 * unusable or unreadable/unwritable segment, non-canonical 64-bit 6484 * address, and so on. Currently these are not checked. 
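	 * As a worked example of the decoding above: an operand such as
	 * es:[rdi + rsi*8 + disp] arrives with seg_reg = ES, a valid
	 * base_reg of RDI, a valid index_reg of RSI with scaling = 3, and
	 * the displacement in the exit qualification, giving
	 * ES.base + RDI + (RSI << 3) + disp.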
6485 */ 6486 return 0; 6487} 6488 6489/* 6490 * This function performs the various checks including 6491 * - if it's 4KB aligned 6492 * - No bits beyond the physical address width are set 6493 * - Returns 0 on success or else 1 6494 * (Intel SDM Section 30.3) 6495 */ 6496static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, 6497 gpa_t *vmpointer) 6498{ 6499 gva_t gva; 6500 gpa_t vmptr; 6501 struct x86_exception e; 6502 struct page *page; 6503 struct vcpu_vmx *vmx = to_vmx(vcpu); 6504 int maxphyaddr = cpuid_maxphyaddr(vcpu); 6505 6506 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 6507 vmcs_read32(VMX_INSTRUCTION_INFO), &gva)) 6508 return 1; 6509 6510 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr, 6511 sizeof(vmptr), &e)) { 6512 kvm_inject_page_fault(vcpu, &e); 6513 return 1; 6514 } 6515 6516 switch (exit_reason) { 6517 case EXIT_REASON_VMON: 6518 /* 6519 * SDM 3: 24.11.5 6520 * The first 4 bytes of VMXON region contain the supported 6521 * VMCS revision identifier 6522 * 6523 * Note - IA32_VMX_BASIC[48] will never be 1 6524 * for the nested case; 6525 * which replaces physical address width with 32 6526 * 6527 */ 6528 if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { 6529 nested_vmx_failInvalid(vcpu); 6530 skip_emulated_instruction(vcpu); 6531 return 1; 6532 } 6533 6534 page = nested_get_page(vcpu, vmptr); 6535 if (page == NULL || 6536 *(u32 *)kmap(page) != VMCS12_REVISION) { 6537 nested_vmx_failInvalid(vcpu); 6538 kunmap(page); 6539 skip_emulated_instruction(vcpu); 6540 return 1; 6541 } 6542 kunmap(page); 6543 vmx->nested.vmxon_ptr = vmptr; 6544 break; 6545 case EXIT_REASON_VMCLEAR: 6546 if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { 6547 nested_vmx_failValid(vcpu, 6548 VMXERR_VMCLEAR_INVALID_ADDRESS); 6549 skip_emulated_instruction(vcpu); 6550 return 1; 6551 } 6552 6553 if (vmptr == vmx->nested.vmxon_ptr) { 6554 nested_vmx_failValid(vcpu, 6555 VMXERR_VMCLEAR_VMXON_POINTER); 6556 skip_emulated_instruction(vcpu); 6557 return 1; 6558 } 6559 break; 6560 case EXIT_REASON_VMPTRLD: 6561 if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { 6562 nested_vmx_failValid(vcpu, 6563 VMXERR_VMPTRLD_INVALID_ADDRESS); 6564 skip_emulated_instruction(vcpu); 6565 return 1; 6566 } 6567 6568 if (vmptr == vmx->nested.vmxon_ptr) { 6569 nested_vmx_failValid(vcpu, 6570 VMXERR_VMCLEAR_VMXON_POINTER); 6571 skip_emulated_instruction(vcpu); 6572 return 1; 6573 } 6574 break; 6575 default: 6576 return 1; /* shouldn't happen */ 6577 } 6578 6579 if (vmpointer) 6580 *vmpointer = vmptr; 6581 return 0; 6582} 6583 6584/* 6585 * Emulate the VMXON instruction. 6586 * Currently, we just remember that VMX is active, and do not save or even 6587 * inspect the argument to VMXON (the so-called "VMXON pointer") because we 6588 * do not currently need to store anything in that guest-allocated memory 6589 * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their 6590 * argument is different from the VMXON pointer (which the spec says they do). 6591 */ 6592static int handle_vmon(struct kvm_vcpu *vcpu) 6593{ 6594 struct kvm_segment cs; 6595 struct vcpu_vmx *vmx = to_vmx(vcpu); 6596 struct vmcs *shadow_vmcs; 6597 const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED 6598 | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; 6599 6600 /* The Intel VMX Instruction Reference lists a bunch of bits that 6601 * are prerequisite to running VMXON, most notably cr4.VMXE must be 6602 * set to 1 (see vmx_set_cr4() for when we allow the guest to set this). 
6603 * Otherwise, we should fail with #UD. We test these now: 6604 */ 6605 if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) || 6606 !kvm_read_cr0_bits(vcpu, X86_CR0_PE) || 6607 (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { 6608 kvm_queue_exception(vcpu, UD_VECTOR); 6609 return 1; 6610 } 6611 6612 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 6613 if (is_long_mode(vcpu) && !cs.l) { 6614 kvm_queue_exception(vcpu, UD_VECTOR); 6615 return 1; 6616 } 6617 6618 if (vmx_get_cpl(vcpu)) { 6619 kvm_inject_gp(vcpu, 0); 6620 return 1; 6621 } 6622 6623 if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL)) 6624 return 1; 6625 6626 if (vmx->nested.vmxon) { 6627 nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 6628 skip_emulated_instruction(vcpu); 6629 return 1; 6630 } 6631 6632 if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 6633 != VMXON_NEEDED_FEATURES) { 6634 kvm_inject_gp(vcpu, 0); 6635 return 1; 6636 } 6637 6638 if (enable_shadow_vmcs) { 6639 shadow_vmcs = alloc_vmcs(); 6640 if (!shadow_vmcs) 6641 return -ENOMEM; 6642 /* mark vmcs as shadow */ 6643 shadow_vmcs->revision_id |= (1u << 31); 6644 /* init shadow vmcs */ 6645 vmcs_clear(shadow_vmcs); 6646 vmx->nested.current_shadow_vmcs = shadow_vmcs; 6647 } 6648 6649 INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); 6650 vmx->nested.vmcs02_num = 0; 6651 6652 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, 6653 HRTIMER_MODE_REL); 6654 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; 6655 6656 vmx->nested.vmxon = true; 6657 6658 skip_emulated_instruction(vcpu); 6659 nested_vmx_succeed(vcpu); 6660 return 1; 6661} 6662 6663/* 6664 * Intel's VMX Instruction Reference specifies a common set of prerequisites 6665 * for running VMX instructions (except VMXON, whose prerequisites are 6666 * slightly different). It also specifies what exception to inject otherwise. 6667 */ 6668static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 6669{ 6670 struct kvm_segment cs; 6671 struct vcpu_vmx *vmx = to_vmx(vcpu); 6672 6673 if (!vmx->nested.vmxon) { 6674 kvm_queue_exception(vcpu, UD_VECTOR); 6675 return 0; 6676 } 6677 6678 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); 6679 if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) || 6680 (is_long_mode(vcpu) && !cs.l)) { 6681 kvm_queue_exception(vcpu, UD_VECTOR); 6682 return 0; 6683 } 6684 6685 if (vmx_get_cpl(vcpu)) { 6686 kvm_inject_gp(vcpu, 0); 6687 return 0; 6688 } 6689 6690 return 1; 6691} 6692 6693static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) 6694{ 6695 u32 exec_control; 6696 if (vmx->nested.current_vmptr == -1ull) 6697 return; 6698 6699 /* current_vmptr and current_vmcs12 are always set/reset together */ 6700 if (WARN_ON(vmx->nested.current_vmcs12 == NULL)) 6701 return; 6702 6703 if (enable_shadow_vmcs) { 6704 /* copy to memory all shadowed fields in case 6705 they were modified */ 6706 copy_shadow_to_vmcs12(vmx); 6707 vmx->nested.sync_shadow_vmcs = false; 6708 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 6709 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 6710 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 6711 vmcs_write64(VMCS_LINK_POINTER, -1ull); 6712 } 6713 vmx->nested.posted_intr_nv = -1; 6714 kunmap(vmx->nested.current_vmcs12_page); 6715 nested_release_page(vmx->nested.current_vmcs12_page); 6716 vmx->nested.current_vmptr = -1ull; 6717 vmx->nested.current_vmcs12 = NULL; 6718} 6719 6720/* 6721 * Free whatever needs to be freed from vmx->nested when L1 goes down, or 6722 * just stops using VMX. 
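 * handle_vmoff() below is one such caller, used when L1 executes VMXOFF.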
6723 */ 6724static void free_nested(struct vcpu_vmx *vmx) 6725{ 6726 if (!vmx->nested.vmxon) 6727 return; 6728 6729 vmx->nested.vmxon = false; 6730 nested_release_vmcs12(vmx); 6731 if (enable_shadow_vmcs) 6732 free_vmcs(vmx->nested.current_shadow_vmcs); 6733 /* Unpin physical memory we referred to in current vmcs02 */ 6734 if (vmx->nested.apic_access_page) { 6735 nested_release_page(vmx->nested.apic_access_page); 6736 vmx->nested.apic_access_page = NULL; 6737 } 6738 if (vmx->nested.virtual_apic_page) { 6739 nested_release_page(vmx->nested.virtual_apic_page); 6740 vmx->nested.virtual_apic_page = NULL; 6741 } 6742 if (vmx->nested.pi_desc_page) { 6743 kunmap(vmx->nested.pi_desc_page); 6744 nested_release_page(vmx->nested.pi_desc_page); 6745 vmx->nested.pi_desc_page = NULL; 6746 vmx->nested.pi_desc = NULL; 6747 } 6748 6749 nested_free_all_saved_vmcss(vmx); 6750} 6751 6752/* Emulate the VMXOFF instruction */ 6753static int handle_vmoff(struct kvm_vcpu *vcpu) 6754{ 6755 if (!nested_vmx_check_permission(vcpu)) 6756 return 1; 6757 free_nested(to_vmx(vcpu)); 6758 skip_emulated_instruction(vcpu); 6759 nested_vmx_succeed(vcpu); 6760 return 1; 6761} 6762 6763/* Emulate the VMCLEAR instruction */ 6764static int handle_vmclear(struct kvm_vcpu *vcpu) 6765{ 6766 struct vcpu_vmx *vmx = to_vmx(vcpu); 6767 gpa_t vmptr; 6768 struct vmcs12 *vmcs12; 6769 struct page *page; 6770 6771 if (!nested_vmx_check_permission(vcpu)) 6772 return 1; 6773 6774 if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr)) 6775 return 1; 6776 6777 if (vmptr == vmx->nested.current_vmptr) 6778 nested_release_vmcs12(vmx); 6779 6780 page = nested_get_page(vcpu, vmptr); 6781 if (page == NULL) { 6782 /* 6783 * For accurate processor emulation, VMCLEAR beyond available 6784 * physical memory should do nothing at all. However, it is 6785 * possible that a nested vmx bug, not a guest hypervisor bug, 6786 * resulted in this case, so let's shut down before doing any 6787 * more damage: 6788 */ 6789 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 6790 return 1; 6791 } 6792 vmcs12 = kmap(page); 6793 vmcs12->launch_state = 0; 6794 kunmap(page); 6795 nested_release_page(page); 6796 6797 nested_free_vmcs02(vmx, vmptr); 6798 6799 skip_emulated_instruction(vcpu); 6800 nested_vmx_succeed(vcpu); 6801 return 1; 6802} 6803 6804static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); 6805 6806/* Emulate the VMLAUNCH instruction */ 6807static int handle_vmlaunch(struct kvm_vcpu *vcpu) 6808{ 6809 return nested_vmx_run(vcpu, true); 6810} 6811 6812/* Emulate the VMRESUME instruction */ 6813static int handle_vmresume(struct kvm_vcpu *vcpu) 6814{ 6815 6816 return nested_vmx_run(vcpu, false); 6817} 6818 6819enum vmcs_field_type { 6820 VMCS_FIELD_TYPE_U16 = 0, 6821 VMCS_FIELD_TYPE_U64 = 1, 6822 VMCS_FIELD_TYPE_U32 = 2, 6823 VMCS_FIELD_TYPE_NATURAL_WIDTH = 3 6824}; 6825 6826static inline int vmcs_field_type(unsigned long field) 6827{ 6828 if (0x1 & field) /* the *_HIGH fields are all 32 bit */ 6829 return VMCS_FIELD_TYPE_U32; 6830 return (field >> 13) & 0x3 ; 6831} 6832 6833static inline int vmcs_field_readonly(unsigned long field) 6834{ 6835 return (((field >> 10) & 0x3) == 1); 6836} 6837 6838/* 6839 * Read a vmcs12 field. Since these can have varying lengths and we return 6840 * one type, we chose the biggest type (u64) and zero-extend the return value 6841 * to that size. Note that the caller, handle_vmread, might need to use only 6842 * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of 6843 * 64-bit fields are to be returned). 
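 * For example, a 16-bit field such as GUEST_ES_SELECTOR comes back
 * zero-extended into the low 16 bits of *ret; vmcs_field_type() above derives
 * each field's width from bits 14:13 of its encoding.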
6844 */ 6845static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, 6846 unsigned long field, u64 *ret) 6847{ 6848 short offset = vmcs_field_to_offset(field); 6849 char *p; 6850 6851 if (offset < 0) 6852 return offset; 6853 6854 p = ((char *)(get_vmcs12(vcpu))) + offset; 6855 6856 switch (vmcs_field_type(field)) { 6857 case VMCS_FIELD_TYPE_NATURAL_WIDTH: 6858 *ret = *((natural_width *)p); 6859 return 0; 6860 case VMCS_FIELD_TYPE_U16: 6861 *ret = *((u16 *)p); 6862 return 0; 6863 case VMCS_FIELD_TYPE_U32: 6864 *ret = *((u32 *)p); 6865 return 0; 6866 case VMCS_FIELD_TYPE_U64: 6867 *ret = *((u64 *)p); 6868 return 0; 6869 default: 6870 WARN_ON(1); 6871 return -ENOENT; 6872 } 6873} 6874 6875 6876static inline int vmcs12_write_any(struct kvm_vcpu *vcpu, 6877 unsigned long field, u64 field_value){ 6878 short offset = vmcs_field_to_offset(field); 6879 char *p = ((char *) get_vmcs12(vcpu)) + offset; 6880 if (offset < 0) 6881 return offset; 6882 6883 switch (vmcs_field_type(field)) { 6884 case VMCS_FIELD_TYPE_U16: 6885 *(u16 *)p = field_value; 6886 return 0; 6887 case VMCS_FIELD_TYPE_U32: 6888 *(u32 *)p = field_value; 6889 return 0; 6890 case VMCS_FIELD_TYPE_U64: 6891 *(u64 *)p = field_value; 6892 return 0; 6893 case VMCS_FIELD_TYPE_NATURAL_WIDTH: 6894 *(natural_width *)p = field_value; 6895 return 0; 6896 default: 6897 WARN_ON(1); 6898 return -ENOENT; 6899 } 6900 6901} 6902 6903static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) 6904{ 6905 int i; 6906 unsigned long field; 6907 u64 field_value; 6908 struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; 6909 const unsigned long *fields = shadow_read_write_fields; 6910 const int num_fields = max_shadow_read_write_fields; 6911 6912 preempt_disable(); 6913 6914 vmcs_load(shadow_vmcs); 6915 6916 for (i = 0; i < num_fields; i++) { 6917 field = fields[i]; 6918 switch (vmcs_field_type(field)) { 6919 case VMCS_FIELD_TYPE_U16: 6920 field_value = vmcs_read16(field); 6921 break; 6922 case VMCS_FIELD_TYPE_U32: 6923 field_value = vmcs_read32(field); 6924 break; 6925 case VMCS_FIELD_TYPE_U64: 6926 field_value = vmcs_read64(field); 6927 break; 6928 case VMCS_FIELD_TYPE_NATURAL_WIDTH: 6929 field_value = vmcs_readl(field); 6930 break; 6931 default: 6932 WARN_ON(1); 6933 continue; 6934 } 6935 vmcs12_write_any(&vmx->vcpu, field, field_value); 6936 } 6937 6938 vmcs_clear(shadow_vmcs); 6939 vmcs_load(vmx->loaded_vmcs->vmcs); 6940 6941 preempt_enable(); 6942} 6943 6944static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) 6945{ 6946 const unsigned long *fields[] = { 6947 shadow_read_write_fields, 6948 shadow_read_only_fields 6949 }; 6950 const int max_fields[] = { 6951 max_shadow_read_write_fields, 6952 max_shadow_read_only_fields 6953 }; 6954 int i, q; 6955 unsigned long field; 6956 u64 field_value = 0; 6957 struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs; 6958 6959 vmcs_load(shadow_vmcs); 6960 6961 for (q = 0; q < ARRAY_SIZE(fields); q++) { 6962 for (i = 0; i < max_fields[q]; i++) { 6963 field = fields[q][i]; 6964 vmcs12_read_any(&vmx->vcpu, field, &field_value); 6965 6966 switch (vmcs_field_type(field)) { 6967 case VMCS_FIELD_TYPE_U16: 6968 vmcs_write16(field, (u16)field_value); 6969 break; 6970 case VMCS_FIELD_TYPE_U32: 6971 vmcs_write32(field, (u32)field_value); 6972 break; 6973 case VMCS_FIELD_TYPE_U64: 6974 vmcs_write64(field, (u64)field_value); 6975 break; 6976 case VMCS_FIELD_TYPE_NATURAL_WIDTH: 6977 vmcs_writel(field, (long)field_value); 6978 break; 6979 default: 6980 WARN_ON(1); 6981 break; 6982 } 6983 } 6984 } 6985 6986 
vmcs_clear(shadow_vmcs); 6987 vmcs_load(vmx->loaded_vmcs->vmcs); 6988} 6989 6990/* 6991 * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was 6992 * used before) all generate the same failure when it is missing. 6993 */ 6994static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu) 6995{ 6996 struct vcpu_vmx *vmx = to_vmx(vcpu); 6997 if (vmx->nested.current_vmptr == -1ull) { 6998 nested_vmx_failInvalid(vcpu); 6999 skip_emulated_instruction(vcpu); 7000 return 0; 7001 } 7002 return 1; 7003} 7004 7005static int handle_vmread(struct kvm_vcpu *vcpu) 7006{ 7007 unsigned long field; 7008 u64 field_value; 7009 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 7010 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 7011 gva_t gva = 0; 7012 7013 if (!nested_vmx_check_permission(vcpu) || 7014 !nested_vmx_check_vmcs12(vcpu)) 7015 return 1; 7016 7017 /* Decode instruction info and find the field to read */ 7018 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 7019 /* Read the field, zero-extended to a u64 field_value */ 7020 if (vmcs12_read_any(vcpu, field, &field_value) < 0) { 7021 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 7022 skip_emulated_instruction(vcpu); 7023 return 1; 7024 } 7025 /* 7026 * Now copy part of this value to register or memory, as requested. 7027 * Note that the number of bits actually copied is 32 or 64 depending 7028 * on the guest's mode (32 or 64 bit), not on the given field's length. 7029 */ 7030 if (vmx_instruction_info & (1u << 10)) { 7031 kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), 7032 field_value); 7033 } else { 7034 if (get_vmx_mem_address(vcpu, exit_qualification, 7035 vmx_instruction_info, &gva)) 7036 return 1; 7037 /* _system ok, as nested_vmx_check_permission verified cpl=0 */ 7038 kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva, 7039 &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL); 7040 } 7041 7042 nested_vmx_succeed(vcpu); 7043 skip_emulated_instruction(vcpu); 7044 return 1; 7045} 7046 7047 7048static int handle_vmwrite(struct kvm_vcpu *vcpu) 7049{ 7050 unsigned long field; 7051 gva_t gva; 7052 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 7053 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 7054 /* The value to write might be 32 or 64 bits, depending on L1's long 7055 * mode, and eventually we need to write that into a field of several 7056 * possible lengths. The code below first zero-extends the value to 64 7057 * bit (field_value), and then copies only the approriate number of 7058 * bits into the vmcs12 field. 7059 */ 7060 u64 field_value = 0; 7061 struct x86_exception e; 7062 7063 if (!nested_vmx_check_permission(vcpu) || 7064 !nested_vmx_check_vmcs12(vcpu)) 7065 return 1; 7066 7067 if (vmx_instruction_info & (1u << 10)) 7068 field_value = kvm_register_readl(vcpu, 7069 (((vmx_instruction_info) >> 3) & 0xf)); 7070 else { 7071 if (get_vmx_mem_address(vcpu, exit_qualification, 7072 vmx_instruction_info, &gva)) 7073 return 1; 7074 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, 7075 &field_value, (is_64_bit_mode(vcpu) ? 
8 : 4), &e)) { 7076 kvm_inject_page_fault(vcpu, &e); 7077 return 1; 7078 } 7079 } 7080 7081 7082 field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 7083 if (vmcs_field_readonly(field)) { 7084 nested_vmx_failValid(vcpu, 7085 VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 7086 skip_emulated_instruction(vcpu); 7087 return 1; 7088 } 7089 7090 if (vmcs12_write_any(vcpu, field, field_value) < 0) { 7091 nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 7092 skip_emulated_instruction(vcpu); 7093 return 1; 7094 } 7095 7096 nested_vmx_succeed(vcpu); 7097 skip_emulated_instruction(vcpu); 7098 return 1; 7099} 7100 7101/* Emulate the VMPTRLD instruction */ 7102static int handle_vmptrld(struct kvm_vcpu *vcpu) 7103{ 7104 struct vcpu_vmx *vmx = to_vmx(vcpu); 7105 gpa_t vmptr; 7106 u32 exec_control; 7107 7108 if (!nested_vmx_check_permission(vcpu)) 7109 return 1; 7110 7111 if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr)) 7112 return 1; 7113 7114 if (vmx->nested.current_vmptr != vmptr) { 7115 struct vmcs12 *new_vmcs12; 7116 struct page *page; 7117 page = nested_get_page(vcpu, vmptr); 7118 if (page == NULL) { 7119 nested_vmx_failInvalid(vcpu); 7120 skip_emulated_instruction(vcpu); 7121 return 1; 7122 } 7123 new_vmcs12 = kmap(page); 7124 if (new_vmcs12->revision_id != VMCS12_REVISION) { 7125 kunmap(page); 7126 nested_release_page_clean(page); 7127 nested_vmx_failValid(vcpu, 7128 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 7129 skip_emulated_instruction(vcpu); 7130 return 1; 7131 } 7132 7133 nested_release_vmcs12(vmx); 7134 vmx->nested.current_vmptr = vmptr; 7135 vmx->nested.current_vmcs12 = new_vmcs12; 7136 vmx->nested.current_vmcs12_page = page; 7137 if (enable_shadow_vmcs) { 7138 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 7139 exec_control |= SECONDARY_EXEC_SHADOW_VMCS; 7140 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 7141 vmcs_write64(VMCS_LINK_POINTER, 7142 __pa(vmx->nested.current_shadow_vmcs)); 7143 vmx->nested.sync_shadow_vmcs = true; 7144 } 7145 } 7146 7147 nested_vmx_succeed(vcpu); 7148 skip_emulated_instruction(vcpu); 7149 return 1; 7150} 7151 7152/* Emulate the VMPTRST instruction */ 7153static int handle_vmptrst(struct kvm_vcpu *vcpu) 7154{ 7155 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 7156 u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 7157 gva_t vmcs_gva; 7158 struct x86_exception e; 7159 7160 if (!nested_vmx_check_permission(vcpu)) 7161 return 1; 7162 7163 if (get_vmx_mem_address(vcpu, exit_qualification, 7164 vmx_instruction_info, &vmcs_gva)) 7165 return 1; 7166 /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */ 7167 if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva, 7168 (void *)&to_vmx(vcpu)->nested.current_vmptr, 7169 sizeof(u64), &e)) { 7170 kvm_inject_page_fault(vcpu, &e); 7171 return 1; 7172 } 7173 nested_vmx_succeed(vcpu); 7174 skip_emulated_instruction(vcpu); 7175 return 1; 7176} 7177 7178/* Emulate the INVEPT instruction */ 7179static int handle_invept(struct kvm_vcpu *vcpu) 7180{ 7181 struct vcpu_vmx *vmx = to_vmx(vcpu); 7182 u32 vmx_instruction_info, types; 7183 unsigned long type; 7184 gva_t gva; 7185 struct x86_exception e; 7186 struct { 7187 u64 eptp, gpa; 7188 } operand; 7189 7190 if (!(vmx->nested.nested_vmx_secondary_ctls_high & 7191 SECONDARY_EXEC_ENABLE_EPT) || 7192 !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) { 7193 kvm_queue_exception(vcpu, UD_VECTOR); 7194 return 1; 7195 } 7196 7197 if 
(!nested_vmx_check_permission(vcpu)) 7198 return 1; 7199 7200 if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) { 7201 kvm_queue_exception(vcpu, UD_VECTOR); 7202 return 1; 7203 } 7204 7205 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 7206 type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); 7207 7208 types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 7209 7210 if (!(types & (1UL << type))) { 7211 nested_vmx_failValid(vcpu, 7212 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 7213 skip_emulated_instruction(vcpu); 7214 return 1; 7215 } 7216 7217 /* According to the Intel VMX instruction reference, the memory 7218 * operand is read even if it isn't needed (e.g., for type==global) 7219 */ 7220 if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), 7221 vmx_instruction_info, &gva)) 7222 return 1; 7223 if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand, 7224 sizeof(operand), &e)) { 7225 kvm_inject_page_fault(vcpu, &e); 7226 return 1; 7227 } 7228 7229 switch (type) { 7230 case VMX_EPT_EXTENT_GLOBAL: 7231 kvm_mmu_sync_roots(vcpu); 7232 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 7233 nested_vmx_succeed(vcpu); 7234 break; 7235 default: 7236 /* Trap single context invalidation invept calls */ 7237 BUG_ON(1); 7238 break; 7239 } 7240 7241 skip_emulated_instruction(vcpu); 7242 return 1; 7243} 7244 7245static int handle_invvpid(struct kvm_vcpu *vcpu) 7246{ 7247 kvm_queue_exception(vcpu, UD_VECTOR); 7248 return 1; 7249} 7250 7251static int handle_pml_full(struct kvm_vcpu *vcpu) 7252{ 7253 unsigned long exit_qualification; 7254 7255 trace_kvm_pml_full(vcpu->vcpu_id); 7256 7257 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 7258 7259 /* 7260 * PML buffer FULL happened while executing iret from NMI, 7261 * "blocked by NMI" bit has to be set before next VM entry. 7262 */ 7263 if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && 7264 cpu_has_virtual_nmis() && 7265 (exit_qualification & INTR_INFO_UNBLOCK_NMI)) 7266 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 7267 GUEST_INTR_STATE_NMI); 7268 7269 /* 7270 * PML buffer already flushed at beginning of VMEXIT. Nothing to do 7271 * here.., and there's no userspace involvement needed for PML. 7272 */ 7273 return 1; 7274} 7275 7276/* 7277 * The exit handlers return 1 if the exit was handled fully and guest execution 7278 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 7279 * to be done to userspace and return 0. 
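 * handle_interrupt_window() above is one example: it normally returns 1, but
 * returns 0 with exit_reason set to KVM_EXIT_IRQ_WINDOW_OPEN when userspace
 * has asked to inject the interrupt itself.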
7280 */ 7281static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { 7282 [EXIT_REASON_EXCEPTION_NMI] = handle_exception, 7283 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 7284 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 7285 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, 7286 [EXIT_REASON_IO_INSTRUCTION] = handle_io, 7287 [EXIT_REASON_CR_ACCESS] = handle_cr, 7288 [EXIT_REASON_DR_ACCESS] = handle_dr, 7289 [EXIT_REASON_CPUID] = handle_cpuid, 7290 [EXIT_REASON_MSR_READ] = handle_rdmsr, 7291 [EXIT_REASON_MSR_WRITE] = handle_wrmsr, 7292 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, 7293 [EXIT_REASON_HLT] = handle_halt, 7294 [EXIT_REASON_INVD] = handle_invd, 7295 [EXIT_REASON_INVLPG] = handle_invlpg, 7296 [EXIT_REASON_RDPMC] = handle_rdpmc, 7297 [EXIT_REASON_VMCALL] = handle_vmcall, 7298 [EXIT_REASON_VMCLEAR] = handle_vmclear, 7299 [EXIT_REASON_VMLAUNCH] = handle_vmlaunch, 7300 [EXIT_REASON_VMPTRLD] = handle_vmptrld, 7301 [EXIT_REASON_VMPTRST] = handle_vmptrst, 7302 [EXIT_REASON_VMREAD] = handle_vmread, 7303 [EXIT_REASON_VMRESUME] = handle_vmresume, 7304 [EXIT_REASON_VMWRITE] = handle_vmwrite, 7305 [EXIT_REASON_VMOFF] = handle_vmoff, 7306 [EXIT_REASON_VMON] = handle_vmon, 7307 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 7308 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 7309 [EXIT_REASON_APIC_WRITE] = handle_apic_write, 7310 [EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced, 7311 [EXIT_REASON_WBINVD] = handle_wbinvd, 7312 [EXIT_REASON_XSETBV] = handle_xsetbv, 7313 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 7314 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 7315 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 7316 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 7317 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, 7318 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, 7319 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, 7320 [EXIT_REASON_INVEPT] = handle_invept, 7321 [EXIT_REASON_INVVPID] = handle_invvpid, 7322 [EXIT_REASON_XSAVES] = handle_xsaves, 7323 [EXIT_REASON_XRSTORS] = handle_xrstors, 7324 [EXIT_REASON_PML_FULL] = handle_pml_full, 7325}; 7326 7327static const int kvm_vmx_max_exit_handlers = 7328 ARRAY_SIZE(kvm_vmx_exit_handlers); 7329 7330static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 7331 struct vmcs12 *vmcs12) 7332{ 7333 unsigned long exit_qualification; 7334 gpa_t bitmap, last_bitmap; 7335 unsigned int port; 7336 int size; 7337 u8 b; 7338 7339 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 7340 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 7341 7342 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 7343 7344 port = exit_qualification >> 16; 7345 size = (exit_qualification & 7) + 1; 7346 7347 last_bitmap = (gpa_t)-1; 7348 b = -1; 7349 7350 while (size > 0) { 7351 if (port < 0x8000) 7352 bitmap = vmcs12->io_bitmap_a; 7353 else if (port < 0x10000) 7354 bitmap = vmcs12->io_bitmap_b; 7355 else 7356 return true; 7357 bitmap += (port & 0x7fff) / 8; 7358 7359 if (last_bitmap != bitmap) 7360 if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1)) 7361 return true; 7362 if (b & (1 << (port & 7))) 7363 return true; 7364 7365 port++; 7366 size--; 7367 last_bitmap = bitmap; 7368 } 7369 7370 return false; 7371} 7372 7373/* 7374 * Return 1 if we should exit from L2 to L1 to handle an MSR access access, 7375 * rather than handle it ourselves in L0. 
I.e., check whether L1 expressed 7376 * disinterest in the current event (read or write a specific MSR) by using an 7377 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 7378 */ 7379static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 7380 struct vmcs12 *vmcs12, u32 exit_reason) 7381{ 7382 u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; 7383 gpa_t bitmap; 7384 7385 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 7386 return true; 7387 7388 /* 7389 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 7390 * for the four combinations of read/write and low/high MSR numbers. 7391 * First we need to figure out which of the four to use: 7392 */ 7393 bitmap = vmcs12->msr_bitmap; 7394 if (exit_reason == EXIT_REASON_MSR_WRITE) 7395 bitmap += 2048; 7396 if (msr_index >= 0xc0000000) { 7397 msr_index -= 0xc0000000; 7398 bitmap += 1024; 7399 } 7400 7401 /* Then read the msr_index'th bit from this bitmap: */ 7402 if (msr_index < 1024*8) { 7403 unsigned char b; 7404 if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1)) 7405 return true; 7406 return 1 & (b >> (msr_index & 7)); 7407 } else 7408 return true; /* let L1 handle the wrong parameter */ 7409} 7410 7411/* 7412 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 7413 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 7414 * intercept (via guest_host_mask etc.) the current event. 7415 */ 7416static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 7417 struct vmcs12 *vmcs12) 7418{ 7419 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 7420 int cr = exit_qualification & 15; 7421 int reg = (exit_qualification >> 8) & 15; 7422 unsigned long val = kvm_register_readl(vcpu, reg); 7423 7424 switch ((exit_qualification >> 4) & 3) { 7425 case 0: /* mov to cr */ 7426 switch (cr) { 7427 case 0: 7428 if (vmcs12->cr0_guest_host_mask & 7429 (val ^ vmcs12->cr0_read_shadow)) 7430 return true; 7431 break; 7432 case 3: 7433 if ((vmcs12->cr3_target_count >= 1 && 7434 vmcs12->cr3_target_value0 == val) || 7435 (vmcs12->cr3_target_count >= 2 && 7436 vmcs12->cr3_target_value1 == val) || 7437 (vmcs12->cr3_target_count >= 3 && 7438 vmcs12->cr3_target_value2 == val) || 7439 (vmcs12->cr3_target_count >= 4 && 7440 vmcs12->cr3_target_value3 == val)) 7441 return false; 7442 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 7443 return true; 7444 break; 7445 case 4: 7446 if (vmcs12->cr4_guest_host_mask & 7447 (vmcs12->cr4_read_shadow ^ val)) 7448 return true; 7449 break; 7450 case 8: 7451 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 7452 return true; 7453 break; 7454 } 7455 break; 7456 case 2: /* clts */ 7457 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 7458 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 7459 return true; 7460 break; 7461 case 1: /* mov from cr */ 7462 switch (cr) { 7463 case 3: 7464 if (vmcs12->cpu_based_vm_exec_control & 7465 CPU_BASED_CR3_STORE_EXITING) 7466 return true; 7467 break; 7468 case 8: 7469 if (vmcs12->cpu_based_vm_exec_control & 7470 CPU_BASED_CR8_STORE_EXITING) 7471 return true; 7472 break; 7473 } 7474 break; 7475 case 3: /* lmsw */ 7476 /* 7477 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 7478 * cr0. Other attempted changes are ignored, with no exit. 
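		 * For example, if L1 owns CR0.PE (bit 0 of
		 * cr0_guest_host_mask) and the read shadow has PE clear, an
		 * lmsw that sets PE must exit to L1, while an attempt to
		 * clear PE is ignored by the CPU and needs no exit.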
7479 */ 7480 if (vmcs12->cr0_guest_host_mask & 0xe & 7481 (val ^ vmcs12->cr0_read_shadow)) 7482 return true; 7483 if ((vmcs12->cr0_guest_host_mask & 0x1) && 7484 !(vmcs12->cr0_read_shadow & 0x1) && 7485 (val & 0x1)) 7486 return true; 7487 break; 7488 } 7489 return false; 7490} 7491 7492/* 7493 * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we 7494 * should handle it ourselves in L0 (and then continue L2). Only call this 7495 * when in is_guest_mode (L2). 7496 */ 7497static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) 7498{ 7499 u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 7500 struct vcpu_vmx *vmx = to_vmx(vcpu); 7501 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 7502 u32 exit_reason = vmx->exit_reason; 7503 7504 trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, 7505 vmcs_readl(EXIT_QUALIFICATION), 7506 vmx->idt_vectoring_info, 7507 intr_info, 7508 vmcs_read32(VM_EXIT_INTR_ERROR_CODE), 7509 KVM_ISA_VMX); 7510 7511 if (vmx->nested.nested_run_pending) 7512 return false; 7513 7514 if (unlikely(vmx->fail)) { 7515 pr_info_ratelimited("%s failed vm entry %x\n", __func__, 7516 vmcs_read32(VM_INSTRUCTION_ERROR)); 7517 return true; 7518 } 7519 7520 switch (exit_reason) { 7521 case EXIT_REASON_EXCEPTION_NMI: 7522 if (!is_exception(intr_info)) 7523 return false; 7524 else if (is_page_fault(intr_info)) 7525 return enable_ept; 7526 else if (is_no_device(intr_info) && 7527 !(vmcs12->guest_cr0 & X86_CR0_TS)) 7528 return false; 7529 return vmcs12->exception_bitmap & 7530 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 7531 case EXIT_REASON_EXTERNAL_INTERRUPT: 7532 return false; 7533 case EXIT_REASON_TRIPLE_FAULT: 7534 return true; 7535 case EXIT_REASON_PENDING_INTERRUPT: 7536 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING); 7537 case EXIT_REASON_NMI_WINDOW: 7538 return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING); 7539 case EXIT_REASON_TASK_SWITCH: 7540 return true; 7541 case EXIT_REASON_CPUID: 7542 if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa) 7543 return false; 7544 return true; 7545 case EXIT_REASON_HLT: 7546 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 7547 case EXIT_REASON_INVD: 7548 return true; 7549 case EXIT_REASON_INVLPG: 7550 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 7551 case EXIT_REASON_RDPMC: 7552 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 7553 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 7554 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 7555 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 7556 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 7557 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: 7558 case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: 7559 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 7560 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 7561 /* 7562 * VMX instructions trap unconditionally. This allows L1 to 7563 * emulate them for its L2 guest, i.e., allows 3-level nesting! 
 */
		return true;
	case EXIT_REASON_CR_ACCESS:
		return nested_vmx_exit_handled_cr(vcpu, vmcs12);
	case EXIT_REASON_DR_ACCESS:
		return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
	case EXIT_REASON_IO_INSTRUCTION:
		return nested_vmx_exit_handled_io(vcpu, vmcs12);
	case EXIT_REASON_MSR_READ:
	case EXIT_REASON_MSR_WRITE:
		return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
	case EXIT_REASON_INVALID_STATE:
		return true;
	case EXIT_REASON_MWAIT_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
	case EXIT_REASON_MONITOR_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
	case EXIT_REASON_PAUSE_INSTRUCTION:
		return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
			nested_cpu_has2(vmcs12,
				SECONDARY_EXEC_PAUSE_LOOP_EXITING);
	case EXIT_REASON_MCE_DURING_VMENTRY:
		return false;
	case EXIT_REASON_TPR_BELOW_THRESHOLD:
		return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
	case EXIT_REASON_APIC_ACCESS:
		return nested_cpu_has2(vmcs12,
			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
	case EXIT_REASON_APIC_WRITE:
	case EXIT_REASON_EOI_INDUCED:
		/* apic_write and eoi_induced should exit unconditionally. */
		return true;
	case EXIT_REASON_EPT_VIOLATION:
		/*
		 * L0 always deals with the EPT violation. If nested EPT is
		 * used, and the nested mmu code discovers that the address is
		 * missing in the guest EPT table (EPT12), the EPT violation
		 * will be injected with nested_ept_inject_page_fault().
		 */
		return false;
	case EXIT_REASON_EPT_MISCONFIG:
		/*
		 * L2 never uses L1's EPT directly, but rather L0's own EPT
		 * table (shadow on EPT) or a merged EPT table that L0 built
		 * (EPT on EPT). So any problems with the structure of the
		 * table are L0's fault.
		 */
		return false;
	case EXIT_REASON_WBINVD:
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
	case EXIT_REASON_XSETBV:
		return true;
	case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
		/*
		 * This should never happen, since it is not possible to
		 * set XSS to a non-zero value---neither in L1 nor in L2.
		 * If it were, XSS would have to be checked against
		 * the XSS exit bitmap in vmcs12.
		 */
		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
	default:
		return true;
	}
}

static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
{
	*info1 = vmcs_readl(EXIT_QUALIFICATION);
	*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
}

static int vmx_enable_pml(struct vcpu_vmx *vmx)
{
	struct page *pml_pg;
	u32 exec_control;

	pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!pml_pg)
		return -ENOMEM;

	vmx->pml_pg = pml_pg;

	vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
	vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);

	exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
	exec_control |= SECONDARY_EXEC_ENABLE_PML;
	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);

	return 0;
}

static void vmx_disable_pml(struct vcpu_vmx *vmx)
{
	u32 exec_control;

	ASSERT(vmx->pml_pg);
	__free_page(vmx->pml_pg);
	vmx->pml_pg = NULL;

	exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
	exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
}

static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx)
{
	struct kvm *kvm = vmx->vcpu.kvm;
	u64 *pml_buf;
	u16 pml_idx;

	pml_idx = vmcs_read16(GUEST_PML_INDEX);

	/* Do nothing if PML buffer is empty */
	if (pml_idx == (PML_ENTITY_NUM - 1))
		return;

	/* PML index always points to next available PML buffer entity */
	if (pml_idx >= PML_ENTITY_NUM)
		pml_idx = 0;
	else
		pml_idx++;

	pml_buf = page_address(vmx->pml_pg);
	for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
		u64 gpa;

		gpa = pml_buf[pml_idx];
		WARN_ON(gpa & (PAGE_SIZE - 1));
		mark_page_dirty(kvm, gpa >> PAGE_SHIFT);
	}

	/* reset PML index */
	vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
}

/*
 * Flush all vcpus' PML buffers and update the logged GPAs in dirty_bitmap.
 * Called before reporting dirty_bitmap to userspace.
 */
static void kvm_flush_pml_buffers(struct kvm *kvm)
{
	int i;
	struct kvm_vcpu *vcpu;
	/*
	 * We only need to kick the vcpus out of guest mode here, as the PML
	 * buffer is flushed at the beginning of every VMEXIT and only vcpus
	 * running in guest mode can have unflushed GPAs in their PML buffer.
	 */
	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_vcpu_kick(vcpu);
}

/*
 * The guest has exited. See if we can fix it or if we need userspace
 * assistance.
 */
static int vmx_handle_exit(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 exit_reason = vmx->exit_reason;
	u32 vectoring_info = vmx->idt_vectoring_info;

	/*
	 * Flush the PML buffer of logged GPAs; this keeps dirty_bitmap more
	 * up to date.  Another benefit is that, in kvm_vm_ioctl_get_dirty_log,
	 * before querying dirty_bitmap we only need to kick all vcpus out of
	 * guest mode, since for a vcpu already in root mode the PML buffer
	 * must have been flushed already.
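	 * (The flush below relies on the PML index starting at
	 * PML_ENTITY_NUM - 1 and pointing at the next free slot: the CPU
	 * writes each GPA at the current index and then decrements it, so the
	 * valid entries are pml_buf[pml_idx + 1 .. PML_ENTITY_NUM - 1] and
	 * vmx_flush_pml_buffer() walks exactly that range before resetting
	 * the index.)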
	 */
	if (enable_pml)
		vmx_flush_pml_buffer(vmx);

	/* If guest state is invalid, start emulating */
	if (vmx->emulation_required)
		return handle_invalid_guest_state(vcpu);

	if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
		nested_vmx_vmexit(vcpu, exit_reason,
				  vmcs_read32(VM_EXIT_INTR_INFO),
				  vmcs_readl(EXIT_QUALIFICATION));
		return 1;
	}

	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason
			= exit_reason;
		return 0;
	}

	if (unlikely(vmx->fail)) {
		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		vcpu->run->fail_entry.hardware_entry_failure_reason
			= vmcs_read32(VM_INSTRUCTION_ERROR);
		return 0;
	}

	/*
	 * Note:
	 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by
	 * event delivery, since that indicates the guest is accessing MMIO.
	 * The vm-exit would be triggered again after returning to the guest,
	 * causing an infinite loop.
	 */
	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
	    (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
	     exit_reason != EXIT_REASON_EPT_VIOLATION &&
	     exit_reason != EXIT_REASON_TASK_SWITCH)) {
		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
		vcpu->run->internal.ndata = 2;
		vcpu->run->internal.data[0] = vectoring_info;
		vcpu->run->internal.data[1] = exit_reason;
		return 0;
	}

	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
	    !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
					get_vmcs12(vcpu))))) {
		if (vmx_interrupt_allowed(vcpu)) {
			vmx->soft_vnmi_blocked = 0;
		} else if (vmx->vnmi_blocked_time > 1000000000LL &&
			   vcpu->arch.nmi_pending) {
			/*
			 * This CPU doesn't help us find the end of an
			 * NMI-blocked window if the guest runs with IRQs
			 * disabled. So we pull the trigger after 1 s of
			 * futile waiting, but inform the user about this.
			 */
			printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
			       "state on VCPU %d after 1 s timeout\n",
			       __func__, vcpu->vcpu_id);
			vmx->soft_vnmi_blocked = 0;
		}
	}

	if (exit_reason < kvm_vmx_max_exit_handlers
	    && kvm_vmx_exit_handlers[exit_reason])
		return kvm_vmx_exit_handlers[exit_reason](vcpu);
	else {
		WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason);
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}
}

static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	if (is_guest_mode(vcpu) &&
		nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return;

	if (irr == -1 || tpr < irr) {
		vmcs_write32(TPR_THRESHOLD, 0);
		return;
	}

	vmcs_write32(TPR_THRESHOLD, irr);
}

static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
{
	u32 sec_exec_control;

	/*
	 * There is no point in enabling virtualize x2apic mode without
	 * enabling apicv.
	 */
	if (!cpu_has_vmx_virtualize_x2apic_mode() ||
	    !vmx_vm_has_apicv(vcpu->kvm))
		return;

	if (!vm_need_tpr_shadow(vcpu->kvm))
		return;

	sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);

	if (set) {
		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
		sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
	} else {
		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
		sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
	}
	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);

	vmx_set_msr_bitmap(vcpu);
}

static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * Currently we do not handle the nested case where L2 has an
	 * APIC access page of its own; that page is still pinned.
	 * Hence, we skip the case where the VCPU is in guest mode _and_
	 * L1 prepared an APIC access page for L2.
	 *
	 * For the case where L1 and L2 share the same APIC access page
	 * (flexpriority=Y but SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES clear
	 * in the vmcs12), this function will only update either the vmcs01
	 * or the vmcs02. If the former, the vmcs02 will be updated by
	 * prepare_vmcs02. If the latter, the vmcs01 will be updated in
	 * the next L2->L1 exit.
7873 */ 7874 if (!is_guest_mode(vcpu) || 7875 !nested_cpu_has2(vmx->nested.current_vmcs12, 7876 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 7877 vmcs_write64(APIC_ACCESS_ADDR, hpa); 7878} 7879 7880static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) 7881{ 7882 u16 status; 7883 u8 old; 7884 7885 if (isr == -1) 7886 isr = 0; 7887 7888 status = vmcs_read16(GUEST_INTR_STATUS); 7889 old = status >> 8; 7890 if (isr != old) { 7891 status &= 0xff; 7892 status |= isr << 8; 7893 vmcs_write16(GUEST_INTR_STATUS, status); 7894 } 7895} 7896 7897static void vmx_set_rvi(int vector) 7898{ 7899 u16 status; 7900 u8 old; 7901 7902 if (vector == -1) 7903 vector = 0; 7904 7905 status = vmcs_read16(GUEST_INTR_STATUS); 7906 old = (u8)status & 0xff; 7907 if ((u8)vector != old) { 7908 status &= ~0xff; 7909 status |= (u8)vector; 7910 vmcs_write16(GUEST_INTR_STATUS, status); 7911 } 7912} 7913 7914static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) 7915{ 7916 if (!is_guest_mode(vcpu)) { 7917 vmx_set_rvi(max_irr); 7918 return; 7919 } 7920 7921 if (max_irr == -1) 7922 return; 7923 7924 /* 7925 * In guest mode. If a vmexit is needed, vmx_check_nested_events 7926 * handles it. 7927 */ 7928 if (nested_exit_on_intr(vcpu)) 7929 return; 7930 7931 /* 7932 * Else, fall back to pre-APICv interrupt injection since L2 7933 * is run without virtual interrupt delivery. 7934 */ 7935 if (!kvm_event_needs_reinjection(vcpu) && 7936 vmx_interrupt_allowed(vcpu)) { 7937 kvm_queue_interrupt(vcpu, max_irr, false); 7938 vmx_inject_irq(vcpu); 7939 } 7940} 7941 7942static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) 7943{ 7944 if (!vmx_vm_has_apicv(vcpu->kvm)) 7945 return; 7946 7947 vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]); 7948 vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]); 7949 vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]); 7950 vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]); 7951} 7952 7953static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) 7954{ 7955 u32 exit_intr_info; 7956 7957 if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY 7958 || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)) 7959 return; 7960 7961 vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 7962 exit_intr_info = vmx->exit_intr_info; 7963 7964 /* Handle machine checks before interrupts are enabled */ 7965 if (is_machine_check(exit_intr_info)) 7966 kvm_machine_check(); 7967 7968 /* We need to handle NMIs before interrupts are enabled */ 7969 if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && 7970 (exit_intr_info & INTR_INFO_VALID_MASK)) { 7971 kvm_before_handle_nmi(&vmx->vcpu); 7972 asm("int $2"); 7973 kvm_after_handle_nmi(&vmx->vcpu); 7974 } 7975} 7976 7977static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) 7978{ 7979 u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 7980 7981 /* 7982 * If external interrupt exists, IF bit is set in rflags/eflags on the 7983 * interrupt stack frame, and interrupt will be enabled on a return 7984 * from interrupt handler. 
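	 * The inline asm below emulates that delivery by hand: it looks up the
	 * host IDT entry for the exiting vector, builds an interrupt stack
	 * frame (SS and RSP on 64-bit, EFLAGS with IF forced on, then CS) and
	 * calls the handler, so the handler's iret re-enables interrupts just
	 * as it would after a real external interrupt.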
7985 */ 7986 if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) 7987 == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { 7988 unsigned int vector; 7989 unsigned long entry; 7990 gate_desc *desc; 7991 struct vcpu_vmx *vmx = to_vmx(vcpu); 7992#ifdef CONFIG_X86_64 7993 unsigned long tmp; 7994#endif 7995 7996 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 7997 desc = (gate_desc *)vmx->host_idt_base + vector; 7998 entry = gate_offset(*desc); 7999 asm volatile( 8000#ifdef CONFIG_X86_64 8001 "mov %%" _ASM_SP ", %[sp]\n\t" 8002 "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t" 8003 "push $%c[ss]\n\t" 8004 "push %[sp]\n\t" 8005#endif 8006 "pushf\n\t" 8007 "orl $0x200, (%%" _ASM_SP ")\n\t" 8008 __ASM_SIZE(push) " $%c[cs]\n\t" 8009 "call *%[entry]\n\t" 8010 : 8011#ifdef CONFIG_X86_64 8012 [sp]"=&r"(tmp) 8013#endif 8014 : 8015 [entry]"r"(entry), 8016 [ss]"i"(__KERNEL_DS), 8017 [cs]"i"(__KERNEL_CS) 8018 ); 8019 } else 8020 local_irq_enable(); 8021} 8022 8023static bool vmx_mpx_supported(void) 8024{ 8025 return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && 8026 (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); 8027} 8028 8029static bool vmx_xsaves_supported(void) 8030{ 8031 return vmcs_config.cpu_based_2nd_exec_ctrl & 8032 SECONDARY_EXEC_XSAVES; 8033} 8034 8035static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) 8036{ 8037 u32 exit_intr_info; 8038 bool unblock_nmi; 8039 u8 vector; 8040 bool idtv_info_valid; 8041 8042 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; 8043 8044 if (cpu_has_virtual_nmis()) { 8045 if (vmx->nmi_known_unmasked) 8046 return; 8047 /* 8048 * Can't use vmx->exit_intr_info since we're not sure what 8049 * the exit reason is. 8050 */ 8051 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 8052 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; 8053 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 8054 /* 8055 * SDM 3: 27.7.1.2 (September 2008) 8056 * Re-set bit "block by NMI" before VM entry if vmexit caused by 8057 * a guest IRET fault. 8058 * SDM 3: 23.2.2 (September 2008) 8059 * Bit 12 is undefined in any of the following cases: 8060 * If the VM exit sets the valid bit in the IDT-vectoring 8061 * information field. 8062 * If the VM exit is due to a double fault. 
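		 * In those cases bit 12 cannot be trusted, which is why the
		 * code below only re-sets "blocking by NMI" when the exit
		 * interruption info is valid, the unblocked-by-IRET bit is
		 * set, the vector is not #DF and no IDT-vectoring info is
		 * pending; otherwise it just refreshes nmi_known_unmasked
		 * from the guest interruptibility state.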
8063 */ 8064 if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && 8065 vector != DF_VECTOR && !idtv_info_valid) 8066 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 8067 GUEST_INTR_STATE_NMI); 8068 else 8069 vmx->nmi_known_unmasked = 8070 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) 8071 & GUEST_INTR_STATE_NMI); 8072 } else if (unlikely(vmx->soft_vnmi_blocked)) 8073 vmx->vnmi_blocked_time += 8074 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); 8075} 8076 8077static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, 8078 u32 idt_vectoring_info, 8079 int instr_len_field, 8080 int error_code_field) 8081{ 8082 u8 vector; 8083 int type; 8084 bool idtv_info_valid; 8085 8086 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 8087 8088 vcpu->arch.nmi_injected = false; 8089 kvm_clear_exception_queue(vcpu); 8090 kvm_clear_interrupt_queue(vcpu); 8091 8092 if (!idtv_info_valid) 8093 return; 8094 8095 kvm_make_request(KVM_REQ_EVENT, vcpu); 8096 8097 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 8098 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 8099 8100 switch (type) { 8101 case INTR_TYPE_NMI_INTR: 8102 vcpu->arch.nmi_injected = true; 8103 /* 8104 * SDM 3: 27.7.1.2 (September 2008) 8105 * Clear bit "block by NMI" before VM entry if a NMI 8106 * delivery faulted. 8107 */ 8108 vmx_set_nmi_mask(vcpu, false); 8109 break; 8110 case INTR_TYPE_SOFT_EXCEPTION: 8111 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 8112 /* fall through */ 8113 case INTR_TYPE_HARD_EXCEPTION: 8114 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { 8115 u32 err = vmcs_read32(error_code_field); 8116 kvm_requeue_exception_e(vcpu, vector, err); 8117 } else 8118 kvm_requeue_exception(vcpu, vector); 8119 break; 8120 case INTR_TYPE_SOFT_INTR: 8121 vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field); 8122 /* fall through */ 8123 case INTR_TYPE_EXT_INTR: 8124 kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR); 8125 break; 8126 default: 8127 break; 8128 } 8129} 8130 8131static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 8132{ 8133 __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info, 8134 VM_EXIT_INSTRUCTION_LEN, 8135 IDT_VECTORING_ERROR_CODE); 8136} 8137 8138static void vmx_cancel_injection(struct kvm_vcpu *vcpu) 8139{ 8140 __vmx_complete_interrupts(vcpu, 8141 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), 8142 VM_ENTRY_INSTRUCTION_LEN, 8143 VM_ENTRY_EXCEPTION_ERROR_CODE); 8144 8145 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 8146} 8147 8148static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) 8149{ 8150 int i, nr_msrs; 8151 struct perf_guest_switch_msr *msrs; 8152 8153 msrs = perf_guest_get_msrs(&nr_msrs); 8154 8155 if (!msrs) 8156 return; 8157 8158 for (i = 0; i < nr_msrs; i++) 8159 if (msrs[i].host == msrs[i].guest) 8160 clear_atomic_switch_msr(vmx, msrs[i].msr); 8161 else 8162 add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, 8163 msrs[i].host); 8164} 8165 8166static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) 8167{ 8168 struct vcpu_vmx *vmx = to_vmx(vcpu); 8169 unsigned long debugctlmsr, cr4; 8170 8171 /* Record the guest's net vcpu time for enforced NMI injections. 
*/ 8172 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) 8173 vmx->entry_time = ktime_get(); 8174 8175 /* Don't enter VMX if guest state is invalid, let the exit handler 8176 start emulation until we arrive back to a valid state */ 8177 if (vmx->emulation_required) 8178 return; 8179 8180 if (vmx->ple_window_dirty) { 8181 vmx->ple_window_dirty = false; 8182 vmcs_write32(PLE_WINDOW, vmx->ple_window); 8183 } 8184 8185 if (vmx->nested.sync_shadow_vmcs) { 8186 copy_vmcs12_to_shadow(vmx); 8187 vmx->nested.sync_shadow_vmcs = false; 8188 } 8189 8190 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) 8191 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 8192 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) 8193 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); 8194 8195 cr4 = cr4_read_shadow(); 8196 if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { 8197 vmcs_writel(HOST_CR4, cr4); 8198 vmx->host_state.vmcs_host_cr4 = cr4; 8199 } 8200 8201 /* When single-stepping over STI and MOV SS, we must clear the 8202 * corresponding interruptibility bits in the guest state. Otherwise 8203 * vmentry fails as it then expects bit 14 (BS) in pending debug 8204 * exceptions being set, but that's not correct for the guest debugging 8205 * case. */ 8206 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 8207 vmx_set_interrupt_shadow(vcpu, 0); 8208 8209 atomic_switch_perf_msrs(vmx); 8210 debugctlmsr = get_debugctlmsr(); 8211 8212 vmx->__launched = vmx->loaded_vmcs->launched; 8213 asm( 8214 /* Store host registers */ 8215 "push %%" _ASM_DX "; push %%" _ASM_BP ";" 8216 "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */ 8217 "push %%" _ASM_CX " \n\t" 8218 "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t" 8219 "je 1f \n\t" 8220 "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t" 8221 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" 8222 "1: \n\t" 8223 /* Reload cr2 if changed */ 8224 "mov %c[cr2](%0), %%" _ASM_AX " \n\t" 8225 "mov %%cr2, %%" _ASM_DX " \n\t" 8226 "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t" 8227 "je 2f \n\t" 8228 "mov %%" _ASM_AX", %%cr2 \n\t" 8229 "2: \n\t" 8230 /* Check if vmlaunch of vmresume is needed */ 8231 "cmpl $0, %c[launched](%0) \n\t" 8232 /* Load guest registers. Don't clobber flags. 
*/ 8233 "mov %c[rax](%0), %%" _ASM_AX " \n\t" 8234 "mov %c[rbx](%0), %%" _ASM_BX " \n\t" 8235 "mov %c[rdx](%0), %%" _ASM_DX " \n\t" 8236 "mov %c[rsi](%0), %%" _ASM_SI " \n\t" 8237 "mov %c[rdi](%0), %%" _ASM_DI " \n\t" 8238 "mov %c[rbp](%0), %%" _ASM_BP " \n\t" 8239#ifdef CONFIG_X86_64 8240 "mov %c[r8](%0), %%r8 \n\t" 8241 "mov %c[r9](%0), %%r9 \n\t" 8242 "mov %c[r10](%0), %%r10 \n\t" 8243 "mov %c[r11](%0), %%r11 \n\t" 8244 "mov %c[r12](%0), %%r12 \n\t" 8245 "mov %c[r13](%0), %%r13 \n\t" 8246 "mov %c[r14](%0), %%r14 \n\t" 8247 "mov %c[r15](%0), %%r15 \n\t" 8248#endif 8249 "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */ 8250 8251 /* Enter guest mode */ 8252 "jne 1f \n\t" 8253 __ex(ASM_VMX_VMLAUNCH) "\n\t" 8254 "jmp 2f \n\t" 8255 "1: " __ex(ASM_VMX_VMRESUME) "\n\t" 8256 "2: " 8257 /* Save guest registers, load host registers, keep flags */ 8258 "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t" 8259 "pop %0 \n\t" 8260 "mov %%" _ASM_AX ", %c[rax](%0) \n\t" 8261 "mov %%" _ASM_BX ", %c[rbx](%0) \n\t" 8262 __ASM_SIZE(pop) " %c[rcx](%0) \n\t" 8263 "mov %%" _ASM_DX ", %c[rdx](%0) \n\t" 8264 "mov %%" _ASM_SI ", %c[rsi](%0) \n\t" 8265 "mov %%" _ASM_DI ", %c[rdi](%0) \n\t" 8266 "mov %%" _ASM_BP ", %c[rbp](%0) \n\t" 8267#ifdef CONFIG_X86_64 8268 "mov %%r8, %c[r8](%0) \n\t" 8269 "mov %%r9, %c[r9](%0) \n\t" 8270 "mov %%r10, %c[r10](%0) \n\t" 8271 "mov %%r11, %c[r11](%0) \n\t" 8272 "mov %%r12, %c[r12](%0) \n\t" 8273 "mov %%r13, %c[r13](%0) \n\t" 8274 "mov %%r14, %c[r14](%0) \n\t" 8275 "mov %%r15, %c[r15](%0) \n\t" 8276#endif 8277 "mov %%cr2, %%" _ASM_AX " \n\t" 8278 "mov %%" _ASM_AX ", %c[cr2](%0) \n\t" 8279 8280 "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" 8281 "setbe %c[fail](%0) \n\t" 8282 ".pushsection .rodata \n\t" 8283 ".global vmx_return \n\t" 8284 "vmx_return: " _ASM_PTR " 2b \n\t" 8285 ".popsection" 8286 : : "c"(vmx), "d"((unsigned long)HOST_RSP), 8287 [launched]"i"(offsetof(struct vcpu_vmx, __launched)), 8288 [fail]"i"(offsetof(struct vcpu_vmx, fail)), 8289 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), 8290 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), 8291 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), 8292 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), 8293 [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])), 8294 [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])), 8295 [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])), 8296 [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])), 8297#ifdef CONFIG_X86_64 8298 [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])), 8299 [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])), 8300 [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])), 8301 [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])), 8302 [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])), 8303 [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])), 8304 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), 8305 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), 8306#endif 8307 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), 8308 [wordsize]"i"(sizeof(ulong)) 8309 : "cc", "memory" 8310#ifdef CONFIG_X86_64 8311 , "rax", "rbx", "rdi", "rsi" 8312 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" 8313#else 8314 , "eax", "ebx", "edi", "esi" 8315#endif 8316 ); 8317 8318 /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. 
Restore it if needed */ 8319 if (debugctlmsr) 8320 update_debugctlmsr(debugctlmsr); 8321 8322#ifndef CONFIG_X86_64 8323 /* 8324 * The sysexit path does not restore ds/es, so we must set them to 8325 * a reasonable value ourselves. 8326 * 8327 * We can't defer this to vmx_load_host_state() since that function 8328 * may be executed in interrupt context, which saves and restore segments 8329 * around it, nullifying its effect. 8330 */ 8331 loadsegment(ds, __USER_DS); 8332 loadsegment(es, __USER_DS); 8333#endif 8334 8335 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) 8336 | (1 << VCPU_EXREG_RFLAGS) 8337 | (1 << VCPU_EXREG_PDPTR) 8338 | (1 << VCPU_EXREG_SEGMENTS) 8339 | (1 << VCPU_EXREG_CR3)); 8340 vcpu->arch.regs_dirty = 0; 8341 8342 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 8343 8344 vmx->loaded_vmcs->launched = 1; 8345 8346 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); 8347 trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX); 8348 8349 /* 8350 * the KVM_REQ_EVENT optimization bit is only on for one entry, and if 8351 * we did not inject a still-pending event to L1 now because of 8352 * nested_run_pending, we need to re-enable this bit. 8353 */ 8354 if (vmx->nested.nested_run_pending) 8355 kvm_make_request(KVM_REQ_EVENT, vcpu); 8356 8357 vmx->nested.nested_run_pending = 0; 8358 8359 vmx_complete_atomic_exit(vmx); 8360 vmx_recover_nmi_blocking(vmx); 8361 vmx_complete_interrupts(vmx); 8362} 8363 8364static void vmx_load_vmcs01(struct kvm_vcpu *vcpu) 8365{ 8366 struct vcpu_vmx *vmx = to_vmx(vcpu); 8367 int cpu; 8368 8369 if (vmx->loaded_vmcs == &vmx->vmcs01) 8370 return; 8371 8372 cpu = get_cpu(); 8373 vmx->loaded_vmcs = &vmx->vmcs01; 8374 vmx_vcpu_put(vcpu); 8375 vmx_vcpu_load(vcpu, cpu); 8376 vcpu->cpu = cpu; 8377 put_cpu(); 8378} 8379 8380static void vmx_free_vcpu(struct kvm_vcpu *vcpu) 8381{ 8382 struct vcpu_vmx *vmx = to_vmx(vcpu); 8383 8384 if (enable_pml) 8385 vmx_disable_pml(vmx); 8386 free_vpid(vmx); 8387 leave_guest_mode(vcpu); 8388 vmx_load_vmcs01(vcpu); 8389 free_nested(vmx); 8390 free_loaded_vmcs(vmx->loaded_vmcs); 8391 kfree(vmx->guest_msrs); 8392 kvm_vcpu_uninit(vcpu); 8393 kmem_cache_free(kvm_vcpu_cache, vmx); 8394} 8395 8396static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) 8397{ 8398 int err; 8399 struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 8400 int cpu; 8401 8402 if (!vmx) 8403 return ERR_PTR(-ENOMEM); 8404 8405 allocate_vpid(vmx); 8406 8407 err = kvm_vcpu_init(&vmx->vcpu, kvm, id); 8408 if (err) 8409 goto free_vcpu; 8410 8411 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); 8412 BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0]) 8413 > PAGE_SIZE); 8414 8415 err = -ENOMEM; 8416 if (!vmx->guest_msrs) { 8417 goto uninit_vcpu; 8418 } 8419 8420 vmx->loaded_vmcs = &vmx->vmcs01; 8421 vmx->loaded_vmcs->vmcs = alloc_vmcs(); 8422 if (!vmx->loaded_vmcs->vmcs) 8423 goto free_msrs; 8424 if (!vmm_exclusive) 8425 kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id()))); 8426 loaded_vmcs_init(vmx->loaded_vmcs); 8427 if (!vmm_exclusive) 8428 kvm_cpu_vmxoff(); 8429 8430 cpu = get_cpu(); 8431 vmx_vcpu_load(&vmx->vcpu, cpu); 8432 vmx->vcpu.cpu = cpu; 8433 err = vmx_vcpu_setup(vmx); 8434 vmx_vcpu_put(&vmx->vcpu); 8435 put_cpu(); 8436 if (err) 8437 goto free_vmcs; 8438 if (vm_need_virtualize_apic_accesses(kvm)) { 8439 err = alloc_apic_access_page(kvm); 8440 if (err) 8441 goto free_vmcs; 8442 } 8443 8444 if (enable_ept) { 8445 if (!kvm->arch.ept_identity_map_addr) 8446 
kvm->arch.ept_identity_map_addr = 8447 VMX_EPT_IDENTITY_PAGETABLE_ADDR; 8448 err = init_rmode_identity_map(kvm); 8449 if (err) 8450 goto free_vmcs; 8451 } 8452 8453 if (nested) 8454 nested_vmx_setup_ctls_msrs(vmx); 8455 8456 vmx->nested.posted_intr_nv = -1; 8457 vmx->nested.current_vmptr = -1ull; 8458 vmx->nested.current_vmcs12 = NULL; 8459 8460 /* 8461 * If PML is turned on, failure on enabling PML just results in failure 8462 * of creating the vcpu, therefore we can simplify PML logic (by 8463 * avoiding dealing with cases, such as enabling PML partially on vcpus 8464 * for the guest, etc. 8465 */ 8466 if (enable_pml) { 8467 err = vmx_enable_pml(vmx); 8468 if (err) 8469 goto free_vmcs; 8470 } 8471 8472 return &vmx->vcpu; 8473 8474free_vmcs: 8475 free_loaded_vmcs(vmx->loaded_vmcs); 8476free_msrs: 8477 kfree(vmx->guest_msrs); 8478uninit_vcpu: 8479 kvm_vcpu_uninit(&vmx->vcpu); 8480free_vcpu: 8481 free_vpid(vmx); 8482 kmem_cache_free(kvm_vcpu_cache, vmx); 8483 return ERR_PTR(err); 8484} 8485 8486static void __init vmx_check_processor_compat(void *rtn) 8487{ 8488 struct vmcs_config vmcs_conf; 8489 8490 *(int *)rtn = 0; 8491 if (setup_vmcs_config(&vmcs_conf) < 0) 8492 *(int *)rtn = -EIO; 8493 if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) { 8494 printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n", 8495 smp_processor_id()); 8496 *(int *)rtn = -EIO; 8497 } 8498} 8499 8500static int get_ept_level(void) 8501{ 8502 return VMX_EPT_DEFAULT_GAW + 1; 8503} 8504 8505static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 8506{ 8507 u64 ret; 8508 8509 /* For VT-d and EPT combination 8510 * 1. MMIO: always map as UC 8511 * 2. EPT with VT-d: 8512 * a. VT-d without snooping control feature: can't guarantee the 8513 * result, try to trust guest. 8514 * b. VT-d with snooping control feature: snooping control feature of 8515 * VT-d engine can guarantee the cache correctness. Just set it 8516 * to WB to keep consistent with host. So the same as item 3. 8517 * 3. 
EPT without VT-d: always map as WB and set IPAT=1 to keep 8518 * consistent with host MTRR 8519 */ 8520 if (is_mmio) 8521 ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; 8522 else if (kvm_arch_has_noncoherent_dma(vcpu->kvm)) 8523 ret = kvm_get_guest_memory_type(vcpu, gfn) << 8524 VMX_EPT_MT_EPTE_SHIFT; 8525 else 8526 ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) 8527 | VMX_EPT_IPAT_BIT; 8528 8529 return ret; 8530} 8531 8532static int vmx_get_lpage_level(void) 8533{ 8534 if (enable_ept && !cpu_has_vmx_ept_1g_page()) 8535 return PT_DIRECTORY_LEVEL; 8536 else 8537 /* For shadow and EPT supported 1GB page */ 8538 return PT_PDPE_LEVEL; 8539} 8540 8541static void vmx_cpuid_update(struct kvm_vcpu *vcpu) 8542{ 8543 struct kvm_cpuid_entry2 *best; 8544 struct vcpu_vmx *vmx = to_vmx(vcpu); 8545 u32 exec_control; 8546 8547 vmx->rdtscp_enabled = false; 8548 if (vmx_rdtscp_supported()) { 8549 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 8550 if (exec_control & SECONDARY_EXEC_RDTSCP) { 8551 best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 8552 if (best && (best->edx & bit(X86_FEATURE_RDTSCP))) 8553 vmx->rdtscp_enabled = true; 8554 else { 8555 exec_control &= ~SECONDARY_EXEC_RDTSCP; 8556 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, 8557 exec_control); 8558 } 8559 } 8560 if (nested && !vmx->rdtscp_enabled) 8561 vmx->nested.nested_vmx_secondary_ctls_high &= 8562 ~SECONDARY_EXEC_RDTSCP; 8563 } 8564 8565 /* Exposing INVPCID only when PCID is exposed */ 8566 best = kvm_find_cpuid_entry(vcpu, 0x7, 0); 8567 if (vmx_invpcid_supported() && 8568 best && (best->ebx & bit(X86_FEATURE_INVPCID)) && 8569 guest_cpuid_has_pcid(vcpu)) { 8570 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 8571 exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; 8572 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, 8573 exec_control); 8574 } else { 8575 if (cpu_has_secondary_exec_ctrls()) { 8576 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); 8577 exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; 8578 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, 8579 exec_control); 8580 } 8581 if (best) 8582 best->ebx &= ~bit(X86_FEATURE_INVPCID); 8583 } 8584} 8585 8586static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 8587{ 8588 if (func == 1 && nested) 8589 entry->ecx |= bit(X86_FEATURE_VMX); 8590} 8591 8592static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu, 8593 struct x86_exception *fault) 8594{ 8595 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 8596 u32 exit_reason; 8597 8598 if (fault->error_code & PFERR_RSVD_MASK) 8599 exit_reason = EXIT_REASON_EPT_MISCONFIG; 8600 else 8601 exit_reason = EXIT_REASON_EPT_VIOLATION; 8602 nested_vmx_vmexit(vcpu, exit_reason, 0, vcpu->arch.exit_qualification); 8603 vmcs12->guest_physical_address = fault->address; 8604} 8605 8606/* Callbacks for nested_ept_init_mmu_context: */ 8607 8608static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) 8609{ 8610 /* return the page table to be shadowed - in our case, EPT12 */ 8611 return get_vmcs12(vcpu)->ept_pointer; 8612} 8613 8614static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) 8615{ 8616 WARN_ON(mmu_is_nested(vcpu)); 8617 kvm_init_shadow_ept_mmu(vcpu, 8618 to_vmx(vcpu)->nested.nested_vmx_ept_caps & 8619 VMX_EPT_EXECUTE_ONLY_BIT); 8620 vcpu->arch.mmu.set_cr3 = vmx_set_cr3; 8621 vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; 8622 vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; 8623 8624 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; 8625} 8626 8627static void nested_ept_uninit_mmu_context(struct kvm_vcpu 
*vcpu) 8628{ 8629 vcpu->arch.walk_mmu = &vcpu->arch.mmu; 8630} 8631 8632static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12, 8633 u16 error_code) 8634{ 8635 bool inequality, bit; 8636 8637 bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0; 8638 inequality = 8639 (error_code & vmcs12->page_fault_error_code_mask) != 8640 vmcs12->page_fault_error_code_match; 8641 return inequality ^ bit; 8642} 8643 8644static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, 8645 struct x86_exception *fault) 8646{ 8647 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 8648 8649 WARN_ON(!is_guest_mode(vcpu)); 8650 8651 if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) 8652 nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason, 8653 vmcs_read32(VM_EXIT_INTR_INFO), 8654 vmcs_readl(EXIT_QUALIFICATION)); 8655 else 8656 kvm_inject_page_fault(vcpu, fault); 8657} 8658 8659static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, 8660 struct vmcs12 *vmcs12) 8661{ 8662 struct vcpu_vmx *vmx = to_vmx(vcpu); 8663 int maxphyaddr = cpuid_maxphyaddr(vcpu); 8664 8665 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 8666 if (!PAGE_ALIGNED(vmcs12->apic_access_addr) || 8667 vmcs12->apic_access_addr >> maxphyaddr) 8668 return false; 8669 8670 /* 8671 * Translate L1 physical address to host physical 8672 * address for vmcs02. Keep the page pinned, so this 8673 * physical address remains valid. We keep a reference 8674 * to it so we can release it later. 8675 */ 8676 if (vmx->nested.apic_access_page) /* shouldn't happen */ 8677 nested_release_page(vmx->nested.apic_access_page); 8678 vmx->nested.apic_access_page = 8679 nested_get_page(vcpu, vmcs12->apic_access_addr); 8680 } 8681 8682 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 8683 if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr) || 8684 vmcs12->virtual_apic_page_addr >> maxphyaddr) 8685 return false; 8686 8687 if (vmx->nested.virtual_apic_page) /* shouldn't happen */ 8688 nested_release_page(vmx->nested.virtual_apic_page); 8689 vmx->nested.virtual_apic_page = 8690 nested_get_page(vcpu, vmcs12->virtual_apic_page_addr); 8691 8692 /* 8693 * Failing the vm entry is _not_ what the processor does 8694 * but it's basically the only possibility we have. 8695 * We could still enter the guest if CR8 load exits are 8696 * enabled, CR8 store exits are enabled, and virtualize APIC 8697 * access is disabled; in this case the processor would never 8698 * use the TPR shadow and we could simply clear the bit from 8699 * the execution control. But such a configuration is useless, 8700 * so let's keep the code simple. 
8701 */ 8702 if (!vmx->nested.virtual_apic_page) 8703 return false; 8704 } 8705 8706 if (nested_cpu_has_posted_intr(vmcs12)) { 8707 if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64) || 8708 vmcs12->posted_intr_desc_addr >> maxphyaddr) 8709 return false; 8710 8711 if (vmx->nested.pi_desc_page) { /* shouldn't happen */ 8712 kunmap(vmx->nested.pi_desc_page); 8713 nested_release_page(vmx->nested.pi_desc_page); 8714 } 8715 vmx->nested.pi_desc_page = 8716 nested_get_page(vcpu, vmcs12->posted_intr_desc_addr); 8717 if (!vmx->nested.pi_desc_page) 8718 return false; 8719 8720 vmx->nested.pi_desc = 8721 (struct pi_desc *)kmap(vmx->nested.pi_desc_page); 8722 if (!vmx->nested.pi_desc) { 8723 nested_release_page_clean(vmx->nested.pi_desc_page); 8724 return false; 8725 } 8726 vmx->nested.pi_desc = 8727 (struct pi_desc *)((void *)vmx->nested.pi_desc + 8728 (unsigned long)(vmcs12->posted_intr_desc_addr & 8729 (PAGE_SIZE - 1))); 8730 } 8731 8732 return true; 8733} 8734 8735static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) 8736{ 8737 u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; 8738 struct vcpu_vmx *vmx = to_vmx(vcpu); 8739 8740 if (vcpu->arch.virtual_tsc_khz == 0) 8741 return; 8742 8743 /* Make sure short timeouts reliably trigger an immediate vmexit. 8744 * hrtimer_start does not guarantee this. */ 8745 if (preemption_timeout <= 1) { 8746 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 8747 return; 8748 } 8749 8750 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 8751 preemption_timeout *= 1000000; 8752 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 8753 hrtimer_start(&vmx->nested.preemption_timer, 8754 ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL); 8755} 8756 8757static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu, 8758 struct vmcs12 *vmcs12) 8759{ 8760 int maxphyaddr; 8761 u64 addr; 8762 8763 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 8764 return 0; 8765 8766 if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) { 8767 WARN_ON(1); 8768 return -EINVAL; 8769 } 8770 maxphyaddr = cpuid_maxphyaddr(vcpu); 8771 8772 if (!PAGE_ALIGNED(vmcs12->msr_bitmap) || 8773 ((addr + PAGE_SIZE) >> maxphyaddr)) 8774 return -EINVAL; 8775 8776 return 0; 8777} 8778 8779/* 8780 * Merge L0's and L1's MSR bitmap, return false to indicate that 8781 * we do not use the hardware. 
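 * In this version the merge only ever opens up the x2APIC MSR range
 * (0x800 - 0x8ff), and only when L1 runs the guest in virtualized x2APIC
 * mode: reads of the whole range if L1 also uses APIC-register
 * virtualization, the TPR MSR for both reads and writes, and EOI/SELF_IPI
 * writes if L1 uses virtual-interrupt delivery.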
8782 */ 8783static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, 8784 struct vmcs12 *vmcs12) 8785{ 8786 int msr; 8787 struct page *page; 8788 unsigned long *msr_bitmap; 8789 8790 if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) 8791 return false; 8792 8793 page = nested_get_page(vcpu, vmcs12->msr_bitmap); 8794 if (!page) { 8795 WARN_ON(1); 8796 return false; 8797 } 8798 msr_bitmap = (unsigned long *)kmap(page); 8799 if (!msr_bitmap) { 8800 nested_release_page_clean(page); 8801 WARN_ON(1); 8802 return false; 8803 } 8804 8805 if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { 8806 if (nested_cpu_has_apic_reg_virt(vmcs12)) 8807 for (msr = 0x800; msr <= 0x8ff; msr++) 8808 nested_vmx_disable_intercept_for_msr( 8809 msr_bitmap, 8810 vmx_msr_bitmap_nested, 8811 msr, MSR_TYPE_R); 8812 /* TPR is allowed */ 8813 nested_vmx_disable_intercept_for_msr(msr_bitmap, 8814 vmx_msr_bitmap_nested, 8815 APIC_BASE_MSR + (APIC_TASKPRI >> 4), 8816 MSR_TYPE_R | MSR_TYPE_W); 8817 if (nested_cpu_has_vid(vmcs12)) { 8818 /* EOI and self-IPI are allowed */ 8819 nested_vmx_disable_intercept_for_msr( 8820 msr_bitmap, 8821 vmx_msr_bitmap_nested, 8822 APIC_BASE_MSR + (APIC_EOI >> 4), 8823 MSR_TYPE_W); 8824 nested_vmx_disable_intercept_for_msr( 8825 msr_bitmap, 8826 vmx_msr_bitmap_nested, 8827 APIC_BASE_MSR + (APIC_SELF_IPI >> 4), 8828 MSR_TYPE_W); 8829 } 8830 } else { 8831 /* 8832 * Enable reading intercept of all the x2apic 8833 * MSRs. We should not rely on vmcs12 to do any 8834 * optimizations here, it may have been modified 8835 * by L1. 8836 */ 8837 for (msr = 0x800; msr <= 0x8ff; msr++) 8838 __vmx_enable_intercept_for_msr( 8839 vmx_msr_bitmap_nested, 8840 msr, 8841 MSR_TYPE_R); 8842 8843 __vmx_enable_intercept_for_msr( 8844 vmx_msr_bitmap_nested, 8845 APIC_BASE_MSR + (APIC_TASKPRI >> 4), 8846 MSR_TYPE_W); 8847 __vmx_enable_intercept_for_msr( 8848 vmx_msr_bitmap_nested, 8849 APIC_BASE_MSR + (APIC_EOI >> 4), 8850 MSR_TYPE_W); 8851 __vmx_enable_intercept_for_msr( 8852 vmx_msr_bitmap_nested, 8853 APIC_BASE_MSR + (APIC_SELF_IPI >> 4), 8854 MSR_TYPE_W); 8855 } 8856 kunmap(page); 8857 nested_release_page_clean(page); 8858 8859 return true; 8860} 8861 8862static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu, 8863 struct vmcs12 *vmcs12) 8864{ 8865 if (!nested_cpu_has_virt_x2apic_mode(vmcs12) && 8866 !nested_cpu_has_apic_reg_virt(vmcs12) && 8867 !nested_cpu_has_vid(vmcs12) && 8868 !nested_cpu_has_posted_intr(vmcs12)) 8869 return 0; 8870 8871 /* 8872 * If virtualize x2apic mode is enabled, 8873 * virtualize apic access must be disabled. 8874 */ 8875 if (nested_cpu_has_virt_x2apic_mode(vmcs12) && 8876 nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) 8877 return -EINVAL; 8878 8879 /* 8880 * If virtual interrupt delivery is enabled, 8881 * we must exit on external interrupts. 8882 */ 8883 if (nested_cpu_has_vid(vmcs12) && 8884 !nested_exit_on_intr(vcpu)) 8885 return -EINVAL; 8886 8887 /* 8888 * bits 15:8 should be zero in posted_intr_nv, 8889 * the descriptor address has been already checked 8890 * in nested_get_vmcs12_pages. 8891 */ 8892 if (nested_cpu_has_posted_intr(vmcs12) && 8893 (!nested_cpu_has_vid(vmcs12) || 8894 !nested_exit_intr_ack_set(vcpu) || 8895 vmcs12->posted_intr_nv & 0xff00)) 8896 return -EINVAL; 8897 8898 /* tpr shadow is needed by all apicv features. 
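	 * Per the SDM, the "use TPR shadow" execution control must be 1
	 * whenever virtualize-x2APIC mode, APIC-register virtualization or
	 * virtual-interrupt delivery is 1, and the posted-interrupt check
	 * above already insisted on virtual-interrupt delivery.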
*/ 8899 if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) 8900 return -EINVAL; 8901 8902 return 0; 8903} 8904 8905static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu, 8906 unsigned long count_field, 8907 unsigned long addr_field) 8908{ 8909 int maxphyaddr; 8910 u64 count, addr; 8911 8912 if (vmcs12_read_any(vcpu, count_field, &count) || 8913 vmcs12_read_any(vcpu, addr_field, &addr)) { 8914 WARN_ON(1); 8915 return -EINVAL; 8916 } 8917 if (count == 0) 8918 return 0; 8919 maxphyaddr = cpuid_maxphyaddr(vcpu); 8920 if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr || 8921 (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) { 8922 pr_warn_ratelimited( 8923 "nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)", 8924 addr_field, maxphyaddr, count, addr); 8925 return -EINVAL; 8926 } 8927 return 0; 8928} 8929 8930static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu, 8931 struct vmcs12 *vmcs12) 8932{ 8933 if (vmcs12->vm_exit_msr_load_count == 0 && 8934 vmcs12->vm_exit_msr_store_count == 0 && 8935 vmcs12->vm_entry_msr_load_count == 0) 8936 return 0; /* Fast path */ 8937 if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT, 8938 VM_EXIT_MSR_LOAD_ADDR) || 8939 nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT, 8940 VM_EXIT_MSR_STORE_ADDR) || 8941 nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT, 8942 VM_ENTRY_MSR_LOAD_ADDR)) 8943 return -EINVAL; 8944 return 0; 8945} 8946 8947static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, 8948 struct vmx_msr_entry *e) 8949{ 8950 /* x2APIC MSR accesses are not allowed */ 8951 if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8) 8952 return -EINVAL; 8953 if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */ 8954 e->index == MSR_IA32_UCODE_REV) 8955 return -EINVAL; 8956 if (e->reserved != 0) 8957 return -EINVAL; 8958 return 0; 8959} 8960 8961static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu, 8962 struct vmx_msr_entry *e) 8963{ 8964 if (e->index == MSR_FS_BASE || 8965 e->index == MSR_GS_BASE || 8966 e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */ 8967 nested_vmx_msr_check_common(vcpu, e)) 8968 return -EINVAL; 8969 return 0; 8970} 8971 8972static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu, 8973 struct vmx_msr_entry *e) 8974{ 8975 if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */ 8976 nested_vmx_msr_check_common(vcpu, e)) 8977 return -EINVAL; 8978 return 0; 8979} 8980 8981/* 8982 * Load guest's/host's msr at nested entry/exit. 8983 * return 0 for success, entry index for failure. 
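 * A non-zero return of i + 1 matches the architectural convention: on a
 * VM-entry failure during MSR loading, the exit qualification holds the
 * 1-based index of the entry that could not be loaded.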
8984 */ 8985static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 8986{ 8987 u32 i; 8988 struct vmx_msr_entry e; 8989 struct msr_data msr; 8990 8991 msr.host_initiated = false; 8992 for (i = 0; i < count; i++) { 8993 if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e), 8994 &e, sizeof(e))) { 8995 pr_warn_ratelimited( 8996 "%s cannot read MSR entry (%u, 0x%08llx)\n", 8997 __func__, i, gpa + i * sizeof(e)); 8998 goto fail; 8999 } 9000 if (nested_vmx_load_msr_check(vcpu, &e)) { 9001 pr_warn_ratelimited( 9002 "%s check failed (%u, 0x%x, 0x%x)\n", 9003 __func__, i, e.index, e.reserved); 9004 goto fail; 9005 } 9006 msr.index = e.index; 9007 msr.data = e.value; 9008 if (kvm_set_msr(vcpu, &msr)) { 9009 pr_warn_ratelimited( 9010 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 9011 __func__, i, e.index, e.value); 9012 goto fail; 9013 } 9014 } 9015 return 0; 9016fail: 9017 return i + 1; 9018} 9019 9020static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) 9021{ 9022 u32 i; 9023 struct vmx_msr_entry e; 9024 9025 for (i = 0; i < count; i++) { 9026 if (kvm_read_guest(vcpu->kvm, 9027 gpa + i * sizeof(e), 9028 &e, 2 * sizeof(u32))) { 9029 pr_warn_ratelimited( 9030 "%s cannot read MSR entry (%u, 0x%08llx)\n", 9031 __func__, i, gpa + i * sizeof(e)); 9032 return -EINVAL; 9033 } 9034 if (nested_vmx_store_msr_check(vcpu, &e)) { 9035 pr_warn_ratelimited( 9036 "%s check failed (%u, 0x%x, 0x%x)\n", 9037 __func__, i, e.index, e.reserved); 9038 return -EINVAL; 9039 } 9040 if (kvm_get_msr(vcpu, e.index, &e.value)) { 9041 pr_warn_ratelimited( 9042 "%s cannot read MSR (%u, 0x%x)\n", 9043 __func__, i, e.index); 9044 return -EINVAL; 9045 } 9046 if (kvm_write_guest(vcpu->kvm, 9047 gpa + i * sizeof(e) + 9048 offsetof(struct vmx_msr_entry, value), 9049 &e.value, sizeof(e.value))) { 9050 pr_warn_ratelimited( 9051 "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", 9052 __func__, i, e.index, e.value); 9053 return -EINVAL; 9054 } 9055 } 9056 return 0; 9057} 9058 9059/* 9060 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 9061 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 9062 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 9063 * guest in a way that will both be appropriate to L1's requests, and our 9064 * needs. In addition to modifying the active vmcs (which is vmcs02), this 9065 * function also has additional necessary side-effects, like setting various 9066 * vcpu->arch fields. 
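 * Roughly: guest-state fields are copied straight from vmcs12, control
 * fields are the combination of what L1 asked for and what L0 itself needs,
 * and host-state fields describe L0 only, because a hardware exit always
 * lands in L0 first.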
9067 */ 9068static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 9069{ 9070 struct vcpu_vmx *vmx = to_vmx(vcpu); 9071 u32 exec_control; 9072 9073 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 9074 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 9075 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 9076 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 9077 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 9078 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 9079 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 9080 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 9081 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 9082 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 9083 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 9084 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 9085 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 9086 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 9087 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 9088 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 9089 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 9090 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 9091 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 9092 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 9093 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 9094 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 9095 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 9096 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 9097 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 9098 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 9099 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 9100 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 9101 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 9102 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 9103 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 9104 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 9105 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 9106 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 9107 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 9108 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 9109 9110 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 9111 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 9112 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); 9113 } else { 9114 kvm_set_dr(vcpu, 7, vcpu->arch.dr7); 9115 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); 9116 } 9117 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 9118 vmcs12->vm_entry_intr_info_field); 9119 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 9120 vmcs12->vm_entry_exception_error_code); 9121 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 9122 vmcs12->vm_entry_instruction_len); 9123 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 9124 vmcs12->guest_interruptibility_info); 9125 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 9126 vmx_set_rflags(vcpu, vmcs12->guest_rflags); 9127 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 9128 vmcs12->guest_pending_dbg_exceptions); 9129 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 9130 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); 9131 9132 if (nested_cpu_has_xsaves(vmcs12)) 9133 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); 9134 vmcs_write64(VMCS_LINK_POINTER, -1ull); 9135 9136 
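	/*
	 * The pin-based controls below are the OR of what L1 requested and
	 * what L0 requires, except that the hardware preemption timer is not
	 * handed to L2: it is emulated with the hrtimer armed in
	 * vmx_start_preemption_timer().  At the emulated rate of
	 * 2^VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE TSC cycles per tick, a
	 * (hypothetical) vmx_preemption_timer_value of 1000 on a 2 GHz TSC
	 * (virtual_tsc_khz = 2000000) works out to
	 * (1000 << 5) * 1000000 / 2000000 = 16000 ns.
	 */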
exec_control = vmcs12->pin_based_vm_exec_control; 9137 exec_control |= vmcs_config.pin_based_exec_ctrl; 9138 exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; 9139 9140 if (nested_cpu_has_posted_intr(vmcs12)) { 9141 /* 9142 * Note that we use L0's vector here and in 9143 * vmx_deliver_nested_posted_interrupt. 9144 */ 9145 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 9146 vmx->nested.pi_pending = false; 9147 vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR); 9148 vmcs_write64(POSTED_INTR_DESC_ADDR, 9149 page_to_phys(vmx->nested.pi_desc_page) + 9150 (unsigned long)(vmcs12->posted_intr_desc_addr & 9151 (PAGE_SIZE - 1))); 9152 } else 9153 exec_control &= ~PIN_BASED_POSTED_INTR; 9154 9155 vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); 9156 9157 vmx->nested.preemption_timer_expired = false; 9158 if (nested_cpu_has_preemption_timer(vmcs12)) 9159 vmx_start_preemption_timer(vcpu); 9160 9161 /* 9162 * Whether page-faults are trapped is determined by a combination of 9163 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. 9164 * If enable_ept, L0 doesn't care about page faults and we should 9165 * set all of these to L1's desires. However, if !enable_ept, L0 does 9166 * care about (at least some) page faults, and because it is not easy 9167 * (if at all possible?) to merge L0 and L1's desires, we simply ask 9168 * to exit on each and every L2 page fault. This is done by setting 9169 * MASK=MATCH=0 and (see below) EB.PF=1. 9170 * Note that below we don't need special code to set EB.PF beyond the 9171 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, 9172 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when 9173 * !enable_ept, EB.PF is 1, so the "or" will always be 1. 9174 * 9175 * A problem with this approach (when !enable_ept) is that L1 may be 9176 * injected with more page faults than it asked for. This could have 9177 * caused problems, but in practice existing hypervisors don't care. 9178 * To fix this, we will need to emulate the PFEC checking (on the L1 9179 * page tables), using walk_addr(), when injecting PFs to L1. 9180 */ 9181 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 9182 enable_ept ? vmcs12->page_fault_error_code_mask : 0); 9183 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 9184 enable_ept ? vmcs12->page_fault_error_code_match : 0); 9185 9186 if (cpu_has_secondary_exec_ctrls()) { 9187 exec_control = vmx_secondary_exec_control(vmx); 9188 if (!vmx->rdtscp_enabled) 9189 exec_control &= ~SECONDARY_EXEC_RDTSCP; 9190 /* Take the following fields only from vmcs12 */ 9191 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 9192 SECONDARY_EXEC_RDTSCP | 9193 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 9194 SECONDARY_EXEC_APIC_REGISTER_VIRT); 9195 if (nested_cpu_has(vmcs12, 9196 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 9197 exec_control |= vmcs12->secondary_vm_exec_control; 9198 9199 if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) { 9200 /* 9201 * If translation failed, no matter: This feature asks 9202 * to exit when accessing the given address, and if it 9203 * can never be accessed, this feature won't do 9204 * anything anyway. 
9205 */ 9206 if (!vmx->nested.apic_access_page) 9207 exec_control &= 9208 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 9209 else 9210 vmcs_write64(APIC_ACCESS_ADDR, 9211 page_to_phys(vmx->nested.apic_access_page)); 9212 } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) && 9213 (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) { 9214 exec_control |= 9215 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 9216 kvm_vcpu_reload_apic_access_page(vcpu); 9217 } 9218 9219 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { 9220 vmcs_write64(EOI_EXIT_BITMAP0, 9221 vmcs12->eoi_exit_bitmap0); 9222 vmcs_write64(EOI_EXIT_BITMAP1, 9223 vmcs12->eoi_exit_bitmap1); 9224 vmcs_write64(EOI_EXIT_BITMAP2, 9225 vmcs12->eoi_exit_bitmap2); 9226 vmcs_write64(EOI_EXIT_BITMAP3, 9227 vmcs12->eoi_exit_bitmap3); 9228 vmcs_write16(GUEST_INTR_STATUS, 9229 vmcs12->guest_intr_status); 9230 } 9231 9232 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 9233 } 9234 9235 9236 /* 9237 * Set host-state according to L0's settings (vmcs12 is irrelevant here) 9238 * Some constant fields are set here by vmx_set_constant_host_state(). 9239 * Other fields are different per CPU, and will be set later when 9240 * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. 9241 */ 9242 vmx_set_constant_host_state(vmx); 9243 9244 /* 9245 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before 9246 * entry, but only if the current (host) sp changed from the value 9247 * we wrote last (vmx->host_rsp). This cache is no longer relevant 9248 * if we switch vmcs, and rather than hold a separate cache per vmcs, 9249 * here we just force the write to happen on entry. 9250 */ 9251 vmx->host_rsp = 0; 9252 9253 exec_control = vmx_exec_control(vmx); /* L0's desires */ 9254 exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; 9255 exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; 9256 exec_control &= ~CPU_BASED_TPR_SHADOW; 9257 exec_control |= vmcs12->cpu_based_vm_exec_control; 9258 9259 if (exec_control & CPU_BASED_TPR_SHADOW) { 9260 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 9261 page_to_phys(vmx->nested.virtual_apic_page)); 9262 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 9263 } 9264 9265 if (cpu_has_vmx_msr_bitmap() && 9266 exec_control & CPU_BASED_USE_MSR_BITMAPS) { 9267 nested_vmx_merge_msr_bitmap(vcpu, vmcs12); 9268 /* MSR_BITMAP will be set by following vmx_set_efer. */ 9269 } else 9270 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 9271 9272 /* 9273 * Merging of IO bitmap not currently supported. 9274 * Rather, exit every time. 9275 */ 9276 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 9277 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 9278 9279 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); 9280 9281 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the 9282 * bitwise-or of what L1 wants to trap for L2, and what we want to 9283 * trap. Note that CR0.TS also needs updating - we do this later. 9284 */ 9285 update_exception_bitmap(vcpu); 9286 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 9287 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 9288 9289 /* L2->L1 exit controls are emulated - the hardware exit is to L0 so 9290 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 9291 * bits are further modified by vmx_set_efer() below. 9292 */ 9293 vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl); 9294 9295 /* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are 9296 * emulated by vmx_set_efer(), below. 
9297 */ 9298 vm_entry_controls_init(vmx, 9299 (vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER & 9300 ~VM_ENTRY_IA32E_MODE) | 9301 (vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE)); 9302 9303 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) { 9304 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 9305 vcpu->arch.pat = vmcs12->guest_ia32_pat; 9306 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) 9307 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 9308 9309 9310 set_cr4_guest_host_mask(vmx); 9311 9312 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) 9313 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 9314 9315 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) 9316 vmcs_write64(TSC_OFFSET, 9317 vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); 9318 else 9319 vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); 9320 9321 if (enable_vpid) { 9322 /* 9323 * Trivially support vpid by letting L2s share their parent 9324 * L1's vpid. TODO: move to a more elaborate solution, giving 9325 * each L2 its own vpid and exposing the vpid feature to L1. 9326 */ 9327 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 9328 vmx_flush_tlb(vcpu); 9329 } 9330 9331 if (nested_cpu_has_ept(vmcs12)) { 9332 kvm_mmu_unload(vcpu); 9333 nested_ept_init_mmu_context(vcpu); 9334 } 9335 9336 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) 9337 vcpu->arch.efer = vmcs12->guest_ia32_efer; 9338 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 9339 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 9340 else 9341 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 9342 /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ 9343 vmx_set_efer(vcpu, vcpu->arch.efer); 9344 9345 /* 9346 * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified 9347 * TS bit (for lazy fpu) and bits which we consider mandatory enabled. 9348 * The CR0_READ_SHADOW is what L2 should have expected to read given 9349 * the specifications by L1; It's not enough to take 9350 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we 9351 * have more bits than L1 expected. 9352 */ 9353 vmx_set_cr0(vcpu, vmcs12->guest_cr0); 9354 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); 9355 9356 vmx_set_cr4(vcpu, vmcs12->guest_cr4); 9357 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); 9358 9359 /* shadow page tables on either EPT or shadow page tables */ 9360 kvm_set_cr3(vcpu, vmcs12->guest_cr3); 9361 kvm_mmu_reset_context(vcpu); 9362 9363 if (!enable_ept) 9364 vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; 9365 9366 /* 9367 * L1 may access the L2's PDPTR, so save them to construct vmcs12 9368 */ 9369 if (enable_ept) { 9370 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 9371 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 9372 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 9373 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 9374 } 9375 9376 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); 9377 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); 9378} 9379 9380/* 9381 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 9382 * for running an L2 nested guest. 
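 * The flow is: validate vmcs12 against the advertised nested VMX
 * capabilities, switch the vcpu to vmcs02, fill it in via prepare_vmcs02(),
 * load the VM-entry MSR list, and then return; the next guest entry through
 * vmx_vcpu_run() performs the actual hardware VMLAUNCH/VMRESUME into L2.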
9383 */ 9384static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 9385{ 9386 struct vmcs12 *vmcs12; 9387 struct vcpu_vmx *vmx = to_vmx(vcpu); 9388 int cpu; 9389 struct loaded_vmcs *vmcs02; 9390 bool ia32e; 9391 u32 msr_entry_idx; 9392 9393 if (!nested_vmx_check_permission(vcpu) || 9394 !nested_vmx_check_vmcs12(vcpu)) 9395 return 1; 9396 9397 skip_emulated_instruction(vcpu); 9398 vmcs12 = get_vmcs12(vcpu); 9399 9400 if (enable_shadow_vmcs) 9401 copy_shadow_to_vmcs12(vmx); 9402 9403 /* 9404 * The nested entry process starts with enforcing various prerequisites 9405 * on vmcs12 as required by the Intel SDM, and act appropriately when 9406 * they fail: As the SDM explains, some conditions should cause the 9407 * instruction to fail, while others will cause the instruction to seem 9408 * to succeed, but return an EXIT_REASON_INVALID_STATE. 9409 * To speed up the normal (success) code path, we should avoid checking 9410 * for misconfigurations which will anyway be caught by the processor 9411 * when using the merged vmcs02. 9412 */ 9413 if (vmcs12->launch_state == launch) { 9414 nested_vmx_failValid(vcpu, 9415 launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS 9416 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 9417 return 1; 9418 } 9419 9420 if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 9421 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) { 9422 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9423 return 1; 9424 } 9425 9426 if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { 9427 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9428 return 1; 9429 } 9430 9431 if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) { 9432 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9433 return 1; 9434 } 9435 9436 if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) { 9437 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9438 return 1; 9439 } 9440 9441 if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) { 9442 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9443 return 1; 9444 } 9445 9446 if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 9447 vmx->nested.nested_vmx_true_procbased_ctls_low, 9448 vmx->nested.nested_vmx_procbased_ctls_high) || 9449 !vmx_control_verify(vmcs12->secondary_vm_exec_control, 9450 vmx->nested.nested_vmx_secondary_ctls_low, 9451 vmx->nested.nested_vmx_secondary_ctls_high) || 9452 !vmx_control_verify(vmcs12->pin_based_vm_exec_control, 9453 vmx->nested.nested_vmx_pinbased_ctls_low, 9454 vmx->nested.nested_vmx_pinbased_ctls_high) || 9455 !vmx_control_verify(vmcs12->vm_exit_controls, 9456 vmx->nested.nested_vmx_true_exit_ctls_low, 9457 vmx->nested.nested_vmx_exit_ctls_high) || 9458 !vmx_control_verify(vmcs12->vm_entry_controls, 9459 vmx->nested.nested_vmx_true_entry_ctls_low, 9460 vmx->nested.nested_vmx_entry_ctls_high)) 9461 { 9462 nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 9463 return 1; 9464 } 9465 9466 if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || 9467 ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { 9468 nested_vmx_failValid(vcpu, 9469 VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 9470 return 1; 9471 } 9472 9473 if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) || 9474 ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { 9475 nested_vmx_entry_failure(vcpu, vmcs12, 9476 EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); 9477 return 1; 9478 } 9479 if (vmcs12->vmcs_link_pointer != -1ull) { 9480 nested_vmx_entry_failure(vcpu, vmcs12, 
9481			EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR);
9482		return 1;
9483	}
9484
9485	/*
9486	 * If the load IA32_EFER VM-entry control is 1, the following checks
9487	 * are performed on the field for the IA32_EFER MSR:
9488	 * - Bits reserved in the IA32_EFER MSR must be 0.
9489	 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
9490	 *   the IA-32e mode guest VM-exit control. It must also be identical
9491	 *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
9492	 *   CR0.PG) is 1.
9493	 */
9494	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) {
9495		ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
9496		if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
9497		    ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
9498		    ((vmcs12->guest_cr0 & X86_CR0_PG) &&
9499		     ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) {
9500			nested_vmx_entry_failure(vcpu, vmcs12,
9501				EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
9502			return 1;
9503		}
9504	}
9505
9506	/*
9507	 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
9508	 * IA32_EFER MSR must be 0 in the field for that register. In addition,
9509	 * the values of the LMA and LME bits in the field must each be that of
9510	 * the host address-space size VM-exit control.
9511	 */
9512	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
9513		ia32e = (vmcs12->vm_exit_controls &
9514			 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
9515		if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
9516		    ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
9517		    ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) {
9518			nested_vmx_entry_failure(vcpu, vmcs12,
9519				EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
9520			return 1;
9521		}
9522	}
9523
9524	/*
9525	 * We're finally done with prerequisite checking, and can start with
9526	 * the nested entry.
9527	 */
9528
9529	vmcs02 = nested_get_current_vmcs02(vmx);
9530	if (!vmcs02)
9531		return -ENOMEM;
9532
9533	enter_guest_mode(vcpu);
9534
9535	vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET);
9536
9537	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
9538		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
9539
9540	cpu = get_cpu();
9541	vmx->loaded_vmcs = vmcs02;
9542	vmx_vcpu_put(vcpu);
9543	vmx_vcpu_load(vcpu, cpu);
9544	vcpu->cpu = cpu;
9545	put_cpu();
9546
9547	vmx_segment_cache_clear(vmx);
9548
9549	prepare_vmcs02(vcpu, vmcs12);
9550
9551	msr_entry_idx = nested_vmx_load_msr(vcpu,
9552					    vmcs12->vm_entry_msr_load_addr,
9553					    vmcs12->vm_entry_msr_load_count);
9554	if (msr_entry_idx) {
9555		leave_guest_mode(vcpu);
9556		vmx_load_vmcs01(vcpu);
9557		nested_vmx_entry_failure(vcpu, vmcs12,
9558				EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
9559		return 1;
9560	}
9561
9562	vmcs12->launch_state = 1;
9563
9564	if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
9565		return kvm_vcpu_halt(vcpu);
9566
9567	vmx->nested.nested_run_pending = 1;
9568
9569	/*
9570	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
9571	 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
9572	 * returned as far as L1 is concerned. It will only return (and set
9573	 * the success flag) when L2 exits (see nested_vmx_vmexit()).
9574	 */
9575	return 1;
9576}
9577
9578/*
9579 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
9580 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
9581 * This function returns the new value we should put in vmcs12.guest_cr0.
9582 * It's not enough to just return the vmcs02 GUEST_CR0.
Rather, 9583 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now 9584 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 9585 * didn't trap the bit, because if L1 did, so would L0). 9586 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have 9587 * been modified by L2, and L1 knows it. So just leave the old value of 9588 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 9589 * isn't relevant, because if L0 traps this bit it can set it to anything. 9590 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have 9591 * changed these bits, and therefore they need to be updated, but L0 9592 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 9593 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. 9594 */ 9595static inline unsigned long 9596vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 9597{ 9598 return 9599 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 9600 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 9601 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 9602 vcpu->arch.cr0_guest_owned_bits)); 9603} 9604 9605static inline unsigned long 9606vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 9607{ 9608 return 9609 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 9610 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 9611 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 9612 vcpu->arch.cr4_guest_owned_bits)); 9613} 9614 9615static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 9616 struct vmcs12 *vmcs12) 9617{ 9618 u32 idt_vectoring; 9619 unsigned int nr; 9620 9621 if (vcpu->arch.exception.pending && vcpu->arch.exception.reinject) { 9622 nr = vcpu->arch.exception.nr; 9623 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 9624 9625 if (kvm_exception_is_soft(nr)) { 9626 vmcs12->vm_exit_instruction_len = 9627 vcpu->arch.event_exit_inst_len; 9628 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 9629 } else 9630 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 9631 9632 if (vcpu->arch.exception.has_error_code) { 9633 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 9634 vmcs12->idt_vectoring_error_code = 9635 vcpu->arch.exception.error_code; 9636 } 9637 9638 vmcs12->idt_vectoring_info_field = idt_vectoring; 9639 } else if (vcpu->arch.nmi_injected) { 9640 vmcs12->idt_vectoring_info_field = 9641 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 9642 } else if (vcpu->arch.interrupt.pending) { 9643 nr = vcpu->arch.interrupt.nr; 9644 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 9645 9646 if (vcpu->arch.interrupt.soft) { 9647 idt_vectoring |= INTR_TYPE_SOFT_INTR; 9648 vmcs12->vm_entry_instruction_len = 9649 vcpu->arch.event_exit_inst_len; 9650 } else 9651 idt_vectoring |= INTR_TYPE_EXT_INTR; 9652 9653 vmcs12->idt_vectoring_info_field = idt_vectoring; 9654 } 9655} 9656 9657static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) 9658{ 9659 struct vcpu_vmx *vmx = to_vmx(vcpu); 9660 9661 if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 9662 vmx->nested.preemption_timer_expired) { 9663 if (vmx->nested.nested_run_pending) 9664 return -EBUSY; 9665 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 9666 return 0; 9667 } 9668 9669 if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { 9670 if (vmx->nested.nested_run_pending || 9671 vcpu->arch.interrupt.pending) 9672 return -EBUSY; 9673 
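		/*
		 * Reflect the pending NMI to L1 as an exception-or-NMI
		 * VM exit that carries NMI_VECTOR in the interruption
		 * information field.
		 */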
nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 9674 NMI_VECTOR | INTR_TYPE_NMI_INTR | 9675 INTR_INFO_VALID_MASK, 0); 9676 /* 9677 * The NMI-triggered VM exit counts as injection: 9678 * clear this one and block further NMIs. 9679 */ 9680 vcpu->arch.nmi_pending = 0; 9681 vmx_set_nmi_mask(vcpu, true); 9682 return 0; 9683 } 9684 9685 if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && 9686 nested_exit_on_intr(vcpu)) { 9687 if (vmx->nested.nested_run_pending) 9688 return -EBUSY; 9689 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 9690 return 0; 9691 } 9692 9693 return vmx_complete_nested_posted_interrupt(vcpu); 9694} 9695 9696static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 9697{ 9698 ktime_t remaining = 9699 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 9700 u64 value; 9701 9702 if (ktime_to_ns(remaining) <= 0) 9703 return 0; 9704 9705 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 9706 do_div(value, 1000000); 9707 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 9708} 9709 9710/* 9711 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 9712 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 9713 * and this function updates it to reflect the changes to the guest state while 9714 * L2 was running (and perhaps made some exits which were handled directly by L0 9715 * without going back to L1), and to reflect the exit reason. 9716 * Note that we do not have to copy here all VMCS fields, just those that 9717 * could have changed by the L2 guest or the exit - i.e., the guest-state and 9718 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 9719 * which already writes to vmcs12 directly. 9720 */ 9721static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 9722 u32 exit_reason, u32 exit_intr_info, 9723 unsigned long exit_qualification) 9724{ 9725 /* update guest state fields: */ 9726 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 9727 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 9728 9729 vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); 9730 vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP); 9731 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 9732 9733 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 9734 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 9735 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 9736 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 9737 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 9738 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 9739 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 9740 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 9741 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 9742 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 9743 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 9744 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 9745 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 9746 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 9747 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 9748 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 9749 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 9750 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 9751 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 9752 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 9753 
vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 9754 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 9755 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 9756 vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); 9757 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 9758 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 9759 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 9760 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 9761 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 9762 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 9763 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 9764 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 9765 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 9766 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 9767 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 9768 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 9769 9770 vmcs12->guest_interruptibility_info = 9771 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 9772 vmcs12->guest_pending_dbg_exceptions = 9773 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 9774 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 9775 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 9776 else 9777 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 9778 9779 if (nested_cpu_has_preemption_timer(vmcs12)) { 9780 if (vmcs12->vm_exit_controls & 9781 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) 9782 vmcs12->vmx_preemption_timer_value = 9783 vmx_get_preemption_timer_value(vcpu); 9784 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 9785 } 9786 9787 /* 9788 * In some cases (usually, nested EPT), L2 is allowed to change its 9789 * own CR3 without exiting. If it has changed it, we must keep it. 9790 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 9791 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 9792 * 9793 * Additionally, restore L2's PDPTR to vmcs12. 
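 * (The PDPTE fields are only meaningful while L2 uses PAE paging; copying
 * them unconditionally is harmless and merely preserves whatever the
 * fields last held.)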
9794	 */
9795	if (enable_ept) {
9796		vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3);
9797		vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
9798		vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
9799		vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
9800		vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
9801	}
9802
9803	if (nested_cpu_has_vid(vmcs12))
9804		vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
9805
9806	vmcs12->vm_entry_controls =
9807		(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
9808		(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
9809
9810	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) {
9811		kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
9812		vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
9813	}
9814
9815	/* TODO: These cannot have changed unless we have MSR bitmaps and
9816	 * the relevant bit asks not to trap the change */
9817	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
9818		vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
9819	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
9820		vmcs12->guest_ia32_efer = vcpu->arch.efer;
9821	vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
9822	vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
9823	vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
9824	if (vmx_mpx_supported())
9825		vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
9826	if (nested_cpu_has_xsaves(vmcs12))
9827		vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP);
9828
9829	/* update exit information fields: */
9830
9831	vmcs12->vm_exit_reason = exit_reason;
9832	vmcs12->exit_qualification = exit_qualification;
9833
9834	vmcs12->vm_exit_intr_info = exit_intr_info;
9835	if ((vmcs12->vm_exit_intr_info &
9836	     (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
9837	    (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK))
9838		vmcs12->vm_exit_intr_error_code =
9839			vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
9840	vmcs12->idt_vectoring_info_field = 0;
9841	vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
9842	vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
9843
9844	if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
9845		/* vm_entry_intr_info_field is cleared on exit. Emulate this
9846		 * instead of reading the real value. */
9847		vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
9848
9849		/*
9850		 * Transfer the event that L0 or L1 may have wanted to inject
9851		 * into L2 to IDT_VECTORING_INFO_FIELD.
9852		 */
9853		vmcs12_save_pending_event(vcpu, vmcs12);
9854	}
9855
9856	/*
9857	 * Drop what we picked up for L2 via vmx_complete_interrupts. It is
9858	 * preserved above and would only end up incorrectly in L1.
9859	 */
9860	vcpu->arch.nmi_injected = false;
9861	kvm_clear_exception_queue(vcpu);
9862	kvm_clear_interrupt_queue(vcpu);
9863}
9864
9865/*
9866 * A part of what we need to do when the nested L2 guest exits and we want to
9867 * run its L1 parent is to reset L1's guest state to the host state specified
9868 * in vmcs12.
9869 * This function is to be called not only on normal nested exit, but also on
9870 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
9871 * Failures During or After Loading Guest State").
9872 * This function should be called when the active VMCS is L1's (vmcs01).
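 * (nested_vmx_vmexit() satisfies this by switching back with
 * vmx_load_vmcs01() before calling here, and nested_vmx_entry_failure()
 * never switched away from vmcs01 in the first place.)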
9873 */ 9874static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 9875 struct vmcs12 *vmcs12) 9876{ 9877 struct kvm_segment seg; 9878 9879 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 9880 vcpu->arch.efer = vmcs12->host_ia32_efer; 9881 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 9882 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 9883 else 9884 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 9885 vmx_set_efer(vcpu, vcpu->arch.efer); 9886 9887 kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp); 9888 kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip); 9889 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 9890 /* 9891 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 9892 * actually changed, because it depends on the current state of 9893 * fpu_active (which may have changed). 9894 * Note that vmx_set_cr0 refers to efer set above. 9895 */ 9896 vmx_set_cr0(vcpu, vmcs12->host_cr0); 9897 /* 9898 * If we did fpu_activate()/fpu_deactivate() during L2's run, we need 9899 * to apply the same changes to L1's vmcs. We just set cr0 correctly, 9900 * but we also need to update cr0_guest_host_mask and exception_bitmap. 9901 */ 9902 update_exception_bitmap(vcpu); 9903 vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0); 9904 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 9905 9906 /* 9907 * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01 9908 * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask(); 9909 */ 9910 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 9911 kvm_set_cr4(vcpu, vmcs12->host_cr4); 9912 9913 nested_ept_uninit_mmu_context(vcpu); 9914 9915 kvm_set_cr3(vcpu, vmcs12->host_cr3); 9916 kvm_mmu_reset_context(vcpu); 9917 9918 if (!enable_ept) 9919 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; 9920 9921 if (enable_vpid) { 9922 /* 9923 * Trivially support vpid by letting L2s share their parent 9924 * L1's vpid. TODO: move to a more elaborate solution, giving 9925 * each L2 its own vpid and exposing the vpid feature to L1. 9926 */ 9927 vmx_flush_tlb(vcpu); 9928 } 9929 9930 9931 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 9932 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 9933 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 9934 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 9935 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 9936 9937 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. 
*/ 9938 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 9939 vmcs_write64(GUEST_BNDCFGS, 0); 9940 9941 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 9942 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 9943 vcpu->arch.pat = vmcs12->host_ia32_pat; 9944 } 9945 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) 9946 vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, 9947 vmcs12->host_ia32_perf_global_ctrl); 9948 9949 /* Set L1 segment info according to Intel SDM 9950 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 9951 seg = (struct kvm_segment) { 9952 .base = 0, 9953 .limit = 0xFFFFFFFF, 9954 .selector = vmcs12->host_cs_selector, 9955 .type = 11, 9956 .present = 1, 9957 .s = 1, 9958 .g = 1 9959 }; 9960 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 9961 seg.l = 1; 9962 else 9963 seg.db = 1; 9964 vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 9965 seg = (struct kvm_segment) { 9966 .base = 0, 9967 .limit = 0xFFFFFFFF, 9968 .type = 3, 9969 .present = 1, 9970 .s = 1, 9971 .db = 1, 9972 .g = 1 9973 }; 9974 seg.selector = vmcs12->host_ds_selector; 9975 vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 9976 seg.selector = vmcs12->host_es_selector; 9977 vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 9978 seg.selector = vmcs12->host_ss_selector; 9979 vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 9980 seg.selector = vmcs12->host_fs_selector; 9981 seg.base = vmcs12->host_fs_base; 9982 vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 9983 seg.selector = vmcs12->host_gs_selector; 9984 seg.base = vmcs12->host_gs_base; 9985 vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 9986 seg = (struct kvm_segment) { 9987 .base = vmcs12->host_tr_base, 9988 .limit = 0x67, 9989 .selector = vmcs12->host_tr_selector, 9990 .type = 11, 9991 .present = 1 9992 }; 9993 vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 9994 9995 kvm_set_dr(vcpu, 7, 0x400); 9996 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 9997 9998 if (cpu_has_vmx_msr_bitmap()) 9999 vmx_set_msr_bitmap(vcpu); 10000 10001 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 10002 vmcs12->vm_exit_msr_load_count)) 10003 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 10004} 10005 10006/* 10007 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 10008 * and modify vmcs12 to make it see what it would expect to see there if 10009 * L2 was its real guest. 
Must only be called when in L2 (is_guest_mode()) 10010 */ 10011static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, 10012 u32 exit_intr_info, 10013 unsigned long exit_qualification) 10014{ 10015 struct vcpu_vmx *vmx = to_vmx(vcpu); 10016 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 10017 10018 /* trying to cancel vmlaunch/vmresume is a bug */ 10019 WARN_ON_ONCE(vmx->nested.nested_run_pending); 10020 10021 leave_guest_mode(vcpu); 10022 prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info, 10023 exit_qualification); 10024 10025 if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr, 10026 vmcs12->vm_exit_msr_store_count)) 10027 nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); 10028 10029 vmx_load_vmcs01(vcpu); 10030 10031 if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) 10032 && nested_exit_intr_ack_set(vcpu)) { 10033 int irq = kvm_cpu_get_interrupt(vcpu); 10034 WARN_ON(irq < 0); 10035 vmcs12->vm_exit_intr_info = irq | 10036 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 10037 } 10038 10039 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 10040 vmcs12->exit_qualification, 10041 vmcs12->idt_vectoring_info_field, 10042 vmcs12->vm_exit_intr_info, 10043 vmcs12->vm_exit_intr_error_code, 10044 KVM_ISA_VMX); 10045 10046 vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS)); 10047 vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS)); 10048 vmx_segment_cache_clear(vmx); 10049 10050 /* if no vmcs02 cache requested, remove the one we used */ 10051 if (VMCS02_POOL_SIZE == 0) 10052 nested_free_vmcs02(vmx, vmx->nested.current_vmptr); 10053 10054 load_vmcs12_host_state(vcpu, vmcs12); 10055 10056 /* Update TSC_OFFSET if TSC was changed while L2 ran */ 10057 vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); 10058 10059 /* This is needed for same reason as it was needed in prepare_vmcs02 */ 10060 vmx->host_rsp = 0; 10061 10062 /* Unpin physical memory we referred to in vmcs02 */ 10063 if (vmx->nested.apic_access_page) { 10064 nested_release_page(vmx->nested.apic_access_page); 10065 vmx->nested.apic_access_page = NULL; 10066 } 10067 if (vmx->nested.virtual_apic_page) { 10068 nested_release_page(vmx->nested.virtual_apic_page); 10069 vmx->nested.virtual_apic_page = NULL; 10070 } 10071 if (vmx->nested.pi_desc_page) { 10072 kunmap(vmx->nested.pi_desc_page); 10073 nested_release_page(vmx->nested.pi_desc_page); 10074 vmx->nested.pi_desc_page = NULL; 10075 vmx->nested.pi_desc = NULL; 10076 } 10077 10078 /* 10079 * We are now running in L2, mmu_notifier will force to reload the 10080 * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1. 10081 */ 10082 kvm_vcpu_reload_apic_access_page(vcpu); 10083 10084 /* 10085 * Exiting from L2 to L1, we're now back to L1 which thinks it just 10086 * finished a VMLAUNCH or VMRESUME instruction, so we need to set the 10087 * success or failure flag accordingly. 10088 */ 10089 if (unlikely(vmx->fail)) { 10090 vmx->fail = 0; 10091 nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR)); 10092 } else 10093 nested_vmx_succeed(vcpu); 10094 if (enable_shadow_vmcs) 10095 vmx->nested.sync_shadow_vmcs = true; 10096 10097 /* in case we halted in L2 */ 10098 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 10099} 10100 10101/* 10102 * Forcibly leave nested mode in order to be able to reset the VCPU later on. 
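 * This is done by emulating a synthetic nested VM exit (exit_reason -1) if
 * the vcpu is still in guest mode, and then freeing all nested state.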
10103 */ 10104static void vmx_leave_nested(struct kvm_vcpu *vcpu) 10105{ 10106 if (is_guest_mode(vcpu)) 10107 nested_vmx_vmexit(vcpu, -1, 0, 0); 10108 free_nested(to_vmx(vcpu)); 10109} 10110 10111/* 10112 * L1's failure to enter L2 is a subset of a normal exit, as explained in 10113 * 23.7 "VM-entry failures during or after loading guest state" (this also 10114 * lists the acceptable exit-reason and exit-qualification parameters). 10115 * It should only be called before L2 actually succeeded to run, and when 10116 * vmcs01 is current (it doesn't leave_guest_mode() or switch vmcss). 10117 */ 10118static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu, 10119 struct vmcs12 *vmcs12, 10120 u32 reason, unsigned long qualification) 10121{ 10122 load_vmcs12_host_state(vcpu, vmcs12); 10123 vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY; 10124 vmcs12->exit_qualification = qualification; 10125 nested_vmx_succeed(vcpu); 10126 if (enable_shadow_vmcs) 10127 to_vmx(vcpu)->nested.sync_shadow_vmcs = true; 10128} 10129 10130static int vmx_check_intercept(struct kvm_vcpu *vcpu, 10131 struct x86_instruction_info *info, 10132 enum x86_intercept_stage stage) 10133{ 10134 return X86EMUL_CONTINUE; 10135} 10136 10137static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) 10138{ 10139 if (ple_gap) 10140 shrink_ple_window(vcpu); 10141} 10142 10143static void vmx_slot_enable_log_dirty(struct kvm *kvm, 10144 struct kvm_memory_slot *slot) 10145{ 10146 kvm_mmu_slot_leaf_clear_dirty(kvm, slot); 10147 kvm_mmu_slot_largepage_remove_write_access(kvm, slot); 10148} 10149 10150static void vmx_slot_disable_log_dirty(struct kvm *kvm, 10151 struct kvm_memory_slot *slot) 10152{ 10153 kvm_mmu_slot_set_dirty(kvm, slot); 10154} 10155 10156static void vmx_flush_log_dirty(struct kvm *kvm) 10157{ 10158 kvm_flush_pml_buffers(kvm); 10159} 10160 10161static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, 10162 struct kvm_memory_slot *memslot, 10163 gfn_t offset, unsigned long mask) 10164{ 10165 kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask); 10166} 10167 10168static struct kvm_x86_ops vmx_x86_ops = { 10169 .cpu_has_kvm_support = cpu_has_kvm_support, 10170 .disabled_by_bios = vmx_disabled_by_bios, 10171 .hardware_setup = hardware_setup, 10172 .hardware_unsetup = hardware_unsetup, 10173 .check_processor_compatibility = vmx_check_processor_compat, 10174 .hardware_enable = hardware_enable, 10175 .hardware_disable = hardware_disable, 10176 .cpu_has_accelerated_tpr = report_flexpriority, 10177 10178 .vcpu_create = vmx_create_vcpu, 10179 .vcpu_free = vmx_free_vcpu, 10180 .vcpu_reset = vmx_vcpu_reset, 10181 10182 .prepare_guest_switch = vmx_save_host_state, 10183 .vcpu_load = vmx_vcpu_load, 10184 .vcpu_put = vmx_vcpu_put, 10185 10186 .update_db_bp_intercept = update_exception_bitmap, 10187 .get_msr = vmx_get_msr, 10188 .set_msr = vmx_set_msr, 10189 .get_segment_base = vmx_get_segment_base, 10190 .get_segment = vmx_get_segment, 10191 .set_segment = vmx_set_segment, 10192 .get_cpl = vmx_get_cpl, 10193 .get_cs_db_l_bits = vmx_get_cs_db_l_bits, 10194 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, 10195 .decache_cr3 = vmx_decache_cr3, 10196 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, 10197 .set_cr0 = vmx_set_cr0, 10198 .set_cr3 = vmx_set_cr3, 10199 .set_cr4 = vmx_set_cr4, 10200 .set_efer = vmx_set_efer, 10201 .get_idt = vmx_get_idt, 10202 .set_idt = vmx_set_idt, 10203 .get_gdt = vmx_get_gdt, 10204 .set_gdt = vmx_set_gdt, 10205 .get_dr6 = vmx_get_dr6, 10206 .set_dr6 = vmx_set_dr6, 10207 .set_dr7 
= vmx_set_dr7, 10208 .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, 10209 .cache_reg = vmx_cache_reg, 10210 .get_rflags = vmx_get_rflags, 10211 .set_rflags = vmx_set_rflags, 10212 .fpu_activate = vmx_fpu_activate, 10213 .fpu_deactivate = vmx_fpu_deactivate, 10214 10215 .tlb_flush = vmx_flush_tlb, 10216 10217 .run = vmx_vcpu_run, 10218 .handle_exit = vmx_handle_exit, 10219 .skip_emulated_instruction = skip_emulated_instruction, 10220 .set_interrupt_shadow = vmx_set_interrupt_shadow, 10221 .get_interrupt_shadow = vmx_get_interrupt_shadow, 10222 .patch_hypercall = vmx_patch_hypercall, 10223 .set_irq = vmx_inject_irq, 10224 .set_nmi = vmx_inject_nmi, 10225 .queue_exception = vmx_queue_exception, 10226 .cancel_injection = vmx_cancel_injection, 10227 .interrupt_allowed = vmx_interrupt_allowed, 10228 .nmi_allowed = vmx_nmi_allowed, 10229 .get_nmi_mask = vmx_get_nmi_mask, 10230 .set_nmi_mask = vmx_set_nmi_mask, 10231 .enable_nmi_window = enable_nmi_window, 10232 .enable_irq_window = enable_irq_window, 10233 .update_cr8_intercept = update_cr8_intercept, 10234 .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, 10235 .set_apic_access_page_addr = vmx_set_apic_access_page_addr, 10236 .vm_has_apicv = vmx_vm_has_apicv, 10237 .load_eoi_exitmap = vmx_load_eoi_exitmap, 10238 .hwapic_irr_update = vmx_hwapic_irr_update, 10239 .hwapic_isr_update = vmx_hwapic_isr_update, 10240 .sync_pir_to_irr = vmx_sync_pir_to_irr, 10241 .deliver_posted_interrupt = vmx_deliver_posted_interrupt, 10242 10243 .set_tss_addr = vmx_set_tss_addr, 10244 .get_tdp_level = get_ept_level, 10245 .get_mt_mask = vmx_get_mt_mask, 10246 10247 .get_exit_info = vmx_get_exit_info, 10248 10249 .get_lpage_level = vmx_get_lpage_level, 10250 10251 .cpuid_update = vmx_cpuid_update, 10252 10253 .rdtscp_supported = vmx_rdtscp_supported, 10254 .invpcid_supported = vmx_invpcid_supported, 10255 10256 .set_supported_cpuid = vmx_set_supported_cpuid, 10257 10258 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, 10259 10260 .set_tsc_khz = vmx_set_tsc_khz, 10261 .read_tsc_offset = vmx_read_tsc_offset, 10262 .write_tsc_offset = vmx_write_tsc_offset, 10263 .adjust_tsc_offset = vmx_adjust_tsc_offset, 10264 .compute_tsc_offset = vmx_compute_tsc_offset, 10265 .read_l1_tsc = vmx_read_l1_tsc, 10266 10267 .set_tdp_cr3 = vmx_set_cr3, 10268 10269 .check_intercept = vmx_check_intercept, 10270 .handle_external_intr = vmx_handle_external_intr, 10271 .mpx_supported = vmx_mpx_supported, 10272 .xsaves_supported = vmx_xsaves_supported, 10273 10274 .check_nested_events = vmx_check_nested_events, 10275 10276 .sched_in = vmx_sched_in, 10277 10278 .slot_enable_log_dirty = vmx_slot_enable_log_dirty, 10279 .slot_disable_log_dirty = vmx_slot_disable_log_dirty, 10280 .flush_log_dirty = vmx_flush_log_dirty, 10281 .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, 10282}; 10283 10284static int __init vmx_init(void) 10285{ 10286 int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), 10287 __alignof__(struct vcpu_vmx), THIS_MODULE); 10288 if (r) 10289 return r; 10290 10291#ifdef CONFIG_KEXEC 10292 rcu_assign_pointer(crash_vmclear_loaded_vmcss, 10293 crash_vmclear_local_loaded_vmcss); 10294#endif 10295 10296 return 0; 10297} 10298 10299static void __exit vmx_exit(void) 10300{ 10301#ifdef CONFIG_KEXEC 10302 RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL); 10303 synchronize_rcu(); 10304#endif 10305 10306 kvm_exit(); 10307} 10308 10309module_init(vmx_init) 10310module_exit(vmx_exit) 10311
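
/*
 * Typical usage, assuming the module is built as kvm-intel.ko as in a
 * standard kernel build:
 *
 *	modprobe kvm-intel nested=1
 *
 * loads the module with nested VMX enabled, which makes the
 * nested_vmx_run()/nested_vmx_vmexit() paths above reachable from an L1
 * guest.
 */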