1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
6 *
7 * Copyright (C) 2006 Qumranet, Inc.
8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9 *
10 * Authors:
11 *   Avi Kivity   <avi@qumranet.com>
12 *   Yaniv Kamay  <yaniv@qumranet.com>
13 *
14 * This work is licensed under the terms of the GNU GPL, version 2.  See
15 * the COPYING file in the top-level directory.
16 *
17 */
18
19#include "irq.h"
20#include "mmu.h"
21#include "cpuid.h"
22
23#include <linux/kvm_host.h>
24#include <linux/module.h>
25#include <linux/kernel.h>
26#include <linux/mm.h>
27#include <linux/highmem.h>
28#include <linux/sched.h>
29#include <linux/moduleparam.h>
30#include <linux/mod_devicetable.h>
31#include <linux/ftrace_event.h>
32#include <linux/slab.h>
33#include <linux/tboot.h>
34#include <linux/hrtimer.h>
35#include "kvm_cache_regs.h"
36#include "x86.h"
37
38#include <asm/io.h>
39#include <asm/desc.h>
40#include <asm/vmx.h>
41#include <asm/virtext.h>
42#include <asm/mce.h>
43#include <asm/i387.h>
44#include <asm/xcr.h>
45#include <asm/perf_event.h>
46#include <asm/debugreg.h>
47#include <asm/kexec.h>
48#include <asm/apic.h>
49
50#include "trace.h"
51
52#define __ex(x) __kvm_handle_fault_on_reboot(x)
53#define __ex_clear(x, reg) \
54	____kvm_handle_fault_on_reboot(x, "xor " reg " , " reg)
55
56MODULE_AUTHOR("Qumranet");
57MODULE_LICENSE("GPL");
58
59static const struct x86_cpu_id vmx_cpu_id[] = {
60	X86_FEATURE_MATCH(X86_FEATURE_VMX),
61	{}
62};
63MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
64
65static bool __read_mostly enable_vpid = 1;
66module_param_named(vpid, enable_vpid, bool, 0444);
67
68static bool __read_mostly flexpriority_enabled = 1;
69module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
70
71static bool __read_mostly enable_ept = 1;
72module_param_named(ept, enable_ept, bool, S_IRUGO);
73
74static bool __read_mostly enable_unrestricted_guest = 1;
75module_param_named(unrestricted_guest,
76			enable_unrestricted_guest, bool, S_IRUGO);
77
78static bool __read_mostly enable_ept_ad_bits = 1;
79module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO);
80
81static bool __read_mostly emulate_invalid_guest_state = true;
82module_param(emulate_invalid_guest_state, bool, S_IRUGO);
83
84static bool __read_mostly vmm_exclusive = 1;
85module_param(vmm_exclusive, bool, S_IRUGO);
86
87static bool __read_mostly fasteoi = 1;
88module_param(fasteoi, bool, S_IRUGO);
89
90static bool __read_mostly enable_apicv = 1;
91module_param(enable_apicv, bool, S_IRUGO);
92
93static bool __read_mostly enable_shadow_vmcs = 1;
94module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
/*
 * If nested=1, nested virtualization is supported, i.e., guests may use
 * VMX and act as hypervisors for their own guests. If nested=0, guests may
 * not use VMX instructions.
 */
100static bool __read_mostly nested = 0;
101module_param(nested, bool, S_IRUGO);
102
103static u64 __read_mostly host_xss;
104
105static bool __read_mostly enable_pml = 1;
106module_param_named(pml, enable_pml, bool, S_IRUGO);
107
108#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
109#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
110#define KVM_VM_CR0_ALWAYS_ON						\
111	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
112#define KVM_CR4_GUEST_OWNED_BITS				      \
113	(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR      \
114	 | X86_CR4_OSXMMEXCPT | X86_CR4_TSD)
115
116#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
117#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
118
119#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
120
121#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
122
/*
 * These two parameters are used to configure the controls for Pause-Loop
 * Exiting:
 * ple_gap:    upper bound on the amount of time between two successive
 *             executions of PAUSE in a loop. A non-zero value also indicates
 *             that PLE is enabled. According to tests, this time is usually
 *             smaller than 128 cycles.
 * ple_window: upper bound on the amount of time a guest is allowed to execute
 *             in a PAUSE loop. Tests indicate that most spinlocks are held for
 *             less than 2^12 cycles.
 * Time is measured based on a counter that runs at the same rate as the TSC;
 * refer to SDM volume 3B, sections 21.6.13 and 22.1.3.
 */
134#define KVM_VMX_DEFAULT_PLE_GAP           128
135#define KVM_VMX_DEFAULT_PLE_WINDOW        4096
136#define KVM_VMX_DEFAULT_PLE_WINDOW_GROW   2
137#define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0
138#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX    \
139		INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW
140
141static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
142module_param(ple_gap, int, S_IRUGO);
143
144static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
145module_param(ple_window, int, S_IRUGO);
146
147/* Default doubles per-vcpu window every exit. */
148static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW;
149module_param(ple_window_grow, int, S_IRUGO);
150
151/* Default resets per-vcpu window every exit to ple_window. */
152static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK;
153module_param(ple_window_shrink, int, S_IRUGO);
154
155/* Default is to compute the maximum so we can never overflow. */
156static int ple_window_actual_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
157static int ple_window_max        = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
158module_param(ple_window_max, int, S_IRUGO);
159
160extern const ulong vmx_return;
161
162#define NR_AUTOLOAD_MSRS 8
163#define VMCS02_POOL_SIZE 1
164
165struct vmcs {
166	u32 revision_id;
167	u32 abort;
168	char data[0];
169};
170
171/*
172 * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
173 * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
174 * loaded on this CPU (so we can clear them if the CPU goes down).
175 */
176struct loaded_vmcs {
177	struct vmcs *vmcs;
178	int cpu;
179	int launched;
180	struct list_head loaded_vmcss_on_cpu_link;
181};
182
183struct shared_msr_entry {
184	unsigned index;
185	u64 data;
186	u64 mask;
187};
188
189/*
190 * struct vmcs12 describes the state that our guest hypervisor (L1) keeps for a
191 * single nested guest (L2), hence the name vmcs12. Any VMX implementation has
192 * a VMCS structure, and vmcs12 is our emulated VMX's VMCS. This structure is
193 * stored in guest memory specified by VMPTRLD, but is opaque to the guest,
194 * which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
195 * More than one of these structures may exist, if L1 runs multiple L2 guests.
196 * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
197 * underlying hardware which will be used to run L2.
198 * This structure is packed to ensure that its layout is identical across
199 * machines (necessary for live migration).
200 * If there are changes in this struct, VMCS12_REVISION must be changed.
201 */
202typedef u64 natural_width;
203struct __packed vmcs12 {
204	/* According to the Intel spec, a VMCS region must start with the
205	 * following two fields. Then follow implementation-specific data.
206	 */
207	u32 revision_id;
208	u32 abort;
209
210	u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
211	u32 padding[7]; /* room for future expansion */
212
213	u64 io_bitmap_a;
214	u64 io_bitmap_b;
215	u64 msr_bitmap;
216	u64 vm_exit_msr_store_addr;
217	u64 vm_exit_msr_load_addr;
218	u64 vm_entry_msr_load_addr;
219	u64 tsc_offset;
220	u64 virtual_apic_page_addr;
221	u64 apic_access_addr;
222	u64 posted_intr_desc_addr;
223	u64 ept_pointer;
224	u64 eoi_exit_bitmap0;
225	u64 eoi_exit_bitmap1;
226	u64 eoi_exit_bitmap2;
227	u64 eoi_exit_bitmap3;
228	u64 xss_exit_bitmap;
229	u64 guest_physical_address;
230	u64 vmcs_link_pointer;
231	u64 guest_ia32_debugctl;
232	u64 guest_ia32_pat;
233	u64 guest_ia32_efer;
234	u64 guest_ia32_perf_global_ctrl;
235	u64 guest_pdptr0;
236	u64 guest_pdptr1;
237	u64 guest_pdptr2;
238	u64 guest_pdptr3;
239	u64 guest_bndcfgs;
240	u64 host_ia32_pat;
241	u64 host_ia32_efer;
242	u64 host_ia32_perf_global_ctrl;
243	u64 padding64[8]; /* room for future expansion */
	/*
	 * To allow migration of L1 (complete with its L2 guests) between
	 * machines of different natural widths (32 or 64 bit), we cannot have
	 * unsigned long fields with no explicit size. We use u64 (aliased
	 * natural_width) instead. Luckily, x86 is little-endian.
	 */
250	natural_width cr0_guest_host_mask;
251	natural_width cr4_guest_host_mask;
252	natural_width cr0_read_shadow;
253	natural_width cr4_read_shadow;
254	natural_width cr3_target_value0;
255	natural_width cr3_target_value1;
256	natural_width cr3_target_value2;
257	natural_width cr3_target_value3;
258	natural_width exit_qualification;
259	natural_width guest_linear_address;
260	natural_width guest_cr0;
261	natural_width guest_cr3;
262	natural_width guest_cr4;
263	natural_width guest_es_base;
264	natural_width guest_cs_base;
265	natural_width guest_ss_base;
266	natural_width guest_ds_base;
267	natural_width guest_fs_base;
268	natural_width guest_gs_base;
269	natural_width guest_ldtr_base;
270	natural_width guest_tr_base;
271	natural_width guest_gdtr_base;
272	natural_width guest_idtr_base;
273	natural_width guest_dr7;
274	natural_width guest_rsp;
275	natural_width guest_rip;
276	natural_width guest_rflags;
277	natural_width guest_pending_dbg_exceptions;
278	natural_width guest_sysenter_esp;
279	natural_width guest_sysenter_eip;
280	natural_width host_cr0;
281	natural_width host_cr3;
282	natural_width host_cr4;
283	natural_width host_fs_base;
284	natural_width host_gs_base;
285	natural_width host_tr_base;
286	natural_width host_gdtr_base;
287	natural_width host_idtr_base;
288	natural_width host_ia32_sysenter_esp;
289	natural_width host_ia32_sysenter_eip;
290	natural_width host_rsp;
291	natural_width host_rip;
292	natural_width paddingl[8]; /* room for future expansion */
293	u32 pin_based_vm_exec_control;
294	u32 cpu_based_vm_exec_control;
295	u32 exception_bitmap;
296	u32 page_fault_error_code_mask;
297	u32 page_fault_error_code_match;
298	u32 cr3_target_count;
299	u32 vm_exit_controls;
300	u32 vm_exit_msr_store_count;
301	u32 vm_exit_msr_load_count;
302	u32 vm_entry_controls;
303	u32 vm_entry_msr_load_count;
304	u32 vm_entry_intr_info_field;
305	u32 vm_entry_exception_error_code;
306	u32 vm_entry_instruction_len;
307	u32 tpr_threshold;
308	u32 secondary_vm_exec_control;
309	u32 vm_instruction_error;
310	u32 vm_exit_reason;
311	u32 vm_exit_intr_info;
312	u32 vm_exit_intr_error_code;
313	u32 idt_vectoring_info_field;
314	u32 idt_vectoring_error_code;
315	u32 vm_exit_instruction_len;
316	u32 vmx_instruction_info;
317	u32 guest_es_limit;
318	u32 guest_cs_limit;
319	u32 guest_ss_limit;
320	u32 guest_ds_limit;
321	u32 guest_fs_limit;
322	u32 guest_gs_limit;
323	u32 guest_ldtr_limit;
324	u32 guest_tr_limit;
325	u32 guest_gdtr_limit;
326	u32 guest_idtr_limit;
327	u32 guest_es_ar_bytes;
328	u32 guest_cs_ar_bytes;
329	u32 guest_ss_ar_bytes;
330	u32 guest_ds_ar_bytes;
331	u32 guest_fs_ar_bytes;
332	u32 guest_gs_ar_bytes;
333	u32 guest_ldtr_ar_bytes;
334	u32 guest_tr_ar_bytes;
335	u32 guest_interruptibility_info;
336	u32 guest_activity_state;
337	u32 guest_sysenter_cs;
338	u32 host_ia32_sysenter_cs;
339	u32 vmx_preemption_timer_value;
340	u32 padding32[7]; /* room for future expansion */
341	u16 virtual_processor_id;
342	u16 posted_intr_nv;
343	u16 guest_es_selector;
344	u16 guest_cs_selector;
345	u16 guest_ss_selector;
346	u16 guest_ds_selector;
347	u16 guest_fs_selector;
348	u16 guest_gs_selector;
349	u16 guest_ldtr_selector;
350	u16 guest_tr_selector;
351	u16 guest_intr_status;
352	u16 host_es_selector;
353	u16 host_cs_selector;
354	u16 host_ss_selector;
355	u16 host_ds_selector;
356	u16 host_fs_selector;
357	u16 host_gs_selector;
358	u16 host_tr_selector;
359};
360
361/*
362 * VMCS12_REVISION is an arbitrary id that should be changed if the content or
363 * layout of struct vmcs12 is changed. MSR_IA32_VMX_BASIC returns this id, and
364 * VMPTRLD verifies that the VMCS region that L1 is loading contains this id.
365 */
366#define VMCS12_REVISION 0x11e57ed0
367
/*
 * VMCS12_SIZE is the number of bytes L1 should allocate for the VMXON region
 * and any VMCS region. Although only sizeof(struct vmcs12) is used by the
 * current implementation, 4K is reserved to avoid future complications.
 */
373#define VMCS12_SIZE 0x1000
374
/* Used to remember the vmcs02 last used to run each recently used vmcs12 */
376struct vmcs02_list {
377	struct list_head list;
378	gpa_t vmptr;
379	struct loaded_vmcs vmcs02;
380};
381
382/*
383 * The nested_vmx structure is part of vcpu_vmx, and holds information we need
384 * for correct emulation of VMX (i.e., nested VMX) on this vcpu.
385 */
386struct nested_vmx {
	/* Has the level-1 (L1) guest executed VMXON? */
388	bool vmxon;
389	gpa_t vmxon_ptr;
390
391	/* The guest-physical address of the current VMCS L1 keeps for L2 */
392	gpa_t current_vmptr;
393	/* The host-usable pointer to the above */
394	struct page *current_vmcs12_page;
395	struct vmcs12 *current_vmcs12;
396	struct vmcs *current_shadow_vmcs;
	/*
	 * Indicates whether the shadow vmcs must be updated with the data
	 * held by vmcs12
	 */
401	bool sync_shadow_vmcs;
402
403	/* vmcs02_list cache of VMCSs recently used to run L2 guests */
404	struct list_head vmcs02_pool;
405	int vmcs02_num;
406	u64 vmcs01_tsc_offset;
407	/* L2 must run next, and mustn't decide to exit to L1. */
408	bool nested_run_pending;
409	/*
410	 * Guest pages referred to in vmcs02 with host-physical pointers, so
411	 * we must keep them pinned while L2 runs.
412	 */
413	struct page *apic_access_page;
414	struct page *virtual_apic_page;
415	struct page *pi_desc_page;
416	struct pi_desc *pi_desc;
417	bool pi_pending;
418	u16 posted_intr_nv;
419	u64 msr_ia32_feature_control;
420
421	struct hrtimer preemption_timer;
422	bool preemption_timer_expired;
423
424	/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
425	u64 vmcs01_debugctl;
426
427	u32 nested_vmx_procbased_ctls_low;
428	u32 nested_vmx_procbased_ctls_high;
429	u32 nested_vmx_true_procbased_ctls_low;
430	u32 nested_vmx_secondary_ctls_low;
431	u32 nested_vmx_secondary_ctls_high;
432	u32 nested_vmx_pinbased_ctls_low;
433	u32 nested_vmx_pinbased_ctls_high;
434	u32 nested_vmx_exit_ctls_low;
435	u32 nested_vmx_exit_ctls_high;
436	u32 nested_vmx_true_exit_ctls_low;
437	u32 nested_vmx_entry_ctls_low;
438	u32 nested_vmx_entry_ctls_high;
439	u32 nested_vmx_true_entry_ctls_low;
440	u32 nested_vmx_misc_low;
441	u32 nested_vmx_misc_high;
442	u32 nested_vmx_ept_caps;
443};
444
445#define POSTED_INTR_ON  0
446/* Posted-Interrupt Descriptor */
447struct pi_desc {
448	u32 pir[8];     /* Posted interrupt requested */
449	u32 control;	/* bit 0 of control is outstanding notification bit */
450	u32 rsvd[7];
451} __aligned(64);
452
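/*
 * Atomic accessors for the posted-interrupt descriptor: set/clear the
 * outstanding-notification (ON) bit and set a vector bit in the PIR,
 * returning the previous value of the bit.
 */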
453static bool pi_test_and_set_on(struct pi_desc *pi_desc)
454{
455	return test_and_set_bit(POSTED_INTR_ON,
456			(unsigned long *)&pi_desc->control);
457}
458
459static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
460{
461	return test_and_clear_bit(POSTED_INTR_ON,
462			(unsigned long *)&pi_desc->control);
463}
464
465static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
466{
467	return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
468}
469
470struct vcpu_vmx {
471	struct kvm_vcpu       vcpu;
472	unsigned long         host_rsp;
473	u8                    fail;
474	bool                  nmi_known_unmasked;
475	u32                   exit_intr_info;
476	u32                   idt_vectoring_info;
477	ulong                 rflags;
478	struct shared_msr_entry *guest_msrs;
479	int                   nmsrs;
480	int                   save_nmsrs;
481	unsigned long	      host_idt_base;
482#ifdef CONFIG_X86_64
483	u64 		      msr_host_kernel_gs_base;
484	u64 		      msr_guest_kernel_gs_base;
485#endif
486	u32 vm_entry_controls_shadow;
487	u32 vm_exit_controls_shadow;
488	/*
489	 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
490	 * non-nested (L1) guest, it always points to vmcs01. For a nested
491	 * guest (L2), it points to a different VMCS.
492	 */
493	struct loaded_vmcs    vmcs01;
494	struct loaded_vmcs   *loaded_vmcs;
495	bool                  __launched; /* temporary, used in vmx_vcpu_run */
496	struct msr_autoload {
497		unsigned nr;
498		struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS];
499		struct vmx_msr_entry host[NR_AUTOLOAD_MSRS];
500	} msr_autoload;
501	struct {
502		int           loaded;
503		u16           fs_sel, gs_sel, ldt_sel;
504#ifdef CONFIG_X86_64
505		u16           ds_sel, es_sel;
506#endif
507		int           gs_ldt_reload_needed;
508		int           fs_reload_needed;
509		u64           msr_host_bndcfgs;
510		unsigned long vmcs_host_cr4;	/* May not match real cr4 */
511	} host_state;
512	struct {
513		int vm86_active;
514		ulong save_rflags;
515		struct kvm_segment segs[8];
516	} rmode;
517	struct {
518		u32 bitmask; /* 4 bits per segment (1 bit per field) */
519		struct kvm_save_segment {
520			u16 selector;
521			unsigned long base;
522			u32 limit;
523			u32 ar;
524		} seg[8];
525	} segment_cache;
526	int vpid;
527	bool emulation_required;
528
529	/* Support for vnmi-less CPUs */
530	int soft_vnmi_blocked;
531	ktime_t entry_time;
532	s64 vnmi_blocked_time;
533	u32 exit_reason;
534
535	bool rdtscp_enabled;
536
537	/* Posted interrupt descriptor */
538	struct pi_desc pi_desc;
539
540	/* Support for a guest hypervisor (nested VMX) */
541	struct nested_vmx nested;
542
543	/* Dynamic PLE window. */
544	int ple_window;
545	bool ple_window_dirty;
546
547	/* Support for PML */
548#define PML_ENTITY_NUM		512
549	struct page *pml_pg;
550};
551
552enum segment_cache_field {
553	SEG_FIELD_SEL = 0,
554	SEG_FIELD_BASE = 1,
555	SEG_FIELD_LIMIT = 2,
556	SEG_FIELD_AR = 3,
557
558	SEG_FIELD_NR = 4
559};
560
561static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
562{
563	return container_of(vcpu, struct vcpu_vmx, vcpu);
564}
565
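/*
 * FIELD() maps a VMCS field encoding to the offset of the corresponding
 * member in struct vmcs12; FIELD64() additionally maps the field's _HIGH
 * alias to the upper 32 bits of the same 64-bit member.
 */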
566#define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
567#define FIELD(number, name)	[number] = VMCS12_OFFSET(name)
568#define FIELD64(number, name)	[number] = VMCS12_OFFSET(name), \
569				[number##_HIGH] = VMCS12_OFFSET(name)+4
570
571
572static unsigned long shadow_read_only_fields[] = {
573	/*
574	 * We do NOT shadow fields that are modified when L0
575	 * traps and emulates any vmx instruction (e.g. VMPTRLD,
576	 * VMXON...) executed by L1.
577	 * For example, VM_INSTRUCTION_ERROR is read
578	 * by L1 if a vmx instruction fails (part of the error path).
579	 * Note the code assumes this logic. If for some reason
580	 * we start shadowing these fields then we need to
581	 * force a shadow sync when L0 emulates vmx instructions
582	 * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified
583	 * by nested_vmx_failValid)
584	 */
585	VM_EXIT_REASON,
586	VM_EXIT_INTR_INFO,
587	VM_EXIT_INSTRUCTION_LEN,
588	IDT_VECTORING_INFO_FIELD,
589	IDT_VECTORING_ERROR_CODE,
590	VM_EXIT_INTR_ERROR_CODE,
591	EXIT_QUALIFICATION,
592	GUEST_LINEAR_ADDRESS,
593	GUEST_PHYSICAL_ADDRESS
594};
595static int max_shadow_read_only_fields =
596	ARRAY_SIZE(shadow_read_only_fields);
597
598static unsigned long shadow_read_write_fields[] = {
599	TPR_THRESHOLD,
600	GUEST_RIP,
601	GUEST_RSP,
602	GUEST_CR0,
603	GUEST_CR3,
604	GUEST_CR4,
605	GUEST_INTERRUPTIBILITY_INFO,
606	GUEST_RFLAGS,
607	GUEST_CS_SELECTOR,
608	GUEST_CS_AR_BYTES,
609	GUEST_CS_LIMIT,
610	GUEST_CS_BASE,
611	GUEST_ES_BASE,
612	GUEST_BNDCFGS,
613	CR0_GUEST_HOST_MASK,
614	CR0_READ_SHADOW,
615	CR4_READ_SHADOW,
616	TSC_OFFSET,
617	EXCEPTION_BITMAP,
618	CPU_BASED_VM_EXEC_CONTROL,
619	VM_ENTRY_EXCEPTION_ERROR_CODE,
620	VM_ENTRY_INTR_INFO_FIELD,
621	VM_ENTRY_INSTRUCTION_LEN,
623	HOST_FS_BASE,
624	HOST_GS_BASE,
625	HOST_FS_SELECTOR,
626	HOST_GS_SELECTOR
627};
628static int max_shadow_read_write_fields =
629	ARRAY_SIZE(shadow_read_write_fields);
630
631static const unsigned short vmcs_field_to_offset_table[] = {
632	FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
633	FIELD(POSTED_INTR_NV, posted_intr_nv),
634	FIELD(GUEST_ES_SELECTOR, guest_es_selector),
635	FIELD(GUEST_CS_SELECTOR, guest_cs_selector),
636	FIELD(GUEST_SS_SELECTOR, guest_ss_selector),
637	FIELD(GUEST_DS_SELECTOR, guest_ds_selector),
638	FIELD(GUEST_FS_SELECTOR, guest_fs_selector),
639	FIELD(GUEST_GS_SELECTOR, guest_gs_selector),
640	FIELD(GUEST_LDTR_SELECTOR, guest_ldtr_selector),
641	FIELD(GUEST_TR_SELECTOR, guest_tr_selector),
642	FIELD(GUEST_INTR_STATUS, guest_intr_status),
643	FIELD(HOST_ES_SELECTOR, host_es_selector),
644	FIELD(HOST_CS_SELECTOR, host_cs_selector),
645	FIELD(HOST_SS_SELECTOR, host_ss_selector),
646	FIELD(HOST_DS_SELECTOR, host_ds_selector),
647	FIELD(HOST_FS_SELECTOR, host_fs_selector),
648	FIELD(HOST_GS_SELECTOR, host_gs_selector),
649	FIELD(HOST_TR_SELECTOR, host_tr_selector),
650	FIELD64(IO_BITMAP_A, io_bitmap_a),
651	FIELD64(IO_BITMAP_B, io_bitmap_b),
652	FIELD64(MSR_BITMAP, msr_bitmap),
653	FIELD64(VM_EXIT_MSR_STORE_ADDR, vm_exit_msr_store_addr),
654	FIELD64(VM_EXIT_MSR_LOAD_ADDR, vm_exit_msr_load_addr),
655	FIELD64(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr),
656	FIELD64(TSC_OFFSET, tsc_offset),
657	FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
658	FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
659	FIELD64(POSTED_INTR_DESC_ADDR, posted_intr_desc_addr),
660	FIELD64(EPT_POINTER, ept_pointer),
661	FIELD64(EOI_EXIT_BITMAP0, eoi_exit_bitmap0),
662	FIELD64(EOI_EXIT_BITMAP1, eoi_exit_bitmap1),
663	FIELD64(EOI_EXIT_BITMAP2, eoi_exit_bitmap2),
664	FIELD64(EOI_EXIT_BITMAP3, eoi_exit_bitmap3),
665	FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
666	FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
667	FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
668	FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
669	FIELD64(GUEST_IA32_PAT, guest_ia32_pat),
670	FIELD64(GUEST_IA32_EFER, guest_ia32_efer),
671	FIELD64(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl),
672	FIELD64(GUEST_PDPTR0, guest_pdptr0),
673	FIELD64(GUEST_PDPTR1, guest_pdptr1),
674	FIELD64(GUEST_PDPTR2, guest_pdptr2),
675	FIELD64(GUEST_PDPTR3, guest_pdptr3),
676	FIELD64(GUEST_BNDCFGS, guest_bndcfgs),
677	FIELD64(HOST_IA32_PAT, host_ia32_pat),
678	FIELD64(HOST_IA32_EFER, host_ia32_efer),
679	FIELD64(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl),
680	FIELD(PIN_BASED_VM_EXEC_CONTROL, pin_based_vm_exec_control),
681	FIELD(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control),
682	FIELD(EXCEPTION_BITMAP, exception_bitmap),
683	FIELD(PAGE_FAULT_ERROR_CODE_MASK, page_fault_error_code_mask),
684	FIELD(PAGE_FAULT_ERROR_CODE_MATCH, page_fault_error_code_match),
685	FIELD(CR3_TARGET_COUNT, cr3_target_count),
686	FIELD(VM_EXIT_CONTROLS, vm_exit_controls),
687	FIELD(VM_EXIT_MSR_STORE_COUNT, vm_exit_msr_store_count),
688	FIELD(VM_EXIT_MSR_LOAD_COUNT, vm_exit_msr_load_count),
689	FIELD(VM_ENTRY_CONTROLS, vm_entry_controls),
690	FIELD(VM_ENTRY_MSR_LOAD_COUNT, vm_entry_msr_load_count),
691	FIELD(VM_ENTRY_INTR_INFO_FIELD, vm_entry_intr_info_field),
692	FIELD(VM_ENTRY_EXCEPTION_ERROR_CODE, vm_entry_exception_error_code),
693	FIELD(VM_ENTRY_INSTRUCTION_LEN, vm_entry_instruction_len),
694	FIELD(TPR_THRESHOLD, tpr_threshold),
695	FIELD(SECONDARY_VM_EXEC_CONTROL, secondary_vm_exec_control),
696	FIELD(VM_INSTRUCTION_ERROR, vm_instruction_error),
697	FIELD(VM_EXIT_REASON, vm_exit_reason),
698	FIELD(VM_EXIT_INTR_INFO, vm_exit_intr_info),
699	FIELD(VM_EXIT_INTR_ERROR_CODE, vm_exit_intr_error_code),
700	FIELD(IDT_VECTORING_INFO_FIELD, idt_vectoring_info_field),
701	FIELD(IDT_VECTORING_ERROR_CODE, idt_vectoring_error_code),
702	FIELD(VM_EXIT_INSTRUCTION_LEN, vm_exit_instruction_len),
703	FIELD(VMX_INSTRUCTION_INFO, vmx_instruction_info),
704	FIELD(GUEST_ES_LIMIT, guest_es_limit),
705	FIELD(GUEST_CS_LIMIT, guest_cs_limit),
706	FIELD(GUEST_SS_LIMIT, guest_ss_limit),
707	FIELD(GUEST_DS_LIMIT, guest_ds_limit),
708	FIELD(GUEST_FS_LIMIT, guest_fs_limit),
709	FIELD(GUEST_GS_LIMIT, guest_gs_limit),
710	FIELD(GUEST_LDTR_LIMIT, guest_ldtr_limit),
711	FIELD(GUEST_TR_LIMIT, guest_tr_limit),
712	FIELD(GUEST_GDTR_LIMIT, guest_gdtr_limit),
713	FIELD(GUEST_IDTR_LIMIT, guest_idtr_limit),
714	FIELD(GUEST_ES_AR_BYTES, guest_es_ar_bytes),
715	FIELD(GUEST_CS_AR_BYTES, guest_cs_ar_bytes),
716	FIELD(GUEST_SS_AR_BYTES, guest_ss_ar_bytes),
717	FIELD(GUEST_DS_AR_BYTES, guest_ds_ar_bytes),
718	FIELD(GUEST_FS_AR_BYTES, guest_fs_ar_bytes),
719	FIELD(GUEST_GS_AR_BYTES, guest_gs_ar_bytes),
720	FIELD(GUEST_LDTR_AR_BYTES, guest_ldtr_ar_bytes),
721	FIELD(GUEST_TR_AR_BYTES, guest_tr_ar_bytes),
722	FIELD(GUEST_INTERRUPTIBILITY_INFO, guest_interruptibility_info),
723	FIELD(GUEST_ACTIVITY_STATE, guest_activity_state),
724	FIELD(GUEST_SYSENTER_CS, guest_sysenter_cs),
725	FIELD(HOST_IA32_SYSENTER_CS, host_ia32_sysenter_cs),
726	FIELD(VMX_PREEMPTION_TIMER_VALUE, vmx_preemption_timer_value),
727	FIELD(CR0_GUEST_HOST_MASK, cr0_guest_host_mask),
728	FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
729	FIELD(CR0_READ_SHADOW, cr0_read_shadow),
730	FIELD(CR4_READ_SHADOW, cr4_read_shadow),
731	FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
732	FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
733	FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
734	FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
735	FIELD(EXIT_QUALIFICATION, exit_qualification),
736	FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
737	FIELD(GUEST_CR0, guest_cr0),
738	FIELD(GUEST_CR3, guest_cr3),
739	FIELD(GUEST_CR4, guest_cr4),
740	FIELD(GUEST_ES_BASE, guest_es_base),
741	FIELD(GUEST_CS_BASE, guest_cs_base),
742	FIELD(GUEST_SS_BASE, guest_ss_base),
743	FIELD(GUEST_DS_BASE, guest_ds_base),
744	FIELD(GUEST_FS_BASE, guest_fs_base),
745	FIELD(GUEST_GS_BASE, guest_gs_base),
746	FIELD(GUEST_LDTR_BASE, guest_ldtr_base),
747	FIELD(GUEST_TR_BASE, guest_tr_base),
748	FIELD(GUEST_GDTR_BASE, guest_gdtr_base),
749	FIELD(GUEST_IDTR_BASE, guest_idtr_base),
750	FIELD(GUEST_DR7, guest_dr7),
751	FIELD(GUEST_RSP, guest_rsp),
752	FIELD(GUEST_RIP, guest_rip),
753	FIELD(GUEST_RFLAGS, guest_rflags),
754	FIELD(GUEST_PENDING_DBG_EXCEPTIONS, guest_pending_dbg_exceptions),
755	FIELD(GUEST_SYSENTER_ESP, guest_sysenter_esp),
756	FIELD(GUEST_SYSENTER_EIP, guest_sysenter_eip),
757	FIELD(HOST_CR0, host_cr0),
758	FIELD(HOST_CR3, host_cr3),
759	FIELD(HOST_CR4, host_cr4),
760	FIELD(HOST_FS_BASE, host_fs_base),
761	FIELD(HOST_GS_BASE, host_gs_base),
762	FIELD(HOST_TR_BASE, host_tr_base),
763	FIELD(HOST_GDTR_BASE, host_gdtr_base),
764	FIELD(HOST_IDTR_BASE, host_idtr_base),
765	FIELD(HOST_IA32_SYSENTER_ESP, host_ia32_sysenter_esp),
766	FIELD(HOST_IA32_SYSENTER_EIP, host_ia32_sysenter_eip),
767	FIELD(HOST_RSP, host_rsp),
768	FIELD(HOST_RIP, host_rip),
769};
770
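/*
 * Translate a VMCS field encoding into its byte offset within struct vmcs12,
 * or return -ENOENT if the field is not handled.
 */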
771static inline short vmcs_field_to_offset(unsigned long field)
772{
773	BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
774
775	if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) ||
776	    vmcs_field_to_offset_table[field] == 0)
777		return -ENOENT;
778
779	return vmcs_field_to_offset_table[field];
780}
781
782static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
783{
784	return to_vmx(vcpu)->nested.current_vmcs12;
785}
786
787static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
788{
789	struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
790	if (is_error_page(page))
791		return NULL;
792
793	return page;
794}
795
796static void nested_release_page(struct page *page)
797{
798	kvm_release_page_dirty(page);
799}
800
801static void nested_release_page_clean(struct page *page)
802{
803	kvm_release_page_clean(page);
804}
805
806static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
807static u64 construct_eptp(unsigned long root_hpa);
808static void kvm_cpu_vmxon(u64 addr);
809static void kvm_cpu_vmxoff(void);
810static bool vmx_mpx_supported(void);
811static bool vmx_xsaves_supported(void);
812static int vmx_vm_has_apicv(struct kvm *kvm);
813static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
814static void vmx_set_segment(struct kvm_vcpu *vcpu,
815			    struct kvm_segment *var, int seg);
816static void vmx_get_segment(struct kvm_vcpu *vcpu,
817			    struct kvm_segment *var, int seg);
818static bool guest_state_valid(struct kvm_vcpu *vcpu);
819static u32 vmx_segment_access_rights(struct kvm_segment *var);
820static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
821static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
822static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
823static int alloc_identity_pagetable(struct kvm *kvm);
824
825static DEFINE_PER_CPU(struct vmcs *, vmxarea);
826static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
/*
 * We maintain a per-CPU linked list of the VMCSs loaded on that CPU. This is
 * needed when a CPU is brought down and we need to VMCLEAR all VMCSs loaded
 * on it.
 */
831static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
832static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
833
834static unsigned long *vmx_io_bitmap_a;
835static unsigned long *vmx_io_bitmap_b;
836static unsigned long *vmx_msr_bitmap_legacy;
837static unsigned long *vmx_msr_bitmap_longmode;
838static unsigned long *vmx_msr_bitmap_legacy_x2apic;
839static unsigned long *vmx_msr_bitmap_longmode_x2apic;
840static unsigned long *vmx_msr_bitmap_nested;
841static unsigned long *vmx_vmread_bitmap;
842static unsigned long *vmx_vmwrite_bitmap;
843
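/*
 * Whether the CPU supports the dedicated VM-entry/VM-exit controls that
 * switch IA32_EFER and IA32_PERF_GLOBAL_CTRL automatically.
 */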
844static bool cpu_has_load_ia32_efer;
845static bool cpu_has_load_perf_global_ctrl;
846
847static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
848static DEFINE_SPINLOCK(vmx_vpid_lock);
849
850static struct vmcs_config {
851	int size;
852	int order;
853	u32 revision_id;
854	u32 pin_based_exec_ctrl;
855	u32 cpu_based_exec_ctrl;
856	u32 cpu_based_2nd_exec_ctrl;
857	u32 vmexit_ctrl;
858	u32 vmentry_ctrl;
859} vmcs_config;
860
861static struct vmx_capability {
862	u32 ept;
863	u32 vpid;
864} vmx_capability;
865
866#define VMX_SEGMENT_FIELD(seg)					\
867	[VCPU_SREG_##seg] = {                                   \
868		.selector = GUEST_##seg##_SELECTOR,		\
869		.base = GUEST_##seg##_BASE,		   	\
870		.limit = GUEST_##seg##_LIMIT,		   	\
871		.ar_bytes = GUEST_##seg##_AR_BYTES,	   	\
872	}
873
874static const struct kvm_vmx_segment_field {
875	unsigned selector;
876	unsigned base;
877	unsigned limit;
878	unsigned ar_bytes;
879} kvm_vmx_segment_fields[] = {
880	VMX_SEGMENT_FIELD(CS),
881	VMX_SEGMENT_FIELD(DS),
882	VMX_SEGMENT_FIELD(ES),
883	VMX_SEGMENT_FIELD(FS),
884	VMX_SEGMENT_FIELD(GS),
885	VMX_SEGMENT_FIELD(SS),
886	VMX_SEGMENT_FIELD(TR),
887	VMX_SEGMENT_FIELD(LDTR),
888};
889
890static u64 host_efer;
891
892static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
893
894/*
895 * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
896 * away by decrementing the array size.
897 */
898static const u32 vmx_msr_index[] = {
899#ifdef CONFIG_X86_64
900	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
901#endif
902	MSR_EFER, MSR_TSC_AUX, MSR_STAR,
903};
904
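/*
 * Helpers for decoding the VM-exit interruption-information field: each
 * checks that the field is valid and matches a specific event type and,
 * where relevant, vector.
 */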
905static inline bool is_page_fault(u32 intr_info)
906{
907	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
908			     INTR_INFO_VALID_MASK)) ==
909		(INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
910}
911
912static inline bool is_no_device(u32 intr_info)
913{
914	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
915			     INTR_INFO_VALID_MASK)) ==
916		(INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
917}
918
919static inline bool is_invalid_opcode(u32 intr_info)
920{
921	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
922			     INTR_INFO_VALID_MASK)) ==
923		(INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
924}
925
926static inline bool is_external_interrupt(u32 intr_info)
927{
928	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
929		== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
930}
931
932static inline bool is_machine_check(u32 intr_info)
933{
934	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
935			     INTR_INFO_VALID_MASK)) ==
936		(INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
937}
938
939static inline bool cpu_has_vmx_msr_bitmap(void)
940{
941	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
942}
943
944static inline bool cpu_has_vmx_tpr_shadow(void)
945{
946	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
947}
948
949static inline bool vm_need_tpr_shadow(struct kvm *kvm)
950{
951	return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
952}
953
954static inline bool cpu_has_secondary_exec_ctrls(void)
955{
956	return vmcs_config.cpu_based_exec_ctrl &
957		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
958}
959
960static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
961{
962	return vmcs_config.cpu_based_2nd_exec_ctrl &
963		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
964}
965
966static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
967{
968	return vmcs_config.cpu_based_2nd_exec_ctrl &
969		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
970}
971
972static inline bool cpu_has_vmx_apic_register_virt(void)
973{
974	return vmcs_config.cpu_based_2nd_exec_ctrl &
975		SECONDARY_EXEC_APIC_REGISTER_VIRT;
976}
977
978static inline bool cpu_has_vmx_virtual_intr_delivery(void)
979{
980	return vmcs_config.cpu_based_2nd_exec_ctrl &
981		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
982}
983
984static inline bool cpu_has_vmx_posted_intr(void)
985{
986	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
987}
988
989static inline bool cpu_has_vmx_apicv(void)
990{
991	return cpu_has_vmx_apic_register_virt() &&
992		cpu_has_vmx_virtual_intr_delivery() &&
993		cpu_has_vmx_posted_intr();
994}
995
996static inline bool cpu_has_vmx_flexpriority(void)
997{
998	return cpu_has_vmx_tpr_shadow() &&
999		cpu_has_vmx_virtualize_apic_accesses();
1000}
1001
1002static inline bool cpu_has_vmx_ept_execute_only(void)
1003{
1004	return vmx_capability.ept & VMX_EPT_EXECUTE_ONLY_BIT;
1005}
1006
1007static inline bool cpu_has_vmx_ept_2m_page(void)
1008{
1009	return vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT;
1010}
1011
1012static inline bool cpu_has_vmx_ept_1g_page(void)
1013{
1014	return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
1015}
1016
1017static inline bool cpu_has_vmx_ept_4levels(void)
1018{
1019	return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
1020}
1021
1022static inline bool cpu_has_vmx_ept_ad_bits(void)
1023{
1024	return vmx_capability.ept & VMX_EPT_AD_BIT;
1025}
1026
1027static inline bool cpu_has_vmx_invept_context(void)
1028{
1029	return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
1030}
1031
1032static inline bool cpu_has_vmx_invept_global(void)
1033{
1034	return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
1035}
1036
1037static inline bool cpu_has_vmx_invvpid_single(void)
1038{
1039	return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
1040}
1041
1042static inline bool cpu_has_vmx_invvpid_global(void)
1043{
1044	return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
1045}
1046
1047static inline bool cpu_has_vmx_ept(void)
1048{
1049	return vmcs_config.cpu_based_2nd_exec_ctrl &
1050		SECONDARY_EXEC_ENABLE_EPT;
1051}
1052
1053static inline bool cpu_has_vmx_unrestricted_guest(void)
1054{
1055	return vmcs_config.cpu_based_2nd_exec_ctrl &
1056		SECONDARY_EXEC_UNRESTRICTED_GUEST;
1057}
1058
1059static inline bool cpu_has_vmx_ple(void)
1060{
1061	return vmcs_config.cpu_based_2nd_exec_ctrl &
1062		SECONDARY_EXEC_PAUSE_LOOP_EXITING;
1063}
1064
1065static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
1066{
1067	return flexpriority_enabled && irqchip_in_kernel(kvm);
1068}
1069
1070static inline bool cpu_has_vmx_vpid(void)
1071{
1072	return vmcs_config.cpu_based_2nd_exec_ctrl &
1073		SECONDARY_EXEC_ENABLE_VPID;
1074}
1075
1076static inline bool cpu_has_vmx_rdtscp(void)
1077{
1078	return vmcs_config.cpu_based_2nd_exec_ctrl &
1079		SECONDARY_EXEC_RDTSCP;
1080}
1081
1082static inline bool cpu_has_vmx_invpcid(void)
1083{
1084	return vmcs_config.cpu_based_2nd_exec_ctrl &
1085		SECONDARY_EXEC_ENABLE_INVPCID;
1086}
1087
1088static inline bool cpu_has_virtual_nmis(void)
1089{
1090	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
1091}
1092
1093static inline bool cpu_has_vmx_wbinvd_exit(void)
1094{
1095	return vmcs_config.cpu_based_2nd_exec_ctrl &
1096		SECONDARY_EXEC_WBINVD_EXITING;
1097}
1098
1099static inline bool cpu_has_vmx_shadow_vmcs(void)
1100{
1101	u64 vmx_msr;
1102	rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
1103	/* check if the cpu supports writing r/o exit information fields */
1104	if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
1105		return false;
1106
1107	return vmcs_config.cpu_based_2nd_exec_ctrl &
1108		SECONDARY_EXEC_SHADOW_VMCS;
1109}
1110
1111static inline bool cpu_has_vmx_pml(void)
1112{
1113	return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
1114}
1115
1116static inline bool report_flexpriority(void)
1117{
1118	return flexpriority_enabled;
1119}
1120
1121static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
1122{
1123	return vmcs12->cpu_based_vm_exec_control & bit;
1124}
1125
1126static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
1127{
1128	return (vmcs12->cpu_based_vm_exec_control &
1129			CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
1130		(vmcs12->secondary_vm_exec_control & bit);
1131}
1132
1133static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
1134{
1135	return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
1136}
1137
1138static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
1139{
1140	return vmcs12->pin_based_vm_exec_control &
1141		PIN_BASED_VMX_PREEMPTION_TIMER;
1142}
1143
1144static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
1145{
1146	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
1147}
1148
1149static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
1150{
1151	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES) &&
1152		vmx_xsaves_supported();
1153}
1154
1155static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
1156{
1157	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
1158}
1159
1160static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
1161{
1162	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
1163}
1164
1165static inline bool nested_cpu_has_vid(struct vmcs12 *vmcs12)
1166{
1167	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
1168}
1169
1170static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
1171{
1172	return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
1173}
1174
1175static inline bool is_exception(u32 intr_info)
1176{
1177	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
1178		== (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
1179}
1180
1181static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
1182			      u32 exit_intr_info,
1183			      unsigned long exit_qualification);
1184static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
1185			struct vmcs12 *vmcs12,
1186			u32 reason, unsigned long qualification);
1187
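/*
 * Return the index into vmx->guest_msrs of the entry tracking @msr, or -1
 * if the MSR is not in the shared MSR list.
 */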
1188static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
1189{
1190	int i;
1191
1192	for (i = 0; i < vmx->nmsrs; ++i)
1193		if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
1194			return i;
1195	return -1;
1196}
1197
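/*
 * Wrappers for the INVVPID and INVEPT instructions, which invalidate
 * VPID-tagged and EPT-derived TLB entries respectively; both take a 128-bit
 * in-memory descriptor operand.
 */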
1198static inline void __invvpid(int ext, u16 vpid, gva_t gva)
1199{
	struct {
		u64 vpid : 16;
		u64 rsvd : 48;
		u64 gva;
	} operand = { vpid, 0, gva };

	asm volatile (__ex(ASM_VMX_INVVPID)
		      /* CF==1 or ZF==1 --> rc = -1 */
		      "; ja 1f ; ud2 ; 1:"
		      : : "a"(&operand), "c"(ext) : "cc", "memory");
1210}
1211
1212static inline void __invept(int ext, u64 eptp, gpa_t gpa)
1213{
1214	struct {
1215		u64 eptp, gpa;
1216	} operand = {eptp, gpa};
1217
1218	asm volatile (__ex(ASM_VMX_INVEPT)
1219			/* CF==1 or ZF==1 --> rc = -1 */
1220			"; ja 1f ; ud2 ; 1:\n"
1221			: : "a" (&operand), "c" (ext) : "cc", "memory");
1222}
1223
1224static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
1225{
1226	int i;
1227
1228	i = __find_msr_index(vmx, msr);
1229	if (i >= 0)
1230		return &vmx->guest_msrs[i];
1231	return NULL;
1232}
1233
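/*
 * Execute VMCLEAR on @vmcs: flush its cached state to memory and mark it
 * inactive and not current on this CPU.
 */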
1234static void vmcs_clear(struct vmcs *vmcs)
1235{
1236	u64 phys_addr = __pa(vmcs);
1237	u8 error;
1238
1239	asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
1240		      : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
1241		      : "cc", "memory");
1242	if (error)
1243		printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
1244		       vmcs, phys_addr);
1245}
1246
1247static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
1248{
1249	vmcs_clear(loaded_vmcs->vmcs);
1250	loaded_vmcs->cpu = -1;
1251	loaded_vmcs->launched = 0;
1252}
1253
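/*
 * Execute VMPTRLD on @vmcs, making it the current VMCS on this CPU so that
 * subsequent VMREAD/VMWRITE/VMLAUNCH/VMRESUME operate on it.
 */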
1254static void vmcs_load(struct vmcs *vmcs)
1255{
1256	u64 phys_addr = __pa(vmcs);
1257	u8 error;
1258
1259	asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
1260			: "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
1261			: "cc", "memory");
1262	if (error)
1263		printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
1264		       vmcs, phys_addr);
1265}
1266
1267#ifdef CONFIG_KEXEC
/*
 * This bitmap indicates, per CPU, whether the crash-time vmclear operation
 * is enabled.  All CPUs are disabled by default.
 */
1273static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
1274
1275static inline void crash_enable_local_vmclear(int cpu)
1276{
1277	cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
1278}
1279
1280static inline void crash_disable_local_vmclear(int cpu)
1281{
1282	cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
1283}
1284
1285static inline int crash_local_vmclear_enabled(int cpu)
1286{
1287	return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
1288}
1289
1290static void crash_vmclear_local_loaded_vmcss(void)
1291{
1292	int cpu = raw_smp_processor_id();
1293	struct loaded_vmcs *v;
1294
1295	if (!crash_local_vmclear_enabled(cpu))
1296		return;
1297
1298	list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
1299			    loaded_vmcss_on_cpu_link)
1300		vmcs_clear(v->vmcs);
1301}
1302#else
1303static inline void crash_enable_local_vmclear(int cpu) { }
1304static inline void crash_disable_local_vmclear(int cpu) { }
1305#endif /* CONFIG_KEXEC */
1306
1307static void __loaded_vmcs_clear(void *arg)
1308{
1309	struct loaded_vmcs *loaded_vmcs = arg;
1310	int cpu = raw_smp_processor_id();
1311
1312	if (loaded_vmcs->cpu != cpu)
1313		return; /* vcpu migration can race with cpu offline */
1314	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
1315		per_cpu(current_vmcs, cpu) = NULL;
1316	crash_disable_local_vmclear(cpu);
1317	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
1318
	/*
	 * Ensure that the deletion of loaded_vmcs->loaded_vmcss_on_cpu_link
	 * above happens before setting loaded_vmcs->cpu to -1, which is done
	 * in loaded_vmcs_init. Otherwise another CPU could see cpu == -1
	 * first and add the vmcs to its per-cpu list before it is deleted
	 * here.
	 */
1325	smp_wmb();
1326
1327	loaded_vmcs_init(loaded_vmcs);
1328	crash_enable_local_vmclear(cpu);
1329}
1330
1331static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
1332{
1333	int cpu = loaded_vmcs->cpu;
1334
1335	if (cpu != -1)
1336		smp_call_function_single(cpu,
1337			 __loaded_vmcs_clear, loaded_vmcs, 1);
1338}
1339
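/*
 * VPID and EPT TLB invalidation helpers: use the single-context variant
 * when the CPU supports it, otherwise fall back to a global invalidation.
 */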
1340static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
1341{
1342	if (vmx->vpid == 0)
1343		return;
1344
1345	if (cpu_has_vmx_invvpid_single())
1346		__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
1347}
1348
1349static inline void vpid_sync_vcpu_global(void)
1350{
1351	if (cpu_has_vmx_invvpid_global())
1352		__invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
1353}
1354
1355static inline void vpid_sync_context(struct vcpu_vmx *vmx)
1356{
1357	if (cpu_has_vmx_invvpid_single())
1358		vpid_sync_vcpu_single(vmx);
1359	else
1360		vpid_sync_vcpu_global();
1361}
1362
1363static inline void ept_sync_global(void)
1364{
1365	if (cpu_has_vmx_invept_global())
1366		__invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
1367}
1368
1369static inline void ept_sync_context(u64 eptp)
1370{
1371	if (enable_ept) {
1372		if (cpu_has_vmx_invept_context())
1373			__invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
1374		else
1375			ept_sync_global();
1376	}
1377}
1378
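/*
 * Read a field from the current VMCS with VMREAD.  The 16/32/64-bit helpers
 * below narrow the result or, on 32-bit hosts, combine the two halves of a
 * 64-bit field.
 */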
1379static __always_inline unsigned long vmcs_readl(unsigned long field)
1380{
1381	unsigned long value;
1382
1383	asm volatile (__ex_clear(ASM_VMX_VMREAD_RDX_RAX, "%0")
1384		      : "=a"(value) : "d"(field) : "cc");
1385	return value;
1386}
1387
1388static __always_inline u16 vmcs_read16(unsigned long field)
1389{
1390	return vmcs_readl(field);
1391}
1392
1393static __always_inline u32 vmcs_read32(unsigned long field)
1394{
1395	return vmcs_readl(field);
1396}
1397
1398static __always_inline u64 vmcs_read64(unsigned long field)
1399{
1400#ifdef CONFIG_X86_64
1401	return vmcs_readl(field);
1402#else
1403	return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
1404#endif
1405}
1406
1407static noinline void vmwrite_error(unsigned long field, unsigned long value)
1408{
1409	printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
1410	       field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
1411	dump_stack();
1412}
1413
1414static void vmcs_writel(unsigned long field, unsigned long value)
1415{
1416	u8 error;
1417
1418	asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
1419		       : "=q"(error) : "a"(value), "d"(field) : "cc");
1420	if (unlikely(error))
1421		vmwrite_error(field, value);
1422}
1423
1424static void vmcs_write16(unsigned long field, u16 value)
1425{
1426	vmcs_writel(field, value);
1427}
1428
1429static void vmcs_write32(unsigned long field, u32 value)
1430{
1431	vmcs_writel(field, value);
1432}
1433
1434static void vmcs_write64(unsigned long field, u64 value)
1435{
1436	vmcs_writel(field, value);
1437#ifndef CONFIG_X86_64
1438	asm volatile ("");
1439	vmcs_writel(field+1, value >> 32);
1440#endif
1441}
1442
1443static void vmcs_clear_bits(unsigned long field, u32 mask)
1444{
1445	vmcs_writel(field, vmcs_readl(field) & ~mask);
1446}
1447
1448static void vmcs_set_bits(unsigned long field, u32 mask)
1449{
1450	vmcs_writel(field, vmcs_readl(field) | mask);
1451}
1452
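/*
 * The VM-entry and VM-exit controls are shadowed in vcpu_vmx so that
 * repeated updates can skip the VMWRITE when the value is unchanged.
 */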
1453static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
1454{
1455	vmcs_write32(VM_ENTRY_CONTROLS, val);
1456	vmx->vm_entry_controls_shadow = val;
1457}
1458
1459static inline void vm_entry_controls_set(struct vcpu_vmx *vmx, u32 val)
1460{
1461	if (vmx->vm_entry_controls_shadow != val)
1462		vm_entry_controls_init(vmx, val);
1463}
1464
1465static inline u32 vm_entry_controls_get(struct vcpu_vmx *vmx)
1466{
1467	return vmx->vm_entry_controls_shadow;
1468}
1469
1470
1471static inline void vm_entry_controls_setbit(struct vcpu_vmx *vmx, u32 val)
1472{
1473	vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) | val);
1474}
1475
1476static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
1477{
1478	vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
1479}
1480
1481static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
1482{
1483	vmcs_write32(VM_EXIT_CONTROLS, val);
1484	vmx->vm_exit_controls_shadow = val;
1485}
1486
1487static inline void vm_exit_controls_set(struct vcpu_vmx *vmx, u32 val)
1488{
1489	if (vmx->vm_exit_controls_shadow != val)
1490		vm_exit_controls_init(vmx, val);
1491}
1492
1493static inline u32 vm_exit_controls_get(struct vcpu_vmx *vmx)
1494{
1495	return vmx->vm_exit_controls_shadow;
1496}
1497
1498
1499static inline void vm_exit_controls_setbit(struct vcpu_vmx *vmx, u32 val)
1500{
1501	vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) | val);
1502}
1503
1504static inline void vm_exit_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
1505{
1506	vm_exit_controls_set(vmx, vm_exit_controls_get(vmx) & ~val);
1507}
1508
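/*
 * Guest segment registers are cached to avoid redundant VMREADs;
 * vmx_segment_cache_test_set() marks a field as cached and returns whether
 * it already was.
 */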
1509static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
1510{
1511	vmx->segment_cache.bitmask = 0;
1512}
1513
1514static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
1515				       unsigned field)
1516{
1517	bool ret;
1518	u32 mask = 1 << (seg * SEG_FIELD_NR + field);
1519
1520	if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
1521		vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
1522		vmx->segment_cache.bitmask = 0;
1523	}
1524	ret = vmx->segment_cache.bitmask & mask;
1525	vmx->segment_cache.bitmask |= mask;
1526	return ret;
1527}
1528
1529static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
1530{
1531	u16 *p = &vmx->segment_cache.seg[seg].selector;
1532
1533	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
1534		*p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
1535	return *p;
1536}
1537
1538static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
1539{
1540	ulong *p = &vmx->segment_cache.seg[seg].base;
1541
1542	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
1543		*p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
1544	return *p;
1545}
1546
1547static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
1548{
1549	u32 *p = &vmx->segment_cache.seg[seg].limit;
1550
1551	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
1552		*p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
1553	return *p;
1554}
1555
1556static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
1557{
1558	u32 *p = &vmx->segment_cache.seg[seg].ar;
1559
1560	if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
1561		*p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
1562	return *p;
1563}
1564
1565static void update_exception_bitmap(struct kvm_vcpu *vcpu)
1566{
1567	u32 eb;
1568
1569	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
1570	     (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR);
1571	if ((vcpu->guest_debug &
1572	     (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
1573	    (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
1574		eb |= 1u << BP_VECTOR;
1575	if (to_vmx(vcpu)->rmode.vm86_active)
1576		eb = ~0;
1577	if (enable_ept)
1578		eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
1579	if (vcpu->fpu_active)
1580		eb &= ~(1u << NM_VECTOR);
1581
1582	/* When we are running a nested L2 guest and L1 specified for it a
1583	 * certain exception bitmap, we must trap the same exceptions and pass
1584	 * them to L1. When running L2, we will only handle the exceptions
1585	 * specified above if L1 did not want them.
1586	 */
1587	if (is_guest_mode(vcpu))
1588		eb |= get_vmcs12(vcpu)->exception_bitmap;
1589
1590	vmcs_write32(EXCEPTION_BITMAP, eb);
1591}
1592
1593static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1594		unsigned long entry, unsigned long exit)
1595{
1596	vm_entry_controls_clearbit(vmx, entry);
1597	vm_exit_controls_clearbit(vmx, exit);
1598}
1599
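/*
 * Remove @msr from the VM-entry/VM-exit MSR autoload lists.  For MSRs that
 * the hardware can switch via dedicated VM-entry/VM-exit controls (EFER,
 * PERF_GLOBAL_CTRL), clearing those control bits is sufficient.
 */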
1600static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
1601{
1602	unsigned i;
1603	struct msr_autoload *m = &vmx->msr_autoload;
1604
1605	switch (msr) {
1606	case MSR_EFER:
1607		if (cpu_has_load_ia32_efer) {
1608			clear_atomic_switch_msr_special(vmx,
1609					VM_ENTRY_LOAD_IA32_EFER,
1610					VM_EXIT_LOAD_IA32_EFER);
1611			return;
1612		}
1613		break;
1614	case MSR_CORE_PERF_GLOBAL_CTRL:
1615		if (cpu_has_load_perf_global_ctrl) {
1616			clear_atomic_switch_msr_special(vmx,
1617					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1618					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
1619			return;
1620		}
1621		break;
1622	}
1623
1624	for (i = 0; i < m->nr; ++i)
1625		if (m->guest[i].index == msr)
1626			break;
1627
1628	if (i == m->nr)
1629		return;
1630	--m->nr;
1631	m->guest[i] = m->guest[m->nr];
1632	m->host[i] = m->host[m->nr];
1633	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
1634	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
1635}
1636
1637static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1638		unsigned long entry, unsigned long exit,
1639		unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
1640		u64 guest_val, u64 host_val)
1641{
1642	vmcs_write64(guest_val_vmcs, guest_val);
1643	vmcs_write64(host_val_vmcs, host_val);
1644	vm_entry_controls_setbit(vmx, entry);
1645	vm_exit_controls_setbit(vmx, exit);
1646}
1647
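/*
 * Arrange for @msr to be switched atomically on VM entry/exit: prefer the
 * dedicated VM-entry/VM-exit controls when available, otherwise add the MSR
 * to the autoload lists, which hold at most NR_AUTOLOAD_MSRS entries.
 */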
1648static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
1649				  u64 guest_val, u64 host_val)
1650{
1651	unsigned i;
1652	struct msr_autoload *m = &vmx->msr_autoload;
1653
1654	switch (msr) {
1655	case MSR_EFER:
1656		if (cpu_has_load_ia32_efer) {
1657			add_atomic_switch_msr_special(vmx,
1658					VM_ENTRY_LOAD_IA32_EFER,
1659					VM_EXIT_LOAD_IA32_EFER,
1660					GUEST_IA32_EFER,
1661					HOST_IA32_EFER,
1662					guest_val, host_val);
1663			return;
1664		}
1665		break;
1666	case MSR_CORE_PERF_GLOBAL_CTRL:
1667		if (cpu_has_load_perf_global_ctrl) {
1668			add_atomic_switch_msr_special(vmx,
1669					VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1670					VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
1671					GUEST_IA32_PERF_GLOBAL_CTRL,
1672					HOST_IA32_PERF_GLOBAL_CTRL,
1673					guest_val, host_val);
1674			return;
1675		}
1676		break;
1677	case MSR_IA32_PEBS_ENABLE:
1678		/* PEBS needs a quiescent period after being disabled (to write
1679		 * a record).  Disabling PEBS through VMX MSR swapping doesn't
1680		 * provide that period, so a CPU could write host's record into
1681		 * guest's memory.
1682		 */
1683		wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
1684	}
1685
1686	for (i = 0; i < m->nr; ++i)
1687		if (m->guest[i].index == msr)
1688			break;
1689
1690	if (i == NR_AUTOLOAD_MSRS) {
1691		printk_once(KERN_WARNING "Not enough msr switch entries. "
1692				"Can't add msr %x\n", msr);
1693		return;
1694	} else if (i == m->nr) {
1695		++m->nr;
1696		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr);
1697		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr);
1698	}
1699
1700	m->guest[i].index = msr;
1701	m->guest[i].value = guest_val;
1702	m->host[i].index = msr;
1703	m->host[i].value = host_val;
1704}
1705
1706static void reload_tss(void)
1707{
	/*
	 * VM exit restores the TR selector but not its full limit, so the
	 * TSS (including its I/O bitmap) is not fully usable.  Mark the TSS
	 * descriptor available again and reload TR to restore it.
	 */
1711	struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
1712	struct desc_struct *descs;
1713
1714	descs = (void *)gdt->address;
1715	descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
1716	load_TR_desc();
1717}
1718
1719static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
1720{
1721	u64 guest_efer = vmx->vcpu.arch.efer;
1722	u64 ignore_bits = 0;
1723
1724	if (!enable_ept) {
1725		/*
1726		 * NX is needed to handle CR0.WP=1, CR4.SMEP=1.  Testing
1727		 * host CPUID is more efficient than testing guest CPUID
1728		 * or CR4.  Host SMEP is anyway a requirement for guest SMEP.
1729		 */
1730		if (boot_cpu_has(X86_FEATURE_SMEP))
1731			guest_efer |= EFER_NX;
1732		else if (!(guest_efer & EFER_NX))
1733			ignore_bits |= EFER_NX;
1734	}
1735
1736	/*
1737	 * LMA and LME handled by hardware; SCE meaningless outside long mode.
1738	 */
1739	ignore_bits |= EFER_SCE;
1740#ifdef CONFIG_X86_64
1741	ignore_bits |= EFER_LMA | EFER_LME;
1742	/* SCE is meaningful only in long mode on Intel */
1743	if (guest_efer & EFER_LMA)
1744		ignore_bits &= ~(u64)EFER_SCE;
1745#endif
1746
1747	clear_atomic_switch_msr(vmx, MSR_EFER);
1748
1749	/*
1750	 * On EPT, we can't emulate NX, so we must switch EFER atomically.
1751	 * On CPUs that support "load IA32_EFER", always switch EFER
1752	 * atomically, since it's faster than switching it manually.
1753	 */
1754	if (cpu_has_load_ia32_efer ||
1755	    (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
1756		if (!(guest_efer & EFER_LMA))
1757			guest_efer &= ~EFER_LME;
1758		if (guest_efer != host_efer)
1759			add_atomic_switch_msr(vmx, MSR_EFER,
1760					      guest_efer, host_efer);
1761		return false;
1762	} else {
1763		guest_efer &= ~ignore_bits;
1764		guest_efer |= host_efer & ignore_bits;
1765
1766		vmx->guest_msrs[efer_offset].data = guest_efer;
1767		vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
1768
1769		return true;
1770	}
1771}
1772
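/*
 * Compute the linear base address of the segment referenced by @selector by
 * reading its descriptor from the GDT (or from the LDT when the TI bit is
 * set).
 */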
1773static unsigned long segment_base(u16 selector)
1774{
1775	struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
1776	struct desc_struct *d;
1777	unsigned long table_base;
1778	unsigned long v;
1779
1780	if (!(selector & ~3))
1781		return 0;
1782
1783	table_base = gdt->address;
1784
1785	if (selector & 4) {           /* from ldt */
1786		u16 ldt_selector = kvm_read_ldt();
1787
1788		if (!(ldt_selector & ~3))
1789			return 0;
1790
1791		table_base = segment_base(ldt_selector);
1792	}
1793	d = (struct desc_struct *)(table_base + (selector & ~7));
1794	v = get_desc_base(d);
1795#ifdef CONFIG_X86_64
1796	if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
1797		v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
1798#endif
1799	return v;
1800}
1801
1802static inline unsigned long kvm_read_tr_base(void)
1803{
1804	u16 tr;
1805	asm("str %0" : "=g"(tr));
1806	return segment_base(tr);
1807}
1808
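/*
 * Record the host segment selectors, FS/GS bases and kernel GS base that a
 * guest run may disturb, program the corresponding HOST_* VMCS fields, and
 * load the guest values of the shared MSRs.  The saved state is restored by
 * __vmx_load_host_state().
 */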
1809static void vmx_save_host_state(struct kvm_vcpu *vcpu)
1810{
1811	struct vcpu_vmx *vmx = to_vmx(vcpu);
1812	int i;
1813
1814	if (vmx->host_state.loaded)
1815		return;
1816
1817	vmx->host_state.loaded = 1;
1818	/*
1819	 * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
1820	 * allow segment selectors with cpl > 0 or ti == 1.
1821	 */
1822	vmx->host_state.ldt_sel = kvm_read_ldt();
1823	vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
1824	savesegment(fs, vmx->host_state.fs_sel);
1825	if (!(vmx->host_state.fs_sel & 7)) {
1826		vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
1827		vmx->host_state.fs_reload_needed = 0;
1828	} else {
1829		vmcs_write16(HOST_FS_SELECTOR, 0);
1830		vmx->host_state.fs_reload_needed = 1;
1831	}
1832	savesegment(gs, vmx->host_state.gs_sel);
1833	if (!(vmx->host_state.gs_sel & 7))
1834		vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
1835	else {
1836		vmcs_write16(HOST_GS_SELECTOR, 0);
1837		vmx->host_state.gs_ldt_reload_needed = 1;
1838	}
1839
1840#ifdef CONFIG_X86_64
1841	savesegment(ds, vmx->host_state.ds_sel);
1842	savesegment(es, vmx->host_state.es_sel);
1843#endif
1844
1845#ifdef CONFIG_X86_64
1846	vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
1847	vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
1848#else
1849	vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
1850	vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
1851#endif
1852
1853#ifdef CONFIG_X86_64
1854	rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
1855	if (is_long_mode(&vmx->vcpu))
1856		wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1857#endif
1858	if (boot_cpu_has(X86_FEATURE_MPX))
1859		rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
1860	for (i = 0; i < vmx->save_nmsrs; ++i)
1861		kvm_set_shared_msr(vmx->guest_msrs[i].index,
1862				   vmx->guest_msrs[i].data,
1863				   vmx->guest_msrs[i].mask);
1864}
1865
1866static void __vmx_load_host_state(struct vcpu_vmx *vmx)
1867{
1868	if (!vmx->host_state.loaded)
1869		return;
1870
1871	++vmx->vcpu.stat.host_state_reload;
1872	vmx->host_state.loaded = 0;
1873#ifdef CONFIG_X86_64
1874	if (is_long_mode(&vmx->vcpu))
1875		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1876#endif
1877	if (vmx->host_state.gs_ldt_reload_needed) {
1878		kvm_load_ldt(vmx->host_state.ldt_sel);
1879#ifdef CONFIG_X86_64
1880		load_gs_index(vmx->host_state.gs_sel);
1881#else
1882		loadsegment(gs, vmx->host_state.gs_sel);
1883#endif
1884	}
1885	if (vmx->host_state.fs_reload_needed)
1886		loadsegment(fs, vmx->host_state.fs_sel);
1887#ifdef CONFIG_X86_64
1888	if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
1889		loadsegment(ds, vmx->host_state.ds_sel);
1890		loadsegment(es, vmx->host_state.es_sel);
1891	}
1892#endif
1893	reload_tss();
1894#ifdef CONFIG_X86_64
1895	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
1896#endif
1897	if (vmx->host_state.msr_host_bndcfgs)
1898		wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
1899	/*
1900	 * If the FPU is not active (through the host task or
1901	 * the guest vcpu), then restore the cr0.TS bit.
1902	 */
1903	if (!user_has_fpu() && !vmx->vcpu.guest_fpu_loaded)
1904		stts();
1905	load_gdt(this_cpu_ptr(&host_gdt));
1906}
1907
1908static void vmx_load_host_state(struct vcpu_vmx *vmx)
1909{
1910	preempt_disable();
1911	__vmx_load_host_state(vmx);
1912	preempt_enable();
1913}
1914
1915/*
1916 * Switches to specified vcpu, until a matching vcpu_put(), but assumes
1917 * vcpu mutex is already taken.
1918 */
1919static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1920{
1921	struct vcpu_vmx *vmx = to_vmx(vcpu);
1922	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
1923
1924	if (!vmm_exclusive)
1925		kvm_cpu_vmxon(phys_addr);
1926	else if (vmx->loaded_vmcs->cpu != cpu)
1927		loaded_vmcs_clear(vmx->loaded_vmcs);
1928
1929	if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
1930		per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
1931		vmcs_load(vmx->loaded_vmcs->vmcs);
1932	}
1933
1934	if (vmx->loaded_vmcs->cpu != cpu) {
1935		struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
1936		unsigned long sysenter_esp;
1937
1938		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1939		local_irq_disable();
1940		crash_disable_local_vmclear(cpu);
1941
1942		/*
1943		 * Read loaded_vmcs->cpu should be before fetching
1944		 * loaded_vmcs->loaded_vmcss_on_cpu_link.
1945		 * See the comments in __loaded_vmcs_clear().
1946		 */
1947		smp_rmb();
1948
1949		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
1950			 &per_cpu(loaded_vmcss_on_cpu, cpu));
1951		crash_enable_local_vmclear(cpu);
1952		local_irq_enable();
1953
1954		/*
1955		 * Linux uses per-cpu TSS and GDT, so set these when switching
1956		 * processors.
1957		 */
1958		vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
1959		vmcs_writel(HOST_GDTR_BASE, gdt->address);   /* 22.2.4 */
1960
1961		rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
1962		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
1963		vmx->loaded_vmcs->cpu = cpu;
1964	}
1965}
1966
1967static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
1968{
1969	__vmx_load_host_state(to_vmx(vcpu));
1970	if (!vmm_exclusive) {
1971		__loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
1972		vcpu->cpu = -1;
1973		kvm_cpu_vmxoff();
1974	}
1975}
1976
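/*
 * Make the FPU usable by the guest: stop forcing CR0.TS/MP in GUEST_CR0
 * (restoring the guest's own view of those bits), give the guest ownership
 * of CR0.TS except for bits L1 intercepts when running nested, and refresh
 * the exception bitmap accordingly.
 */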
1977static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
1978{
1979	ulong cr0;
1980
1981	if (vcpu->fpu_active)
1982		return;
1983	vcpu->fpu_active = 1;
1984	cr0 = vmcs_readl(GUEST_CR0);
1985	cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
1986	cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
1987	vmcs_writel(GUEST_CR0, cr0);
1988	update_exception_bitmap(vcpu);
1989	vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
1990	if (is_guest_mode(vcpu))
1991		vcpu->arch.cr0_guest_owned_bits &=
1992			~get_vmcs12(vcpu)->cr0_guest_host_mask;
1993	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
1994}
1995
1996static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
1997
1998/*
1999 * Return the cr0 value that a nested guest would read. This is a combination
2000 * of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
2001 * its hypervisor (cr0_read_shadow).
2002 */
2003static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
2004{
2005	return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
2006		(fields->cr0_read_shadow & fields->cr0_guest_host_mask);
2007}
2008static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
2009{
2010	return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
2011		(fields->cr4_read_shadow & fields->cr4_guest_host_mask);
2012}
2013
2014static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
2015{
2016	/* Note that there is no vcpu->fpu_active = 0 here. The caller must
2017	 * set this *before* calling this function.
2018	 */
2019	vmx_decache_cr0_guest_bits(vcpu);
2020	vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
2021	update_exception_bitmap(vcpu);
2022	vcpu->arch.cr0_guest_owned_bits = 0;
2023	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2024	if (is_guest_mode(vcpu)) {
2025		/*
2026		 * L1's specified read shadow might not contain the TS bit,
2027		 * so now that we turned on shadowing of this bit, we need to
2028		 * set this bit of the shadow. Like in nested_vmx_run we need
2029		 * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet
2030		 * up-to-date here because we just decached cr0.TS (and we'll
2031		 * only update vmcs12->guest_cr0 on nested exit).
2032		 */
2033		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2034		vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) |
2035			(vcpu->arch.cr0 & X86_CR0_TS);
2036		vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
2037	} else
2038		vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
2039}
2040
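/*
 * RFLAGS is cached in vmx->rflags and read from the VMCS only when the cache
 * is invalid.  In vm86 emulation the bits forced on in hardware by
 * vmx_set_rflags() (IOPL and VM) are recovered from rmode.save_rflags.
 */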
2041static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
2042{
2043	unsigned long rflags, save_rflags;
2044
2045	if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
2046		__set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
2047		rflags = vmcs_readl(GUEST_RFLAGS);
2048		if (to_vmx(vcpu)->rmode.vm86_active) {
2049			rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
2050			save_rflags = to_vmx(vcpu)->rmode.save_rflags;
2051			rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
2052		}
2053		to_vmx(vcpu)->rflags = rflags;
2054	}
2055	return to_vmx(vcpu)->rflags;
2056}
2057
2058static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
2059{
2060	__set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
2061	to_vmx(vcpu)->rflags = rflags;
2062	if (to_vmx(vcpu)->rmode.vm86_active) {
2063		to_vmx(vcpu)->rmode.save_rflags = rflags;
2064		rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
2065	}
2066	vmcs_writel(GUEST_RFLAGS, rflags);
2067}
2068
2069static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
2070{
2071	u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2072	int ret = 0;
2073
2074	if (interruptibility & GUEST_INTR_STATE_STI)
2075		ret |= KVM_X86_SHADOW_INT_STI;
2076	if (interruptibility & GUEST_INTR_STATE_MOV_SS)
2077		ret |= KVM_X86_SHADOW_INT_MOV_SS;
2078
2079	return ret;
2080}
2081
2082static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
2083{
2084	u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
2085	u32 interruptibility = interruptibility_old;
2086
2087	interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
2088
2089	if (mask & KVM_X86_SHADOW_INT_MOV_SS)
2090		interruptibility |= GUEST_INTR_STATE_MOV_SS;
2091	else if (mask & KVM_X86_SHADOW_INT_STI)
2092		interruptibility |= GUEST_INTR_STATE_STI;
2093
2094	if (interruptibility != interruptibility_old)
2095		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
2096}
2097
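/*
 * Advance RIP past the instruction that caused the current VM exit, using
 * the instruction length reported by hardware, and drop any interrupt
 * shadow, since skipping an emulated instruction also clears it.
 */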
2098static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
2099{
2100	unsigned long rip;
2101
2102	rip = kvm_rip_read(vcpu);
2103	rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2104	kvm_rip_write(vcpu, rip);
2105
2106	/* skipping an emulated instruction also counts */
2107	vmx_set_interrupt_shadow(vcpu, 0);
2108}
2109
2110/*
2111 * KVM wants to inject page faults which it received into the guest. This
2112 * function checks whether, for a nested guest, they go to L1 or to L2.
2113 */
2114static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned nr)
2115{
2116	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2117
2118	if (!(vmcs12->exception_bitmap & (1u << nr)))
2119		return 0;
2120
2121	nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
2122			  vmcs_read32(VM_EXIT_INTR_INFO),
2123			  vmcs_readl(EXIT_QUALIFICATION));
2124	return 1;
2125}
2126
2127static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
2128				bool has_error_code, u32 error_code,
2129				bool reinject)
2130{
2131	struct vcpu_vmx *vmx = to_vmx(vcpu);
2132	u32 intr_info = nr | INTR_INFO_VALID_MASK;
2133
2134	if (!reinject && is_guest_mode(vcpu) &&
2135	    nested_vmx_check_exception(vcpu, nr))
2136		return;
2137
2138	if (has_error_code) {
2139		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
2140		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
2141	}
2142
2143	if (vmx->rmode.vm86_active) {
2144		int inc_eip = 0;
2145		if (kvm_exception_is_soft(nr))
2146			inc_eip = vcpu->arch.event_exit_inst_len;
2147		if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
2148			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2149		return;
2150	}
2151
2152	if (kvm_exception_is_soft(nr)) {
2153		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2154			     vmx->vcpu.arch.event_exit_inst_len);
2155		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
2156	} else
2157		intr_info |= INTR_TYPE_HARD_EXCEPTION;
2158
2159	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
2160}
2161
2162static bool vmx_rdtscp_supported(void)
2163{
2164	return cpu_has_vmx_rdtscp();
2165}
2166
2167static bool vmx_invpcid_supported(void)
2168{
2169	return cpu_has_vmx_invpcid() && enable_ept;
2170}
2171
2172/*
2173 * Swap MSR entry in host/guest MSR entry array.
2174 */
2175static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
2176{
2177	struct shared_msr_entry tmp;
2178
2179	tmp = vmx->guest_msrs[to];
2180	vmx->guest_msrs[to] = vmx->guest_msrs[from];
2181	vmx->guest_msrs[from] = tmp;
2182}
2183
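/*
 * Point the VMCS at the right MSR bitmap: the nested bitmap while running
 * L2, an x2APIC-aware bitmap when the in-kernel APIC is in x2APIC mode, or
 * the plain legacy/long-mode bitmap otherwise.
 */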
2184static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
2185{
2186	unsigned long *msr_bitmap;
2187
2188	if (is_guest_mode(vcpu))
2189		msr_bitmap = vmx_msr_bitmap_nested;
2190	else if (irqchip_in_kernel(vcpu->kvm) &&
2191		apic_x2apic_mode(vcpu->arch.apic)) {
2192		if (is_long_mode(vcpu))
2193			msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
2194		else
2195			msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
2196	} else {
2197		if (is_long_mode(vcpu))
2198			msr_bitmap = vmx_msr_bitmap_longmode;
2199		else
2200			msr_bitmap = vmx_msr_bitmap_legacy;
2201	}
2202
2203	vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
2204}
2205
2206/*
2207 * Set up the vmcs to automatically save and restore system
2208 * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
2209 * mode, as fiddling with msrs is very expensive.
2210 */
2211static void setup_msrs(struct vcpu_vmx *vmx)
2212{
2213	int save_nmsrs, index;
2214
2215	save_nmsrs = 0;
2216#ifdef CONFIG_X86_64
2217	if (is_long_mode(&vmx->vcpu)) {
2218		index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
2219		if (index >= 0)
2220			move_msr_up(vmx, index, save_nmsrs++);
2221		index = __find_msr_index(vmx, MSR_LSTAR);
2222		if (index >= 0)
2223			move_msr_up(vmx, index, save_nmsrs++);
2224		index = __find_msr_index(vmx, MSR_CSTAR);
2225		if (index >= 0)
2226			move_msr_up(vmx, index, save_nmsrs++);
2227		index = __find_msr_index(vmx, MSR_TSC_AUX);
2228		if (index >= 0 && vmx->rdtscp_enabled)
2229			move_msr_up(vmx, index, save_nmsrs++);
2230		/*
2231		 * MSR_STAR is only needed on long mode guests, and only
2232		 * if efer.sce is enabled.
2233		 */
2234		index = __find_msr_index(vmx, MSR_STAR);
2235		if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
2236			move_msr_up(vmx, index, save_nmsrs++);
2237	}
2238#endif
2239	index = __find_msr_index(vmx, MSR_EFER);
2240	if (index >= 0 && update_transition_efer(vmx, index))
2241		move_msr_up(vmx, index, save_nmsrs++);
2242
2243	vmx->save_nmsrs = save_nmsrs;
2244
2245	if (cpu_has_vmx_msr_bitmap())
2246		vmx_set_msr_bitmap(&vmx->vcpu);
2247}
2248
2249/*
2250 * reads and returns guest's timestamp counter "register"
2251 * guest_tsc = host_tsc + tsc_offset    -- 21.3
2252 */
2253static u64 guest_read_tsc(void)
2254{
2255	u64 host_tsc, tsc_offset;
2256
2257	rdtscll(host_tsc);
2258	tsc_offset = vmcs_read64(TSC_OFFSET);
2259	return host_tsc + tsc_offset;
2260}
2261
2262/*
2263 * Like guest_read_tsc, but always returns L1's notion of the timestamp
2264 * counter, even if a nested guest (L2) is currently running.
2265 */
2266static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
2267{
2268	u64 tsc_offset;
2269
2270	tsc_offset = is_guest_mode(vcpu) ?
2271		to_vmx(vcpu)->nested.vmcs01_tsc_offset :
2272		vmcs_read64(TSC_OFFSET);
2273	return host_tsc + tsc_offset;
2274}
2275
2276/*
2277 * Engage any workarounds for mis-matched TSC rates.  Currently limited to
2278 * software catchup for faster rates on slower CPUs.
2279 */
2280static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
2281{
2282	if (!scale)
2283		return;
2284
2285	if (user_tsc_khz > tsc_khz) {
2286		vcpu->arch.tsc_catchup = 1;
2287		vcpu->arch.tsc_always_catchup = 1;
2288	} else
2289		WARN(1, "user requested TSC rate below hardware speed\n");
2290}
2291
2292static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
2293{
2294	return vmcs_read64(TSC_OFFSET);
2295}
2296
2297/*
2298 * writes 'offset' into guest's timestamp counter offset register
2299 */
2300static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
2301{
2302	if (is_guest_mode(vcpu)) {
2303		/*
2304		 * We're here if L1 chose not to trap WRMSR to TSC. According
2305		 * to the spec, this should set L1's TSC; The offset that L1
2306		 * set for L2 remains unchanged, and still needs to be added
2307		 * to the newly set TSC to get L2's TSC.
2308		 */
2309		struct vmcs12 *vmcs12;
2310		to_vmx(vcpu)->nested.vmcs01_tsc_offset = offset;
2311		/* recalculate vmcs02.TSC_OFFSET: */
2312		vmcs12 = get_vmcs12(vcpu);
2313		vmcs_write64(TSC_OFFSET, offset +
2314			(nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ?
2315			 vmcs12->tsc_offset : 0));
2316	} else {
2317		trace_kvm_write_tsc_offset(vcpu->vcpu_id,
2318					   vmcs_read64(TSC_OFFSET), offset);
2319		vmcs_write64(TSC_OFFSET, offset);
2320	}
2321}
2322
2323static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
2324{
2325	u64 offset = vmcs_read64(TSC_OFFSET);
2326
2327	vmcs_write64(TSC_OFFSET, offset + adjustment);
2328	if (is_guest_mode(vcpu)) {
2329		/* Even when running L2, the adjustment needs to apply to L1 */
2330		to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment;
2331	} else
2332		trace_kvm_write_tsc_offset(vcpu->vcpu_id, offset,
2333					   offset + adjustment);
2334}
2335
2336static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
2337{
2338	return target_tsc - native_read_tsc();
2339}
2340
2341static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
2342{
2343	struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
2344	return best && (best->ecx & (1 << (X86_FEATURE_VMX & 31)));
2345}
2346
2347/*
2348 * nested_vmx_allowed() checks whether a guest should be allowed to use VMX
2349 * instructions and MSRs (i.e., nested VMX). Nested VMX is disabled for
2350 * all guests if the "nested" module option is off, and can also be disabled
2351 * for a single guest by disabling its VMX cpuid bit.
2352 */
2353static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu)
2354{
2355	return nested && guest_cpuid_has_vmx(vcpu);
2356}
2357
2358/*
2359 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
2360 * returned for the various VMX controls MSRs when nested VMX is enabled.
2361 * The same values should also be used to verify that vmcs12 control fields are
2362 * valid during nested entry from L1 to L2.
2363 * Each of these control msrs has a low and high 32-bit half: A low bit is on
2364 * if the corresponding bit in the (32-bit) control field *must* be on, and a
2365 * bit in the high half is on if the corresponding bit in the control field
2366 * may be on. See also vmx_control_verify().
2367 */
2368static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
2369{
2370	/*
2371	 * Note that as a general rule, the high half of the MSRs (bits in
2372	 * the control fields which may be 1) should be initialized by the
2373	 * intersection of the underlying hardware's MSR (i.e., features which
2374	 * can be supported) and the list of features we want to expose -
2375	 * because they are known to be properly supported in our code.
2376	 * Also, usually, the low half of the MSRs (bits which must be 1) can
2377	 * be set to 0, meaning that L1 may turn off any of these bits. The
2378	 * reason is that if one of these bits is necessary, it will appear
2379	 * in vmcs01, and prepare_vmcs02, which bitwise-or's the control
2380	 * fields of vmcs01 and vmcs12, will keep these bits on in vmcs02 -
2381	 * and nested_vmx_exit_handled() will not pass related exits to L1.
2382	 * These rules have exceptions below.
2383	 */
2384
2385	/* pin-based controls */
2386	rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
2387		vmx->nested.nested_vmx_pinbased_ctls_low,
2388		vmx->nested.nested_vmx_pinbased_ctls_high);
2389	vmx->nested.nested_vmx_pinbased_ctls_low |=
2390		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2391	vmx->nested.nested_vmx_pinbased_ctls_high &=
2392		PIN_BASED_EXT_INTR_MASK |
2393		PIN_BASED_NMI_EXITING |
2394		PIN_BASED_VIRTUAL_NMIS;
2395	vmx->nested.nested_vmx_pinbased_ctls_high |=
2396		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
2397		PIN_BASED_VMX_PREEMPTION_TIMER;
2398	if (vmx_vm_has_apicv(vmx->vcpu.kvm))
2399		vmx->nested.nested_vmx_pinbased_ctls_high |=
2400			PIN_BASED_POSTED_INTR;
2401
2402	/* exit controls */
2403	rdmsr(MSR_IA32_VMX_EXIT_CTLS,
2404		vmx->nested.nested_vmx_exit_ctls_low,
2405		vmx->nested.nested_vmx_exit_ctls_high);
2406	vmx->nested.nested_vmx_exit_ctls_low =
2407		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
2408
2409	vmx->nested.nested_vmx_exit_ctls_high &=
2410#ifdef CONFIG_X86_64
2411		VM_EXIT_HOST_ADDR_SPACE_SIZE |
2412#endif
2413		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
2414	vmx->nested.nested_vmx_exit_ctls_high |=
2415		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
2416		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
2417		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
2418
2419	if (vmx_mpx_supported())
2420		vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
2421
2422	/* We support free control of debug control saving. */
2423	vmx->nested.nested_vmx_true_exit_ctls_low =
2424		vmx->nested.nested_vmx_exit_ctls_low &
2425		~VM_EXIT_SAVE_DEBUG_CONTROLS;
2426
2427	/* entry controls */
2428	rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
2429		vmx->nested.nested_vmx_entry_ctls_low,
2430		vmx->nested.nested_vmx_entry_ctls_high);
2431	vmx->nested.nested_vmx_entry_ctls_low =
2432		VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
2433	vmx->nested.nested_vmx_entry_ctls_high &=
2434#ifdef CONFIG_X86_64
2435		VM_ENTRY_IA32E_MODE |
2436#endif
2437		VM_ENTRY_LOAD_IA32_PAT;
2438	vmx->nested.nested_vmx_entry_ctls_high |=
2439		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
2440	if (vmx_mpx_supported())
2441		vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
2442
2443	/* We support free control of debug control loading. */
2444	vmx->nested.nested_vmx_true_entry_ctls_low =
2445		vmx->nested.nested_vmx_entry_ctls_low &
2446		~VM_ENTRY_LOAD_DEBUG_CONTROLS;
2447
2448	/* cpu-based controls */
2449	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
2450		vmx->nested.nested_vmx_procbased_ctls_low,
2451		vmx->nested.nested_vmx_procbased_ctls_high);
2452	vmx->nested.nested_vmx_procbased_ctls_low =
2453		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
2454	vmx->nested.nested_vmx_procbased_ctls_high &=
2455		CPU_BASED_VIRTUAL_INTR_PENDING |
2456		CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
2457		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
2458		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
2459		CPU_BASED_CR3_STORE_EXITING |
2460#ifdef CONFIG_X86_64
2461		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
2462#endif
2463		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
2464		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_EXITING |
2465		CPU_BASED_RDPMC_EXITING | CPU_BASED_RDTSC_EXITING |
2466		CPU_BASED_PAUSE_EXITING | CPU_BASED_TPR_SHADOW |
2467		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2468	/*
2469	 * We can allow some features even when not supported by the
2470	 * hardware. For example, L1 can specify an MSR bitmap - and we
2471	 * can use it to avoid exits to L1 - even when L0 runs L2
2472	 * without MSR bitmaps.
2473	 */
2474	vmx->nested.nested_vmx_procbased_ctls_high |=
2475		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
2476		CPU_BASED_USE_MSR_BITMAPS;
2477
2478	/* We support free control of CR3 access interception. */
2479	vmx->nested.nested_vmx_true_procbased_ctls_low =
2480		vmx->nested.nested_vmx_procbased_ctls_low &
2481		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
2482
2483	/* secondary cpu-based controls */
2484	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
2485		vmx->nested.nested_vmx_secondary_ctls_low,
2486		vmx->nested.nested_vmx_secondary_ctls_high);
2487	vmx->nested.nested_vmx_secondary_ctls_low = 0;
2488	vmx->nested.nested_vmx_secondary_ctls_high &=
2489		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2490		SECONDARY_EXEC_RDTSCP |
2491		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
2492		SECONDARY_EXEC_APIC_REGISTER_VIRT |
2493		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2494		SECONDARY_EXEC_WBINVD_EXITING |
2495		SECONDARY_EXEC_XSAVES;
2496
2497	if (enable_ept) {
2498		/* nested EPT: emulate EPT also to L1 */
2499		vmx->nested.nested_vmx_secondary_ctls_high |=
2500			SECONDARY_EXEC_ENABLE_EPT;
2501		vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
2502			 VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
2503			 VMX_EPT_INVEPT_BIT;
2504		vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
2505		/*
2506		 * For nested guests, we don't do anything specific
2507		 * for single context invalidation. Hence, only advertise
2508		 * support for global context invalidation.
2509		 */
2510		vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT;
2511	} else
2512		vmx->nested.nested_vmx_ept_caps = 0;
2513
2514	if (enable_unrestricted_guest)
2515		vmx->nested.nested_vmx_secondary_ctls_high |=
2516			SECONDARY_EXEC_UNRESTRICTED_GUEST;
2517
2518	/* miscellaneous data */
2519	rdmsr(MSR_IA32_VMX_MISC,
2520		vmx->nested.nested_vmx_misc_low,
2521		vmx->nested.nested_vmx_misc_high);
2522	vmx->nested.nested_vmx_misc_low &= VMX_MISC_SAVE_EFER_LMA;
2523	vmx->nested.nested_vmx_misc_low |=
2524		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
2525		VMX_MISC_ACTIVITY_HLT;
2526	vmx->nested.nested_vmx_misc_high = 0;
2527}
2528
2529static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
2530{
2531	/*
2532	 * Bits that are 0 in high must be 0 in control; bits that are 1 in low must be 1.
2533	 */
2534	return ((control & high) | low) == control;
2535}
2536
2537static inline u64 vmx_control_msr(u32 low, u32 high)
2538{
2539	return low | ((u64)high << 32);
2540}
2541
2542/* Returns 0 on success, non-0 otherwise. */
2543static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2544{
2545	struct vcpu_vmx *vmx = to_vmx(vcpu);
2546
2547	switch (msr_index) {
2548	case MSR_IA32_VMX_BASIC:
2549		/*
2550		 * This MSR reports some information about VMX support. We
2551		 * should return information about the VMX we emulate for the
2552		 * guest, and the VMCS structure we give it - not about the
2553		 * VMX support of the underlying hardware.
2554		 */
2555		*pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS |
2556			   ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
2557			   (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
2558		break;
2559	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
2560	case MSR_IA32_VMX_PINBASED_CTLS:
2561		*pdata = vmx_control_msr(
2562			vmx->nested.nested_vmx_pinbased_ctls_low,
2563			vmx->nested.nested_vmx_pinbased_ctls_high);
2564		break;
2565	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
2566		*pdata = vmx_control_msr(
2567			vmx->nested.nested_vmx_true_procbased_ctls_low,
2568			vmx->nested.nested_vmx_procbased_ctls_high);
2569		break;
2570	case MSR_IA32_VMX_PROCBASED_CTLS:
2571		*pdata = vmx_control_msr(
2572			vmx->nested.nested_vmx_procbased_ctls_low,
2573			vmx->nested.nested_vmx_procbased_ctls_high);
2574		break;
2575	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
2576		*pdata = vmx_control_msr(
2577			vmx->nested.nested_vmx_true_exit_ctls_low,
2578			vmx->nested.nested_vmx_exit_ctls_high);
2579		break;
2580	case MSR_IA32_VMX_EXIT_CTLS:
2581		*pdata = vmx_control_msr(
2582			vmx->nested.nested_vmx_exit_ctls_low,
2583			vmx->nested.nested_vmx_exit_ctls_high);
2584		break;
2585	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
2586		*pdata = vmx_control_msr(
2587			vmx->nested.nested_vmx_true_entry_ctls_low,
2588			vmx->nested.nested_vmx_entry_ctls_high);
2589		break;
2590	case MSR_IA32_VMX_ENTRY_CTLS:
2591		*pdata = vmx_control_msr(
2592			vmx->nested.nested_vmx_entry_ctls_low,
2593			vmx->nested.nested_vmx_entry_ctls_high);
2594		break;
2595	case MSR_IA32_VMX_MISC:
2596		*pdata = vmx_control_msr(
2597			vmx->nested.nested_vmx_misc_low,
2598			vmx->nested.nested_vmx_misc_high);
2599		break;
2600	/*
2601	 * These MSRs specify bits which the guest must keep fixed (on or off)
2602	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
2603	 * We picked the standard core2 setting.
2604	 */
2605#define VMXON_CR0_ALWAYSON	(X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
2606#define VMXON_CR4_ALWAYSON	X86_CR4_VMXE
2607	case MSR_IA32_VMX_CR0_FIXED0:
2608		*pdata = VMXON_CR0_ALWAYSON;
2609		break;
2610	case MSR_IA32_VMX_CR0_FIXED1:
2611		*pdata = -1ULL;
2612		break;
2613	case MSR_IA32_VMX_CR4_FIXED0:
2614		*pdata = VMXON_CR4_ALWAYSON;
2615		break;
2616	case MSR_IA32_VMX_CR4_FIXED1:
2617		*pdata = -1ULL;
2618		break;
2619	case MSR_IA32_VMX_VMCS_ENUM:
2620		*pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */
2621		break;
2622	case MSR_IA32_VMX_PROCBASED_CTLS2:
2623		*pdata = vmx_control_msr(
2624			vmx->nested.nested_vmx_secondary_ctls_low,
2625			vmx->nested.nested_vmx_secondary_ctls_high);
2626		break;
2627	case MSR_IA32_VMX_EPT_VPID_CAP:
2628		/* Currently, no nested vpid support */
2629		*pdata = vmx->nested.nested_vmx_ept_caps;
2630		break;
2631	default:
2632		return 1;
2633	}
2634
2635	return 0;
2636}
2637
2638/*
2639 * Reads an msr value (of 'msr_index') into 'pdata'.
2640 * Returns 0 on success, non-0 otherwise.
2641 * Assumes vcpu_load() was already called.
2642 */
2643static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2644{
2645	u64 data;
2646	struct shared_msr_entry *msr;
2647
2648	if (!pdata) {
2649		printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
2650		return -EINVAL;
2651	}
2652
2653	switch (msr_index) {
2654#ifdef CONFIG_X86_64
2655	case MSR_FS_BASE:
2656		data = vmcs_readl(GUEST_FS_BASE);
2657		break;
2658	case MSR_GS_BASE:
2659		data = vmcs_readl(GUEST_GS_BASE);
2660		break;
2661	case MSR_KERNEL_GS_BASE:
2662		vmx_load_host_state(to_vmx(vcpu));
2663		data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
2664		break;
2665#endif
2666	case MSR_EFER:
2667		return kvm_get_msr_common(vcpu, msr_index, pdata);
2668	case MSR_IA32_TSC:
2669		data = guest_read_tsc();
2670		break;
2671	case MSR_IA32_SYSENTER_CS:
2672		data = vmcs_read32(GUEST_SYSENTER_CS);
2673		break;
2674	case MSR_IA32_SYSENTER_EIP:
2675		data = vmcs_readl(GUEST_SYSENTER_EIP);
2676		break;
2677	case MSR_IA32_SYSENTER_ESP:
2678		data = vmcs_readl(GUEST_SYSENTER_ESP);
2679		break;
2680	case MSR_IA32_BNDCFGS:
2681		if (!vmx_mpx_supported())
2682			return 1;
2683		data = vmcs_read64(GUEST_BNDCFGS);
2684		break;
2685	case MSR_IA32_FEATURE_CONTROL:
2686		if (!nested_vmx_allowed(vcpu))
2687			return 1;
2688		data = to_vmx(vcpu)->nested.msr_ia32_feature_control;
2689		break;
2690	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2691		if (!nested_vmx_allowed(vcpu))
2692			return 1;
2693		return vmx_get_vmx_msr(vcpu, msr_index, pdata);
2694	case MSR_IA32_XSS:
2695		if (!vmx_xsaves_supported())
2696			return 1;
2697		data = vcpu->arch.ia32_xss;
2698		break;
2699	case MSR_TSC_AUX:
2700		if (!to_vmx(vcpu)->rdtscp_enabled)
2701			return 1;
2702		/* Otherwise falls through */
2703	default:
2704		msr = find_msr_entry(to_vmx(vcpu), msr_index);
2705		if (msr) {
2706			data = msr->data;
2707			break;
2708		}
2709		return kvm_get_msr_common(vcpu, msr_index, pdata);
2710	}
2711
2712	*pdata = data;
2713	return 0;
2714}
2715
2716static void vmx_leave_nested(struct kvm_vcpu *vcpu);
2717
2718/*
2719 * Writes msr value into the appropriate "register".
2720 * Returns 0 on success, non-0 otherwise.
2721 * Assumes vcpu_load() was already called.
2722 */
2723static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2724{
2725	struct vcpu_vmx *vmx = to_vmx(vcpu);
2726	struct shared_msr_entry *msr;
2727	int ret = 0;
2728	u32 msr_index = msr_info->index;
2729	u64 data = msr_info->data;
2730
2731	switch (msr_index) {
2732	case MSR_EFER:
2733		ret = kvm_set_msr_common(vcpu, msr_info);
2734		break;
2735#ifdef CONFIG_X86_64
2736	case MSR_FS_BASE:
2737		vmx_segment_cache_clear(vmx);
2738		vmcs_writel(GUEST_FS_BASE, data);
2739		break;
2740	case MSR_GS_BASE:
2741		vmx_segment_cache_clear(vmx);
2742		vmcs_writel(GUEST_GS_BASE, data);
2743		break;
2744	case MSR_KERNEL_GS_BASE:
2745		vmx_load_host_state(vmx);
2746		vmx->msr_guest_kernel_gs_base = data;
2747		break;
2748#endif
2749	case MSR_IA32_SYSENTER_CS:
2750		vmcs_write32(GUEST_SYSENTER_CS, data);
2751		break;
2752	case MSR_IA32_SYSENTER_EIP:
2753		vmcs_writel(GUEST_SYSENTER_EIP, data);
2754		break;
2755	case MSR_IA32_SYSENTER_ESP:
2756		vmcs_writel(GUEST_SYSENTER_ESP, data);
2757		break;
2758	case MSR_IA32_BNDCFGS:
2759		if (!vmx_mpx_supported())
2760			return 1;
2761		vmcs_write64(GUEST_BNDCFGS, data);
2762		break;
2763	case MSR_IA32_TSC:
2764		kvm_write_tsc(vcpu, msr_info);
2765		break;
2766	case MSR_IA32_CR_PAT:
2767		if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2768			if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
2769				return 1;
2770			vmcs_write64(GUEST_IA32_PAT, data);
2771			vcpu->arch.pat = data;
2772			break;
2773		}
2774		ret = kvm_set_msr_common(vcpu, msr_info);
2775		break;
2776	case MSR_IA32_TSC_ADJUST:
2777		ret = kvm_set_msr_common(vcpu, msr_info);
2778		break;
2779	case MSR_IA32_FEATURE_CONTROL:
2780		if (!nested_vmx_allowed(vcpu) ||
2781		    (to_vmx(vcpu)->nested.msr_ia32_feature_control &
2782		     FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
2783			return 1;
2784		vmx->nested.msr_ia32_feature_control = data;
2785		if (msr_info->host_initiated && data == 0)
2786			vmx_leave_nested(vcpu);
2787		break;
2788	case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
2789		return 1; /* they are read-only */
2790	case MSR_IA32_XSS:
2791		if (!vmx_xsaves_supported())
2792			return 1;
2793		/*
2794		 * The only supported bit as of Skylake is bit 8, but
2795		 * it is not yet supported in KVM.
2796		 */
2797		if (data != 0)
2798			return 1;
2799		vcpu->arch.ia32_xss = data;
2800		if (vcpu->arch.ia32_xss != host_xss)
2801			add_atomic_switch_msr(vmx, MSR_IA32_XSS,
2802				vcpu->arch.ia32_xss, host_xss);
2803		else
2804			clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
2805		break;
2806	case MSR_TSC_AUX:
2807		if (!vmx->rdtscp_enabled)
2808			return 1;
2809		/* Check reserved bit, higher 32 bits should be zero */
2810		if ((data >> 32) != 0)
2811			return 1;
2812		/* Otherwise falls through */
2813	default:
2814		msr = find_msr_entry(vmx, msr_index);
2815		if (msr) {
2816			u64 old_msr_data = msr->data;
2817			msr->data = data;
2818			if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
2819				preempt_disable();
2820				ret = kvm_set_shared_msr(msr->index, msr->data,
2821							 msr->mask);
2822				preempt_enable();
2823				if (ret)
2824					msr->data = old_msr_data;
2825			}
2826			break;
2827		}
2828		ret = kvm_set_msr_common(vcpu, msr_info);
2829	}
2830
2831	return ret;
2832}
2833
2834static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
2835{
2836	__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
2837	switch (reg) {
2838	case VCPU_REGS_RSP:
2839		vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
2840		break;
2841	case VCPU_REGS_RIP:
2842		vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
2843		break;
2844	case VCPU_EXREG_PDPTR:
2845		if (enable_ept)
2846			ept_save_pdptrs(vcpu);
2847		break;
2848	default:
2849		break;
2850	}
2851}
2852
2853static __init int cpu_has_kvm_support(void)
2854{
2855	return cpu_has_vmx();
2856}
2857
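/*
 * Returns 1 if the BIOS has locked IA32_FEATURE_CONTROL with VMXON disabled
 * for the way we were launched (inside or outside TXT), 0 if VMXON is usable.
 */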
2858static __init int vmx_disabled_by_bios(void)
2859{
2860	u64 msr;
2861
2862	rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
2863	if (msr & FEATURE_CONTROL_LOCKED) {
2864		/* launched w/ TXT and VMX disabled */
2865		if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
2866			&& tboot_enabled())
2867			return 1;
2868		/* launched w/o TXT and VMX only enabled w/ TXT */
2869		if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
2870			&& (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
2871			&& !tboot_enabled()) {
2872			printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
2873				"activate TXT before enabling KVM\n");
2874			return 1;
2875		}
2876		/* launched w/o TXT and VMX disabled */
2877		if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
2878			&& !tboot_enabled())
2879			return 1;
2880	}
2881
2882	return 0;
2883}
2884
2885static void kvm_cpu_vmxon(u64 addr)
2886{
2887	asm volatile (ASM_VMX_VMXON_RAX
2888			: : "a"(&addr), "m"(addr)
2889			: "memory", "cc");
2890}
2891
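/*
 * Per-cpu setup for VMX operation: fail if VMX is already in use, enable and
 * lock it in IA32_FEATURE_CONTROL if the BIOS left it unlocked, set
 * CR4.VMXE and, in vmm_exclusive mode, execute VMXON immediately.
 */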
2892static int hardware_enable(void)
2893{
2894	int cpu = raw_smp_processor_id();
2895	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
2896	u64 old, test_bits;
2897
2898	if (cr4_read_shadow() & X86_CR4_VMXE)
2899		return -EBUSY;
2900
2901	INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
2902
2903	/*
2904	 * Now we can enable the vmclear operation in kdump
2905	 * since the loaded_vmcss_on_cpu list on this cpu
2906	 * has been initialized.
2907	 *
2908	 * Though the cpu is not in VMX operation now, there
2909	 * is no problem in enabling the vmclear operation,
2910	 * since the loaded_vmcss_on_cpu list is empty!
2911	 */
2912	crash_enable_local_vmclear(cpu);
2913
2914	rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
2915
2916	test_bits = FEATURE_CONTROL_LOCKED;
2917	test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
2918	if (tboot_enabled())
2919		test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
2920
2921	if ((old & test_bits) != test_bits) {
2922		/* enable and lock */
2923		wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
2924	}
2925	cr4_set_bits(X86_CR4_VMXE);
2926
2927	if (vmm_exclusive) {
2928		kvm_cpu_vmxon(phys_addr);
2929		ept_sync_global();
2930	}
2931
2932	native_store_gdt(this_cpu_ptr(&host_gdt));
2933
2934	return 0;
2935}
2936
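/* VMCLEAR every VMCS that is currently loaded on this cpu. */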
2937static void vmclear_local_loaded_vmcss(void)
2938{
2939	int cpu = raw_smp_processor_id();
2940	struct loaded_vmcs *v, *n;
2941
2942	list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
2943				 loaded_vmcss_on_cpu_link)
2944		__loaded_vmcs_clear(v);
2945}
2946
2947
2948/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
2949 * tricks.
2950 */
2951static void kvm_cpu_vmxoff(void)
2952{
2953	asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
2954}
2955
2956static void hardware_disable(void)
2957{
2958	if (vmm_exclusive) {
2959		vmclear_local_loaded_vmcss();
2960		kvm_cpu_vmxoff();
2961	}
2962	cr4_clear_bits(X86_CR4_VMXE);
2963}
2964
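/*
 * Compute a VMX control value from the capability MSR @msr: optional bits in
 * @ctl_opt are kept only if the hardware allows them to be 1, and -EIO is
 * returned if any required bit in @ctl_min cannot be set.
 */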
2965static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
2966				      u32 msr, u32 *result)
2967{
2968	u32 vmx_msr_low, vmx_msr_high;
2969	u32 ctl = ctl_min | ctl_opt;
2970
2971	rdmsr(msr, vmx_msr_low, vmx_msr_high);
2972
2973	ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
2974	ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
2975
2976	/* Ensure minimum (required) set of control bits are supported. */
2977	if (ctl_min & ~ctl)
2978		return -EIO;
2979
2980	*result = ctl;
2981	return 0;
2982}
2983
2984static __init bool allow_1_setting(u32 msr, u32 ctl)
2985{
2986	u32 vmx_msr_low, vmx_msr_high;
2987
2988	rdmsr(msr, vmx_msr_low, vmx_msr_high);
2989	return vmx_msr_high & ctl;
2990}
2991
2992static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
2993{
2994	u32 vmx_msr_low, vmx_msr_high;
2995	u32 min, opt, min2, opt2;
2996	u32 _pin_based_exec_control = 0;
2997	u32 _cpu_based_exec_control = 0;
2998	u32 _cpu_based_2nd_exec_control = 0;
2999	u32 _vmexit_control = 0;
3000	u32 _vmentry_control = 0;
3001
3002	min = CPU_BASED_HLT_EXITING |
3003#ifdef CONFIG_X86_64
3004	      CPU_BASED_CR8_LOAD_EXITING |
3005	      CPU_BASED_CR8_STORE_EXITING |
3006#endif
3007	      CPU_BASED_CR3_LOAD_EXITING |
3008	      CPU_BASED_CR3_STORE_EXITING |
3009	      CPU_BASED_USE_IO_BITMAPS |
3010	      CPU_BASED_MOV_DR_EXITING |
3011	      CPU_BASED_USE_TSC_OFFSETING |
3012	      CPU_BASED_MWAIT_EXITING |
3013	      CPU_BASED_MONITOR_EXITING |
3014	      CPU_BASED_INVLPG_EXITING |
3015	      CPU_BASED_RDPMC_EXITING;
3016
3017	opt = CPU_BASED_TPR_SHADOW |
3018	      CPU_BASED_USE_MSR_BITMAPS |
3019	      CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
3020	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
3021				&_cpu_based_exec_control) < 0)
3022		return -EIO;
3023#ifdef CONFIG_X86_64
3024	if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
3025		_cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
3026					   ~CPU_BASED_CR8_STORE_EXITING;
3027#endif
3028	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
3029		min2 = 0;
3030		opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
3031			SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
3032			SECONDARY_EXEC_WBINVD_EXITING |
3033			SECONDARY_EXEC_ENABLE_VPID |
3034			SECONDARY_EXEC_ENABLE_EPT |
3035			SECONDARY_EXEC_UNRESTRICTED_GUEST |
3036			SECONDARY_EXEC_PAUSE_LOOP_EXITING |
3037			SECONDARY_EXEC_RDTSCP |
3038			SECONDARY_EXEC_ENABLE_INVPCID |
3039			SECONDARY_EXEC_APIC_REGISTER_VIRT |
3040			SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
3041			SECONDARY_EXEC_SHADOW_VMCS |
3042			SECONDARY_EXEC_XSAVES |
3043			SECONDARY_EXEC_ENABLE_PML;
3044		if (adjust_vmx_controls(min2, opt2,
3045					MSR_IA32_VMX_PROCBASED_CTLS2,
3046					&_cpu_based_2nd_exec_control) < 0)
3047			return -EIO;
3048	}
3049#ifndef CONFIG_X86_64
3050	if (!(_cpu_based_2nd_exec_control &
3051				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
3052		_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
3053#endif
3054
3055	if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
3056		_cpu_based_2nd_exec_control &= ~(
3057				SECONDARY_EXEC_APIC_REGISTER_VIRT |
3058				SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
3059				SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
3060
3061	if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
3062		/* CR3 accesses and invlpg don't need to cause VM exits when EPT
3063		   is enabled */
3064		_cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
3065					     CPU_BASED_CR3_STORE_EXITING |
3066					     CPU_BASED_INVLPG_EXITING);
3067		rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
3068		      vmx_capability.ept, vmx_capability.vpid);
3069	}
3070
3071	min = VM_EXIT_SAVE_DEBUG_CONTROLS;
3072#ifdef CONFIG_X86_64
3073	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
3074#endif
3075	opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
3076		VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS;
3077	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
3078				&_vmexit_control) < 0)
3079		return -EIO;
3080
3081	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
3082	opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
3083	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
3084				&_pin_based_exec_control) < 0)
3085		return -EIO;
3086
3087	if (!(_cpu_based_2nd_exec_control &
3088		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
3089		!(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
3090		_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
3091
3092	min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
3093	opt = VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
3094	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
3095				&_vmentry_control) < 0)
3096		return -EIO;
3097
3098	rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
3099
3100	/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
3101	if ((vmx_msr_high & 0x1fff) > PAGE_SIZE)
3102		return -EIO;
3103
3104#ifdef CONFIG_X86_64
3105	/* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
3106	if (vmx_msr_high & (1u<<16))
3107		return -EIO;
3108#endif
3109
3110	/* Require Write-Back (WB) memory type for VMCS accesses. */
3111	if (((vmx_msr_high >> 18) & 15) != 6)
3112		return -EIO;
3113
3114	vmcs_conf->size = vmx_msr_high & 0x1fff;
3115	vmcs_conf->order = get_order(vmcs_config.size);
3116	vmcs_conf->revision_id = vmx_msr_low;
3117
3118	vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
3119	vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
3120	vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
3121	vmcs_conf->vmexit_ctrl         = _vmexit_control;
3122	vmcs_conf->vmentry_ctrl        = _vmentry_control;
3123
3124	cpu_has_load_ia32_efer =
3125		allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
3126				VM_ENTRY_LOAD_IA32_EFER)
3127		&& allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
3128				   VM_EXIT_LOAD_IA32_EFER);
3129
3130	cpu_has_load_perf_global_ctrl =
3131		allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
3132				VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
3133		&& allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
3134				   VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
3135
3136	/*
3137	 * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL
3138	 * but due to errata below it can't be used. The workaround is to use
3139	 * the MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
3140	 *
3141	 * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
3142	 *
3143	 * AAK155             (model 26)
3144	 * AAP115             (model 30)
3145	 * AAT100             (model 37)
3146	 * BC86,AAY89,BD102   (model 44)
3147	 * BA97               (model 46)
3148	 *
3149	 */
3150	if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) {
3151		switch (boot_cpu_data.x86_model) {
3152		case 26:
3153		case 30:
3154		case 37:
3155		case 44:
3156		case 46:
3157			cpu_has_load_perf_global_ctrl = false;
3158			printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
3159					"does not work properly. Using workaround\n");
3160			break;
3161		default:
3162			break;
3163		}
3164	}
3165
3166	if (cpu_has_xsaves)
3167		rdmsrl(MSR_IA32_XSS, host_xss);
3168
3169	return 0;
3170}
3171
3172static struct vmcs *alloc_vmcs_cpu(int cpu)
3173{
3174	int node = cpu_to_node(cpu);
3175	struct page *pages;
3176	struct vmcs *vmcs;
3177
3178	pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order);
3179	if (!pages)
3180		return NULL;
3181	vmcs = page_address(pages);
3182	memset(vmcs, 0, vmcs_config.size);
3183	vmcs->revision_id = vmcs_config.revision_id; /* vmcs revision id */
3184	return vmcs;
3185}
3186
3187static struct vmcs *alloc_vmcs(void)
3188{
3189	return alloc_vmcs_cpu(raw_smp_processor_id());
3190}
3191
3192static void free_vmcs(struct vmcs *vmcs)
3193{
3194	free_pages((unsigned long)vmcs, vmcs_config.order);
3195}
3196
3197/*
3198 * Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
3199 */
3200static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
3201{
3202	if (!loaded_vmcs->vmcs)
3203		return;
3204	loaded_vmcs_clear(loaded_vmcs);
3205	free_vmcs(loaded_vmcs->vmcs);
3206	loaded_vmcs->vmcs = NULL;
3207}
3208
3209static void free_kvm_area(void)
3210{
3211	int cpu;
3212
3213	for_each_possible_cpu(cpu) {
3214		free_vmcs(per_cpu(vmxarea, cpu));
3215		per_cpu(vmxarea, cpu) = NULL;
3216	}
3217}
3218
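/*
 * Drop shadow-VMCS fields this setup cannot support (currently GUEST_BNDCFGS
 * without MPX) and clear the remaining fields in the vmread/vmwrite bitmaps
 * so that L1 can access them without a VM exit.
 */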
3219static void init_vmcs_shadow_fields(void)
3220{
3221	int i, j;
3222
3223	/* No checks for read only fields yet */
3224
3225	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
3226		switch (shadow_read_write_fields[i]) {
3227		case GUEST_BNDCFGS:
3228			if (!vmx_mpx_supported())
3229				continue;
3230			break;
3231		default:
3232			break;
3233		}
3234
3235		if (j < i)
3236			shadow_read_write_fields[j] =
3237				shadow_read_write_fields[i];
3238		j++;
3239	}
3240	max_shadow_read_write_fields = j;
3241
3242	/* shadowed fields that the guest can access without a vmexit */
3243	for (i = 0; i < max_shadow_read_write_fields; i++) {
3244		clear_bit(shadow_read_write_fields[i],
3245			  vmx_vmwrite_bitmap);
3246		clear_bit(shadow_read_write_fields[i],
3247			  vmx_vmread_bitmap);
3248	}
3249	for (i = 0; i < max_shadow_read_only_fields; i++)
3250		clear_bit(shadow_read_only_fields[i],
3251			  vmx_vmread_bitmap);
3252}
3253
3254static __init int alloc_kvm_area(void)
3255{
3256	int cpu;
3257
3258	for_each_possible_cpu(cpu) {
3259		struct vmcs *vmcs;
3260
3261		vmcs = alloc_vmcs_cpu(cpu);
3262		if (!vmcs) {
3263			free_kvm_area();
3264			return -ENOMEM;
3265		}
3266
3267		per_cpu(vmxarea, cpu) = vmcs;
3268	}
3269	return 0;
3270}
3271
3272static bool emulation_required(struct kvm_vcpu *vcpu)
3273{
3274	return emulate_invalid_guest_state && !guest_state_valid(vcpu);
3275}
3276
3277static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
3278		struct kvm_segment *save)
3279{
3280	if (!emulate_invalid_guest_state) {
3281		/*
3282		 * CS and SS RPL should be equal during guest entry according
3283		 * to VMX spec, but in reality it is not always so. Since vcpu
3284		 * is in the middle of the transition from real mode to
3285		 * protected mode it is safe to assume that RPL 0 is a good
3286		 * default value.
3287		 */
3288		if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
3289			save->selector &= ~SEGMENT_RPL_MASK;
3290		save->dpl = save->selector & SEGMENT_RPL_MASK;
3291		save->s = 1;
3292	}
3293	vmx_set_segment(vcpu, save, seg);
3294}
3295
3296static void enter_pmode(struct kvm_vcpu *vcpu)
3297{
3298	unsigned long flags;
3299	struct vcpu_vmx *vmx = to_vmx(vcpu);
3300
3301	/*
3302	 * Update real mode segment cache. It may not be up-to-date if a segment
3303	 * register was written while the vcpu was in guest mode.
3304	 */
3305	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3306	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3307	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3308	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3309	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3310	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3311
3312	vmx->rmode.vm86_active = 0;
3313
3314	vmx_segment_cache_clear(vmx);
3315
3316	vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3317
3318	flags = vmcs_readl(GUEST_RFLAGS);
3319	flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
3320	flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
3321	vmcs_writel(GUEST_RFLAGS, flags);
3322
3323	vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
3324			(vmcs_readl(CR4_READ_SHADOW) & X86_CR4_VME));
3325
3326	update_exception_bitmap(vcpu);
3327
3328	fix_pmode_seg(vcpu, VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3329	fix_pmode_seg(vcpu, VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3330	fix_pmode_seg(vcpu, VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3331	fix_pmode_seg(vcpu, VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3332	fix_pmode_seg(vcpu, VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3333	fix_pmode_seg(vcpu, VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3334}
3335
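/*
 * Load a segment for vm86-style real-mode emulation: force DPL 3 and, unless
 * invalid guest state is emulated instead, a 16-bit segment whose selector,
 * base and 64K limit follow real-mode rules (selector = base >> 4).
 */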
3336static void fix_rmode_seg(int seg, struct kvm_segment *save)
3337{
3338	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3339	struct kvm_segment var = *save;
3340
3341	var.dpl = 0x3;
3342	if (seg == VCPU_SREG_CS)
3343		var.type = 0x3;
3344
3345	if (!emulate_invalid_guest_state) {
3346		var.selector = var.base >> 4;
3347		var.base = var.base & 0xffff0;
3348		var.limit = 0xffff;
3349		var.g = 0;
3350		var.db = 0;
3351		var.present = 1;
3352		var.s = 1;
3353		var.l = 0;
3354		var.unusable = 0;
3355		var.type = 0x3;
3356		var.avl = 0;
3357		if (save->base & 0xf)
3358			printk_once(KERN_WARNING "kvm: segment base is not "
3359					"paragraph aligned when entering "
3360					"protected mode (seg=%d)", seg);
3361	}
3362
3363	vmcs_write16(sf->selector, var.selector);
3364	vmcs_write32(sf->base, var.base);
3365	vmcs_write32(sf->limit, var.limit);
3366	vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
3367}
3368
3369static void enter_rmode(struct kvm_vcpu *vcpu)
3370{
3371	unsigned long flags;
3372	struct vcpu_vmx *vmx = to_vmx(vcpu);
3373
3374	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
3375	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_ES], VCPU_SREG_ES);
3376	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_DS], VCPU_SREG_DS);
3377	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_FS], VCPU_SREG_FS);
3378	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_GS], VCPU_SREG_GS);
3379	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_SS], VCPU_SREG_SS);
3380	vmx_get_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_CS], VCPU_SREG_CS);
3381
3382	vmx->rmode.vm86_active = 1;
3383
3384	/*
3385	 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
3386	 * vcpu. Warn the user that an update is overdue.
3387	 */
3388	if (!vcpu->kvm->arch.tss_addr)
3389		printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR needs to be "
3390			     "called before entering vcpu\n");
3391
3392	vmx_segment_cache_clear(vmx);
3393
3394	vmcs_writel(GUEST_TR_BASE, vcpu->kvm->arch.tss_addr);
3395	vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
3396	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
3397
3398	flags = vmcs_readl(GUEST_RFLAGS);
3399	vmx->rmode.save_rflags = flags;
3400
3401	flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
3402
3403	vmcs_writel(GUEST_RFLAGS, flags);
3404	vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
3405	update_exception_bitmap(vcpu);
3406
3407	fix_rmode_seg(VCPU_SREG_SS, &vmx->rmode.segs[VCPU_SREG_SS]);
3408	fix_rmode_seg(VCPU_SREG_CS, &vmx->rmode.segs[VCPU_SREG_CS]);
3409	fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.segs[VCPU_SREG_ES]);
3410	fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]);
3411	fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]);
3412	fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]);
3413
3414	kvm_mmu_reset_context(vcpu);
3415}
3416
3417static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
3418{
3419	struct vcpu_vmx *vmx = to_vmx(vcpu);
3420	struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
3421
3422	if (!msr)
3423		return;
3424
3425	/*
3426	 * Force kernel_gs_base reloading before EFER changes, as control
3427	 * of this msr depends on is_long_mode().
3428	 */
3429	vmx_load_host_state(to_vmx(vcpu));
3430	vcpu->arch.efer = efer;
3431	if (efer & EFER_LMA) {
3432		vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
3433		msr->data = efer;
3434	} else {
3435		vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
3436
3437		msr->data = efer & ~EFER_LME;
3438	}
3439	setup_msrs(vmx);
3440}
3441
3442#ifdef CONFIG_X86_64
3443
3444static void enter_lmode(struct kvm_vcpu *vcpu)
3445{
3446	u32 guest_tr_ar;
3447
3448	vmx_segment_cache_clear(to_vmx(vcpu));
3449
3450	guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
3451	if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
3452		pr_debug_ratelimited("%s: tss fixup for long mode. \n",
3453				     __func__);
3454		vmcs_write32(GUEST_TR_AR_BYTES,
3455			     (guest_tr_ar & ~AR_TYPE_MASK)
3456			     | AR_TYPE_BUSY_64_TSS);
3457	}
3458	vmx_set_efer(vcpu, vcpu->arch.efer | EFER_LMA);
3459}
3460
3461static void exit_lmode(struct kvm_vcpu *vcpu)
3462{
3463	vm_entry_controls_clearbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
3464	vmx_set_efer(vcpu, vcpu->arch.efer & ~EFER_LMA);
3465}
3466
3467#endif
3468
3469static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
3470{
3471	vpid_sync_context(to_vmx(vcpu));
3472	if (enable_ept) {
3473		if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3474			return;
3475		ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
3476	}
3477}
3478
3479static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
3480{
3481	ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
3482
3483	vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
3484	vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
3485}
3486
3487static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
3488{
3489	if (enable_ept && is_paging(vcpu))
3490		vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3491	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
3492}
3493
3494static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
3495{
3496	ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
3497
3498	vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
3499	vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
3500}
3501
3502static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
3503{
3504	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3505
3506	if (!test_bit(VCPU_EXREG_PDPTR,
3507		      (unsigned long *)&vcpu->arch.regs_dirty))
3508		return;
3509
3510	if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
3511		vmcs_write64(GUEST_PDPTR0, mmu->pdptrs[0]);
3512		vmcs_write64(GUEST_PDPTR1, mmu->pdptrs[1]);
3513		vmcs_write64(GUEST_PDPTR2, mmu->pdptrs[2]);
3514		vmcs_write64(GUEST_PDPTR3, mmu->pdptrs[3]);
3515	}
3516}
3517
3518static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
3519{
3520	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3521
3522	if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
3523		mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
3524		mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
3525		mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
3526		mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
3527	}
3528
3529	__set_bit(VCPU_EXREG_PDPTR,
3530		  (unsigned long *)&vcpu->arch.regs_avail);
3531	__set_bit(VCPU_EXREG_PDPTR,
3532		  (unsigned long *)&vcpu->arch.regs_dirty);
3533}
3534
3535static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
3536
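/*
 * With EPT the guest may run with paging disabled.  When CR0.PG is toggled,
 * switch CR3 load/store exiting accordingly so KVM can keep track of the
 * guest's CR3, and drop CR0.WP from the real CR0 when the guest clears it.
 */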
3537static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
3538					unsigned long cr0,
3539					struct kvm_vcpu *vcpu)
3540{
3541	if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
3542		vmx_decache_cr3(vcpu);
3543	if (!(cr0 & X86_CR0_PG)) {
3544		/* From paging/starting to nonpaging */
3545		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
3546			     vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
3547			     (CPU_BASED_CR3_LOAD_EXITING |
3548			      CPU_BASED_CR3_STORE_EXITING));
3549		vcpu->arch.cr0 = cr0;
3550		vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
3551	} else if (!is_paging(vcpu)) {
3552		/* From nonpaging to paging */
3553		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
3554			     vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
3555			     ~(CPU_BASED_CR3_LOAD_EXITING |
3556			       CPU_BASED_CR3_STORE_EXITING));
3557		vcpu->arch.cr0 = cr0;
3558		vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
3559	}
3560
3561	if (!(cr0 & X86_CR0_WP))
3562		*hw_cr0 &= ~X86_CR0_WP;
3563}
3564
3565static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
3566{
3567	struct vcpu_vmx *vmx = to_vmx(vcpu);
3568	unsigned long hw_cr0;
3569
3570	hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK);
3571	if (enable_unrestricted_guest)
3572		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
3573	else {
3574		hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
3575
3576		if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
3577			enter_pmode(vcpu);
3578
3579		if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
3580			enter_rmode(vcpu);
3581	}
3582
3583#ifdef CONFIG_X86_64
3584	if (vcpu->arch.efer & EFER_LME) {
3585		if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
3586			enter_lmode(vcpu);
3587		if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
3588			exit_lmode(vcpu);
3589	}
3590#endif
3591
3592	if (enable_ept)
3593		ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
3594
3595	if (!vcpu->fpu_active)
3596		hw_cr0 |= X86_CR0_TS | X86_CR0_MP;
3597
3598	vmcs_writel(CR0_READ_SHADOW, cr0);
3599	vmcs_writel(GUEST_CR0, hw_cr0);
3600	vcpu->arch.cr0 = cr0;
3601
	/* depends on vcpu->arch.cr0 being set to the new value */
3603	vmx->emulation_required = emulation_required(vcpu);
3604}
3605
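/*
 * Build an EPT pointer: bits 2:0 hold the EPT paging-structure memory type,
 * bits 5:3 the page-walk length minus one, bit 6 enables accessed/dirty
 * flags, and bits 12 and above hold the physical address of the root
 * (PML4) table.
 */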
3606static u64 construct_eptp(unsigned long root_hpa)
3607{
3608	u64 eptp;
3609
	/* TODO: derive these values from the VMX capability MSR */
3611	eptp = VMX_EPT_DEFAULT_MT |
3612		VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT;
3613	if (enable_ept_ad_bits)
3614		eptp |= VMX_EPT_AD_ENABLE_BIT;
3615	eptp |= (root_hpa & PAGE_MASK);
3616
3617	return eptp;
3618}
3619
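/*
 * With EPT, the EPT pointer carries the real translation root, while
 * GUEST_CR3 holds the guest's own CR3 value; if the guest has paging
 * disabled, GUEST_CR3 is pointed at the identity-mapped page table instead.
 */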
3620static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
3621{
3622	unsigned long guest_cr3;
3623	u64 eptp;
3624
3625	guest_cr3 = cr3;
3626	if (enable_ept) {
3627		eptp = construct_eptp(cr3);
3628		vmcs_write64(EPT_POINTER, eptp);
3629		if (is_paging(vcpu) || is_guest_mode(vcpu))
3630			guest_cr3 = kvm_read_cr3(vcpu);
3631		else
3632			guest_cr3 = vcpu->kvm->arch.ept_identity_map_addr;
3633		ept_load_pdptrs(vcpu);
3634	}
3635
3636	vmx_flush_tlb(vcpu);
3637	vmcs_writel(GUEST_CR3, guest_cr3);
3638}
3639
3640static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
3641{
3642	/*
3643	 * Pass through host's Machine Check Enable value to hw_cr4, which
3644	 * is in force while we are in guest mode.  Do not let guests control
3645	 * this bit, even if host CR4.MCE == 0.
3646	 */
3647	unsigned long hw_cr4 =
3648		(cr4_read_shadow() & X86_CR4_MCE) |
3649		(cr4 & ~X86_CR4_MCE) |
3650		(to_vmx(vcpu)->rmode.vm86_active ?
3651		 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
3652
3653	if (cr4 & X86_CR4_VMXE) {
3654		/*
3655		 * To use VMXON (and later other VMX instructions), a guest
3656		 * must first be able to turn on cr4.VMXE (see handle_vmon()).
3657		 * So basically the check on whether to allow nested VMX
3658		 * is here.
3659		 */
3660		if (!nested_vmx_allowed(vcpu))
3661			return 1;
3662	}
3663	if (to_vmx(vcpu)->nested.vmxon &&
3664	    ((cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON))
3665		return 1;
3666
3667	vcpu->arch.cr4 = cr4;
3668	if (enable_ept) {
3669		if (!is_paging(vcpu)) {
3670			hw_cr4 &= ~X86_CR4_PAE;
3671			hw_cr4 |= X86_CR4_PSE;
3672		} else if (!(cr4 & X86_CR4_PAE)) {
3673			hw_cr4 &= ~X86_CR4_PAE;
3674		}
3675	}
3676
3677	if (!enable_unrestricted_guest && !is_paging(vcpu))
		/*
		 * SMEP/SMAP are disabled when the CPU is in non-paging mode
		 * in hardware.  However, without unrestricted guest support,
		 * KVM always runs the hardware with paging enabled.  To
		 * emulate this behavior, SMEP/SMAP must be manually disabled
		 * when the guest switches to non-paging mode.
		 */
3685		hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
3686
3687	vmcs_writel(CR4_READ_SHADOW, cr4);
3688	vmcs_writel(GUEST_CR4, hw_cr4);
3689	return 0;
3690}
3691
3692static void vmx_get_segment(struct kvm_vcpu *vcpu,
3693			    struct kvm_segment *var, int seg)
3694{
3695	struct vcpu_vmx *vmx = to_vmx(vcpu);
3696	u32 ar;
3697
3698	if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3699		*var = vmx->rmode.segs[seg];
3700		if (seg == VCPU_SREG_TR
3701		    || var->selector == vmx_read_guest_seg_selector(vmx, seg))
3702			return;
3703		var->base = vmx_read_guest_seg_base(vmx, seg);
3704		var->selector = vmx_read_guest_seg_selector(vmx, seg);
3705		return;
3706	}
3707	var->base = vmx_read_guest_seg_base(vmx, seg);
3708	var->limit = vmx_read_guest_seg_limit(vmx, seg);
3709	var->selector = vmx_read_guest_seg_selector(vmx, seg);
3710	ar = vmx_read_guest_seg_ar(vmx, seg);
3711	var->unusable = (ar >> 16) & 1;
3712	var->type = ar & 15;
3713	var->s = (ar >> 4) & 1;
3714	var->dpl = (ar >> 5) & 3;
	/*
	 * Some userspaces do not preserve the unusable property.  Since a
	 * usable segment has to be present according to the VMX spec, we
	 * can use the present property to work around this userspace bug
	 * by always reporting an unusable segment as non-present.
	 * vmx_segment_access_rights() already marks a non-present segment
	 * as unusable.
	 */
3722	var->present = !var->unusable;
3723	var->avl = (ar >> 12) & 1;
3724	var->l = (ar >> 13) & 1;
3725	var->db = (ar >> 14) & 1;
3726	var->g = (ar >> 15) & 1;
3727}
3728
3729static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
3730{
3731	struct kvm_segment s;
3732
3733	if (to_vmx(vcpu)->rmode.vm86_active) {
3734		vmx_get_segment(vcpu, &s, seg);
3735		return s.base;
3736	}
3737	return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
3738}
3739
3740static int vmx_get_cpl(struct kvm_vcpu *vcpu)
3741{
3742	struct vcpu_vmx *vmx = to_vmx(vcpu);
3743
3744	if (unlikely(vmx->rmode.vm86_active))
3745		return 0;
3746	else {
3747		int ar = vmx_read_guest_seg_ar(vmx, VCPU_SREG_SS);
3748		return AR_DPL(ar);
3749	}
3750}
3751
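/*
 * Pack a kvm_segment into the VMCS access-rights format: bits 3:0 type,
 * bit 4 S, bits 6:5 DPL, bit 7 present, bit 12 AVL, bit 13 L, bit 14 D/B,
 * bit 15 G and bit 16 the "unusable" flag.
 */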
3752static u32 vmx_segment_access_rights(struct kvm_segment *var)
3753{
3754	u32 ar;
3755
3756	if (var->unusable || !var->present)
3757		ar = 1 << 16;
3758	else {
3759		ar = var->type & 15;
3760		ar |= (var->s & 1) << 4;
3761		ar |= (var->dpl & 3) << 5;
3762		ar |= (var->present & 1) << 7;
3763		ar |= (var->avl & 1) << 12;
3764		ar |= (var->l & 1) << 13;
3765		ar |= (var->db & 1) << 14;
3766		ar |= (var->g & 1) << 15;
3767	}
3768
3769	return ar;
3770}
3771
3772static void vmx_set_segment(struct kvm_vcpu *vcpu,
3773			    struct kvm_segment *var, int seg)
3774{
3775	struct vcpu_vmx *vmx = to_vmx(vcpu);
3776	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3777
3778	vmx_segment_cache_clear(vmx);
3779
3780	if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3781		vmx->rmode.segs[seg] = *var;
3782		if (seg == VCPU_SREG_TR)
3783			vmcs_write16(sf->selector, var->selector);
3784		else if (var->s)
3785			fix_rmode_seg(seg, &vmx->rmode.segs[seg]);
3786		goto out;
3787	}
3788
3789	vmcs_writel(sf->base, var->base);
3790	vmcs_write32(sf->limit, var->limit);
3791	vmcs_write16(sf->selector, var->selector);
3792
	/*
	 * Fix the "Accessed" bit in the AR field of segment registers for
	 * older qemu binaries.
	 *
	 * The IA-32 architecture specifies that at the time of processor
	 * reset the "Accessed" bit in the AR field of segment registers is
	 * 1, but qemu sets it to 0 in its userland reset code.  This causes
	 * an invalid-guest-state vmexit when "unrestricted guest" mode is
	 * turned on.  A fix for this setup issue in cpu_reset is being
	 * pushed in the qemu tree; newer qemu binaries with that fix will
	 * not need this kvm hack.
	 */
3804	if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
3805		var->type |= 0x1; /* Accessed */
3806
3807	vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
3808
3809out:
3810	vmx->emulation_required = emulation_required(vcpu);
3811}
3812
3813static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3814{
3815	u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
3816
3817	*db = (ar >> 14) & 1;
3818	*l = (ar >> 13) & 1;
3819}
3820
3821static void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3822{
3823	dt->size = vmcs_read32(GUEST_IDTR_LIMIT);
3824	dt->address = vmcs_readl(GUEST_IDTR_BASE);
3825}
3826
3827static void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3828{
3829	vmcs_write32(GUEST_IDTR_LIMIT, dt->size);
3830	vmcs_writel(GUEST_IDTR_BASE, dt->address);
3831}
3832
3833static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3834{
3835	dt->size = vmcs_read32(GUEST_GDTR_LIMIT);
3836	dt->address = vmcs_readl(GUEST_GDTR_BASE);
3837}
3838
3839static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
3840{
3841	vmcs_write32(GUEST_GDTR_LIMIT, dt->size);
3842	vmcs_writel(GUEST_GDTR_BASE, dt->address);
3843}
3844
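/*
 * A segment is representable in virtual-8086 mode only if it looks like a
 * real-mode segment: base == selector << 4, limit == 0xffff and access
 * rights 0xf3 (present, DPL 3, read/write accessed data segment).
 */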
3845static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
3846{
3847	struct kvm_segment var;
3848	u32 ar;
3849
3850	vmx_get_segment(vcpu, &var, seg);
3851	var.dpl = 0x3;
3852	if (seg == VCPU_SREG_CS)
3853		var.type = 0x3;
3854	ar = vmx_segment_access_rights(&var);
3855
3856	if (var.base != (var.selector << 4))
3857		return false;
3858	if (var.limit != 0xffff)
3859		return false;
3860	if (ar != 0xf3)
3861		return false;
3862
3863	return true;
3864}
3865
3866static bool code_segment_valid(struct kvm_vcpu *vcpu)
3867{
3868	struct kvm_segment cs;
3869	unsigned int cs_rpl;
3870
3871	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3872	cs_rpl = cs.selector & SEGMENT_RPL_MASK;
3873
3874	if (cs.unusable)
3875		return false;
3876	if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
3877		return false;
3878	if (!cs.s)
3879		return false;
3880	if (cs.type & AR_TYPE_WRITEABLE_MASK) {
3881		if (cs.dpl > cs_rpl)
3882			return false;
3883	} else {
3884		if (cs.dpl != cs_rpl)
3885			return false;
3886	}
3887	if (!cs.present)
3888		return false;
3889
	/*
	 * TODO: Add a Reserved-field check; this will require a new member
	 * in the kvm_segment_field structure.
	 */
3891	return true;
3892}
3893
3894static bool stack_segment_valid(struct kvm_vcpu *vcpu)
3895{
3896	struct kvm_segment ss;
3897	unsigned int ss_rpl;
3898
3899	vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3900	ss_rpl = ss.selector & SEGMENT_RPL_MASK;
3901
3902	if (ss.unusable)
3903		return true;
3904	if (ss.type != 3 && ss.type != 7)
3905		return false;
3906	if (!ss.s)
3907		return false;
3908	if (ss.dpl != ss_rpl) /* DPL != RPL */
3909		return false;
3910	if (!ss.present)
3911		return false;
3912
3913	return true;
3914}
3915
3916static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
3917{
3918	struct kvm_segment var;
3919	unsigned int rpl;
3920
3921	vmx_get_segment(vcpu, &var, seg);
3922	rpl = var.selector & SEGMENT_RPL_MASK;
3923
3924	if (var.unusable)
3925		return true;
3926	if (!var.s)
3927		return false;
3928	if (!var.present)
3929		return false;
3930	if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
3931		if (var.dpl < rpl) /* DPL < RPL */
3932			return false;
3933	}
3934
	/*
	 * TODO: Add other members to kvm_segment_field to allow checking
	 * for other access-rights flags.
	 */
3938	return true;
3939}
3940
3941static bool tr_valid(struct kvm_vcpu *vcpu)
3942{
3943	struct kvm_segment tr;
3944
3945	vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
3946
3947	if (tr.unusable)
3948		return false;
3949	if (tr.selector & SEGMENT_TI_MASK)	/* TI = 1 */
3950		return false;
3951	if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
3952		return false;
3953	if (!tr.present)
3954		return false;
3955
3956	return true;
3957}
3958
3959static bool ldtr_valid(struct kvm_vcpu *vcpu)
3960{
3961	struct kvm_segment ldtr;
3962
3963	vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
3964
3965	if (ldtr.unusable)
3966		return true;
3967	if (ldtr.selector & SEGMENT_TI_MASK)	/* TI = 1 */
3968		return false;
3969	if (ldtr.type != 2)
3970		return false;
3971	if (!ldtr.present)
3972		return false;
3973
3974	return true;
3975}
3976
3977static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
3978{
3979	struct kvm_segment cs, ss;
3980
3981	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
3982	vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
3983
3984	return ((cs.selector & SEGMENT_RPL_MASK) ==
3985		 (ss.selector & SEGMENT_RPL_MASK));
3986}
3987
3988/*
 * Check if guest state is valid.  Returns true if valid, false if not.
 * We assume that registers are always usable.
3992 */
3993static bool guest_state_valid(struct kvm_vcpu *vcpu)
3994{
3995	if (enable_unrestricted_guest)
3996		return true;
3997
3998	/* real mode guest state checks */
3999	if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
4000		if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
4001			return false;
4002		if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
4003			return false;
4004		if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
4005			return false;
4006		if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
4007			return false;
4008		if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
4009			return false;
4010		if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
4011			return false;
4012	} else {
		/* protected mode guest state checks */
4014		if (!cs_ss_rpl_check(vcpu))
4015			return false;
4016		if (!code_segment_valid(vcpu))
4017			return false;
4018		if (!stack_segment_valid(vcpu))
4019			return false;
4020		if (!data_segment_valid(vcpu, VCPU_SREG_DS))
4021			return false;
4022		if (!data_segment_valid(vcpu, VCPU_SREG_ES))
4023			return false;
4024		if (!data_segment_valid(vcpu, VCPU_SREG_FS))
4025			return false;
4026		if (!data_segment_valid(vcpu, VCPU_SREG_GS))
4027			return false;
4028		if (!tr_valid(vcpu))
4029			return false;
4030		if (!ldtr_valid(vcpu))
4031			return false;
4032	}
4033	/* TODO:
4034	 * - Add checks on RIP
4035	 * - Add checks on RFLAGS
4036	 */
4037
4038	return true;
4039}
4040
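/*
 * Initialize the dummy TSS used for real-mode emulation: clear the three
 * TSS pages, place the I/O-permission bitmap base just past the interrupt
 * redirection map, and set the bitmap's final byte to 0xff, which
 * terminates the I/O permission bitmap.
 */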
4041static int init_rmode_tss(struct kvm *kvm)
4042{
4043	gfn_t fn;
4044	u16 data = 0;
4045	int idx, r;
4046
4047	idx = srcu_read_lock(&kvm->srcu);
4048	fn = kvm->arch.tss_addr >> PAGE_SHIFT;
4049	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
4050	if (r < 0)
4051		goto out;
4052	data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
4053	r = kvm_write_guest_page(kvm, fn++, &data,
4054			TSS_IOPB_BASE_OFFSET, sizeof(u16));
4055	if (r < 0)
4056		goto out;
4057	r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
4058	if (r < 0)
4059		goto out;
4060	r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
4061	if (r < 0)
4062		goto out;
4063	data = ~0;
4064	r = kvm_write_guest_page(kvm, fn, &data,
4065				 RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1,
4066				 sizeof(u8));
4067out:
4068	srcu_read_unlock(&kvm->srcu, idx);
4069	return r;
4070}
4071
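/*
 * Build a one-page identity-mapped page table (1024 4MB PSE entries) at
 * ept_identity_map_addr; vmx_set_cr3() points GUEST_CR3 at it while an
 * EPT guest runs with paging disabled.
 */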
4072static int init_rmode_identity_map(struct kvm *kvm)
4073{
4074	int i, idx, r = 0;
4075	pfn_t identity_map_pfn;
4076	u32 tmp;
4077
4078	if (!enable_ept)
4079		return 0;
4080
4081	/* Protect kvm->arch.ept_identity_pagetable_done. */
4082	mutex_lock(&kvm->slots_lock);
4083
4084	if (likely(kvm->arch.ept_identity_pagetable_done))
4085		goto out2;
4086
4087	identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
4088
4089	r = alloc_identity_pagetable(kvm);
4090	if (r < 0)
4091		goto out2;
4092
4093	idx = srcu_read_lock(&kvm->srcu);
4094	r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
4095	if (r < 0)
4096		goto out;
4097	/* Set up identity-mapping pagetable for EPT in real mode */
4098	for (i = 0; i < PT32_ENT_PER_PAGE; i++) {
4099		tmp = (i << 22) + (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
4100			_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
4101		r = kvm_write_guest_page(kvm, identity_map_pfn,
4102				&tmp, i * sizeof(tmp), sizeof(tmp));
4103		if (r < 0)
4104			goto out;
4105	}
4106	kvm->arch.ept_identity_pagetable_done = true;
4107
4108out:
4109	srcu_read_unlock(&kvm->srcu, idx);
4110
4111out2:
4112	mutex_unlock(&kvm->slots_lock);
4113	return r;
4114}
4115
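/*
 * Reset a segment register to a real-mode style state: selector and base
 * 0, a 64KB limit, and access rights 0x93 (present, accessed, writable
 * data segment), or 0x9b for CS (execute/read code segment).
 */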
4116static void seg_setup(int seg)
4117{
4118	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
4119	unsigned int ar;
4120
4121	vmcs_write16(sf->selector, 0);
4122	vmcs_writel(sf->base, 0);
4123	vmcs_write32(sf->limit, 0xffff);
4124	ar = 0x93;
4125	if (seg == VCPU_SREG_CS)
4126		ar |= 0x08; /* code segment */
4127
4128	vmcs_write32(sf->ar_bytes, ar);
4129}
4130
4131static int alloc_apic_access_page(struct kvm *kvm)
4132{
4133	struct page *page;
4134	struct kvm_userspace_memory_region kvm_userspace_mem;
4135	int r = 0;
4136
4137	mutex_lock(&kvm->slots_lock);
4138	if (kvm->arch.apic_access_page_done)
4139		goto out;
4140	kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
4141	kvm_userspace_mem.flags = 0;
4142	kvm_userspace_mem.guest_phys_addr = APIC_DEFAULT_PHYS_BASE;
4143	kvm_userspace_mem.memory_size = PAGE_SIZE;
4144	r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
4145	if (r)
4146		goto out;
4147
4148	page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
4149	if (is_error_page(page)) {
4150		r = -EFAULT;
4151		goto out;
4152	}
4153
4154	/*
4155	 * Do not pin the page in memory, so that memory hot-unplug
4156	 * is able to migrate it.
4157	 */
4158	put_page(page);
4159	kvm->arch.apic_access_page_done = true;
4160out:
4161	mutex_unlock(&kvm->slots_lock);
4162	return r;
4163}
4164
4165static int alloc_identity_pagetable(struct kvm *kvm)
4166{
4167	/* Called with kvm->slots_lock held. */
4168
4169	struct kvm_userspace_memory_region kvm_userspace_mem;
4170	int r = 0;
4171
4172	BUG_ON(kvm->arch.ept_identity_pagetable_done);
4173
4174	kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
4175	kvm_userspace_mem.flags = 0;
4176	kvm_userspace_mem.guest_phys_addr =
4177		kvm->arch.ept_identity_map_addr;
4178	kvm_userspace_mem.memory_size = PAGE_SIZE;
4179	r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
4180
4181	return r;
4182}
4183
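/*
 * VPIDs are handed out from a global bitmap.  VPID 0 is reserved for the
 * host; a vcpu that cannot get a VPID keeps vpid == 0 and runs with the
 * VPID execution control disabled (see vmx_secondary_exec_control()).
 */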
4184static void allocate_vpid(struct vcpu_vmx *vmx)
4185{
4186	int vpid;
4187
4188	vmx->vpid = 0;
4189	if (!enable_vpid)
4190		return;
4191	spin_lock(&vmx_vpid_lock);
4192	vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
4193	if (vpid < VMX_NR_VPIDS) {
4194		vmx->vpid = vpid;
4195		__set_bit(vpid, vmx_vpid_bitmap);
4196	}
4197	spin_unlock(&vmx_vpid_lock);
4198}
4199
4200static void free_vpid(struct vcpu_vmx *vmx)
4201{
4202	if (!enable_vpid)
4203		return;
4204	spin_lock(&vmx_vpid_lock);
4205	if (vmx->vpid != 0)
4206		__clear_bit(vmx->vpid, vmx_vpid_bitmap);
4207	spin_unlock(&vmx_vpid_lock);
4208}
4209
4210#define MSR_TYPE_R	1
4211#define MSR_TYPE_W	2
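/*
 * The 4KB MSR bitmap consists of four 1KB regions: the read bitmap for
 * MSRs 0x00000000-0x00001fff at offset 0x000, the read bitmap for MSRs
 * 0xc0000000-0xc0001fff at 0x400, and the corresponding write bitmaps at
 * 0x800 and 0xc00.  A clear bit means the access does not cause a VM exit.
 */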
4212static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
4213						u32 msr, int type)
4214{
4215	int f = sizeof(unsigned long);
4216
4217	if (!cpu_has_vmx_msr_bitmap())
4218		return;
4219
4220	/*
4221	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
4222	 * have the write-low and read-high bitmap offsets the wrong way round.
4223	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
4224	 */
4225	if (msr <= 0x1fff) {
4226		if (type & MSR_TYPE_R)
4227			/* read-low */
4228			__clear_bit(msr, msr_bitmap + 0x000 / f);
4229
4230		if (type & MSR_TYPE_W)
4231			/* write-low */
4232			__clear_bit(msr, msr_bitmap + 0x800 / f);
4233
4234	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
4235		msr &= 0x1fff;
4236		if (type & MSR_TYPE_R)
4237			/* read-high */
4238			__clear_bit(msr, msr_bitmap + 0x400 / f);
4239
4240		if (type & MSR_TYPE_W)
4241			/* write-high */
4242			__clear_bit(msr, msr_bitmap + 0xc00 / f);
4243
4244	}
4245}
4246
4247static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
4248						u32 msr, int type)
4249{
4250	int f = sizeof(unsigned long);
4251
4252	if (!cpu_has_vmx_msr_bitmap())
4253		return;
4254
4255	/*
4256	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
4257	 * have the write-low and read-high bitmap offsets the wrong way round.
4258	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
4259	 */
4260	if (msr <= 0x1fff) {
4261		if (type & MSR_TYPE_R)
4262			/* read-low */
4263			__set_bit(msr, msr_bitmap + 0x000 / f);
4264
4265		if (type & MSR_TYPE_W)
4266			/* write-low */
4267			__set_bit(msr, msr_bitmap + 0x800 / f);
4268
4269	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
4270		msr &= 0x1fff;
4271		if (type & MSR_TYPE_R)
4272			/* read-high */
4273			__set_bit(msr, msr_bitmap + 0x400 / f);
4274
4275		if (type & MSR_TYPE_W)
4276			/* write-high */
4277			__set_bit(msr, msr_bitmap + 0xc00 / f);
4278
4279	}
4280}
4281
4282/*
 * If an MSR is allowed by L0, we should check whether it is allowed by L1.
 * The corresponding bit is cleared (i.e., the MSR is passed through) only
 * if both L0 and L1 allow it.
4285 */
4286static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
4287					       unsigned long *msr_bitmap_nested,
4288					       u32 msr, int type)
4289{
4290	int f = sizeof(unsigned long);
4291
4292	if (!cpu_has_vmx_msr_bitmap()) {
4293		WARN_ON(1);
4294		return;
4295	}
4296
4297	/*
4298	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
4299	 * have the write-low and read-high bitmap offsets the wrong way round.
4300	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
4301	 */
4302	if (msr <= 0x1fff) {
4303		if (type & MSR_TYPE_R &&
4304		   !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
4305			/* read-low */
4306			__clear_bit(msr, msr_bitmap_nested + 0x000 / f);
4307
4308		if (type & MSR_TYPE_W &&
4309		   !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
4310			/* write-low */
4311			__clear_bit(msr, msr_bitmap_nested + 0x800 / f);
4312
4313	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
4314		msr &= 0x1fff;
4315		if (type & MSR_TYPE_R &&
4316		   !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
4317			/* read-high */
4318			__clear_bit(msr, msr_bitmap_nested + 0x400 / f);
4319
4320		if (type & MSR_TYPE_W &&
4321		   !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
4322			/* write-high */
4323			__clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
4324
4325	}
4326}
4327
4328static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
4329{
4330	if (!longmode_only)
4331		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
4332						msr, MSR_TYPE_R | MSR_TYPE_W);
4333	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
4334						msr, MSR_TYPE_R | MSR_TYPE_W);
4335}
4336
4337static void vmx_enable_intercept_msr_read_x2apic(u32 msr)
4338{
4339	__vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
4340			msr, MSR_TYPE_R);
4341	__vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
4342			msr, MSR_TYPE_R);
4343}
4344
4345static void vmx_disable_intercept_msr_read_x2apic(u32 msr)
4346{
4347	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
4348			msr, MSR_TYPE_R);
4349	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
4350			msr, MSR_TYPE_R);
4351}
4352
4353static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
4354{
4355	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
4356			msr, MSR_TYPE_W);
4357	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
4358			msr, MSR_TYPE_W);
4359}
4360
4361static int vmx_vm_has_apicv(struct kvm *kvm)
4362{
4363	return enable_apicv && irqchip_in_kernel(kvm);
4364}
4365
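/*
 * Move any interrupts that L1 posted for L2 from the posted-interrupt
 * descriptor's PIR into the virtual-APIC page's IRR, and raise the guest
 * interrupt status (RVI) if a higher vector became pending.
 */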
4366static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
4367{
4368	struct vcpu_vmx *vmx = to_vmx(vcpu);
4369	int max_irr;
4370	void *vapic_page;
4371	u16 status;
4372
4373	if (vmx->nested.pi_desc &&
4374	    vmx->nested.pi_pending) {
4375		vmx->nested.pi_pending = false;
4376		if (!pi_test_and_clear_on(vmx->nested.pi_desc))
4377			return 0;
4378
4379		max_irr = find_last_bit(
4380			(unsigned long *)vmx->nested.pi_desc->pir, 256);
4381
4382		if (max_irr == 256)
4383			return 0;
4384
4385		vapic_page = kmap(vmx->nested.virtual_apic_page);
4386		if (!vapic_page) {
4387			WARN_ON(1);
4388			return -ENOMEM;
4389		}
4390		__kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
4391		kunmap(vmx->nested.virtual_apic_page);
4392
4393		status = vmcs_read16(GUEST_INTR_STATUS);
4394		if ((u8)max_irr > ((u8)status & 0xff)) {
4395			status &= ~0xff;
4396			status |= (u8)max_irr;
4397			vmcs_write16(GUEST_INTR_STATUS, status);
4398		}
4399	}
4400	return 0;
4401}
4402
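/*
 * If the target vcpu is currently executing in non-root mode, send the
 * posted-interrupt notification vector to its physical CPU; the hardware
 * then syncs the PIR into the vIRR without causing a VM exit.
 */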
4403static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
4404{
4405#ifdef CONFIG_SMP
4406	if (vcpu->mode == IN_GUEST_MODE) {
4407		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
4408				POSTED_INTR_VECTOR);
4409		return true;
4410	}
4411#endif
4412	return false;
4413}
4414
4415static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
4416						int vector)
4417{
4418	struct vcpu_vmx *vmx = to_vmx(vcpu);
4419
4420	if (is_guest_mode(vcpu) &&
4421	    vector == vmx->nested.posted_intr_nv) {
4422		/* the PIR and ON have been set by L1. */
4423		kvm_vcpu_trigger_posted_interrupt(vcpu);
4424		/*
		 * If the posted interrupt is not recognized by hardware,
		 * it will be delivered on the next vmentry.
4427		 */
4428		vmx->nested.pi_pending = true;
4429		kvm_make_request(KVM_REQ_EVENT, vcpu);
4430		return 0;
4431	}
4432	return -1;
4433}
4434/*
 * Send an interrupt to a vcpu via the posted-interrupt mechanism.
 * 1. If the target vcpu is running (non-root mode), send a posted-interrupt
 *    notification and the hardware will sync the PIR to the vIRR atomically.
 * 2. If the target vcpu isn't running (root mode), kick it so that it picks
 *    up the interrupt from the PIR on its next vmentry.
4440 */
4441static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
4442{
4443	struct vcpu_vmx *vmx = to_vmx(vcpu);
4444	int r;
4445
4446	r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
4447	if (!r)
4448		return;
4449
4450	if (pi_test_and_set_pir(vector, &vmx->pi_desc))
4451		return;
4452
4453	r = pi_test_and_set_on(&vmx->pi_desc);
4454	kvm_make_request(KVM_REQ_EVENT, vcpu);
4455	if (r || !kvm_vcpu_trigger_posted_interrupt(vcpu))
4456		kvm_vcpu_kick(vcpu);
4457}
4458
4459static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
4460{
4461	struct vcpu_vmx *vmx = to_vmx(vcpu);
4462
4463	if (!pi_test_and_clear_on(&vmx->pi_desc))
4464		return;
4465
4466	kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
4467}
4468
4469static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu)
4470{
4471	return;
4472}
4473
4474/*
4475 * Set up the vmcs's constant host-state fields, i.e., host-state fields that
4476 * will not change in the lifetime of the guest.
4477 * Note that host-state that does change is set elsewhere. E.g., host-state
4478 * that is set differently for each CPU is set in vmx_vcpu_load(), not here.
4479 */
4480static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
4481{
4482	u32 low32, high32;
4483	unsigned long tmpl;
4484	struct desc_ptr dt;
4485	unsigned long cr4;
4486
4487	vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS);  /* 22.2.3 */
4488	vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
4489
4490	/* Save the most likely value for this task's CR4 in the VMCS. */
4491	cr4 = cr4_read_shadow();
4492	vmcs_writel(HOST_CR4, cr4);			/* 22.2.3, 22.2.5 */
4493	vmx->host_state.vmcs_host_cr4 = cr4;
4494
4495	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
4496#ifdef CONFIG_X86_64
4497	/*
4498	 * Load null selectors, so we can avoid reloading them in
4499	 * __vmx_load_host_state(), in case userspace uses the null selectors
4500	 * too (the expected case).
4501	 */
4502	vmcs_write16(HOST_DS_SELECTOR, 0);
4503	vmcs_write16(HOST_ES_SELECTOR, 0);
4504#else
4505	vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
4506	vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
4507#endif
4508	vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
4509	vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
4510
4511	native_store_idt(&dt);
4512	vmcs_writel(HOST_IDTR_BASE, dt.address);   /* 22.2.4 */
4513	vmx->host_idt_base = dt.address;
4514
4515	vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */
4516
4517	rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
4518	vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
4519	rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
4520	vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl);   /* 22.2.3 */
4521
4522	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
4523		rdmsr(MSR_IA32_CR_PAT, low32, high32);
4524		vmcs_write64(HOST_IA32_PAT, low32 | ((u64) high32 << 32));
4525	}
4526}
4527
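/*
 * CR4_GUEST_HOST_MASK selects which CR4 bits are owned by the host: guest
 * reads of a masked bit return the read-shadow value and guest writes that
 * would change it cause a VM exit, so the complement of the guest-owned
 * bits is written here.
 */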
4528static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
4529{
4530	vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
4531	if (enable_ept)
4532		vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
4533	if (is_guest_mode(&vmx->vcpu))
4534		vmx->vcpu.arch.cr4_guest_owned_bits &=
4535			~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
4536	vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
4537}
4538
4539static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
4540{
4541	u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
4542
4543	if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
4544		pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
4545	return pin_based_exec_ctrl;
4546}
4547
4548static u32 vmx_exec_control(struct vcpu_vmx *vmx)
4549{
4550	u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
4551
4552	if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
4553		exec_control &= ~CPU_BASED_MOV_DR_EXITING;
4554
4555	if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
4556		exec_control &= ~CPU_BASED_TPR_SHADOW;
4557#ifdef CONFIG_X86_64
4558		exec_control |= CPU_BASED_CR8_STORE_EXITING |
4559				CPU_BASED_CR8_LOAD_EXITING;
4560#endif
4561	}
4562	if (!enable_ept)
4563		exec_control |= CPU_BASED_CR3_STORE_EXITING |
4564				CPU_BASED_CR3_LOAD_EXITING  |
4565				CPU_BASED_INVLPG_EXITING;
4566	return exec_control;
4567}
4568
4569static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
4570{
4571	u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
4572	if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
4573		exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
4574	if (vmx->vpid == 0)
4575		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
4576	if (!enable_ept) {
4577		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
4578		enable_unrestricted_guest = 0;
		/*
		 * Enabling INVPCID for non-EPT guests may cause a
		 * performance regression.
		 */
4580		exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
4581	}
4582	if (!enable_unrestricted_guest)
4583		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
4584	if (!ple_gap)
4585		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
4586	if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
4587		exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
4588				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4589	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
	/*
	 * SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD
	 * (handle_vmptrld).  We cannot enable shadow_vmcs here because we
	 * do not yet have a current VMCS12.
	 */
4595	exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
	/* PML is enabled/disabled when creating/destroying the vcpu */
4597	exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
4598
4599	return exec_control;
4600}
4601
4602static void ept_set_mmio_spte_mask(void)
4603{
4604	/*
4605	 * EPT Misconfigurations can be generated if the value of bits 2:0
4606	 * of an EPT paging-structure entry is 110b (write/execute).
	 * Also, the magic bits (0x3ull << 62) are set so that an MMIO spte
	 * can be identified quickly.
4609	 */
4610	kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);
4611}
4612
4613#define VMX_XSS_EXIT_BITMAP 0
4614/*
4615 * Sets up the vmcs for emulated real mode.
4616 */
4617static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
4618{
4619#ifdef CONFIG_X86_64
4620	unsigned long a;
4621#endif
4622	int i;
4623
4624	/* I/O */
4625	vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
4626	vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
4627
4628	if (enable_shadow_vmcs) {
4629		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
4630		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
4631	}
4632	if (cpu_has_vmx_msr_bitmap())
4633		vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
4634
4635	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
4636
4637	/* Control */
4638	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
4639
4640	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
4641
4642	if (cpu_has_secondary_exec_ctrls()) {
4643		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
4644				vmx_secondary_exec_control(vmx));
4645	}
4646
4647	if (vmx_vm_has_apicv(vmx->vcpu.kvm)) {
4648		vmcs_write64(EOI_EXIT_BITMAP0, 0);
4649		vmcs_write64(EOI_EXIT_BITMAP1, 0);
4650		vmcs_write64(EOI_EXIT_BITMAP2, 0);
4651		vmcs_write64(EOI_EXIT_BITMAP3, 0);
4652
4653		vmcs_write16(GUEST_INTR_STATUS, 0);
4654
4655		vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
4656		vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
4657	}
4658
4659	if (ple_gap) {
4660		vmcs_write32(PLE_GAP, ple_gap);
4661		vmx->ple_window = ple_window;
4662		vmx->ple_window_dirty = true;
4663	}
4664
4665	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
4666	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
4667	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
4668
4669	vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
4670	vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
4671	vmx_set_constant_host_state(vmx);
4672#ifdef CONFIG_X86_64
4673	rdmsrl(MSR_FS_BASE, a);
4674	vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
4675	rdmsrl(MSR_GS_BASE, a);
4676	vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
4677#else
4678	vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
4679	vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
4680#endif
4681
4682	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
4683	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
4684	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host));
4685	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
4686	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
4687
4688	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
4689		u32 msr_low, msr_high;
4690		u64 host_pat;
4691		rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
4692		host_pat = msr_low | ((u64) msr_high << 32);
		/* Write the default value, following the host PAT */
4694		vmcs_write64(GUEST_IA32_PAT, host_pat);
		/* Keep arch.pat in sync with GUEST_IA32_PAT */
4696		vmx->vcpu.arch.pat = host_pat;
4697	}
4698
4699	for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
4700		u32 index = vmx_msr_index[i];
4701		u32 data_low, data_high;
4702		int j = vmx->nmsrs;
4703
4704		if (rdmsr_safe(index, &data_low, &data_high) < 0)
4705			continue;
4706		if (wrmsr_safe(index, data_low, data_high) < 0)
4707			continue;
4708		vmx->guest_msrs[j].index = i;
4709		vmx->guest_msrs[j].data = 0;
4710		vmx->guest_msrs[j].mask = -1ull;
4711		++vmx->nmsrs;
4712	}
4713
4715	vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
4716
4717	/* 22.2.1, 20.8.1 */
4718	vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl);
4719
4720	vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
4721	set_cr4_guest_host_mask(vmx);
4722
4723	if (vmx_xsaves_supported())
4724		vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
4725
4726	return 0;
4727}
4728
4729static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4730{
4731	struct vcpu_vmx *vmx = to_vmx(vcpu);
4732	struct msr_data apic_base_msr;
4733
4734	vmx->rmode.vm86_active = 0;
4735
4736	vmx->soft_vnmi_blocked = 0;
4737
4738	vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
4739	kvm_set_cr8(&vmx->vcpu, 0);
4740	apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
4741	if (kvm_vcpu_is_reset_bsp(&vmx->vcpu))
4742		apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
4743	apic_base_msr.host_initiated = true;
4744	kvm_set_apic_base(&vmx->vcpu, &apic_base_msr);
4745
4746	vmx_segment_cache_clear(vmx);
4747
4748	seg_setup(VCPU_SREG_CS);
4749	vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
4750	vmcs_write32(GUEST_CS_BASE, 0xffff0000);
4751
4752	seg_setup(VCPU_SREG_DS);
4753	seg_setup(VCPU_SREG_ES);
4754	seg_setup(VCPU_SREG_FS);
4755	seg_setup(VCPU_SREG_GS);
4756	seg_setup(VCPU_SREG_SS);
4757
4758	vmcs_write16(GUEST_TR_SELECTOR, 0);
4759	vmcs_writel(GUEST_TR_BASE, 0);
4760	vmcs_write32(GUEST_TR_LIMIT, 0xffff);
4761	vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
4762
4763	vmcs_write16(GUEST_LDTR_SELECTOR, 0);
4764	vmcs_writel(GUEST_LDTR_BASE, 0);
4765	vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
4766	vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
4767
4768	vmcs_write32(GUEST_SYSENTER_CS, 0);
4769	vmcs_writel(GUEST_SYSENTER_ESP, 0);
4770	vmcs_writel(GUEST_SYSENTER_EIP, 0);
4771
4772	vmcs_writel(GUEST_RFLAGS, 0x02);
4773	kvm_rip_write(vcpu, 0xfff0);
4774
4775	vmcs_writel(GUEST_GDTR_BASE, 0);
4776	vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
4777
4778	vmcs_writel(GUEST_IDTR_BASE, 0);
4779	vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
4780
4781	vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
4782	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
4783	vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
4784
4785	/* Special registers */
4786	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4787
4788	setup_msrs(vmx);
4789
4790	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
4791
4792	if (cpu_has_vmx_tpr_shadow()) {
4793		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
4794		if (vm_need_tpr_shadow(vmx->vcpu.kvm))
4795			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
4796				     __pa(vmx->vcpu.arch.apic->regs));
4797		vmcs_write32(TPR_THRESHOLD, 0);
4798	}
4799
4800	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
4801
4802	if (vmx_vm_has_apicv(vcpu->kvm))
4803		memset(&vmx->pi_desc, 0, sizeof(struct pi_desc));
4804
4805	if (vmx->vpid != 0)
4806		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
4807
4808	vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
4809	vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
4810	vmx_set_cr4(&vmx->vcpu, 0);
4811	vmx_set_efer(&vmx->vcpu, 0);
4812	vmx_fpu_activate(&vmx->vcpu);
4813	update_exception_bitmap(&vmx->vcpu);
4814
4815	vpid_sync_context(vmx);
4816}
4817
4818/*
4819 * In nested virtualization, check if L1 asked to exit on external interrupts.
4820 * For most existing hypervisors, this will always return true.
4821 */
4822static bool nested_exit_on_intr(struct kvm_vcpu *vcpu)
4823{
4824	return get_vmcs12(vcpu)->pin_based_vm_exec_control &
4825		PIN_BASED_EXT_INTR_MASK;
4826}
4827
4828/*
 * In nested virtualization, check if L1 has set VM_EXIT_ACK_INTR_ON_EXIT.
4831 */
4832static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
4833{
4834	return get_vmcs12(vcpu)->vm_exit_controls &
4835		VM_EXIT_ACK_INTR_ON_EXIT;
4836}
4837
4838static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
4839{
4840	return get_vmcs12(vcpu)->pin_based_vm_exec_control &
4841		PIN_BASED_NMI_EXITING;
4842}
4843
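/*
 * Request an "interrupt window" exit: with CPU_BASED_VIRTUAL_INTR_PENDING
 * set, the CPU exits as soon as the guest becomes able to accept an
 * external interrupt (RFLAGS.IF set and no blocking by STI or MOV SS).
 */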
4844static void enable_irq_window(struct kvm_vcpu *vcpu)
4845{
4846	u32 cpu_based_vm_exec_control;
4847
4848	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
4849	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
4850	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
4851}
4852
4853static void enable_nmi_window(struct kvm_vcpu *vcpu)
4854{
4855	u32 cpu_based_vm_exec_control;
4856
4857	if (!cpu_has_virtual_nmis() ||
4858	    vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
4859		enable_irq_window(vcpu);
4860		return;
4861	}
4862
4863	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
4864	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
4865	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
4866}
4867
4868static void vmx_inject_irq(struct kvm_vcpu *vcpu)
4869{
4870	struct vcpu_vmx *vmx = to_vmx(vcpu);
4871	uint32_t intr;
4872	int irq = vcpu->arch.interrupt.nr;
4873
4874	trace_kvm_inj_virq(irq);
4875
4876	++vcpu->stat.irq_injections;
4877	if (vmx->rmode.vm86_active) {
4878		int inc_eip = 0;
4879		if (vcpu->arch.interrupt.soft)
4880			inc_eip = vcpu->arch.event_exit_inst_len;
4881		if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
4882			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4883		return;
4884	}
4885	intr = irq | INTR_INFO_VALID_MASK;
4886	if (vcpu->arch.interrupt.soft) {
4887		intr |= INTR_TYPE_SOFT_INTR;
4888		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
4889			     vmx->vcpu.arch.event_exit_inst_len);
4890	} else
4891		intr |= INTR_TYPE_EXT_INTR;
4892	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
4893}
4894
4895static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
4896{
4897	struct vcpu_vmx *vmx = to_vmx(vcpu);
4898
4899	if (is_guest_mode(vcpu))
4900		return;
4901
4902	if (!cpu_has_virtual_nmis()) {
4903		/*
4904		 * Tracking the NMI-blocked state in software is built upon
4905		 * finding the next open IRQ window. This, in turn, depends on
4906		 * well-behaving guests: They have to keep IRQs disabled at
4907		 * least as long as the NMI handler runs. Otherwise we may
4908		 * cause NMI nesting, maybe breaking the guest. But as this is
4909		 * highly unlikely, we can live with the residual risk.
4910		 */
4911		vmx->soft_vnmi_blocked = 1;
4912		vmx->vnmi_blocked_time = 0;
4913	}
4914
4915	++vcpu->stat.nmi_injections;
4916	vmx->nmi_known_unmasked = false;
4917	if (vmx->rmode.vm86_active) {
4918		if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
4919			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
4920		return;
4921	}
4922	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
4923			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
4924}
4925
4926static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
4927{
4928	if (!cpu_has_virtual_nmis())
4929		return to_vmx(vcpu)->soft_vnmi_blocked;
4930	if (to_vmx(vcpu)->nmi_known_unmasked)
4931		return false;
4932	return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)	& GUEST_INTR_STATE_NMI;
4933}
4934
4935static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
4936{
4937	struct vcpu_vmx *vmx = to_vmx(vcpu);
4938
4939	if (!cpu_has_virtual_nmis()) {
4940		if (vmx->soft_vnmi_blocked != masked) {
4941			vmx->soft_vnmi_blocked = masked;
4942			vmx->vnmi_blocked_time = 0;
4943		}
4944	} else {
4945		vmx->nmi_known_unmasked = !masked;
4946		if (masked)
4947			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
4948				      GUEST_INTR_STATE_NMI);
4949		else
4950			vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
4951					GUEST_INTR_STATE_NMI);
4952	}
4953}
4954
4955static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
4956{
4957	if (to_vmx(vcpu)->nested.nested_run_pending)
4958		return 0;
4959
4960	if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
4961		return 0;
4962
4963	return	!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
4964		  (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
4965		   | GUEST_INTR_STATE_NMI));
4966}
4967
4968static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
4969{
4970	return (!to_vmx(vcpu)->nested.nested_run_pending &&
4971		vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
4972		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
4973			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
4974}
4975
4976static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
4977{
4978	int ret;
4979	struct kvm_userspace_memory_region tss_mem = {
4980		.slot = TSS_PRIVATE_MEMSLOT,
4981		.guest_phys_addr = addr,
4982		.memory_size = PAGE_SIZE * 3,
4983		.flags = 0,
4984	};
4985
4986	ret = kvm_set_memory_region(kvm, &tss_mem);
4987	if (ret)
4988		return ret;
4989	kvm->arch.tss_addr = addr;
4990	return init_rmode_tss(kvm);
4991}
4992
4993static bool rmode_exception(struct kvm_vcpu *vcpu, int vec)
4994{
4995	switch (vec) {
4996	case BP_VECTOR:
4997		/*
4998		 * Update instruction length as we may reinject the exception
4999		 * from user space while in guest debugging mode.
5000		 */
5001		to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
5002			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
5003		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
5004			return false;
5005		/* fall through */
5006	case DB_VECTOR:
5007		if (vcpu->guest_debug &
5008			(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
5009			return false;
5010		/* fall through */
5011	case DE_VECTOR:
5012	case OF_VECTOR:
5013	case BR_VECTOR:
5014	case UD_VECTOR:
5015	case DF_VECTOR:
5016	case SS_VECTOR:
5017	case GP_VECTOR:
5018	case MF_VECTOR:
5019		return true;
5021	}
5022	return false;
5023}
5024
5025static int handle_rmode_exception(struct kvm_vcpu *vcpu,
5026				  int vec, u32 err_code)
5027{
5028	/*
	 * An instruction with the address-size override prefix (opcode 0x67)
	 * causes a #SS fault with error code 0 in VM86 mode.
5031	 */
5032	if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
5033		if (emulate_instruction(vcpu, 0) == EMULATE_DONE) {
5034			if (vcpu->arch.halt_request) {
5035				vcpu->arch.halt_request = 0;
5036				return kvm_vcpu_halt(vcpu);
5037			}
5038			return 1;
5039		}
5040		return 0;
5041	}
5042
5043	/*
5044	 * Forward all other exceptions that are valid in real mode.
5045	 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
5046	 *        the required debugging infrastructure rework.
5047	 */
5048	kvm_queue_exception(vcpu, vec);
5049	return 1;
5050}
5051
5052/*
5053 * Trigger machine check on the host. We assume all the MSRs are already set up
5054 * by the CPU and that we still run on the same CPU as the MCE occurred on.
5055 * We pass a fake environment to the machine check handler because we want
5056 * the guest to be always treated like user space, no matter what context
5057 * it used internally.
5058 */
5059static void kvm_machine_check(void)
5060{
5061#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
5062	struct pt_regs regs = {
5063		.cs = 3, /* Fake ring 3 no matter what the guest ran on */
5064		.flags = X86_EFLAGS_IF,
5065	};
5066
5067	do_machine_check(&regs, 0);
5068#endif
5069}
5070
5071static int handle_machine_check(struct kvm_vcpu *vcpu)
5072{
5073	/* already handled by vcpu_run */
5074	return 1;
5075}
5076
5077static int handle_exception(struct kvm_vcpu *vcpu)
5078{
5079	struct vcpu_vmx *vmx = to_vmx(vcpu);
5080	struct kvm_run *kvm_run = vcpu->run;
5081	u32 intr_info, ex_no, error_code;
5082	unsigned long cr2, rip, dr6;
5083	u32 vect_info;
5084	enum emulation_result er;
5085
5086	vect_info = vmx->idt_vectoring_info;
5087	intr_info = vmx->exit_intr_info;
5088
5089	if (is_machine_check(intr_info))
5090		return handle_machine_check(vcpu);
5091
5092	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
5093		return 1;  /* already handled by vmx_vcpu_run() */
5094
5095	if (is_no_device(intr_info)) {
5096		vmx_fpu_activate(vcpu);
5097		return 1;
5098	}
5099
5100	if (is_invalid_opcode(intr_info)) {
5101		if (is_guest_mode(vcpu)) {
5102			kvm_queue_exception(vcpu, UD_VECTOR);
5103			return 1;
5104		}
5105		er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
5106		if (er != EMULATE_DONE)
5107			kvm_queue_exception(vcpu, UD_VECTOR);
5108		return 1;
5109	}
5110
5111	error_code = 0;
5112	if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
5113		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
5114
5115	/*
5116	 * The #PF with PFEC.RSVD = 1 indicates the guest is accessing
5117	 * MMIO, it is better to report an internal error.
5118	 * See the comments in vmx_handle_exit.
5119	 */
5120	if ((vect_info & VECTORING_INFO_VALID_MASK) &&
5121	    !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
5122		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5123		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
5124		vcpu->run->internal.ndata = 3;
5125		vcpu->run->internal.data[0] = vect_info;
5126		vcpu->run->internal.data[1] = intr_info;
5127		vcpu->run->internal.data[2] = error_code;
5128		return 0;
5129	}
5130
5131	if (is_page_fault(intr_info)) {
5132		/* EPT won't cause page fault directly */
5133		BUG_ON(enable_ept);
5134		cr2 = vmcs_readl(EXIT_QUALIFICATION);
5135		trace_kvm_page_fault(cr2, error_code);
5136
5137		if (kvm_event_needs_reinjection(vcpu))
5138			kvm_mmu_unprotect_page_virt(vcpu, cr2);
5139		return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
5140	}
5141
5142	ex_no = intr_info & INTR_INFO_VECTOR_MASK;
5143
5144	if (vmx->rmode.vm86_active && rmode_exception(vcpu, ex_no))
5145		return handle_rmode_exception(vcpu, ex_no, error_code);
5146
5147	switch (ex_no) {
5148	case AC_VECTOR:
5149		kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
5150		return 1;
5151	case DB_VECTOR:
5152		dr6 = vmcs_readl(EXIT_QUALIFICATION);
5153		if (!(vcpu->guest_debug &
5154		      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
5155			vcpu->arch.dr6 &= ~15;
5156			vcpu->arch.dr6 |= dr6 | DR6_RTM;
5157			if (!(dr6 & ~DR6_RESERVED)) /* icebp */
5158				skip_emulated_instruction(vcpu);
5159
5160			kvm_queue_exception(vcpu, DB_VECTOR);
5161			return 1;
5162		}
5163		kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
5164		kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
5165		/* fall through */
5166	case BP_VECTOR:
5167		/*
5168		 * Update instruction length as we may reinject #BP from
5169		 * user space while in guest debugging mode. Reading it for
5170		 * #DB as well causes no harm, it is not used in that case.
5171		 */
5172		vmx->vcpu.arch.event_exit_inst_len =
5173			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
5174		kvm_run->exit_reason = KVM_EXIT_DEBUG;
5175		rip = kvm_rip_read(vcpu);
5176		kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
5177		kvm_run->debug.arch.exception = ex_no;
5178		break;
5179	default:
5180		kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
5181		kvm_run->ex.exception = ex_no;
5182		kvm_run->ex.error_code = error_code;
5183		break;
5184	}
5185	return 0;
5186}
5187
5188static int handle_external_interrupt(struct kvm_vcpu *vcpu)
5189{
5190	++vcpu->stat.irq_exits;
5191	return 1;
5192}
5193
5194static int handle_triple_fault(struct kvm_vcpu *vcpu)
5195{
5196	vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
5197	return 0;
5198}
5199
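/*
 * For I/O-instruction exits the exit qualification encodes: bits 2:0 the
 * access size minus one, bit 3 the direction (1 = IN), bit 4 whether it
 * was a string instruction, and bits 31:16 the port number.
 */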
5200static int handle_io(struct kvm_vcpu *vcpu)
5201{
5202	unsigned long exit_qualification;
5203	int size, in, string;
5204	unsigned port;
5205
5206	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5207	string = (exit_qualification & 16) != 0;
5208	in = (exit_qualification & 8) != 0;
5209
5210	++vcpu->stat.io_exits;
5211
5212	if (string || in)
5213		return emulate_instruction(vcpu, 0) == EMULATE_DONE;
5214
5215	port = exit_qualification >> 16;
5216	size = (exit_qualification & 7) + 1;
5217	skip_emulated_instruction(vcpu);
5218
5219	return kvm_fast_pio_out(vcpu, size, port);
5220}
5221
5222static void
5223vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
5224{
5225	/*
5226	 * Patch in the VMCALL instruction:
5227	 */
5228	hypercall[0] = 0x0f;
5229	hypercall[1] = 0x01;
5230	hypercall[2] = 0xc1;
5231}
5232
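/*
 * Under VMX operation CR0.PE and CR0.PG are normally fixed to 1; L2 may
 * clear them only if both L0 and L1 expose the "unrestricted guest"
 * secondary execution control.
 */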
5233static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
5234{
5235	unsigned long always_on = VMXON_CR0_ALWAYSON;
5236	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5237
5238	if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high &
5239		SECONDARY_EXEC_UNRESTRICTED_GUEST &&
5240	    nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
5241		always_on &= ~(X86_CR0_PE | X86_CR0_PG);
5242	return (val & always_on) == always_on;
5243}
5244
5245/* called to set cr0 as appropriate for a mov-to-cr0 exit. */
5246static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val)
5247{
5248	if (is_guest_mode(vcpu)) {
5249		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5250		unsigned long orig_val = val;
5251
5252		/*
5253		 * We get here when L2 changed cr0 in a way that did not change
5254		 * any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
5255		 * but did change L0 shadowed bits. So we first calculate the
5256		 * effective cr0 value that L1 would like to write into the
5257		 * hardware. It consists of the L2-owned bits from the new
5258		 * value combined with the L1-owned bits from L1's guest_cr0.
5259		 */
5260		val = (val & ~vmcs12->cr0_guest_host_mask) |
5261			(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
5262
5263		if (!nested_cr0_valid(vcpu, val))
5264			return 1;
5265
5266		if (kvm_set_cr0(vcpu, val))
5267			return 1;
5268		vmcs_writel(CR0_READ_SHADOW, orig_val);
5269		return 0;
5270	} else {
5271		if (to_vmx(vcpu)->nested.vmxon &&
5272		    ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON))
5273			return 1;
5274		return kvm_set_cr0(vcpu, val);
5275	}
5276}
5277
5278static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
5279{
5280	if (is_guest_mode(vcpu)) {
5281		struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5282		unsigned long orig_val = val;
5283
5284		/* analogously to handle_set_cr0 */
5285		val = (val & ~vmcs12->cr4_guest_host_mask) |
5286			(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
5287		if (kvm_set_cr4(vcpu, val))
5288			return 1;
5289		vmcs_writel(CR4_READ_SHADOW, orig_val);
5290		return 0;
5291	} else
5292		return kvm_set_cr4(vcpu, val);
5293}
5294
/* called to set cr0 as appropriate for a clts instruction exit. */
5296static void handle_clts(struct kvm_vcpu *vcpu)
5297{
5298	if (is_guest_mode(vcpu)) {
5299		/*
5300		 * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS
		 * but we did (!fpu_active). We need to keep GUEST_CR0.TS on
		 * but pretend it's off (also in arch.cr0, for fpu_activate).
5303		 */
5304		vmcs_writel(CR0_READ_SHADOW,
5305			vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
5306		vcpu->arch.cr0 &= ~X86_CR0_TS;
5307	} else
5308		vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
5309}
5310
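/*
 * For CR-access exits the exit qualification encodes: bits 3:0 the control
 * register number, bits 5:4 the access type (0 = MOV to CR, 1 = MOV from
 * CR, 2 = CLTS, 3 = LMSW), bits 11:8 the GPR involved, and for LMSW bits
 * 31:16 the source data.
 */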
5311static int handle_cr(struct kvm_vcpu *vcpu)
5312{
5313	unsigned long exit_qualification, val;
5314	int cr;
5315	int reg;
5316	int err;
5317
5318	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5319	cr = exit_qualification & 15;
5320	reg = (exit_qualification >> 8) & 15;
5321	switch ((exit_qualification >> 4) & 3) {
5322	case 0: /* mov to cr */
5323		val = kvm_register_readl(vcpu, reg);
5324		trace_kvm_cr_write(cr, val);
5325		switch (cr) {
5326		case 0:
5327			err = handle_set_cr0(vcpu, val);
5328			kvm_complete_insn_gp(vcpu, err);
5329			return 1;
5330		case 3:
5331			err = kvm_set_cr3(vcpu, val);
5332			kvm_complete_insn_gp(vcpu, err);
5333			return 1;
5334		case 4:
5335			err = handle_set_cr4(vcpu, val);
5336			kvm_complete_insn_gp(vcpu, err);
5337			return 1;
5338		case 8: {
5339				u8 cr8_prev = kvm_get_cr8(vcpu);
5340				u8 cr8 = (u8)val;
5341				err = kvm_set_cr8(vcpu, cr8);
5342				kvm_complete_insn_gp(vcpu, err);
5343				if (irqchip_in_kernel(vcpu->kvm))
5344					return 1;
5345				if (cr8_prev <= cr8)
5346					return 1;
5347				vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
5348				return 0;
5349			}
5350		}
5351		break;
5352	case 2: /* clts */
5353		handle_clts(vcpu);
5354		trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
5355		skip_emulated_instruction(vcpu);
5356		vmx_fpu_activate(vcpu);
5357		return 1;
5358	case 1: /*mov from cr*/
5359		switch (cr) {
5360		case 3:
5361			val = kvm_read_cr3(vcpu);
5362			kvm_register_write(vcpu, reg, val);
5363			trace_kvm_cr_read(cr, val);
5364			skip_emulated_instruction(vcpu);
5365			return 1;
5366		case 8:
5367			val = kvm_get_cr8(vcpu);
5368			kvm_register_write(vcpu, reg, val);
5369			trace_kvm_cr_read(cr, val);
5370			skip_emulated_instruction(vcpu);
5371			return 1;
5372		}
5373		break;
5374	case 3: /* lmsw */
5375		val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5376		trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
5377		kvm_lmsw(vcpu, val);
5378
5379		skip_emulated_instruction(vcpu);
5380		return 1;
5381	default:
5382		break;
5383	}
5384	vcpu->run->exit_reason = 0;
5385	vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
5386	       (int)(exit_qualification >> 4) & 3, cr);
5387	return 0;
5388}
5389
5390static int handle_dr(struct kvm_vcpu *vcpu)
5391{
5392	unsigned long exit_qualification;
5393	int dr, dr7, reg;
5394
5395	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5396	dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
5397
5398	/* First, if DR does not exist, trigger UD */
5399	if (!kvm_require_dr(vcpu, dr))
5400		return 1;
5401
	/* Do not handle if CPL > 0; a #GP will be triggered on re-entry */
5403	if (!kvm_require_cpl(vcpu, 0))
5404		return 1;
5405	dr7 = vmcs_readl(GUEST_DR7);
5406	if (dr7 & DR7_GD) {
5407		/*
5408		 * As the vm-exit takes precedence over the debug trap, we
5409		 * need to emulate the latter, either for the host or the
5410		 * guest debugging itself.
5411		 */
5412		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
5413			vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
5414			vcpu->run->debug.arch.dr7 = dr7;
5415			vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
5416			vcpu->run->debug.arch.exception = DB_VECTOR;
5417			vcpu->run->exit_reason = KVM_EXIT_DEBUG;
5418			return 0;
5419		} else {
5420			vcpu->arch.dr6 &= ~15;
5421			vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
5422			kvm_queue_exception(vcpu, DB_VECTOR);
5423			return 1;
5424		}
5425	}
5426
5427	if (vcpu->guest_debug == 0) {
5428		u32 cpu_based_vm_exec_control;
5429
5430		cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
5431		cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING;
5432		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
5433
5434		/*
5435		 * No more DR vmexits; force a reload of the debug registers
5436		 * and reenter on this instruction.  The next vmexit will
5437		 * retrieve the full state of the debug registers.
5438		 */
5439		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
5440		return 1;
5441	}
5442
5443	reg = DEBUG_REG_ACCESS_REG(exit_qualification);
5444	if (exit_qualification & TYPE_MOV_FROM_DR) {
5445		unsigned long val;
5446
5447		if (kvm_get_dr(vcpu, dr, &val))
5448			return 1;
5449		kvm_register_write(vcpu, reg, val);
5450	} else
5451		if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg)))
5452			return 1;
5453
5454	skip_emulated_instruction(vcpu);
5455	return 1;
5456}
5457
5458static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
5459{
5460	return vcpu->arch.dr6;
5461}
5462
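/*
 * There is no VMCS field for guest DR6; the value is kept in vcpu->arch.dr6
 * and loaded by common code, so nothing needs to be written back here.
 */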
5463static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
5464{
5465}
5466
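/*
 * Called after the guest was allowed to touch the debug registers without
 * exiting (see handle_dr): read the current hardware values back into the
 * vcpu and re-enable MOV DR exiting.
 */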
5467static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
5468{
5469	u32 cpu_based_vm_exec_control;
5470
5471	get_debugreg(vcpu->arch.db[0], 0);
5472	get_debugreg(vcpu->arch.db[1], 1);
5473	get_debugreg(vcpu->arch.db[2], 2);
5474	get_debugreg(vcpu->arch.db[3], 3);
5475	get_debugreg(vcpu->arch.dr6, 6);
5476	vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
5477
5478	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
5479
5480	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
5481	cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING;
5482	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
5483}
5484
5485static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
5486{
5487	vmcs_writel(GUEST_DR7, val);
5488}
5489
5490static int handle_cpuid(struct kvm_vcpu *vcpu)
5491{
5492	kvm_emulate_cpuid(vcpu);
5493	return 1;
5494}
5495
5496static int handle_rdmsr(struct kvm_vcpu *vcpu)
5497{
5498	u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
5499	u64 data;
5500
5501	if (vmx_get_msr(vcpu, ecx, &data)) {
5502		trace_kvm_msr_read_ex(ecx);
5503		kvm_inject_gp(vcpu, 0);
5504		return 1;
5505	}
5506
5507	trace_kvm_msr_read(ecx, data);
5508
5509	/* FIXME: handling of bits 32:63 of rax, rdx */
5510	vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
5511	vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
5512	skip_emulated_instruction(vcpu);
5513	return 1;
5514}
5515
5516static int handle_wrmsr(struct kvm_vcpu *vcpu)
5517{
5518	struct msr_data msr;
5519	u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
5520	u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
5521		| ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
5522
5523	msr.data = data;
5524	msr.index = ecx;
5525	msr.host_initiated = false;
5526	if (kvm_set_msr(vcpu, &msr) != 0) {
5527		trace_kvm_msr_write_ex(ecx, data);
5528		kvm_inject_gp(vcpu, 0);
5529		return 1;
5530	}
5531
5532	trace_kvm_msr_write(ecx, data);
5533	skip_emulated_instruction(vcpu);
5534	return 1;
5535}
5536
5537static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
5538{
5539	kvm_make_request(KVM_REQ_EVENT, vcpu);
5540	return 1;
5541}
5542
5543static int handle_interrupt_window(struct kvm_vcpu *vcpu)
5544{
5545	u32 cpu_based_vm_exec_control;
5546
5547	/* clear pending irq */
5548	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
5549	cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
5550	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
5551
5552	kvm_make_request(KVM_REQ_EVENT, vcpu);
5553
5554	++vcpu->stat.irq_window_exits;
5555
	/*
	 * If userspace is waiting to inject interrupts, exit as soon as
	 * possible.
	 */
5560	if (!irqchip_in_kernel(vcpu->kvm) &&
5561	    vcpu->run->request_interrupt_window &&
5562	    !kvm_cpu_has_interrupt(vcpu)) {
5563		vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
5564		return 0;
5565	}
5566	return 1;
5567}
5568
5569static int handle_halt(struct kvm_vcpu *vcpu)
5570{
5571	return kvm_emulate_halt(vcpu);
5572}
5573
5574static int handle_vmcall(struct kvm_vcpu *vcpu)
5575{
5576	kvm_emulate_hypercall(vcpu);
5577	return 1;
5578}
5579
5580static int handle_invd(struct kvm_vcpu *vcpu)
5581{
5582	return emulate_instruction(vcpu, 0) == EMULATE_DONE;
5583}
5584
5585static int handle_invlpg(struct kvm_vcpu *vcpu)
5586{
5587	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5588
5589	kvm_mmu_invlpg(vcpu, exit_qualification);
5590	skip_emulated_instruction(vcpu);
5591	return 1;
5592}
5593
5594static int handle_rdpmc(struct kvm_vcpu *vcpu)
5595{
5596	int err;
5597
5598	err = kvm_rdpmc(vcpu);
5599	kvm_complete_insn_gp(vcpu, err);
5600
5601	return 1;
5602}
5603
5604static int handle_wbinvd(struct kvm_vcpu *vcpu)
5605{
5606	kvm_emulate_wbinvd(vcpu);
5607	return 1;
5608}
5609
5610static int handle_xsetbv(struct kvm_vcpu *vcpu)
5611{
5612	u64 new_bv = kvm_read_edx_eax(vcpu);
5613	u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
5614
5615	if (kvm_set_xcr(vcpu, index, new_bv) == 0)
5616		skip_emulated_instruction(vcpu);
5617	return 1;
5618}
5619
5620static int handle_xsaves(struct kvm_vcpu *vcpu)
5621{
5622	skip_emulated_instruction(vcpu);
5623	WARN(1, "this should never happen\n");
5624	return 1;
5625}
5626
5627static int handle_xrstors(struct kvm_vcpu *vcpu)
5628{
5629	skip_emulated_instruction(vcpu);
5630	WARN(1, "this should never happen\n");
5631	return 1;
5632}
5633
5634static int handle_apic_access(struct kvm_vcpu *vcpu)
5635{
5636	if (likely(fasteoi)) {
5637		unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5638		int access_type, offset;
5639
5640		access_type = exit_qualification & APIC_ACCESS_TYPE;
5641		offset = exit_qualification & APIC_ACCESS_OFFSET;
		/*
		 * A sane guest uses MOV to write the EOI register, and the
		 * written value is ignored.  Short-circuit that case here to
		 * avoid heavy instruction emulation.
		 */
5647		if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
5648		    (offset == APIC_EOI)) {
5649			kvm_lapic_set_eoi(vcpu);
5650			skip_emulated_instruction(vcpu);
5651			return 1;
5652		}
5653	}
5654	return emulate_instruction(vcpu, 0) == EMULATE_DONE;
5655}
5656
5657static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
5658{
5659	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5660	int vector = exit_qualification & 0xff;
5661
5662	/* EOI-induced VM exit is trap-like and thus no need to adjust IP */
5663	kvm_apic_set_eoi_accelerated(vcpu, vector);
5664	return 1;
5665}
5666
5667static int handle_apic_write(struct kvm_vcpu *vcpu)
5668{
5669	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5670	u32 offset = exit_qualification & 0xfff;
5671
5672	/* APIC-write VM exit is trap-like and thus no need to adjust IP */
5673	kvm_apic_write_nodecode(vcpu, offset);
5674	return 1;
5675}
5676
5677static int handle_task_switch(struct kvm_vcpu *vcpu)
5678{
5679	struct vcpu_vmx *vmx = to_vmx(vcpu);
5680	unsigned long exit_qualification;
5681	bool has_error_code = false;
5682	u32 error_code = 0;
5683	u16 tss_selector;
5684	int reason, type, idt_v, idt_index;
5685
5686	idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
5687	idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
5688	type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
5689
5690	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5691
5692	reason = (u32)exit_qualification >> 30;
5693	if (reason == TASK_SWITCH_GATE && idt_v) {
5694		switch (type) {
5695		case INTR_TYPE_NMI_INTR:
5696			vcpu->arch.nmi_injected = false;
5697			vmx_set_nmi_mask(vcpu, true);
5698			break;
5699		case INTR_TYPE_EXT_INTR:
5700		case INTR_TYPE_SOFT_INTR:
5701			kvm_clear_interrupt_queue(vcpu);
5702			break;
5703		case INTR_TYPE_HARD_EXCEPTION:
5704			if (vmx->idt_vectoring_info &
5705			    VECTORING_INFO_DELIVER_CODE_MASK) {
5706				has_error_code = true;
5707				error_code =
5708					vmcs_read32(IDT_VECTORING_ERROR_CODE);
5709			}
5710			/* fall through */
5711		case INTR_TYPE_SOFT_EXCEPTION:
5712			kvm_clear_exception_queue(vcpu);
5713			break;
5714		default:
5715			break;
5716		}
5717	}
5718	tss_selector = exit_qualification;
5719
5720	if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
5721		       type != INTR_TYPE_EXT_INTR &&
5722		       type != INTR_TYPE_NMI_INTR))
5723		skip_emulated_instruction(vcpu);
5724
5725	if (kvm_task_switch(vcpu, tss_selector,
5726			    type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
5727			    has_error_code, error_code) == EMULATE_FAIL) {
5728		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5729		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
5730		vcpu->run->internal.ndata = 0;
5731		return 0;
5732	}
5733
5734	/* clear all local breakpoint enable flags */
5735	vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x155);
5736
5737	/*
5738	 * TODO: What about debug traps on tss switch?
5739	 *       Are we supposed to inject them and update dr6?
5740	 */
5741
5742	return 1;
5743}
5744
5745static int handle_ept_violation(struct kvm_vcpu *vcpu)
5746{
5747	unsigned long exit_qualification;
5748	gpa_t gpa;
5749	u32 error_code;
5750	int gla_validity;
5751
5752	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5753
5754	gla_validity = (exit_qualification >> 7) & 0x3;
5755	if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
5756		printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
5757		printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
5758			(long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
5759			vmcs_readl(GUEST_LINEAR_ADDRESS));
5760		printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
5761			(long unsigned int)exit_qualification);
5762		vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
5763		vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
5764		return 0;
5765	}
5766
5767	/*
5768	 * EPT violation happened while executing iret from NMI,
5769	 * "blocked by NMI" bit has to be set before next VM entry.
5770	 * There are errata that may cause this bit to not be set:
5771	 * AAK134, BY25.
5772	 */
5773	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5774			cpu_has_virtual_nmis() &&
5775			(exit_qualification & INTR_INFO_UNBLOCK_NMI))
5776		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
5777
5778	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5779	trace_kvm_page_fault(gpa, exit_qualification);
5780
	/* Is it a write fault? */
	error_code = exit_qualification & PFERR_WRITE_MASK;
	/* Is it a fetch fault? */
	error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK;
	/* Is the EPT paging-structure entry present? */
5786	error_code |= (exit_qualification >> 3) & PFERR_PRESENT_MASK;
5787
5788	vcpu->arch.exit_qualification = exit_qualification;
5789
5790	return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
5791}
5792
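/*
 * Build the mask of bits that are reserved in an EPT paging-structure entry
 * at the given level, so that a misconfigured SPTE can be diagnosed below.
 */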
5793static u64 ept_rsvd_mask(u64 spte, int level)
5794{
5795	int i;
5796	u64 mask = 0;
5797
5798	for (i = 51; i > boot_cpu_data.x86_phys_bits; i--)
5799		mask |= (1ULL << i);
5800
5801	if (level == 4)
5802		/* bits 7:3 reserved */
5803		mask |= 0xf8;
5804	else if (spte & (1ULL << 7))
5805		/*
5806		 * 1GB/2MB page, bits 29:12 or 20:12 reserved respectively,
5807		 * level == 1 if the hypervisor is using the ignored bit 7.
5808		 */
5809		mask |= (PAGE_SIZE << ((level - 1) * 9)) - PAGE_SIZE;
5810	else if (level > 1)
5811		/* bits 6:3 reserved */
5812		mask |= 0x78;
5813
5814	return mask;
5815}
5816
5817static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
5818				       int level)
5819{
5820	printk(KERN_ERR "%s: spte 0x%llx level %d\n", __func__, spte, level);
5821
5822	/* 010b (write-only) */
5823	WARN_ON((spte & 0x7) == 0x2);
5824
5825	/* 110b (write/execute) */
5826	WARN_ON((spte & 0x7) == 0x6);
5827
5828	/* 100b (execute-only) and value not supported by logical processor */
5829	if (!cpu_has_vmx_ept_execute_only())
5830		WARN_ON((spte & 0x7) == 0x4);
5831
5832	/* not 000b */
5833	if ((spte & 0x7)) {
5834		u64 rsvd_bits = spte & ept_rsvd_mask(spte, level);
5835
5836		if (rsvd_bits != 0) {
5837			printk(KERN_ERR "%s: rsvd_bits = 0x%llx\n",
5838					 __func__, rsvd_bits);
5839			WARN_ON(1);
5840		}
5841
5842		/* bits 5:3 are _not_ reserved for large page or leaf page */
5843		if ((rsvd_bits & 0x38) == 0) {
5844			u64 ept_mem_type = (spte & 0x38) >> 3;
5845
5846			if (ept_mem_type == 2 || ept_mem_type == 3 ||
5847			    ept_mem_type == 7) {
5848				printk(KERN_ERR "%s: ept_mem_type=0x%llx\n",
5849						__func__, ept_mem_type);
5850				WARN_ON(1);
5851			}
5852		}
5853	}
5854}
5855
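/*
 * KVM marks MMIO regions with misconfigured EPT entries.  Try the fast MMIO
 * bus first, then the cached MMIO page-fault path; only if neither
 * recognizes the address is this reported as a genuine EPT misconfiguration.
 */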
5856static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
5857{
5858	u64 sptes[4];
5859	int nr_sptes, i, ret;
5860	gpa_t gpa;
5861
5862	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
5863	if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
5864		skip_emulated_instruction(vcpu);
5865		return 1;
5866	}
5867
5868	ret = handle_mmio_page_fault_common(vcpu, gpa, true);
5869	if (likely(ret == RET_MMIO_PF_EMULATE))
5870		return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
5871					      EMULATE_DONE;
5872
5873	if (unlikely(ret == RET_MMIO_PF_INVALID))
5874		return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0);
5875
5876	if (unlikely(ret == RET_MMIO_PF_RETRY))
5877		return 1;
5878
	/* It is a genuine EPT misconfiguration */
5880	printk(KERN_ERR "EPT: Misconfiguration.\n");
5881	printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
5882
5883	nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes);
5884
5885	for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
5886		ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
5887
5888	vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
5889	vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
5890
5891	return 0;
5892}
5893
5894static int handle_nmi_window(struct kvm_vcpu *vcpu)
5895{
5896	u32 cpu_based_vm_exec_control;
5897
5898	/* clear pending NMI */
5899	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
5900	cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
5901	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
5902	++vcpu->stat.nmi_window_exits;
5903	kvm_make_request(KVM_REQ_EVENT, vcpu);
5904
5905	return 1;
5906}
5907
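/*
 * Emulate the guest instruction by instruction while vmx->emulation_required
 * is set, i.e. while the guest state cannot be run under VMX directly, until
 * the state becomes valid again or an exit to userspace is needed.
 */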
5908static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
5909{
5910	struct vcpu_vmx *vmx = to_vmx(vcpu);
5911	enum emulation_result err = EMULATE_DONE;
5912	int ret = 1;
5913	u32 cpu_exec_ctrl;
5914	bool intr_window_requested;
5915	unsigned count = 130;
5916
5917	cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
5918	intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
5919
5920	while (vmx->emulation_required && count-- != 0) {
5921		if (intr_window_requested && vmx_interrupt_allowed(vcpu))
5922			return handle_interrupt_window(&vmx->vcpu);
5923
5924		if (test_bit(KVM_REQ_EVENT, &vcpu->requests))
5925			return 1;
5926
5927		err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE);
5928
5929		if (err == EMULATE_USER_EXIT) {
5930			++vcpu->stat.mmio_exits;
5931			ret = 0;
5932			goto out;
5933		}
5934
5935		if (err != EMULATE_DONE) {
5936			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5937			vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
5938			vcpu->run->internal.ndata = 0;
5939			return 0;
5940		}
5941
5942		if (vcpu->arch.halt_request) {
5943			vcpu->arch.halt_request = 0;
5944			ret = kvm_vcpu_halt(vcpu);
5945			goto out;
5946		}
5947
5948		if (signal_pending(current))
5949			goto out;
5950		if (need_resched())
5951			schedule();
5952	}
5953
5954out:
5955	return ret;
5956}
5957
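/*
 * The PLE window grows and shrinks either multiplicatively or additively:
 * a modifier smaller than ple_window is treated as a factor, a larger one
 * as an increment/decrement.
 */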
5958static int __grow_ple_window(int val)
5959{
5960	if (ple_window_grow < 1)
5961		return ple_window;
5962
5963	val = min(val, ple_window_actual_max);
5964
5965	if (ple_window_grow < ple_window)
5966		val *= ple_window_grow;
5967	else
5968		val += ple_window_grow;
5969
5970	return val;
5971}
5972
5973static int __shrink_ple_window(int val, int modifier, int minimum)
5974{
5975	if (modifier < 1)
5976		return ple_window;
5977
5978	if (modifier < ple_window)
5979		val /= modifier;
5980	else
5981		val -= modifier;
5982
5983	return max(val, minimum);
5984}
5985
5986static void grow_ple_window(struct kvm_vcpu *vcpu)
5987{
5988	struct vcpu_vmx *vmx = to_vmx(vcpu);
5989	int old = vmx->ple_window;
5990
5991	vmx->ple_window = __grow_ple_window(old);
5992
5993	if (vmx->ple_window != old)
5994		vmx->ple_window_dirty = true;
5995
5996	trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old);
5997}
5998
5999static void shrink_ple_window(struct kvm_vcpu *vcpu)
6000{
6001	struct vcpu_vmx *vmx = to_vmx(vcpu);
6002	int old = vmx->ple_window;
6003
6004	vmx->ple_window = __shrink_ple_window(old,
6005	                                      ple_window_shrink, ple_window);
6006
6007	if (vmx->ple_window != old)
6008		vmx->ple_window_dirty = true;
6009
6010	trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old);
6011}
6012
6013/*
6014 * ple_window_actual_max is computed to be one grow_ple_window() below
6015 * ple_window_max. (See __grow_ple_window for the reason.)
6016 * This prevents overflows, because ple_window_max is int.
 * ple_window_max is effectively rounded down to a multiple of
 * ple_window_grow in this process.
 * The computation also prevents ple_window_max from pushing vmx->ple_window
 * below ple_window.
6020 */
6021static void update_ple_window_actual_max(void)
6022{
6023	ple_window_actual_max =
6024			__shrink_ple_window(max(ple_window_max, ple_window),
6025			                    ple_window_grow, INT_MIN);
6026}
6027
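/*
 * One-time hardware setup: allocate the global I/O, MSR and VMREAD/VMWRITE
 * bitmaps, probe the VMCS configuration and turn off any feature the
 * hardware does not support.
 */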
6028static __init int hardware_setup(void)
6029{
6030	int r = -ENOMEM, i, msr;
6031
6032	rdmsrl_safe(MSR_EFER, &host_efer);
6033
6034	for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
6035		kvm_define_shared_msr(i, vmx_msr_index[i]);
6036
6037	vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
6038	if (!vmx_io_bitmap_a)
6039		return r;
6040
6041	vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
6042	if (!vmx_io_bitmap_b)
6043		goto out;
6044
6045	vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
6046	if (!vmx_msr_bitmap_legacy)
6047		goto out1;
6048
6049	vmx_msr_bitmap_legacy_x2apic =
6050				(unsigned long *)__get_free_page(GFP_KERNEL);
6051	if (!vmx_msr_bitmap_legacy_x2apic)
6052		goto out2;
6053
6054	vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
6055	if (!vmx_msr_bitmap_longmode)
6056		goto out3;
6057
6058	vmx_msr_bitmap_longmode_x2apic =
6059				(unsigned long *)__get_free_page(GFP_KERNEL);
6060	if (!vmx_msr_bitmap_longmode_x2apic)
6061		goto out4;
6062
6063	if (nested) {
6064		vmx_msr_bitmap_nested =
6065			(unsigned long *)__get_free_page(GFP_KERNEL);
6066		if (!vmx_msr_bitmap_nested)
6067			goto out5;
6068	}
6069
6070	vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
6071	if (!vmx_vmread_bitmap)
6072		goto out6;
6073
6074	vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
6075	if (!vmx_vmwrite_bitmap)
6076		goto out7;
6077
6078	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
6079	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
6080
6081	/*
6082	 * Allow direct access to the PC debug port (it is often used for I/O
6083	 * delays, but the vmexits simply slow things down).
6084	 */
6085	memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
6086	clear_bit(0x80, vmx_io_bitmap_a);
6087
6088	memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
6089
6090	memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
6091	memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
6092	if (nested)
6093		memset(vmx_msr_bitmap_nested, 0xff, PAGE_SIZE);
6094
6095	if (setup_vmcs_config(&vmcs_config) < 0) {
6096		r = -EIO;
6097		goto out8;
6098	}
6099
6100	if (boot_cpu_has(X86_FEATURE_NX))
6101		kvm_enable_efer_bits(EFER_NX);
6102
6103	if (!cpu_has_vmx_vpid())
6104		enable_vpid = 0;
6105	if (!cpu_has_vmx_shadow_vmcs())
6106		enable_shadow_vmcs = 0;
6107	if (enable_shadow_vmcs)
6108		init_vmcs_shadow_fields();
6109
6110	if (!cpu_has_vmx_ept() ||
6111	    !cpu_has_vmx_ept_4levels()) {
6112		enable_ept = 0;
6113		enable_unrestricted_guest = 0;
6114		enable_ept_ad_bits = 0;
6115	}
6116
6117	if (!cpu_has_vmx_ept_ad_bits())
6118		enable_ept_ad_bits = 0;
6119
6120	if (!cpu_has_vmx_unrestricted_guest())
6121		enable_unrestricted_guest = 0;
6122
6123	if (!cpu_has_vmx_flexpriority())
6124		flexpriority_enabled = 0;
6125
6126	/*
6127	 * set_apic_access_page_addr() is used to reload apic access
6128	 * page upon invalidation.  No need to do anything if not
6129	 * using the APIC_ACCESS_ADDR VMCS field.
6130	 */
6131	if (!flexpriority_enabled)
6132		kvm_x86_ops->set_apic_access_page_addr = NULL;
6133
6134	if (!cpu_has_vmx_tpr_shadow())
6135		kvm_x86_ops->update_cr8_intercept = NULL;
6136
6137	if (enable_ept && !cpu_has_vmx_ept_2m_page())
6138		kvm_disable_largepages();
6139
6140	if (!cpu_has_vmx_ple())
6141		ple_gap = 0;
6142
6143	if (!cpu_has_vmx_apicv())
6144		enable_apicv = 0;
6145
6146	if (enable_apicv)
6147		kvm_x86_ops->update_cr8_intercept = NULL;
6148	else {
6149		kvm_x86_ops->hwapic_irr_update = NULL;
6150		kvm_x86_ops->hwapic_isr_update = NULL;
6151		kvm_x86_ops->deliver_posted_interrupt = NULL;
6152		kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
6153	}
6154
6155	vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
6156	vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
6157	vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
6158	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
6159	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
6160	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
6161	vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);
6162
6163	memcpy(vmx_msr_bitmap_legacy_x2apic,
6164			vmx_msr_bitmap_legacy, PAGE_SIZE);
6165	memcpy(vmx_msr_bitmap_longmode_x2apic,
6166			vmx_msr_bitmap_longmode, PAGE_SIZE);
6167
6168	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
6169
6170	if (enable_apicv) {
6171		for (msr = 0x800; msr <= 0x8ff; msr++)
6172			vmx_disable_intercept_msr_read_x2apic(msr);
6173
		/*
		 * According to the SDM, in x2apic mode the whole ID register
		 * is used, but KVM only uses the highest eight bits, so the
		 * read must be intercepted.
		 */
6177		vmx_enable_intercept_msr_read_x2apic(0x802);
6178		/* TMCCT */
6179		vmx_enable_intercept_msr_read_x2apic(0x839);
6180		/* TPR */
6181		vmx_disable_intercept_msr_write_x2apic(0x808);
6182		/* EOI */
6183		vmx_disable_intercept_msr_write_x2apic(0x80b);
6184		/* SELF-IPI */
6185		vmx_disable_intercept_msr_write_x2apic(0x83f);
6186	}
6187
6188	if (enable_ept) {
6189		kvm_mmu_set_mask_ptes(0ull,
6190			(enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
6191			(enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
6192			0ull, VMX_EPT_EXECUTABLE_MASK);
6193		ept_set_mmio_spte_mask();
6194		kvm_enable_tdp();
6195	} else
6196		kvm_disable_tdp();
6197
6198	update_ple_window_actual_max();
6199
6200	/*
6201	 * Only enable PML when hardware supports PML feature, and both EPT
6202	 * and EPT A/D bit features are enabled -- PML depends on them to work.
6203	 */
6204	if (!enable_ept || !enable_ept_ad_bits || !cpu_has_vmx_pml())
6205		enable_pml = 0;
6206
6207	if (!enable_pml) {
6208		kvm_x86_ops->slot_enable_log_dirty = NULL;
6209		kvm_x86_ops->slot_disable_log_dirty = NULL;
6210		kvm_x86_ops->flush_log_dirty = NULL;
6211		kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
6212	}
6213
6214	return alloc_kvm_area();
6215
6216out8:
6217	free_page((unsigned long)vmx_vmwrite_bitmap);
6218out7:
6219	free_page((unsigned long)vmx_vmread_bitmap);
6220out6:
6221	if (nested)
6222		free_page((unsigned long)vmx_msr_bitmap_nested);
6223out5:
6224	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
6225out4:
6226	free_page((unsigned long)vmx_msr_bitmap_longmode);
6227out3:
6228	free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
6229out2:
6230	free_page((unsigned long)vmx_msr_bitmap_legacy);
6231out1:
6232	free_page((unsigned long)vmx_io_bitmap_b);
6233out:
6234	free_page((unsigned long)vmx_io_bitmap_a);
6235
	return r;
6237}
6238
6239static __exit void hardware_unsetup(void)
6240{
6241	free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
6242	free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
6243	free_page((unsigned long)vmx_msr_bitmap_legacy);
6244	free_page((unsigned long)vmx_msr_bitmap_longmode);
6245	free_page((unsigned long)vmx_io_bitmap_b);
6246	free_page((unsigned long)vmx_io_bitmap_a);
6247	free_page((unsigned long)vmx_vmwrite_bitmap);
6248	free_page((unsigned long)vmx_vmread_bitmap);
6249	if (nested)
6250		free_page((unsigned long)vmx_msr_bitmap_nested);
6251
6252	free_kvm_area();
6253}
6254
6255/*
 * Indicate a vcpu that is busy-waiting on a spinlock. We do not enable plain
 * PAUSE exiting, so we only get here on CPUs with PAUSE-Loop-Exiting.
6258 */
6259static int handle_pause(struct kvm_vcpu *vcpu)
6260{
6261	if (ple_gap)
6262		grow_ple_window(vcpu);
6263
6264	skip_emulated_instruction(vcpu);
6265	kvm_vcpu_on_spin(vcpu);
6266
6267	return 1;
6268}
6269
6270static int handle_nop(struct kvm_vcpu *vcpu)
6271{
6272	skip_emulated_instruction(vcpu);
6273	return 1;
6274}
6275
6276static int handle_mwait(struct kvm_vcpu *vcpu)
6277{
6278	printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
6279	return handle_nop(vcpu);
6280}
6281
6282static int handle_monitor(struct kvm_vcpu *vcpu)
6283{
6284	printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
6285	return handle_nop(vcpu);
6286}
6287
6288/*
6289 * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
6290 * We could reuse a single VMCS for all the L2 guests, but we also want the
6291 * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
6292 * allows keeping them loaded on the processor, and in the future will allow
6293 * optimizations where prepare_vmcs02 doesn't need to set all the fields on
6294 * every entry if they never change.
6295 * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
 * (>=0) with a vmcs02 for each recently loaded vmcs12, most recent first.
6297 *
6298 * The following functions allocate and free a vmcs02 in this pool.
6299 */
6300
6301/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
6302static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
6303{
6304	struct vmcs02_list *item;
6305	list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
6306		if (item->vmptr == vmx->nested.current_vmptr) {
6307			list_move(&item->list, &vmx->nested.vmcs02_pool);
6308			return &item->vmcs02;
6309		}
6310
6311	if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
6312		/* Recycle the least recently used VMCS. */
6313		item = list_entry(vmx->nested.vmcs02_pool.prev,
6314			struct vmcs02_list, list);
6315		item->vmptr = vmx->nested.current_vmptr;
6316		list_move(&item->list, &vmx->nested.vmcs02_pool);
6317		return &item->vmcs02;
6318	}
6319
6320	/* Create a new VMCS */
6321	item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
6322	if (!item)
6323		return NULL;
6324	item->vmcs02.vmcs = alloc_vmcs();
6325	if (!item->vmcs02.vmcs) {
6326		kfree(item);
6327		return NULL;
6328	}
6329	loaded_vmcs_init(&item->vmcs02);
6330	item->vmptr = vmx->nested.current_vmptr;
6331	list_add(&(item->list), &(vmx->nested.vmcs02_pool));
6332	vmx->nested.vmcs02_num++;
6333	return &item->vmcs02;
6334}
6335
6336/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
6337static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
6338{
6339	struct vmcs02_list *item;
6340	list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
6341		if (item->vmptr == vmptr) {
6342			free_loaded_vmcs(&item->vmcs02);
6343			list_del(&item->list);
6344			kfree(item);
6345			vmx->nested.vmcs02_num--;
6346			return;
6347		}
6348}
6349
6350/*
6351 * Free all VMCSs saved for this vcpu, except the one pointed by
6352 * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
6353 * must be &vmx->vmcs01.
6354 */
6355static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
6356{
6357	struct vmcs02_list *item, *n;
6358
6359	WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
6360	list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
6361		/*
6362		 * Something will leak if the above WARN triggers.  Better than
6363		 * a use-after-free.
6364		 */
6365		if (vmx->loaded_vmcs == &item->vmcs02)
6366			continue;
6367
6368		free_loaded_vmcs(&item->vmcs02);
6369		list_del(&item->list);
6370		kfree(item);
6371		vmx->nested.vmcs02_num--;
6372	}
6373}
6374
6375/*
6376 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
6377 * set the success or error code of an emulated VMX instruction, as specified
6378 * by Vol 2B, VMX Instruction Reference, "Conventions".
6379 */
6380static void nested_vmx_succeed(struct kvm_vcpu *vcpu)
6381{
6382	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
6383			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
6384			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
6385}
6386
6387static void nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
6388{
6389	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
6390			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
6391			    X86_EFLAGS_SF | X86_EFLAGS_OF))
6392			| X86_EFLAGS_CF);
6393}
6394
6395static void nested_vmx_failValid(struct kvm_vcpu *vcpu,
6396					u32 vm_instruction_error)
6397{
6398	if (to_vmx(vcpu)->nested.current_vmptr == -1ull) {
6399		/*
6400		 * failValid writes the error number to the current VMCS, which
		 * can't be done if there isn't a current VMCS.
6402		 */
6403		nested_vmx_failInvalid(vcpu);
6404		return;
6405	}
6406	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
6407			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
6408			    X86_EFLAGS_SF | X86_EFLAGS_OF))
6409			| X86_EFLAGS_ZF);
6410	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
6411	/*
6412	 * We don't need to force a shadow sync because
6413	 * VM_INSTRUCTION_ERROR is not shadowed
6414	 */
6415}
6416
6417static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
6418{
	/* TODO: do not simply reset the guest here. */
6420	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
6421	pr_warn("kvm: nested vmx abort, indicator %d\n", indicator);
6422}
6423
6424static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
6425{
6426	struct vcpu_vmx *vmx =
6427		container_of(timer, struct vcpu_vmx, nested.preemption_timer);
6428
6429	vmx->nested.preemption_timer_expired = true;
6430	kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
6431	kvm_vcpu_kick(&vmx->vcpu);
6432
6433	return HRTIMER_NORESTART;
6434}
6435
6436/*
6437 * Decode the memory-address operand of a vmx instruction, as recorded on an
6438 * exit caused by such an instruction (run by a guest hypervisor).
 * On success, returns 0. When the operand is invalid, returns 1 and a #UD
 * or #GP exception is injected.
6441 */
6442static int get_vmx_mem_address(struct kvm_vcpu *vcpu,
6443				 unsigned long exit_qualification,
6444				 u32 vmx_instruction_info, gva_t *ret)
6445{
6446	/*
6447	 * According to Vol. 3B, "Information for VM Exits Due to Instruction
6448	 * Execution", on an exit, vmx_instruction_info holds most of the
6449	 * addressing components of the operand. Only the displacement part
6450	 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
6451	 * For how an actual address is calculated from all these components,
6452	 * refer to Vol. 1, "Operand Addressing".
6453	 */
6454	int  scaling = vmx_instruction_info & 3;
6455	int  addr_size = (vmx_instruction_info >> 7) & 7;
6456	bool is_reg = vmx_instruction_info & (1u << 10);
6457	int  seg_reg = (vmx_instruction_info >> 15) & 7;
6458	int  index_reg = (vmx_instruction_info >> 18) & 0xf;
6459	bool index_is_valid = !(vmx_instruction_info & (1u << 22));
6460	int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
6461	bool base_is_valid  = !(vmx_instruction_info & (1u << 27));
6462
6463	if (is_reg) {
6464		kvm_queue_exception(vcpu, UD_VECTOR);
6465		return 1;
6466	}
6467
6468	/* Addr = segment_base + offset */
6469	/* offset = base + [index * scale] + displacement */
6470	*ret = vmx_get_segment_base(vcpu, seg_reg);
6471	if (base_is_valid)
6472		*ret += kvm_register_read(vcpu, base_reg);
6473	if (index_is_valid)
		*ret += kvm_register_read(vcpu, index_reg) << scaling;
6475	*ret += exit_qualification; /* holds the displacement */
6476
6477	if (addr_size == 1) /* 32 bit */
6478		*ret &= 0xffffffff;
6479
6480	/*
6481	 * TODO: throw #GP (and return 1) in various cases that the VM*
6482	 * instructions require it - e.g., offset beyond segment limit,
6483	 * unusable or unreadable/unwritable segment, non-canonical 64-bit
6484	 * address, and so on. Currently these are not checked.
6485	 */
6486	return 0;
6487}
6488
6489/*
 * This function checks the given vmpointer, including:
 * - that it is 4KB aligned
 * - that no bits beyond the physical address width are set
 * Returns 0 on success, or else 1.
6494 * (Intel SDM Section 30.3)
6495 */
6496static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
6497				  gpa_t *vmpointer)
6498{
6499	gva_t gva;
6500	gpa_t vmptr;
6501	struct x86_exception e;
6502	struct page *page;
6503	struct vcpu_vmx *vmx = to_vmx(vcpu);
6504	int maxphyaddr = cpuid_maxphyaddr(vcpu);
6505
6506	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
6507			vmcs_read32(VMX_INSTRUCTION_INFO), &gva))
6508		return 1;
6509
6510	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vmptr,
6511				sizeof(vmptr), &e)) {
6512		kvm_inject_page_fault(vcpu, &e);
6513		return 1;
6514	}
6515
6516	switch (exit_reason) {
6517	case EXIT_REASON_VMON:
6518		/*
		 * SDM 3: 24.11.5
		 * The first 4 bytes of the VMXON region contain the supported
		 * VMCS revision identifier.
		 *
		 * Note: IA32_VMX_BASIC[48], which would replace the physical
		 * address width with 32, is never set to 1 for the nested
		 * case.
6527		 */
6528		if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
6529			nested_vmx_failInvalid(vcpu);
6530			skip_emulated_instruction(vcpu);
6531			return 1;
6532		}
6533
6534		page = nested_get_page(vcpu, vmptr);
6535		if (page == NULL ||
6536		    *(u32 *)kmap(page) != VMCS12_REVISION) {
6537			nested_vmx_failInvalid(vcpu);
6538			kunmap(page);
6539			skip_emulated_instruction(vcpu);
6540			return 1;
6541		}
6542		kunmap(page);
6543		vmx->nested.vmxon_ptr = vmptr;
6544		break;
6545	case EXIT_REASON_VMCLEAR:
6546		if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
6547			nested_vmx_failValid(vcpu,
6548					     VMXERR_VMCLEAR_INVALID_ADDRESS);
6549			skip_emulated_instruction(vcpu);
6550			return 1;
6551		}
6552
6553		if (vmptr == vmx->nested.vmxon_ptr) {
6554			nested_vmx_failValid(vcpu,
6555					     VMXERR_VMCLEAR_VMXON_POINTER);
6556			skip_emulated_instruction(vcpu);
6557			return 1;
6558		}
6559		break;
6560	case EXIT_REASON_VMPTRLD:
6561		if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) {
6562			nested_vmx_failValid(vcpu,
6563					     VMXERR_VMPTRLD_INVALID_ADDRESS);
6564			skip_emulated_instruction(vcpu);
6565			return 1;
6566		}
6567
6568		if (vmptr == vmx->nested.vmxon_ptr) {
6569			nested_vmx_failValid(vcpu,
6570					     VMXERR_VMCLEAR_VMXON_POINTER);
6571			skip_emulated_instruction(vcpu);
6572			return 1;
6573		}
6574		break;
6575	default:
6576		return 1; /* shouldn't happen */
6577	}
6578
6579	if (vmpointer)
6580		*vmpointer = vmptr;
6581	return 0;
6582}
6583
6584/*
6585 * Emulate the VMXON instruction.
 * Currently, we validate and remember the argument to VMXON (the so-called
 * "VMXON pointer"), but do not need to store anything in that
 * guest-allocated memory region itself; we just record that VMX is active.
 * VMCLEAR and VMPTRLD verify that their argument is different from the
 * VMXON pointer, as the spec requires.
6591 */
6592static int handle_vmon(struct kvm_vcpu *vcpu)
6593{
6594	struct kvm_segment cs;
6595	struct vcpu_vmx *vmx = to_vmx(vcpu);
6596	struct vmcs *shadow_vmcs;
6597	const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
6598		| FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
6599
6600	/* The Intel VMX Instruction Reference lists a bunch of bits that
6601	 * are prerequisite to running VMXON, most notably cr4.VMXE must be
6602	 * set to 1 (see vmx_set_cr4() for when we allow the guest to set this).
6603	 * Otherwise, we should fail with #UD. We test these now:
6604	 */
6605	if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) ||
6606	    !kvm_read_cr0_bits(vcpu, X86_CR0_PE) ||
6607	    (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
6608		kvm_queue_exception(vcpu, UD_VECTOR);
6609		return 1;
6610	}
6611
6612	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
6613	if (is_long_mode(vcpu) && !cs.l) {
6614		kvm_queue_exception(vcpu, UD_VECTOR);
6615		return 1;
6616	}
6617
6618	if (vmx_get_cpl(vcpu)) {
6619		kvm_inject_gp(vcpu, 0);
6620		return 1;
6621	}
6622
6623	if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL))
6624		return 1;
6625
6626	if (vmx->nested.vmxon) {
6627		nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
6628		skip_emulated_instruction(vcpu);
6629		return 1;
6630	}
6631
6632	if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
6633			!= VMXON_NEEDED_FEATURES) {
6634		kvm_inject_gp(vcpu, 0);
6635		return 1;
6636	}
6637
6638	if (enable_shadow_vmcs) {
6639		shadow_vmcs = alloc_vmcs();
6640		if (!shadow_vmcs)
6641			return -ENOMEM;
6642		/* mark vmcs as shadow */
6643		shadow_vmcs->revision_id |= (1u << 31);
6644		/* init shadow vmcs */
6645		vmcs_clear(shadow_vmcs);
6646		vmx->nested.current_shadow_vmcs = shadow_vmcs;
6647	}
6648
6649	INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
6650	vmx->nested.vmcs02_num = 0;
6651
6652	hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
6653		     HRTIMER_MODE_REL);
6654	vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
6655
6656	vmx->nested.vmxon = true;
6657
6658	skip_emulated_instruction(vcpu);
6659	nested_vmx_succeed(vcpu);
6660	return 1;
6661}
6662
6663/*
6664 * Intel's VMX Instruction Reference specifies a common set of prerequisites
6665 * for running VMX instructions (except VMXON, whose prerequisites are
6666 * slightly different). It also specifies what exception to inject otherwise.
6667 */
6668static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
6669{
6670	struct kvm_segment cs;
6671	struct vcpu_vmx *vmx = to_vmx(vcpu);
6672
6673	if (!vmx->nested.vmxon) {
6674		kvm_queue_exception(vcpu, UD_VECTOR);
6675		return 0;
6676	}
6677
6678	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
6679	if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) ||
6680	    (is_long_mode(vcpu) && !cs.l)) {
6681		kvm_queue_exception(vcpu, UD_VECTOR);
6682		return 0;
6683	}
6684
6685	if (vmx_get_cpl(vcpu)) {
6686		kvm_inject_gp(vcpu, 0);
6687		return 0;
6688	}
6689
6690	return 1;
6691}
6692
6693static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
6694{
6695	u32 exec_control;
6696	if (vmx->nested.current_vmptr == -1ull)
6697		return;
6698
6699	/* current_vmptr and current_vmcs12 are always set/reset together */
6700	if (WARN_ON(vmx->nested.current_vmcs12 == NULL))
6701		return;
6702
6703	if (enable_shadow_vmcs) {
		/*
		 * Copy to memory all shadowed fields in case they were
		 * modified.
		 */
6706		copy_shadow_to_vmcs12(vmx);
6707		vmx->nested.sync_shadow_vmcs = false;
6708		exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
6709		exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
6710		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
6711		vmcs_write64(VMCS_LINK_POINTER, -1ull);
6712	}
6713	vmx->nested.posted_intr_nv = -1;
6714	kunmap(vmx->nested.current_vmcs12_page);
6715	nested_release_page(vmx->nested.current_vmcs12_page);
6716	vmx->nested.current_vmptr = -1ull;
6717	vmx->nested.current_vmcs12 = NULL;
6718}
6719
6720/*
6721 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
6722 * just stops using VMX.
6723 */
6724static void free_nested(struct vcpu_vmx *vmx)
6725{
6726	if (!vmx->nested.vmxon)
6727		return;
6728
6729	vmx->nested.vmxon = false;
6730	nested_release_vmcs12(vmx);
6731	if (enable_shadow_vmcs)
6732		free_vmcs(vmx->nested.current_shadow_vmcs);
6733	/* Unpin physical memory we referred to in current vmcs02 */
6734	if (vmx->nested.apic_access_page) {
6735		nested_release_page(vmx->nested.apic_access_page);
6736		vmx->nested.apic_access_page = NULL;
6737	}
6738	if (vmx->nested.virtual_apic_page) {
6739		nested_release_page(vmx->nested.virtual_apic_page);
6740		vmx->nested.virtual_apic_page = NULL;
6741	}
6742	if (vmx->nested.pi_desc_page) {
6743		kunmap(vmx->nested.pi_desc_page);
6744		nested_release_page(vmx->nested.pi_desc_page);
6745		vmx->nested.pi_desc_page = NULL;
6746		vmx->nested.pi_desc = NULL;
6747	}
6748
6749	nested_free_all_saved_vmcss(vmx);
6750}
6751
6752/* Emulate the VMXOFF instruction */
6753static int handle_vmoff(struct kvm_vcpu *vcpu)
6754{
6755	if (!nested_vmx_check_permission(vcpu))
6756		return 1;
6757	free_nested(to_vmx(vcpu));
6758	skip_emulated_instruction(vcpu);
6759	nested_vmx_succeed(vcpu);
6760	return 1;
6761}
6762
6763/* Emulate the VMCLEAR instruction */
6764static int handle_vmclear(struct kvm_vcpu *vcpu)
6765{
6766	struct vcpu_vmx *vmx = to_vmx(vcpu);
6767	gpa_t vmptr;
6768	struct vmcs12 *vmcs12;
6769	struct page *page;
6770
6771	if (!nested_vmx_check_permission(vcpu))
6772		return 1;
6773
6774	if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr))
6775		return 1;
6776
6777	if (vmptr == vmx->nested.current_vmptr)
6778		nested_release_vmcs12(vmx);
6779
6780	page = nested_get_page(vcpu, vmptr);
6781	if (page == NULL) {
6782		/*
6783		 * For accurate processor emulation, VMCLEAR beyond available
6784		 * physical memory should do nothing at all. However, it is
6785		 * possible that a nested vmx bug, not a guest hypervisor bug,
6786		 * resulted in this case, so let's shut down before doing any
6787		 * more damage:
6788		 */
6789		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
6790		return 1;
6791	}
6792	vmcs12 = kmap(page);
6793	vmcs12->launch_state = 0;
6794	kunmap(page);
6795	nested_release_page(page);
6796
6797	nested_free_vmcs02(vmx, vmptr);
6798
6799	skip_emulated_instruction(vcpu);
6800	nested_vmx_succeed(vcpu);
6801	return 1;
6802}
6803
6804static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
6805
6806/* Emulate the VMLAUNCH instruction */
6807static int handle_vmlaunch(struct kvm_vcpu *vcpu)
6808{
6809	return nested_vmx_run(vcpu, true);
6810}
6811
6812/* Emulate the VMRESUME instruction */
6813static int handle_vmresume(struct kvm_vcpu *vcpu)
6814{
6816	return nested_vmx_run(vcpu, false);
6817}
6818
6819enum vmcs_field_type {
6820	VMCS_FIELD_TYPE_U16 = 0,
6821	VMCS_FIELD_TYPE_U64 = 1,
6822	VMCS_FIELD_TYPE_U32 = 2,
6823	VMCS_FIELD_TYPE_NATURAL_WIDTH = 3
6824};
6825
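/*
 * VMCS field encodings keep the field width in bits 14:13 and use bit 0 to
 * select the high half of a 64-bit field, which is accessed as 32 bits.
 */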
6826static inline int vmcs_field_type(unsigned long field)
6827{
6828	if (0x1 & field)	/* the *_HIGH fields are all 32 bit */
6829		return VMCS_FIELD_TYPE_U32;
	return (field >> 13) & 0x3;
6831}
6832
6833static inline int vmcs_field_readonly(unsigned long field)
6834{
6835	return (((field >> 10) & 0x3) == 1);
6836}
6837
6838/*
6839 * Read a vmcs12 field. Since these can have varying lengths and we return
6840 * one type, we chose the biggest type (u64) and zero-extend the return value
6841 * to that size. Note that the caller, handle_vmread, might need to use only
6842 * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
6843 * 64-bit fields are to be returned).
6844 */
6845static inline int vmcs12_read_any(struct kvm_vcpu *vcpu,
6846				  unsigned long field, u64 *ret)
6847{
6848	short offset = vmcs_field_to_offset(field);
6849	char *p;
6850
6851	if (offset < 0)
6852		return offset;
6853
6854	p = ((char *)(get_vmcs12(vcpu))) + offset;
6855
6856	switch (vmcs_field_type(field)) {
6857	case VMCS_FIELD_TYPE_NATURAL_WIDTH:
6858		*ret = *((natural_width *)p);
6859		return 0;
6860	case VMCS_FIELD_TYPE_U16:
6861		*ret = *((u16 *)p);
6862		return 0;
6863	case VMCS_FIELD_TYPE_U32:
6864		*ret = *((u32 *)p);
6865		return 0;
6866	case VMCS_FIELD_TYPE_U64:
6867		*ret = *((u64 *)p);
6868		return 0;
6869	default:
6870		WARN_ON(1);
6871		return -ENOENT;
6872	}
6873}
6874
6875
static inline int vmcs12_write_any(struct kvm_vcpu *vcpu,
				   unsigned long field, u64 field_value)
{
	short offset = vmcs_field_to_offset(field);
	char *p;

	if (offset < 0)
		return offset;

	p = ((char *)get_vmcs12(vcpu)) + offset;
6882
6883	switch (vmcs_field_type(field)) {
6884	case VMCS_FIELD_TYPE_U16:
6885		*(u16 *)p = field_value;
6886		return 0;
6887	case VMCS_FIELD_TYPE_U32:
6888		*(u32 *)p = field_value;
6889		return 0;
6890	case VMCS_FIELD_TYPE_U64:
6891		*(u64 *)p = field_value;
6892		return 0;
6893	case VMCS_FIELD_TYPE_NATURAL_WIDTH:
6894		*(natural_width *)p = field_value;
6895		return 0;
6896	default:
6897		WARN_ON(1);
6898		return -ENOENT;
6899	}
}
6902
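/*
 * Copy the writable shadow-VMCS fields, which L1 may have changed with
 * VMWRITE, back into the in-memory vmcs12.
 */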
6903static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
6904{
6905	int i;
6906	unsigned long field;
6907	u64 field_value;
6908	struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
6909	const unsigned long *fields = shadow_read_write_fields;
6910	const int num_fields = max_shadow_read_write_fields;
6911
6912	preempt_disable();
6913
6914	vmcs_load(shadow_vmcs);
6915
6916	for (i = 0; i < num_fields; i++) {
6917		field = fields[i];
6918		switch (vmcs_field_type(field)) {
6919		case VMCS_FIELD_TYPE_U16:
6920			field_value = vmcs_read16(field);
6921			break;
6922		case VMCS_FIELD_TYPE_U32:
6923			field_value = vmcs_read32(field);
6924			break;
6925		case VMCS_FIELD_TYPE_U64:
6926			field_value = vmcs_read64(field);
6927			break;
6928		case VMCS_FIELD_TYPE_NATURAL_WIDTH:
6929			field_value = vmcs_readl(field);
6930			break;
6931		default:
6932			WARN_ON(1);
6933			continue;
6934		}
6935		vmcs12_write_any(&vmx->vcpu, field, field_value);
6936	}
6937
6938	vmcs_clear(shadow_vmcs);
6939	vmcs_load(vmx->loaded_vmcs->vmcs);
6940
6941	preempt_enable();
6942}
6943
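/*
 * Copy the in-memory vmcs12 into the shadow VMCS, so that L1's VMREAD and
 * VMWRITE of shadowed fields can be satisfied without causing an exit.
 */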
6944static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
6945{
6946	const unsigned long *fields[] = {
6947		shadow_read_write_fields,
6948		shadow_read_only_fields
6949	};
6950	const int max_fields[] = {
6951		max_shadow_read_write_fields,
6952		max_shadow_read_only_fields
6953	};
6954	int i, q;
6955	unsigned long field;
6956	u64 field_value = 0;
6957	struct vmcs *shadow_vmcs = vmx->nested.current_shadow_vmcs;
6958
6959	vmcs_load(shadow_vmcs);
6960
6961	for (q = 0; q < ARRAY_SIZE(fields); q++) {
6962		for (i = 0; i < max_fields[q]; i++) {
6963			field = fields[q][i];
6964			vmcs12_read_any(&vmx->vcpu, field, &field_value);
6965
6966			switch (vmcs_field_type(field)) {
6967			case VMCS_FIELD_TYPE_U16:
6968				vmcs_write16(field, (u16)field_value);
6969				break;
6970			case VMCS_FIELD_TYPE_U32:
6971				vmcs_write32(field, (u32)field_value);
6972				break;
6973			case VMCS_FIELD_TYPE_U64:
6974				vmcs_write64(field, (u64)field_value);
6975				break;
6976			case VMCS_FIELD_TYPE_NATURAL_WIDTH:
6977				vmcs_writel(field, (long)field_value);
6978				break;
6979			default:
6980				WARN_ON(1);
6981				break;
6982			}
6983		}
6984	}
6985
6986	vmcs_clear(shadow_vmcs);
6987	vmcs_load(vmx->loaded_vmcs->vmcs);
6988}
6989
6990/*
6991 * VMX instructions which assume a current vmcs12 (i.e., that VMPTRLD was
6992 * used before) all generate the same failure when it is missing.
6993 */
6994static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu)
6995{
6996	struct vcpu_vmx *vmx = to_vmx(vcpu);
6997	if (vmx->nested.current_vmptr == -1ull) {
6998		nested_vmx_failInvalid(vcpu);
6999		skip_emulated_instruction(vcpu);
7000		return 0;
7001	}
7002	return 1;
7003}
7004
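/* Emulate the VMREAD instruction */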
7005static int handle_vmread(struct kvm_vcpu *vcpu)
7006{
7007	unsigned long field;
7008	u64 field_value;
7009	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7010	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
7011	gva_t gva = 0;
7012
7013	if (!nested_vmx_check_permission(vcpu) ||
7014	    !nested_vmx_check_vmcs12(vcpu))
7015		return 1;
7016
7017	/* Decode instruction info and find the field to read */
7018	field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
7019	/* Read the field, zero-extended to a u64 field_value */
7020	if (vmcs12_read_any(vcpu, field, &field_value) < 0) {
7021		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
7022		skip_emulated_instruction(vcpu);
7023		return 1;
7024	}
7025	/*
7026	 * Now copy part of this value to register or memory, as requested.
7027	 * Note that the number of bits actually copied is 32 or 64 depending
7028	 * on the guest's mode (32 or 64 bit), not on the given field's length.
7029	 */
7030	if (vmx_instruction_info & (1u << 10)) {
7031		kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
7032			field_value);
7033	} else {
7034		if (get_vmx_mem_address(vcpu, exit_qualification,
7035				vmx_instruction_info, &gva))
7036			return 1;
7037		/* _system ok, as nested_vmx_check_permission verified cpl=0 */
7038		kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva,
7039			     &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL);
7040	}
7041
7042	nested_vmx_succeed(vcpu);
7043	skip_emulated_instruction(vcpu);
7044	return 1;
7045}
7046
7047
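/* Emulate the VMWRITE instruction */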
7048static int handle_vmwrite(struct kvm_vcpu *vcpu)
7049{
7050	unsigned long field;
7051	gva_t gva;
7052	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7053	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
7054	/* The value to write might be 32 or 64 bits, depending on L1's long
7055	 * mode, and eventually we need to write that into a field of several
7056	 * possible lengths. The code below first zero-extends the value to 64
	 * bit (field_value), and then copies only the appropriate number of
7058	 * bits into the vmcs12 field.
7059	 */
7060	u64 field_value = 0;
7061	struct x86_exception e;
7062
7063	if (!nested_vmx_check_permission(vcpu) ||
7064	    !nested_vmx_check_vmcs12(vcpu))
7065		return 1;
7066
7067	if (vmx_instruction_info & (1u << 10))
7068		field_value = kvm_register_readl(vcpu,
7069			(((vmx_instruction_info) >> 3) & 0xf));
7070	else {
7071		if (get_vmx_mem_address(vcpu, exit_qualification,
7072				vmx_instruction_info, &gva))
7073			return 1;
7074		if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva,
7075			   &field_value, (is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
7076			kvm_inject_page_fault(vcpu, &e);
7077			return 1;
7078		}
7079	}
7080
7082	field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
7083	if (vmcs_field_readonly(field)) {
7084		nested_vmx_failValid(vcpu,
7085			VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
7086		skip_emulated_instruction(vcpu);
7087		return 1;
7088	}
7089
7090	if (vmcs12_write_any(vcpu, field, field_value) < 0) {
7091		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
7092		skip_emulated_instruction(vcpu);
7093		return 1;
7094	}
7095
7096	nested_vmx_succeed(vcpu);
7097	skip_emulated_instruction(vcpu);
7098	return 1;
7099}
7100
7101/* Emulate the VMPTRLD instruction */
7102static int handle_vmptrld(struct kvm_vcpu *vcpu)
7103{
7104	struct vcpu_vmx *vmx = to_vmx(vcpu);
7105	gpa_t vmptr;
7106	u32 exec_control;
7107
7108	if (!nested_vmx_check_permission(vcpu))
7109		return 1;
7110
7111	if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMPTRLD, &vmptr))
7112		return 1;
7113
7114	if (vmx->nested.current_vmptr != vmptr) {
7115		struct vmcs12 *new_vmcs12;
7116		struct page *page;
7117		page = nested_get_page(vcpu, vmptr);
7118		if (page == NULL) {
7119			nested_vmx_failInvalid(vcpu);
7120			skip_emulated_instruction(vcpu);
7121			return 1;
7122		}
7123		new_vmcs12 = kmap(page);
7124		if (new_vmcs12->revision_id != VMCS12_REVISION) {
7125			kunmap(page);
7126			nested_release_page_clean(page);
7127			nested_vmx_failValid(vcpu,
7128				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
7129			skip_emulated_instruction(vcpu);
7130			return 1;
7131		}
7132
7133		nested_release_vmcs12(vmx);
7134		vmx->nested.current_vmptr = vmptr;
7135		vmx->nested.current_vmcs12 = new_vmcs12;
7136		vmx->nested.current_vmcs12_page = page;
7137		if (enable_shadow_vmcs) {
7138			exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
7139			exec_control |= SECONDARY_EXEC_SHADOW_VMCS;
7140			vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
7141			vmcs_write64(VMCS_LINK_POINTER,
7142				     __pa(vmx->nested.current_shadow_vmcs));
7143			vmx->nested.sync_shadow_vmcs = true;
7144		}
7145	}
7146
7147	nested_vmx_succeed(vcpu);
7148	skip_emulated_instruction(vcpu);
7149	return 1;
7150}
7151
7152/* Emulate the VMPTRST instruction */
7153static int handle_vmptrst(struct kvm_vcpu *vcpu)
7154{
7155	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7156	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
7157	gva_t vmcs_gva;
7158	struct x86_exception e;
7159
7160	if (!nested_vmx_check_permission(vcpu))
7161		return 1;
7162
7163	if (get_vmx_mem_address(vcpu, exit_qualification,
7164			vmx_instruction_info, &vmcs_gva))
7165		return 1;
7166	/* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */
7167	if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva,
7168				 (void *)&to_vmx(vcpu)->nested.current_vmptr,
7169				 sizeof(u64), &e)) {
7170		kvm_inject_page_fault(vcpu, &e);
7171		return 1;
7172	}
7173	nested_vmx_succeed(vcpu);
7174	skip_emulated_instruction(vcpu);
7175	return 1;
7176}
7177
7178/* Emulate the INVEPT instruction */
7179static int handle_invept(struct kvm_vcpu *vcpu)
7180{
7181	struct vcpu_vmx *vmx = to_vmx(vcpu);
7182	u32 vmx_instruction_info, types;
7183	unsigned long type;
7184	gva_t gva;
7185	struct x86_exception e;
7186	struct {
7187		u64 eptp, gpa;
7188	} operand;
7189
7190	if (!(vmx->nested.nested_vmx_secondary_ctls_high &
7191	      SECONDARY_EXEC_ENABLE_EPT) ||
7192	    !(vmx->nested.nested_vmx_ept_caps & VMX_EPT_INVEPT_BIT)) {
7193		kvm_queue_exception(vcpu, UD_VECTOR);
7194		return 1;
7195	}
7196
7197	if (!nested_vmx_check_permission(vcpu))
7198		return 1;
7199
7200	if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) {
7201		kvm_queue_exception(vcpu, UD_VECTOR);
7202		return 1;
7203	}
7204
7205	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
7206	type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
7207
7208	types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
7209
7210	if (!(types & (1UL << type))) {
7211		nested_vmx_failValid(vcpu,
7212				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
7213		skip_emulated_instruction(vcpu);
7214		return 1;
7215	}
7216
7217	/* According to the Intel VMX instruction reference, the memory
7218	 * operand is read even if it isn't needed (e.g., for type==global)
7219	 */
7220	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
7221			vmx_instruction_info, &gva))
7222		return 1;
7223	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &operand,
7224				sizeof(operand), &e)) {
7225		kvm_inject_page_fault(vcpu, &e);
7226		return 1;
7227	}
7228
7229	switch (type) {
7230	case VMX_EPT_EXTENT_GLOBAL:
7231		kvm_mmu_sync_roots(vcpu);
7232		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
7233		nested_vmx_succeed(vcpu);
7234		break;
7235	default:
7236		/* Trap single context invalidation invept calls */
7237		BUG_ON(1);
7238		break;
7239	}
7240
7241	skip_emulated_instruction(vcpu);
7242	return 1;
7243}
7244
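/*
 * Emulate the INVVPID instruction.  INVVPID is not supported for nested
 * guests, so raise #UD.
 */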
7245static int handle_invvpid(struct kvm_vcpu *vcpu)
7246{
7247	kvm_queue_exception(vcpu, UD_VECTOR);
7248	return 1;
7249}
7250
7251static int handle_pml_full(struct kvm_vcpu *vcpu)
7252{
7253	unsigned long exit_qualification;
7254
7255	trace_kvm_pml_full(vcpu->vcpu_id);
7256
7257	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7258
7259	/*
7260	 * PML buffer FULL happened while executing iret from NMI,
7261	 * "blocked by NMI" bit has to be set before next VM entry.
7262	 */
7263	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
7264			cpu_has_virtual_nmis() &&
7265			(exit_qualification & INTR_INFO_UNBLOCK_NMI))
7266		vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
7267				GUEST_INTR_STATE_NMI);
7268
7269	/*
	 * The PML buffer was already flushed at the beginning of the VMEXIT;
	 * there is nothing to do here, and no userspace involvement is needed
	 * for PML.
7272	 */
7273	return 1;
7274}
7275
7276/*
7277 * The exit handlers return 1 if the exit was handled fully and guest execution
7278 * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
7279 * to be done to userspace and return 0.
7280 */
7281static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
7282	[EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
7283	[EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
7284	[EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
7285	[EXIT_REASON_NMI_WINDOW]	      = handle_nmi_window,
7286	[EXIT_REASON_IO_INSTRUCTION]          = handle_io,
7287	[EXIT_REASON_CR_ACCESS]               = handle_cr,
7288	[EXIT_REASON_DR_ACCESS]               = handle_dr,
7289	[EXIT_REASON_CPUID]                   = handle_cpuid,
7290	[EXIT_REASON_MSR_READ]                = handle_rdmsr,
7291	[EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
7292	[EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
7293	[EXIT_REASON_HLT]                     = handle_halt,
7294	[EXIT_REASON_INVD]		      = handle_invd,
7295	[EXIT_REASON_INVLPG]		      = handle_invlpg,
7296	[EXIT_REASON_RDPMC]                   = handle_rdpmc,
7297	[EXIT_REASON_VMCALL]                  = handle_vmcall,
7298	[EXIT_REASON_VMCLEAR]	              = handle_vmclear,
7299	[EXIT_REASON_VMLAUNCH]                = handle_vmlaunch,
7300	[EXIT_REASON_VMPTRLD]                 = handle_vmptrld,
7301	[EXIT_REASON_VMPTRST]                 = handle_vmptrst,
7302	[EXIT_REASON_VMREAD]                  = handle_vmread,
7303	[EXIT_REASON_VMRESUME]                = handle_vmresume,
7304	[EXIT_REASON_VMWRITE]                 = handle_vmwrite,
7305	[EXIT_REASON_VMOFF]                   = handle_vmoff,
7306	[EXIT_REASON_VMON]                    = handle_vmon,
7307	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
7308	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
7309	[EXIT_REASON_APIC_WRITE]              = handle_apic_write,
7310	[EXIT_REASON_EOI_INDUCED]             = handle_apic_eoi_induced,
7311	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
7312	[EXIT_REASON_XSETBV]                  = handle_xsetbv,
7313	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
7314	[EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
7315	[EXIT_REASON_EPT_VIOLATION]	      = handle_ept_violation,
7316	[EXIT_REASON_EPT_MISCONFIG]           = handle_ept_misconfig,
7317	[EXIT_REASON_PAUSE_INSTRUCTION]       = handle_pause,
7318	[EXIT_REASON_MWAIT_INSTRUCTION]	      = handle_mwait,
7319	[EXIT_REASON_MONITOR_INSTRUCTION]     = handle_monitor,
7320	[EXIT_REASON_INVEPT]                  = handle_invept,
7321	[EXIT_REASON_INVVPID]                 = handle_invvpid,
7322	[EXIT_REASON_XSAVES]                  = handle_xsaves,
7323	[EXIT_REASON_XRSTORS]                 = handle_xrstors,
7324	[EXIT_REASON_PML_FULL]		      = handle_pml_full,
7325};
7326
7327static const int kvm_vmx_max_exit_handlers =
7328	ARRAY_SIZE(kvm_vmx_exit_handlers);
7329
7330static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
7331				       struct vmcs12 *vmcs12)
7332{
7333	unsigned long exit_qualification;
7334	gpa_t bitmap, last_bitmap;
7335	unsigned int port;
7336	int size;
7337	u8 b;
7338
7339	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
7340		return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
7341
7342	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7343
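	/*
	 * I/O-instruction exit qualification: bits 2:0 hold the access size
	 * minus one, bits 31:16 hold the port number.
	 */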
7344	port = exit_qualification >> 16;
7345	size = (exit_qualification & 7) + 1;
7346
7347	last_bitmap = (gpa_t)-1;
7348	b = -1;
7349
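	/*
	 * Check L1's I/O bitmaps one port at a time; a wide access may span
	 * several ports and even cross from bitmap A into bitmap B.
	 */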
7350	while (size > 0) {
7351		if (port < 0x8000)
7352			bitmap = vmcs12->io_bitmap_a;
7353		else if (port < 0x10000)
7354			bitmap = vmcs12->io_bitmap_b;
7355		else
7356			return true;
7357		bitmap += (port & 0x7fff) / 8;
7358
7359		if (last_bitmap != bitmap)
7360			if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1))
7361				return true;
7362		if (b & (1 << (port & 7)))
7363			return true;
7364
7365		port++;
7366		size--;
7367		last_bitmap = bitmap;
7368	}
7369
7370	return false;
7371}
7372
/*
 * Return true if we should exit from L2 to L1 to handle an MSR access,
 * rather than handle it ourselves in L0. I.e., check whether L1 asked to
 * trap the current event (a read or write of a specific MSR) via its MSR
 * bitmap. Note that L1 may use an MSR bitmap even when L0 does not.
 */
7379static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
7380	struct vmcs12 *vmcs12, u32 exit_reason)
7381{
7382	u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
7383	gpa_t bitmap;
7384
7385	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
7386		return true;
7387
7388	/*
7389	 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
7390	 * for the four combinations of read/write and low/high MSR numbers.
7391	 * First we need to figure out which of the four to use:
7392	 */
7393	bitmap = vmcs12->msr_bitmap;
7394	if (exit_reason == EXIT_REASON_MSR_WRITE)
7395		bitmap += 2048;
7396	if (msr_index >= 0xc0000000) {
7397		msr_index -= 0xc0000000;
7398		bitmap += 1024;
7399	}
7400
7401	/* Then read the msr_index'th bit from this bitmap: */
7402	if (msr_index < 1024*8) {
7403		unsigned char b;
7404		if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1))
7405			return true;
7406		return 1 & (b >> (msr_index & 7));
7407	} else
7408		return true; /* let L1 handle the wrong parameter */
7409}
7410
/*
 * Return true if we should exit from L2 to L1 to handle a CR access exit,
 * rather than handle it ourselves in L0. I.e., check if L1 wanted to
 * intercept (via guest_host_mask etc.) the current event.
 */
7416static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
7417	struct vmcs12 *vmcs12)
7418{
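	/*
	 * CR-access exit qualification: bits 3:0 give the CR number, bits 5:4
	 * the access type, and bits 11:8 the MOV source/destination register.
	 */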
7419	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
7420	int cr = exit_qualification & 15;
7421	int reg = (exit_qualification >> 8) & 15;
7422	unsigned long val = kvm_register_readl(vcpu, reg);
7423
7424	switch ((exit_qualification >> 4) & 3) {
7425	case 0: /* mov to cr */
7426		switch (cr) {
7427		case 0:
7428			if (vmcs12->cr0_guest_host_mask &
7429			    (val ^ vmcs12->cr0_read_shadow))
7430				return true;
7431			break;
7432		case 3:
7433			if ((vmcs12->cr3_target_count >= 1 &&
7434					vmcs12->cr3_target_value0 == val) ||
7435				(vmcs12->cr3_target_count >= 2 &&
7436					vmcs12->cr3_target_value1 == val) ||
7437				(vmcs12->cr3_target_count >= 3 &&
7438					vmcs12->cr3_target_value2 == val) ||
7439				(vmcs12->cr3_target_count >= 4 &&
7440					vmcs12->cr3_target_value3 == val))
7441				return false;
7442			if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
7443				return true;
7444			break;
7445		case 4:
7446			if (vmcs12->cr4_guest_host_mask &
7447			    (vmcs12->cr4_read_shadow ^ val))
7448				return true;
7449			break;
7450		case 8:
7451			if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
7452				return true;
7453			break;
7454		}
7455		break;
7456	case 2: /* clts */
7457		if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
7458		    (vmcs12->cr0_read_shadow & X86_CR0_TS))
7459			return true;
7460		break;
7461	case 1: /* mov from cr */
7462		switch (cr) {
7463		case 3:
7464			if (vmcs12->cpu_based_vm_exec_control &
7465			    CPU_BASED_CR3_STORE_EXITING)
7466				return true;
7467			break;
7468		case 8:
7469			if (vmcs12->cpu_based_vm_exec_control &
7470			    CPU_BASED_CR8_STORE_EXITING)
7471				return true;
7472			break;
7473		}
7474		break;
7475	case 3: /* lmsw */
7476		/*
7477		 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
7478		 * cr0. Other attempted changes are ignored, with no exit.
7479		 */
7480		if (vmcs12->cr0_guest_host_mask & 0xe &
7481		    (val ^ vmcs12->cr0_read_shadow))
7482			return true;
7483		if ((vmcs12->cr0_guest_host_mask & 0x1) &&
7484		    !(vmcs12->cr0_read_shadow & 0x1) &&
7485		    (val & 0x1))
7486			return true;
7487		break;
7488	}
7489	return false;
7490}
7491
/*
 * Return true if we should exit from L2 to L1 to handle an exit, or false if
 * we should handle it ourselves in L0 (and then continue L2). Only call this
 * when in is_guest_mode (L2).
 */
7497static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
7498{
7499	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
7500	struct vcpu_vmx *vmx = to_vmx(vcpu);
7501	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7502	u32 exit_reason = vmx->exit_reason;
7503
7504	trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
7505				vmcs_readl(EXIT_QUALIFICATION),
7506				vmx->idt_vectoring_info,
7507				intr_info,
7508				vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
7509				KVM_ISA_VMX);
7510
7511	if (vmx->nested.nested_run_pending)
7512		return false;
7513
7514	if (unlikely(vmx->fail)) {
7515		pr_info_ratelimited("%s failed vm entry %x\n", __func__,
7516				    vmcs_read32(VM_INSTRUCTION_ERROR));
7517		return true;
7518	}
7519
7520	switch (exit_reason) {
7521	case EXIT_REASON_EXCEPTION_NMI:
7522		if (!is_exception(intr_info))
7523			return false;
7524		else if (is_page_fault(intr_info))
7525			return enable_ept;
7526		else if (is_no_device(intr_info) &&
7527			 !(vmcs12->guest_cr0 & X86_CR0_TS))
7528			return false;
7529		return vmcs12->exception_bitmap &
7530				(1u << (intr_info & INTR_INFO_VECTOR_MASK));
7531	case EXIT_REASON_EXTERNAL_INTERRUPT:
7532		return false;
7533	case EXIT_REASON_TRIPLE_FAULT:
7534		return true;
7535	case EXIT_REASON_PENDING_INTERRUPT:
7536		return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
7537	case EXIT_REASON_NMI_WINDOW:
7538		return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
7539	case EXIT_REASON_TASK_SWITCH:
7540		return true;
7541	case EXIT_REASON_CPUID:
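		/* CPUID leaf 0xa (architectural PMU) is handled by L0. */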
7542		if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa)
7543			return false;
7544		return true;
7545	case EXIT_REASON_HLT:
7546		return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
7547	case EXIT_REASON_INVD:
7548		return true;
7549	case EXIT_REASON_INVLPG:
7550		return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
7551	case EXIT_REASON_RDPMC:
7552		return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
7553	case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
7554		return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
7555	case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
7556	case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
7557	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
7558	case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
7559	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
7560	case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
7561		/*
7562		 * VMX instructions trap unconditionally. This allows L1 to
7563		 * emulate them for its L2 guest, i.e., allows 3-level nesting!
7564		 */
7565		return true;
7566	case EXIT_REASON_CR_ACCESS:
7567		return nested_vmx_exit_handled_cr(vcpu, vmcs12);
7568	case EXIT_REASON_DR_ACCESS:
7569		return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
7570	case EXIT_REASON_IO_INSTRUCTION:
7571		return nested_vmx_exit_handled_io(vcpu, vmcs12);
7572	case EXIT_REASON_MSR_READ:
7573	case EXIT_REASON_MSR_WRITE:
7574		return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
7575	case EXIT_REASON_INVALID_STATE:
7576		return true;
7577	case EXIT_REASON_MWAIT_INSTRUCTION:
7578		return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
7579	case EXIT_REASON_MONITOR_INSTRUCTION:
7580		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
7581	case EXIT_REASON_PAUSE_INSTRUCTION:
7582		return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
7583			nested_cpu_has2(vmcs12,
7584				SECONDARY_EXEC_PAUSE_LOOP_EXITING);
7585	case EXIT_REASON_MCE_DURING_VMENTRY:
7586		return false;
7587	case EXIT_REASON_TPR_BELOW_THRESHOLD:
7588		return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
7589	case EXIT_REASON_APIC_ACCESS:
7590		return nested_cpu_has2(vmcs12,
7591			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
7592	case EXIT_REASON_APIC_WRITE:
7593	case EXIT_REASON_EOI_INDUCED:
7594		/* apic_write and eoi_induced should exit unconditionally. */
7595		return true;
7596	case EXIT_REASON_EPT_VIOLATION:
		/*
		 * L0 always deals with the EPT violation. If nested EPT is
		 * used, and the nested mmu code discovers that the address is
		 * missing from the guest EPT table (EPT12), the EPT violation
		 * will be injected with nested_ept_inject_page_fault().
		 */
7603		return false;
7604	case EXIT_REASON_EPT_MISCONFIG:
		/*
		 * L2 never uses L1's EPT directly, but rather L0's own EPT
		 * table (shadow on EPT) or a merged EPT table that L0 built
		 * (EPT on EPT). So any problem with the structure of the
		 * table is L0's fault.
		 */
7611		return false;
7612	case EXIT_REASON_WBINVD:
7613		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
7614	case EXIT_REASON_XSETBV:
7615		return true;
7616	case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
		/*
		 * This should never happen, since it is not possible to
		 * set XSS to a non-zero value in either L1 or L2.
		 * If it were possible, XSS would have to be checked against
		 * the XSS exit bitmap in vmcs12.
		 */
7623		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
7624	default:
7625		return true;
7626	}
7627}
7628
7629static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
7630{
7631	*info1 = vmcs_readl(EXIT_QUALIFICATION);
7632	*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
7633}
7634
7635static int vmx_enable_pml(struct vcpu_vmx *vmx)
7636{
7637	struct page *pml_pg;
7638	u32 exec_control;
7639
7640	pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
7641	if (!pml_pg)
7642		return -ENOMEM;
7643
7644	vmx->pml_pg = pml_pg;
7645
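	/*
	 * The PML index starts at the last entry; hardware decrements it as
	 * GPAs are logged, so PML_ENTITY_NUM - 1 means the buffer is empty.
	 */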
7646	vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
7647	vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
7648
7649	exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
7650	exec_control |= SECONDARY_EXEC_ENABLE_PML;
7651	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
7652
7653	return 0;
7654}
7655
7656static void vmx_disable_pml(struct vcpu_vmx *vmx)
7657{
7658	u32 exec_control;
7659
7660	ASSERT(vmx->pml_pg);
7661	__free_page(vmx->pml_pg);
7662	vmx->pml_pg = NULL;
7663
7664	exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
7665	exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
7666	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
7667}
7668
7669static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx)
7670{
7671	struct kvm *kvm = vmx->vcpu.kvm;
7672	u64 *pml_buf;
7673	u16 pml_idx;
7674
7675	pml_idx = vmcs_read16(GUEST_PML_INDEX);
7676
7677	/* Do nothing if PML buffer is empty */
7678	if (pml_idx == (PML_ENTITY_NUM - 1))
7679		return;
7680
7681	/* PML index always points to next available PML buffer entity */
7682	if (pml_idx >= PML_ENTITY_NUM)
7683		pml_idx = 0;
7684	else
7685		pml_idx++;
7686
7687	pml_buf = page_address(vmx->pml_pg);
7688	for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
7689		u64 gpa;
7690
7691		gpa = pml_buf[pml_idx];
7692		WARN_ON(gpa & (PAGE_SIZE - 1));
7693		mark_page_dirty(kvm, gpa >> PAGE_SHIFT);
7694	}
7695
7696	/* reset PML index */
7697	vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
7698}
7699
7700/*
7701 * Flush all vcpus' PML buffer and update logged GPAs to dirty_bitmap.
7702 * Called before reporting dirty_bitmap to userspace.
7703 */
7704static void kvm_flush_pml_buffers(struct kvm *kvm)
7705{
7706	int i;
7707	struct kvm_vcpu *vcpu;
	/*
	 * We only need to kick the vcpus out of guest mode here, as the PML
	 * buffer is flushed at the beginning of every VM exit; only vcpus
	 * currently running in guest mode can have unflushed GPAs in their
	 * PML buffers.
	 */
7714	kvm_for_each_vcpu(i, vcpu, kvm)
7715		kvm_vcpu_kick(vcpu);
7716}
7717
7718/*
7719 * The guest has exited.  See if we can fix it or if we need userspace
7720 * assistance.
7721 */
7722static int vmx_handle_exit(struct kvm_vcpu *vcpu)
7723{
7724	struct vcpu_vmx *vmx = to_vmx(vcpu);
7725	u32 exit_reason = vmx->exit_reason;
7726	u32 vectoring_info = vmx->idt_vectoring_info;
7727
	/*
	 * Flush the logged GPAs out of the PML buffer to keep dirty_bitmap
	 * up to date. A further benefit: before querying dirty_bitmap,
	 * kvm_vm_ioctl_get_dirty_log only needs to kick all vcpus out of
	 * guest mode, since a vcpu in root mode must already have had its
	 * PML buffer flushed.
	 */
7735	if (enable_pml)
7736		vmx_flush_pml_buffer(vmx);
7737
7738	/* If guest state is invalid, start emulating */
7739	if (vmx->emulation_required)
7740		return handle_invalid_guest_state(vcpu);
7741
7742	if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
7743		nested_vmx_vmexit(vcpu, exit_reason,
7744				  vmcs_read32(VM_EXIT_INTR_INFO),
7745				  vmcs_readl(EXIT_QUALIFICATION));
7746		return 1;
7747	}
7748
7749	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
7750		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
7751		vcpu->run->fail_entry.hardware_entry_failure_reason
7752			= exit_reason;
7753		return 0;
7754	}
7755
7756	if (unlikely(vmx->fail)) {
7757		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
7758		vcpu->run->fail_entry.hardware_entry_failure_reason
7759			= vmcs_read32(VM_INSTRUCTION_ERROR);
7760		return 0;
7761	}
7762
	/*
	 * Note:
	 * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it was caused by
	 * an event delivery, since that indicates the guest was accessing
	 * MMIO during delivery. The VM exit would be triggered again after
	 * returning to the guest, causing an infinite loop.
	 */
7770	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
7771			(exit_reason != EXIT_REASON_EXCEPTION_NMI &&
7772			exit_reason != EXIT_REASON_EPT_VIOLATION &&
7773			exit_reason != EXIT_REASON_TASK_SWITCH)) {
7774		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
7775		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
7776		vcpu->run->internal.ndata = 2;
7777		vcpu->run->internal.data[0] = vectoring_info;
7778		vcpu->run->internal.data[1] = exit_reason;
7779		return 0;
7780	}
7781
7782	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
7783	    !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
7784					get_vmcs12(vcpu))))) {
7785		if (vmx_interrupt_allowed(vcpu)) {
7786			vmx->soft_vnmi_blocked = 0;
7787		} else if (vmx->vnmi_blocked_time > 1000000000LL &&
7788			   vcpu->arch.nmi_pending) {
			/*
			 * This CPU doesn't help us find the end of an
			 * NMI-blocked window if the guest runs with IRQs
			 * disabled. So we pull the trigger after 1 s of
			 * futile waiting, but inform the user about it.
			 */
7795			printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
7796			       "state on VCPU %d after 1 s timeout\n",
7797			       __func__, vcpu->vcpu_id);
7798			vmx->soft_vnmi_blocked = 0;
7799		}
7800	}
7801
7802	if (exit_reason < kvm_vmx_max_exit_handlers
7803	    && kvm_vmx_exit_handlers[exit_reason])
7804		return kvm_vmx_exit_handlers[exit_reason](vcpu);
7805	else {
7806		WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason);
7807		kvm_queue_exception(vcpu, UD_VECTOR);
7808		return 1;
7809	}
7810}
7811
7812static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
7813{
7814	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7815
7816	if (is_guest_mode(vcpu) &&
7817		nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
7818		return;
7819
7820	if (irr == -1 || tpr < irr) {
7821		vmcs_write32(TPR_THRESHOLD, 0);
7822		return;
7823	}
7824
7825	vmcs_write32(TPR_THRESHOLD, irr);
7826}
7827
7828static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
7829{
7830	u32 sec_exec_control;
7831
	/*
	 * There is no point in enabling virtualized x2APIC mode without
	 * APICv enabled as well.
	 */
7836	if (!cpu_has_vmx_virtualize_x2apic_mode() ||
7837				!vmx_vm_has_apicv(vcpu->kvm))
7838		return;
7839
7840	if (!vm_need_tpr_shadow(vcpu->kvm))
7841		return;
7842
7843	sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
7844
7845	if (set) {
7846		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
7847		sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
7848	} else {
7849		sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
7850		sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
7851	}
7852	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
7853
7854	vmx_set_msr_bitmap(vcpu);
7855}
7856
7857static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
7858{
7859	struct vcpu_vmx *vmx = to_vmx(vcpu);
7860
7861	/*
7862	 * Currently we do not handle the nested case where L2 has an
7863	 * APIC access page of its own; that page is still pinned.
7864	 * Hence, we skip the case where the VCPU is in guest mode _and_
7865	 * L1 prepared an APIC access page for L2.
7866	 *
7867	 * For the case where L1 and L2 share the same APIC access page
7868	 * (flexpriority=Y but SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES clear
7869	 * in the vmcs12), this function will only update either the vmcs01
7870	 * or the vmcs02.  If the former, the vmcs02 will be updated by
7871	 * prepare_vmcs02.  If the latter, the vmcs01 will be updated in
7872	 * the next L2->L1 exit.
7873	 */
7874	if (!is_guest_mode(vcpu) ||
7875	    !nested_cpu_has2(vmx->nested.current_vmcs12,
7876			     SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
7877		vmcs_write64(APIC_ACCESS_ADDR, hpa);
7878}
7879
7880static void vmx_hwapic_isr_update(struct kvm *kvm, int isr)
7881{
7882	u16 status;
7883	u8 old;
7884
7885	if (isr == -1)
7886		isr = 0;
7887
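	/*
	 * SVI, the highest in-service vector, lives in the high byte of
	 * GUEST_INTR_STATUS.
	 */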
7888	status = vmcs_read16(GUEST_INTR_STATUS);
7889	old = status >> 8;
7890	if (isr != old) {
7891		status &= 0xff;
7892		status |= isr << 8;
7893		vmcs_write16(GUEST_INTR_STATUS, status);
7894	}
7895}
7896
7897static void vmx_set_rvi(int vector)
7898{
7899	u16 status;
7900	u8 old;
7901
7902	if (vector == -1)
7903		vector = 0;
7904
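	/*
	 * RVI, the highest requesting virtual interrupt, lives in the low
	 * byte of GUEST_INTR_STATUS.
	 */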
7905	status = vmcs_read16(GUEST_INTR_STATUS);
7906	old = (u8)status & 0xff;
7907	if ((u8)vector != old) {
7908		status &= ~0xff;
7909		status |= (u8)vector;
7910		vmcs_write16(GUEST_INTR_STATUS, status);
7911	}
7912}
7913
7914static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
7915{
7916	if (!is_guest_mode(vcpu)) {
7917		vmx_set_rvi(max_irr);
7918		return;
7919	}
7920
7921	if (max_irr == -1)
7922		return;
7923
7924	/*
7925	 * In guest mode.  If a vmexit is needed, vmx_check_nested_events
7926	 * handles it.
7927	 */
7928	if (nested_exit_on_intr(vcpu))
7929		return;
7930
7931	/*
7932	 * Else, fall back to pre-APICv interrupt injection since L2
7933	 * is run without virtual interrupt delivery.
7934	 */
7935	if (!kvm_event_needs_reinjection(vcpu) &&
7936	    vmx_interrupt_allowed(vcpu)) {
7937		kvm_queue_interrupt(vcpu, max_irr, false);
7938		vmx_inject_irq(vcpu);
7939	}
7940}
7941
7942static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
7943{
7944	if (!vmx_vm_has_apicv(vcpu->kvm))
7945		return;
7946
7947	vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
7948	vmcs_write64(EOI_EXIT_BITMAP1, eoi_exit_bitmap[1]);
7949	vmcs_write64(EOI_EXIT_BITMAP2, eoi_exit_bitmap[2]);
7950	vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
7951}
7952
7953static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
7954{
7955	u32 exit_intr_info;
7956
7957	if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
7958	      || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI))
7959		return;
7960
7961	vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
7962	exit_intr_info = vmx->exit_intr_info;
7963
7964	/* Handle machine checks before interrupts are enabled */
7965	if (is_machine_check(exit_intr_info))
7966		kvm_machine_check();
7967
7968	/* We need to handle NMIs before interrupts are enabled */
7969	if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
7970	    (exit_intr_info & INTR_INFO_VALID_MASK)) {
7971		kvm_before_handle_nmi(&vmx->vcpu);
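		/* Invoke the host NMI handler through IDT vector 2. */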
7972		asm("int $2");
7973		kvm_after_handle_nmi(&vmx->vcpu);
7974	}
7975}
7976
7977static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
7978{
7979	u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
7980
	/*
	 * If an external interrupt is pending, set the IF bit in the
	 * rflags/eflags image on the interrupt stack frame, so interrupts
	 * will be re-enabled on return from the interrupt handler.
	 */
7986	if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
7987			== (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) {
7988		unsigned int vector;
7989		unsigned long entry;
7990		gate_desc *desc;
7991		struct vcpu_vmx *vmx = to_vmx(vcpu);
7992#ifdef CONFIG_X86_64
7993		unsigned long tmp;
7994#endif
7995
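		/*
		 * Look up the host IDT entry for the vector and call its
		 * handler directly, building the interrupt stack frame by
		 * hand.
		 */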
7996		vector =  exit_intr_info & INTR_INFO_VECTOR_MASK;
7997		desc = (gate_desc *)vmx->host_idt_base + vector;
7998		entry = gate_offset(*desc);
7999		asm volatile(
8000#ifdef CONFIG_X86_64
8001			"mov %%" _ASM_SP ", %[sp]\n\t"
8002			"and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
8003			"push $%c[ss]\n\t"
8004			"push %[sp]\n\t"
8005#endif
8006			"pushf\n\t"
8007			"orl $0x200, (%%" _ASM_SP ")\n\t"
8008			__ASM_SIZE(push) " $%c[cs]\n\t"
8009			"call *%[entry]\n\t"
8010			:
8011#ifdef CONFIG_X86_64
8012			[sp]"=&r"(tmp)
8013#endif
8014			:
8015			[entry]"r"(entry),
8016			[ss]"i"(__KERNEL_DS),
8017			[cs]"i"(__KERNEL_CS)
8018			);
8019	} else
8020		local_irq_enable();
8021}
8022
8023static bool vmx_mpx_supported(void)
8024{
8025	return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
8026		(vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
8027}
8028
8029static bool vmx_xsaves_supported(void)
8030{
8031	return vmcs_config.cpu_based_2nd_exec_ctrl &
8032		SECONDARY_EXEC_XSAVES;
8033}
8034
8035static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
8036{
8037	u32 exit_intr_info;
8038	bool unblock_nmi;
8039	u8 vector;
8040	bool idtv_info_valid;
8041
8042	idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
8043
8044	if (cpu_has_virtual_nmis()) {
8045		if (vmx->nmi_known_unmasked)
8046			return;
8047		/*
8048		 * Can't use vmx->exit_intr_info since we're not sure what
8049		 * the exit reason is.
8050		 */
8051		exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
8052		unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
8053		vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
8054		/*
8055		 * SDM 3: 27.7.1.2 (September 2008)
8056		 * Re-set bit "block by NMI" before VM entry if vmexit caused by
8057		 * a guest IRET fault.
8058		 * SDM 3: 23.2.2 (September 2008)
8059		 * Bit 12 is undefined in any of the following cases:
8060		 *  If the VM exit sets the valid bit in the IDT-vectoring
8061		 *   information field.
8062		 *  If the VM exit is due to a double fault.
8063		 */
8064		if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
8065		    vector != DF_VECTOR && !idtv_info_valid)
8066			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
8067				      GUEST_INTR_STATE_NMI);
8068		else
8069			vmx->nmi_known_unmasked =
8070				!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
8071				  & GUEST_INTR_STATE_NMI);
8072	} else if (unlikely(vmx->soft_vnmi_blocked))
8073		vmx->vnmi_blocked_time +=
8074			ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
8075}
8076
8077static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
8078				      u32 idt_vectoring_info,
8079				      int instr_len_field,
8080				      int error_code_field)
8081{
8082	u8 vector;
8083	int type;
8084	bool idtv_info_valid;
8085
8086	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
8087
8088	vcpu->arch.nmi_injected = false;
8089	kvm_clear_exception_queue(vcpu);
8090	kvm_clear_interrupt_queue(vcpu);
8091
8092	if (!idtv_info_valid)
8093		return;
8094
8095	kvm_make_request(KVM_REQ_EVENT, vcpu);
8096
8097	vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
8098	type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
8099
8100	switch (type) {
8101	case INTR_TYPE_NMI_INTR:
8102		vcpu->arch.nmi_injected = true;
8103		/*
8104		 * SDM 3: 27.7.1.2 (September 2008)
		 * Clear bit "block by NMI" before VM entry if an NMI
		 * delivery faulted.
8107		 */
8108		vmx_set_nmi_mask(vcpu, false);
8109		break;
8110	case INTR_TYPE_SOFT_EXCEPTION:
8111		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
8112		/* fall through */
8113	case INTR_TYPE_HARD_EXCEPTION:
8114		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
8115			u32 err = vmcs_read32(error_code_field);
8116			kvm_requeue_exception_e(vcpu, vector, err);
8117		} else
8118			kvm_requeue_exception(vcpu, vector);
8119		break;
8120	case INTR_TYPE_SOFT_INTR:
8121		vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
8122		/* fall through */
8123	case INTR_TYPE_EXT_INTR:
8124		kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
8125		break;
8126	default:
8127		break;
8128	}
8129}
8130
8131static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
8132{
8133	__vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
8134				  VM_EXIT_INSTRUCTION_LEN,
8135				  IDT_VECTORING_ERROR_CODE);
8136}
8137
8138static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
8139{
8140	__vmx_complete_interrupts(vcpu,
8141				  vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
8142				  VM_ENTRY_INSTRUCTION_LEN,
8143				  VM_ENTRY_EXCEPTION_ERROR_CODE);
8144
8145	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
8146}
8147
8148static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
8149{
8150	int i, nr_msrs;
8151	struct perf_guest_switch_msr *msrs;
8152
8153	msrs = perf_guest_get_msrs(&nr_msrs);
8154
8155	if (!msrs)
8156		return;
8157
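	/*
	 * Use the VMCS atomic MSR switch lists only for perf MSRs whose
	 * guest and host values differ.
	 */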
8158	for (i = 0; i < nr_msrs; i++)
8159		if (msrs[i].host == msrs[i].guest)
8160			clear_atomic_switch_msr(vmx, msrs[i].msr);
8161		else
8162			add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest,
8163					msrs[i].host);
8164}
8165
8166static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
8167{
8168	struct vcpu_vmx *vmx = to_vmx(vcpu);
8169	unsigned long debugctlmsr, cr4;
8170
8171	/* Record the guest's net vcpu time for enforced NMI injections. */
8172	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
8173		vmx->entry_time = ktime_get();
8174
	/*
	 * Don't enter VMX if guest state is invalid; let the exit handler
	 * start emulating until we get back to a valid state.
	 */
8177	if (vmx->emulation_required)
8178		return;
8179
8180	if (vmx->ple_window_dirty) {
8181		vmx->ple_window_dirty = false;
8182		vmcs_write32(PLE_WINDOW, vmx->ple_window);
8183	}
8184
8185	if (vmx->nested.sync_shadow_vmcs) {
8186		copy_vmcs12_to_shadow(vmx);
8187		vmx->nested.sync_shadow_vmcs = false;
8188	}
8189
8190	if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
8191		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
8192	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
8193		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
8194
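	/* Rewrite HOST_CR4 only when the cached shadow value has changed. */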
8195	cr4 = cr4_read_shadow();
8196	if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
8197		vmcs_writel(HOST_CR4, cr4);
8198		vmx->host_state.vmcs_host_cr4 = cr4;
8199	}
8200
	/*
	 * When single-stepping over STI and MOV SS, we must clear the
	 * corresponding interruptibility bits in the guest state. Otherwise
	 * vmentry fails as it then expects bit 14 (BS) in pending debug
	 * exceptions to be set, but that's not correct for the guest debugging
	 * case.
	 */
8206	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
8207		vmx_set_interrupt_shadow(vcpu, 0);
8208
8209	atomic_switch_perf_msrs(vmx);
8210	debugctlmsr = get_debugctlmsr();
8211
8212	vmx->__launched = vmx->loaded_vmcs->launched;
8213	asm(
8214		/* Store host registers */
8215		"push %%" _ASM_DX "; push %%" _ASM_BP ";"
8216		"push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
8217		"push %%" _ASM_CX " \n\t"
8218		"cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
8219		"je 1f \n\t"
8220		"mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
8221		__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
8222		"1: \n\t"
8223		/* Reload cr2 if changed */
8224		"mov %c[cr2](%0), %%" _ASM_AX " \n\t"
8225		"mov %%cr2, %%" _ASM_DX " \n\t"
8226		"cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
8227		"je 2f \n\t"
8228		"mov %%" _ASM_AX", %%cr2 \n\t"
8229		"2: \n\t"
		/* Check if vmlaunch or vmresume is needed */
8231		"cmpl $0, %c[launched](%0) \n\t"
8232		/* Load guest registers.  Don't clobber flags. */
8233		"mov %c[rax](%0), %%" _ASM_AX " \n\t"
8234		"mov %c[rbx](%0), %%" _ASM_BX " \n\t"
8235		"mov %c[rdx](%0), %%" _ASM_DX " \n\t"
8236		"mov %c[rsi](%0), %%" _ASM_SI " \n\t"
8237		"mov %c[rdi](%0), %%" _ASM_DI " \n\t"
8238		"mov %c[rbp](%0), %%" _ASM_BP " \n\t"
8239#ifdef CONFIG_X86_64
8240		"mov %c[r8](%0),  %%r8  \n\t"
8241		"mov %c[r9](%0),  %%r9  \n\t"
8242		"mov %c[r10](%0), %%r10 \n\t"
8243		"mov %c[r11](%0), %%r11 \n\t"
8244		"mov %c[r12](%0), %%r12 \n\t"
8245		"mov %c[r13](%0), %%r13 \n\t"
8246		"mov %c[r14](%0), %%r14 \n\t"
8247		"mov %c[r15](%0), %%r15 \n\t"
8248#endif
8249		"mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */
8250
8251		/* Enter guest mode */
8252		"jne 1f \n\t"
8253		__ex(ASM_VMX_VMLAUNCH) "\n\t"
8254		"jmp 2f \n\t"
8255		"1: " __ex(ASM_VMX_VMRESUME) "\n\t"
8256		"2: "
8257		/* Save guest registers, load host registers, keep flags */
8258		"mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
8259		"pop %0 \n\t"
8260		"mov %%" _ASM_AX ", %c[rax](%0) \n\t"
8261		"mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
8262		__ASM_SIZE(pop) " %c[rcx](%0) \n\t"
8263		"mov %%" _ASM_DX ", %c[rdx](%0) \n\t"
8264		"mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
8265		"mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
8266		"mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
8267#ifdef CONFIG_X86_64
8268		"mov %%r8,  %c[r8](%0) \n\t"
8269		"mov %%r9,  %c[r9](%0) \n\t"
8270		"mov %%r10, %c[r10](%0) \n\t"
8271		"mov %%r11, %c[r11](%0) \n\t"
8272		"mov %%r12, %c[r12](%0) \n\t"
8273		"mov %%r13, %c[r13](%0) \n\t"
8274		"mov %%r14, %c[r14](%0) \n\t"
8275		"mov %%r15, %c[r15](%0) \n\t"
8276#endif
8277		"mov %%cr2, %%" _ASM_AX "   \n\t"
8278		"mov %%" _ASM_AX ", %c[cr2](%0) \n\t"
8279
8280		"pop  %%" _ASM_BP "; pop  %%" _ASM_DX " \n\t"
8281		"setbe %c[fail](%0) \n\t"
8282		".pushsection .rodata \n\t"
8283		".global vmx_return \n\t"
8284		"vmx_return: " _ASM_PTR " 2b \n\t"
8285		".popsection"
8286	      : : "c"(vmx), "d"((unsigned long)HOST_RSP),
8287		[launched]"i"(offsetof(struct vcpu_vmx, __launched)),
8288		[fail]"i"(offsetof(struct vcpu_vmx, fail)),
8289		[host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
8290		[rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
8291		[rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
8292		[rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
8293		[rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
8294		[rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
8295		[rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
8296		[rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
8297#ifdef CONFIG_X86_64
8298		[r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
8299		[r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
8300		[r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
8301		[r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
8302		[r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
8303		[r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
8304		[r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
8305		[r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
8306#endif
8307		[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
8308		[wordsize]"i"(sizeof(ulong))
8309	      : "cc", "memory"
8310#ifdef CONFIG_X86_64
8311		, "rax", "rbx", "rdi", "rsi"
8312		, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
8313#else
8314		, "eax", "ebx", "edi", "esi"
8315#endif
8316	      );
8317
8318	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
8319	if (debugctlmsr)
8320		update_debugctlmsr(debugctlmsr);
8321
8322#ifndef CONFIG_X86_64
8323	/*
8324	 * The sysexit path does not restore ds/es, so we must set them to
8325	 * a reasonable value ourselves.
8326	 *
8327	 * We can't defer this to vmx_load_host_state() since that function
	 * may be executed in interrupt context, which saves and restores
	 * segments around it, nullifying its effect.
8330	 */
8331	loadsegment(ds, __USER_DS);
8332	loadsegment(es, __USER_DS);
8333#endif
8334
8335	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
8336				  | (1 << VCPU_EXREG_RFLAGS)
8337				  | (1 << VCPU_EXREG_PDPTR)
8338				  | (1 << VCPU_EXREG_SEGMENTS)
8339				  | (1 << VCPU_EXREG_CR3));
8340	vcpu->arch.regs_dirty = 0;
8341
8342	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
8343
8344	vmx->loaded_vmcs->launched = 1;
8345
8346	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
8347	trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
8348
8349	/*
8350	 * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
8351	 * we did not inject a still-pending event to L1 now because of
8352	 * nested_run_pending, we need to re-enable this bit.
8353	 */
8354	if (vmx->nested.nested_run_pending)
8355		kvm_make_request(KVM_REQ_EVENT, vcpu);
8356
8357	vmx->nested.nested_run_pending = 0;
8358
8359	vmx_complete_atomic_exit(vmx);
8360	vmx_recover_nmi_blocking(vmx);
8361	vmx_complete_interrupts(vmx);
8362}
8363
8364static void vmx_load_vmcs01(struct kvm_vcpu *vcpu)
8365{
8366	struct vcpu_vmx *vmx = to_vmx(vcpu);
8367	int cpu;
8368
8369	if (vmx->loaded_vmcs == &vmx->vmcs01)
8370		return;
8371
8372	cpu = get_cpu();
8373	vmx->loaded_vmcs = &vmx->vmcs01;
8374	vmx_vcpu_put(vcpu);
8375	vmx_vcpu_load(vcpu, cpu);
8376	vcpu->cpu = cpu;
8377	put_cpu();
8378}
8379
8380static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
8381{
8382	struct vcpu_vmx *vmx = to_vmx(vcpu);
8383
8384	if (enable_pml)
8385		vmx_disable_pml(vmx);
8386	free_vpid(vmx);
8387	leave_guest_mode(vcpu);
8388	vmx_load_vmcs01(vcpu);
8389	free_nested(vmx);
8390	free_loaded_vmcs(vmx->loaded_vmcs);
8391	kfree(vmx->guest_msrs);
8392	kvm_vcpu_uninit(vcpu);
8393	kmem_cache_free(kvm_vcpu_cache, vmx);
8394}
8395
8396static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
8397{
8398	int err;
8399	struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
8400	int cpu;
8401
8402	if (!vmx)
8403		return ERR_PTR(-ENOMEM);
8404
8405	allocate_vpid(vmx);
8406
8407	err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
8408	if (err)
8409		goto free_vcpu;
8410
8411	vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
8412	BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
8413		     > PAGE_SIZE);
8414
	err = -ENOMEM;
	if (!vmx->guest_msrs)
		goto uninit_vcpu;
8419
8420	vmx->loaded_vmcs = &vmx->vmcs01;
8421	vmx->loaded_vmcs->vmcs = alloc_vmcs();
8422	if (!vmx->loaded_vmcs->vmcs)
8423		goto free_msrs;
8424	if (!vmm_exclusive)
8425		kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id())));
8426	loaded_vmcs_init(vmx->loaded_vmcs);
8427	if (!vmm_exclusive)
8428		kvm_cpu_vmxoff();
8429
8430	cpu = get_cpu();
8431	vmx_vcpu_load(&vmx->vcpu, cpu);
8432	vmx->vcpu.cpu = cpu;
8433	err = vmx_vcpu_setup(vmx);
8434	vmx_vcpu_put(&vmx->vcpu);
8435	put_cpu();
8436	if (err)
8437		goto free_vmcs;
8438	if (vm_need_virtualize_apic_accesses(kvm)) {
8439		err = alloc_apic_access_page(kvm);
8440		if (err)
8441			goto free_vmcs;
8442	}
8443
8444	if (enable_ept) {
8445		if (!kvm->arch.ept_identity_map_addr)
8446			kvm->arch.ept_identity_map_addr =
8447				VMX_EPT_IDENTITY_PAGETABLE_ADDR;
8448		err = init_rmode_identity_map(kvm);
8449		if (err)
8450			goto free_vmcs;
8451	}
8452
8453	if (nested)
8454		nested_vmx_setup_ctls_msrs(vmx);
8455
8456	vmx->nested.posted_intr_nv = -1;
8457	vmx->nested.current_vmptr = -1ull;
8458	vmx->nested.current_vmcs12 = NULL;
8459
	/*
	 * If PML is turned on, failure to enable PML just results in failure
	 * to create the vcpu, so we can simplify the PML logic (e.g., we
	 * never have to deal with PML being enabled on only some of the
	 * guest's vcpus).
	 */
8466	if (enable_pml) {
8467		err = vmx_enable_pml(vmx);
8468		if (err)
8469			goto free_vmcs;
8470	}
8471
8472	return &vmx->vcpu;
8473
8474free_vmcs:
8475	free_loaded_vmcs(vmx->loaded_vmcs);
8476free_msrs:
8477	kfree(vmx->guest_msrs);
8478uninit_vcpu:
8479	kvm_vcpu_uninit(&vmx->vcpu);
8480free_vcpu:
8481	free_vpid(vmx);
8482	kmem_cache_free(kvm_vcpu_cache, vmx);
8483	return ERR_PTR(err);
8484}
8485
8486static void __init vmx_check_processor_compat(void *rtn)
8487{
8488	struct vmcs_config vmcs_conf;
8489
8490	*(int *)rtn = 0;
8491	if (setup_vmcs_config(&vmcs_conf) < 0)
8492		*(int *)rtn = -EIO;
8493	if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
8494		printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
8495				smp_processor_id());
8496		*(int *)rtn = -EIO;
8497	}
8498}
8499
8500static int get_ept_level(void)
8501{
8502	return VMX_EPT_DEFAULT_GAW + 1;
8503}
8504
8505static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
8506{
8507	u64 ret;
8508
	/*
	 * For the VT-d and EPT combination:
	 * 1. MMIO: always map as UC.
	 * 2. EPT with VT-d:
	 *    a. VT-d without the snooping control feature: we can't guarantee
	 *	 the result, so try to trust the guest's memory type.
	 *    b. VT-d with the snooping control feature: snooping control
	 *	 guarantees cache correctness, so just map as WB to stay
	 *	 consistent with the host, the same as item 3.
	 * 3. EPT without VT-d: always map as WB and set IPAT=1 to stay
	 *    consistent with the host MTRRs.
	 */
8520	if (is_mmio)
8521		ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
8522	else if (kvm_arch_has_noncoherent_dma(vcpu->kvm))
8523		ret = kvm_get_guest_memory_type(vcpu, gfn) <<
8524		      VMX_EPT_MT_EPTE_SHIFT;
8525	else
8526		ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
8527			| VMX_EPT_IPAT_BIT;
8528
8529	return ret;
8530}
8531
8532static int vmx_get_lpage_level(void)
8533{
8534	if (enable_ept && !cpu_has_vmx_ept_1g_page())
8535		return PT_DIRECTORY_LEVEL;
8536	else
		/* Shadow paging and EPT with 1GB page support can use 1GB pages */
8538		return PT_PDPE_LEVEL;
8539}
8540
8541static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
8542{
8543	struct kvm_cpuid_entry2 *best;
8544	struct vcpu_vmx *vmx = to_vmx(vcpu);
8545	u32 exec_control;
8546
8547	vmx->rdtscp_enabled = false;
8548	if (vmx_rdtscp_supported()) {
8549		exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
8550		if (exec_control & SECONDARY_EXEC_RDTSCP) {
8551			best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
8552			if (best && (best->edx & bit(X86_FEATURE_RDTSCP)))
8553				vmx->rdtscp_enabled = true;
8554			else {
8555				exec_control &= ~SECONDARY_EXEC_RDTSCP;
8556				vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
8557						exec_control);
8558			}
8559		}
8560		if (nested && !vmx->rdtscp_enabled)
8561			vmx->nested.nested_vmx_secondary_ctls_high &=
8562				~SECONDARY_EXEC_RDTSCP;
8563	}
8564
8565	/* Exposing INVPCID only when PCID is exposed */
8566	best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
8567	if (vmx_invpcid_supported() &&
8568	    best && (best->ebx & bit(X86_FEATURE_INVPCID)) &&
8569	    guest_cpuid_has_pcid(vcpu)) {
8570		exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
8571		exec_control |= SECONDARY_EXEC_ENABLE_INVPCID;
8572		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
8573			     exec_control);
8574	} else {
8575		if (cpu_has_secondary_exec_ctrls()) {
8576			exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
8577			exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
8578			vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
8579				     exec_control);
8580		}
8581		if (best)
8582			best->ebx &= ~bit(X86_FEATURE_INVPCID);
8583	}
8584}
8585
8586static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
8587{
8588	if (func == 1 && nested)
8589		entry->ecx |= bit(X86_FEATURE_VMX);
8590}
8591
8592static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
8593		struct x86_exception *fault)
8594{
8595	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
8596	u32 exit_reason;
8597
8598	if (fault->error_code & PFERR_RSVD_MASK)
8599		exit_reason = EXIT_REASON_EPT_MISCONFIG;
8600	else
8601		exit_reason = EXIT_REASON_EPT_VIOLATION;
8602	nested_vmx_vmexit(vcpu, exit_reason, 0, vcpu->arch.exit_qualification);
8603	vmcs12->guest_physical_address = fault->address;
8604}
8605
8606/* Callbacks for nested_ept_init_mmu_context: */
8607
8608static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
8609{
8610	/* return the page table to be shadowed - in our case, EPT12 */
8611	return get_vmcs12(vcpu)->ept_pointer;
8612}
8613
8614static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
8615{
8616	WARN_ON(mmu_is_nested(vcpu));
8617	kvm_init_shadow_ept_mmu(vcpu,
8618			to_vmx(vcpu)->nested.nested_vmx_ept_caps &
8619			VMX_EPT_EXECUTE_ONLY_BIT);
8620	vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
8621	vcpu->arch.mmu.get_cr3           = nested_ept_get_cr3;
8622	vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
8623
8624	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
8625}
8626
8627static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
8628{
8629	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
8630}
8631
8632static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
8633					    u16 error_code)
8634{
8635	bool inequality, bit;
8636
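	/*
	 * A page fault in L2 causes a vmexit to L1 iff the PF bit in the
	 * exception bitmap agrees with the outcome of the PFEC mask/match
	 * test: bit set and match, or bit clear and mismatch.
	 */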
8637	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
8638	inequality =
8639		(error_code & vmcs12->page_fault_error_code_mask) !=
8640		 vmcs12->page_fault_error_code_match;
8641	return inequality ^ bit;
8642}
8643
8644static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
8645		struct x86_exception *fault)
8646{
8647	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
8648
8649	WARN_ON(!is_guest_mode(vcpu));
8650
8651	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code))
8652		nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
8653				  vmcs_read32(VM_EXIT_INTR_INFO),
8654				  vmcs_readl(EXIT_QUALIFICATION));
8655	else
8656		kvm_inject_page_fault(vcpu, fault);
8657}
8658
8659static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
8660					struct vmcs12 *vmcs12)
8661{
8662	struct vcpu_vmx *vmx = to_vmx(vcpu);
8663	int maxphyaddr = cpuid_maxphyaddr(vcpu);
8664
8665	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
8666		if (!PAGE_ALIGNED(vmcs12->apic_access_addr) ||
8667		    vmcs12->apic_access_addr >> maxphyaddr)
8668			return false;
8669
8670		/*
8671		 * Translate L1 physical address to host physical
8672		 * address for vmcs02. Keep the page pinned, so this
8673		 * physical address remains valid. We keep a reference
8674		 * to it so we can release it later.
8675		 */
8676		if (vmx->nested.apic_access_page) /* shouldn't happen */
8677			nested_release_page(vmx->nested.apic_access_page);
8678		vmx->nested.apic_access_page =
8679			nested_get_page(vcpu, vmcs12->apic_access_addr);
8680	}
8681
8682	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
8683		if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr) ||
8684		    vmcs12->virtual_apic_page_addr >> maxphyaddr)
8685			return false;
8686
8687		if (vmx->nested.virtual_apic_page) /* shouldn't happen */
8688			nested_release_page(vmx->nested.virtual_apic_page);
8689		vmx->nested.virtual_apic_page =
8690			nested_get_page(vcpu, vmcs12->virtual_apic_page_addr);
8691
8692		/*
8693		 * Failing the vm entry is _not_ what the processor does
8694		 * but it's basically the only possibility we have.
8695		 * We could still enter the guest if CR8 load exits are
8696		 * enabled, CR8 store exits are enabled, and virtualize APIC
8697		 * access is disabled; in this case the processor would never
8698		 * use the TPR shadow and we could simply clear the bit from
8699		 * the execution control.  But such a configuration is useless,
8700		 * so let's keep the code simple.
8701		 */
8702		if (!vmx->nested.virtual_apic_page)
8703			return false;
8704	}
8705
8706	if (nested_cpu_has_posted_intr(vmcs12)) {
8707		if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64) ||
8708		    vmcs12->posted_intr_desc_addr >> maxphyaddr)
8709			return false;
8710
8711		if (vmx->nested.pi_desc_page) { /* shouldn't happen */
8712			kunmap(vmx->nested.pi_desc_page);
8713			nested_release_page(vmx->nested.pi_desc_page);
8714		}
8715		vmx->nested.pi_desc_page =
8716			nested_get_page(vcpu, vmcs12->posted_intr_desc_addr);
8717		if (!vmx->nested.pi_desc_page)
8718			return false;
8719
8720		vmx->nested.pi_desc =
8721			(struct pi_desc *)kmap(vmx->nested.pi_desc_page);
8722		if (!vmx->nested.pi_desc) {
8723			nested_release_page_clean(vmx->nested.pi_desc_page);
8724			return false;
8725		}
8726		vmx->nested.pi_desc =
8727			(struct pi_desc *)((void *)vmx->nested.pi_desc +
8728			(unsigned long)(vmcs12->posted_intr_desc_addr &
8729			(PAGE_SIZE - 1)));
8730	}
8731
8732	return true;
8733}
8734
8735static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
8736{
8737	u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
8738	struct vcpu_vmx *vmx = to_vmx(vcpu);
8739
8740	if (vcpu->arch.virtual_tsc_khz == 0)
8741		return;
8742
	/*
	 * Make sure short timeouts reliably trigger an immediate vmexit;
	 * hrtimer_start does not guarantee this.
	 */
8745	if (preemption_timeout <= 1) {
8746		vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
8747		return;
8748	}
8749
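	/*
	 * The emulated timer counts in units of 2^rate TSC cycles; convert
	 * the timeout to nanoseconds using the vcpu's TSC frequency in kHz.
	 */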
8750	preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
8751	preemption_timeout *= 1000000;
8752	do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
8753	hrtimer_start(&vmx->nested.preemption_timer,
8754		      ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
8755}
8756
8757static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
8758						struct vmcs12 *vmcs12)
8759{
8760	int maxphyaddr;
8761	u64 addr;
8762
8763	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
8764		return 0;
8765
8766	if (vmcs12_read_any(vcpu, MSR_BITMAP, &addr)) {
8767		WARN_ON(1);
8768		return -EINVAL;
8769	}
8770	maxphyaddr = cpuid_maxphyaddr(vcpu);
8771
8772	if (!PAGE_ALIGNED(vmcs12->msr_bitmap) ||
8773	   ((addr + PAGE_SIZE) >> maxphyaddr))
8774		return -EINVAL;
8775
8776	return 0;
8777}
8778
/*
 * Merge L0's and L1's MSR bitmaps; return false to indicate that
 * we do not want to use the hardware MSR bitmap.
 */
8783static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
8784					       struct vmcs12 *vmcs12)
8785{
8786	int msr;
8787	struct page *page;
8788	unsigned long *msr_bitmap;
8789
8790	if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
8791		return false;
8792
8793	page = nested_get_page(vcpu, vmcs12->msr_bitmap);
8794	if (!page) {
8795		WARN_ON(1);
8796		return false;
8797	}
8798	msr_bitmap = (unsigned long *)kmap(page);
8799	if (!msr_bitmap) {
8800		nested_release_page_clean(page);
8801		WARN_ON(1);
8802		return false;
8803	}
8804
8805	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
8806		if (nested_cpu_has_apic_reg_virt(vmcs12))
8807			for (msr = 0x800; msr <= 0x8ff; msr++)
8808				nested_vmx_disable_intercept_for_msr(
8809					msr_bitmap,
8810					vmx_msr_bitmap_nested,
8811					msr, MSR_TYPE_R);
8812		/* TPR is allowed */
8813		nested_vmx_disable_intercept_for_msr(msr_bitmap,
8814				vmx_msr_bitmap_nested,
8815				APIC_BASE_MSR + (APIC_TASKPRI >> 4),
8816				MSR_TYPE_R | MSR_TYPE_W);
8817		if (nested_cpu_has_vid(vmcs12)) {
8818			/* EOI and self-IPI are allowed */
8819			nested_vmx_disable_intercept_for_msr(
8820				msr_bitmap,
8821				vmx_msr_bitmap_nested,
8822				APIC_BASE_MSR + (APIC_EOI >> 4),
8823				MSR_TYPE_W);
8824			nested_vmx_disable_intercept_for_msr(
8825				msr_bitmap,
8826				vmx_msr_bitmap_nested,
8827				APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
8828				MSR_TYPE_W);
8829		}
8830	} else {
		/*
		 * Enable the read intercept for all x2APIC MSRs. We should
		 * not rely on vmcs12 for any optimization here; it may have
		 * been modified by L1.
		 */
8837		for (msr = 0x800; msr <= 0x8ff; msr++)
8838			__vmx_enable_intercept_for_msr(
8839				vmx_msr_bitmap_nested,
8840				msr,
8841				MSR_TYPE_R);
8842
8843		__vmx_enable_intercept_for_msr(
8844				vmx_msr_bitmap_nested,
8845				APIC_BASE_MSR + (APIC_TASKPRI >> 4),
8846				MSR_TYPE_W);
8847		__vmx_enable_intercept_for_msr(
8848				vmx_msr_bitmap_nested,
8849				APIC_BASE_MSR + (APIC_EOI >> 4),
8850				MSR_TYPE_W);
8851		__vmx_enable_intercept_for_msr(
8852				vmx_msr_bitmap_nested,
8853				APIC_BASE_MSR + (APIC_SELF_IPI >> 4),
8854				MSR_TYPE_W);
8855	}
8856	kunmap(page);
8857	nested_release_page_clean(page);
8858
8859	return true;
8860}
8861
8862static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
8863					   struct vmcs12 *vmcs12)
8864{
8865	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
8866	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
8867	    !nested_cpu_has_vid(vmcs12) &&
8868	    !nested_cpu_has_posted_intr(vmcs12))
8869		return 0;
8870
8871	/*
8872	 * If virtualize x2apic mode is enabled,
8873	 * virtualize apic access must be disabled.
8874	 */
8875	if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
8876	    nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
8877		return -EINVAL;
8878
8879	/*
8880	 * If virtual interrupt delivery is enabled,
8881	 * we must exit on external interrupts.
8882	 */
8883	if (nested_cpu_has_vid(vmcs12) &&
8884	   !nested_exit_on_intr(vcpu))
8885		return -EINVAL;
8886
	/*
	 * Bits 15:8 must be zero in posted_intr_nv; the descriptor
	 * address has already been checked in nested_get_vmcs12_pages.
	 */
8892	if (nested_cpu_has_posted_intr(vmcs12) &&
8893	   (!nested_cpu_has_vid(vmcs12) ||
8894	    !nested_exit_intr_ack_set(vcpu) ||
8895	    vmcs12->posted_intr_nv & 0xff00))
8896		return -EINVAL;
8897
	/* The TPR shadow is required by all APICv features. */
8899	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
8900		return -EINVAL;
8901
8902	return 0;
8903}
8904
8905static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
8906				       unsigned long count_field,
8907				       unsigned long addr_field)
8908{
8909	int maxphyaddr;
8910	u64 count, addr;
8911
8912	if (vmcs12_read_any(vcpu, count_field, &count) ||
8913	    vmcs12_read_any(vcpu, addr_field, &addr)) {
8914		WARN_ON(1);
8915		return -EINVAL;
8916	}
8917	if (count == 0)
8918		return 0;
8919	maxphyaddr = cpuid_maxphyaddr(vcpu);
8920	if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
8921	    (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
8922		pr_warn_ratelimited(
8923			"nVMX: invalid MSR switch (0x%lx, %d, %llu, 0x%08llx)",
8924			addr_field, maxphyaddr, count, addr);
8925		return -EINVAL;
8926	}
8927	return 0;
8928}
8929
8930static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
8931						struct vmcs12 *vmcs12)
8932{
8933	if (vmcs12->vm_exit_msr_load_count == 0 &&
8934	    vmcs12->vm_exit_msr_store_count == 0 &&
8935	    vmcs12->vm_entry_msr_load_count == 0)
8936		return 0; /* Fast path */
8937	if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
8938					VM_EXIT_MSR_LOAD_ADDR) ||
8939	    nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
8940					VM_EXIT_MSR_STORE_ADDR) ||
8941	    nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
8942					VM_ENTRY_MSR_LOAD_ADDR))
8943		return -EINVAL;
8944	return 0;
8945}
8946
8947static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
8948				       struct vmx_msr_entry *e)
8949{
8950	/* x2APIC MSR accesses are not allowed */
8951	if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8)
8952		return -EINVAL;
8953	if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
8954	    e->index == MSR_IA32_UCODE_REV)
8955		return -EINVAL;
8956	if (e->reserved != 0)
8957		return -EINVAL;
8958	return 0;
8959}
8960
8961static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
8962				     struct vmx_msr_entry *e)
8963{
8964	if (e->index == MSR_FS_BASE ||
8965	    e->index == MSR_GS_BASE ||
8966	    e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
8967	    nested_vmx_msr_check_common(vcpu, e))
8968		return -EINVAL;
8969	return 0;
8970}
8971
8972static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
8973				      struct vmx_msr_entry *e)
8974{
8975	if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
8976	    nested_vmx_msr_check_common(vcpu, e))
8977		return -EINVAL;
8978	return 0;
8979}
8980
/*
 * Load the guest's/host's MSRs at nested entry/exit.
 * Return 0 on success, or the (1-based) index of the failing entry.
 */
8985static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
8986{
8987	u32 i;
8988	struct vmx_msr_entry e;
8989	struct msr_data msr;
8990
8991	msr.host_initiated = false;
8992	for (i = 0; i < count; i++) {
8993		if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e),
8994				   &e, sizeof(e))) {
8995			pr_warn_ratelimited(
8996				"%s cannot read MSR entry (%u, 0x%08llx)\n",
8997				__func__, i, gpa + i * sizeof(e));
8998			goto fail;
8999		}
9000		if (nested_vmx_load_msr_check(vcpu, &e)) {
9001			pr_warn_ratelimited(
9002				"%s check failed (%u, 0x%x, 0x%x)\n",
9003				__func__, i, e.index, e.reserved);
9004			goto fail;
9005		}
9006		msr.index = e.index;
9007		msr.data = e.value;
9008		if (kvm_set_msr(vcpu, &msr)) {
9009			pr_warn_ratelimited(
9010				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
9011				__func__, i, e.index, e.value);
9012			goto fail;
9013		}
9014	}
9015	return 0;
9016fail:
9017	return i + 1;
9018}
9019
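/*
 * Store the guest's MSRs into the VM-exit MSR-store area on a nested
 * VM exit: for each entry, read only the index and reserved fields
 * from guest memory (2 * sizeof(u32)), validate them, fetch the
 * current MSR value with kvm_get_msr(), and write it back into the
 * entry's value field.
 */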
9020static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
9021{
9022	u32 i;
9023	struct vmx_msr_entry e;
9024
9025	for (i = 0; i < count; i++) {
9026		if (kvm_read_guest(vcpu->kvm,
9027				   gpa + i * sizeof(e),
9028				   &e, 2 * sizeof(u32))) {
9029			pr_warn_ratelimited(
9030				"%s cannot read MSR entry (%u, 0x%08llx)\n",
9031				__func__, i, gpa + i * sizeof(e));
9032			return -EINVAL;
9033		}
9034		if (nested_vmx_store_msr_check(vcpu, &e)) {
9035			pr_warn_ratelimited(
9036				"%s check failed (%u, 0x%x, 0x%x)\n",
9037				__func__, i, e.index, e.reserved);
9038			return -EINVAL;
9039		}
9040		if (kvm_get_msr(vcpu, e.index, &e.value)) {
9041			pr_warn_ratelimited(
9042				"%s cannot read MSR (%u, 0x%x)\n",
9043				__func__, i, e.index);
9044			return -EINVAL;
9045		}
9046		if (kvm_write_guest(vcpu->kvm,
9047				    gpa + i * sizeof(e) +
9048					offsetof(struct vmx_msr_entry, value),
9049				    &e.value, sizeof(e.value))) {
9050			pr_warn_ratelimited(
9051				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
9052				__func__, i, e.index, e.value);
9053			return -EINVAL;
9054		}
9055	}
9056	return 0;
9057}
9058
9059/*
9060 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
9061 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
9062 * with L0's requirements for its guest (a.k.a. vmcs01), so that we can run
9063 * the L2 guest in a way that satisfies both L1's requests and L0's own
9064 * needs. In addition to modifying the active vmcs (which is vmcs02), this
9065 * function also has additional necessary side-effects, like setting various
9066 * vcpu->arch fields.
9067 */
9068static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
9069{
9070	struct vcpu_vmx *vmx = to_vmx(vcpu);
9071	u32 exec_control;
9072
9073	vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
9074	vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
9075	vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
9076	vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
9077	vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
9078	vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
9079	vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
9080	vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
9081	vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
9082	vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
9083	vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
9084	vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
9085	vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
9086	vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
9087	vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
9088	vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
9089	vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
9090	vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
9091	vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
9092	vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
9093	vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
9094	vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
9095	vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
9096	vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
9097	vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
9098	vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
9099	vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
9100	vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
9101	vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
9102	vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
9103	vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
9104	vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
9105	vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
9106	vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
9107	vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
9108	vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
9109
9110	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
9111		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
9112		vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
9113	} else {
9114		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
9115		vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
9116	}
9117	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
9118		vmcs12->vm_entry_intr_info_field);
9119	vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
9120		vmcs12->vm_entry_exception_error_code);
9121	vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
9122		vmcs12->vm_entry_instruction_len);
9123	vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
9124		vmcs12->guest_interruptibility_info);
9125	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
9126	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
9127	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
9128		vmcs12->guest_pending_dbg_exceptions);
9129	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
9130	vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
9131
9132	if (nested_cpu_has_xsaves(vmcs12))
9133		vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
9134	vmcs_write64(VMCS_LINK_POINTER, -1ull);
9135
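	/*
	 * Pin-based controls: take L1's settings, OR in the bits L0 itself
	 * requires, and clear the VMX-preemption-timer bit; the timer is
	 * emulated for L2 with an hrtimer (see vmx_start_preemption_timer()
	 * below) rather than run on vmcs02's hardware timer.
	 */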
9136	exec_control = vmcs12->pin_based_vm_exec_control;
9137	exec_control |= vmcs_config.pin_based_exec_ctrl;
9138	exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
9139
9140	if (nested_cpu_has_posted_intr(vmcs12)) {
9141		/*
9142		 * Note that we use L0's vector here and in
9143		 * vmx_deliver_nested_posted_interrupt.
9144		 */
9145		vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
9146		vmx->nested.pi_pending = false;
9147		vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
9148		vmcs_write64(POSTED_INTR_DESC_ADDR,
9149			page_to_phys(vmx->nested.pi_desc_page) +
9150			(unsigned long)(vmcs12->posted_intr_desc_addr &
9151			(PAGE_SIZE - 1)));
9152	} else
9153		exec_control &= ~PIN_BASED_POSTED_INTR;
9154
9155	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
9156
9157	vmx->nested.preemption_timer_expired = false;
9158	if (nested_cpu_has_preemption_timer(vmcs12))
9159		vmx_start_preemption_timer(vcpu);
9160
9161	/*
9162	 * Whether page-faults are trapped is determined by a combination of
9163	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
9164	 * If enable_ept, L0 doesn't care about page faults and we should
9165	 * set all of these to L1's desires. However, if !enable_ept, L0 does
9166	 * care about (at least some) page faults, and because it is not easy
9167	 * (if at all possible?) to merge L0 and L1's desires, we simply ask
9168	 * to exit on each and every L2 page fault. This is done by setting
9169	 * MASK=MATCH=0 and (see below) EB.PF=1.
9170	 * Note that below we don't need special code to set EB.PF beyond the
9171	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
9172	 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
9173	 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
9174	 *
9175	 * A problem with this approach (when !enable_ept) is that L1 may be
9176	 * injected with more page faults than it asked for. This could have
9177	 * caused problems, but in practice existing hypervisors don't care.
9178	 * To fix this, we will need to emulate the PFEC checking (on the L1
9179	 * page tables), using walk_addr(), when injecting PFs to L1.
9180	 */
9181	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
9182		enable_ept ? vmcs12->page_fault_error_code_mask : 0);
9183	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
9184		enable_ept ? vmcs12->page_fault_error_code_match : 0);
9185
9186	if (cpu_has_secondary_exec_ctrls()) {
9187		exec_control = vmx_secondary_exec_control(vmx);
9188		if (!vmx->rdtscp_enabled)
9189			exec_control &= ~SECONDARY_EXEC_RDTSCP;
9190		/* Take the following fields only from vmcs12 */
9191		exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
9192				  SECONDARY_EXEC_RDTSCP |
9193				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
9194				  SECONDARY_EXEC_APIC_REGISTER_VIRT);
9195		if (nested_cpu_has(vmcs12,
9196				CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
9197			exec_control |= vmcs12->secondary_vm_exec_control;
9198
9199		if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) {
9200			/*
9201			 * If translation failed, it does not matter: this feature asks
9202			 * to exit when accessing the given address, and if it
9203			 * can never be accessed, this feature won't do
9204			 * anything anyway.
9205			 */
9206			if (!vmx->nested.apic_access_page)
9207				exec_control &=
9208				  ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
9209			else
9210				vmcs_write64(APIC_ACCESS_ADDR,
9211				  page_to_phys(vmx->nested.apic_access_page));
9212		} else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
9213			    (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) {
9214			exec_control |=
9215				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
9216			kvm_vcpu_reload_apic_access_page(vcpu);
9217		}
9218
9219		if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
9220			vmcs_write64(EOI_EXIT_BITMAP0,
9221				vmcs12->eoi_exit_bitmap0);
9222			vmcs_write64(EOI_EXIT_BITMAP1,
9223				vmcs12->eoi_exit_bitmap1);
9224			vmcs_write64(EOI_EXIT_BITMAP2,
9225				vmcs12->eoi_exit_bitmap2);
9226			vmcs_write64(EOI_EXIT_BITMAP3,
9227				vmcs12->eoi_exit_bitmap3);
9228			vmcs_write16(GUEST_INTR_STATUS,
9229				vmcs12->guest_intr_status);
9230		}
9231
9232		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
9233	}
9234
9235
9236	/*
9237	 * Set host-state according to L0's settings (vmcs12 is irrelevant here).
9238	 * Some constant fields are set here by vmx_set_constant_host_state().
9239	 * Other fields differ per CPU and will be set later, when
9240	 * vmx_vcpu_load() and vmx_save_host_state() are called.
9241	 */
9242	vmx_set_constant_host_state(vmx);
9243
9244	/*
9245	 * HOST_RSP is normally set correctly in vmx_vcpu_run() just before
9246	 * entry, but only if the current (host) sp changed from the value
9247	 * we wrote last (vmx->host_rsp). This cache is no longer relevant
9248	 * if we switch vmcs, and rather than hold a separate cache per vmcs,
9249	 * here we just force the write to happen on entry.
9250	 */
9251	vmx->host_rsp = 0;
9252
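	/*
	 * Primary processor-based controls: start from L0's settings, drop
	 * the interrupt-window, NMI-window and TPR-shadow bits (so they are
	 * enabled for L2 only if L1 asked for them), then OR in L1's
	 * controls.
	 */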
9253	exec_control = vmx_exec_control(vmx); /* L0's desires */
9254	exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
9255	exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
9256	exec_control &= ~CPU_BASED_TPR_SHADOW;
9257	exec_control |= vmcs12->cpu_based_vm_exec_control;
9258
9259	if (exec_control & CPU_BASED_TPR_SHADOW) {
9260		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
9261				page_to_phys(vmx->nested.virtual_apic_page));
9262		vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
9263	}
9264
9265	if (cpu_has_vmx_msr_bitmap() &&
9266	    exec_control & CPU_BASED_USE_MSR_BITMAPS) {
9267		nested_vmx_merge_msr_bitmap(vcpu, vmcs12);
9268		/* The MSR_BITMAP field will be set by the vmx_set_efer() call below. */
9269	} else
9270		exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
9271
9272	/*
9273	 * Merging of the I/O bitmaps is not currently supported.
9274	 * Rather, exit unconditionally on every I/O access.
9275	 */
9276	exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
9277	exec_control |= CPU_BASED_UNCOND_IO_EXITING;
9278
9279	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
9280
9281	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
9282	 * bitwise-or of what L1 wants to trap for L2, and what we want to
9283	 * trap. Note that CR0.TS also needs updating - we do this later.
9284	 */
9285	update_exception_bitmap(vcpu);
9286	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
9287	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
9288
9289	/* L2->L1 exit controls are emulated - the hardware exit is to L0 so
9290	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
9291	 * bits are further modified by vmx_set_efer() below.
9292	 */
9293	vmcs_write32(VM_EXIT_CONTROLS, vmcs_config.vmexit_ctrl);
9294
9295	/* vmcs12's VM_ENTRY_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE are
9296	 * emulated by vmx_set_efer(), below.
9297	 */
9298	vm_entry_controls_init(vmx,
9299		(vmcs12->vm_entry_controls & ~VM_ENTRY_LOAD_IA32_EFER &
9300			~VM_ENTRY_IA32E_MODE) |
9301		(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
9302
9303	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) {
9304		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
9305		vcpu->arch.pat = vmcs12->guest_ia32_pat;
9306	} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
9307		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
9308
9309
9310	set_cr4_guest_host_mask(vmx);
9311
9312	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
9313		vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
9314
9315	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
9316		vmcs_write64(TSC_OFFSET,
9317			vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
9318	else
9319		vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
9320
9321	if (enable_vpid) {
9322		/*
9323		 * Trivially support vpid by letting L2s share their parent
9324		 * L1's vpid. TODO: move to a more elaborate solution, giving
9325		 * each L2 its own vpid and exposing the vpid feature to L1.
9326		 */
9327		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
9328		vmx_flush_tlb(vcpu);
9329	}
9330
9331	if (nested_cpu_has_ept(vmcs12)) {
9332		kvm_mmu_unload(vcpu);
9333		nested_ept_init_mmu_context(vcpu);
9334	}
9335
9336	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
9337		vcpu->arch.efer = vmcs12->guest_ia32_efer;
9338	else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
9339		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
9340	else
9341		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
9342	/* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
9343	vmx_set_efer(vcpu, vcpu->arch.efer);
9344
9345	/*
9346	 * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified
9347	 * TS bit (for lazy fpu) and bits which we consider mandatory enabled.
9348	 * The CR0_READ_SHADOW is what L2 should have expected to read given
9349	 * the specifications by L1; it's not enough to take
9350	 * vmcs12->cr0_read_shadow because our cr0_guest_host_mask may
9351	 * have more bits set than L1 expected.
9352	 */
9353	vmx_set_cr0(vcpu, vmcs12->guest_cr0);
9354	vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
9355
9356	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
9357	vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
9358
9359	/* Load L2's cr3; this works with either nested EPT or shadow page tables. */
9360	kvm_set_cr3(vcpu, vmcs12->guest_cr3);
9361	kvm_mmu_reset_context(vcpu);
9362
9363	if (!enable_ept)
9364		vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
9365
9366	/*
9367	 * L1 may access the L2's PDPTR, so save them to construct vmcs12
9368	 */
9369	if (enable_ept) {
9370		vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
9371		vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
9372		vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
9373		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
9374	}
9375
9376	kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
9377	kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
9378}
9379
9380/*
9381 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
9382 * for running an L2 nested guest.
9383 */
9384static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
9385{
9386	struct vmcs12 *vmcs12;
9387	struct vcpu_vmx *vmx = to_vmx(vcpu);
9388	int cpu;
9389	struct loaded_vmcs *vmcs02;
9390	bool ia32e;
9391	u32 msr_entry_idx;
9392
9393	if (!nested_vmx_check_permission(vcpu) ||
9394	    !nested_vmx_check_vmcs12(vcpu))
9395		return 1;
9396
9397	skip_emulated_instruction(vcpu);
9398	vmcs12 = get_vmcs12(vcpu);
9399
9400	if (enable_shadow_vmcs)
9401		copy_shadow_to_vmcs12(vmx);
9402
9403	/*
9404	 * The nested entry process starts with enforcing various prerequisites
9405	 * on vmcs12 as required by the Intel SDM, and acting appropriately when
9406	 * they fail: as the SDM explains, some conditions should cause the
9407	 * instruction to fail, while others will cause the instruction to seem
9408	 * to succeed, but return an EXIT_REASON_INVALID_STATE.
9409	 * To speed up the normal (success) code path, we should avoid checking
9410	 * for misconfigurations which will anyway be caught by the processor
9411	 * when using the merged vmcs02.
9412	 */
9413	if (vmcs12->launch_state == launch) {
9414		nested_vmx_failValid(vcpu,
9415			launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
9416			       : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
9417		return 1;
9418	}
9419
9420	if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
9421	    vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) {
9422		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
9423		return 1;
9424	}
9425
9426	if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
9427		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
9428		return 1;
9429	}
9430
9431	if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) {
9432		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
9433		return 1;
9434	}
9435
9436	if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) {
9437		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
9438		return 1;
9439	}
9440
9441	if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) {
9442		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
9443		return 1;
9444	}
9445
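	/*
	 * Every VMX control field in vmcs12 must be consistent with the
	 * capabilities we advertise to L1: all required-1 bits set and no
	 * unsupported bits set (see vmx_control_verify()).
	 */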
9446	if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
9447				vmx->nested.nested_vmx_true_procbased_ctls_low,
9448				vmx->nested.nested_vmx_procbased_ctls_high) ||
9449	    !vmx_control_verify(vmcs12->secondary_vm_exec_control,
9450				vmx->nested.nested_vmx_secondary_ctls_low,
9451				vmx->nested.nested_vmx_secondary_ctls_high) ||
9452	    !vmx_control_verify(vmcs12->pin_based_vm_exec_control,
9453				vmx->nested.nested_vmx_pinbased_ctls_low,
9454				vmx->nested.nested_vmx_pinbased_ctls_high) ||
9455	    !vmx_control_verify(vmcs12->vm_exit_controls,
9456				vmx->nested.nested_vmx_true_exit_ctls_low,
9457				vmx->nested.nested_vmx_exit_ctls_high) ||
9458	    !vmx_control_verify(vmcs12->vm_entry_controls,
9459				vmx->nested.nested_vmx_true_entry_ctls_low,
9460				vmx->nested.nested_vmx_entry_ctls_high))
9461	{
9462		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
9463		return 1;
9464	}
9465
9466	if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) ||
9467	    ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
9468		nested_vmx_failValid(vcpu,
9469			VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
9470		return 1;
9471	}
9472
9473	if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) ||
9474	    ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) {
9475		nested_vmx_entry_failure(vcpu, vmcs12,
9476			EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
9477		return 1;
9478	}
9479	if (vmcs12->vmcs_link_pointer != -1ull) {
9480		nested_vmx_entry_failure(vcpu, vmcs12,
9481			EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR);
9482		return 1;
9483	}
9484
9485	/*
9486	 * If the load IA32_EFER VM-entry control is 1, the following checks
9487	 * are performed on the field for the IA32_EFER MSR:
9488	 * - Bits reserved in the IA32_EFER MSR must be 0.
9489	 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
9490	 *   the IA-32e mode guest VM-exit control. It must also be identical
9491	 *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
9492	 *   CR0.PG) is 1.
9493	 */
9494	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) {
9495		ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
9496		if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
9497		    ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
9498		    ((vmcs12->guest_cr0 & X86_CR0_PG) &&
9499		     ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) {
9500			nested_vmx_entry_failure(vcpu, vmcs12,
9501				EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
9502			return 1;
9503		}
9504	}
9505
9506	/*
9507	 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
9508	 * IA32_EFER MSR must be 0 in the field for that register. In addition,
9509	 * the values of the LMA and LME bits in the field must each be that of
9510	 * the host address-space size VM-exit control.
9511	 */
9512	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
9513		ia32e = (vmcs12->vm_exit_controls &
9514			 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
9515		if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
9516		    ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
9517		    ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) {
9518			nested_vmx_entry_failure(vcpu, vmcs12,
9519				EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
9520			return 1;
9521		}
9522	}
9523
9524	/*
9525	 * We're finally done with prerequisite checking, and can start with
9526	 * the nested entry.
9527	 */
9528
9529	vmcs02 = nested_get_current_vmcs02(vmx);
9530	if (!vmcs02)
9531		return -ENOMEM;
9532
9533	enter_guest_mode(vcpu);
9534
9535	vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET);
9536
9537	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
9538		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
9539
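	/*
	 * Switch the active VMCS to vmcs02: pointing loaded_vmcs at vmcs02
	 * and doing a put/load cycle on the same CPU makes vmcs02 current
	 * on this CPU and refreshes its per-cpu host-state fields.
	 */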
9540	cpu = get_cpu();
9541	vmx->loaded_vmcs = vmcs02;
9542	vmx_vcpu_put(vcpu);
9543	vmx_vcpu_load(vcpu, cpu);
9544	vcpu->cpu = cpu;
9545	put_cpu();
9546
9547	vmx_segment_cache_clear(vmx);
9548
9549	prepare_vmcs02(vcpu, vmcs12);
9550
9551	msr_entry_idx = nested_vmx_load_msr(vcpu,
9552					    vmcs12->vm_entry_msr_load_addr,
9553					    vmcs12->vm_entry_msr_load_count);
9554	if (msr_entry_idx) {
9555		leave_guest_mode(vcpu);
9556		vmx_load_vmcs01(vcpu);
9557		nested_vmx_entry_failure(vcpu, vmcs12,
9558				EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
9559		return 1;
9560	}
9561
9562	vmcs12->launch_state = 1;
9563
9564	if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
9565		return kvm_vcpu_halt(vcpu);
9566
9567	vmx->nested.nested_run_pending = 1;
9568
9569	/*
9570	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
9571	 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
9572	 * returned as far as L1 is concerned. It will only return (and set
9573	 * the success flag) when L2 exits (see nested_vmx_vmexit()).
9574	 */
9575	return 1;
9576}
9577
9578/*
9579 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
9580 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
9581 * This function returns the new value we should put in vmcs12.guest_cr0.
9582 * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
9583 *  1. Bits that neither L0 nor L1 trapped were set directly by L2 and are now
9584 *     available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
9585 *     didn't trap the bit, because if L1 did, so would L0).
9586 *  2. Bits that L1 asked to trap (and therefore L0 also did) could not have
9587 *     been modified by L2, and L1 knows it. So just leave the old value of
9588 *     the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
9589 *     isn't relevant, because if L0 traps this bit it can set it to anything.
9590 *  3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
9591 *     changed these bits, and therefore they need to be updated, but L0
9592 *     didn't necessarily allow them to be changed in GUEST_CR0 - and rather
9593 *     put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
9594 */
9595static inline unsigned long
9596vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
9597{
9598	return
9599	/*1*/	(vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
9600	/*2*/	(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
9601	/*3*/	(vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
9602			vcpu->arch.cr0_guest_owned_bits));
9603}
9604
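/* Combine CR4 bits using the same three cases as vmcs12_guest_cr0() above. */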
9605static inline unsigned long
9606vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
9607{
9608	return
9609	/*1*/	(vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
9610	/*2*/	(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
9611	/*3*/	(vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
9612			vcpu->arch.cr4_guest_owned_bits));
9613}
9614
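/*
 * If an event (exception, NMI or interrupt) was pending re-injection
 * into L2 when the nested VM exit happened, encode it into vmcs12's
 * IDT-vectoring information fields so that L1 sees it as an event that
 * was being delivered when the exit occurred.
 */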
9615static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
9616				       struct vmcs12 *vmcs12)
9617{
9618	u32 idt_vectoring;
9619	unsigned int nr;
9620
9621	if (vcpu->arch.exception.pending && vcpu->arch.exception.reinject) {
9622		nr = vcpu->arch.exception.nr;
9623		idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
9624
9625		if (kvm_exception_is_soft(nr)) {
9626			vmcs12->vm_exit_instruction_len =
9627				vcpu->arch.event_exit_inst_len;
9628			idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
9629		} else
9630			idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
9631
9632		if (vcpu->arch.exception.has_error_code) {
9633			idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
9634			vmcs12->idt_vectoring_error_code =
9635				vcpu->arch.exception.error_code;
9636		}
9637
9638		vmcs12->idt_vectoring_info_field = idt_vectoring;
9639	} else if (vcpu->arch.nmi_injected) {
9640		vmcs12->idt_vectoring_info_field =
9641			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
9642	} else if (vcpu->arch.interrupt.pending) {
9643		nr = vcpu->arch.interrupt.nr;
9644		idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
9645
9646		if (vcpu->arch.interrupt.soft) {
9647			idt_vectoring |= INTR_TYPE_SOFT_INTR;
9648			vmcs12->vm_entry_instruction_len =
9649				vcpu->arch.event_exit_inst_len;
9650		} else
9651			idt_vectoring |= INTR_TYPE_EXT_INTR;
9652
9653		vmcs12->idt_vectoring_info_field = idt_vectoring;
9654	}
9655}
9656
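/*
 * Called while L2 is active to decide whether a pending event
 * (preemption-timer expiry, NMI or external interrupt) should cause an
 * immediate nested VM exit to L1. Returns -EBUSY if an entry into L2
 * is still pending and the exit must be deferred.
 */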
9657static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
9658{
9659	struct vcpu_vmx *vmx = to_vmx(vcpu);
9660
9661	if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
9662	    vmx->nested.preemption_timer_expired) {
9663		if (vmx->nested.nested_run_pending)
9664			return -EBUSY;
9665		nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
9666		return 0;
9667	}
9668
9669	if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
9670		if (vmx->nested.nested_run_pending ||
9671		    vcpu->arch.interrupt.pending)
9672			return -EBUSY;
9673		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
9674				  NMI_VECTOR | INTR_TYPE_NMI_INTR |
9675				  INTR_INFO_VALID_MASK, 0);
9676		/*
9677		 * The NMI-triggered VM exit counts as injection:
9678		 * clear this one and block further NMIs.
9679		 */
9680		vcpu->arch.nmi_pending = 0;
9681		vmx_set_nmi_mask(vcpu, true);
9682		return 0;
9683	}
9684
9685	if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
9686	    nested_exit_on_intr(vcpu)) {
9687		if (vmx->nested.nested_run_pending)
9688			return -EBUSY;
9689		nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
9690		return 0;
9691	}
9692
9693	return vmx_complete_nested_posted_interrupt(vcpu);
9694}
9695
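/*
 * Convert the time remaining on the emulated preemption-timer hrtimer
 * back into VMX-preemption-timer units: remaining ns * virtual TSC kHz
 * / 10^6 gives the remaining TSC ticks, which are then scaled down by
 * the emulated timer rate (a power-of-two divider of the TSC).
 */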
9696static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
9697{
9698	ktime_t remaining =
9699		hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
9700	u64 value;
9701
9702	if (ktime_to_ns(remaining) <= 0)
9703		return 0;
9704
9705	value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
9706	do_div(value, 1000000);
9707	return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
9708}
9709
9710/*
9711 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
9712 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
9713 * and this function updates it to reflect the changes to the guest state while
9714 * L2 was running (and perhaps made some exits which were handled directly by L0
9715 * without going back to L1), and to reflect the exit reason.
9716 * Note that we do not have to copy all VMCS fields here, just those that
9717 * could have been changed by the L2 guest or the exit - i.e., the guest-state
9718 * and exit-information fields only. Other fields are modified by L1 with VMWRITE,
9719 * which already writes to vmcs12 directly.
9720 */
9721static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
9722			   u32 exit_reason, u32 exit_intr_info,
9723			   unsigned long exit_qualification)
9724{
9725	/* update guest state fields: */
9726	vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
9727	vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
9728
9729	vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
9730	vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP);
9731	vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
9732
9733	vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
9734	vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
9735	vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
9736	vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
9737	vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
9738	vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
9739	vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
9740	vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
9741	vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
9742	vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
9743	vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
9744	vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
9745	vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
9746	vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
9747	vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
9748	vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
9749	vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
9750	vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
9751	vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
9752	vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
9753	vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
9754	vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
9755	vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
9756	vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
9757	vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
9758	vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
9759	vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
9760	vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
9761	vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
9762	vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
9763	vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
9764	vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
9765	vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
9766	vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
9767	vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
9768	vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
9769
9770	vmcs12->guest_interruptibility_info =
9771		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
9772	vmcs12->guest_pending_dbg_exceptions =
9773		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
9774	if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
9775		vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
9776	else
9777		vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
9778
9779	if (nested_cpu_has_preemption_timer(vmcs12)) {
9780		if (vmcs12->vm_exit_controls &
9781		    VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
9782			vmcs12->vmx_preemption_timer_value =
9783				vmx_get_preemption_timer_value(vcpu);
9784		hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
9785	}
9786
9787	/*
9788	 * In some cases (usually, nested EPT), L2 is allowed to change its
9789	 * own CR3 without exiting. If it has changed it, we must keep it.
9790	 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
9791	 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
9792	 *
9793	 * Additionally, restore L2's PDPTR to vmcs12.
9794	 */
9795	if (enable_ept) {
9796		vmcs12->guest_cr3 = vmcs_read64(GUEST_CR3);
9797		vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
9798		vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
9799		vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
9800		vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
9801	}
9802
9803	if (nested_cpu_has_vid(vmcs12))
9804		vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
9805
9806	vmcs12->vm_entry_controls =
9807		(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
9808		(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
9809
9810	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) {
9811		kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
9812		vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
9813	}
9814
9815	/* TODO: These cannot have changed unless we have MSR bitmaps and
9816	 * the relevant bit asks not to trap the change */
9817	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
9818		vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
9819	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
9820		vmcs12->guest_ia32_efer = vcpu->arch.efer;
9821	vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
9822	vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
9823	vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
9824	if (vmx_mpx_supported())
9825		vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
9826	if (nested_cpu_has_xsaves(vmcs12))
9827		vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP);
9828
9829	/* update exit information fields: */
9830
9831	vmcs12->vm_exit_reason = exit_reason;
9832	vmcs12->exit_qualification = exit_qualification;
9833
9834	vmcs12->vm_exit_intr_info = exit_intr_info;
9835	if ((vmcs12->vm_exit_intr_info &
9836	     (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
9837	    (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK))
9838		vmcs12->vm_exit_intr_error_code =
9839			vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
9840	vmcs12->idt_vectoring_info_field = 0;
9841	vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
9842	vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
9843
9844	if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
9845		/* vm_entry_intr_info_field is cleared on exit. Emulate this
9846		 * instead of reading the real value. */
9847		vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
9848
9849		/*
9850		 * Transfer the event that L0 or L1 may have wanted to inject into
9851		 * L2 to IDT_VECTORING_INFO_FIELD.
9852		 */
9853		vmcs12_save_pending_event(vcpu, vmcs12);
9854	}
9855
9856	/*
9857	 * Drop what we picked up for L2 via vmx_complete_interrupts. It is
9858	 * preserved above and would only end up incorrectly in L1.
9859	 */
9860	vcpu->arch.nmi_injected = false;
9861	kvm_clear_exception_queue(vcpu);
9862	kvm_clear_interrupt_queue(vcpu);
9863}
9864
9865/*
9866 * A part of what we need to do when the nested L2 guest exits and we want to
9867 * run its L1 parent, is to reset L1's guest state to the host state specified
9868 * in vmcs12.
9869 * This function is to be called not only on normal nested exit, but also on
9870 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
9871 * Failures During or After Loading Guest State").
9872 * This function should be called when the active VMCS is L1's (vmcs01).
9873 */
9874static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
9875				   struct vmcs12 *vmcs12)
9876{
9877	struct kvm_segment seg;
9878
9879	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
9880		vcpu->arch.efer = vmcs12->host_ia32_efer;
9881	else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
9882		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
9883	else
9884		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
9885	vmx_set_efer(vcpu, vcpu->arch.efer);
9886
9887	kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->host_rsp);
9888	kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->host_rip);
9889	vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
9890	/*
9891	 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
9892	 * actually changed, because it depends on the current state of
9893	 * fpu_active (which may have changed).
9894	 * Note that vmx_set_cr0 refers to the EFER value set above.
9895	 */
9896	vmx_set_cr0(vcpu, vmcs12->host_cr0);
9897	/*
9898	 * If we did fpu_activate()/fpu_deactivate() during L2's run, we need
9899	 * to apply the same changes to L1's vmcs. We just set cr0 correctly,
9900	 * but we also need to update cr0_guest_host_mask and exception_bitmap.
9901	 */
9902	update_exception_bitmap(vcpu);
9903	vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0);
9904	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
9905
9906	/*
9907	 * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01
9908	 * (KVM doesn't change it) - there is no reason to call set_cr4_guest_host_mask().
9909	 */
9910	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
9911	kvm_set_cr4(vcpu, vmcs12->host_cr4);
9912
9913	nested_ept_uninit_mmu_context(vcpu);
9914
9915	kvm_set_cr3(vcpu, vmcs12->host_cr3);
9916	kvm_mmu_reset_context(vcpu);
9917
9918	if (!enable_ept)
9919		vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
9920
9921	if (enable_vpid) {
9922		/*
9923		 * Trivially support vpid by letting L2s share their parent
9924		 * L1's vpid. TODO: move to a more elaborate solution, giving
9925		 * each L2 its own vpid and exposing the vpid feature to L1.
9926		 */
9927		vmx_flush_tlb(vcpu);
9928	}
9929
9930
9931	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
9932	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
9933	vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
9934	vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
9935	vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
9936
9937	/* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.  */
9938	if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
9939		vmcs_write64(GUEST_BNDCFGS, 0);
9940
9941	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
9942		vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
9943		vcpu->arch.pat = vmcs12->host_ia32_pat;
9944	}
9945	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
9946		vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
9947			vmcs12->host_ia32_perf_global_ctrl);
9948
9949	/* Set L1 segment info according to Intel SDM 27.5.2,
9950	 * "Loading Host Segment and Descriptor-Table Registers". */
9951	seg = (struct kvm_segment) {
9952		.base = 0,
9953		.limit = 0xFFFFFFFF,
9954		.selector = vmcs12->host_cs_selector,
9955		.type = 11,
9956		.present = 1,
9957		.s = 1,
9958		.g = 1
9959	};
9960	if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
9961		seg.l = 1;
9962	else
9963		seg.db = 1;
9964	vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
9965	seg = (struct kvm_segment) {
9966		.base = 0,
9967		.limit = 0xFFFFFFFF,
9968		.type = 3,
9969		.present = 1,
9970		.s = 1,
9971		.db = 1,
9972		.g = 1
9973	};
9974	seg.selector = vmcs12->host_ds_selector;
9975	vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
9976	seg.selector = vmcs12->host_es_selector;
9977	vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
9978	seg.selector = vmcs12->host_ss_selector;
9979	vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
9980	seg.selector = vmcs12->host_fs_selector;
9981	seg.base = vmcs12->host_fs_base;
9982	vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
9983	seg.selector = vmcs12->host_gs_selector;
9984	seg.base = vmcs12->host_gs_base;
9985	vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
9986	seg = (struct kvm_segment) {
9987		.base = vmcs12->host_tr_base,
9988		.limit = 0x67,
9989		.selector = vmcs12->host_tr_selector,
9990		.type = 11,
9991		.present = 1
9992	};
9993	vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
9994
9995	kvm_set_dr(vcpu, 7, 0x400);
9996	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
9997
9998	if (cpu_has_vmx_msr_bitmap())
9999		vmx_set_msr_bitmap(vcpu);
10000
10001	if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
10002				vmcs12->vm_exit_msr_load_count))
10003		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
10004}
10005
10006/*
10007 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
10008 * and modify vmcs12 to make it see what it would expect to see there if
10009 * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
10010 */
10011static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
10012			      u32 exit_intr_info,
10013			      unsigned long exit_qualification)
10014{
10015	struct vcpu_vmx *vmx = to_vmx(vcpu);
10016	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
10017
10018	/* trying to cancel vmlaunch/vmresume is a bug */
10019	WARN_ON_ONCE(vmx->nested.nested_run_pending);
10020
10021	leave_guest_mode(vcpu);
10022	prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
10023		       exit_qualification);
10024
10025	if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
10026				 vmcs12->vm_exit_msr_store_count))
10027		nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
10028
10029	vmx_load_vmcs01(vcpu);
10030
10031	if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
10032	    && nested_exit_intr_ack_set(vcpu)) {
10033		int irq = kvm_cpu_get_interrupt(vcpu);
10034		WARN_ON(irq < 0);
10035		vmcs12->vm_exit_intr_info = irq |
10036			INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
10037	}
10038
10039	trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
10040				       vmcs12->exit_qualification,
10041				       vmcs12->idt_vectoring_info_field,
10042				       vmcs12->vm_exit_intr_info,
10043				       vmcs12->vm_exit_intr_error_code,
10044				       KVM_ISA_VMX);
10045
10046	vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS));
10047	vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS));
10048	vmx_segment_cache_clear(vmx);
10049
10050	/* if no vmcs02 cache requested, remove the one we used */
10051	if (VMCS02_POOL_SIZE == 0)
10052		nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
10053
10054	load_vmcs12_host_state(vcpu, vmcs12);
10055
10056	/* Update TSC_OFFSET if TSC was changed while L2 ran */
10057	vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
10058
10059	/* This is needed for the same reason as it was needed in prepare_vmcs02 */
10060	vmx->host_rsp = 0;
10061
10062	/* Unpin physical memory we referred to in vmcs02 */
10063	if (vmx->nested.apic_access_page) {
10064		nested_release_page(vmx->nested.apic_access_page);
10065		vmx->nested.apic_access_page = NULL;
10066	}
10067	if (vmx->nested.virtual_apic_page) {
10068		nested_release_page(vmx->nested.virtual_apic_page);
10069		vmx->nested.virtual_apic_page = NULL;
10070	}
10071	if (vmx->nested.pi_desc_page) {
10072		kunmap(vmx->nested.pi_desc_page);
10073		nested_release_page(vmx->nested.pi_desc_page);
10074		vmx->nested.pi_desc_page = NULL;
10075		vmx->nested.pi_desc = NULL;
10076	}
10077
10078	/*
10079	 * While L2 was running, the mmu_notifier may have reloaded the APIC-access
10080	 * page's hpa only into the L2 vmcs; reload it for L1 before entering L1.
10081	 */
10082	kvm_vcpu_reload_apic_access_page(vcpu);
10083
10084	/*
10085	 * Exiting from L2 to L1, we're now back to L1 which thinks it just
10086	 * finished a VMLAUNCH or VMRESUME instruction, so we need to set the
10087	 * success or failure flag accordingly.
10088	 */
10089	if (unlikely(vmx->fail)) {
10090		vmx->fail = 0;
10091		nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR));
10092	} else
10093		nested_vmx_succeed(vcpu);
10094	if (enable_shadow_vmcs)
10095		vmx->nested.sync_shadow_vmcs = true;
10096
10097	/* in case we halted in L2 */
10098	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
10099}
10100
10101/*
10102 * Forcibly leave nested mode in order to be able to reset the VCPU later on.
10103 */
10104static void vmx_leave_nested(struct kvm_vcpu *vcpu)
10105{
10106	if (is_guest_mode(vcpu))
10107		nested_vmx_vmexit(vcpu, -1, 0, 0);
10108	free_nested(to_vmx(vcpu));
10109}
10110
10111/*
10112 * L1's failure to enter L2 is a subset of a normal exit, as explained in
10113 * 23.7 "VM-entry failures during or after loading guest state" (this also
10114 * lists the acceptable exit-reason and exit-qualification parameters).
10115 * It should only be called before L2 has actually started to run, and when
10116 * vmcs01 is current (it doesn't leave_guest_mode() or switch VMCSs).
10117 */
10118static void nested_vmx_entry_failure(struct kvm_vcpu *vcpu,
10119			struct vmcs12 *vmcs12,
10120			u32 reason, unsigned long qualification)
10121{
10122	load_vmcs12_host_state(vcpu, vmcs12);
10123	vmcs12->vm_exit_reason = reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
10124	vmcs12->exit_qualification = qualification;
10125	nested_vmx_succeed(vcpu);
10126	if (enable_shadow_vmcs)
10127		to_vmx(vcpu)->nested.sync_shadow_vmcs = true;
10128}
10129
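/*
 * VMX applies no additional intercept checks during instruction
 * emulation; always let the emulator continue.
 */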
10130static int vmx_check_intercept(struct kvm_vcpu *vcpu,
10131			       struct x86_instruction_info *info,
10132			       enum x86_intercept_stage stage)
10133{
10134	return X86EMUL_CONTINUE;
10135}
10136
10137static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
10138{
10139	if (ple_gap)
10140		shrink_ple_window(vcpu);
10141}
10142
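/*
 * Dirty logging was just enabled on a memslot: clear the dirty (D) bit
 * on leaf SPTEs so that new writes are recorded, and write-protect
 * large pages so that dirty tracking can be done at 4K granularity.
 */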
10143static void vmx_slot_enable_log_dirty(struct kvm *kvm,
10144				     struct kvm_memory_slot *slot)
10145{
10146	kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
10147	kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
10148}
10149
10150static void vmx_slot_disable_log_dirty(struct kvm *kvm,
10151				       struct kvm_memory_slot *slot)
10152{
10153	kvm_mmu_slot_set_dirty(kvm, slot);
10154}
10155
10156static void vmx_flush_log_dirty(struct kvm *kvm)
10157{
10158	kvm_flush_pml_buffers(kvm);
10159}
10160
10161static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
10162					   struct kvm_memory_slot *memslot,
10163					   gfn_t offset, unsigned long mask)
10164{
10165	kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
10166}
10167
10168static struct kvm_x86_ops vmx_x86_ops = {
10169	.cpu_has_kvm_support = cpu_has_kvm_support,
10170	.disabled_by_bios = vmx_disabled_by_bios,
10171	.hardware_setup = hardware_setup,
10172	.hardware_unsetup = hardware_unsetup,
10173	.check_processor_compatibility = vmx_check_processor_compat,
10174	.hardware_enable = hardware_enable,
10175	.hardware_disable = hardware_disable,
10176	.cpu_has_accelerated_tpr = report_flexpriority,
10177
10178	.vcpu_create = vmx_create_vcpu,
10179	.vcpu_free = vmx_free_vcpu,
10180	.vcpu_reset = vmx_vcpu_reset,
10181
10182	.prepare_guest_switch = vmx_save_host_state,
10183	.vcpu_load = vmx_vcpu_load,
10184	.vcpu_put = vmx_vcpu_put,
10185
10186	.update_db_bp_intercept = update_exception_bitmap,
10187	.get_msr = vmx_get_msr,
10188	.set_msr = vmx_set_msr,
10189	.get_segment_base = vmx_get_segment_base,
10190	.get_segment = vmx_get_segment,
10191	.set_segment = vmx_set_segment,
10192	.get_cpl = vmx_get_cpl,
10193	.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
10194	.decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
10195	.decache_cr3 = vmx_decache_cr3,
10196	.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
10197	.set_cr0 = vmx_set_cr0,
10198	.set_cr3 = vmx_set_cr3,
10199	.set_cr4 = vmx_set_cr4,
10200	.set_efer = vmx_set_efer,
10201	.get_idt = vmx_get_idt,
10202	.set_idt = vmx_set_idt,
10203	.get_gdt = vmx_get_gdt,
10204	.set_gdt = vmx_set_gdt,
10205	.get_dr6 = vmx_get_dr6,
10206	.set_dr6 = vmx_set_dr6,
10207	.set_dr7 = vmx_set_dr7,
10208	.sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
10209	.cache_reg = vmx_cache_reg,
10210	.get_rflags = vmx_get_rflags,
10211	.set_rflags = vmx_set_rflags,
10212	.fpu_activate = vmx_fpu_activate,
10213	.fpu_deactivate = vmx_fpu_deactivate,
10214
10215	.tlb_flush = vmx_flush_tlb,
10216
10217	.run = vmx_vcpu_run,
10218	.handle_exit = vmx_handle_exit,
10219	.skip_emulated_instruction = skip_emulated_instruction,
10220	.set_interrupt_shadow = vmx_set_interrupt_shadow,
10221	.get_interrupt_shadow = vmx_get_interrupt_shadow,
10222	.patch_hypercall = vmx_patch_hypercall,
10223	.set_irq = vmx_inject_irq,
10224	.set_nmi = vmx_inject_nmi,
10225	.queue_exception = vmx_queue_exception,
10226	.cancel_injection = vmx_cancel_injection,
10227	.interrupt_allowed = vmx_interrupt_allowed,
10228	.nmi_allowed = vmx_nmi_allowed,
10229	.get_nmi_mask = vmx_get_nmi_mask,
10230	.set_nmi_mask = vmx_set_nmi_mask,
10231	.enable_nmi_window = enable_nmi_window,
10232	.enable_irq_window = enable_irq_window,
10233	.update_cr8_intercept = update_cr8_intercept,
10234	.set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
10235	.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
10236	.vm_has_apicv = vmx_vm_has_apicv,
10237	.load_eoi_exitmap = vmx_load_eoi_exitmap,
10238	.hwapic_irr_update = vmx_hwapic_irr_update,
10239	.hwapic_isr_update = vmx_hwapic_isr_update,
10240	.sync_pir_to_irr = vmx_sync_pir_to_irr,
10241	.deliver_posted_interrupt = vmx_deliver_posted_interrupt,
10242
10243	.set_tss_addr = vmx_set_tss_addr,
10244	.get_tdp_level = get_ept_level,
10245	.get_mt_mask = vmx_get_mt_mask,
10246
10247	.get_exit_info = vmx_get_exit_info,
10248
10249	.get_lpage_level = vmx_get_lpage_level,
10250
10251	.cpuid_update = vmx_cpuid_update,
10252
10253	.rdtscp_supported = vmx_rdtscp_supported,
10254	.invpcid_supported = vmx_invpcid_supported,
10255
10256	.set_supported_cpuid = vmx_set_supported_cpuid,
10257
10258	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
10259
10260	.set_tsc_khz = vmx_set_tsc_khz,
10261	.read_tsc_offset = vmx_read_tsc_offset,
10262	.write_tsc_offset = vmx_write_tsc_offset,
10263	.adjust_tsc_offset = vmx_adjust_tsc_offset,
10264	.compute_tsc_offset = vmx_compute_tsc_offset,
10265	.read_l1_tsc = vmx_read_l1_tsc,
10266
10267	.set_tdp_cr3 = vmx_set_cr3,
10268
10269	.check_intercept = vmx_check_intercept,
10270	.handle_external_intr = vmx_handle_external_intr,
10271	.mpx_supported = vmx_mpx_supported,
10272	.xsaves_supported = vmx_xsaves_supported,
10273
10274	.check_nested_events = vmx_check_nested_events,
10275
10276	.sched_in = vmx_sched_in,
10277
10278	.slot_enable_log_dirty = vmx_slot_enable_log_dirty,
10279	.slot_disable_log_dirty = vmx_slot_disable_log_dirty,
10280	.flush_log_dirty = vmx_flush_log_dirty,
10281	.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
10282};
10283
10284static int __init vmx_init(void)
10285{
10286	int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
10287			 __alignof__(struct vcpu_vmx), THIS_MODULE);
10288	if (r)
10289		return r;
10290
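	/*
	 * On a kexec crash, VMCSs still loaded on a CPU must be cleared
	 * before the crash kernel starts; register the callback that
	 * VMCLEARs all loaded VMCSs for that path.
	 */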
10291#ifdef CONFIG_KEXEC
10292	rcu_assign_pointer(crash_vmclear_loaded_vmcss,
10293			   crash_vmclear_local_loaded_vmcss);
10294#endif
10295
10296	return 0;
10297}
10298
10299static void __exit vmx_exit(void)
10300{
10301#ifdef CONFIG_KEXEC
10302	RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
10303	synchronize_rcu();
10304#endif
10305
10306	kvm_exit();
10307}
10308
10309module_init(vmx_init)
10310module_exit(vmx_exit)
10311