/*
 * Core of Xen paravirt_ops implementation.
 *
 * This file contains the xen_paravirt_ops structure itself, and the
 * implementations for:
 * - privileged instructions
 * - interrupt flags
 * - segment operations
 * - booting and setup
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */

#include <linux/cpu.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/preempt.h>
#include <linux/hardirq.h>
#include <linux/percpu.h>
#include <linux/delay.h>
#include <linux/start_kernel.h>
#include <linux/sched.h>
#include <linux/kprobes.h>
#include <linux/bootmem.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/highmem.h>
#include <linux/console.h>
#include <linux/pci.h>
#include <linux/gfp.h>
#include <linux/memblock.h>
#include <linux/edd.h>

#ifdef CONFIG_KEXEC_CORE
#include <linux/kexec.h>
#endif

#include <xen/xen.h>
#include <xen/events.h>
#include <xen/interface/xen.h>
#include <xen/interface/version.h>
#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
#include <xen/interface/memory.h>
#include <xen/interface/nmi.h>
#include <xen/interface/xen-mca.h>
#include <xen/features.h>
#include <xen/page.h>
#include <xen/hvm.h>
#include <xen/hvc-console.h>
#include <xen/acpi.h>

#include <asm/paravirt.h>
#include <asm/apic.h>
#include <asm/page.h>
#include <asm/xen/pci.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#include <asm/fixmap.h>
#include <asm/processor.h>
#include <asm/proto.h>
#include <asm/msr-index.h>
#include <asm/traps.h>
#include <asm/setup.h>
#include <asm/desc.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/reboot.h>
#include <asm/stackprotector.h>
#include <asm/hypervisor.h>
#include <asm/mach_traps.h>
#include <asm/mwait.h>
#include <asm/pci_x86.h>
#include <asm/pat.h>
#include <asm/cpu.h>

#ifdef CONFIG_ACPI
#include <linux/acpi.h>
#include <asm/acpi.h>
#include <acpi/pdc_intel.h>
#include <acpi/processor.h>
#include <xen/interface/platform.h>
#endif

#include "xen-ops.h"
#include "mmu.h"
#include "smp.h"
#include "multicalls.h"
#include "pmu.h"

EXPORT_SYMBOL_GPL(hypercall_page);

/*
 * Pointer to the xen_vcpu_info structure or
 * &HYPERVISOR_shared_info->vcpu_info[cpu]. See xen_hvm_init_shared_info
 * and xen_vcpu_setup for details. By default it points to
 * shared_info->vcpu_info, but if the hypervisor supports
 * VCPUOP_register_vcpu_info then it can point to xen_vcpu_info. The
 * pointer is used in __xen_evtchn_do_upcall to acknowledge pending events.
 * It is also used, more subtly, by the patched versions of the irq
 * enable/disable operations, e.g. xen_irq_enable_direct and xen_iret in
 * PV mode.
 *
 * The desire to be able to do those mask/unmask operations as a single
 * instruction by using the per-cpu offset held in %gs is the real reason
 * vcpu info is in a per-cpu pointer and the original reason for this
 * hypercall.
 */
DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);

/*
 * Per CPU pages used if hypervisor supports VCPUOP_register_vcpu_info
 * hypercall. This can be used both in PV and PVHVM mode. The structure
 * overrides the default per_cpu(xen_vcpu, cpu) value.
 */
DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);

enum xen_domain_type xen_domain_type = XEN_NATIVE;
EXPORT_SYMBOL_GPL(xen_domain_type);

unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
EXPORT_SYMBOL(machine_to_phys_mapping);
unsigned long  machine_to_phys_nr;
EXPORT_SYMBOL(machine_to_phys_nr);

struct start_info *xen_start_info;
EXPORT_SYMBOL_GPL(xen_start_info);

struct shared_info xen_dummy_shared_info;

void *xen_initial_gdt;

RESERVE_BRK(shared_info_page_brk, PAGE_SIZE);
__read_mostly int xen_have_vector_callback;
EXPORT_SYMBOL_GPL(xen_have_vector_callback);

/*
 * Point at some empty memory to start with. We map the real shared_info
 * page as soon as fixmap is up and running.
 */
struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;

/*
 * Flag to determine whether vcpu info placement is available on all
 * VCPUs.  We assume it is to start with, and then set it to zero on
 * the first failure.  This is because it can succeed on some VCPUs
 * and not others, since it can involve hypervisor memory allocation,
 * or because the guest failed to guarantee all the appropriate
 * constraints on all VCPUs (i.e. the buffer can't cross a page boundary).
 *
 * Note that any particular CPU may be using a placed vcpu structure,
 * but we can only optimise if they all are.
 *
 * 0: not available, 1: available
 */
static int have_vcpu_info_placement = 1;

struct tls_descs {
	struct desc_struct desc[3];
};

/*
 * Updating the 3 TLS descriptors in the GDT on every task switch is
 * surprisingly expensive so we avoid updating them if they haven't
 * changed.  Since Xen writes different descriptors than the ones
 * passed in the update_descriptor hypercall we keep shadow copies to
 * compare against.
 */
static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);

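/*
 * Without vcpu_info placement we can only reach the vcpu_info slots in
 * the shared_info page, so cap the number of CPUs that setup will try
 * to bring up at MAX_VIRT_CPUS.
 */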
static void clamp_max_cpus(void)
{
#ifdef CONFIG_SMP
	if (setup_max_cpus > MAX_VIRT_CPUS)
		setup_max_cpus = MAX_VIRT_CPUS;
#endif
}

static void xen_vcpu_setup(int cpu)
{
	struct vcpu_register_vcpu_info info;
	int err;
	struct vcpu_info *vcpup;

	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);

	/*
	 * This path is called twice on PVHVM - first during bootup via
	 * smp_init -> xen_hvm_cpu_notify, and then if the VCPU is being
	 * hotplugged: cpu_up -> xen_hvm_cpu_notify.
	 * As we can only do the VCPUOP_register_vcpu_info once, let's
	 * not overwrite its result.
	 *
	 * For PV it is called during restore (xen_vcpu_restore) and bootup
	 * (xen_setup_vcpu_info_placement). The hotplug mechanism does not
	 * use this function.
	 */
	if (xen_hvm_domain()) {
		if (per_cpu(xen_vcpu, cpu) == &per_cpu(xen_vcpu_info, cpu))
			return;
	}
	if (cpu < MAX_VIRT_CPUS)
		per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];

	if (!have_vcpu_info_placement) {
		if (cpu >= MAX_VIRT_CPUS)
			clamp_max_cpus();
		return;
	}

	vcpup = &per_cpu(xen_vcpu_info, cpu);
	info.mfn = arbitrary_virt_to_mfn(vcpup);
	info.offset = offset_in_page(vcpup);

	/* Check to see if the hypervisor will put the vcpu_info
	   structure where we want it, which allows direct access via
	   a percpu-variable.
	   N.B. This hypercall can _only_ be called once per CPU. Subsequent
	   calls will error out with -EINVAL. This is due to the fact that the
	   hypervisor has no unregister variant and this hypercall does not
	   allow overwriting info.mfn and info.offset.
	 */
	err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info);

	if (err) {
		printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err);
		have_vcpu_info_placement = 0;
		clamp_max_cpus();
	} else {
		/* This cpu is using the registered vcpu info, even if
		   later ones fail to. */
		per_cpu(xen_vcpu, cpu) = vcpup;
	}
}

/*
 * On restore, set the vcpu placement up again.
 * If it fails, then we're in a bad state, since
 * we can't back out from using it...
 */
void xen_vcpu_restore(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		bool other_cpu = (cpu != smp_processor_id());
		bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL);

		if (other_cpu && is_up &&
		    HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL))
			BUG();

		xen_setup_runstate_info(cpu);

		if (have_vcpu_info_placement)
			xen_vcpu_setup(cpu);

		if (other_cpu && is_up &&
		    HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
			BUG();
	}
}

static void __init xen_banner(void)
{
	unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
	struct xen_extraversion extra;

	HYPERVISOR_xen_version(XENVER_extraversion, &extra);

	pr_info("Booting paravirtualized kernel %son %s\n",
		xen_feature(XENFEAT_auto_translated_physmap) ?
			"with PVH extensions " : "", pv_info.name);
	printk(KERN_INFO "Xen version: %d.%d%s%s\n",
	       version >> 16, version & 0xffff, extra.extraversion,
	       xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
}

/* Check if running on Xen version (major, minor) or later */
bool
xen_running_on_version_or_later(unsigned int major, unsigned int minor)
{
	unsigned int version;

	if (!xen_domain())
		return false;

	version = HYPERVISOR_xen_version(XENVER_version, NULL);
	if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) ||
		((version >> 16) > major))
		return true;
	return false;
}

#define CPUID_THERM_POWER_LEAF 6
#define APERFMPERF_PRESENT 0

static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;

static __read_mostly unsigned int cpuid_leaf1_ecx_set_mask;
static __read_mostly unsigned int cpuid_leaf5_ecx_val;
static __read_mostly unsigned int cpuid_leaf5_edx_val;

static void xen_cpuid(unsigned int *ax, unsigned int *bx,
		      unsigned int *cx, unsigned int *dx)
{
	unsigned maskebx = ~0;
	unsigned maskecx = ~0;
	unsigned maskedx = ~0;
	unsigned setecx = 0;

	/*
	 * Mask out inconvenient features, to try and disable as many
	 * unsupported kernel subsystems as possible.
	 */
	switch (*ax) {
	case 1:
		maskecx = cpuid_leaf1_ecx_mask;
		setecx = cpuid_leaf1_ecx_set_mask;
		maskedx = cpuid_leaf1_edx_mask;
		break;

	case CPUID_MWAIT_LEAF:
		/* Synthesize the values.. */
		*ax = 0;
		*bx = 0;
		*cx = cpuid_leaf5_ecx_val;
		*dx = cpuid_leaf5_edx_val;
		return;

	case CPUID_THERM_POWER_LEAF:
		/* Disabling APERFMPERF for kernel usage */
		maskecx = ~(1 << APERFMPERF_PRESENT);
		break;

	case 0xb:
		/* Suppress extended topology stuff */
		maskebx = 0;
		break;
	}

	asm(XEN_EMULATE_PREFIX "cpuid"
		: "=a" (*ax),
		  "=b" (*bx),
		  "=c" (*cx),
		  "=d" (*dx)
		: "0" (*ax), "2" (*cx));

	*bx &= maskebx;
	*cx &= maskecx;
	*cx |= setecx;
	*dx &= maskedx;
}

static bool __init xen_check_mwait(void)
{
#ifdef CONFIG_ACPI
	struct xen_platform_op op = {
		.cmd			= XENPF_set_processor_pminfo,
		.u.set_pminfo.id	= -1,
		.u.set_pminfo.type	= XEN_PM_PDC,
	};
	uint32_t buf[3];
	unsigned int ax, bx, cx, dx;
	unsigned int mwait_mask;

	/* We need to determine whether it is OK to expose the MWAIT
	 * capability to the kernel to harvest deeper than C3 states from ACPI
	 * _CST using the processor_harvest_xen.c module. For this to work, we
	 * need to gather the MWAIT_LEAF values (which the cstate.c code
	 * checks against). The hypervisor won't expose the MWAIT flag because
	 * it would break backwards compatibility; so we will find out directly
	 * from the hardware and hypercall.
	 */
	if (!xen_initial_domain())
		return false;

	/*
	 * When running on a platform earlier than Xen 4.2, do not expose
	 * MWAIT, to avoid the risk of loading the native ACPI PAD driver.
	 */
	if (!xen_running_on_version_or_later(4, 2))
		return false;

	ax = 1;
	cx = 0;

	native_cpuid(&ax, &bx, &cx, &dx);

	mwait_mask = (1 << (X86_FEATURE_EST % 32)) |
		     (1 << (X86_FEATURE_MWAIT % 32));

	if ((cx & mwait_mask) != mwait_mask)
		return false;

	/* We need to emulate the MWAIT_LEAF and for that we need both
	 * ecx and edx. The hypercall provides only partial information.
	 */

	ax = CPUID_MWAIT_LEAF;
	bx = 0;
	cx = 0;
	dx = 0;

	native_cpuid(&ax, &bx, &cx, &dx);

	/* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so,
	 * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3.
	 */
	buf[0] = ACPI_PDC_REVISION_ID;
	buf[1] = 1;
	buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP);

	set_xen_guest_handle(op.u.set_pminfo.pdc, buf);

	if ((HYPERVISOR_dom0_op(&op) == 0) &&
	    (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) {
		cpuid_leaf5_ecx_val = cx;
		cpuid_leaf5_edx_val = dx;
	}
	return true;
#else
	return false;
#endif
}
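
/*
 * Build the CPUID masks applied by xen_cpuid(): hide MTRR and thermal
 * monitoring (and ACPI for domU), mask out X2APIC, drop the XSAVE and
 * OSXSAVE bits unless Xen has already enabled OSXSAVE, and advertise
 * MWAIT when xen_check_mwait() says it is safe to do so.
 */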
static void __init xen_init_cpuid_mask(void)
{
	unsigned int ax, bx, cx, dx;
	unsigned int xsave_mask;

	cpuid_leaf1_edx_mask =
		~((1 << X86_FEATURE_MTRR) |  /* disable MTRR */
		  (1 << X86_FEATURE_ACC));   /* thermal monitoring */

	if (!xen_initial_domain())
		cpuid_leaf1_edx_mask &=
			~((1 << X86_FEATURE_ACPI));  /* disable ACPI */

	cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_X2APIC % 32));

	ax = 1;
	cx = 0;
	cpuid(1, &ax, &bx, &cx, &dx);

	xsave_mask =
		(1 << (X86_FEATURE_XSAVE % 32)) |
		(1 << (X86_FEATURE_OSXSAVE % 32));

	/* Xen will set CR4.OSXSAVE if supported and not disabled by force */
	if ((cx & xsave_mask) != xsave_mask)
		cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */
	if (xen_check_mwait())
		cpuid_leaf1_ecx_set_mask = (1 << (X86_FEATURE_MWAIT % 32));
}

static void xen_set_debugreg(int reg, unsigned long val)
{
	HYPERVISOR_set_debugreg(reg, val);
}

static unsigned long xen_get_debugreg(int reg)
{
	return HYPERVISOR_get_debugreg(reg);
}

static void xen_end_context_switch(struct task_struct *next)
{
	xen_mc_flush();
	paravirt_end_context_switch(next);
}

static unsigned long xen_store_tr(void)
{
	return 0;
}

/*
 * Set the page permissions for a particular virtual address.  If the
 * address is a vmalloc mapping (or other non-linear mapping), then
 * find the linear mapping of the page and also set its protections to
 * match.
 */
static void set_aliased_prot(void *v, pgprot_t prot)
{
	int level;
	pte_t *ptep;
	pte_t pte;
	unsigned long pfn;
	struct page *page;
	unsigned char dummy;

	ptep = lookup_address((unsigned long)v, &level);
	BUG_ON(ptep == NULL);

	pfn = pte_pfn(*ptep);
	page = pfn_to_page(pfn);

	pte = pfn_pte(pfn, prot);

	/*
	 * Careful: update_va_mapping() will fail if the virtual address
	 * we're poking isn't populated in the page tables.  We don't
	 * need to worry about the direct map (that's always in the page
	 * tables), but we need to be careful about vmap space.  In
	 * particular, the top level page table can lazily propagate
	 * entries between processes, so if we've switched mms since we
	 * vmapped the target in the first place, we might not have the
	 * top-level page table entry populated.
	 *
	 * We disable preemption because we want the same mm active when
	 * we probe the target and when we issue the hypercall.  We'll
	 * have the same nominal mm, but if we're a kernel thread, lazy
	 * mm dropping could change our pgd.
	 *
	 * Out of an abundance of caution, this uses __get_user() to fault
	 * in the target address just in case there's some obscure case
	 * in which the target address isn't readable.
	 */

	preempt_disable();

	pagefault_disable();	/* Avoid warnings due to being atomic. */
	__get_user(dummy, (unsigned char __user __force *)v);
	pagefault_enable();

	if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
		BUG();

	if (!PageHighMem(page)) {
		void *av = __va(PFN_PHYS(pfn));

		if (av != v)
			if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
				BUG();
	} else
		kmap_flush_unused();

	preempt_enable();
}

static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
{
	const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
	int i;

	/*
	 * We need to mark all aliases of the LDT pages RO.  We
	 * don't need to call vm_flush_aliases(), though, since that's
	 * only responsible for flushing aliases out of the TLBs, not the
	 * page tables, and Xen will flush the TLB for us if needed.
	 *
	 * To avoid confusing future readers: none of this is necessary
	 * to load the LDT.  The hypervisor only checks this when the
	 * LDT is faulted in due to subsequent descriptor access.
	 */

	for (i = 0; i < entries; i += entries_per_page)
		set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
}

static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
{
	const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
	int i;

	for (i = 0; i < entries; i += entries_per_page)
		set_aliased_prot(ldt + i, PAGE_KERNEL);
}

static void xen_set_ldt(const void *addr, unsigned entries)
{
	struct mmuext_op *op;
	struct multicall_space mcs = xen_mc_entry(sizeof(*op));

	trace_xen_cpu_set_ldt(addr, entries);

	op = mcs.args;
	op->cmd = MMUEXT_SET_LDT;
	op->arg1.linear_addr = (unsigned long)addr;
	op->arg2.nr_ents = entries;

	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_CPU);
}

static void xen_load_gdt(const struct desc_ptr *dtr)
{
	unsigned long va = dtr->address;
	unsigned int size = dtr->size + 1;
	unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
	unsigned long frames[pages];
	int f;

	/*
	 * A GDT can be up to 64k in size, which corresponds to 8192
	 * 8-byte entries, or 16 4k pages.
	 */

	BUG_ON(size > 65536);
	BUG_ON(va & ~PAGE_MASK);

	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
		int level;
		pte_t *ptep;
		unsigned long pfn, mfn;
		void *virt;

		/*
		 * The GDT is per-cpu and is in the percpu data area.
		 * That can be virtually mapped, so we need to do a
		 * page-walk to get the underlying MFN for the
		 * hypercall.  The page can also be in the kernel's
		 * linear range, so we need to make that mapping RO too.
		 */
		ptep = lookup_address(va, &level);
		BUG_ON(ptep == NULL);

		pfn = pte_pfn(*ptep);
		mfn = pfn_to_mfn(pfn);
		virt = __va(PFN_PHYS(pfn));

		frames[f] = mfn;

		make_lowmem_page_readonly((void *)va);
		make_lowmem_page_readonly(virt);
	}

	if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
		BUG();
}

/*
 * load_gdt for early boot, when the gdt is only mapped once
 */
static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
{
	unsigned long va = dtr->address;
	unsigned int size = dtr->size + 1;
	unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
	unsigned long frames[pages];
	int f;

	/*
	 * A GDT can be up to 64k in size, which corresponds to 8192
	 * 8-byte entries, or 16 4k pages.
	 */

	BUG_ON(size > 65536);
	BUG_ON(va & ~PAGE_MASK);

	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
		pte_t pte;
		unsigned long pfn, mfn;

		pfn = virt_to_pfn(va);
		mfn = pfn_to_mfn(pfn);

		pte = pfn_pte(pfn, PAGE_KERNEL_RO);

		if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0))
			BUG();

		frames[f] = mfn;
	}

	if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
		BUG();
}

static inline bool desc_equal(const struct desc_struct *d1,
			      const struct desc_struct *d2)
{
	return d1->a == d2->a && d1->b == d2->b;
}

static void load_TLS_descriptor(struct thread_struct *t,
				unsigned int cpu, unsigned int i)
{
	struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
	struct desc_struct *gdt;
	xmaddr_t maddr;
	struct multicall_space mc;

	if (desc_equal(shadow, &t->tls_array[i]))
		return;

	*shadow = t->tls_array[i];

	gdt = get_cpu_gdt_table(cpu);
	maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
	mc = __xen_mc_entry(0);

	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
}

static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
{
	/*
	 * XXX sleazy hack: If we're being called in a lazy-cpu zone
	 * and lazy gs handling is enabled, it means we're in a
	 * context switch, and %gs has just been saved.  This means we
	 * can zero it out to prevent faults on exit from the
	 * hypervisor if the next process has no %gs.  Either way, it
	 * has been saved, and the new value will get loaded properly.
	 * This will go away as soon as Xen has been modified to not
	 * save/restore %gs for normal hypercalls.
	 *
	 * On x86_64, this hack is not used for %gs, because gs points
	 * to KERNEL_GS_BASE (and uses it for PDA references), so we
	 * must not zero %gs on x86_64.
	 *
	 * For x86_64, we need to zero %fs, otherwise we may get an
	 * exception between the new %fs descriptor being loaded and
	 * %fs being effectively cleared at __switch_to().
	 */
	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
#ifdef CONFIG_X86_32
		lazy_load_gs(0);
#else
		loadsegment(fs, 0);
#endif
	}

	xen_mc_batch();

	load_TLS_descriptor(t, cpu, 0);
	load_TLS_descriptor(t, cpu, 1);
	load_TLS_descriptor(t, cpu, 2);

	xen_mc_issue(PARAVIRT_LAZY_CPU);
}

#ifdef CONFIG_X86_64
static void xen_load_gs_index(unsigned int idx)
{
	if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
		BUG();
}
#endif

static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
				const void *ptr)
{
	xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
	u64 entry = *(u64 *)ptr;

	trace_xen_cpu_write_ldt_entry(dt, entrynum, entry);

	preempt_disable();

	xen_mc_flush();
	if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
		BUG();

	preempt_enable();
}

static int cvt_gate_to_trap(int vector, const gate_desc *val,
			    struct trap_info *info)
{
	unsigned long addr;

	if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
		return 0;

	info->vector = vector;

	addr = gate_offset(*val);
#ifdef CONFIG_X86_64
	/*
	 * Look for known traps using IST, and substitute them
	 * appropriately.  The debugger ones are the only ones we care
	 * about.  Xen will handle faults like double_fault,
	 * so we should never see them.  Warn if
	 * there's an unexpected IST-using fault handler.
	 */
	if (addr == (unsigned long)debug)
		addr = (unsigned long)xen_debug;
	else if (addr == (unsigned long)int3)
		addr = (unsigned long)xen_int3;
	else if (addr == (unsigned long)stack_segment)
		addr = (unsigned long)xen_stack_segment;
	else if (addr == (unsigned long)double_fault) {
		/* Don't need to handle these */
		return 0;
#ifdef CONFIG_X86_MCE
	} else if (addr == (unsigned long)machine_check) {
		/*
		 * When the Xen hypervisor injects a vMCE into the guest,
		 * use the native MCE handler to handle it.
		 */
		;
#endif
	} else if (addr == (unsigned long)nmi)
		/*
		 * Use the native version as well.
		 */
		;
	else {
		/* Some other trap using IST? */
		if (WARN_ON(val->ist != 0))
			return 0;
	}
#endif	/* CONFIG_X86_64 */
	info->address = addr;

	info->cs = gate_segment(*val);
	info->flags = val->dpl;
	/* interrupt gates clear IF */
	if (val->type == GATE_INTERRUPT)
		info->flags |= 1 << 2;

	return 1;
}

/* Locations of each CPU's IDT */
static DEFINE_PER_CPU(struct desc_ptr, idt_desc);

/* Set an IDT entry.  If the entry is part of the current IDT, then
   also update Xen. */
static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
{
	unsigned long p = (unsigned long)&dt[entrynum];
	unsigned long start, end;

	trace_xen_cpu_write_idt_entry(dt, entrynum, g);

	preempt_disable();

	start = __this_cpu_read(idt_desc.address);
	end = start + __this_cpu_read(idt_desc.size) + 1;

	xen_mc_flush();

	native_write_idt_entry(dt, entrynum, g);

	if (p >= start && (p + 8) <= end) {
		struct trap_info info[2];

		info[1].address = 0;

		if (cvt_gate_to_trap(entrynum, g, &info[0]))
			if (HYPERVISOR_set_trap_table(info))
				BUG();
	}

	preempt_enable();
}

static void xen_convert_trap_info(const struct desc_ptr *desc,
				  struct trap_info *traps)
{
	unsigned in, out, count;

	count = (desc->size+1) / sizeof(gate_desc);
	BUG_ON(count > 256);

	for (in = out = 0; in < count; in++) {
		gate_desc *entry = (gate_desc *)(desc->address) + in;

		if (cvt_gate_to_trap(in, entry, &traps[out]))
			out++;
	}
	traps[out].address = 0;
}

void xen_copy_trap_info(struct trap_info *traps)
{
	const struct desc_ptr *desc = this_cpu_ptr(&idt_desc);

	xen_convert_trap_info(desc, traps);
}

/* Load a new IDT into Xen.  In principle this can be per-CPU, so we
   hold a spinlock to protect the static traps[] array (static because
   it avoids allocation, and saves stack space). */
static void xen_load_idt(const struct desc_ptr *desc)
{
	static DEFINE_SPINLOCK(lock);
	static struct trap_info traps[257];

	trace_xen_cpu_load_idt(desc);

	spin_lock(&lock);

	memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc));

	xen_convert_trap_info(desc, traps);

	xen_mc_flush();
	if (HYPERVISOR_set_trap_table(traps))
		BUG();

	spin_unlock(&lock);
}

/* Write a GDT descriptor entry.  Ignore LDT descriptors, since
   they're handled differently. */
static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
				const void *desc, int type)
{
	trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);

	preempt_disable();

	switch (type) {
	case DESC_LDT:
	case DESC_TSS:
		/* ignore */
		break;

	default: {
		xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);

		xen_mc_flush();
		if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
			BUG();
	}

	}

	preempt_enable();
}

/*
 * Version of write_gdt_entry for use at early boot-time needed to
 * update an entry as simply as possible.
 */
static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
					    const void *desc, int type)
{
	trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);

	switch (type) {
	case DESC_LDT:
	case DESC_TSS:
		/* ignore */
		break;

	default: {
		xmaddr_t maddr = virt_to_machine(&dt[entry]);

		if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
			dt[entry] = *(struct desc_struct *)desc;
	}

	}
}

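/*
 * Tell Xen which kernel stack to switch to on a ring transition (the PV
 * replacement for writing sp0 into the TSS directly), then mirror the
 * value into the local TSS copy.
 */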
static void xen_load_sp0(struct tss_struct *tss,
			 struct thread_struct *thread)
{
	struct multicall_space mcs;

	mcs = xen_mc_entry(0);
	MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
	xen_mc_issue(PARAVIRT_LAZY_CPU);
	tss->x86_tss.sp0 = thread->sp0;
}

void xen_set_iopl_mask(unsigned mask)
{
	struct physdev_set_iopl set_iopl;

	/* Force the change at ring 0. */
	set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
	HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
}

static void xen_io_delay(void)
{
}

static void xen_clts(void)
{
	struct multicall_space mcs;

	mcs = xen_mc_entry(0);

	MULTI_fpu_taskswitch(mcs.mc, 0);

	xen_mc_issue(PARAVIRT_LAZY_CPU);
}

static DEFINE_PER_CPU(unsigned long, xen_cr0_value);

static unsigned long xen_read_cr0(void)
{
	unsigned long cr0 = this_cpu_read(xen_cr0_value);

	if (unlikely(cr0 == 0)) {
		cr0 = native_read_cr0();
		this_cpu_write(xen_cr0_value, cr0);
	}

	return cr0;
}

static void xen_write_cr0(unsigned long cr0)
{
	struct multicall_space mcs;

	this_cpu_write(xen_cr0_value, cr0);

	/* Only pay attention to cr0.TS; everything else is
	   ignored. */
	mcs = xen_mc_entry(0);

	MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);

	xen_mc_issue(PARAVIRT_LAZY_CPU);
}

static void xen_write_cr4(unsigned long cr4)
{
	cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE);

	native_write_cr4(cr4);
}

#ifdef CONFIG_X86_64
static inline unsigned long xen_read_cr8(void)
{
	return 0;
}

static inline void xen_write_cr8(unsigned long val)
{
	BUG_ON(val);
}
#endif

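/*
 * MSR access: PMU-related MSRs are handled by the xen_pmu code, while
 * everything else goes through the native safe accessors.  The x2APIC
 * enable bit is masked out of MSR_IA32_APICBASE when the CPU does not
 * advertise x2APIC, so the kernel does not try to use it.
 */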
static u64 xen_read_msr_safe(unsigned int msr, int *err)
{
	u64 val;

	if (pmu_msr_read(msr, &val, err))
		return val;

	val = native_read_msr_safe(msr, err);
	switch (msr) {
	case MSR_IA32_APICBASE:
#ifdef CONFIG_X86_X2APIC
		if (!(cpuid_ecx(1) & (1 << (X86_FEATURE_X2APIC & 31))))
#endif
			val &= ~X2APIC_ENABLE;
		break;
	}
	return val;
}

static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
{
	int ret;

	ret = 0;

	switch (msr) {
#ifdef CONFIG_X86_64
		unsigned which;
		u64 base;

	case MSR_FS_BASE:		which = SEGBASE_FS; goto set;
	case MSR_KERNEL_GS_BASE:	which = SEGBASE_GS_USER; goto set;
	case MSR_GS_BASE:		which = SEGBASE_GS_KERNEL; goto set;

	set:
		base = ((u64)high << 32) | low;
		if (HYPERVISOR_set_segment_base(which, base) != 0)
			ret = -EIO;
		break;
#endif

	case MSR_STAR:
	case MSR_CSTAR:
	case MSR_LSTAR:
	case MSR_SYSCALL_MASK:
	case MSR_IA32_SYSENTER_CS:
	case MSR_IA32_SYSENTER_ESP:
	case MSR_IA32_SYSENTER_EIP:
		/* Fast syscall setup is all done in hypercalls, so
		   these are all ignored.  Stub them out here to stop
		   Xen console noise. */
		break;

	default:
		if (!pmu_msr_write(msr, low, high, &ret))
			ret = native_write_msr_safe(msr, low, high);
	}

	return ret;
}

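/*
 * Map the real shared_info page: through the fixmap for PV guests, or
 * with a direct __va() translation for auto-translated (PVH) guests.
 * On UP this is also where vcpu_info placement gets set up.
 */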
void xen_setup_shared_info(void)
{
	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
		set_fixmap(FIX_PARAVIRT_BOOTMAP,
			   xen_start_info->shared_info);

		HYPERVISOR_shared_info =
			(struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
	} else
		HYPERVISOR_shared_info =
			(struct shared_info *)__va(xen_start_info->shared_info);

#ifndef CONFIG_SMP
	/* In UP this is as good a place as any to set up shared info */
	xen_setup_vcpu_info_placement();
#endif

	xen_setup_mfn_list_list();
}

/* This is called once we have the cpu_possible_mask */
void xen_setup_vcpu_info_placement(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		xen_vcpu_setup(cpu);

	/* xen_vcpu_setup managed to place the vcpu_info within the
	 * percpu area for all cpus, so make use of it. Note that for
	 * PVH we want to use the native IRQ mechanism. */
	if (have_vcpu_info_placement && !xen_pvh_domain()) {
		pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
		pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
		pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
		pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
		pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
	}
}

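/*
 * Inline-patch the irq enable/disable/save/restore call sites with
 * their "direct" variants when vcpu_info placement is in use; anything
 * that does not fit in the patch site falls back to the default
 * paravirt patching.
 */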
static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
			  unsigned long addr, unsigned len)
{
	char *start, *end, *reloc;
	unsigned ret;

	start = end = reloc = NULL;

#define SITE(op, x)							\
	case PARAVIRT_PATCH(op.x):					\
	if (have_vcpu_info_placement) {					\
		start = (char *)xen_##x##_direct;			\
		end = xen_##x##_direct_end;				\
		reloc = xen_##x##_direct_reloc;				\
	}								\
	goto patch_site

	switch (type) {
		SITE(pv_irq_ops, irq_enable);
		SITE(pv_irq_ops, irq_disable);
		SITE(pv_irq_ops, save_fl);
		SITE(pv_irq_ops, restore_fl);
#undef SITE

	patch_site:
		if (start == NULL || (end-start) > len)
			goto default_patch;

		ret = paravirt_patch_insns(insnbuf, len, start, end);

		/* Note: because reloc is assigned from something that
		   appears to be an array, gcc assumes it's non-null,
		   but doesn't know its relationship with start and
		   end. */
		if (reloc > start && reloc < end) {
			int reloc_off = reloc - start;
			long *relocp = (long *)(insnbuf + reloc_off);
			long delta = start - (char *)addr;

			*relocp += delta;
		}
		break;

	default_patch:
	default:
		ret = paravirt_patch_default(type, clobbers, insnbuf,
					     addr, len);
		break;
	}

	return ret;
}

static const struct pv_info xen_info __initconst = {
	.paravirt_enabled = 1,
	.shared_kernel_pmd = 0,

#ifdef CONFIG_X86_64
	.extra_user_64bit_cs = FLAT_USER_CS64,
#endif
	.features = 0,
	.name = "Xen",
};

static const struct pv_init_ops xen_init_ops __initconst = {
	.patch = xen_patch,
};

static const struct pv_cpu_ops xen_cpu_ops __initconst = {
	.cpuid = xen_cpuid,

	.set_debugreg = xen_set_debugreg,
	.get_debugreg = xen_get_debugreg,

	.clts = xen_clts,

	.read_cr0 = xen_read_cr0,
	.write_cr0 = xen_write_cr0,

	.read_cr4 = native_read_cr4,
	.read_cr4_safe = native_read_cr4_safe,
	.write_cr4 = xen_write_cr4,

#ifdef CONFIG_X86_64
	.read_cr8 = xen_read_cr8,
	.write_cr8 = xen_write_cr8,
#endif

	.wbinvd = native_wbinvd,

	.read_msr = xen_read_msr_safe,
	.write_msr = xen_write_msr_safe,

	.read_pmc = xen_read_pmc,

	.iret = xen_iret,
#ifdef CONFIG_X86_64
	.usergs_sysret32 = xen_sysret32,
	.usergs_sysret64 = xen_sysret64,
#else
	.irq_enable_sysexit = xen_sysexit,
#endif

	.load_tr_desc = paravirt_nop,
	.set_ldt = xen_set_ldt,
	.load_gdt = xen_load_gdt,
	.load_idt = xen_load_idt,
	.load_tls = xen_load_tls,
#ifdef CONFIG_X86_64
	.load_gs_index = xen_load_gs_index,
#endif

	.alloc_ldt = xen_alloc_ldt,
	.free_ldt = xen_free_ldt,

	.store_idt = native_store_idt,
	.store_tr = xen_store_tr,

	.write_ldt_entry = xen_write_ldt_entry,
	.write_gdt_entry = xen_write_gdt_entry,
	.write_idt_entry = xen_write_idt_entry,
	.load_sp0 = xen_load_sp0,

	.set_iopl_mask = xen_set_iopl_mask,
	.io_delay = xen_io_delay,

	/* Xen takes care of %gs when switching to usermode for us */
	.swapgs = paravirt_nop,

	.start_context_switch = paravirt_start_context_switch,
	.end_context_switch = xen_end_context_switch,
};

static const struct pv_apic_ops xen_apic_ops __initconst = {
#ifdef CONFIG_X86_LOCAL_APIC
	.startup_ipi_hook = paravirt_nop,
#endif
};

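/*
 * Shut the domain down with the given reason: finish PMU use on every
 * online CPU, then ask the hypervisor via SCHEDOP_shutdown.
 */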
static void xen_reboot(int reason)
{
	struct sched_shutdown r = { .reason = reason };
	int cpu;

	for_each_online_cpu(cpu)
		xen_pmu_finish(cpu);

	if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
		BUG();
}

static void xen_restart(char *msg)
{
	xen_reboot(SHUTDOWN_reboot);
}

static void xen_emergency_restart(void)
{
	xen_reboot(SHUTDOWN_reboot);
}

static void xen_machine_halt(void)
{
	xen_reboot(SHUTDOWN_poweroff);
}

static void xen_machine_power_off(void)
{
	if (pm_power_off)
		pm_power_off();
	xen_reboot(SHUTDOWN_poweroff);
}

static void xen_crash_shutdown(struct pt_regs *regs)
{
	xen_reboot(SHUTDOWN_crash);
}

static int
xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
{
	xen_reboot(SHUTDOWN_crash);
	return NOTIFY_DONE;
}

static struct notifier_block xen_panic_block = {
	.notifier_call = xen_panic_event,
	.priority = INT_MIN
};

int xen_panic_handler_init(void)
{
	atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block);
	return 0;
}

static const struct machine_ops xen_machine_ops __initconst = {
	.restart = xen_restart,
	.halt = xen_machine_halt,
	.power_off = xen_machine_power_off,
	.shutdown = xen_machine_halt,
	.crash_shutdown = xen_crash_shutdown,
	.emergency_restart = xen_emergency_restart,
};

static unsigned char xen_get_nmi_reason(void)
{
	unsigned char reason = 0;

	/* Construct a value which looks like it came from port 0x61. */
	if (test_bit(_XEN_NMIREASON_io_error,
		     &HYPERVISOR_shared_info->arch.nmi_reason))
		reason |= NMI_REASON_IOCHK;
	if (test_bit(_XEN_NMIREASON_pci_serr,
		     &HYPERVISOR_shared_info->arch.nmi_reason))
		reason |= NMI_REASON_SERR;

	return reason;
}

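/*
 * A PV dom0 does not go through the real-mode boot code that normally
 * collects EDD data, so fetch the disk info and MBR signatures from the
 * hypervisor's firmware info and stash them in boot_params where the
 * EDD code expects to find them.
 */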
static void __init xen_boot_params_init_edd(void)
{
#if IS_ENABLED(CONFIG_EDD)
	struct xen_platform_op op;
	struct edd_info *edd_info;
	u32 *mbr_signature;
	unsigned nr;
	int ret;

	edd_info = boot_params.eddbuf;
	mbr_signature = boot_params.edd_mbr_sig_buffer;

	op.cmd = XENPF_firmware_info;

	op.u.firmware_info.type = XEN_FW_DISK_INFO;
	for (nr = 0; nr < EDDMAXNR; nr++) {
		struct edd_info *info = edd_info + nr;

		op.u.firmware_info.index = nr;
		info->params.length = sizeof(info->params);
		set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params,
				     &info->params);
		ret = HYPERVISOR_dom0_op(&op);
		if (ret)
			break;

#define C(x) info->x = op.u.firmware_info.u.disk_info.x
		C(device);
		C(version);
		C(interface_support);
		C(legacy_max_cylinder);
		C(legacy_max_head);
		C(legacy_sectors_per_track);
#undef C
	}
	boot_params.eddbuf_entries = nr;

	op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE;
	for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) {
		op.u.firmware_info.index = nr;
		ret = HYPERVISOR_dom0_op(&op);
		if (ret)
			break;
		mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature;
	}
	boot_params.edd_mbr_sig_buf_entries = nr;
#endif
}

/*
 * Set up the GDT and segment registers for -fstack-protector.  Until
 * we do this, we have to be careful not to call any stack-protected
 * function, which is most of the kernel.
 *
 * Note that it is __ref because the only caller of this after init
 * is PVH, which is not going to use xen_load_gdt_boot or other
 * __init functions.
 */
static void __ref xen_setup_gdt(int cpu)
{
	if (xen_feature(XENFEAT_auto_translated_physmap)) {
#ifdef CONFIG_X86_64
		unsigned long dummy;

		load_percpu_segment(cpu); /* We need to access per-cpu area */
		switch_to_new_gdt(cpu); /* GDT and GS set */

		/*
		 * We are switching from the Xen-provided GDT to our HVM mode
		 * GDT. The new GDT has __KERNEL_CS with CS.L = 1
		 * and we are jumping to reload it.
		 */
		asm volatile ("pushq %0\n"
			      "leaq 1f(%%rip),%0\n"
			      "pushq %0\n"
			      "lretq\n"
			      "1:\n"
			      : "=&r" (dummy) : "0" (__KERNEL_CS));

		/*
		 * While not needed, we also set the %es, %ds, and %fs
		 * to zero. We don't care about %ss as it is NULL.
		 * Strictly speaking this is not needed as Xen zeros those
		 * out (and also MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE).
		 *
		 * Linux zeros them in cpu_init() and in secondary_startup_64
		 * (for the BSP).
		 */
		loadsegment(es, 0);
		loadsegment(ds, 0);
		loadsegment(fs, 0);
#else
		/* PVH: TODO Implement. */
		BUG();
#endif
		return; /* PVH does not need any PV GDT ops. */
	}
	pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
	pv_cpu_ops.load_gdt = xen_load_gdt_boot;

	setup_stack_canary_segment(0);
	switch_to_new_gdt(0);

	pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry;
	pv_cpu_ops.load_gdt = xen_load_gdt;
}

#ifdef CONFIG_XEN_PVH
/*
 * A PV guest starts with default flags that are not set for PVH, set them
 * here asap.
 */
static void xen_pvh_set_cr_flags(int cpu)
{
	/* Some of these are set up in 'secondary_startup_64'. The others:
	 * X86_CR0_TS, X86_CR0_PE, X86_CR0_ET are set by Xen for HVM guests
	 * (whose code paths PVH shares), while X86_CR0_PG is for PVH. */
	write_cr0(read_cr0() | X86_CR0_MP | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM);

	if (!cpu)
		return;
	/*
	 * For the BSP, PSE and PGE are set in probe_page_size_mask(); for APs,
	 * set them here. For all CPUs, OSFXSR and OSXMMEXCPT are set in
	 * fpu__init_cpu().
	 */
	if (cpu_has_pse)
		cr4_set_bits_and_update_boot(X86_CR4_PSE);

	if (cpu_has_pge)
		cr4_set_bits_and_update_boot(X86_CR4_PGE);
}

/*
 * Note that it is __ref because the only caller of this after init
 * is PVH, which is not going to use xen_load_gdt_boot or other
 * __init functions.
 */
void __ref xen_pvh_secondary_vcpu_init(int cpu)
{
	xen_setup_gdt(cpu);
	xen_pvh_set_cr_flags(cpu);
}

static void __init xen_pvh_early_guest_init(void)
{
	if (!xen_feature(XENFEAT_auto_translated_physmap))
		return;

	if (!xen_feature(XENFEAT_hvm_callback_vector))
		return;

	xen_have_vector_callback = 1;

	xen_pvh_early_cpu_init(0, false);
	xen_pvh_set_cr_flags(0);

#ifdef CONFIG_X86_32
	BUG(); /* PVH: Implement proper support. */
#endif
}
#endif    /* CONFIG_XEN_PVH */

/* First C function to be called on Xen boot */
asmlinkage __visible void __init xen_start_kernel(void)
{
	struct physdev_set_iopl set_iopl;
	unsigned long initrd_start = 0;
	u64 pat;
	int rc;

	if (!xen_start_info)
		return;

	xen_domain_type = XEN_PV_DOMAIN;

	xen_setup_features();
#ifdef CONFIG_XEN_PVH
	xen_pvh_early_guest_init();
#endif
	xen_setup_machphys_mapping();

	/* Install Xen paravirt ops */
	pv_info = xen_info;
	if (xen_initial_domain())
		pv_info.features |= PV_SUPPORTED_RTC;
	pv_init_ops = xen_init_ops;
	pv_apic_ops = xen_apic_ops;
	if (!xen_pvh_domain()) {
		pv_cpu_ops = xen_cpu_ops;

		x86_platform.get_nmi_reason = xen_get_nmi_reason;
	}

	if (xen_feature(XENFEAT_auto_translated_physmap))
		x86_init.resources.memory_setup = xen_auto_xlated_memory_setup;
	else
		x86_init.resources.memory_setup = xen_memory_setup;
	x86_init.oem.arch_setup = xen_arch_setup;
	x86_init.oem.banner = xen_banner;

	xen_init_time_ops();

	/*
	 * Set up some pagetable state before starting to set any ptes.
	 */

	xen_init_mmu_ops();

	/* Prevent unwanted bits from being set in PTEs. */
	__supported_pte_mask &= ~_PAGE_GLOBAL;

	/*
	 * Prevent page tables from being allocated in highmem, even
	 * if CONFIG_HIGHPTE is enabled.
	 */
	__userpte_alloc_gfp &= ~__GFP_HIGHMEM;

	/* Work out if we support NX */
	x86_configure_nx();

	/* Get mfn list */
	xen_build_dynamic_phys_to_machine();

	/*
	 * Set up kernel GDT and segment registers, mainly so that
	 * -fstack-protector code can be executed.
	 */
	xen_setup_gdt(0);

	xen_init_irq_ops();
	xen_init_cpuid_mask();

#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * set up the basic apic ops.
	 */
	xen_init_apic();
#endif

	if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
		pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
		pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
	}

	machine_ops = xen_machine_ops;

	/*
	 * The only reliable way to retain the initial address of the
	 * percpu gdt_page is to remember it here, so we can go and
	 * mark it RW later, when the initial percpu area is freed.
	 */
	xen_initial_gdt = &per_cpu(gdt_page, 0);

	xen_smp_init();

#ifdef CONFIG_ACPI_NUMA
	/*
	 * The pages we get from Xen are not related to machine pages, so
	 * any NUMA information the kernel tries to get from ACPI will
	 * be meaningless.  Prevent it from trying.
	 */
	acpi_numa = -1;
#endif
	/* Don't do the full vcpu_info placement stuff until we have a
	   possible map and a non-dummy shared_info. */
	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];

	local_irq_disable();
	early_boot_irqs_disabled = true;

	xen_raw_console_write("mapping kernel into physical memory\n");
	xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base,
				   xen_start_info->nr_pages);
	xen_reserve_special_pages();

	/*
	 * Modify the cache mode translation tables to match Xen's PAT
	 * configuration.
	 */
	rdmsrl(MSR_IA32_CR_PAT, pat);
	pat_init_cache_modes(pat);

	/* keep using Xen gdt for now; no urgent need to change it */

#ifdef CONFIG_X86_32
	pv_info.kernel_rpl = 1;
	if (xen_feature(XENFEAT_supervisor_mode_kernel))
		pv_info.kernel_rpl = 0;
#else
	pv_info.kernel_rpl = 0;
#endif
	/* set the limit of our address space */
	xen_reserve_top();

	/* PVH: runs at the default kernel iopl of 0 */
	if (!xen_pvh_domain()) {
		/*
		 * We used to do this in xen_arch_setup, but that is too late
		 * on AMD, where early_cpu_init (run before ->arch_setup())
		 * calls early_amd_init, which pokes the 0xcf8 port.
		 */
		set_iopl.iopl = 1;
		rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
		if (rc != 0)
			xen_raw_printk("physdev_op failed %d\n", rc);
	}

#ifdef CONFIG_X86_32
	/* set up basic CPUID stuff */
	cpu_detect(&new_cpu_data);
	set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
	new_cpu_data.wp_works_ok = 1;
	new_cpu_data.x86_capability[0] = cpuid_edx(1);
#endif

	if (xen_start_info->mod_start) {
		if (xen_start_info->flags & SIF_MOD_START_PFN)
			initrd_start = PFN_PHYS(xen_start_info->mod_start);
		else
			initrd_start = __pa(xen_start_info->mod_start);
	}

	/* Poke various useful things into boot_params */
	boot_params.hdr.type_of_loader = (9 << 4) | 0;
	boot_params.hdr.ramdisk_image = initrd_start;
	boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
	boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);

	if (!xen_initial_domain()) {
		add_preferred_console("xenboot", 0, NULL);
		add_preferred_console("tty", 0, NULL);
		add_preferred_console("hvc", 0, NULL);
		if (pci_xen)
			x86_init.pci.arch_init = pci_xen_init;
	} else {
		const struct dom0_vga_console_info *info =
			(void *)((char *)xen_start_info +
				 xen_start_info->console.dom0.info_off);
		struct xen_platform_op op = {
			.cmd = XENPF_firmware_info,
			.interface_version = XENPF_INTERFACE_VERSION,
			.u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS,
		};

		xen_init_vga(info, xen_start_info->console.dom0.info_size);
		xen_start_info->console.domU.mfn = 0;
		xen_start_info->console.domU.evtchn = 0;

		if (HYPERVISOR_dom0_op(&op) == 0)
			boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags;

		/* Make sure ACS will be enabled */
		pci_request_acs();

		xen_acpi_sleep_register();

		/* Avoid searching for BIOS MP tables */
		x86_init.mpparse.find_smp_config = x86_init_noop;
		x86_init.mpparse.get_smp_config = x86_init_uint_noop;

		xen_boot_params_init_edd();
	}
#ifdef CONFIG_PCI
	/* PCI BIOS service won't work from a PV guest. */
	pci_probe &= ~PCI_PROBE_BIOS;
#endif
	xen_raw_console_write("about to get started...\n");

	xen_setup_runstate_info(0);

	xen_efi_init();

	/* Start the world */
#ifdef CONFIG_X86_32
	i386_start_kernel();
#else
	cr4_init_shadow(); /* 32b kernel does this in i386_start_kernel() */
	x86_64_start_reservations((char *)__pa_symbol(&boot_params));
#endif
}

void __ref xen_hvm_init_shared_info(void)
{
	int cpu;
	struct xen_add_to_physmap xatp;
	static struct shared_info *shared_info_page;

	if (!shared_info_page)
		shared_info_page = (struct shared_info *)
			extend_brk(PAGE_SIZE, PAGE_SIZE);
	xatp.domid = DOMID_SELF;
	xatp.idx = 0;
	xatp.space = XENMAPSPACE_shared_info;
	xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
	if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
		BUG();

	HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;

	/* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
	 * page; we use it in the event channel upcall and in some pvclock
	 * related functions. We don't need the vcpu_info placement
	 * optimizations because we don't use any pv_mmu or pv_irq op on
	 * HVM.
	 * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
	 * online, but xen_hvm_init_shared_info is run at resume time too,
	 * and in that case multiple vcpus might be online. */
	for_each_online_cpu(cpu) {
		/* Leave it to be NULL. */
		if (cpu >= MAX_VIRT_CPUS)
			continue;
		per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
	}
}

#ifdef CONFIG_XEN_PVHVM
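/*
 * Query the Xen CPUID leaves: print the hypervisor version, write the
 * PFN of hypercall_page to the MSR reported by leaf 2 so the hypervisor
 * populates it, read the feature flags and mark the domain as HVM.
 */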
static void __init init_hvm_pv_info(void)
{
	int major, minor;
	uint32_t eax, ebx, ecx, edx, pages, msr, base;
	u64 pfn;

	base = xen_cpuid_base();
	cpuid(base + 1, &eax, &ebx, &ecx, &edx);

	major = eax >> 16;
	minor = eax & 0xffff;
	printk(KERN_INFO "Xen version %d.%d.\n", major, minor);

	cpuid(base + 2, &pages, &msr, &ecx, &edx);

	pfn = __pa(hypercall_page);
	wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));

	xen_setup_features();

	pv_info.name = "Xen HVM";

	xen_domain_type = XEN_HVM_DOMAIN;
}

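/*
 * CPU hotplug notifier for PVHVM: when a VCPU is about to come up,
 * register its vcpu_info and, if vector callbacks and safe pvclock are
 * available, set up its per-cpu timer.
 */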
static int xen_hvm_cpu_notify(struct notifier_block *self, unsigned long action,
			      void *hcpu)
{
	int cpu = (long)hcpu;

	switch (action) {
	case CPU_UP_PREPARE:
		xen_vcpu_setup(cpu);
		if (xen_have_vector_callback) {
			if (xen_feature(XENFEAT_hvm_safe_pvclock))
				xen_setup_timer(cpu);
		}
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block xen_hvm_cpu_notifier = {
	.notifier_call	= xen_hvm_cpu_notify,
};

#ifdef CONFIG_KEXEC_CORE
static void xen_hvm_shutdown(void)
{
	native_machine_shutdown();
	if (kexec_in_progress)
		xen_reboot(SHUTDOWN_soft_reset);
}

static void xen_hvm_crash_shutdown(struct pt_regs *regs)
{
	native_machine_crash_shutdown(regs);
	xen_reboot(SHUTDOWN_soft_reset);
}
#endif

static void __init xen_hvm_guest_init(void)
{
	if (xen_pv_domain())
		return;

	init_hvm_pv_info();

	xen_hvm_init_shared_info();

	xen_panic_handler_init();

	if (xen_feature(XENFEAT_hvm_callback_vector))
		xen_have_vector_callback = 1;
	xen_hvm_smp_init();
	register_cpu_notifier(&xen_hvm_cpu_notifier);
	xen_unplug_emulated_devices();
	x86_init.irqs.intr_init = xen_init_IRQ;
	xen_hvm_init_time_ops();
	xen_hvm_init_mmu_ops();
#ifdef CONFIG_KEXEC_CORE
	machine_ops.shutdown = xen_hvm_shutdown;
	machine_ops.crash_shutdown = xen_hvm_crash_shutdown;
#endif
}
#endif

static bool xen_nopv;
static __init int xen_parse_nopv(char *arg)
{
	xen_nopv = true;
	return 0;
}
early_param("xen_nopv", xen_parse_nopv);

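/*
 * Hypervisor detection hook: return the Xen CPUID base (non-zero means
 * we are running on Xen), or 0 if PV extensions were disabled with the
 * xen_nopv parameter.
 */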
static uint32_t __init xen_platform(void)
{
	if (xen_nopv)
		return 0;

	return xen_cpuid_base();
}

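/*
 * An HVM guest still needs the emulated local APIC unless it can take
 * event-channel interrupts through the vector callback and has PV
 * pirqs available; PV guests (and non-Xen systems) never need it.
 */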
bool xen_hvm_need_lapic(void)
{
	if (xen_nopv)
		return false;
	if (xen_pv_domain())
		return false;
	if (!xen_hvm_domain())
		return false;
	if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback)
		return false;
	return true;
}
EXPORT_SYMBOL_GPL(xen_hvm_need_lapic);

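/*
 * Mark PV guests with X86_FEATURE_XENPV and clear the SYSRET SS
 * attribute bug flag, whose workaround is not applicable under Xen PV.
 */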
static void xen_set_cpu_features(struct cpuinfo_x86 *c)
{
	if (xen_pv_domain()) {
		clear_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
		set_cpu_cap(c, X86_FEATURE_XENPV);
	}
}

const struct hypervisor_x86 x86_hyper_xen = {
	.name			= "Xen",
	.detect			= xen_platform,
#ifdef CONFIG_XEN_PVHVM
	.init_platform		= xen_hvm_guest_init,
#endif
	.x2apic_available	= xen_x2apic_para_available,
	.set_cpu_features       = xen_set_cpu_features,
};
EXPORT_SYMBOL(x86_hyper_xen);

#ifdef CONFIG_HOTPLUG_CPU
void xen_arch_register_cpu(int num)
{
	arch_register_cpu(num);
}
EXPORT_SYMBOL(xen_arch_register_cpu);

void xen_arch_unregister_cpu(int num)
{
	arch_unregister_cpu(num);
}
EXPORT_SYMBOL(xen_arch_unregister_cpu);
#endif