1/*
2 * Xen mmu operations
3 *
4 * This file contains the various mmu fetch and update operations.
5 * The most important job they must perform is the mapping between the
6 * domain's pfn and the overall machine mfns.
7 *
8 * Xen allows guests to directly update the pagetable, in a controlled
9 * fashion.  In other words, the guest modifies the same pagetable
10 * that the CPU actually uses, which eliminates the overhead of having
11 * a separate shadow pagetable.
12 *
13 * In order to allow this, it falls on the guest domain to map its
14 * notion of a "physical" pfn - which is just a domain-local linear
15 * address - into a real "machine address" which the CPU's MMU can
16 * use.
17 *
18 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
19 * inserted directly into the pagetable.  When creating a new
20 * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
21 * when reading the content back with __(pgd|pmd|pte)_val, it converts
22 * the mfn back into a pfn.
23 *
24 * The other constraint is that all pages which make up a pagetable
25 * must be mapped read-only in the guest.  This prevents uncontrolled
26 * guest updates to the pagetable.  Xen strictly enforces this, and
27 * will disallow any pagetable update which will end up mapping a
28 * pagetable page RW, and will disallow using any writable page as a
29 * pagetable.
30 *
31 * Naively, when loading %cr3 with the base of a new pagetable, Xen
32 * would need to validate the whole pagetable before going on.
33 * Naturally, this is quite slow.  The solution is to "pin" a
34 * pagetable, which enforces all the constraints on the pagetable even
 * when it is not actively in use.  This means that Xen can be assured
36 * that it is still valid when you do load it into %cr3, and doesn't
37 * need to revalidate it.
38 *
39 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
40 */
41#include <linux/sched.h>
42#include <linux/highmem.h>
43#include <linux/debugfs.h>
44#include <linux/bug.h>
45#include <linux/vmalloc.h>
46#include <linux/module.h>
47#include <linux/gfp.h>
48#include <linux/memblock.h>
49#include <linux/seq_file.h>
50#include <linux/crash_dump.h>
51
52#include <trace/events/xen.h>
53
54#include <asm/pgtable.h>
55#include <asm/tlbflush.h>
56#include <asm/fixmap.h>
57#include <asm/mmu_context.h>
58#include <asm/setup.h>
59#include <asm/paravirt.h>
60#include <asm/e820.h>
61#include <asm/linkage.h>
62#include <asm/page.h>
63#include <asm/init.h>
64#include <asm/pat.h>
65#include <asm/smp.h>
66
67#include <asm/xen/hypercall.h>
68#include <asm/xen/hypervisor.h>
69
70#include <xen/xen.h>
71#include <xen/page.h>
72#include <xen/interface/xen.h>
73#include <xen/interface/hvm/hvm_op.h>
74#include <xen/interface/version.h>
75#include <xen/interface/memory.h>
76#include <xen/hvc-console.h>
77
78#include "multicalls.h"
79#include "mmu.h"
80#include "debugfs.h"
81
82/*
83 * Protects atomic reservation decrease/increase against concurrent increases.
84 * Also protects non-atomic updates of current_pages and balloon lists.
85 */
86DEFINE_SPINLOCK(xen_reservation_lock);
87
88#ifdef CONFIG_X86_32
89/*
 * Identity map, in addition to the plain kernel map.  This needs to be
 * large enough to hold the page table pages needed to map the rest.
92 * Each page can map 2MB.
93 */
94#define LEVEL1_IDENT_ENTRIES	(PTRS_PER_PTE * 4)
95static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
96#endif
97#ifdef CONFIG_X86_64
98/* l3 pud for userspace vsyscall mapping */
99static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
100#endif /* CONFIG_X86_64 */
101
102/*
103 * Note about cr3 (pagetable base) values:
104 *
105 * xen_cr3 contains the current logical cr3 value; it contains the
106 * last set cr3.  This may not be the current effective cr3, because
 * its update may still be lazily deferred.  However, a vcpu looking
 * at its own cr3 can use this value knowing that everything will
 * be self-consistent.
110 *
111 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
112 * hypercall to set the vcpu cr3 is complete (so it may be a little
113 * out of date, but it will never be set early).  If one vcpu is
114 * looking at another vcpu's cr3 value, it should use this variable.
115 */
116DEFINE_PER_CPU(unsigned long, xen_cr3);	 /* cr3 stored as physaddr */
117DEFINE_PER_CPU(unsigned long, xen_current_cr3);	 /* actual vcpu cr3 */
118
119static phys_addr_t xen_pt_base, xen_pt_size __initdata;
120
121/*
122 * Just beyond the highest usermode address.  STACK_TOP_MAX has a
123 * redzone above it, so round it up to a PGD boundary.
124 */
125#define USER_LIMIT	((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
126
127unsigned long arbitrary_virt_to_mfn(void *vaddr)
128{
129	xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
130
131	return PFN_DOWN(maddr.maddr);
132}
133
134xmaddr_t arbitrary_virt_to_machine(void *vaddr)
135{
136	unsigned long address = (unsigned long)vaddr;
137	unsigned int level;
138	pte_t *pte;
139	unsigned offset;
140
141	/*
	 * if the vaddr is in the linear mapped range, we can just use
	 * the (quick) virt_to_machine() p2m lookup
144	 */
145	if (virt_addr_valid(vaddr))
146		return virt_to_machine(vaddr);
147
148	/* otherwise we have to do a (slower) full page-table walk */
149
150	pte = lookup_address(address, &level);
151	BUG_ON(pte == NULL);
152	offset = address & ~PAGE_MASK;
153	return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
154}
155EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
156
157void make_lowmem_page_readonly(void *vaddr)
158{
159	pte_t *pte, ptev;
160	unsigned long address = (unsigned long)vaddr;
161	unsigned int level;
162
163	pte = lookup_address(address, &level);
164	if (pte == NULL)
165		return;		/* vaddr missing */
166
167	ptev = pte_wrprotect(*pte);
168
169	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
170		BUG();
171}
172
173void make_lowmem_page_readwrite(void *vaddr)
174{
175	pte_t *pte, ptev;
176	unsigned long address = (unsigned long)vaddr;
177	unsigned int level;
178
179	pte = lookup_address(address, &level);
180	if (pte == NULL)
181		return;		/* vaddr missing */
182
183	ptev = pte_mkwrite(*pte);
184
185	if (HYPERVISOR_update_va_mapping(address, ptev, 0))
186		BUG();
187}
188
189
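/* Is the page backing @ptr part of a pinned pagetable? */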
190static bool xen_page_pinned(void *ptr)
191{
192	struct page *page = virt_to_page(ptr);
193
194	return PagePinned(page);
195}
196
197void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
198{
199	struct multicall_space mcs;
200	struct mmu_update *u;
201
202	trace_xen_mmu_set_domain_pte(ptep, pteval, domid);
203
204	mcs = xen_mc_entry(sizeof(*u));
205	u = mcs.args;
206
207	/* ptep might be kmapped when using 32-bit HIGHPTE */
208	u->ptr = virt_to_machine(ptep).maddr;
209	u->val = pte_val_ma(pteval);
210
211	MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
212
213	xen_mc_issue(PARAVIRT_LAZY_MMU);
214}
215EXPORT_SYMBOL_GPL(xen_set_domain_pte);
216
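/*
 * Append an mmu_update request to the current multicall batch: extend the
 * previous MULTI_mmu_update call if that is what the batch ends with,
 * otherwise start a new one.
 */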
217static void xen_extend_mmu_update(const struct mmu_update *update)
218{
219	struct multicall_space mcs;
220	struct mmu_update *u;
221
222	mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
223
224	if (mcs.mc != NULL) {
225		mcs.mc->args[1]++;
226	} else {
227		mcs = __xen_mc_entry(sizeof(*u));
228		MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
229	}
230
231	u = mcs.args;
232	*u = *update;
233}
234
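/* As xen_extend_mmu_update(), but for mmuext ops. */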
235static void xen_extend_mmuext_op(const struct mmuext_op *op)
236{
237	struct multicall_space mcs;
238	struct mmuext_op *u;
239
240	mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u));
241
242	if (mcs.mc != NULL) {
243		mcs.mc->args[1]++;
244	} else {
245		mcs = __xen_mc_entry(sizeof(*u));
246		MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
247	}
248
249	u = mcs.args;
250	*u = *op;
251}
252
253static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
254{
255	struct mmu_update u;
256
257	preempt_disable();
258
259	xen_mc_batch();
260
261	/* ptr may be ioremapped for 64-bit pagetable setup */
262	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
263	u.val = pmd_val_ma(val);
264	xen_extend_mmu_update(&u);
265
266	xen_mc_issue(PARAVIRT_LAZY_MMU);
267
268	preempt_enable();
269}
270
271static void xen_set_pmd(pmd_t *ptr, pmd_t val)
272{
273	trace_xen_mmu_set_pmd(ptr, val);
274
275	/* If page is not pinned, we can just update the entry
276	   directly */
277	if (!xen_page_pinned(ptr)) {
278		*ptr = val;
279		return;
280	}
281
282	xen_set_pmd_hyper(ptr, val);
283}
284
285/*
286 * Associate a virtual page frame with a given physical page frame
287 * and protection flags for that frame.
288 */
289void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
290{
291	set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
292}
293
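/*
 * Try to queue a pte update into the current lazy-MMU multicall batch.
 * Returns false if we are not in lazy MMU mode, in which case the caller
 * must issue the update itself.
 */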
294static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
295{
296	struct mmu_update u;
297
298	if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
299		return false;
300
301	xen_mc_batch();
302
303	u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
304	u.val = pte_val_ma(pteval);
305	xen_extend_mmu_update(&u);
306
307	xen_mc_issue(PARAVIRT_LAZY_MMU);
308
309	return true;
310}
311
312static inline void __xen_set_pte(pte_t *ptep, pte_t pteval)
313{
314	if (!xen_batched_set_pte(ptep, pteval)) {
315		/*
316		 * Could call native_set_pte() here and trap and
317		 * emulate the PTE write but with 32-bit guests this
318		 * needs two traps (one for each of the two 32-bit
319		 * words in the PTE) so do one hypercall directly
320		 * instead.
321		 */
322		struct mmu_update u;
323
324		u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
325		u.val = pte_val_ma(pteval);
326		HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF);
327	}
328}
329
330static void xen_set_pte(pte_t *ptep, pte_t pteval)
331{
332	trace_xen_mmu_set_pte(ptep, pteval);
333	__xen_set_pte(ptep, pteval);
334}
335
336static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
337		    pte_t *ptep, pte_t pteval)
338{
339	trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval);
340	__xen_set_pte(ptep, pteval);
341}
342
343pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
344				 unsigned long addr, pte_t *ptep)
345{
346	/* Just return the pte as-is.  We preserve the bits on commit */
347	trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
348	return *ptep;
349}
350
351void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
352				 pte_t *ptep, pte_t pte)
353{
354	struct mmu_update u;
355
356	trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
357	xen_mc_batch();
358
359	u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
360	u.val = pte_val_ma(pte);
361	xen_extend_mmu_update(&u);
362
363	xen_mc_issue(PARAVIRT_LAZY_MMU);
364}
365
366/* Assume pteval_t is equivalent to all the other *val_t types. */
367static pteval_t pte_mfn_to_pfn(pteval_t val)
368{
369	if (val & _PAGE_PRESENT) {
370		unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
371		unsigned long pfn = mfn_to_pfn(mfn);
372
373		pteval_t flags = val & PTE_FLAGS_MASK;
374		if (unlikely(pfn == ~0))
375			val = flags & ~_PAGE_PRESENT;
376		else
377			val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
378	}
379
380	return val;
381}
382
383static pteval_t pte_pfn_to_mfn(pteval_t val)
384{
385	if (val & _PAGE_PRESENT) {
386		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
387		pteval_t flags = val & PTE_FLAGS_MASK;
388		unsigned long mfn;
389
390		if (!xen_feature(XENFEAT_auto_translated_physmap))
391			mfn = __pfn_to_mfn(pfn);
392		else
393			mfn = pfn;
394		/*
395		 * If there's no mfn for the pfn, then just create an
396		 * empty non-present pte.  Unfortunately this loses
397		 * information about the original pfn, so
398		 * pte_mfn_to_pfn is asymmetric.
399		 */
400		if (unlikely(mfn == INVALID_P2M_ENTRY)) {
401			mfn = 0;
402			flags = 0;
403		} else
404			mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
405		val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
406	}
407
408	return val;
409}
410
411__visible pteval_t xen_pte_val(pte_t pte)
412{
413	pteval_t pteval = pte.pte;
414
415	return pte_mfn_to_pfn(pteval);
416}
417PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
418
419__visible pgdval_t xen_pgd_val(pgd_t pgd)
420{
421	return pte_mfn_to_pfn(pgd.pgd);
422}
423PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
424
425__visible pte_t xen_make_pte(pteval_t pte)
426{
427	pte = pte_pfn_to_mfn(pte);
428
429	return native_make_pte(pte);
430}
431PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
432
433__visible pgd_t xen_make_pgd(pgdval_t pgd)
434{
435	pgd = pte_pfn_to_mfn(pgd);
436	return native_make_pgd(pgd);
437}
438PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
439
440__visible pmdval_t xen_pmd_val(pmd_t pmd)
441{
442	return pte_mfn_to_pfn(pmd.pmd);
443}
444PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
445
446static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
447{
448	struct mmu_update u;
449
450	preempt_disable();
451
452	xen_mc_batch();
453
454	/* ptr may be ioremapped for 64-bit pagetable setup */
455	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
456	u.val = pud_val_ma(val);
457	xen_extend_mmu_update(&u);
458
459	xen_mc_issue(PARAVIRT_LAZY_MMU);
460
461	preempt_enable();
462}
463
464static void xen_set_pud(pud_t *ptr, pud_t val)
465{
466	trace_xen_mmu_set_pud(ptr, val);
467
468	/* If page is not pinned, we can just update the entry
469	   directly */
470	if (!xen_page_pinned(ptr)) {
471		*ptr = val;
472		return;
473	}
474
475	xen_set_pud_hyper(ptr, val);
476}
477
478#ifdef CONFIG_X86_PAE
479static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
480{
481	trace_xen_mmu_set_pte_atomic(ptep, pte);
482	set_64bit((u64 *)ptep, native_pte_val(pte));
483}
484
485static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
486{
487	trace_xen_mmu_pte_clear(mm, addr, ptep);
488	if (!xen_batched_set_pte(ptep, native_make_pte(0)))
489		native_pte_clear(mm, addr, ptep);
490}
491
492static void xen_pmd_clear(pmd_t *pmdp)
493{
494	trace_xen_mmu_pmd_clear(pmdp);
495	set_pmd(pmdp, __pmd(0));
496}
497#endif	/* CONFIG_X86_PAE */
498
499__visible pmd_t xen_make_pmd(pmdval_t pmd)
500{
501	pmd = pte_pfn_to_mfn(pmd);
502	return native_make_pmd(pmd);
503}
504PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
505
506#if CONFIG_PGTABLE_LEVELS == 4
507__visible pudval_t xen_pud_val(pud_t pud)
508{
509	return pte_mfn_to_pfn(pud.pud);
510}
511PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
512
513__visible pud_t xen_make_pud(pudval_t pud)
514{
515	pud = pte_pfn_to_mfn(pud);
516
517	return native_make_pud(pud);
518}
519PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
520
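/*
 * Given a pointer into a kernel pgd, return the matching entry in the
 * corresponding user-mode pgd (allocated by xen_pgd_alloc() and stashed in
 * the pgd page's ->private field), or NULL if there is no user pgd or the
 * entry is beyond USER_LIMIT.
 */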
521static pgd_t *xen_get_user_pgd(pgd_t *pgd)
522{
523	pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
524	unsigned offset = pgd - pgd_page;
525	pgd_t *user_ptr = NULL;
526
527	if (offset < pgd_index(USER_LIMIT)) {
528		struct page *page = virt_to_page(pgd_page);
529		user_ptr = (pgd_t *)page->private;
530		if (user_ptr)
531			user_ptr += offset;
532	}
533
534	return user_ptr;
535}
536
537static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
538{
539	struct mmu_update u;
540
541	u.ptr = virt_to_machine(ptr).maddr;
542	u.val = pgd_val_ma(val);
543	xen_extend_mmu_update(&u);
544}
545
546/*
 * Raw hypercall-based set_pgd, intended for use in early boot before
 * there's a page structure.  This implies:
549 *  1. The only existing pagetable is the kernel's
550 *  2. It is always pinned
551 *  3. It has no user pagetable attached to it
552 */
553static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
554{
555	preempt_disable();
556
557	xen_mc_batch();
558
559	__xen_set_pgd_hyper(ptr, val);
560
561	xen_mc_issue(PARAVIRT_LAZY_MMU);
562
563	preempt_enable();
564}
565
566static void xen_set_pgd(pgd_t *ptr, pgd_t val)
567{
568	pgd_t *user_ptr = xen_get_user_pgd(ptr);
569
570	trace_xen_mmu_set_pgd(ptr, user_ptr, val);
571
572	/* If page is not pinned, we can just update the entry
573	   directly */
574	if (!xen_page_pinned(ptr)) {
575		*ptr = val;
576		if (user_ptr) {
577			WARN_ON(xen_page_pinned(user_ptr));
578			*user_ptr = val;
579		}
580		return;
581	}
582
583	/* If it's pinned, then we can at least batch the kernel and
584	   user updates together. */
585	xen_mc_batch();
586
587	__xen_set_pgd_hyper(ptr, val);
588	if (user_ptr)
589		__xen_set_pgd_hyper(user_ptr, val);
590
591	xen_mc_issue(PARAVIRT_LAZY_MMU);
592}
593#endif	/* CONFIG_PGTABLE_LEVELS == 4 */
594
595/*
596 * (Yet another) pagetable walker.  This one is intended for pinning a
597 * pagetable.  This means that it walks a pagetable and calls the
598 * callback function on each page it finds making up the page table,
599 * at every level.  It walks the entire pagetable, but it only bothers
600 * pinning pte pages which are below limit.  In the normal case this
601 * will be STACK_TOP_MAX, but at boot we need to pin up to
602 * FIXADDR_TOP.
603 *
604 * For 32-bit the important bit is that we don't pin beyond there,
605 * because then we start getting into Xen's ptes.
606 *
607 * For 64-bit, we must skip the Xen hole in the middle of the address
608 * space, just after the big x86-64 virtual hole.
609 */
610static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
611			  int (*func)(struct mm_struct *mm, struct page *,
612				      enum pt_level),
613			  unsigned long limit)
614{
615	int flush = 0;
616	unsigned hole_low, hole_high;
617	unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
618	unsigned pgdidx, pudidx, pmdidx;
619
620	/* The limit is the last byte to be touched */
621	limit--;
622	BUG_ON(limit >= FIXADDR_TOP);
623
624	if (xen_feature(XENFEAT_auto_translated_physmap))
625		return 0;
626
627	/*
628	 * 64-bit has a great big hole in the middle of the address
629	 * space, which contains the Xen mappings.  On 32-bit these
 * will end up making a zero-sized hole, so this is a no-op.
631	 */
632	hole_low = pgd_index(USER_LIMIT);
633	hole_high = pgd_index(PAGE_OFFSET);
634
635	pgdidx_limit = pgd_index(limit);
636#if PTRS_PER_PUD > 1
637	pudidx_limit = pud_index(limit);
638#else
639	pudidx_limit = 0;
640#endif
641#if PTRS_PER_PMD > 1
642	pmdidx_limit = pmd_index(limit);
643#else
644	pmdidx_limit = 0;
645#endif
646
647	for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
648		pud_t *pud;
649
650		if (pgdidx >= hole_low && pgdidx < hole_high)
651			continue;
652
653		if (!pgd_val(pgd[pgdidx]))
654			continue;
655
656		pud = pud_offset(&pgd[pgdidx], 0);
657
658		if (PTRS_PER_PUD > 1) /* not folded */
659			flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
660
661		for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
662			pmd_t *pmd;
663
664			if (pgdidx == pgdidx_limit &&
665			    pudidx > pudidx_limit)
666				goto out;
667
668			if (pud_none(pud[pudidx]))
669				continue;
670
671			pmd = pmd_offset(&pud[pudidx], 0);
672
673			if (PTRS_PER_PMD > 1) /* not folded */
674				flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
675
676			for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
677				struct page *pte;
678
679				if (pgdidx == pgdidx_limit &&
680				    pudidx == pudidx_limit &&
681				    pmdidx > pmdidx_limit)
682					goto out;
683
684				if (pmd_none(pmd[pmdidx]))
685					continue;
686
687				pte = pmd_page(pmd[pmdidx]);
688				flush |= (*func)(mm, pte, PT_PTE);
689			}
690		}
691	}
692
693out:
694	/* Do the top level last, so that the callbacks can use it as
695	   a cue to do final things like tlb flushes. */
696	flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
697
698	return flush;
699}
700
701static int xen_pgd_walk(struct mm_struct *mm,
702			int (*func)(struct mm_struct *mm, struct page *,
703				    enum pt_level),
704			unsigned long limit)
705{
706	return __xen_pgd_walk(mm, mm->pgd, func, limit);
707}
708
709/* If we're using split pte locks, then take the page's lock and
710   return a pointer to it.  Otherwise return NULL. */
711static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
712{
713	spinlock_t *ptl = NULL;
714
715#if USE_SPLIT_PTE_PTLOCKS
716	ptl = ptlock_ptr(page);
717	spin_lock_nest_lock(ptl, &mm->page_table_lock);
718#endif
719
720	return ptl;
721}
722
723static void xen_pte_unlock(void *v)
724{
725	spinlock_t *ptl = v;
726	spin_unlock(ptl);
727}
728
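/* Queue a pin/unpin mmuext op for @pfn; @level is the MMUEXT_* command. */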
729static void xen_do_pin(unsigned level, unsigned long pfn)
730{
731	struct mmuext_op op;
732
733	op.cmd = level;
734	op.arg1.mfn = pfn_to_mfn(pfn);
735
736	xen_extend_mmuext_op(&op);
737}
738
739static int xen_pin_page(struct mm_struct *mm, struct page *page,
740			enum pt_level level)
741{
742	unsigned pgfl = TestSetPagePinned(page);
743	int flush;
744
745	if (pgfl)
746		flush = 0;		/* already pinned */
747	else if (PageHighMem(page))
748		/* kmaps need flushing if we found an unpinned
749		   highpage */
750		flush = 1;
751	else {
752		void *pt = lowmem_page_address(page);
753		unsigned long pfn = page_to_pfn(page);
754		struct multicall_space mcs = __xen_mc_entry(0);
755		spinlock_t *ptl;
756
757		flush = 0;
758
759		/*
760		 * We need to hold the pagetable lock between the time
761		 * we make the pagetable RO and when we actually pin
762		 * it.  If we don't, then other users may come in and
763		 * attempt to update the pagetable by writing it,
764		 * which will fail because the memory is RO but not
765		 * pinned, so Xen won't do the trap'n'emulate.
766		 *
767		 * If we're using split pte locks, we can't hold the
768		 * entire pagetable's worth of locks during the
769		 * traverse, because we may wrap the preempt count (8
770		 * bits).  The solution is to mark RO and pin each PTE
771		 * page while holding the lock.  This means the number
772		 * of locks we end up holding is never more than a
773		 * batch size (~32 entries, at present).
774		 *
775		 * If we're not using split pte locks, we needn't pin
776		 * the PTE pages independently, because we're
777		 * protected by the overall pagetable lock.
778		 */
779		ptl = NULL;
780		if (level == PT_PTE)
781			ptl = xen_pte_lock(page, mm);
782
783		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
784					pfn_pte(pfn, PAGE_KERNEL_RO),
785					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
786
787		if (ptl) {
788			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
789
790			/* Queue a deferred unlock for when this batch
791			   is completed. */
792			xen_mc_callback(xen_pte_unlock, ptl);
793		}
794	}
795
796	return flush;
797}
798
799/* This is called just after a mm has been created, but it has not
800   been used yet.  We need to make sure that its pagetable is all
801   read-only, and can be pinned. */
802static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
803{
804	trace_xen_mmu_pgd_pin(mm, pgd);
805
806	xen_mc_batch();
807
808	if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
809		/* re-enable interrupts for flushing */
810		xen_mc_issue(0);
811
812		kmap_flush_unused();
813
814		xen_mc_batch();
815	}
816
817#ifdef CONFIG_X86_64
818	{
819		pgd_t *user_pgd = xen_get_user_pgd(pgd);
820
821		xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
822
823		if (user_pgd) {
824			xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
825			xen_do_pin(MMUEXT_PIN_L4_TABLE,
826				   PFN_DOWN(__pa(user_pgd)));
827		}
828	}
829#else /* CONFIG_X86_32 */
830#ifdef CONFIG_X86_PAE
831	/* Need to make sure unshared kernel PMD is pinnable */
832	xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
833		     PT_PMD);
834#endif
835	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
836#endif /* CONFIG_X86_64 */
837	xen_mc_issue(0);
838}
839
840static void xen_pgd_pin(struct mm_struct *mm)
841{
842	__xen_pgd_pin(mm, mm->pgd);
843}
844
845/*
846 * On save, we need to pin all pagetables to make sure they get their
847 * mfns turned into pfns.  Search the list for any unpinned pgds and pin
848 * them (unpinned pgds are not currently in use, probably because the
849 * process is under construction or destruction).
850 *
851 * Expected to be called in stop_machine() ("equivalent to taking
852 * every spinlock in the system"), so the locking doesn't really
853 * matter all that much.
854 */
855void xen_mm_pin_all(void)
856{
857	struct page *page;
858
859	spin_lock(&pgd_lock);
860
861	list_for_each_entry(page, &pgd_list, lru) {
862		if (!PagePinned(page)) {
863			__xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
864			SetPageSavePinned(page);
865		}
866	}
867
868	spin_unlock(&pgd_lock);
869}
870
871/*
 * The init_mm pagetable is really pinned as soon as it's created, but
873 * that's before we have page structures to store the bits.  So do all
874 * the book-keeping now.
875 */
876static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
877				  enum pt_level level)
878{
879	SetPagePinned(page);
880	return 0;
881}
882
883static void __init xen_mark_init_mm_pinned(void)
884{
885	xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
886}
887
888static int xen_unpin_page(struct mm_struct *mm, struct page *page,
889			  enum pt_level level)
890{
891	unsigned pgfl = TestClearPagePinned(page);
892
893	if (pgfl && !PageHighMem(page)) {
894		void *pt = lowmem_page_address(page);
895		unsigned long pfn = page_to_pfn(page);
896		spinlock_t *ptl = NULL;
897		struct multicall_space mcs;
898
899		/*
900		 * Do the converse to pin_page.  If we're using split
		 * pte locks, we must be holding the lock while
902		 * the pte page is unpinned but still RO to prevent
903		 * concurrent updates from seeing it in this
904		 * partially-pinned state.
905		 */
906		if (level == PT_PTE) {
907			ptl = xen_pte_lock(page, mm);
908
909			if (ptl)
910				xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
911		}
912
913		mcs = __xen_mc_entry(0);
914
915		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
916					pfn_pte(pfn, PAGE_KERNEL),
917					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
918
919		if (ptl) {
920			/* unlock when batch completed */
921			xen_mc_callback(xen_pte_unlock, ptl);
922		}
923	}
924
925	return 0;		/* never need to flush on unpin */
926}
927
/* Release a pagetable's pages back as normal RW */
929static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
930{
931	trace_xen_mmu_pgd_unpin(mm, pgd);
932
933	xen_mc_batch();
934
935	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
936
937#ifdef CONFIG_X86_64
938	{
939		pgd_t *user_pgd = xen_get_user_pgd(pgd);
940
941		if (user_pgd) {
942			xen_do_pin(MMUEXT_UNPIN_TABLE,
943				   PFN_DOWN(__pa(user_pgd)));
944			xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
945		}
946	}
947#endif
948
949#ifdef CONFIG_X86_PAE
950	/* Need to make sure unshared kernel PMD is unpinned */
951	xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
952		       PT_PMD);
953#endif
954
955	__xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
956
957	xen_mc_issue(0);
958}
959
960static void xen_pgd_unpin(struct mm_struct *mm)
961{
962	__xen_pgd_unpin(mm, mm->pgd);
963}
964
965/*
966 * On resume, undo any pinning done at save, so that the rest of the
967 * kernel doesn't see any unexpected pinned pagetables.
968 */
969void xen_mm_unpin_all(void)
970{
971	struct page *page;
972
973	spin_lock(&pgd_lock);
974
975	list_for_each_entry(page, &pgd_list, lru) {
976		if (PageSavePinned(page)) {
977			BUG_ON(!PagePinned(page));
978			__xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
979			ClearPageSavePinned(page);
980		}
981	}
982
983	spin_unlock(&pgd_lock);
984}
985
986static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
987{
988	spin_lock(&next->page_table_lock);
989	xen_pgd_pin(next);
990	spin_unlock(&next->page_table_lock);
991}
992
993static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
994{
995	spin_lock(&mm->page_table_lock);
996	xen_pgd_pin(mm);
997	spin_unlock(&mm->page_table_lock);
998}
999
1000
1001#ifdef CONFIG_SMP
/* Another cpu may still have its %cr3 pointing at the pagetable, so
1003   we need to repoint it somewhere else before we can unpin it. */
1004static void drop_other_mm_ref(void *info)
1005{
1006	struct mm_struct *mm = info;
1007	struct mm_struct *active_mm;
1008
1009	active_mm = this_cpu_read(cpu_tlbstate.active_mm);
1010
1011	if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
1012		leave_mm(smp_processor_id());
1013
1014	/* If this cpu still has a stale cr3 reference, then make sure
1015	   it has been flushed. */
1016	if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
1017		load_cr3(swapper_pg_dir);
1018}
1019
1020static void xen_drop_mm_ref(struct mm_struct *mm)
1021{
1022	cpumask_var_t mask;
1023	unsigned cpu;
1024
1025	if (current->active_mm == mm) {
1026		if (current->mm == mm)
1027			load_cr3(swapper_pg_dir);
1028		else
1029			leave_mm(smp_processor_id());
1030	}
1031
1032	/* Get the "official" set of cpus referring to our pagetable. */
1033	if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1034		for_each_online_cpu(cpu) {
1035			if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
1036			    && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1037				continue;
1038			smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1039		}
1040		return;
1041	}
1042	cpumask_copy(mask, mm_cpumask(mm));
1043
1044	/* It's possible that a vcpu may have a stale reference to our
	   cr3, because it's in lazy mode, and it hasn't flushed
	   its set of pending hypercalls yet.  In this case, we can
1047	   look at its actual current cr3 value, and force it to flush
1048	   if needed. */
1049	for_each_online_cpu(cpu) {
1050		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1051			cpumask_set_cpu(cpu, mask);
1052	}
1053
1054	if (!cpumask_empty(mask))
1055		smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1056	free_cpumask_var(mask);
1057}
1058#else
1059static void xen_drop_mm_ref(struct mm_struct *mm)
1060{
1061	if (current->active_mm == mm)
1062		load_cr3(swapper_pg_dir);
1063}
1064#endif
1065
1066/*
1067 * While a process runs, Xen pins its pagetables, which means that the
1068 * hypervisor forces it to be read-only, and it controls all updates
1069 * to it.  This means that all pagetable updates have to go via the
1070 * hypervisor, which is moderately expensive.
1071 *
1072 * Since we're pulling the pagetable down, we switch to use init_mm,
 * unpin the old process's pagetable and mark it all read-write, which
1074 * allows further operations on it to be simple memory accesses.
1075 *
 * The only subtle point is that another CPU may still be using the
 * pagetable because of lazy tlb flushing.  This means we need to
1078 * switch all CPUs off this pagetable before we can unpin it.
1079 */
1080static void xen_exit_mmap(struct mm_struct *mm)
1081{
1082	get_cpu();		/* make sure we don't move around */
1083	xen_drop_mm_ref(mm);
1084	put_cpu();
1085
1086	spin_lock(&mm->page_table_lock);
1087
1088	/* pgd may not be pinned in the error exit path of execve */
1089	if (xen_page_pinned(mm->pgd))
1090		xen_pgd_unpin(mm);
1091
1092	spin_unlock(&mm->page_table_lock);
1093}
1094
1095static void xen_post_allocator_init(void);
1096
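/* Issue a single, synchronous pin/unpin mmuext op for the page at @pfn. */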
1097static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1098{
1099	struct mmuext_op op;
1100
1101	op.cmd = cmd;
1102	op.arg1.mfn = pfn_to_mfn(pfn);
1103	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1104		BUG();
1105}
1106
1107#ifdef CONFIG_X86_64
1108static void __init xen_cleanhighmap(unsigned long vaddr,
1109				    unsigned long vaddr_end)
1110{
1111	unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
1112	pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr);
1113
1114	/* NOTE: The loop is more greedy than the cleanup_highmap variant.
1115	 * We include the PMD passed in on _both_ boundaries. */
1116	for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PAGE_SIZE));
1117			pmd++, vaddr += PMD_SIZE) {
1118		if (pmd_none(*pmd))
1119			continue;
1120		if (vaddr < (unsigned long) _text || vaddr > kernel_end)
1121			set_pmd(pmd, __pmd(0));
1122	}
	/* In case we did something silly, we should crash in this function
	 * instead of somewhere later, where it would be confusing. */
1125	xen_mc_flush();
1126}
1127
1128/*
1129 * Make a page range writeable and free it.
1130 */
1131static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
1132{
1133	void *vaddr = __va(paddr);
1134	void *vaddr_end = vaddr + size;
1135
1136	for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
1137		make_lowmem_page_readwrite(vaddr);
1138
1139	memblock_free(paddr, size);
1140}
1141
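/*
 * Unpin (if requested) and free a single pagetable page that was used to
 * map the initial p2m list.
 */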
1142static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
1143{
1144	unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK;
1145
1146	if (unpin)
1147		pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa));
1148	ClearPagePinned(virt_to_page(__va(pa)));
1149	xen_free_ro_pages(pa, PAGE_SIZE);
1150}
1151
1152/*
 * The initial P->M table is well isolated, so we can (and, since it may be
 * large, we should) also free the page tables mapping it.
1155 */
1156static void __init xen_cleanmfnmap(unsigned long vaddr)
1157{
1158	unsigned long va = vaddr & PMD_MASK;
1159	unsigned long pa;
1160	pgd_t *pgd = pgd_offset_k(va);
1161	pud_t *pud_page = pud_offset(pgd, 0);
1162	pud_t *pud;
1163	pmd_t *pmd;
1164	pte_t *pte;
1165	unsigned int i;
1166	bool unpin;
1167
1168	unpin = (vaddr == 2 * PGDIR_SIZE);
1169	set_pgd(pgd, __pgd(0));
1170	do {
1171		pud = pud_page + pud_index(va);
1172		if (pud_none(*pud)) {
1173			va += PUD_SIZE;
1174		} else if (pud_large(*pud)) {
1175			pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
1176			xen_free_ro_pages(pa, PUD_SIZE);
1177			va += PUD_SIZE;
1178		} else {
1179			pmd = pmd_offset(pud, va);
1180			if (pmd_large(*pmd)) {
1181				pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
1182				xen_free_ro_pages(pa, PMD_SIZE);
1183			} else if (!pmd_none(*pmd)) {
1184				pte = pte_offset_kernel(pmd, va);
1185				set_pmd(pmd, __pmd(0));
1186				for (i = 0; i < PTRS_PER_PTE; ++i) {
1187					if (pte_none(pte[i]))
1188						break;
1189					pa = pte_pfn(pte[i]) << PAGE_SHIFT;
1190					xen_free_ro_pages(pa, PAGE_SIZE);
1191				}
1192				xen_cleanmfnmap_free_pgtbl(pte, unpin);
1193			}
1194			va += PMD_SIZE;
1195			if (pmd_index(va))
1196				continue;
1197			set_pud(pud, __pud(0));
1198			xen_cleanmfnmap_free_pgtbl(pmd, unpin);
1199		}
1200
1201	} while (pud_index(va) || pmd_index(va));
1202	xen_cleanmfnmap_free_pgtbl(pud_page, unpin);
1203}
1204
1205static void __init xen_pagetable_p2m_free(void)
1206{
1207	unsigned long size;
1208	unsigned long addr;
1209
1210	size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
1211
1212	/* No memory or already called. */
1213	if ((unsigned long)xen_p2m_addr == xen_start_info->mfn_list)
1214		return;
1215
1216	/* using __ka address and sticking INVALID_P2M_ENTRY! */
1217	memset((void *)xen_start_info->mfn_list, 0xff, size);
1218
1219	addr = xen_start_info->mfn_list;
1220	/*
1221	 * We could be in __ka space.
	 * We round up to the PMD, which means that if anybody at this stage is
	 * using the __ka address of xen_start_info or
	 * xen_start_info->shared_info they are going to crash. Fortunately
1225	 * we have already revectored in xen_setup_kernel_pagetable and in
1226	 * xen_setup_shared_info.
1227	 */
1228	size = roundup(size, PMD_SIZE);
1229
1230	if (addr >= __START_KERNEL_map) {
1231		xen_cleanhighmap(addr, addr + size);
1232		size = PAGE_ALIGN(xen_start_info->nr_pages *
1233				  sizeof(unsigned long));
1234		memblock_free(__pa(addr), size);
1235	} else {
1236		xen_cleanmfnmap(addr);
1237	}
1238}
1239
1240static void __init xen_pagetable_cleanhighmap(void)
1241{
1242	unsigned long size;
1243	unsigned long addr;
1244
1245	/* At this stage, cleanup_highmap has already cleaned __ka space
1246	 * from _brk_limit way up to the max_pfn_mapped (which is the end of
1247	 * the ramdisk). We continue on, erasing PMD entries that point to page
1248	 * tables - do note that they are accessible at this stage via __va.
1249	 * For good measure we also round up to the PMD - which means that if
	 * anybody is using a __ka address for the initial boot-stack and tries
	 * to use it, they are going to crash. The xen_start_info has been
1252	 * taken care of already in xen_setup_kernel_pagetable. */
1253	addr = xen_start_info->pt_base;
1254	size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE);
1255
1256	xen_cleanhighmap(addr, addr + size);
1257	xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base));
1258#ifdef DEBUG
	/* This is superfluous and not strictly necessary, but let's do it
	 * anyway. The MODULES_VADDR -> MODULES_END range should be clear of
1261	 * anything at this stage. */
1262	xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1);
1263#endif
1264}
1265#endif
1266
1267static void __init xen_pagetable_p2m_setup(void)
1268{
1269	if (xen_feature(XENFEAT_auto_translated_physmap))
1270		return;
1271
1272	xen_vmalloc_p2m_tree();
1273
1274#ifdef CONFIG_X86_64
1275	xen_pagetable_p2m_free();
1276
1277	xen_pagetable_cleanhighmap();
1278#endif
1279	/* And revector! Bye bye old array */
1280	xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
1281}
1282
1283static void __init xen_pagetable_init(void)
1284{
1285	paging_init();
1286	xen_post_allocator_init();
1287
1288	xen_pagetable_p2m_setup();
1289
1290	/* Allocate and initialize top and mid mfn levels for p2m structure */
1291	xen_build_mfn_list_list();
1292
1293	/* Remap memory freed due to conflicts with E820 map */
1294	if (!xen_feature(XENFEAT_auto_translated_physmap))
1295		xen_remap_memory();
1296
1297	xen_setup_shared_info();
1298}
1299static void xen_write_cr2(unsigned long cr2)
1300{
1301	this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
1302}
1303
1304static unsigned long xen_read_cr2(void)
1305{
1306	return this_cpu_read(xen_vcpu)->arch.cr2;
1307}
1308
1309unsigned long xen_read_cr2_direct(void)
1310{
1311	return this_cpu_read(xen_vcpu_info.arch.cr2);
1312}
1313
1314void xen_flush_tlb_all(void)
1315{
1316	struct mmuext_op *op;
1317	struct multicall_space mcs;
1318
1319	trace_xen_mmu_flush_tlb_all(0);
1320
1321	preempt_disable();
1322
1323	mcs = xen_mc_entry(sizeof(*op));
1324
1325	op = mcs.args;
1326	op->cmd = MMUEXT_TLB_FLUSH_ALL;
1327	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1328
1329	xen_mc_issue(PARAVIRT_LAZY_MMU);
1330
1331	preempt_enable();
1332}
1333static void xen_flush_tlb(void)
1334{
1335	struct mmuext_op *op;
1336	struct multicall_space mcs;
1337
1338	trace_xen_mmu_flush_tlb(0);
1339
1340	preempt_disable();
1341
1342	mcs = xen_mc_entry(sizeof(*op));
1343
1344	op = mcs.args;
1345	op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1346	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1347
1348	xen_mc_issue(PARAVIRT_LAZY_MMU);
1349
1350	preempt_enable();
1351}
1352
1353static void xen_flush_tlb_single(unsigned long addr)
1354{
1355	struct mmuext_op *op;
1356	struct multicall_space mcs;
1357
1358	trace_xen_mmu_flush_tlb_single(addr);
1359
1360	preempt_disable();
1361
1362	mcs = xen_mc_entry(sizeof(*op));
1363	op = mcs.args;
1364	op->cmd = MMUEXT_INVLPG_LOCAL;
1365	op->arg1.linear_addr = addr & PAGE_MASK;
1366	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1367
1368	xen_mc_issue(PARAVIRT_LAZY_MMU);
1369
1370	preempt_enable();
1371}
1372
1373static void xen_flush_tlb_others(const struct cpumask *cpus,
1374				 struct mm_struct *mm, unsigned long start,
1375				 unsigned long end)
1376{
1377	struct {
1378		struct mmuext_op op;
1379#ifdef CONFIG_SMP
1380		DECLARE_BITMAP(mask, num_processors);
1381#else
1382		DECLARE_BITMAP(mask, NR_CPUS);
1383#endif
1384	} *args;
1385	struct multicall_space mcs;
1386
1387	trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
1388
1389	if (cpumask_empty(cpus))
1390		return;		/* nothing to do */
1391
1392	mcs = xen_mc_entry(sizeof(*args));
1393	args = mcs.args;
1394	args->op.arg2.vcpumask = to_cpumask(args->mask);
1395
1396	/* Remove us, and any offline CPUS. */
1397	cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1398	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1399
1400	args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1401	if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
1402		args->op.cmd = MMUEXT_INVLPG_MULTI;
1403		args->op.arg1.linear_addr = start;
1404	}
1405
1406	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1407
1408	xen_mc_issue(PARAVIRT_LAZY_MMU);
1409}
1410
1411static unsigned long xen_read_cr3(void)
1412{
1413	return this_cpu_read(xen_cr3);
1414}
1415
1416static void set_current_cr3(void *v)
1417{
1418	this_cpu_write(xen_current_cr3, (unsigned long)v);
1419}
1420
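/*
 * Queue a new base pointer (kernel or user, depending on @kernel) mmuext
 * op for @cr3 as part of the current multicall batch.
 */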
1421static void __xen_write_cr3(bool kernel, unsigned long cr3)
1422{
1423	struct mmuext_op op;
1424	unsigned long mfn;
1425
1426	trace_xen_mmu_write_cr3(kernel, cr3);
1427
1428	if (cr3)
1429		mfn = pfn_to_mfn(PFN_DOWN(cr3));
1430	else
1431		mfn = 0;
1432
1433	WARN_ON(mfn == 0 && kernel);
1434
1435	op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1436	op.arg1.mfn = mfn;
1437
1438	xen_extend_mmuext_op(&op);
1439
1440	if (kernel) {
1441		this_cpu_write(xen_cr3, cr3);
1442
1443		/* Update xen_current_cr3 once the batch has actually
1444		   been submitted. */
1445		xen_mc_callback(set_current_cr3, (void *)cr3);
1446	}
1447}
1448static void xen_write_cr3(unsigned long cr3)
1449{
1450	BUG_ON(preemptible());
1451
1452	xen_mc_batch();  /* disables interrupts */
1453
	/* Update while interrupts are disabled, so it's atomic with
1455	   respect to ipis */
1456	this_cpu_write(xen_cr3, cr3);
1457
1458	__xen_write_cr3(true, cr3);
1459
1460#ifdef CONFIG_X86_64
1461	{
1462		pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1463		if (user_pgd)
1464			__xen_write_cr3(false, __pa(user_pgd));
1465		else
1466			__xen_write_cr3(false, 0);
1467	}
1468#endif
1469
1470	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
1471}
1472
1473#ifdef CONFIG_X86_64
1474/*
1475 * At the start of the day - when Xen launches a guest, it has already
1476 * built pagetables for the guest. We diligently look over them
 * in xen_setup_kernel_pagetable and graft them as appropriate into the
1478 * init_level4_pgt and its friends. Then when we are happy we load
1479 * the new init_level4_pgt - and continue on.
1480 *
1481 * The generic code starts (start_kernel) and 'init_mem_mapping' sets
1482 * up the rest of the pagetables. When it has completed it loads the cr3.
1483 * N.B. that baremetal would start at 'start_kernel' (and the early
1484 * #PF handler would create bootstrap pagetables) - so we are running
 * with the same assumptions about what write_cr3 has to do when it is
 * executed at this point.
1487 *
1488 * Since there are no user-page tables at all, we have two variants
1489 * of xen_write_cr3 - the early bootup (this one), and the late one
1490 * (xen_write_cr3). The reason we have to do that is that in 64-bit
1491 * the Linux kernel and user-space are both in ring 3 while the
1492 * hypervisor is in ring 0.
1493 */
1494static void __init xen_write_cr3_init(unsigned long cr3)
1495{
1496	BUG_ON(preemptible());
1497
1498	xen_mc_batch();  /* disables interrupts */
1499
	/* Update while interrupts are disabled, so it's atomic with
1501	   respect to ipis */
1502	this_cpu_write(xen_cr3, cr3);
1503
1504	__xen_write_cr3(true, cr3);
1505
1506	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
1507}
1508#endif
1509
1510static int xen_pgd_alloc(struct mm_struct *mm)
1511{
1512	pgd_t *pgd = mm->pgd;
1513	int ret = 0;
1514
1515	BUG_ON(PagePinned(virt_to_page(pgd)));
1516
1517#ifdef CONFIG_X86_64
1518	{
1519		struct page *page = virt_to_page(pgd);
1520		pgd_t *user_pgd;
1521
1522		BUG_ON(page->private != 0);
1523
1524		ret = -ENOMEM;
1525
1526		user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1527		page->private = (unsigned long)user_pgd;
1528
1529		if (user_pgd != NULL) {
1530#ifdef CONFIG_X86_VSYSCALL_EMULATION
1531			user_pgd[pgd_index(VSYSCALL_ADDR)] =
1532				__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1533#endif
1534			ret = 0;
1535		}
1536
1537		BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1538	}
1539#endif
1540
1541	return ret;
1542}
1543
1544static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1545{
1546#ifdef CONFIG_X86_64
1547	pgd_t *user_pgd = xen_get_user_pgd(pgd);
1548
1549	if (user_pgd)
1550		free_page((unsigned long)user_pgd);
1551#endif
1552}
1553
1554#ifdef CONFIG_X86_32
1555static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1556{
1557	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
1558	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1559		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1560			       pte_val_ma(pte));
1561
1562	return pte;
1563}
1564#else /* CONFIG_X86_64 */
1565static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1566{
1567	unsigned long pfn;
1568
1569	if (xen_feature(XENFEAT_writable_page_tables) ||
1570	    xen_feature(XENFEAT_auto_translated_physmap) ||
1571	    xen_start_info->mfn_list >= __START_KERNEL_map)
1572		return pte;
1573
1574	/*
1575	 * Pages belonging to the initial p2m list mapped outside the default
1576	 * address range must be mapped read-only. This region contains the
1577	 * page tables for mapping the p2m list, too, and page tables MUST be
1578	 * mapped read-only.
1579	 */
1580	pfn = pte_pfn(pte);
1581	if (pfn >= xen_start_info->first_p2m_pfn &&
1582	    pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
1583		pte = __pte_ma(pte_val_ma(pte) & ~_PAGE_RW);
1584
1585	return pte;
1586}
1587#endif /* CONFIG_X86_64 */
1588
1589/*
1590 * Init-time set_pte while constructing initial pagetables, which
1591 * doesn't allow RO page table pages to be remapped RW.
1592 *
1593 * If there is no MFN for this PFN then this page is initially
1594 * ballooned out so clear the PTE (as in decrease_reservation() in
1595 * drivers/xen/balloon.c).
1596 *
1597 * Many of these PTE updates are done on unpinned and writable pages
1598 * and doing a hypercall for these is unnecessary and expensive.  At
1599 * this point it is not possible to tell if a page is pinned or not,
1600 * so always write the PTE directly and rely on Xen trapping and
1601 * emulating any updates as necessary.
1602 */
1603static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
1604{
1605	if (pte_mfn(pte) != INVALID_P2M_ENTRY)
1606		pte = mask_rw_pte(ptep, pte);
1607	else
1608		pte = __pte_ma(0);
1609
1610	native_set_pte(ptep, pte);
1611}
1612
1613/* Early in boot, while setting up the initial pagetable, assume
1614   everything is pinned. */
1615static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1616{
1617#ifdef CONFIG_FLATMEM
1618	BUG_ON(mem_map);	/* should only be used early */
1619#endif
1620	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1621	pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1622}
1623
1624/* Used for pmd and pud */
1625static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1626{
1627#ifdef CONFIG_FLATMEM
1628	BUG_ON(mem_map);	/* should only be used early */
1629#endif
1630	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1631}
1632
1633/* Early release_pte assumes that all pts are pinned, since there's
1634   only init_mm and anything attached to that is pinned. */
1635static void __init xen_release_pte_init(unsigned long pfn)
1636{
1637	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1638	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1639}
1640
1641static void __init xen_release_pmd_init(unsigned long pfn)
1642{
1643	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1644}
1645
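/* Batched (multicall) version of pin_pagetable_pfn(). */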
1646static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1647{
1648	struct multicall_space mcs;
1649	struct mmuext_op *op;
1650
1651	mcs = __xen_mc_entry(sizeof(*op));
1652	op = mcs.args;
1653	op->cmd = cmd;
1654	op->arg1.mfn = pfn_to_mfn(pfn);
1655
1656	MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
1657}
1658
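/* Queue an update of @pfn's lowmem mapping to the given protection. */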
1659static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
1660{
1661	struct multicall_space mcs;
1662	unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT);
1663
1664	mcs = __xen_mc_entry(0);
1665	MULTI_update_va_mapping(mcs.mc, (unsigned long)addr,
1666				pfn_pte(pfn, prot), 0);
1667}
1668
/* This needs to make sure the new pte page is pinned iff it's being
1670   attached to a pinned pagetable. */
1671static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
1672				    unsigned level)
1673{
1674	bool pinned = PagePinned(virt_to_page(mm->pgd));
1675
1676	trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);
1677
1678	if (pinned) {
1679		struct page *page = pfn_to_page(pfn);
1680
1681		SetPagePinned(page);
1682
1683		if (!PageHighMem(page)) {
1684			xen_mc_batch();
1685
1686			__set_pfn_prot(pfn, PAGE_KERNEL_RO);
1687
1688			if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
1689				__pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1690
1691			xen_mc_issue(PARAVIRT_LAZY_MMU);
1692		} else {
1693			/* make sure there are no stray mappings of
1694			   this page */
1695			kmap_flush_unused();
1696		}
1697	}
1698}
1699
1700static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1701{
1702	xen_alloc_ptpage(mm, pfn, PT_PTE);
1703}
1704
1705static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1706{
1707	xen_alloc_ptpage(mm, pfn, PT_PMD);
1708}
1709
1710/* This should never happen until we're OK to use struct page */
1711static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
1712{
1713	struct page *page = pfn_to_page(pfn);
1714	bool pinned = PagePinned(page);
1715
1716	trace_xen_mmu_release_ptpage(pfn, level, pinned);
1717
1718	if (pinned) {
1719		if (!PageHighMem(page)) {
1720			xen_mc_batch();
1721
1722			if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
1723				__pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1724
1725			__set_pfn_prot(pfn, PAGE_KERNEL);
1726
1727			xen_mc_issue(PARAVIRT_LAZY_MMU);
1728		}
1729		ClearPagePinned(page);
1730	}
1731}
1732
1733static void xen_release_pte(unsigned long pfn)
1734{
1735	xen_release_ptpage(pfn, PT_PTE);
1736}
1737
1738static void xen_release_pmd(unsigned long pfn)
1739{
1740	xen_release_ptpage(pfn, PT_PMD);
1741}
1742
1743#if CONFIG_PGTABLE_LEVELS == 4
1744static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1745{
1746	xen_alloc_ptpage(mm, pfn, PT_PUD);
1747}
1748
1749static void xen_release_pud(unsigned long pfn)
1750{
1751	xen_release_ptpage(pfn, PT_PUD);
1752}
1753#endif
1754
1755void __init xen_reserve_top(void)
1756{
1757#ifdef CONFIG_X86_32
1758	unsigned long top = HYPERVISOR_VIRT_START;
1759	struct xen_platform_parameters pp;
1760
1761	if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1762		top = pp.virt_start;
1763
1764	reserve_top_address(-top);
1765#endif	/* CONFIG_X86_32 */
1766}
1767
1768/*
 * Like __va(), but returns the address in the kernel mapping (which is
 * all we have until the physical memory mapping has been set up).
1771 */
1772static void * __init __ka(phys_addr_t paddr)
1773{
1774#ifdef CONFIG_X86_64
1775	return (void *)(paddr + __START_KERNEL_map);
1776#else
1777	return __va(paddr);
1778#endif
1779}
1780
1781/* Convert a machine address to physical address */
1782static unsigned long __init m2p(phys_addr_t maddr)
1783{
1784	phys_addr_t paddr;
1785
1786	maddr &= PTE_PFN_MASK;
1787	paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1788
1789	return paddr;
1790}
1791
1792/* Convert a machine address to kernel virtual */
1793static void * __init m2v(phys_addr_t maddr)
1794{
1795	return __ka(m2p(maddr));
1796}
1797
/* Set the page permissions on identity-mapped pages */
1799static void __init set_page_prot_flags(void *addr, pgprot_t prot,
1800				       unsigned long flags)
1801{
1802	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1803	pte_t pte = pfn_pte(pfn, prot);
1804
	/* For PVH there is no need to set R/O or R/W to pin or unpin pages. */
1806	if (xen_feature(XENFEAT_auto_translated_physmap))
1807		return;
1808
1809	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
1810		BUG();
1811}
1812static void __init set_page_prot(void *addr, pgprot_t prot)
1813{
1814	return set_page_prot_flags(addr, prot, UVMF_NONE);
1815}
1816#ifdef CONFIG_X86_32
1817static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1818{
1819	unsigned pmdidx, pteidx;
1820	unsigned ident_pte;
1821	unsigned long pfn;
1822
1823	level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
1824				      PAGE_SIZE);
1825
1826	ident_pte = 0;
1827	pfn = 0;
1828	for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1829		pte_t *pte_page;
1830
1831		/* Reuse or allocate a page of ptes */
1832		if (pmd_present(pmd[pmdidx]))
1833			pte_page = m2v(pmd[pmdidx].pmd);
1834		else {
1835			/* Check for free pte pages */
1836			if (ident_pte == LEVEL1_IDENT_ENTRIES)
1837				break;
1838
1839			pte_page = &level1_ident_pgt[ident_pte];
1840			ident_pte += PTRS_PER_PTE;
1841
1842			pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1843		}
1844
1845		/* Install mappings */
1846		for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1847			pte_t pte;
1848
1849			if (pfn > max_pfn_mapped)
1850				max_pfn_mapped = pfn;
1851
1852			if (!pte_none(pte_page[pteidx]))
1853				continue;
1854
1855			pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1856			pte_page[pteidx] = pte;
1857		}
1858	}
1859
1860	for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1861		set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1862
1863	set_page_prot(pmd, PAGE_KERNEL_RO);
1864}
1865#endif
1866void __init xen_setup_machphys_mapping(void)
1867{
1868	struct xen_machphys_mapping mapping;
1869
1870	if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
1871		machine_to_phys_mapping = (unsigned long *)mapping.v_start;
1872		machine_to_phys_nr = mapping.max_mfn + 1;
1873	} else {
1874		machine_to_phys_nr = MACH2PHYS_NR_ENTRIES;
1875	}
1876#ifdef CONFIG_X86_32
1877	WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1))
1878		< machine_to_phys_mapping);
1879#endif
1880}
1881
1882#ifdef CONFIG_X86_64
1883static void __init convert_pfn_mfn(void *v)
1884{
1885	pte_t *pte = v;
1886	int i;
1887
1888	/* All levels are converted the same way, so just treat them
1889	   as ptes. */
1890	for (i = 0; i < PTRS_PER_PTE; i++)
1891		pte[i] = xen_make_pte(pte[i].pte);
1892}
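
/*
 * If the page at @addr is the first or last frame of the Xen-provided
 * pagetable area, make it writable, clear it and shrink the area so that
 * the page can be reused.
 */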
1893static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
1894				 unsigned long addr)
1895{
1896	if (*pt_base == PFN_DOWN(__pa(addr))) {
1897		set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
1898		clear_page((void *)addr);
1899		(*pt_base)++;
1900	}
1901	if (*pt_end == PFN_DOWN(__pa(addr))) {
1902		set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
1903		clear_page((void *)addr);
1904		(*pt_end)--;
1905	}
1906}
1907/*
1908 * Set up the initial kernel pagetable.
1909 *
1910 * We can construct this by grafting the Xen provided pagetable into
1911 * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
1912 * level2_ident_pgt, and level2_kernel_pgt.  This means that only the
1913 * kernel has a physical mapping to start with - but that's enough to
1914 * get __va working.  We need to fill in the rest of the physical
1915 * mapping once some sort of allocator has been set up.  NOTE: for
1916 * PVH, the page tables are native.
1917 */
1918void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1919{
1920	pud_t *l3;
1921	pmd_t *l2;
1922	unsigned long addr[3];
1923	unsigned long pt_base, pt_end;
1924	unsigned i;
1925
1926	/* max_pfn_mapped is the last pfn mapped in the initial memory
1927	 * mappings. Considering that on Xen after the kernel mappings we
1928	 * have the mappings of some pages that don't exist in pfn space, we
1929	 * set max_pfn_mapped to the last real pfn mapped. */
1930	if (xen_start_info->mfn_list < __START_KERNEL_map)
1931		max_pfn_mapped = xen_start_info->first_p2m_pfn;
1932	else
1933		max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
1934
1935	pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
1936	pt_end = pt_base + xen_start_info->nr_pt_frames;
1937
1938	/* Zap identity mapping */
1939	init_level4_pgt[0] = __pgd(0);
1940
1941	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1942		/* Pre-constructed entries are in pfn, so convert to mfn */
1943		/* L4[272] -> level3_ident_pgt
1944		 * L4[511] -> level3_kernel_pgt */
1945		convert_pfn_mfn(init_level4_pgt);
1946
1947		/* L3_i[0] -> level2_ident_pgt */
1948		convert_pfn_mfn(level3_ident_pgt);
1949		/* L3_k[510] -> level2_kernel_pgt
1950		 * L3_k[511] -> level2_fixmap_pgt */
1951		convert_pfn_mfn(level3_kernel_pgt);
1952
1953		/* L3_k[511][506] -> level1_fixmap_pgt */
1954		convert_pfn_mfn(level2_fixmap_pgt);
1955	}
1956	/* We get [511][511] and have Xen's version of level2_kernel_pgt */
1957	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1958	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1959
1960	addr[0] = (unsigned long)pgd;
1961	addr[1] = (unsigned long)l3;
1962	addr[2] = (unsigned long)l2;
	/* Graft it onto L4[272][0]. Note that we are creating an aliasing problem:
1964	 * Both L4[272][0] and L4[511][510] have entries that point to the same
1965	 * L2 (PMD) tables. Meaning that if you modify it in __va space
1966	 * it will be also modified in the __ka space! (But if you just
1967	 * modify the PMD table to point to other PTE's or none, then you
1968	 * are OK - which is what cleanup_highmap does) */
1969	copy_page(level2_ident_pgt, l2);
1970	/* Graft it onto L4[511][510] */
1971	copy_page(level2_kernel_pgt, l2);
1972
1973	/* Copy the initial P->M table mappings if necessary. */
1974	i = pgd_index(xen_start_info->mfn_list);
1975	if (i && i < pgd_index(__START_KERNEL_map))
1976		init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];
1977
1978	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
1979		/* Make pagetable pieces RO */
1980		set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1981		set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1982		set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1983		set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1984		set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
1985		set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1986		set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1987		set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO);
1988
1989		/* Pin down new L4 */
1990		pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1991				  PFN_DOWN(__pa_symbol(init_level4_pgt)));
1992
1993		/* Unpin Xen-provided one */
1994		pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1995
1996		/*
1997		 * At this stage there can be no user pgd, and no page
1998		 * structure to attach it to, so make sure we just set kernel
1999		 * pgd.
2000		 */
2001		xen_mc_batch();
2002		__xen_write_cr3(true, __pa(init_level4_pgt));
2003		xen_mc_issue(PARAVIRT_LAZY_CPU);
2004	} else
2005		native_write_cr3(__pa(init_level4_pgt));
2006
2007	/* We can't easily rip out the L3 and L2 pages, as for the initial
2008	 * domain the Xen pagetables are laid out as [L4], [L1], [L2], [L3],
2009	 * [L1], [L1] ...  For guests started by the toolstack they come in
2010	 * [L4], [L3], [L2], [L1], [L1] order instead.  So for dom0 we can only
2011	 * rip out the [L4] (pgd), but for other guests we shave off three
2012	 * pages. */
2013	for (i = 0; i < ARRAY_SIZE(addr); i++)
2014		check_pt_base(&pt_base, &pt_end, addr[i]);
2015
2016	/* The Xen pagetable range we still use, now up to three pages smaller */
2017	xen_pt_base = PFN_PHYS(pt_base);
2018	xen_pt_size = (pt_end - pt_base) * PAGE_SIZE;
2019	memblock_reserve(xen_pt_base, xen_pt_size);
2020
2021	/* Revector the xen_start_info */
2022	xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
2023}
2024
2025/*
2026 * Read a value from a physical address.
2027 */
2028static unsigned long __init xen_read_phys_ulong(phys_addr_t addr)
2029{
2030	unsigned long *vaddr;
2031	unsigned long val;
2032
2033	vaddr = early_memremap_ro(addr, sizeof(val));
2034	val = *vaddr;
2035	early_memunmap(vaddr, sizeof(val));
2036	return val;
2037}
2038
2039/*
2040 * Translate a virtual address to a physical one without relying on mapped
2041 * page tables.
2042 */
2043static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
2044{
2045	phys_addr_t pa;
2046	pgd_t pgd;
2047	pud_t pud;
2048	pmd_t pmd;
2049	pte_t pte;
2050
2051	pa = read_cr3();
2052	pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
2053						       sizeof(pgd)));
2054	if (!pgd_present(pgd))
2055		return 0;
2056
2057	pa = pgd_val(pgd) & PTE_PFN_MASK;
2058	pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) *
2059						       sizeof(pud)));
2060	if (!pud_present(pud))
2061		return 0;
2062	pa = pud_pfn(pud) << PAGE_SHIFT;
2063	if (pud_large(pud))
2064		return pa + (vaddr & ~PUD_MASK);
2065
2066	pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) *
2067						       sizeof(pmd)));
2068	if (!pmd_present(pmd))
2069		return 0;
2070	pa = pmd_pfn(pmd) << PAGE_SHIFT;
2071	if (pmd_large(pmd))
2072		return pa + (vaddr & ~PMD_MASK);
2073
2074	pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) *
2075						       sizeof(pte)));
2076	if (!pte_present(pte))
2077		return 0;
2078	pa = pte_pfn(pte) << PAGE_SHIFT;
2079
2080	return pa | (vaddr & ~PAGE_MASK);
2081}
2082
2083/*
2084 * Find a new area for the hypervisor supplied p2m list and relocate the p2m to
2085 * this area.
2086 */
2087void __init xen_relocate_p2m(void)
2088{
2089	phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys;
2090	unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
2091	int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud;
2092	pte_t *pt;
2093	pmd_t *pmd;
2094	pud_t *pud;
2095	pgd_t *pgd;
2096	unsigned long *new_p2m;
2097
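	/*
	 * Work out how many frames the relocated p2m list needs: the list
	 * itself plus the page tables to map it.  For example, a domain
	 * with nr_pages = 0x100000 (4 GiB) has an 8 MiB p2m list, giving
	 * n_pte = 2048, n_pt = 4, n_pmd = 1, n_pud = 1, i.e. n_frames = 2054.
	 */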
2098	size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
2099	n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT;
2100	n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
2101	n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
2102	n_pud = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
2103	n_frames = n_pte + n_pt + n_pmd + n_pud;
2104
2105	new_area = xen_find_free_area(PFN_PHYS(n_frames));
2106	if (!new_area) {
2107		xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n");
2108		BUG();
2109	}
2110
2111	/*
2112	 * Set up the page tables for addressing the new p2m list.
2113	 * We have asked the hypervisor to map the p2m list at the user address
2114	 * PUD_SIZE. It may have done so, or it may have used a kernel space
2115	 * address depending on the Xen version.
2116	 * To avoid any possible virtual address collision, just use
2117	 * 2 * PGDIR_SIZE for the new area (see new_p2m below).
2118	 */
2119	pud_phys = new_area;
2120	pmd_phys = pud_phys + PFN_PHYS(n_pud);
2121	pt_phys = pmd_phys + PFN_PHYS(n_pmd);
2122	p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
2123
2124	pgd = __va(read_cr3());
2125	new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
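	/*
	 * Build the PUD/PMD/PTE hierarchy mapping the new p2m list at
	 * 2 * PGDIR_SIZE.  Each page table frame is cleared, filled, made
	 * read-only and pinned before being hooked into its parent, as
	 * required for PV pagetables.
	 */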
2126	for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
2127		pud = early_memremap(pud_phys, PAGE_SIZE);
2128		clear_page(pud);
2129		for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
2130		     idx_pmd++) {
2131			pmd = early_memremap(pmd_phys, PAGE_SIZE);
2132			clear_page(pmd);
2133			for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
2134			     idx_pt++) {
2135				pt = early_memremap(pt_phys, PAGE_SIZE);
2136				clear_page(pt);
2137				for (idx_pte = 0;
2138				     idx_pte < min(n_pte, PTRS_PER_PTE);
2139				     idx_pte++) {
2140					set_pte(pt + idx_pte,
2141						pfn_pte(p2m_pfn, PAGE_KERNEL));
2142					p2m_pfn++;
2143				}
2144				n_pte -= PTRS_PER_PTE;
2145				early_memunmap(pt, PAGE_SIZE);
2146				make_lowmem_page_readonly(__va(pt_phys));
2147				pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
2148						  PFN_DOWN(pt_phys));
2149				set_pmd(pmd + idx_pt,
2150					__pmd(_PAGE_TABLE | pt_phys));
2151				pt_phys += PAGE_SIZE;
2152			}
2153			n_pt -= PTRS_PER_PMD;
2154			early_memunmap(pmd, PAGE_SIZE);
2155			make_lowmem_page_readonly(__va(pmd_phys));
2156			pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
2157					  PFN_DOWN(pmd_phys));
2158			set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
2159			pmd_phys += PAGE_SIZE;
2160		}
2161		n_pmd -= PTRS_PER_PUD;
2162		early_memunmap(pud, PAGE_SIZE);
2163		make_lowmem_page_readonly(__va(pud_phys));
2164		pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
2165		set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
2166		pud_phys += PAGE_SIZE;
2167	}
2168
2169	/* Now copy the old p2m info to the new area. */
2170	memcpy(new_p2m, xen_p2m_addr, size);
2171	xen_p2m_addr = new_p2m;
2172
2173	/* Release the old p2m list and set new list info. */
2174	p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list));
2175	BUG_ON(!p2m_pfn);
2176	p2m_pfn_end = p2m_pfn + PFN_DOWN(size);
2177
2178	if (xen_start_info->mfn_list < __START_KERNEL_map) {
2179		pfn = xen_start_info->first_p2m_pfn;
2180		pfn_end = xen_start_info->first_p2m_pfn +
2181			  xen_start_info->nr_p2m_frames;
2182		set_pgd(pgd + 1, __pgd(0));
2183	} else {
2184		pfn = p2m_pfn;
2185		pfn_end = p2m_pfn_end;
2186	}
2187
2188	memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
2189	while (pfn < pfn_end) {
2190		if (pfn == p2m_pfn) {
2191			pfn = p2m_pfn_end;
2192			continue;
2193		}
2194		make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
2195		pfn++;
2196	}
2197
2198	xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
2199	xen_start_info->first_p2m_pfn =  PFN_DOWN(new_area);
2200	xen_start_info->nr_p2m_frames = n_frames;
2201}
2202
2203#else	/* !CONFIG_X86_64 */
2204static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
2205static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
2206
2207static void __init xen_write_cr3_init(unsigned long cr3)
2208{
2209	unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
2210
2211	BUG_ON(read_cr3() != __pa(initial_page_table));
2212	BUG_ON(cr3 != __pa(swapper_pg_dir));
2213
2214	/*
2215	 * We are switching to swapper_pg_dir for the first time (from
2216	 * initial_page_table) and therefore need to mark that page
2217	 * read-only and then pin it.
2218	 *
2219	 * Xen disallows sharing of kernel PMDs for PAE
2220	 * guests. Therefore we must copy the kernel PMD from
2221	 * initial_page_table into a new kernel PMD to be used in
2222	 * swapper_pg_dir.
2223	 */
2224	swapper_kernel_pmd =
2225		extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
2226	copy_page(swapper_kernel_pmd, initial_kernel_pmd);
2227	swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
2228		__pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
2229	set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
2230
2231	set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
2232	xen_write_cr3(cr3);
2233	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
2234
2235	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
2236			  PFN_DOWN(__pa(initial_page_table)));
2237	set_page_prot(initial_page_table, PAGE_KERNEL);
2238	set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
2239
2240	pv_mmu_ops.write_cr3 = &xen_write_cr3;
2241}
2242
2243/*
2244 * For 32-bit domains xen_start_info->pt_base is the pgd address, which
2245 * might not be the first page table in the page table pool.
2246 * Iterate through the initial page tables to find the real page table base.
2247 */
2248static phys_addr_t xen_find_pt_base(pmd_t *pmd)
2249{
2250	phys_addr_t pt_base, paddr;
2251	unsigned pmdidx;
2252
2253	pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd));
2254
2255	for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++)
2256		if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) {
2257			paddr = m2p(pmd[pmdidx].pmd);
2258			pt_base = min(pt_base, paddr);
2259		}
2260
2261	return pt_base;
2262}
2263
2264void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
2265{
2266	pmd_t *kernel_pmd;
2267
2268	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
2269
2270	xen_pt_base = xen_find_pt_base(kernel_pmd);
2271	xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE;
2272
2273	initial_kernel_pmd =
2274		extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
2275
2276	max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024);
2277
2278	copy_page(initial_kernel_pmd, kernel_pmd);
2279
2280	xen_map_identity_early(initial_kernel_pmd, max_pfn);
2281
2282	copy_page(initial_page_table, pgd);
2283	initial_page_table[KERNEL_PGD_BOUNDARY] =
2284		__pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
2285
2286	set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
2287	set_page_prot(initial_page_table, PAGE_KERNEL_RO);
2288	set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
2289
2290	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
2291
2292	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
2293			  PFN_DOWN(__pa(initial_page_table)));
2294	xen_write_cr3(__pa(initial_page_table));
2295
2296	memblock_reserve(xen_pt_base, xen_pt_size);
2297}
2298#endif	/* CONFIG_X86_64 */
2299
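/*
 * Reserve the pages holding the start_info structure, the xenstore ring
 * and (for unprivileged domains) the console ring, so they are never
 * handed out as ordinary RAM.
 */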
2300void __init xen_reserve_special_pages(void)
2301{
2302	phys_addr_t paddr;
2303
2304	memblock_reserve(__pa(xen_start_info), PAGE_SIZE);
2305	if (xen_start_info->store_mfn) {
2306		paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn));
2307		memblock_reserve(paddr, PAGE_SIZE);
2308	}
2309	if (!xen_initial_domain()) {
2310		paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn));
2311		memblock_reserve(paddr, PAGE_SIZE);
2312	}
2313}
2314
2315void __init xen_pt_check_e820(void)
2316{
2317	if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) {
2318		xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n");
2319		BUG();
2320	}
2321}
2322
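/*
 * Scratch page backing fixmap slots (e.g. the local APIC and IO APIC
 * ranges) that must not touch real hardware under Xen; it is filled
 * with 0xff in xen_init_mmu_ops().
 */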
2323static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
2324
2325static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
2326{
2327	pte_t pte;
2328
2329	phys >>= PAGE_SHIFT;
2330
2331	switch (idx) {
2332	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
2333	case FIX_RO_IDT:
2334#ifdef CONFIG_X86_32
2335	case FIX_WP_TEST:
2336# ifdef CONFIG_HIGHMEM
2337	case FIX_KMAP_BEGIN ... FIX_KMAP_END:
2338# endif
2339#elif defined(CONFIG_X86_VSYSCALL_EMULATION)
2340	case VSYSCALL_PAGE:
2341#endif
2342	case FIX_TEXT_POKE0:
2343	case FIX_TEXT_POKE1:
2344		/* All local page mappings */
2345		pte = pfn_pte(phys, prot);
2346		break;
2347
2348#ifdef CONFIG_X86_LOCAL_APIC
2349	case FIX_APIC_BASE:	/* maps dummy local APIC */
2350		pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
2351		break;
2352#endif
2353
2354#ifdef CONFIG_X86_IO_APIC
2355	case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
2356		/*
2357		 * We just don't map the IO APIC - all access is via
2358		 * hypercalls.  Keep the address in the pte for reference.
2359		 */
2360		pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
2361		break;
2362#endif
2363
2364	case FIX_PARAVIRT_BOOTMAP:
2365		/* This is an MFN, but it isn't an IO mapping from the
2366		   IO domain */
2367		pte = mfn_pte(phys, prot);
2368		break;
2369
2370	default:
2371		/* By default, set_fixmap is used for hardware mappings */
2372		pte = mfn_pte(phys, prot);
2373		break;
2374	}
2375
2376	__native_set_fixmap(idx, pte);
2377
2378#ifdef CONFIG_X86_VSYSCALL_EMULATION
2379	/* Replicate changes to map the vsyscall page into the user
2380	   pagetable vsyscall mapping. */
2381	if (idx == VSYSCALL_PAGE) {
2382		unsigned long vaddr = __fix_to_virt(idx);
2383		set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
2384	}
2385#endif
2386}
2387
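/*
 * Once the kernel memory allocators are up, the boot-time mmu ops (the
 * *_init and *_hyper variants) are no longer needed: switch to the final
 * implementations and mark init_mm as pinned.
 */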
2388static void __init xen_post_allocator_init(void)
2389{
2390	if (xen_feature(XENFEAT_auto_translated_physmap))
2391		return;
2392
2393	pv_mmu_ops.set_pte = xen_set_pte;
2394	pv_mmu_ops.set_pmd = xen_set_pmd;
2395	pv_mmu_ops.set_pud = xen_set_pud;
2396#if CONFIG_PGTABLE_LEVELS == 4
2397	pv_mmu_ops.set_pgd = xen_set_pgd;
2398#endif
2399
2400	/* This will work as long as patching hasn't happened yet
2401	   (which it hasn't) */
2402	pv_mmu_ops.alloc_pte = xen_alloc_pte;
2403	pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
2404	pv_mmu_ops.release_pte = xen_release_pte;
2405	pv_mmu_ops.release_pmd = xen_release_pmd;
2406#if CONFIG_PGTABLE_LEVELS == 4
2407	pv_mmu_ops.alloc_pud = xen_alloc_pud;
2408	pv_mmu_ops.release_pud = xen_release_pud;
2409#endif
2410
2411#ifdef CONFIG_X86_64
2412	pv_mmu_ops.write_cr3 = &xen_write_cr3;
2413	SetPagePinned(virt_to_page(level3_user_vsyscall));
2414#endif
2415	xen_mark_init_mm_pinned();
2416}
2417
2418static void xen_leave_lazy_mmu(void)
2419{
2420	preempt_disable();
2421	xen_mc_flush();
2422	paravirt_leave_lazy_mmu();
2423	preempt_enable();
2424}
2425
2426static const struct pv_mmu_ops xen_mmu_ops __initconst = {
2427	.read_cr2 = xen_read_cr2,
2428	.write_cr2 = xen_write_cr2,
2429
2430	.read_cr3 = xen_read_cr3,
2431	.write_cr3 = xen_write_cr3_init,
2432
2433	.flush_tlb_user = xen_flush_tlb,
2434	.flush_tlb_kernel = xen_flush_tlb,
2435	.flush_tlb_single = xen_flush_tlb_single,
2436	.flush_tlb_others = xen_flush_tlb_others,
2437
2438	.pte_update = paravirt_nop,
2439	.pte_update_defer = paravirt_nop,
2440
2441	.pgd_alloc = xen_pgd_alloc,
2442	.pgd_free = xen_pgd_free,
2443
2444	.alloc_pte = xen_alloc_pte_init,
2445	.release_pte = xen_release_pte_init,
2446	.alloc_pmd = xen_alloc_pmd_init,
2447	.release_pmd = xen_release_pmd_init,
2448
2449	.set_pte = xen_set_pte_init,
2450	.set_pte_at = xen_set_pte_at,
2451	.set_pmd = xen_set_pmd_hyper,
2452
2453	.ptep_modify_prot_start = __ptep_modify_prot_start,
2454	.ptep_modify_prot_commit = __ptep_modify_prot_commit,
2455
2456	.pte_val = PV_CALLEE_SAVE(xen_pte_val),
2457	.pgd_val = PV_CALLEE_SAVE(xen_pgd_val),
2458
2459	.make_pte = PV_CALLEE_SAVE(xen_make_pte),
2460	.make_pgd = PV_CALLEE_SAVE(xen_make_pgd),
2461
2462#ifdef CONFIG_X86_PAE
2463	.set_pte_atomic = xen_set_pte_atomic,
2464	.pte_clear = xen_pte_clear,
2465	.pmd_clear = xen_pmd_clear,
2466#endif	/* CONFIG_X86_PAE */
2467	.set_pud = xen_set_pud_hyper,
2468
2469	.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
2470	.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
2471
2472#if CONFIG_PGTABLE_LEVELS == 4
2473	.pud_val = PV_CALLEE_SAVE(xen_pud_val),
2474	.make_pud = PV_CALLEE_SAVE(xen_make_pud),
2475	.set_pgd = xen_set_pgd_hyper,
2476
2477	.alloc_pud = xen_alloc_pmd_init,
2478	.release_pud = xen_release_pmd_init,
2479#endif	/* CONFIG_PGTABLE_LEVELS == 4 */
2480
2481	.activate_mm = xen_activate_mm,
2482	.dup_mmap = xen_dup_mmap,
2483	.exit_mmap = xen_exit_mmap,
2484
2485	.lazy_mode = {
2486		.enter = paravirt_enter_lazy_mmu,
2487		.leave = xen_leave_lazy_mmu,
2488		.flush = paravirt_flush_lazy_mmu,
2489	},
2490
2491	.set_fixmap = xen_set_fixmap,
2492};
2493
2494void __init xen_init_mmu_ops(void)
2495{
2496	x86_init.paging.pagetable_init = xen_pagetable_init;
2497
2498	if (xen_feature(XENFEAT_auto_translated_physmap))
2499		return;
2500
2501	pv_mmu_ops = xen_mmu_ops;
2502
2503	memset(dummy_mapping, 0xff, PAGE_SIZE);
2504}
2505
2506/* Protected by xen_reservation_lock. */
2507#define MAX_CONTIG_ORDER 9 /* 2MB */
2508static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2509
2510#define VOID_PTE (mfn_pte(0, __pgprot(0)))
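/*
 * Zap the PTEs covering 1<<order pages at @vaddr, recording the old MFNs
 * in @in_frames and/or the PFNs in @out_frames when those arrays are
 * supplied, and mark the PFNs as INVALID_P2M_ENTRY.
 */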
2511static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2512				unsigned long *in_frames,
2513				unsigned long *out_frames)
2514{
2515	int i;
2516	struct multicall_space mcs;
2517
2518	xen_mc_batch();
2519	for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2520		mcs = __xen_mc_entry(0);
2521
2522		if (in_frames)
2523			in_frames[i] = virt_to_mfn(vaddr);
2524
2525		MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2526		__set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2527
2528		if (out_frames)
2529			out_frames[i] = virt_to_pfn(vaddr);
2530	}
2531	xen_mc_issue(0);
2532}
2533
2534/*
2535 * Update the pfn-to-mfn mappings for a virtual address range, either to
2536 * point to an array of mfns, or contiguously from a single starting
2537 * mfn.
2538 */
2539static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2540				     unsigned long *mfns,
2541				     unsigned long first_mfn)
2542{
2543	unsigned i, limit;
2544	unsigned long mfn;
2545
2546	xen_mc_batch();
2547
2548	limit = 1u << order;
2549	for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2550		struct multicall_space mcs;
2551		unsigned flags;
2552
2553		mcs = __xen_mc_entry(0);
2554		if (mfns)
2555			mfn = mfns[i];
2556		else
2557			mfn = first_mfn + i;
2558
2559		if (i < (limit - 1))
2560			flags = 0;
2561		else {
2562			if (order == 0)
2563				flags = UVMF_INVLPG | UVMF_ALL;
2564			else
2565				flags = UVMF_TLB_FLUSH | UVMF_ALL;
2566		}
2567
2568		MULTI_update_va_mapping(mcs.mc, vaddr,
2569				mfn_pte(mfn, PAGE_KERNEL), flags);
2570
2571		set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2572	}
2573
2574	xen_mc_issue(0);
2575}
2576
2577/*
2578 * Perform the hypercall to exchange a region of our pfns to point to
2579 * memory with the required contiguous alignment.  Takes the pfns as
2580 * input, and populates mfns as output.
2581 *
2582 * Returns a success code indicating whether the hypervisor was able to
2583 * satisfy the request or not.
2584 */
2585static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2586			       unsigned long *pfns_in,
2587			       unsigned long extents_out,
2588			       unsigned int order_out,
2589			       unsigned long *mfns_out,
2590			       unsigned int address_bits)
2591{
2592	long rc;
2593	int success;
2594
2595	struct xen_memory_exchange exchange = {
2596		.in = {
2597			.nr_extents   = extents_in,
2598			.extent_order = order_in,
2599			.extent_start = pfns_in,
2600			.domid        = DOMID_SELF
2601		},
2602		.out = {
2603			.nr_extents   = extents_out,
2604			.extent_order = order_out,
2605			.extent_start = mfns_out,
2606			.address_bits = address_bits,
2607			.domid        = DOMID_SELF
2608		}
2609	};
2610
2611	BUG_ON(extents_in << order_in != extents_out << order_out);
2612
2613	rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2614	success = (exchange.nr_exchanged == extents_in);
2615
2616	BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2617	BUG_ON(success && (rc != 0));
2618
2619	return success;
2620}
2621
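/*
 * Exchange the (possibly scattered) machine pages backing 1<<order pages
 * at @pstart for a single machine-contiguous extent addressable with
 * @address_bits, remap the virtual range onto the new extent and return
 * its machine address in *dma_handle.
 */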
2622int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
2623				 unsigned int address_bits,
2624				 dma_addr_t *dma_handle)
2625{
2626	unsigned long *in_frames = discontig_frames, out_frame;
2627	unsigned long  flags;
2628	int            success;
2629	unsigned long vstart = (unsigned long)phys_to_virt(pstart);
2630
2631	/*
2632	 * Currently an auto-translated guest will not perform I/O, nor will
2633	 * it require PAE page directories below 4GB. Therefore any calls to
2634	 * this function are redundant and can be ignored.
2635	 */
2636
2637	if (xen_feature(XENFEAT_auto_translated_physmap))
2638		return 0;
2639
2640	if (unlikely(order > MAX_CONTIG_ORDER))
2641		return -ENOMEM;
2642
2643	memset((void *) vstart, 0, PAGE_SIZE << order);
2644
2645	spin_lock_irqsave(&xen_reservation_lock, flags);
2646
2647	/* 1. Zap current PTEs, remembering MFNs. */
2648	xen_zap_pfn_range(vstart, order, in_frames, NULL);
2649
2650	/* 2. Get a new contiguous memory extent. */
2651	out_frame = virt_to_pfn(vstart);
2652	success = xen_exchange_memory(1UL << order, 0, in_frames,
2653				      1, order, &out_frame,
2654				      address_bits);
2655
2656	/* 3. Map the new extent in place of old pages. */
2657	if (success)
2658		xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2659	else
2660		xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2661
2662	spin_unlock_irqrestore(&xen_reservation_lock, flags);
2663
2664	*dma_handle = virt_to_machine(vstart).maddr;
2665	return success ? 0 : -ENOMEM;
2666}
2667EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2668
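/*
 * The inverse of xen_create_contiguous_region(): hand the machine-contiguous
 * extent backing 1<<order pages at @pstart back to the hypervisor in exchange
 * for ordinary (possibly scattered) machine pages.
 */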
2669void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
2670{
2671	unsigned long *out_frames = discontig_frames, in_frame;
2672	unsigned long  flags;
2673	int success;
2674	unsigned long vstart;
2675
2676	if (xen_feature(XENFEAT_auto_translated_physmap))
2677		return;
2678
2679	if (unlikely(order > MAX_CONTIG_ORDER))
2680		return;
2681
2682	vstart = (unsigned long)phys_to_virt(pstart);
2683	memset((void *) vstart, 0, PAGE_SIZE << order);
2684
2685	spin_lock_irqsave(&xen_reservation_lock, flags);
2686
2687	/* 1. Find start MFN of contiguous extent. */
2688	in_frame = virt_to_mfn(vstart);
2689
2690	/* 2. Zap current PTEs. */
2691	xen_zap_pfn_range(vstart, order, NULL, out_frames);
2692
2693	/* 3. Do the exchange for non-contiguous MFNs. */
2694	success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2695					0, out_frames, 0);
2696
2697	/* 4. Map new pages in place of old pages. */
2698	if (success)
2699		xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2700	else
2701		xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2702
2703	spin_unlock_irqrestore(&xen_reservation_lock, flags);
2704}
2705EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
2706
2707#ifdef CONFIG_XEN_PVHVM
2708#ifdef CONFIG_PROC_VMCORE
2709/*
2710 * This function is used in two contexts:
2711 * - the kdump kernel has to check whether a pfn of the crashed kernel
2712 *   was a ballooned page. vmcore uses this function to decide
2713 *   whether to access a pfn of the crashed kernel.
2714 * - the kexec kernel has to check whether a pfn was ballooned by the
2715 *   previous kernel. If the pfn is ballooned, handle it properly.
2716 * Returns 0 if the pfn is not backed by a RAM page; the caller may
2717 * handle the pfn specially in that case.
2718 */
2719static int xen_oldmem_pfn_is_ram(unsigned long pfn)
2720{
2721	struct xen_hvm_get_mem_type a = {
2722		.domid = DOMID_SELF,
2723		.pfn = pfn,
2724	};
2725	int ram;
2726
2727	if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a))
2728		return -ENXIO;
2729
2730	switch (a.mem_type) {
2731		case HVMMEM_mmio_dm:
2732			ram = 0;
2733			break;
2734		case HVMMEM_ram_rw:
2735		case HVMMEM_ram_ro:
2736		default:
2737			ram = 1;
2738			break;
2739	}
2740
2741	return ram;
2742}
2743#endif
2744
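/*
 * Tell the hypervisor that the pagetable rooted at mm->pgd is going away,
 * so that (on shadow-paging Xen) the corresponding shadows can be dropped
 * eagerly rather than torn down entry by entry.
 */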
2745static void xen_hvm_exit_mmap(struct mm_struct *mm)
2746{
2747	struct xen_hvm_pagetable_dying a;
2748	int rc;
2749
2750	a.domid = DOMID_SELF;
2751	a.gpa = __pa(mm->pgd);
2752	rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2753	WARN_ON_ONCE(rc < 0);
2754}
2755
2756static int is_pagetable_dying_supported(void)
2757{
2758	struct xen_hvm_pagetable_dying a;
2759	int rc = 0;
2760
2761	a.domid = DOMID_SELF;
2762	a.gpa = 0x00;
2763	rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2764	if (rc < 0) {
2765		printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
2766		return 0;
2767	}
2768	return 1;
2769}
2770
2771void __init xen_hvm_init_mmu_ops(void)
2772{
2773	if (is_pagetable_dying_supported())
2774		pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
2775#ifdef CONFIG_PROC_VMCORE
2776	register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram);
2777#endif
2778}
2779#endif
2780
2781#define REMAP_BATCH_SIZE 16
2782
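/*
 * State shared between do_remap_gfn() and the per-PTE callback: @mfn either
 * points at a single starting frame (@contiguous) or walks an array of
 * frames, and @mmu_update is the next free slot in the current batch of
 * mmu_update requests.
 */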
2783struct remap_data {
2784	xen_pfn_t *mfn;
2785	bool contiguous;
2786	pgprot_t prot;
2787	struct mmu_update *mmu_update;
2788};
2789
2790static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2791				 unsigned long addr, void *data)
2792{
2793	struct remap_data *rmd = data;
2794	pte_t pte = pte_mkspecial(mfn_pte(*rmd->mfn, rmd->prot));
2795
2796	/* If we have a contiguous range, just update the mfn itself,
2797	   else advance the pointer to the next mfn. */
2798	if (rmd->contiguous)
2799		(*rmd->mfn)++;
2800	else
2801		rmd->mfn++;
2802
2803	rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
2804	rmd->mmu_update->val = pte_val_ma(pte);
2805	rmd->mmu_update++;
2806
2807	return 0;
2808}
2809
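/*
 * Map @nr frames owned by @domid into @vma at @addr.  With a NULL @err_ptr,
 * @gfn names a single starting frame and the mapping is contiguous;
 * otherwise @gfn is an array with one entry per frame and per-frame errors
 * are reported through @err_ptr.  Returns the number of frames mapped, or
 * a negative error code.
 */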
2810static int do_remap_gfn(struct vm_area_struct *vma,
2811			unsigned long addr,
2812			xen_pfn_t *gfn, int nr,
2813			int *err_ptr, pgprot_t prot,
2814			unsigned domid,
2815			struct page **pages)
2816{
2817	int err = 0;
2818	struct remap_data rmd;
2819	struct mmu_update mmu_update[REMAP_BATCH_SIZE];
2820	unsigned long range;
2821	int mapped = 0;
2822
2823	BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
2824
2825	if (xen_feature(XENFEAT_auto_translated_physmap)) {
2826#ifdef CONFIG_XEN_PVH
2827		/* We need to update the local page tables and the xen HAP */
2828		return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr,
2829						 prot, domid, pages);
2830#else
2831		return -EINVAL;
2832#endif
2833	}
2834
2835	rmd.mfn = gfn;
2836	rmd.prot = prot;
2837	/* We use err_ptr to indicate whether we are doing a contiguous
2838	 * mapping or a discontiguous mapping. */
2839	rmd.contiguous = !err_ptr;
2840
2841	while (nr) {
2842		int index = 0;
2843		int done = 0;
2844		int batch = min(REMAP_BATCH_SIZE, nr);
2845		int batch_left = batch;
2846		range = (unsigned long)batch << PAGE_SHIFT;
2847
2848		rmd.mmu_update = mmu_update;
2849		err = apply_to_page_range(vma->vm_mm, addr, range,
2850					  remap_area_mfn_pte_fn, &rmd);
2851		if (err)
2852			goto out;
2853
2854		/* Record the error for each page that fails, but keep
2855		 * mapping until the whole set is done */
2856		do {
2857			int i;
2858
2859			err = HYPERVISOR_mmu_update(&mmu_update[index],
2860						    batch_left, &done, domid);
2861
2862			/*
2863			 * @err_ptr may be the same buffer as @gfn, so
2864			 * only clear it after each chunk of @gfn is
2865			 * used.
2866			 */
2867			if (err_ptr) {
2868				for (i = index; i < index + done; i++)
2869					err_ptr[i] = 0;
2870			}
2871			if (err < 0) {
2872				if (!err_ptr)
2873					goto out;
2874				err_ptr[i] = err;
2875				done++; /* Skip failed frame. */
2876			} else
2877				mapped += done;
2878			batch_left -= done;
2879			index += done;
2880		} while (batch_left);
2881
2882		nr -= batch;
2883		addr += range;
2884		if (err_ptr)
2885			err_ptr += batch;
2886		cond_resched();
2887	}
2888out:
2889
2890	xen_flush_tlb_all();
2891
2892	return err < 0 ? err : mapped;
2893}
2894
2895int xen_remap_domain_gfn_range(struct vm_area_struct *vma,
2896			       unsigned long addr,
2897			       xen_pfn_t gfn, int nr,
2898			       pgprot_t prot, unsigned domid,
2899			       struct page **pages)
2900{
2901	return do_remap_gfn(vma, addr, &gfn, nr, NULL, prot, domid, pages);
2902}
2903EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_range);
2904
2905int xen_remap_domain_gfn_array(struct vm_area_struct *vma,
2906			       unsigned long addr,
2907			       xen_pfn_t *gfn, int nr,
2908			       int *err_ptr, pgprot_t prot,
2909			       unsigned domid, struct page **pages)
2910{
2911	/* We BUG_ON because it's a programmer error to pass a NULL err_ptr;
2912	 * without it, it is very hard to detect later what actually caused
2913	 * the wrong memory to be mapped in.
2914	 */
2915	BUG_ON(err_ptr == NULL);
2916	return do_remap_gfn(vma, addr, gfn, nr, err_ptr, prot, domid, pages);
2917}
2918EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array);
2919
2920
2921/* Returns: 0 success */
2922int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
2923			       int numpgs, struct page **pages)
2924{
2925	if (!pages || !xen_feature(XENFEAT_auto_translated_physmap))
2926		return 0;
2927
2928#ifdef CONFIG_XEN_PVH
2929	return xen_xlate_unmap_gfn_range(vma, numpgs, pages);
2930#else
2931	return -EINVAL;
2932#endif
2933}
2934EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range);
2935