1/*
2 * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
3 * Licensed under the GPL
4 */
5
6#include <linux/mm.h>
7#include <linux/module.h>
8#include <linux/sched.h>
9#include <asm/pgtable.h>
10#include <asm/tlbflush.h>
11#include <as-layout.h>
12#include <mem_user.h>
13#include <os.h>
14#include <skas.h>
15#include <kern_util.h>
16
17struct host_vm_change {
18	struct host_vm_op {
19		enum { NONE, MMAP, MUNMAP, MPROTECT } type;
20		union {
21			struct {
22				unsigned long addr;
23				unsigned long len;
24				unsigned int prot;
25				int fd;
26				__u64 offset;
27			} mmap;
28			struct {
29				unsigned long addr;
30				unsigned long len;
31			} munmap;
32			struct {
33				unsigned long addr;
34				unsigned long len;
35				unsigned int prot;
36			} mprotect;
37		} u;
38	} ops[1];
39	int index;
40	struct mm_id *id;
41	void *data;
42	int force;
43};
44
/*
 * Build a struct host_vm_change with an empty operation queue that
 * targets the address space of mm_ (a struct mm_struct pointer).
 *
 * The parameter names carry a trailing underscore on purpose: macro
 * parameters are substituted everywhere in the body, including into
 * designators, so a parameter spelled "force" would rewrite ".force" as
 * well and the macro would only expand correctly when the caller's
 * argument happened to be spelled "force" too.  Both arguments are
 * parenthesized so arbitrary expressions can be passed.
 */
#define INIT_HVC(mm_, force_) \
	((struct host_vm_change) \
	 { .ops		= { { .type = NONE } },	\
	   .id		= &(mm_)->context.id, \
	   .data	= NULL, \
	   .index	= 0, \
	   .force	= (force_) })
52
53static void report_enomem(void)
54{
55	printk(KERN_ERR "UML ran out of memory on the host side! "
56			"This can happen due to a memory limitation or "
57			"vm.max_map_count has been reached.\n");
58}
59
60static int do_ops(struct host_vm_change *hvc, int end,
61		  int finished)
62{
63	struct host_vm_op *op;
64	int i, ret = 0;
65
66	for (i = 0; i < end && !ret; i++) {
67		op = &hvc->ops[i];
68		switch (op->type) {
69		case MMAP:
70			ret = map(hvc->id, op->u.mmap.addr, op->u.mmap.len,
71				  op->u.mmap.prot, op->u.mmap.fd,
72				  op->u.mmap.offset, finished, &hvc->data);
73			break;
74		case MUNMAP:
75			ret = unmap(hvc->id, op->u.munmap.addr,
76				    op->u.munmap.len, finished, &hvc->data);
77			break;
78		case MPROTECT:
79			ret = protect(hvc->id, op->u.mprotect.addr,
80				      op->u.mprotect.len, op->u.mprotect.prot,
81				      finished, &hvc->data);
82			break;
83		default:
84			printk(KERN_ERR "Unknown op type %d in do_ops\n",
85			       op->type);
86			BUG();
87			break;
88		}
89	}
90
91	if (ret == -ENOMEM)
92		report_enomem();
93
94	return ret;
95}
96
97static int add_mmap(unsigned long virt, unsigned long phys, unsigned long len,
98		    unsigned int prot, struct host_vm_change *hvc)
99{
100	__u64 offset;
101	struct host_vm_op *last;
102	int fd, ret = 0;
103
104	fd = phys_mapping(phys, &offset);
105	if (hvc->index != 0) {
106		last = &hvc->ops[hvc->index - 1];
107		if ((last->type == MMAP) &&
108		   (last->u.mmap.addr + last->u.mmap.len == virt) &&
109		   (last->u.mmap.prot == prot) && (last->u.mmap.fd == fd) &&
110		   (last->u.mmap.offset + last->u.mmap.len == offset)) {
111			last->u.mmap.len += len;
112			return 0;
113		}
114	}
115
116	if (hvc->index == ARRAY_SIZE(hvc->ops)) {
117		ret = do_ops(hvc, ARRAY_SIZE(hvc->ops), 0);
118		hvc->index = 0;
119	}
120
121	hvc->ops[hvc->index++] = ((struct host_vm_op)
122				  { .type	= MMAP,
123				    .u = { .mmap = { .addr	= virt,
124						     .len	= len,
125						     .prot	= prot,
126						     .fd	= fd,
127						     .offset	= offset }
128			   } });
129	return ret;
130}
131
132static int add_munmap(unsigned long addr, unsigned long len,
133		      struct host_vm_change *hvc)
134{
135	struct host_vm_op *last;
136	int ret = 0;
137
138	if ((addr >= STUB_START) && (addr < STUB_END))
139		return -EINVAL;
140
141	if (hvc->index != 0) {
142		last = &hvc->ops[hvc->index - 1];
143		if ((last->type == MUNMAP) &&
144		   (last->u.munmap.addr + last->u.mmap.len == addr)) {
145			last->u.munmap.len += len;
146			return 0;
147		}
148	}
149
150	if (hvc->index == ARRAY_SIZE(hvc->ops)) {
151		ret = do_ops(hvc, ARRAY_SIZE(hvc->ops), 0);
152		hvc->index = 0;
153	}
154
155	hvc->ops[hvc->index++] = ((struct host_vm_op)
156				  { .type	= MUNMAP,
157			     	    .u = { .munmap = { .addr	= addr,
158						       .len	= len } } });
159	return ret;
160}
161
162static int add_mprotect(unsigned long addr, unsigned long len,
163			unsigned int prot, struct host_vm_change *hvc)
164{
165	struct host_vm_op *last;
166	int ret = 0;
167
168	if (hvc->index != 0) {
169		last = &hvc->ops[hvc->index - 1];
170		if ((last->type == MPROTECT) &&
171		   (last->u.mprotect.addr + last->u.mprotect.len == addr) &&
172		   (last->u.mprotect.prot == prot)) {
173			last->u.mprotect.len += len;
174			return 0;
175		}
176	}
177
178	if (hvc->index == ARRAY_SIZE(hvc->ops)) {
179		ret = do_ops(hvc, ARRAY_SIZE(hvc->ops), 0);
180		hvc->index = 0;
181	}
182
183	hvc->ops[hvc->index++] = ((struct host_vm_op)
184				  { .type	= MPROTECT,
185			     	    .u = { .mprotect = { .addr	= addr,
186							 .len	= len,
187							 .prot	= prot } } });
188	return ret;
189}
190
/*
 * Add inc to n, then clear every bit below inc.  When inc is a power of
 * two this is the first multiple of inc strictly greater than n.
 * Note that inc is evaluated twice.
 */
#define ADD_ROUND(n, inc) \
	(~((inc) - 1) & ((inc) + (n)))
192
193static inline int update_pte_range(pmd_t *pmd, unsigned long addr,
194				   unsigned long end,
195				   struct host_vm_change *hvc)
196{
197	pte_t *pte;
198	int r, w, x, prot, ret = 0;
199
200	pte = pte_offset_kernel(pmd, addr);
201	do {
202		if ((addr >= STUB_START) && (addr < STUB_END))
203			continue;
204
205		r = pte_read(*pte);
206		w = pte_write(*pte);
207		x = pte_exec(*pte);
208		if (!pte_young(*pte)) {
209			r = 0;
210			w = 0;
211		} else if (!pte_dirty(*pte))
212			w = 0;
213
214		prot = ((r ? UM_PROT_READ : 0) | (w ? UM_PROT_WRITE : 0) |
215			(x ? UM_PROT_EXEC : 0));
216		if (hvc->force || pte_newpage(*pte)) {
217			if (pte_present(*pte))
218				ret = add_mmap(addr, pte_val(*pte) & PAGE_MASK,
219					       PAGE_SIZE, prot, hvc);
220			else
221				ret = add_munmap(addr, PAGE_SIZE, hvc);
222		} else if (pte_newprot(*pte))
223			ret = add_mprotect(addr, PAGE_SIZE, prot, hvc);
224		*pte = pte_mkuptodate(*pte);
225	} while (pte++, addr += PAGE_SIZE, ((addr < end) && !ret));
226	return ret;
227}
228
229static inline int update_pmd_range(pud_t *pud, unsigned long addr,
230				   unsigned long end,
231				   struct host_vm_change *hvc)
232{
233	pmd_t *pmd;
234	unsigned long next;
235	int ret = 0;
236
237	pmd = pmd_offset(pud, addr);
238	do {
239		next = pmd_addr_end(addr, end);
240		if (!pmd_present(*pmd)) {
241			if (hvc->force || pmd_newpage(*pmd)) {
242				ret = add_munmap(addr, next - addr, hvc);
243				pmd_mkuptodate(*pmd);
244			}
245		}
246		else ret = update_pte_range(pmd, addr, next, hvc);
247	} while (pmd++, addr = next, ((addr < end) && !ret));
248	return ret;
249}
250
251static inline int update_pud_range(pgd_t *pgd, unsigned long addr,
252				   unsigned long end,
253				   struct host_vm_change *hvc)
254{
255	pud_t *pud;
256	unsigned long next;
257	int ret = 0;
258
259	pud = pud_offset(pgd, addr);
260	do {
261		next = pud_addr_end(addr, end);
262		if (!pud_present(*pud)) {
263			if (hvc->force || pud_newpage(*pud)) {
264				ret = add_munmap(addr, next - addr, hvc);
265				pud_mkuptodate(*pud);
266			}
267		}
268		else ret = update_pmd_range(pud, addr, next, hvc);
269	} while (pud++, addr = next, ((addr < end) && !ret));
270	return ret;
271}
272
273void fix_range_common(struct mm_struct *mm, unsigned long start_addr,
274		      unsigned long end_addr, int force)
275{
276	pgd_t *pgd;
277	struct host_vm_change hvc;
278	unsigned long addr = start_addr, next;
279	int ret = 0;
280
281	hvc = INIT_HVC(mm, force);
282	pgd = pgd_offset(mm, addr);
283	do {
284		next = pgd_addr_end(addr, end_addr);
285		if (!pgd_present(*pgd)) {
286			if (force || pgd_newpage(*pgd)) {
287				ret = add_munmap(addr, next - addr, &hvc);
288				pgd_mkuptodate(*pgd);
289			}
290		}
291		else ret = update_pud_range(pgd, addr, next, &hvc);
292	} while (pgd++, addr = next, ((addr < end_addr) && !ret));
293
294	if (!ret)
295		ret = do_ops(&hvc, hvc.index, 1);
296
297	/* This is not an else because ret is modified above */
298	if (ret) {
299		printk(KERN_ERR "fix_range_common: failed, killing current "
300		       "process: %d\n", task_tgid_vnr(current));
301		/* We are under mmap_sem, release it such that current can terminate */
302		up_write(&current->mm->mmap_sem);
303		force_sig(SIGKILL, current);
304		do_signal(&current->thread.regs);
305	}
306}
307
308static int flush_tlb_kernel_range_common(unsigned long start, unsigned long end)
309{
310	struct mm_struct *mm;
311	pgd_t *pgd;
312	pud_t *pud;
313	pmd_t *pmd;
314	pte_t *pte;
315	unsigned long addr, last;
316	int updated = 0, err;
317
318	mm = &init_mm;
319	for (addr = start; addr < end;) {
320		pgd = pgd_offset(mm, addr);
321		if (!pgd_present(*pgd)) {
322			last = ADD_ROUND(addr, PGDIR_SIZE);
323			if (last > end)
324				last = end;
325			if (pgd_newpage(*pgd)) {
326				updated = 1;
327				err = os_unmap_memory((void *) addr,
328						      last - addr);
329				if (err < 0)
330					panic("munmap failed, errno = %d\n",
331					      -err);
332			}
333			addr = last;
334			continue;
335		}
336
337		pud = pud_offset(pgd, addr);
338		if (!pud_present(*pud)) {
339			last = ADD_ROUND(addr, PUD_SIZE);
340			if (last > end)
341				last = end;
342			if (pud_newpage(*pud)) {
343				updated = 1;
344				err = os_unmap_memory((void *) addr,
345						      last - addr);
346				if (err < 0)
347					panic("munmap failed, errno = %d\n",
348					      -err);
349			}
350			addr = last;
351			continue;
352		}
353
354		pmd = pmd_offset(pud, addr);
355		if (!pmd_present(*pmd)) {
356			last = ADD_ROUND(addr, PMD_SIZE);
357			if (last > end)
358				last = end;
359			if (pmd_newpage(*pmd)) {
360				updated = 1;
361				err = os_unmap_memory((void *) addr,
362						      last - addr);
363				if (err < 0)
364					panic("munmap failed, errno = %d\n",
365					      -err);
366			}
367			addr = last;
368			continue;
369		}
370
371		pte = pte_offset_kernel(pmd, addr);
372		if (!pte_present(*pte) || pte_newpage(*pte)) {
373			updated = 1;
374			err = os_unmap_memory((void *) addr,
375					      PAGE_SIZE);
376			if (err < 0)
377				panic("munmap failed, errno = %d\n",
378				      -err);
379			if (pte_present(*pte))
380				map_memory(addr,
381					   pte_val(*pte) & PAGE_MASK,
382					   PAGE_SIZE, 1, 1, 1);
383		}
384		else if (pte_newprot(*pte)) {
385			updated = 1;
386			os_protect_memory((void *) addr, PAGE_SIZE, 1, 1, 1);
387		}
388		addr += PAGE_SIZE;
389	}
390	return updated;
391}
392
393void flush_tlb_page(struct vm_area_struct *vma, unsigned long address)
394{
395	pgd_t *pgd;
396	pud_t *pud;
397	pmd_t *pmd;
398	pte_t *pte;
399	struct mm_struct *mm = vma->vm_mm;
400	void *flush = NULL;
401	int r, w, x, prot, err = 0;
402	struct mm_id *mm_id;
403
404	address &= PAGE_MASK;
405	pgd = pgd_offset(mm, address);
406	if (!pgd_present(*pgd))
407		goto kill;
408
409	pud = pud_offset(pgd, address);
410	if (!pud_present(*pud))
411		goto kill;
412
413	pmd = pmd_offset(pud, address);
414	if (!pmd_present(*pmd))
415		goto kill;
416
417	pte = pte_offset_kernel(pmd, address);
418
419	r = pte_read(*pte);
420	w = pte_write(*pte);
421	x = pte_exec(*pte);
422	if (!pte_young(*pte)) {
423		r = 0;
424		w = 0;
425	} else if (!pte_dirty(*pte)) {
426		w = 0;
427	}
428
429	mm_id = &mm->context.id;
430	prot = ((r ? UM_PROT_READ : 0) | (w ? UM_PROT_WRITE : 0) |
431		(x ? UM_PROT_EXEC : 0));
432	if (pte_newpage(*pte)) {
433		if (pte_present(*pte)) {
434			unsigned long long offset;
435			int fd;
436
437			fd = phys_mapping(pte_val(*pte) & PAGE_MASK, &offset);
438			err = map(mm_id, address, PAGE_SIZE, prot, fd, offset,
439				  1, &flush);
440		}
441		else err = unmap(mm_id, address, PAGE_SIZE, 1, &flush);
442	}
443	else if (pte_newprot(*pte))
444		err = protect(mm_id, address, PAGE_SIZE, prot, 1, &flush);
445
446	if (err) {
447		if (err == -ENOMEM)
448			report_enomem();
449
450		goto kill;
451	}
452
453	*pte = pte_mkuptodate(*pte);
454
455	return;
456
457kill:
458	printk(KERN_ERR "Failed to flush page for address 0x%lx\n", address);
459	force_sig(SIGKILL, current);
460}
461
462pgd_t *pgd_offset_proc(struct mm_struct *mm, unsigned long address)
463{
464	return pgd_offset(mm, address);
465}
466
467pud_t *pud_offset_proc(pgd_t *pgd, unsigned long address)
468{
469	return pud_offset(pgd, address);
470}
471
472pmd_t *pmd_offset_proc(pud_t *pud, unsigned long address)
473{
474	return pmd_offset(pud, address);
475}
476
477pte_t *pte_offset_proc(pmd_t *pmd, unsigned long address)
478{
479	return pte_offset_kernel(pmd, address);
480}
481
482pte_t *addr_pte(struct task_struct *task, unsigned long addr)
483{
484	pgd_t *pgd = pgd_offset(task->mm, addr);
485	pud_t *pud = pud_offset(pgd, addr);
486	pmd_t *pmd = pmd_offset(pud, addr);
487
488	return pte_offset_map(pmd, addr);
489}
490
491void flush_tlb_all(void)
492{
493	flush_tlb_mm(current->mm);
494}
495
/*
 * Flush the kernel range [start, end); the "anything updated" result of
 * flush_tlb_kernel_range_common() is discarded.
 */
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
	(void) flush_tlb_kernel_range_common(start, end);
}
500
501void flush_tlb_kernel_vm(void)
502{
503	flush_tlb_kernel_range_common(start_vm, end_vm);
504}
505
506void __flush_tlb_one(unsigned long addr)
507{
508	flush_tlb_kernel_range_common(addr, addr + PAGE_SIZE);
509}
510
/* File-local entry point that forwards unchanged to fix_range_common(). */
static void fix_range(struct mm_struct *mm, unsigned long start_addr,
		      unsigned long end_addr, int force)
{
	fix_range_common(mm, start_addr, end_addr, force);
	return;
}
516
517void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
518		     unsigned long end)
519{
520	if (vma->vm_mm == NULL)
521		flush_tlb_kernel_range_common(start, end);
522	else fix_range(vma->vm_mm, start, end, 0);
523}
524EXPORT_SYMBOL(flush_tlb_range);
525
526void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
527			unsigned long end)
528{
529	/*
530	 * Don't bother flushing if this address space is about to be
531	 * destroyed.
532	 */
533	if (atomic_read(&mm->mm_users) == 0)
534		return;
535
536	fix_range(mm, start, end, 0);
537}
538
539void flush_tlb_mm(struct mm_struct *mm)
540{
541	struct vm_area_struct *vma = mm->mmap;
542
543	while (vma != NULL) {
544		fix_range(mm, vma->vm_start, vma->vm_end, 0);
545		vma = vma->vm_next;
546	}
547}
548
549void force_flush_all(void)
550{
551	struct mm_struct *mm = current->mm;
552	struct vm_area_struct *vma = mm->mmap;
553
554	while (vma != NULL) {
555		fix_range(mm, vma->vm_start, vma->vm_end, 1);
556		vma = vma->vm_next;
557	}
558}
559