/*
 * Copyright 2007 Andi Kleen, SUSE Labs.
 * Subject to the GPL, v.2
 *
 * This contains most of the x86 vDSO kernel-side code.
 */
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/random.h>
#include <linux/elf.h>
#include <linux/cpu.h>
#include <asm/vgtod.h>
#include <asm/proto.h>
#include <asm/vdso.h>
#include <asm/vvar.h>
#include <asm/page.h>
#include <asm/hpet.h>
#include <asm/desc.h>

#if defined(CONFIG_X86_64)
unsigned int __read_mostly vdso64_enabled = 1;
#endif

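/*
 * Populate the page array backing the vdso text mapping and apply the
 * instruction alternatives selected for this CPU, so later mmaps of the
 * image see the patched text.
 */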
void __init init_vdso_image(const struct vdso_image *image)
{
	int i;
	int npages = (image->size) / PAGE_SIZE;

	BUG_ON(image->size % PAGE_SIZE != 0);
	for (i = 0; i < npages; i++)
		image->text_mapping.pages[i] =
			virt_to_page(image->data + i*PAGE_SIZE);

	apply_alternatives((struct alt_instr *)(image->data + image->alt),
			   (struct alt_instr *)(image->data + image->alt +
						image->alt_len));
}

struct linux_binprm;

/*
 * Put the vdso above the (randomized) stack with another randomized
 * offset.  This way there is no hole in the middle of the address space.
 * To save memory, make sure it is still in the same PMD as the stack
 * top, so the two can share a last-level page table.  This doesn't give
 * that many random bits.
 *
 * Note that this algorithm is imperfect: the distribution of the vdso
 * start address within a PMD is biased toward the end.
 *
 * Only used for the 64-bit and x32 vdsos.
 */
static unsigned long vdso_addr(unsigned long start, unsigned len)
{
#ifdef CONFIG_X86_32
	return 0;
#else
	unsigned long addr, end;
	unsigned offset;

	/*
	 * Round up the start address.  It can start out unaligned as a result
	 * of stack start randomization.
	 */
	start = PAGE_ALIGN(start);

	/* Round the lowest possible end address up to a PMD boundary. */
	end = (start + len + PMD_SIZE - 1) & PMD_MASK;
	if (end >= TASK_SIZE_MAX)
		end = TASK_SIZE_MAX;
	end -= len;

	if (end > start) {
		offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1);
		addr = start + (offset << PAGE_SHIFT);
	} else {
		addr = start;
	}

	/*
	 * Forcibly align the final address in case we have a hardware
	 * issue that requires alignment for performance reasons.
	 */
	addr = align_vdso_addr(addr);

	return addr;
#endif
}

static int map_vdso(const struct vdso_image *image, bool calculate_addr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long addr, text_start;
	int ret = 0;
	static struct page *no_pages[] = {NULL};
	static struct vm_special_mapping vvar_mapping = {
		.name = "[vvar]",
		.pages = no_pages,
	};

	if (calculate_addr) {
		addr = vdso_addr(current->mm->start_stack,
				 image->size - image->sym_vvar_start);
	} else {
		addr = 0;
	}

	down_write(&mm->mmap_sem);

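	/*
	 * sym_vvar_start is the (negative) offset of the vvar area from
	 * the start of the vdso text, so image->size - sym_vvar_start is
	 * the size of the combined vvar + text area we need room for.
	 */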
	addr = get_unmapped_area(NULL, addr,
				 image->size - image->sym_vvar_start, 0, 0);
	if (IS_ERR_VALUE(addr)) {
		ret = addr;
		goto up_fail;
	}

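	/*
	 * The vvar pages sit below the vdso text, so the text starts
	 * -sym_vvar_start bytes into the area we just reserved.
	 */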
	text_start = addr - image->sym_vvar_start;
	current->mm->context.vdso = (void __user *)text_start;

	/*
	 * MAYWRITE to allow gdb to COW and set breakpoints
	 */
	vma = _install_special_mapping(mm,
				       text_start,
				       image->size,
				       VM_READ|VM_EXEC|
				       VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
				       &image->text_mapping);

	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto up_fail;
	}

	vma = _install_special_mapping(mm,
				       addr,
				       -image->sym_vvar_start,
				       VM_READ|VM_MAYREAD,
				       &vvar_mapping);

	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto up_fail;
	}

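	/*
	 * Map the kernel's shared vvar page read-only at the offset the
	 * vdso expects, so the VVAR variables (e.g. the gtod data) are
	 * visible to user space.
	 */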
	if (image->sym_vvar_page)
		ret = remap_pfn_range(vma,
				      text_start + image->sym_vvar_page,
				      __pa_symbol(&__vvar_page) >> PAGE_SHIFT,
				      PAGE_SIZE,
				      PAGE_READONLY);

	if (ret)
		goto up_fail;

#ifdef CONFIG_HPET_TIMER
	if (hpet_address && image->sym_hpet_page) {
		ret = io_remap_pfn_range(vma,
			text_start + image->sym_hpet_page,
			hpet_address >> PAGE_SHIFT,
			PAGE_SIZE,
			pgprot_noncached(PAGE_READONLY));

		if (ret)
			goto up_fail;
	}
#endif

up_fail:
	if (ret)
		current->mm->context.vdso = NULL;

	up_write(&mm->mmap_sem);
	return ret;
}

#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT)
static int load_vdso32(void)
{
	int ret;

	if (vdso32_enabled != 1)  /* Other values all mean "disabled" */
		return 0;

	ret = map_vdso(selected_vdso32, false);
	if (ret)
		return ret;

	if (selected_vdso32->sym_VDSO32_SYSENTER_RETURN)
		current_thread_info()->sysenter_return =
			current->mm->context.vdso +
			selected_vdso32->sym_VDSO32_SYSENTER_RETURN;

	return 0;
}
#endif

#ifdef CONFIG_X86_64
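/*
 * Called by the ELF loader at exec time to map the vdso into the new
 * process image (unless it has been disabled with vdso=0).
 */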
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
	if (!vdso64_enabled)
		return 0;

	return map_vdso(&vdso_image_64, true);
}

#ifdef CONFIG_COMPAT
int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
				       int uses_interp)
{
#ifdef CONFIG_X86_X32_ABI
	if (test_thread_flag(TIF_X32)) {
		if (!vdso64_enabled)
			return 0;

		return map_vdso(&vdso_image_x32, true);
	}
#endif

	return load_vdso32();
}
#endif
#else
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
	return load_vdso32();
}
#endif

#ifdef CONFIG_X86_64
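/*
 * "vdso=" boot parameter: "vdso=0" disables the 64-bit vdso, any other
 * value leaves it enabled.
 */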
static __init int vdso_setup(char *s)
{
	vdso64_enabled = simple_strtoul(s, NULL, 0);
	return 0;
}
__setup("vdso=", vdso_setup);
#endif

#ifdef CONFIG_X86_64
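/*
 * Program the per-CPU state that user-space getcpu reads: the CPU and
 * node numbers are encoded both in IA32_TSC_AUX (for RDTSCP) and in the
 * limit of the GDT_ENTRY_PER_CPU segment (for LSL), so that user space
 * can recover them as
 *
 *	cpu  = value & 0xfff;
 *	node = value >> 12;
 */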
static void vgetcpu_cpu_init(void *arg)
{
	int cpu = smp_processor_id();
	struct desc_struct d = { };
	unsigned long node = 0;
#ifdef CONFIG_NUMA
	node = cpu_to_node(cpu);
#endif
	if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
		write_rdtscp_aux((node << 12) | cpu);

	/*
	 * Store the cpu number in the segment limit so that it can be
	 * loaded quickly in user space in vgetcpu (12 bits for the CPU
	 * and 8 bits for the node).
	 */
	d.limit0 = cpu | ((node & 0xf) << 12);
	d.limit = node >> 4;
	d.type = 5;		/* RO data, expand down, accessed */
	d.dpl = 3;		/* Visible to user code */
	d.s = 1;		/* Not a system segment */
	d.p = 1;		/* Present */
	d.d = 1;		/* 32-bit */

	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}

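/*
 * Re-run vgetcpu_cpu_init() on a CPU whenever it comes online, so that
 * hotplugged CPUs also get their TSC_AUX and GDT entry programmed.
 */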
static int
vgetcpu_cpu_notifier(struct notifier_block *n, unsigned long action, void *arg)
{
	long cpu = (long)arg;

	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
		smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1);

	return NOTIFY_DONE;
}

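/*
 * Boot-time initialization: prepare the vdso images, program every
 * currently online CPU for vgetcpu, and register the hotplug notifier.
 */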
static int __init init_vdso(void)
{
	init_vdso_image(&vdso_image_64);

#ifdef CONFIG_X86_X32_ABI
	init_vdso_image(&vdso_image_x32);
#endif

	cpu_notifier_register_begin();

	on_each_cpu(vgetcpu_cpu_init, NULL, 1);
	/* notifier priority > KVM */
	__hotcpu_notifier(vgetcpu_cpu_notifier, 30);

	cpu_notifier_register_done();

	return 0;
}
subsys_initcall(init_vdso);
#endif /* CONFIG_X86_64 */