/*
 * common.c - C code for kernel entry and exit
 * Copyright (c) 2015 Andrew Lutomirski
 * GPL v2
 *
 * Based on asm and ptrace code by many authors.  The code here originated
 * in ptrace.c and signal.c.
 */
9
10#include <linux/kernel.h>
11#include <linux/sched.h>
12#include <linux/mm.h>
13#include <linux/smp.h>
14#include <linux/errno.h>
15#include <linux/ptrace.h>
16#include <linux/tracehook.h>
17#include <linux/audit.h>
18#include <linux/seccomp.h>
19#include <linux/signal.h>
20#include <linux/export.h>
21#include <linux/context_tracking.h>
22#include <linux/user-return-notifier.h>
23#include <linux/uprobes.h>
24
25#include <asm/desc.h>
26#include <asm/traps.h>
27#include <asm/vdso.h>
28#include <asm/uaccess.h>
29
30#define CREATE_TRACE_POINTS
31#include <trace/events/syscalls.h>
32
33static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
34{
35	unsigned long top_of_stack =
36		(unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING;
37	return (struct thread_info *)(top_of_stack - THREAD_SIZE);
38}
39
#ifdef CONFIG_CONTEXT_TRACKING
/*
 * Context-tracking hook for entry from user mode.  Must be called with
 * IRQs off.  Warns if context tracking does not currently report
 * CONTEXT_USER, then calls user_exit().
 */
__visible void enter_from_user_mode(void)
{
	CT_WARN_ON(CONTEXT_USER != ct_state());
	user_exit();
}
#endif
48
49static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
50{
51#ifdef CONFIG_X86_64
52	if (arch == AUDIT_ARCH_X86_64) {
53		audit_syscall_entry(regs->orig_ax, regs->di,
54				    regs->si, regs->dx, regs->r10);
55	} else
56#endif
57	{
58		audit_syscall_entry(regs->orig_ax, regs->bx,
59				    regs->cx, regs->dx, regs->si);
60	}
61}
62
63/*
64 * We can return 0 to resume the syscall or anything else to go to phase
65 * 2.  If we resume the syscall, we need to put something appropriate in
66 * regs->orig_ax.
67 *
68 * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
69 * are fully functional.
70 *
71 * For phase 2's benefit, our return value is:
72 * 0:			resume the syscall
73 * 1:			go to phase 2; no seccomp phase 2 needed
74 * anything else:	go to phase 2; pass return value to seccomp
75 */
76unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
77{
78	struct thread_info *ti = pt_regs_to_thread_info(regs);
79	unsigned long ret = 0;
80	u32 work;
81
82	if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
83		BUG_ON(regs != task_pt_regs(current));
84
85	work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
86
87#ifdef CONFIG_CONTEXT_TRACKING
88	/*
89	 * If TIF_NOHZ is set, we are required to call user_exit() before
90	 * doing anything that could touch RCU.
91	 */
92	if (work & _TIF_NOHZ) {
93		enter_from_user_mode();
94		work &= ~_TIF_NOHZ;
95	}
96#endif
97
98#ifdef CONFIG_SECCOMP
99	/*
100	 * Do seccomp first -- it should minimize exposure of other
101	 * code, and keeping seccomp fast is probably more valuable
102	 * than the rest of this.
103	 */
104	if (work & _TIF_SECCOMP) {
105		struct seccomp_data sd;
106
107		sd.arch = arch;
108		sd.nr = regs->orig_ax;
109		sd.instruction_pointer = regs->ip;
110#ifdef CONFIG_X86_64
111		if (arch == AUDIT_ARCH_X86_64) {
112			sd.args[0] = regs->di;
113			sd.args[1] = regs->si;
114			sd.args[2] = regs->dx;
115			sd.args[3] = regs->r10;
116			sd.args[4] = regs->r8;
117			sd.args[5] = regs->r9;
118		} else
119#endif
120		{
121			sd.args[0] = regs->bx;
122			sd.args[1] = regs->cx;
123			sd.args[2] = regs->dx;
124			sd.args[3] = regs->si;
125			sd.args[4] = regs->di;
126			sd.args[5] = regs->bp;
127		}
128
129		BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
130		BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);
131
132		ret = seccomp_phase1(&sd);
133		if (ret == SECCOMP_PHASE1_SKIP) {
134			regs->orig_ax = -1;
135			ret = 0;
136		} else if (ret != SECCOMP_PHASE1_OK) {
137			return ret;  /* Go directly to phase 2 */
138		}
139
140		work &= ~_TIF_SECCOMP;
141	}
142#endif
143
144	/* Do our best to finish without phase 2. */
145	if (work == 0)
146		return ret;  /* seccomp and/or nohz only (ret == 0 here) */
147
148#ifdef CONFIG_AUDITSYSCALL
149	if (work == _TIF_SYSCALL_AUDIT) {
150		/*
151		 * If there is no more work to be done except auditing,
152		 * then audit in phase 1.  Phase 2 always audits, so, if
153		 * we audit here, then we can't go on to phase 2.
154		 */
155		do_audit_syscall_entry(regs, arch);
156		return 0;
157	}
158#endif
159
160	return 1;  /* Something is enabled that we can't handle in phase 1 */
161}
162
163/* Returns the syscall nr to run (which should match regs->orig_ax). */
164long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
165				unsigned long phase1_result)
166{
167	struct thread_info *ti = pt_regs_to_thread_info(regs);
168	long ret = 0;
169	u32 work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
170
171	if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
172		BUG_ON(regs != task_pt_regs(current));
173
174	/*
175	 * If we stepped into a sysenter/syscall insn, it trapped in
176	 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
177	 * If user-mode had set TF itself, then it's still clear from
178	 * do_debug() and we need to set it again to restore the user
179	 * state.  If we entered on the slow path, TF was already set.
180	 */
181	if (work & _TIF_SINGLESTEP)
182		regs->flags |= X86_EFLAGS_TF;
183
184#ifdef CONFIG_SECCOMP
185	/*
186	 * Call seccomp_phase2 before running the other hooks so that
187	 * they can see any changes made by a seccomp tracer.
188	 */
189	if (phase1_result > 1 && seccomp_phase2(phase1_result)) {
190		/* seccomp failures shouldn't expose any additional code. */
191		return -1;
192	}
193#endif
194
195	if (unlikely(work & _TIF_SYSCALL_EMU))
196		ret = -1L;
197
198	if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
199	    tracehook_report_syscall_entry(regs))
200		ret = -1L;
201
202	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
203		trace_sys_enter(regs, regs->orig_ax);
204
205	do_audit_syscall_entry(regs, arch);
206
207	return ret ?: regs->orig_ax;
208}
209
210long syscall_trace_enter(struct pt_regs *regs)
211{
212	u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
213	unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch);
214
215	if (phase1_result == 0)
216		return regs->orig_ax;
217	else
218		return syscall_trace_enter_phase2(regs, arch, phase1_result);
219}
220
/*
 * Thread flags that must all be clear before returning to user mode; any
 * of them being set sends prepare_exit_to_usermode() through
 * exit_to_usermode_loop().
 */
#define EXIT_TO_USERMODE_LOOP_FLAGS				\
	(_TIF_NEED_RESCHED | _TIF_UPROBE | _TIF_SIGPENDING |	\
	 _TIF_NOTIFY_RESUME | _TIF_USER_RETURN_NOTIFY)
224
225static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
226{
227	/*
228	 * In order to return to user mode, we need to have IRQs off with
229	 * none of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_USER_RETURN_NOTIFY,
230	 * _TIF_UPROBE, or _TIF_NEED_RESCHED set.  Several of these flags
231	 * can be set at any time on preemptable kernels if we have IRQs on,
232	 * so we need to loop.  Disabling preemption wouldn't help: doing the
233	 * work to clear some of the flags can sleep.
234	 */
235	while (true) {
236		/* We have work to do. */
237		local_irq_enable();
238
239		if (cached_flags & _TIF_NEED_RESCHED)
240			schedule();
241
242		if (cached_flags & _TIF_UPROBE)
243			uprobe_notify_resume(regs);
244
245		/* deal with pending signal delivery */
246		if (cached_flags & _TIF_SIGPENDING)
247			do_signal(regs);
248
249		if (cached_flags & _TIF_NOTIFY_RESUME) {
250			clear_thread_flag(TIF_NOTIFY_RESUME);
251			tracehook_notify_resume(regs);
252		}
253
254		if (cached_flags & _TIF_USER_RETURN_NOTIFY)
255			fire_user_return_notifiers();
256
257		/* Disable IRQs and retry */
258		local_irq_disable();
259
260		cached_flags = READ_ONCE(pt_regs_to_thread_info(regs)->flags);
261
262		if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
263			break;
264
265	}
266}
267
268/* Called with IRQs disabled. */
269__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
270{
271	struct thread_info *ti = pt_regs_to_thread_info(regs);
272	u32 cached_flags;
273
274	if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
275		local_irq_disable();
276
277	lockdep_sys_exit();
278
279	cached_flags = READ_ONCE(ti->flags);
280
281	if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
282		exit_to_usermode_loop(regs, cached_flags);
283
284#ifdef CONFIG_COMPAT
285	/*
286	 * Compat syscalls set TS_COMPAT.  Make sure we clear it before
287	 * returning to user mode.  We need to clear it *after* signal
288	 * handling, because syscall restart has a fixup for compat
289	 * syscalls.  The fixup is exercised by the ptrace_syscall_32
290	 * selftest.
291	 */
292	ti->status &= ~TS_COMPAT;
293#endif
294
295	user_enter();
296}
297
/*
 * Thread flags that make syscall_return_slowpath() call
 * syscall_slow_exit_work() before the final exit-to-usermode preparation.
 */
#define SYSCALL_EXIT_WORK_FLAGS				\
	(_TIF_SYSCALL_AUDIT | _TIF_SYSCALL_TRACEPOINT |	\
	 _TIF_SYSCALL_TRACE | _TIF_SINGLESTEP)
301
302static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags)
303{
304	bool step;
305
306	audit_syscall_exit(regs);
307
308	if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
309		trace_sys_exit(regs, regs->ax);
310
311	/*
312	 * If TIF_SYSCALL_EMU is set, we only get here because of
313	 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
314	 * We already reported this syscall instruction in
315	 * syscall_trace_enter().
316	 */
317	step = unlikely(
318		(cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU))
319		== _TIF_SINGLESTEP);
320	if (step || cached_flags & _TIF_SYSCALL_TRACE)
321		tracehook_report_syscall_exit(regs, step);
322}
323
324/*
325 * Called with IRQs on and fully valid regs.  Returns with IRQs off in a
326 * state such that we can immediately switch to user mode.
327 */
328__visible inline void syscall_return_slowpath(struct pt_regs *regs)
329{
330	struct thread_info *ti = pt_regs_to_thread_info(regs);
331	u32 cached_flags = READ_ONCE(ti->flags);
332
333	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
334
335	if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
336	    WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax))
337		local_irq_enable();
338
339	/*
340	 * First do one-time work.  If these work items are enabled, we
341	 * want to run them exactly once per syscall exit with IRQs on.
342	 */
343	if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS))
344		syscall_slow_exit_work(regs, cached_flags);
345
346	local_irq_disable();
347	prepare_exit_to_usermode(regs);
348}
349
350#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
351/*
352 * Does a 32-bit syscall.  Called with IRQs on and does all entry and
353 * exit work and returns with IRQs off.  This function is extremely hot
354 * in workloads that use it, and it's usually called from
355 * do_fast_syscall_32, so forcibly inline it to improve performance.
356 */
357#ifdef CONFIG_X86_32
358/* 32-bit kernels use a trap gate for INT80, and the asm code calls here. */
359__visible
360#else
361/* 64-bit kernels use do_syscall_32_irqs_off() instead. */
362static
363#endif
364__always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
365{
366	struct thread_info *ti = pt_regs_to_thread_info(regs);
367	unsigned int nr = (unsigned int)regs->orig_ax;
368
369#ifdef CONFIG_IA32_EMULATION
370	ti->status |= TS_COMPAT;
371#endif
372
373	if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
374		/*
375		 * Subtlety here: if ptrace pokes something larger than
376		 * 2^32-1 into orig_ax, this truncates it.  This may or
377		 * may not be necessary, but it matches the old asm
378		 * behavior.
379		 */
380		nr = syscall_trace_enter(regs);
381	}
382
383	if (likely(nr < IA32_NR_syscalls)) {
384		/*
385		 * It's possible that a 32-bit syscall implementation
386		 * takes a 64-bit parameter but nonetheless assumes that
387		 * the high bits are zero.  Make sure we zero-extend all
388		 * of the args.
389		 */
390		regs->ax = ia32_sys_call_table[nr](
391			(unsigned int)regs->bx, (unsigned int)regs->cx,
392			(unsigned int)regs->dx, (unsigned int)regs->si,
393			(unsigned int)regs->di, (unsigned int)regs->bp);
394	}
395
396	syscall_return_slowpath(regs);
397}
398
#ifdef CONFIG_X86_64
/*
 * INT80 handler for 64-bit kernels.  Entered with IRQs off: enable them
 * and hand over to the common 32-bit syscall path, which returns with
 * IRQs off again.
 */
__visible void do_syscall_32_irqs_off(struct pt_regs *regs)
{
	local_irq_enable();

	do_syscall_32_irqs_on(regs);
}
#endif
407
408/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
409__visible long do_fast_syscall_32(struct pt_regs *regs)
410{
411	/*
412	 * Called using the internal vDSO SYSENTER/SYSCALL32 calling
413	 * convention.  Adjust regs so it looks like we entered using int80.
414	 */
415
416	unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
417		vdso_image_32.sym_int80_landing_pad;
418
419	/*
420	 * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
421	 * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
422	 * Fix it up.
423	 */
424	regs->ip = landing_pad;
425
426	/*
427	 * Fetch EBP from where the vDSO stashed it.
428	 *
429	 * WARNING: We are in CONTEXT_USER and RCU isn't paying attention!
430	 */
431	local_irq_enable();
432	if (
433#ifdef CONFIG_X86_64
434		/*
435		 * Micro-optimization: the pointer we're following is explicitly
436		 * 32 bits, so it can't be out of range.
437		 */
438		__get_user(*(u32 *)&regs->bp,
439			    (u32 __user __force *)(unsigned long)(u32)regs->sp)
440#else
441		get_user(*(u32 *)&regs->bp,
442			 (u32 __user __force *)(unsigned long)(u32)regs->sp)
443#endif
444		) {
445
446		/* User code screwed up. */
447		local_irq_disable();
448		regs->ax = -EFAULT;
449#ifdef CONFIG_CONTEXT_TRACKING
450		enter_from_user_mode();
451#endif
452		prepare_exit_to_usermode(regs);
453		return 0;	/* Keep it simple: use IRET. */
454	}
455
456	/* Now this is just like a normal syscall. */
457	do_syscall_32_irqs_on(regs);
458
459#ifdef CONFIG_X86_64
460	/*
461	 * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
462	 * SYSRETL is available on all 64-bit CPUs, so we don't need to
463	 * bother with SYSEXIT.
464	 *
465	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
466	 * because the ECX fixup above will ensure that this is essentially
467	 * never the case.
468	 */
469	return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
470		regs->ip == landing_pad &&
471		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
472#else
473	/*
474	 * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
475	 *
476	 * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
477	 * because the ECX fixup above will ensure that this is essentially
478	 * never the case.
479	 *
480	 * We don't allow syscalls at all from VM86 mode, but we still
481	 * need to check VM, because we might be returning from sys_vm86.
482	 */
483	return static_cpu_has(X86_FEATURE_SEP) &&
484		regs->cs == __USER_CS && regs->ss == __USER_DS &&
485		regs->ip == landing_pad &&
486		(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
487#endif
488}
489#endif
490