#include <linux/perf_event.h>
#include <linux/types.h>

#include <asm/perf_event.h>
#include <asm/msr.h>
#include <asm/insn.h>

#include "perf_event.h"

enum {
	LBR_FORMAT_32		= 0x00,
	LBR_FORMAT_LIP		= 0x01,
	LBR_FORMAT_EIP		= 0x02,
	LBR_FORMAT_EIP_FLAGS	= 0x03,
	LBR_FORMAT_EIP_FLAGS2	= 0x04,
	LBR_FORMAT_MAX_KNOWN    = LBR_FORMAT_EIP_FLAGS2,
};

static enum {
	LBR_EIP_FLAGS		= 1,
	LBR_TSX			= 2,
} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = {
	[LBR_FORMAT_EIP_FLAGS]  = LBR_EIP_FLAGS,
	[LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX,
};

/*
 * Intel LBR_SELECT bits
 * Intel Vol3a, April 2011, Section 16.7 Table 16-10
 *
 * Hardware branch filter (not available on all CPUs)
 */
#define LBR_KERNEL_BIT		0 /* do not capture at ring0 */
#define LBR_USER_BIT		1 /* do not capture at ring > 0 */
#define LBR_JCC_BIT		2 /* do not capture conditional branches */
#define LBR_REL_CALL_BIT	3 /* do not capture relative calls */
#define LBR_IND_CALL_BIT	4 /* do not capture indirect calls */
#define LBR_RETURN_BIT		5 /* do not capture near returns */
#define LBR_IND_JMP_BIT		6 /* do not capture indirect jumps */
#define LBR_REL_JMP_BIT		7 /* do not capture relative jumps */
#define LBR_FAR_BIT		8 /* do not capture far branches */
#define LBR_CALL_STACK_BIT	9 /* enable call stack */

#define LBR_KERNEL	(1 << LBR_KERNEL_BIT)
#define LBR_USER	(1 << LBR_USER_BIT)
#define LBR_JCC		(1 << LBR_JCC_BIT)
#define LBR_REL_CALL	(1 << LBR_REL_CALL_BIT)
#define LBR_IND_CALL	(1 << LBR_IND_CALL_BIT)
#define LBR_RETURN	(1 << LBR_RETURN_BIT)
#define LBR_REL_JMP	(1 << LBR_REL_JMP_BIT)
#define LBR_IND_JMP	(1 << LBR_IND_JMP_BIT)
#define LBR_FAR		(1 << LBR_FAR_BIT)
#define LBR_CALL_STACK	(1 << LBR_CALL_STACK_BIT)

#define LBR_PLM (LBR_KERNEL | LBR_USER)

#define LBR_SEL_MASK	0x1ff	/* valid bits in LBR_SELECT */
#define LBR_NOT_SUPP	-1	/* LBR filter not supported */
#define LBR_IGN		0	/* ignored */

#define LBR_ANY		 \
	(LBR_JCC	|\
	 LBR_REL_CALL	|\
	 LBR_IND_CALL	|\
	 LBR_RETURN	|\
	 LBR_REL_JMP	|\
	 LBR_IND_JMP	|\
	 LBR_FAR)

#define LBR_FROM_FLAG_MISPRED  (1ULL << 63)
#define LBR_FROM_FLAG_IN_TX    (1ULL << 62)
#define LBR_FROM_FLAG_ABORT    (1ULL << 61)
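/*
 * Illustration (values are made up for the example): on a CPU whose
 * lbr_format carries EIP flags plus TSX info (LBR_FORMAT_EIP_FLAGS2),
 * a raw LBR_FROM MSR value packs the flags above into bits 63..61 and
 * the branch source address into the remaining bits, e.g.:
 *
 *	u64 from = 0x9fffffff81234567ULL;
 *	bool mispred = !!(from & LBR_FROM_FLAG_MISPRED);	// 1
 *	bool in_tx   = !!(from & LBR_FROM_FLAG_IN_TX);		// 0
 *	bool abort   = !!(from & LBR_FROM_FLAG_ABORT);		// 0
 *
 * intel_pmu_lbr_read_64() below strips those flag bits and restores the
 * canonical source address.
 */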

/*
 * x86 control flow change classification
 * x86 control flow changes include branches, interrupts, traps, faults
 */
enum {
	X86_BR_NONE		= 0,      /* unknown */

	X86_BR_USER		= 1 << 0, /* branch target is user */
	X86_BR_KERNEL		= 1 << 1, /* branch target is kernel */

	X86_BR_CALL		= 1 << 2, /* call */
	X86_BR_RET		= 1 << 3, /* return */
	X86_BR_SYSCALL		= 1 << 4, /* syscall */
	X86_BR_SYSRET		= 1 << 5, /* syscall return */
	X86_BR_INT		= 1 << 6, /* sw interrupt */
	X86_BR_IRET		= 1 << 7, /* return from interrupt */
	X86_BR_JCC		= 1 << 8, /* conditional */
	X86_BR_JMP		= 1 << 9, /* jump */
	X86_BR_IRQ		= 1 << 10,/* hw interrupt or trap or fault */
	X86_BR_IND_CALL		= 1 << 11,/* indirect calls */
	X86_BR_ABORT		= 1 << 12,/* transaction abort */
	X86_BR_IN_TX		= 1 << 13,/* in transaction */
	X86_BR_NO_TX		= 1 << 14,/* not in transaction */
	X86_BR_ZERO_CALL	= 1 << 15,/* zero length call */
	X86_BR_CALL_STACK	= 1 << 16,/* call stack */
};

#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX)

#define X86_BR_ANY       \
	(X86_BR_CALL    |\
	 X86_BR_RET     |\
	 X86_BR_SYSCALL |\
	 X86_BR_SYSRET  |\
	 X86_BR_INT     |\
	 X86_BR_IRET    |\
	 X86_BR_JCC     |\
	 X86_BR_JMP	 |\
	 X86_BR_IRQ	 |\
	 X86_BR_ABORT	 |\
	 X86_BR_IND_CALL |\
	 X86_BR_ZERO_CALL)

#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)

#define X86_BR_ANY_CALL		 \
	(X86_BR_CALL		|\
	 X86_BR_IND_CALL	|\
	 X86_BR_ZERO_CALL	|\
	 X86_BR_SYSCALL		|\
	 X86_BR_IRQ		|\
	 X86_BR_INT)
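
/*
 * Illustration of how these masks are used by the software filter further
 * down (intel_pmu_lbr_filter()); the values are just an example:
 *
 *   asking for PERF_SAMPLE_BRANCH_USER | PERF_SAMPLE_BRANCH_ANY_CALL yields
 *   br_sel = X86_BR_USER | X86_BR_ANY_CALL.  An LBR entry decoding to
 *   X86_BR_JCC | X86_BR_USER then fails the (br_sel & type) != type check
 *   and is discarded, while an entry decoding to X86_BR_CALL | X86_BR_USER
 *   is kept.
 */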

static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);

/*
 * We only support LBR implementations that have FREEZE_LBRS_ON_PMI,
 * otherwise it becomes near impossible to get a reliable stack.
 */

static void __intel_pmu_lbr_enable(bool pmi)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	u64 debugctl, lbr_select = 0, orig_debugctl;

	/*
	 * No need to reprogram LBR_SELECT in a PMI, as it
	 * did not change.
	 */
	if (cpuc->lbr_sel && !pmi) {
		lbr_select = cpuc->lbr_sel->config;
		wrmsrl(MSR_LBR_SELECT, lbr_select);
	}

	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
	orig_debugctl = debugctl;
	debugctl |= DEBUGCTLMSR_LBR;
	/*
	 * LBR callstack does not work well with FREEZE_LBRS_ON_PMI.
	 * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions
	 * may cause superfluous increase/decrease of LBR_TOS.
	 */
	if (!(lbr_select & LBR_CALL_STACK))
		debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
	if (orig_debugctl != debugctl)
		wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
}
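
/*
 * Net effect, for illustration only: in the normal case DEBUGCTL ends up
 * with DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI set; when the
 * call-stack mode bit (LBR_CALL_STACK) is selected, only DEBUGCTLMSR_LBR
 * is set, and the MSR write is skipped entirely if nothing changed.
 */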

static void __intel_pmu_lbr_disable(void)
{
	u64 debugctl;

	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
	debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
}

static void intel_pmu_lbr_reset_32(void)
{
	int i;

	for (i = 0; i < x86_pmu.lbr_nr; i++)
		wrmsrl(x86_pmu.lbr_from + i, 0);
}

static void intel_pmu_lbr_reset_64(void)
{
	int i;

	for (i = 0; i < x86_pmu.lbr_nr; i++) {
		wrmsrl(x86_pmu.lbr_from + i, 0);
		wrmsrl(x86_pmu.lbr_to   + i, 0);
	}
}

void intel_pmu_lbr_reset(void)
{
	if (!x86_pmu.lbr_nr)
		return;

	if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
		intel_pmu_lbr_reset_32();
	else
		intel_pmu_lbr_reset_64();
}

/*
 * TOS = most recently recorded branch
 */
static inline u64 intel_pmu_lbr_tos(void)
{
	u64 tos;

	rdmsrl(x86_pmu.lbr_tos, tos);
	return tos;
}
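
/*
 * The LBR stack is a ring buffer of x86_pmu.lbr_nr entries (a power of two),
 * indexed relative to TOS.  A sketch of the walk used by the save/restore
 * and read paths below, with illustrative numbers:
 *
 *	unsigned mask = x86_pmu.lbr_nr - 1;	// e.g. 16 entries -> 0xf
 *	u64 tos = intel_pmu_lbr_tos();		// e.g. 3
 *	// i = 0..15 visits slots 3, 2, 1, 0, 15, 14, ... (newest first)
 *	unsigned lbr_idx = (tos - i) & mask;
 */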

enum {
	LBR_NONE,
	LBR_VALID,
};

static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
{
	int i;
	unsigned lbr_idx, mask;
	u64 tos;

	if (task_ctx->lbr_callstack_users == 0 ||
	    task_ctx->lbr_stack_state == LBR_NONE) {
		intel_pmu_lbr_reset();
		return;
	}

	mask = x86_pmu.lbr_nr - 1;
	tos = intel_pmu_lbr_tos();
	for (i = 0; i < x86_pmu.lbr_nr; i++) {
		lbr_idx = (tos - i) & mask;
		wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
		wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
	}
	task_ctx->lbr_stack_state = LBR_NONE;
}

static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
{
	int i;
	unsigned lbr_idx, mask;
	u64 tos;

	if (task_ctx->lbr_callstack_users == 0) {
		task_ctx->lbr_stack_state = LBR_NONE;
		return;
	}

	mask = x86_pmu.lbr_nr - 1;
	tos = intel_pmu_lbr_tos();
	for (i = 0; i < x86_pmu.lbr_nr; i++) {
		lbr_idx = (tos - i) & mask;
		rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
		rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
	}
	task_ctx->lbr_stack_state = LBR_VALID;
}

void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct x86_perf_task_context *task_ctx;

	if (!x86_pmu.lbr_nr)
		return;

	/*
	 * If the LBR callstack feature is enabled and the stack was saved
	 * when the task was scheduled out, restore the stack. Otherwise
	 * flush the LBR stack.
	 */
	task_ctx = ctx ? ctx->task_ctx_data : NULL;
	if (task_ctx) {
		if (sched_in) {
			__intel_pmu_lbr_restore(task_ctx);
			cpuc->lbr_context = ctx;
		} else {
			__intel_pmu_lbr_save(task_ctx);
		}
		return;
	}

	/*
	 * When sampling the branch stack in system-wide mode, it may be
	 * necessary to flush the stack on context switch. This happens
	 * when the branch stack does not tag its entries with the pid
	 * of the current task. Otherwise it becomes impossible to
	 * associate a branch entry with a task. This ambiguity is more
	 * likely to appear when the branch stack supports priv level
	 * filtering and the user sets it to monitor only at the user
	 * level (which could be a useful measurement in system-wide
	 * mode). In that case, the risk is high of having a branch
	 * stack with branches from multiple tasks.
	 */
	if (sched_in) {
		intel_pmu_lbr_reset();
		cpuc->lbr_context = ctx;
	}
}

static inline bool branch_user_callstack(unsigned br_sel)
{
	return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK);
}

void intel_pmu_lbr_enable(struct perf_event *event)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct x86_perf_task_context *task_ctx;

	if (!x86_pmu.lbr_nr)
		return;

	/*
	 * Reset the LBR stack if we changed task context to
	 * avoid data leaks.
	 */
	if (event->ctx->task && cpuc->lbr_context != event->ctx) {
		intel_pmu_lbr_reset();
		cpuc->lbr_context = event->ctx;
	}
	cpuc->br_sel = event->hw.branch_reg.reg;

	if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
					event->ctx->task_ctx_data) {
		task_ctx = event->ctx->task_ctx_data;
		task_ctx->lbr_callstack_users++;
	}

	cpuc->lbr_users++;
	perf_sched_cb_inc(event->ctx->pmu);
}

void intel_pmu_lbr_disable(struct perf_event *event)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct x86_perf_task_context *task_ctx;

	if (!x86_pmu.lbr_nr)
		return;

	if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
					event->ctx->task_ctx_data) {
		task_ctx = event->ctx->task_ctx_data;
		task_ctx->lbr_callstack_users--;
	}

	cpuc->lbr_users--;
	WARN_ON_ONCE(cpuc->lbr_users < 0);
	perf_sched_cb_dec(event->ctx->pmu);

	if (cpuc->enabled && !cpuc->lbr_users) {
		__intel_pmu_lbr_disable();
		/* avoid stale pointer */
		cpuc->lbr_context = NULL;
	}
}

void intel_pmu_lbr_enable_all(bool pmi)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

	if (cpuc->lbr_users)
		__intel_pmu_lbr_enable(pmi);
}

void intel_pmu_lbr_disable_all(void)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

	if (cpuc->lbr_users)
		__intel_pmu_lbr_disable();
}

static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
{
	unsigned long mask = x86_pmu.lbr_nr - 1;
	u64 tos = intel_pmu_lbr_tos();
	int i;

	for (i = 0; i < x86_pmu.lbr_nr; i++) {
		unsigned long lbr_idx = (tos - i) & mask;
		union {
			struct {
				u32 from;
				u32 to;
			};
			u64     lbr;
		} msr_lastbranch;

		rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);

		cpuc->lbr_entries[i].from	= msr_lastbranch.from;
		cpuc->lbr_entries[i].to		= msr_lastbranch.to;
		cpuc->lbr_entries[i].mispred	= 0;
		cpuc->lbr_entries[i].predicted	= 0;
		cpuc->lbr_entries[i].reserved	= 0;
	}
	cpuc->lbr_stack.nr = i;
}

/*
 * Due to lack of segmentation in Linux the effective address (offset)
 * is the same as the linear address, allowing us to merge the LIP and EIP
 * LBR formats.
 */
static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
{
	unsigned long mask = x86_pmu.lbr_nr - 1;
	int lbr_format = x86_pmu.intel_cap.lbr_format;
	u64 tos = intel_pmu_lbr_tos();
	int i;
	int out = 0;

	for (i = 0; i < x86_pmu.lbr_nr; i++) {
		unsigned long lbr_idx = (tos - i) & mask;
		u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0;
		int skip = 0;
		int lbr_flags = lbr_desc[lbr_format];

		rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
		rdmsrl(x86_pmu.lbr_to   + lbr_idx, to);

		if (lbr_flags & LBR_EIP_FLAGS) {
			mis = !!(from & LBR_FROM_FLAG_MISPRED);
			pred = !mis;
			skip = 1;
		}
		if (lbr_flags & LBR_TSX) {
			in_tx = !!(from & LBR_FROM_FLAG_IN_TX);
			abort = !!(from & LBR_FROM_FLAG_ABORT);
			skip = 3;
		}
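		/*
		 * Sketch of the sign-extension below, with an illustrative
		 * value: with skip == 3, from = 0x9fffffff81234567 (flag
		 * bits 63..61 = mispredicted, not in TX, no abort) becomes
		 * 0xffffffff81234567 after the shift pair, i.e. the flag
		 * bits are dropped and bit 60 is sign-extended to restore
		 * the canonical kernel address.
		 */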
		from = (u64)((((s64)from) << skip) >> skip);

		/*
		 * Some CPUs report duplicated abort records,
		 * with the second entry not having an abort bit set.
		 * Skip them here. This loop runs backwards,
		 * so we need to undo the previous record.
		 * If the abort just happened outside the window
		 * the extra entry cannot be removed.
		 */
		if (abort && x86_pmu.lbr_double_abort && out > 0)
			out--;

		cpuc->lbr_entries[out].from	 = from;
		cpuc->lbr_entries[out].to	 = to;
		cpuc->lbr_entries[out].mispred	 = mis;
		cpuc->lbr_entries[out].predicted = pred;
		cpuc->lbr_entries[out].in_tx	 = in_tx;
		cpuc->lbr_entries[out].abort	 = abort;
		cpuc->lbr_entries[out].reserved	 = 0;
		out++;
	}
	cpuc->lbr_stack.nr = out;
}

void intel_pmu_lbr_read(void)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

	if (!cpuc->lbr_users)
		return;

	if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
		intel_pmu_lbr_read_32(cpuc);
	else
		intel_pmu_lbr_read_64(cpuc);

	intel_pmu_lbr_filter(cpuc);
}

/*
 * SW filter is used:
 * - in case there is no HW filter
 * - in case the HW filter has errata or limitations
 */
static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
{
	u64 br_type = event->attr.branch_sample_type;
	int mask = 0;

	if (br_type & PERF_SAMPLE_BRANCH_USER)
		mask |= X86_BR_USER;

	if (br_type & PERF_SAMPLE_BRANCH_KERNEL)
		mask |= X86_BR_KERNEL;

	/* we ignore BRANCH_HV here */

	if (br_type & PERF_SAMPLE_BRANCH_ANY)
		mask |= X86_BR_ANY;

	if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL)
		mask |= X86_BR_ANY_CALL;

	if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
		mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET;

	if (br_type & PERF_SAMPLE_BRANCH_IND_CALL)
		mask |= X86_BR_IND_CALL;

	if (br_type & PERF_SAMPLE_BRANCH_ABORT_TX)
		mask |= X86_BR_ABORT;

	if (br_type & PERF_SAMPLE_BRANCH_IN_TX)
		mask |= X86_BR_IN_TX;

	if (br_type & PERF_SAMPLE_BRANCH_NO_TX)
		mask |= X86_BR_NO_TX;

	if (br_type & PERF_SAMPLE_BRANCH_COND)
		mask |= X86_BR_JCC;

	if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) {
		if (!x86_pmu_has_lbr_callstack())
			return -EOPNOTSUPP;
		if (mask & ~(X86_BR_USER | X86_BR_KERNEL))
			return -EINVAL;
		mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET |
			X86_BR_CALL_STACK;
	}

	/*
	 * stash the actual user request into reg; it may
	 * be used by fixup code for some CPU
	 */
	event->hw.branch_reg.reg = mask;
	return 0;
}
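
/*
 * Worked example (illustrative only): branch_sample_type =
 * PERF_SAMPLE_BRANCH_USER | PERF_SAMPLE_BRANCH_ANY_RETURN yields
 * mask = X86_BR_USER | X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET.
 * Requesting PERF_SAMPLE_BRANCH_CALL_STACK together with anything other
 * than the USER/KERNEL priv bits is rejected with -EINVAL above.
 */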

/*
 * Set up the HW LBR filter.
 * Used only when available; it may not be enough to disambiguate
 * all branches, so it may need the help of the SW filter.
 */
static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
{
	struct hw_perf_event_extra *reg;
	u64 br_type = event->attr.branch_sample_type;
	u64 mask = 0, v;
	int i;

	for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) {
		if (!(br_type & (1ULL << i)))
			continue;

		v = x86_pmu.lbr_sel_map[i];
		if (v == LBR_NOT_SUPP)
			return -EOPNOTSUPP;

		if (v != LBR_IGN)
			mask |= v;
	}
	reg = &event->hw.branch_reg;
	reg->idx = EXTRA_REG_LBR;

	/*
	 * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate
	 * in suppress mode. So LBR_SELECT should be set to
	 * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK)
	 */
	reg->config = mask ^ x86_pmu.lbr_sel_mask;

	return 0;
}
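
/*
 * Illustration with made-up numbers: with lbr_sel_mask = LBR_SEL_MASK
 * (0x1ff), a request for PERF_SAMPLE_BRANCH_USER |
 * PERF_SAMPLE_BRANCH_ANY_RETURN maps (via snb_lbr_sel_map below) to
 * mask = LBR_USER | LBR_RETURN | LBR_FAR = 0x122, so
 * reg->config = 0x122 ^ 0x1ff = 0x0dd, i.e. every suppress bit except
 * "user", "near return" and "far branch" is set in LBR_SELECT.
 */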

int intel_pmu_setup_lbr_filter(struct perf_event *event)
{
	int ret = 0;

	/*
	 * no LBR on this PMU
	 */
	if (!x86_pmu.lbr_nr)
		return -EOPNOTSUPP;

	/*
	 * setup SW LBR filter
	 */
	ret = intel_pmu_setup_sw_lbr_filter(event);
	if (ret)
		return ret;

	/*
	 * setup HW LBR filter, if any
	 */
	if (x86_pmu.lbr_sel_map)
		ret = intel_pmu_setup_hw_lbr_filter(event);

	return ret;
}

/*
 * return the type of control flow change at address "from";
 * the instruction is not necessarily a branch (in case of interrupt).
 *
 * The branch type returned also includes the priv level of the
 * target of the control flow change (X86_BR_USER, X86_BR_KERNEL).
 *
 * If a branch type is unknown OR the instruction cannot be
 * decoded (e.g., text page not present), then X86_BR_NONE is
 * returned.
 */
static int branch_type(unsigned long from, unsigned long to, int abort)
{
	struct insn insn;
	void *addr;
	int bytes_read, bytes_left;
	int ret = X86_BR_NONE;
	int ext, to_plm, from_plm;
	u8 buf[MAX_INSN_SIZE];
	int is64 = 0;

	to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
	from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER;

	/*
	 * may be zero if the LBR did not fill up after a reset by the time
	 * we get a PMU interrupt
	 */
	if (from == 0 || to == 0)
		return X86_BR_NONE;

	if (abort)
		return X86_BR_ABORT | to_plm;

	if (from_plm == X86_BR_USER) {
		/*
		 * can happen if measuring at the user level only
		 * and we interrupt in a kernel thread, e.g., idle.
		 */
		if (!current->mm)
			return X86_BR_NONE;

		/* may fail if text not present */
		bytes_left = copy_from_user_nmi(buf, (void __user *)from,
						MAX_INSN_SIZE);
		bytes_read = MAX_INSN_SIZE - bytes_left;
		if (!bytes_read)
			return X86_BR_NONE;

		addr = buf;
	} else {
		/*
		 * The LBR logs any address in the IP, even if the IP just
		 * faulted. This means userspace can control the from address.
		 * Ensure we don't blindly read any address by validating it is
		 * a known text address.
		 */
		if (kernel_text_address(from)) {
			addr = (void *)from;
			/*
			 * Assume we can get the maximum possible size
			 * when grabbing kernel data.  This is not
			 * _strictly_ true since we could possibly be
			 * executing up next to a memory hole, but
			 * it is very unlikely to be a problem.
			 */
			bytes_read = MAX_INSN_SIZE;
		} else {
			return X86_BR_NONE;
		}
	}

	/*
	 * the decoder needs to know the ABI, especially
	 * on 64-bit systems running 32-bit apps
	 */
#ifdef CONFIG_X86_64
	is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32);
#endif
	insn_init(&insn, addr, bytes_read, is64);
	insn_get_opcode(&insn);
	if (!insn.opcode.got)
		return X86_BR_ABORT;

	switch (insn.opcode.bytes[0]) {
	case 0xf:
		switch (insn.opcode.bytes[1]) {
		case 0x05: /* syscall */
		case 0x34: /* sysenter */
			ret = X86_BR_SYSCALL;
			break;
		case 0x07: /* sysret */
		case 0x35: /* sysexit */
			ret = X86_BR_SYSRET;
			break;
		case 0x80 ... 0x8f: /* conditional */
			ret = X86_BR_JCC;
			break;
		default:
			ret = X86_BR_NONE;
		}
		break;
	case 0x70 ... 0x7f: /* conditional */
		ret = X86_BR_JCC;
		break;
	case 0xc2: /* near ret */
	case 0xc3: /* near ret */
	case 0xca: /* far ret */
	case 0xcb: /* far ret */
		ret = X86_BR_RET;
		break;
	case 0xcf: /* iret */
		ret = X86_BR_IRET;
		break;
	case 0xcc ... 0xce: /* int */
		ret = X86_BR_INT;
		break;
	case 0xe8: /* call near rel */
		insn_get_immediate(&insn);
		if (insn.immediate1.value == 0) {
			/* zero length call */
			ret = X86_BR_ZERO_CALL;
			break;
		}
		/* fall through: non-zero relative call */
	case 0x9a: /* call far absolute */
		ret = X86_BR_CALL;
		break;
	case 0xe0 ... 0xe3: /* loop jmp */
		ret = X86_BR_JCC;
		break;
	case 0xe9 ... 0xeb: /* jmp */
		ret = X86_BR_JMP;
		break;
	case 0xff: /* call near absolute, call far absolute ind */
		insn_get_modrm(&insn);
		ext = (insn.modrm.bytes[0] >> 3) & 0x7;
		switch (ext) {
		case 2: /* near ind call */
		case 3: /* far ind call */
			ret = X86_BR_IND_CALL;
			break;
		case 4:
		case 5:
			ret = X86_BR_JMP;
			break;
		}
		break;
	default:
		ret = X86_BR_NONE;
	}
	/*
	 * interrupts, traps, faults (and thus ring transitions) may
	 * occur on any instruction. Thus, to classify them correctly,
	 * we need to first look at the from and to priv levels. If they
	 * are different and to is in the kernel, then it indicates
	 * a ring transition. If the from instruction is not a ring
	 * transition instruction (syscall, sysenter, int), then it means
	 * it was an irq, trap or fault.
	 *
	 * we have no way of detecting kernel to kernel faults.
	 */
	if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL
	    && ret != X86_BR_SYSCALL && ret != X86_BR_INT)
		ret = X86_BR_IRQ;

	/*
	 * branch priv level determined by target as
	 * is done by HW when LBR_SELECT is implemented
	 */
	if (ret != X86_BR_NONE)
		ret |= to_plm;

	return ret;
}
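
/*
 * Example of the reclassification above (addresses are illustrative): a
 * user-space conditional jump at 'from' decodes to X86_BR_JCC, but if the
 * recorded 'to' is a kernel address and the opcode was not a syscall/int,
 * the entry is reported as X86_BR_IRQ | X86_BR_KERNEL, i.e. an interrupt,
 * trap or fault that happened to hit that instruction.
 */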

/*
 * implement actual branch filter based on user demand.
 * Hardware may not exactly satisfy that request, thus
 * we need to inspect opcodes. Mismatched branches are
 * discarded. Therefore, the number of branches returned
 * in PERF_SAMPLE_BRANCH_STACK sample may vary.
 */
static void
intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
{
	u64 from, to;
	int br_sel = cpuc->br_sel;
	int i, j, type;
	bool compress = false;

	/* if sampling all branches, then nothing to filter */
	if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
		return;

	for (i = 0; i < cpuc->lbr_stack.nr; i++) {

		from = cpuc->lbr_entries[i].from;
		to = cpuc->lbr_entries[i].to;

		type = branch_type(from, to, cpuc->lbr_entries[i].abort);
		if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) {
			if (cpuc->lbr_entries[i].in_tx)
				type |= X86_BR_IN_TX;
			else
				type |= X86_BR_NO_TX;
		}

		/* if type does not correspond, then discard */
		if (type == X86_BR_NONE || (br_sel & type) != type) {
			cpuc->lbr_entries[i].from = 0;
			compress = true;
		}
	}

	if (!compress)
		return;

	/* remove all entries with from=0 */
	for (i = 0; i < cpuc->lbr_stack.nr; ) {
		if (!cpuc->lbr_entries[i].from) {
			j = i;
			while (++j < cpuc->lbr_stack.nr)
				cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
			cpuc->lbr_stack.nr--;
			if (!cpuc->lbr_entries[i].from)
				continue;
		}
		i++;
	}
}
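
/*
 * Compression sketch (illustrative): if the stack held [A, B, C] and B was
 * rejected (its from set to 0), the loop above shifts C into B's slot,
 * leaving [A, C] with lbr_stack.nr reduced from 3 to 2.
 */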

/*
 * Map interface branch filters onto LBR filters
 */
static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
	[PERF_SAMPLE_BRANCH_ANY_SHIFT]		= LBR_ANY,
	[PERF_SAMPLE_BRANCH_USER_SHIFT]		= LBR_USER,
	[PERF_SAMPLE_BRANCH_KERNEL_SHIFT]	= LBR_KERNEL,
	[PERF_SAMPLE_BRANCH_HV_SHIFT]		= LBR_IGN,
	[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]	= LBR_RETURN | LBR_REL_JMP
						| LBR_IND_JMP | LBR_FAR,
	/*
	 * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
	 */
	[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] =
	 LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
	/*
	 * NHM/WSM erratum: must include IND_JMP to capture IND_CALL
	 */
	[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP,
	[PERF_SAMPLE_BRANCH_COND_SHIFT]     = LBR_JCC,
};
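
/*
 * Practical effect of the NHM/WSM workaround above: requesting indirect
 * calls also programs indirect jumps into the hardware filter; the
 * software filter in intel_pmu_lbr_filter() then drops the extra
 * indirect-jump entries (see the SW branch filter note in
 * intel_pmu_lbr_init_nhm()).
 */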

static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
	[PERF_SAMPLE_BRANCH_ANY_SHIFT]		= LBR_ANY,
	[PERF_SAMPLE_BRANCH_USER_SHIFT]		= LBR_USER,
	[PERF_SAMPLE_BRANCH_KERNEL_SHIFT]	= LBR_KERNEL,
	[PERF_SAMPLE_BRANCH_HV_SHIFT]		= LBR_IGN,
	[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]	= LBR_RETURN | LBR_FAR,
	[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT]	= LBR_REL_CALL | LBR_IND_CALL
						| LBR_FAR,
	[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]	= LBR_IND_CALL,
	[PERF_SAMPLE_BRANCH_COND_SHIFT]		= LBR_JCC,
};

static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
	[PERF_SAMPLE_BRANCH_ANY_SHIFT]		= LBR_ANY,
	[PERF_SAMPLE_BRANCH_USER_SHIFT]		= LBR_USER,
	[PERF_SAMPLE_BRANCH_KERNEL_SHIFT]	= LBR_KERNEL,
	[PERF_SAMPLE_BRANCH_HV_SHIFT]		= LBR_IGN,
	[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]	= LBR_RETURN | LBR_FAR,
	[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT]	= LBR_REL_CALL | LBR_IND_CALL
						| LBR_FAR,
	[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]	= LBR_IND_CALL,
	[PERF_SAMPLE_BRANCH_COND_SHIFT]		= LBR_JCC,
	[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT]	= LBR_REL_CALL | LBR_IND_CALL
						| LBR_RETURN | LBR_CALL_STACK,
};

/* core */
void __init intel_pmu_lbr_init_core(void)
{
	x86_pmu.lbr_nr     = 4;
	x86_pmu.lbr_tos    = MSR_LBR_TOS;
	x86_pmu.lbr_from   = MSR_LBR_CORE_FROM;
	x86_pmu.lbr_to     = MSR_LBR_CORE_TO;

	/*
	 * SW branch filter usage:
	 * - compensate for lack of HW filter
	 */
	pr_cont("4-deep LBR, ");
}

/* nehalem/westmere */
void __init intel_pmu_lbr_init_nhm(void)
{
	x86_pmu.lbr_nr     = 16;
	x86_pmu.lbr_tos    = MSR_LBR_TOS;
	x86_pmu.lbr_from   = MSR_LBR_NHM_FROM;
	x86_pmu.lbr_to     = MSR_LBR_NHM_TO;

	x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
	x86_pmu.lbr_sel_map  = nhm_lbr_sel_map;

	/*
	 * SW branch filter usage:
	 * - workaround LBR_SEL errata (see above)
	 * - support syscall, sysret capture.
	 *   That requires LBR_FAR, but that means far
	 *   jmps need to be filtered out
	 */
	pr_cont("16-deep LBR, ");
}

/* sandy bridge */
void __init intel_pmu_lbr_init_snb(void)
{
	x86_pmu.lbr_nr	 = 16;
	x86_pmu.lbr_tos	 = MSR_LBR_TOS;
	x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
	x86_pmu.lbr_to   = MSR_LBR_NHM_TO;

	x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
	x86_pmu.lbr_sel_map  = snb_lbr_sel_map;

	/*
	 * SW branch filter usage:
	 * - support syscall, sysret capture.
	 *   That requires LBR_FAR, but that means far
	 *   jmps need to be filtered out
	 */
	pr_cont("16-deep LBR, ");
}

/* haswell */
void intel_pmu_lbr_init_hsw(void)
{
	x86_pmu.lbr_nr	 = 16;
	x86_pmu.lbr_tos	 = MSR_LBR_TOS;
	x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
	x86_pmu.lbr_to   = MSR_LBR_NHM_TO;

	x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
	x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;

	pr_cont("16-deep LBR, ");
}

/* atom */
void __init intel_pmu_lbr_init_atom(void)
{
	/*
	 * only models starting at stepping 10 seem
	 * to have an operational LBR which can freeze
	 * on PMU interrupt
	 */
	if (boot_cpu_data.x86_model == 28
	    && boot_cpu_data.x86_mask < 10) {
		pr_cont("LBR disabled due to erratum");
		return;
	}

	x86_pmu.lbr_nr	   = 8;
	x86_pmu.lbr_tos    = MSR_LBR_TOS;
	x86_pmu.lbr_from   = MSR_LBR_CORE_FROM;
	x86_pmu.lbr_to     = MSR_LBR_CORE_TO;

	/*
	 * SW branch filter usage:
	 * - compensate for lack of HW filter
	 */
	pr_cont("8-deep LBR, ");
}