/*
 * Intel(R) Processor Trace PMU driver for perf
 * Copyright (c) 2013-2014, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * Intel PT is specified in the Intel Architecture Instruction Set Extensions
 * Programming Reference:
 * http://software.intel.com/en-us/intel-isa-extensions
 */

#undef DEBUG

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/slab.h>
#include <linux/device.h>

#include <asm/perf_event.h>
#include <asm/insn.h>
#include <asm/io.h>

#include "perf_event.h"
#include "intel_pt.h"

static DEFINE_PER_CPU(struct pt, pt_ctx);

static struct pt_pmu pt_pmu;

enum cpuid_regs {
	CR_EAX = 0,
	CR_ECX,
	CR_EDX,
	CR_EBX
};

/*
 * Capabilities of Intel PT hardware, such as the number of address bits or
 * supported output schemes, are cached and exported to userspace as the "caps"
 * attribute group of the pt pmu device
 * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
 * the relevant bits together with intel_pt traces.
 *
 * These are needed both for trace decoding (payloads_lip, for example, carries
 * the address width encoded in IP-related packets) and for event configuration
 * (bitmasks with permitted values for certain bit fields).
 */
#define PT_CAP(_n, _l, _r, _m)						\
	[PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l,	\
			    .reg = _r, .mask = _m }

static struct pt_cap_desc {
	const char	*name;
	u32		leaf;
	u8		reg;
	u32		mask;
} pt_caps[] = {
	PT_CAP(max_subleaf,		0, CR_EAX, 0xffffffff),
	PT_CAP(cr3_filtering,		0, CR_EBX, BIT(0)),
	PT_CAP(topa_output,		0, CR_ECX, BIT(0)),
	PT_CAP(topa_multiple_entries,	0, CR_ECX, BIT(1)),
	PT_CAP(payloads_lip,		0, CR_ECX, BIT(31)),
};
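
/*
 * For example, pt_cap_get(PT_CAP_topa_output) below evaluates to
 * CPUID(0x14, 0).ECX[0], i.e. it is non-zero iff the CPU supports directing
 * trace output through ToPA tables.
 */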

static u32 pt_cap_get(enum pt_capabilities cap)
{
	struct pt_cap_desc *cd = &pt_caps[cap];
	u32 c = pt_pmu.caps[cd->leaf * 4 + cd->reg];
	unsigned int shift = __ffs(cd->mask);

	return (c & cd->mask) >> shift;
}

static ssize_t pt_cap_show(struct device *cdev,
			   struct device_attribute *attr,
			   char *buf)
{
	struct dev_ext_attribute *ea =
		container_of(attr, struct dev_ext_attribute, attr);
	enum pt_capabilities cap = (long)ea->var;

	return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap));
}

static struct attribute_group pt_cap_group = {
	.name	= "caps",
};

PMU_FORMAT_ATTR(tsc,		"config:10"	);
PMU_FORMAT_ATTR(noretcomp,	"config:11"	);

static struct attribute *pt_formats_attr[] = {
	&format_attr_tsc.attr,
	&format_attr_noretcomp.attr,
	NULL,
};

static struct attribute_group pt_format_group = {
	.name	= "format",
	.attrs	= pt_formats_attr,
};

static const struct attribute_group *pt_attr_groups[] = {
	&pt_cap_group,
	&pt_format_group,
	NULL,
};

static int __init pt_pmu_hw_init(void)
{
	struct dev_ext_attribute *de_attrs;
	struct attribute **attrs;
	size_t size;
	int ret;
	long i;

	attrs = NULL;
	ret = -ENODEV;
	if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT))
		goto fail;

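	/* CPUID leaf 20 (0x14) enumerates Intel PT capabilities */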
	for (i = 0; i < PT_CPUID_LEAVES; i++) {
		cpuid_count(20, i,
			    &pt_pmu.caps[CR_EAX + i*4],
			    &pt_pmu.caps[CR_EBX + i*4],
			    &pt_pmu.caps[CR_ECX + i*4],
			    &pt_pmu.caps[CR_EDX + i*4]);
	}

	ret = -ENOMEM;
	size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
	attrs = kzalloc(size, GFP_KERNEL);
	if (!attrs)
		goto fail;

	size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
	de_attrs = kzalloc(size, GFP_KERNEL);
	if (!de_attrs)
		goto fail;

	for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
		struct dev_ext_attribute *de_attr = de_attrs + i;

		de_attr->attr.attr.name = pt_caps[i].name;

		sysfs_attr_init(&de_attr->attr.attr);

		de_attr->attr.attr.mode		= S_IRUGO;
		de_attr->attr.show		= pt_cap_show;
		de_attr->var			= (void *)i;

		attrs[i] = &de_attr->attr.attr;
	}

	pt_cap_group.attrs = attrs;

	return 0;

fail:
	kfree(attrs);

	return ret;
}

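/*
 * Bits a user may set in attr.config. They correspond to the "tsc" and
 * "noretcomp" format attributes exported above (config:10 and config:11),
 * so, for instance, "perf record -e intel_pt/tsc=1,noretcomp=1//" would be
 * one way to request them from userspace.
 */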
#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC)

static bool pt_event_valid(struct perf_event *event)
{
	u64 config = event->attr.config;

	if ((config & PT_CONFIG_MASK) != config)
		return false;

	return true;
}

/*
 * PT configuration helpers
 * These are all CPU-affine and operate on a local PT
 */

static bool pt_is_running(void)
{
	u64 ctl;

	rdmsrl(MSR_IA32_RTIT_CTL, ctl);

	return !!(ctl & RTIT_CTL_TRACEEN);
}

static void pt_config(struct perf_event *event)
{
	u64 reg;

	reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;

	if (!event->attr.exclude_kernel)
		reg |= RTIT_CTL_OS;
	if (!event->attr.exclude_user)
		reg |= RTIT_CTL_USR;

	reg |= (event->attr.config & PT_CONFIG_MASK);

	wrmsrl(MSR_IA32_RTIT_CTL, reg);
}

static void pt_config_start(bool start)
{
	u64 ctl;

	rdmsrl(MSR_IA32_RTIT_CTL, ctl);
	if (start)
		ctl |= RTIT_CTL_TRACEEN;
	else
		ctl &= ~RTIT_CTL_TRACEEN;
	wrmsrl(MSR_IA32_RTIT_CTL, ctl);

	/*
	 * A wrmsr that disables trace generation serializes other PT
	 * registers and causes all data packets to be written to memory,
	 * but a fence is required for the data to become globally visible.
	 *
	 * The below WMB, separating the data store and the aux_head store,
	 * matches the consumer's RMB that separates the aux_head load and
	 * the data load.
	 */
	if (!start)
		wmb();
}

static void pt_config_buffer(void *buf, unsigned int topa_idx,
			     unsigned int output_off)
{
	u64 reg;

	wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf));

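	/*
	 * RTIT_OUTPUT_MASK_PTRS layout, as assumed here (see the Intel SDM):
	 * bits 6:0 are the low mask bits and stay all-ones for ToPA output,
	 * bits 31:7 index the current entry within the ToPA table, and
	 * bits 63:32 hold the byte offset within the current output region.
	 */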
	reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);

	wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
}

/*
 * Keep ToPA table-related metadata on the same page as the actual table,
 * taking up a few words from the top
 */

#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1)
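/*
 * The 40 bytes reserved above (plus the extra entry dropped by the trailing
 * "- 1") roughly cover the non-table fields of struct topa below; the
 * BUILD_BUG_ON() in pt_init() checks that the whole struct still fits in
 * one page.
 */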

/**
 * struct topa - page-sized ToPA table with metadata at the top
 * @table:	actual ToPA table entries, as understood by PT hardware
 * @list:	linkage to struct pt_buffer's list of tables
 * @phys:	physical address of this page
 * @offset:	offset of the first entry in this table in the buffer
 * @size:	total size of all entries in this table
 * @last:	index of the last initialized entry in this table
 */
struct topa {
	struct topa_entry	table[TENTS_PER_PAGE];
	struct list_head	list;
	u64			phys;
	u64			offset;
	size_t			size;
	int			last;
};

/* make -1 stand for the last table entry */
#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)])

/**
 * topa_alloc() - allocate page-sized ToPA table
 * @cpu:	CPU on which to allocate.
 * @gfp:	Allocation flags.
 *
 * Return:	On success, return the pointer to ToPA table page.
 */
static struct topa *topa_alloc(int cpu, gfp_t gfp)
{
	int node = cpu_to_node(cpu);
	struct topa *topa;
	struct page *p;

	p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
	if (!p)
		return NULL;

	topa = page_address(p);
	topa->last = 0;
	topa->phys = page_to_phys(p);

	/*
	 * In case of single-entry ToPA, always put the self-referencing END
	 * link as the 2nd entry in the table
	 */
	if (!pt_cap_get(PT_CAP_topa_multiple_entries)) {
		TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT;
		TOPA_ENTRY(topa, 1)->end = 1;
	}

	return topa;
}

/**
 * topa_free() - free a page-sized ToPA table
 * @topa:	Table to deallocate.
 */
static void topa_free(struct topa *topa)
{
	free_page((unsigned long)topa);
}

/**
 * topa_insert_table() - insert a ToPA table into a buffer
 * @buf:	 PT buffer that's being extended.
 * @topa:	 New topa table to be inserted.
 *
 * If this is the first table in the buffer, set up the buffer's pointers
 * accordingly; otherwise, add an END=1 link entry pointing to @topa to the
 * current "last" table and make @topa the new last table.
 */
static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
{
	struct topa *last = buf->last;

	list_add_tail(&topa->list, &buf->tables);

	if (!buf->first) {
		buf->first = buf->last = buf->cur = topa;
		return;
	}

	topa->offset = last->offset + last->size;
	buf->last = topa;

	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
		return;

	BUG_ON(last->last != TENTS_PER_PAGE - 1);

	TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT;
	TOPA_ENTRY(last, -1)->end = 1;
}

/**
 * topa_table_full() - check if a ToPA table is filled up
 * @topa:	ToPA table.
 */
static bool topa_table_full(struct topa *topa)
{
	/* single-entry ToPA is a special case */
	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
		return !!topa->last;

	return topa->last == TENTS_PER_PAGE - 1;
}

/**
 * topa_insert_pages() - create a list of ToPA tables
 * @buf:	PT buffer being initialized.
 * @gfp:	Allocation flags.
 *
 * This initializes a list of ToPA tables with entries from
 * the data_pages provided by rb_alloc_aux().
 *
 * Return:	0 on success or error code.
 */
static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp)
{
	struct topa *topa = buf->last;
	int order = 0;
	struct page *p;

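	/*
	 * rb_alloc_aux() may hand us high-order pages; for those, the
	 * allocation order is stashed in page_private(), so a single ToPA
	 * entry can cover the whole high-order chunk.
	 */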
	p = virt_to_page(buf->data_pages[buf->nr_pages]);
	if (PagePrivate(p))
		order = page_private(p);

	if (topa_table_full(topa)) {
		topa = topa_alloc(buf->cpu, gfp);
		if (!topa)
			return -ENOMEM;

		topa_insert_table(buf, topa);
	}

	TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
	TOPA_ENTRY(topa, -1)->size = order;
	if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) {
		TOPA_ENTRY(topa, -1)->intr = 1;
		TOPA_ENTRY(topa, -1)->stop = 1;
	}

	topa->last++;
	topa->size += sizes(order);

	buf->nr_pages += 1ul << order;

	return 0;
}

/**
 * pt_topa_dump() - print ToPA tables and their entries
 * @buf:	PT buffer.
 */
static void pt_topa_dump(struct pt_buffer *buf)
{
	struct topa *topa;

	list_for_each_entry(topa, &buf->tables, list) {
		int i;

		pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table,
			 topa->phys, topa->offset, topa->size);
		for (i = 0; i < TENTS_PER_PAGE; i++) {
			pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
				 &topa->table[i],
				 (unsigned long)topa->table[i].base << TOPA_SHIFT,
				 sizes(topa->table[i].size),
				 topa->table[i].end ?  'E' : ' ',
				 topa->table[i].intr ? 'I' : ' ',
				 topa->table[i].stop ? 'S' : ' ',
				 *(u64 *)&topa->table[i]);
			if ((pt_cap_get(PT_CAP_topa_multiple_entries) &&
			     topa->table[i].stop) ||
			    topa->table[i].end)
				break;
		}
	}
}

/**
 * pt_buffer_advance() - advance to the next output region
 * @buf:	PT buffer.
 *
 * Advance the current pointers in the buffer to the next ToPA entry.
 */
static void pt_buffer_advance(struct pt_buffer *buf)
{
	buf->output_off = 0;
	buf->cur_idx++;

	if (buf->cur_idx == buf->cur->last) {
		if (buf->cur == buf->last)
			buf->cur = buf->first;
		else
			buf->cur = list_entry(buf->cur->list.next, struct topa,
					      list);
		buf->cur_idx = 0;
	}
}

/**
 * pt_update_head() - calculate current offsets and sizes
 * @pt:		Per-cpu pt context.
 *
 * Update buffer's current write pointer position and data size.
 */
static void pt_update_head(struct pt *pt)
{
	struct pt_buffer *buf = perf_get_aux(&pt->handle);
	u64 topa_idx, base, old;

	/* start with this table's offset in the buffer plus the in-region offset */
	base = buf->cur->offset + buf->output_off;

	/* add the sizes of all regions preceding the current one in this table */
	for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
		base += sizes(buf->cur->table[topa_idx].size);

	if (buf->snapshot) {
		local_set(&buf->data_size, base);
	} else {
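		/*
		 * @base is an offset into the buffer; if it comes out smaller
		 * than the previous head (masked below), the trace has
		 * wrapped around, so account for one full pass over the
		 * buffer.
		 */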
		old = (local64_xchg(&buf->head, base) &
		       ((buf->nr_pages << PAGE_SHIFT) - 1));
		if (base < old)
			base += buf->nr_pages << PAGE_SHIFT;

		local_add(base - old, &buf->data_size);
	}
}

/**
 * pt_buffer_region() - obtain current output region's address
 * @buf:	PT buffer.
 */
static void *pt_buffer_region(struct pt_buffer *buf)
{
	return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT);
}

/**
 * pt_buffer_region_size() - obtain current output region's size
 * @buf:	PT buffer.
 */
static size_t pt_buffer_region_size(struct pt_buffer *buf)
{
	return sizes(buf->cur->table[buf->cur_idx].size);
}

/**
 * pt_handle_status() - take care of possible status conditions
 * @pt:		Per-cpu pt context.
 */
static void pt_handle_status(struct pt *pt)
{
	struct pt_buffer *buf = perf_get_aux(&pt->handle);
	int advance = 0;
	u64 status;

	rdmsrl(MSR_IA32_RTIT_STATUS, status);

	if (status & RTIT_STATUS_ERROR) {
		pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
		pt_topa_dump(buf);
		status &= ~RTIT_STATUS_ERROR;
	}

	if (status & RTIT_STATUS_STOPPED) {
		status &= ~RTIT_STATUS_STOPPED;

		/*
		 * On systems that only do single-entry ToPA, hitting STOP
		 * means we are already losing data; need to let the decoder
		 * know.
		 */
		if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
		    buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
			local_inc(&buf->lost);
			advance++;
		}
	}

	/*
	 * Also on single-entry ToPA implementations, interrupt will come
	 * before the output reaches its output region's boundary.
	 */
	if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot &&
	    pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
		void *head = pt_buffer_region(buf);

		/* everything within this margin needs to be zeroed out */
		memset(head + buf->output_off, 0,
		       pt_buffer_region_size(buf) -
		       buf->output_off);
		advance++;
	}

	if (advance)
		pt_buffer_advance(buf);

	wrmsrl(MSR_IA32_RTIT_STATUS, status);
}

/**
 * pt_read_offset() - translate registers into buffer pointers
 * @buf:	PT buffer.
 *
 * Set buffer's output pointers from MSR values.
 */
static void pt_read_offset(struct pt_buffer *buf)
{
	u64 offset, base_topa;

	rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa);
	buf->cur = phys_to_virt(base_topa);

	rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
	/* offset within current output region */
	buf->output_off = offset >> 32;
	/* index of current output region within this table */
	buf->cur_idx = (offset & 0xffffff80) >> 7;
}

/**
 * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry
 * @buf:	PT buffer.
 * @pg:		Page offset in the buffer.
 *
 * When advancing to the next output region (ToPA entry), given a page offset
 * into the buffer, we need to find the offset of the first page in the next
 * region.
 */
static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
{
	struct topa_entry *te = buf->topa_index[pg];

	/* one region */
	if (buf->first == buf->last && buf->first->last == 1)
		return pg;

	do {
		pg++;
		pg &= buf->nr_pages - 1;
	} while (buf->topa_index[pg] == te);

	return pg;
}

/**
 * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
 * @buf:	PT buffer.
 * @handle:	Current output handle.
 *
 * Place INT and STOP marks to prevent overwriting old data that the consumer
 * hasn't yet collected.
 */
static int pt_buffer_reset_markers(struct pt_buffer *buf,
				   struct perf_output_handle *handle)
{
	unsigned long head = local64_read(&buf->head);
	unsigned long idx, npages, wakeup;

	if (buf->snapshot)
		return 0;

	/* can't stop in the middle of an output region */
	if (buf->output_off + handle->size + 1 <
	    sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
		return -EINVAL;

	/* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
		return 0;

	/* clear STOP and INT from current entry */
	buf->topa_index[buf->stop_pos]->stop = 0;
	buf->topa_index[buf->stop_pos]->intr = 0;
	buf->topa_index[buf->intr_pos]->intr = 0;

	/* how many pages till the STOP marker */
	npages = handle->size >> PAGE_SHIFT;

	/* if it's on a page boundary, fill up one more page */
	if (!offset_in_page(head + handle->size + 1))
		npages++;

	idx = (head >> PAGE_SHIFT) + npages;
	idx &= buf->nr_pages - 1;
	buf->stop_pos = idx;

	wakeup = handle->wakeup >> PAGE_SHIFT;

	/* in the worst case, wake up the consumer one page before hard stop */
	idx = (head >> PAGE_SHIFT) + npages - 1;
	if (idx > wakeup)
		idx = wakeup;

	idx &= buf->nr_pages - 1;
	buf->intr_pos = idx;

	buf->topa_index[buf->stop_pos]->stop = 1;
	buf->topa_index[buf->stop_pos]->intr = 1;
	buf->topa_index[buf->intr_pos]->intr = 1;

	return 0;
}

/**
 * pt_buffer_setup_topa_index() - build topa_index[] table of regions
 * @buf:	PT buffer.
 *
 * topa_index[] references output regions indexed by offset into the
 * buffer for purposes of quick reverse lookup.
 */
static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
{
	struct topa *cur = buf->first, *prev = buf->last;
	struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
		*te_prev = TOPA_ENTRY(prev, prev->last - 1);
	int pg = 0, idx = 0, ntopa = 0;

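	/*
	 * Note the deliberate off-by-one below: topa_index[pg] ends up
	 * pointing at the ToPA entry *preceding* the one that covers page
	 * @pg (te_prev trails te_cur by one region), which is what
	 * pt_topa_next_entry() relies on to find region boundaries.
	 */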
	while (pg < buf->nr_pages) {
		int tidx;

		/* pages within one topa entry */
		for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++)
			buf->topa_index[pg] = te_prev;

		te_prev = te_cur;

		if (idx == cur->last - 1) {
			/* advance to next topa table */
			idx = 0;
			cur = list_entry(cur->list.next, struct topa, list);
			ntopa++;
		} else
			idx++;
		te_cur = TOPA_ENTRY(cur, idx);
	}
}

/**
 * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
 * @buf:	PT buffer.
 * @head:	Write pointer (aux_head) from AUX buffer.
 *
 * Find the ToPA table and entry corresponding to given @head and set buffer's
 * "current" pointers accordingly.
 */
static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
{
	int pg;

	if (buf->snapshot)
		head &= (buf->nr_pages << PAGE_SHIFT) - 1;

	pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
	pg = pt_topa_next_entry(buf, pg);

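	/*
	 * Each struct topa occupies exactly one page, so masking the entry
	 * pointer with PAGE_MASK recovers the table and the remainder yields
	 * the entry index within it.
	 */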
	buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK);
	buf->cur_idx = ((unsigned long)buf->topa_index[pg] -
			(unsigned long)buf->cur) / sizeof(struct topa_entry);
	buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1);

	local64_set(&buf->head, head);
	local_set(&buf->data_size, 0);
}

/**
 * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
 * @buf:	PT buffer.
 */
static void pt_buffer_fini_topa(struct pt_buffer *buf)
{
	struct topa *topa, *iter;

	list_for_each_entry_safe(topa, iter, &buf->tables, list) {
		/*
		 * right now, this is in free_aux() path only, so
		 * no need to unlink this table from the list
		 */
		topa_free(topa);
	}
}

/**
 * pt_buffer_init_topa() - initialize ToPA table for pt buffer
 * @buf:	PT buffer.
 * @nr_pages:	Number of buffer pages to cover with ToPA entries.
 * @gfp:	Allocation flags.
 */
static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages,
			       gfp_t gfp)
{
	struct topa *topa;
	int err;

	topa = topa_alloc(buf->cpu, gfp);
	if (!topa)
		return -ENOMEM;

	topa_insert_table(buf, topa);

	while (buf->nr_pages < nr_pages) {
		err = topa_insert_pages(buf, gfp);
		if (err) {
			pt_buffer_fini_topa(buf);
			return -ENOMEM;
		}
	}

	pt_buffer_setup_topa_index(buf);

	/* link last table to the first one, unless we're double buffering */
	if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
		TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT;
		TOPA_ENTRY(buf->last, -1)->end = 1;
	}

	pt_topa_dump(buf);
	return 0;
}

/**
 * pt_buffer_setup_aux() - set up topa tables for a PT buffer
 * @cpu:	Cpu on which to allocate, -1 means current.
 * @pages:	Array of pointers to buffer pages passed from perf core.
 * @nr_pages:	Number of pages in the buffer.
 * @snapshot:	If this is a snapshot/overwrite counter.
 *
 * This is a pmu::setup_aux callback that sets up ToPA tables and all the
 * bookkeeping for an AUX buffer.
 *
 * Return:	Our private PT buffer structure.
 */
static void *
pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot)
{
	struct pt_buffer *buf;
	int node, ret;

	if (!nr_pages)
		return NULL;

	if (cpu == -1)
		cpu = raw_smp_processor_id();
	node = cpu_to_node(cpu);

	buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]),
			   GFP_KERNEL, node);
	if (!buf)
		return NULL;

	buf->cpu = cpu;
	buf->snapshot = snapshot;
	buf->data_pages = pages;

	INIT_LIST_HEAD(&buf->tables);

	ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL);
	if (ret) {
		kfree(buf);
		return NULL;
	}

	return buf;
}

/**
 * pt_buffer_free_aux() - perf AUX deallocation path callback
 * @data:	PT buffer.
 */
static void pt_buffer_free_aux(void *data)
{
	struct pt_buffer *buf = data;

	pt_buffer_fini_topa(buf);
	kfree(buf);
}

/**
 * pt_buffer_is_full() - check if the buffer is full
 * @buf:	PT buffer.
 * @pt:		Per-cpu pt handle.
 *
 * If the user hasn't read data from the output region that aux_head
 * points to, the buffer is considered full: the user needs to read at
 * least this region and update aux_tail to point past it.
 */
static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt)
{
	if (buf->snapshot)
		return false;

	if (local_read(&buf->data_size) >= pt->handle.size)
		return true;

	return false;
}

/**
 * intel_pt_interrupt() - PT PMI handler
 */
void intel_pt_interrupt(void)
{
	struct pt *pt = this_cpu_ptr(&pt_ctx);
	struct pt_buffer *buf;
	struct perf_event *event = pt->handle.event;

	/*
	 * There may be a dangling PT bit in the interrupt status register
	 * after PT has been disabled by pt_event_stop(). Make sure we don't
	 * do anything (particularly, re-enable) for this event here.
	 */
	if (!ACCESS_ONCE(pt->handle_nmi))
		return;

	pt_config_start(false);

	if (!event)
		return;

	buf = perf_get_aux(&pt->handle);
	if (!buf)
		return;

	pt_read_offset(buf);

	pt_handle_status(pt);

	pt_update_head(pt);

	perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
			    local_xchg(&buf->lost, 0));

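	/*
	 * The AUX transaction above is now closed; if the event is still
	 * active (hw.state == 0), open a new one and re-enable tracing from
	 * where the hardware left off.
	 */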
	if (!event->hw.state) {
		int ret;

		buf = perf_aux_output_begin(&pt->handle, event);
		if (!buf) {
			event->hw.state = PERF_HES_STOPPED;
			return;
		}

		pt_buffer_reset_offsets(buf, pt->handle.head);
		ret = pt_buffer_reset_markers(buf, &pt->handle);
		if (ret) {
			perf_aux_output_end(&pt->handle, 0, true);
			return;
		}

		pt_config_buffer(buf->cur->table, buf->cur_idx,
				 buf->output_off);
		wrmsrl(MSR_IA32_RTIT_STATUS, 0);
		pt_config(event);
	}
}

/*
 * PMU callbacks
 */

static void pt_event_start(struct perf_event *event, int mode)
{
	struct pt *pt = this_cpu_ptr(&pt_ctx);
	struct pt_buffer *buf = perf_get_aux(&pt->handle);

	if (pt_is_running() || !buf || pt_buffer_is_full(buf, pt)) {
		event->hw.state = PERF_HES_STOPPED;
		return;
	}

	ACCESS_ONCE(pt->handle_nmi) = 1;
	event->hw.state = 0;

	pt_config_buffer(buf->cur->table, buf->cur_idx,
			 buf->output_off);
	wrmsrl(MSR_IA32_RTIT_STATUS, 0);
	pt_config(event);
}

static void pt_event_stop(struct perf_event *event, int mode)
{
	struct pt *pt = this_cpu_ptr(&pt_ctx);

	/*
	 * Protect against the PMI racing with disabling wrmsr,
	 * see comment in intel_pt_interrupt().
	 */
	ACCESS_ONCE(pt->handle_nmi) = 0;
	pt_config_start(false);

	if (event->hw.state == PERF_HES_STOPPED)
		return;

	event->hw.state = PERF_HES_STOPPED;

	if (mode & PERF_EF_UPDATE) {
		struct pt *pt = this_cpu_ptr(&pt_ctx);
		struct pt_buffer *buf = perf_get_aux(&pt->handle);

		if (!buf)
			return;

		if (WARN_ON_ONCE(pt->handle.event != event))
			return;

		pt_read_offset(buf);

		pt_handle_status(pt);

		pt_update_head(pt);
	}
}

static void pt_event_del(struct perf_event *event, int mode)
{
	struct pt *pt = this_cpu_ptr(&pt_ctx);
	struct pt_buffer *buf;

	pt_event_stop(event, PERF_EF_UPDATE);

	buf = perf_get_aux(&pt->handle);

	if (buf) {
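		/*
		 * In snapshot (overwrite) mode the whole buffer is reported
		 * as captured data and aux_head is set to the current write
		 * position; the decoder is expected to work out where the
		 * trace actually starts.
		 */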
		if (buf->snapshot)
			pt->handle.head =
				local_xchg(&buf->data_size,
					   buf->nr_pages << PAGE_SHIFT);
		perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
				    local_xchg(&buf->lost, 0));
	}
}

static int pt_event_add(struct perf_event *event, int mode)
{
	struct pt_buffer *buf;
	struct pt *pt = this_cpu_ptr(&pt_ctx);
	struct hw_perf_event *hwc = &event->hw;
	int ret = -EBUSY;

	if (pt->handle.event)
		goto fail;

	buf = perf_aux_output_begin(&pt->handle, event);
	ret = -EINVAL;
	if (!buf)
		goto fail_stop;

	pt_buffer_reset_offsets(buf, pt->handle.head);
	if (!buf->snapshot) {
		ret = pt_buffer_reset_markers(buf, &pt->handle);
		if (ret)
			goto fail_end_stop;
	}

	if (mode & PERF_EF_START) {
		pt_event_start(event, 0);
		ret = -EBUSY;
		if (hwc->state == PERF_HES_STOPPED)
			goto fail_end_stop;
	} else {
		hwc->state = PERF_HES_STOPPED;
	}

	return 0;

fail_end_stop:
	perf_aux_output_end(&pt->handle, 0, true);
fail_stop:
	hwc->state = PERF_HES_STOPPED;
fail:
	return ret;
}

static void pt_event_read(struct perf_event *event)
{
}

static void pt_event_destroy(struct perf_event *event)
{
	x86_del_exclusive(x86_lbr_exclusive_pt);
}

static int pt_event_init(struct perf_event *event)
{
	if (event->attr.type != pt_pmu.pmu.type)
		return -ENOENT;

	if (!pt_event_valid(event))
		return -EINVAL;

	if (x86_add_exclusive(x86_lbr_exclusive_pt))
		return -EBUSY;

	event->destroy = pt_event_destroy;

	return 0;
}

static __init int pt_init(void)
{
	int ret, cpu, prior_warn = 0;

	BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
	get_online_cpus();
	for_each_online_cpu(cpu) {
		u64 ctl;

		ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
		if (!ret && (ctl & RTIT_CTL_TRACEEN))
			prior_warn++;
	}
	put_online_cpus();

	if (prior_warn) {
		x86_add_exclusive(x86_lbr_exclusive_pt);
		pr_warn("PT is enabled at boot time, doing nothing\n");

		return -EBUSY;
	}

	ret = pt_pmu_hw_init();
	if (ret)
		return ret;

	if (!pt_cap_get(PT_CAP_topa_output)) {
		pr_warn("ToPA output is not supported on this CPU\n");
		return -ENODEV;
	}

	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
		pt_pmu.pmu.capabilities =
			PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF;
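	/*
	 * Without multi-entry ToPA the AUX buffer has to look physically
	 * contiguous to the hardware, hence no scatter-gather and software
	 * double-buffering above.
	 */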

	pt_pmu.pmu.capabilities	|= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
	pt_pmu.pmu.attr_groups	= pt_attr_groups;
	pt_pmu.pmu.task_ctx_nr	= perf_sw_context;
	pt_pmu.pmu.event_init	= pt_event_init;
	pt_pmu.pmu.add		= pt_event_add;
	pt_pmu.pmu.del		= pt_event_del;
	pt_pmu.pmu.start	= pt_event_start;
	pt_pmu.pmu.stop		= pt_event_stop;
	pt_pmu.pmu.read		= pt_event_read;
	pt_pmu.pmu.setup_aux	= pt_buffer_setup_aux;
	pt_pmu.pmu.free_aux	= pt_buffer_free_aux;
	ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);

	return ret;
}

module_init(pt_init);