#include "perf_event_intel_uncore.h"

static struct intel_uncore_type *empty_uncore[] = { NULL, };
struct intel_uncore_type **uncore_msr_uncores = empty_uncore;
struct intel_uncore_type **uncore_pci_uncores = empty_uncore;

static bool pcidrv_registered;
struct pci_driver *uncore_pci_driver;
/* pci bus to socket mapping */
int uncore_pcibus_to_physid[256] = { [0 ... 255] = -1, };
struct pci_dev *uncore_extra_pci_dev[UNCORE_SOCKET_MAX][UNCORE_EXTRA_PCI_DEV_MAX];

static DEFINE_RAW_SPINLOCK(uncore_box_lock);
/* mask of cpus that collect uncore events */
static cpumask_t uncore_cpu_mask;

/* constraint for the fixed counter */
static struct event_constraint uncore_constraint_fixed =
	EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL);
struct event_constraint uncore_constraint_empty =
	EVENT_CONSTRAINT(0, 0, 0);

ssize_t uncore_event_show(struct kobject *kobj,
			  struct kobj_attribute *attr, char *buf)
{
	struct uncore_event_desc *event =
		container_of(attr, struct uncore_event_desc, attr);
	return sprintf(buf, "%s", event->config);
}

struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
{
	return container_of(event->pmu, struct intel_uncore_pmu, pmu);
}

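/*
 * Look up the uncore box that services @cpu for this pmu. The per-cpu
 * pointer acts as a cache; on a miss we search the pmu's box list under
 * uncore_box_lock for a box on the same physical package and pin it by
 * taking a reference.
 */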
struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
{
	struct intel_uncore_box *box;

	box = *per_cpu_ptr(pmu->box, cpu);
	if (box)
		return box;

	raw_spin_lock(&uncore_box_lock);
	/* Recheck under the lock to handle races. */
	if (*per_cpu_ptr(pmu->box, cpu))
		goto out;
	list_for_each_entry(box, &pmu->box_list, list) {
		if (box->phys_id == topology_physical_package_id(cpu)) {
			atomic_inc(&box->refcnt);
			*per_cpu_ptr(pmu->box, cpu) = box;
			break;
		}
	}
out:
	raw_spin_unlock(&uncore_box_lock);

	return *per_cpu_ptr(pmu->box, cpu);
}

struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
{
	/*
	 * perf core schedules events on the basis of cpu; uncore events are
	 * collected by one of the cpus inside a physical package.
	 */
	return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id());
}

u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
{
	u64 count;

	rdmsrl(event->hw.event_base, count);

	return count;
}


/*
 * generic get constraint function for shared match/mask registers.
 */
struct event_constraint *
uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
{
	struct intel_uncore_extra_reg *er;
	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
	struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
	unsigned long flags;
	bool ok = false;

	/*
	 * reg->alloc can be set due to existing state, so for a fake box we
	 * need to ignore it; otherwise we might fail to allocate proper
	 * fake state for this extra reg constraint.
	 */
	if (reg1->idx == EXTRA_REG_NONE ||
	    (!uncore_box_is_fake(box) && reg1->alloc))
		return NULL;

	er = &box->shared_regs[reg1->idx];
	raw_spin_lock_irqsave(&er->lock, flags);
	if (!atomic_read(&er->ref) ||
	    (er->config1 == reg1->config && er->config2 == reg2->config)) {
		atomic_inc(&er->ref);
		er->config1 = reg1->config;
		er->config2 = reg2->config;
		ok = true;
	}
	raw_spin_unlock_irqrestore(&er->lock, flags);

	if (ok) {
		if (!uncore_box_is_fake(box))
			reg1->alloc = 1;
		return NULL;
	}

	return &uncore_constraint_empty;
}

void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
{
	struct intel_uncore_extra_reg *er;
	struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;

	/*
	 * Only put the constraint if the extra reg was actually allocated.
	 * This also takes care of events which do not use an extra shared
	 * reg.
	 *
	 * Also, if this is a fake box we shouldn't touch any event state
	 * (reg->alloc) and we don't care about leaving inconsistent box
	 * state either since it will be thrown out.
	 */
	if (uncore_box_is_fake(box) || !reg1->alloc)
		return;

	er = &box->shared_regs[reg1->idx];
	atomic_dec(&er->ref);
	reg1->alloc = 0;
}

u64 uncore_shared_reg_config(struct intel_uncore_box *box, int idx)
{
	struct intel_uncore_extra_reg *er;
	unsigned long flags;
	u64 config;

	er = &box->shared_regs[idx];

	raw_spin_lock_irqsave(&er->lock, flags);
	config = er->config;
	raw_spin_unlock_irqrestore(&er->lock, flags);

	return config;
}

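/*
 * Bind an event to the counter at @idx by programming its control and
 * counter register bases, either the fixed counter or a generic one.
 */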
static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx)
{
	struct hw_perf_event *hwc = &event->hw;

	hwc->idx = idx;
	hwc->last_tag = ++box->tags[idx];

	if (hwc->idx == UNCORE_PMC_IDX_FIXED) {
		hwc->event_base = uncore_fixed_ctr(box);
		hwc->config_base = uncore_fixed_ctl(box);
		return;
	}

	hwc->config_base = uncore_event_ctl(box, hwc->idx);
	hwc->event_base  = uncore_perf_ctr(box, hwc->idx);
}

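/*
 * Fold the delta since the last read into the event count. Shifting the
 * raw values up by (64 - counter width) and back down truncates the
 * delta to the counter width, so a counter wrap still yields the right
 * difference.
 */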
void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event)
{
	u64 prev_count, new_count, delta;
	int shift;

	if (event->hw.idx >= UNCORE_PMC_IDX_FIXED)
		shift = 64 - uncore_fixed_ctr_bits(box);
	else
		shift = 64 - uncore_perf_ctr_bits(box);

	/* the hrtimer might modify the previous event value */
again:
	prev_count = local64_read(&event->hw.prev_count);
	new_count = uncore_read_counter(box, event);
	if (local64_xchg(&event->hw.prev_count, new_count) != prev_count)
		goto again;

	delta = (new_count << shift) - (prev_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
}

/*
 * The overflow interrupt is unavailable for SandyBridge-EP and broken
 * for SandyBridge, so we use a hrtimer to periodically poll the counter
 * to avoid overflow.
 */
static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
{
	struct intel_uncore_box *box;
	struct perf_event *event;
	unsigned long flags;
	int bit;

	box = container_of(hrtimer, struct intel_uncore_box, hrtimer);
	if (!box->n_active || box->cpu != smp_processor_id())
		return HRTIMER_NORESTART;
	/*
	 * disable local interrupts to prevent uncore_pmu_event_start/stop
	 * from interrupting the update process
	 */
	local_irq_save(flags);

	/*
	 * handle boxes with an active event list as opposed to active
	 * counters
	 */
	list_for_each_entry(event, &box->active_list, active_entry) {
		uncore_perf_event_update(box, event);
	}

	for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX)
		uncore_perf_event_update(box, box->events[bit]);

	local_irq_restore(flags);

	hrtimer_forward_now(hrtimer, ns_to_ktime(box->hrtimer_duration));
	return HRTIMER_RESTART;
}

void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
{
	__hrtimer_start_range_ns(&box->hrtimer,
			ns_to_ktime(box->hrtimer_duration), 0,
			HRTIMER_MODE_REL_PINNED, 0);
}

void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
{
	hrtimer_cancel(&box->hrtimer);
}

static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box)
{
	hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	box->hrtimer.function = uncore_pmu_hrtimer;
}

static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int node)
{
	struct intel_uncore_box *box;
	int i, size;

	size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg);

	box = kzalloc_node(size, GFP_KERNEL, node);
	if (!box)
		return NULL;

	for (i = 0; i < type->num_shared_regs; i++)
		raw_spin_lock_init(&box->shared_regs[i].lock);

	uncore_pmu_init_hrtimer(box);
	atomic_set(&box->refcnt, 1);
	box->cpu = -1;
	box->phys_id = -1;

	/* set default hrtimer timeout */
	box->hrtimer_duration = UNCORE_PMU_HRTIMER_INTERVAL;

	INIT_LIST_HEAD(&box->active_list);

	return box;
}

/*
 * Use uncore_pmu_event_init, the pmu's event_init callback, as a
 * detection point for uncore events.
 */
static int uncore_pmu_event_init(struct perf_event *event);

static bool is_uncore_event(struct perf_event *event)
{
	return event->pmu->event_init == uncore_pmu_event_init;
}

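/*
 * Collect the leader and, if @dogrp is set, its uncore siblings into the
 * box's event list. Returns the new number of collected events, or
 * -EINVAL if the box would exceed its counter capacity.
 */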
static int
uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp)
{
	struct perf_event *event;
	int n, max_count;

	max_count = box->pmu->type->num_counters;
	if (box->pmu->type->fixed_ctl)
		max_count++;

	if (box->n_events >= max_count)
		return -EINVAL;

	n = box->n_events;

	if (is_uncore_event(leader)) {
		box->event_list[n] = leader;
		n++;
	}

	if (!dogrp)
		return n;

	list_for_each_entry(event, &leader->sibling_list, group_entry) {
		if (!is_uncore_event(event) ||
		    event->state <= PERF_EVENT_STATE_OFF)
			continue;

		if (n >= max_count)
			return -EINVAL;

		box->event_list[n] = event;
		n++;
	}
	return n;
}

static struct event_constraint *
uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *event)
{
	struct intel_uncore_type *type = box->pmu->type;
	struct event_constraint *c;

	if (type->ops->get_constraint) {
		c = type->ops->get_constraint(box, event);
		if (c)
			return c;
	}

	if (event->attr.config == UNCORE_FIXED_EVENT)
		return &uncore_constraint_fixed;

	if (type->constraints) {
		for_each_event_constraint(c, type->constraints) {
			if ((event->hw.config & c->cmask) == c->code)
				return c;
		}
	}

	return &type->unconstrainted;
}

static void uncore_put_event_constraint(struct intel_uncore_box *box, struct perf_event *event)
{
	if (box->pmu->type->ops->put_constraint)
		box->pmu->type->ops->put_constraint(box, event);
}

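/*
 * Assign counters to the collected events. The fast path tries to keep
 * every event on its previous counter; if that fails, fall back to the
 * constraint-aware scheduler in perf_assign_events(). With a NULL
 * @assign array this only validates that a schedule exists.
 */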
static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n)
{
	unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
	struct event_constraint *c;
	int i, wmin, wmax, ret = 0;
	struct hw_perf_event *hwc;

	bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);

	for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
		c = uncore_get_event_constraint(box, box->event_list[i]);
		box->event_constraint[i] = c;
		wmin = min(wmin, c->weight);
		wmax = max(wmax, c->weight);
	}

	/* fastpath, try to reuse previous register */
	for (i = 0; i < n; i++) {
		hwc = &box->event_list[i]->hw;
		c = box->event_constraint[i];

		/* never assigned */
		if (hwc->idx == -1)
			break;

		/* constraint still honored */
		if (!test_bit(hwc->idx, c->idxmsk))
			break;

		/* not already used */
		if (test_bit(hwc->idx, used_mask))
			break;

		__set_bit(hwc->idx, used_mask);
		if (assign)
			assign[i] = hwc->idx;
	}
	/* slow path */
	if (i != n)
		ret = perf_assign_events(box->event_constraint, n,
					 wmin, wmax, n, assign);

	if (!assign || ret) {
		for (i = 0; i < n; i++)
			uncore_put_event_constraint(box, box->event_list[i]);
	}
	return ret ? -EINVAL : 0;
}

static void uncore_pmu_event_start(struct perf_event *event, int flags)
{
	struct intel_uncore_box *box = uncore_event_to_box(event);
	int idx = event->hw.idx;

	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX))
		return;

	event->hw.state = 0;
	box->events[idx] = event;
	box->n_active++;
	__set_bit(idx, box->active_mask);

	local64_set(&event->hw.prev_count, uncore_read_counter(box, event));
	uncore_enable_event(box, event);

	if (box->n_active == 1) {
		uncore_enable_box(box);
		uncore_pmu_start_hrtimer(box);
	}
}

static void uncore_pmu_event_stop(struct perf_event *event, int flags)
{
	struct intel_uncore_box *box = uncore_event_to_box(event);
	struct hw_perf_event *hwc = &event->hw;

	if (__test_and_clear_bit(hwc->idx, box->active_mask)) {
		uncore_disable_event(box, event);
		box->n_active--;
		box->events[hwc->idx] = NULL;
		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;

		if (box->n_active == 0) {
			uncore_disable_box(box);
			uncore_pmu_cancel_hrtimer(box);
		}
	}

	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		uncore_perf_event_update(box, event);
		hwc->state |= PERF_HES_UPTODATE;
	}
}

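/*
 * Add @event to the box: collect it, compute a counter assignment for
 * all events on the box, stop any events that have to move, then
 * (re)start everything that is allowed to run on its assigned counter.
 */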
static int uncore_pmu_event_add(struct perf_event *event, int flags)
{
	struct intel_uncore_box *box = uncore_event_to_box(event);
	struct hw_perf_event *hwc = &event->hw;
	int assign[UNCORE_PMC_IDX_MAX];
	int i, n, ret;

	if (!box)
		return -ENODEV;

	ret = n = uncore_collect_events(box, event, false);
	if (ret < 0)
		return ret;

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
	if (!(flags & PERF_EF_START))
		hwc->state |= PERF_HES_ARCH;

	ret = uncore_assign_events(box, assign, n);
	if (ret)
		return ret;

	/* save events moving to new counters */
	for (i = 0; i < box->n_events; i++) {
		event = box->event_list[i];
		hwc = &event->hw;

		if (hwc->idx == assign[i] &&
			hwc->last_tag == box->tags[assign[i]])
			continue;
		/*
		 * Ensure we don't accidentally enable a stopped
		 * counter simply because we rescheduled.
		 */
		if (hwc->state & PERF_HES_STOPPED)
			hwc->state |= PERF_HES_ARCH;

		uncore_pmu_event_stop(event, PERF_EF_UPDATE);
	}

	/* reprogram moved events into new counters */
	for (i = 0; i < n; i++) {
		event = box->event_list[i];
		hwc = &event->hw;

		if (hwc->idx != assign[i] ||
			hwc->last_tag != box->tags[assign[i]])
			uncore_assign_hw_event(box, event, assign[i]);
		else if (i < box->n_events)
			continue;

		if (hwc->state & PERF_HES_ARCH)
			continue;

		uncore_pmu_event_start(event, 0);
	}
	box->n_events = n;

	return 0;
}

static void uncore_pmu_event_del(struct perf_event *event, int flags)
{
	struct intel_uncore_box *box = uncore_event_to_box(event);
	int i;

	uncore_pmu_event_stop(event, PERF_EF_UPDATE);

	for (i = 0; i < box->n_events; i++) {
		if (event == box->event_list[i]) {
			uncore_put_event_constraint(box, event);

			while (++i < box->n_events)
				box->event_list[i - 1] = box->event_list[i];

			--box->n_events;
			break;
		}
	}

	event->hw.idx = -1;
	event->hw.last_tag = ~0ULL;
}

void uncore_pmu_event_read(struct perf_event *event)
{
	struct intel_uncore_box *box = uncore_event_to_box(event);
	uncore_perf_event_update(box, event);
}

/*
 * validation ensures the group can be loaded onto the
 * PMU if it was the only group available.
 */
static int uncore_validate_group(struct intel_uncore_pmu *pmu,
				struct perf_event *event)
{
	struct perf_event *leader = event->group_leader;
	struct intel_uncore_box *fake_box;
	int ret = -EINVAL, n;

	fake_box = uncore_alloc_box(pmu->type, NUMA_NO_NODE);
	if (!fake_box)
		return -ENOMEM;

	fake_box->pmu = pmu;
	/*
	 * The event is not yet connected with its siblings, therefore we
	 * must first collect the existing siblings and then add the new
	 * event before we can simulate the scheduling.
	 */
	n = uncore_collect_events(fake_box, leader, true);
	if (n < 0)
		goto out;

	fake_box->n_events = n;
	n = uncore_collect_events(fake_box, event, false);
	if (n < 0)
		goto out;

	fake_box->n_events = n;

	ret = uncore_assign_events(fake_box, NULL, n);
out:
	kfree(fake_box);
	return ret;
}

static int uncore_pmu_event_init(struct perf_event *event)
{
	struct intel_uncore_pmu *pmu;
	struct intel_uncore_box *box;
	struct hw_perf_event *hwc = &event->hw;
	int ret;

	if (event->attr.type != event->pmu->type)
		return -ENOENT;

	pmu = uncore_event_to_pmu(event);
	/* no device found for this pmu */
	if (pmu->func_id < 0)
		return -ENOENT;

	/*
	 * The uncore PMU measures at all privilege levels all the time,
	 * so it doesn't make sense to specify any exclude bits.
	 */
	if (event->attr.exclude_user || event->attr.exclude_kernel ||
			event->attr.exclude_hv || event->attr.exclude_idle)
		return -EINVAL;

	/* Sampling not supported yet */
	if (hwc->sample_period)
		return -EINVAL;

	/*
	 * Place all uncore events for a particular physical package
	 * onto a single cpu
	 */
	if (event->cpu < 0)
		return -EINVAL;
	box = uncore_pmu_to_box(pmu, event->cpu);
	if (!box || box->cpu < 0)
		return -EINVAL;
	event->cpu = box->cpu;

	event->hw.idx = -1;
	event->hw.last_tag = ~0ULL;
	event->hw.extra_reg.idx = EXTRA_REG_NONE;
	event->hw.branch_reg.idx = EXTRA_REG_NONE;

	if (event->attr.config == UNCORE_FIXED_EVENT) {
		/* no fixed counter */
		if (!pmu->type->fixed_ctl)
			return -EINVAL;
		/*
		 * if there is only one fixed counter, only the first pmu
		 * can access the fixed counter
		 */
		if (pmu->type->single_fixed && pmu->pmu_idx > 0)
			return -EINVAL;

		/* fixed counters have the event field hardcoded to zero */
		hwc->config = 0ULL;
	} else {
		hwc->config = event->attr.config & pmu->type->event_mask;
		if (pmu->type->ops->hw_config) {
			ret = pmu->type->ops->hw_config(box, event);
			if (ret)
				return ret;
		}
	}

	if (event->group_leader != event)
		ret = uncore_validate_group(pmu, event);
	else
		ret = 0;

	return ret;
}

static ssize_t uncore_get_attr_cpumask(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	return cpumap_print_to_pagebuf(true, buf, &uncore_cpu_mask);
}

static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL);

static struct attribute *uncore_pmu_attrs[] = {
	&dev_attr_cpumask.attr,
	NULL,
};

static struct attribute_group uncore_pmu_attr_group = {
	.attrs = uncore_pmu_attrs,
};

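/*
 * Register one pmu instance with the perf core. Types that do not
 * provide their own struct pmu get the generic uncore callbacks; the
 * pmu name is derived from the type name and, for multi-box types,
 * the box index.
 */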
static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
{
	int ret;

	if (!pmu->type->pmu) {
		pmu->pmu = (struct pmu) {
			.attr_groups	= pmu->type->attr_groups,
			.task_ctx_nr	= perf_invalid_context,
			.event_init	= uncore_pmu_event_init,
			.add		= uncore_pmu_event_add,
			.del		= uncore_pmu_event_del,
			.start		= uncore_pmu_event_start,
			.stop		= uncore_pmu_event_stop,
			.read		= uncore_pmu_event_read,
		};
	} else {
		pmu->pmu = *pmu->type->pmu;
		pmu->pmu.attr_groups = pmu->type->attr_groups;
	}

	if (pmu->type->num_boxes == 1) {
		if (strlen(pmu->type->name) > 0)
			sprintf(pmu->name, "uncore_%s", pmu->type->name);
		else
			sprintf(pmu->name, "uncore");
	} else {
		sprintf(pmu->name, "uncore_%s_%d", pmu->type->name,
			pmu->pmu_idx);
	}

	ret = perf_pmu_register(&pmu->pmu, pmu->name, -1);
	return ret;
}

static void __init uncore_type_exit(struct intel_uncore_type *type)
{
	int i;

	for (i = 0; i < type->num_boxes; i++)
		free_percpu(type->pmus[i].box);
	kfree(type->pmus);
	type->pmus = NULL;
	kfree(type->events_group);
	type->events_group = NULL;
}

static void __init uncore_types_exit(struct intel_uncore_type **types)
{
	int i;
	for (i = 0; types[i]; i++)
		uncore_type_exit(types[i]);
}

static int __init uncore_type_init(struct intel_uncore_type *type)
{
	struct intel_uncore_pmu *pmus;
	struct attribute_group *attr_group;
	struct attribute **attrs;
	int i, j;

	pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL);
	if (!pmus)
		return -ENOMEM;

	type->pmus = pmus;

	type->unconstrainted = (struct event_constraint)
		__EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1,
				0, type->num_counters, 0, 0);

	for (i = 0; i < type->num_boxes; i++) {
		pmus[i].func_id = -1;
		pmus[i].pmu_idx = i;
		pmus[i].type = type;
		INIT_LIST_HEAD(&pmus[i].box_list);
		pmus[i].box = alloc_percpu(struct intel_uncore_box *);
		if (!pmus[i].box)
			goto fail;
	}

	if (type->event_descs) {
		i = 0;
		while (type->event_descs[i].attr.attr.name)
			i++;

		attr_group = kzalloc(sizeof(struct attribute *) * (i + 1) +
					sizeof(*attr_group), GFP_KERNEL);
		if (!attr_group)
			goto fail;

		attrs = (struct attribute **)(attr_group + 1);
		attr_group->name = "events";
		attr_group->attrs = attrs;

		for (j = 0; j < i; j++)
			attrs[j] = &type->event_descs[j].attr.attr;

		type->events_group = attr_group;
	}

	type->pmu_group = &uncore_pmu_attr_group;
	return 0;
fail:
	uncore_type_exit(type);
	return -ENOMEM;
}

static int __init uncore_types_init(struct intel_uncore_type **types)
{
	int i, ret;

	for (i = 0; types[i]; i++) {
		ret = uncore_type_init(types[i]);
		if (ret)
			goto fail;
	}
	return 0;
fail:
	while (--i >= 0)
		uncore_type_exit(types[i]);
	return ret;
}

/*
 * add a pci uncore device
 */
static int uncore_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	struct intel_uncore_pmu *pmu;
	struct intel_uncore_box *box;
	struct intel_uncore_type *type;
	int phys_id;
	bool first_box = false;

	phys_id = uncore_pcibus_to_physid[pdev->bus->number];
	if (phys_id < 0)
		return -ENODEV;

	if (UNCORE_PCI_DEV_TYPE(id->driver_data) == UNCORE_EXTRA_PCI_DEV) {
		int idx = UNCORE_PCI_DEV_IDX(id->driver_data);
		uncore_extra_pci_dev[phys_id][idx] = pdev;
		pci_set_drvdata(pdev, NULL);
		return 0;
	}

	type = uncore_pci_uncores[UNCORE_PCI_DEV_TYPE(id->driver_data)];
	box = uncore_alloc_box(type, NUMA_NO_NODE);
	if (!box)
		return -ENOMEM;

	/*
	 * for performance monitoring units with multiple boxes,
	 * each box has a different function id.
	 */
	pmu = &type->pmus[UNCORE_PCI_DEV_IDX(id->driver_data)];
	if (pmu->func_id < 0)
		pmu->func_id = pdev->devfn;
	else
		WARN_ON_ONCE(pmu->func_id != pdev->devfn);

	box->phys_id = phys_id;
	box->pci_dev = pdev;
	box->pmu = pmu;
	uncore_box_init(box);
	pci_set_drvdata(pdev, box);

	raw_spin_lock(&uncore_box_lock);
	if (list_empty(&pmu->box_list))
		first_box = true;
	list_add_tail(&box->list, &pmu->box_list);
	raw_spin_unlock(&uncore_box_lock);

	if (first_box)
		uncore_pmu_register(pmu);
	return 0;
}

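/*
 * Remove a pci uncore device: either clear its slot in
 * uncore_extra_pci_dev[] or tear down the box, dropping the per-cpu
 * references and unregistering the pmu when its last box goes away.
 */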
static void uncore_pci_remove(struct pci_dev *pdev)
{
	struct intel_uncore_box *box = pci_get_drvdata(pdev);
	struct intel_uncore_pmu *pmu;
	int i, cpu, phys_id = uncore_pcibus_to_physid[pdev->bus->number];
	bool last_box = false;

	if (!box) {
		for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
			if (uncore_extra_pci_dev[phys_id][i] == pdev) {
				uncore_extra_pci_dev[phys_id][i] = NULL;
				break;
			}
		}
		WARN_ON_ONCE(i >= UNCORE_EXTRA_PCI_DEV_MAX);
		return;
	}

	pmu = box->pmu;
	if (WARN_ON_ONCE(phys_id != box->phys_id))
		return;

	pci_set_drvdata(pdev, NULL);

	raw_spin_lock(&uncore_box_lock);
	list_del(&box->list);
	if (list_empty(&pmu->box_list))
		last_box = true;
	raw_spin_unlock(&uncore_box_lock);

	for_each_possible_cpu(cpu) {
		if (*per_cpu_ptr(pmu->box, cpu) == box) {
			*per_cpu_ptr(pmu->box, cpu) = NULL;
			atomic_dec(&box->refcnt);
		}
	}

	WARN_ON_ONCE(atomic_read(&box->refcnt) != 1);
	kfree(box);

	if (last_box)
		perf_pmu_unregister(&pmu->pmu);
}

static int __init uncore_pci_init(void)
{
	int ret;

	switch (boot_cpu_data.x86_model) {
	case 45: /* Sandy Bridge-EP */
		ret = snbep_uncore_pci_init();
		break;
	case 62: /* Ivy Bridge-EP */
		ret = ivbep_uncore_pci_init();
		break;
	case 63: /* Haswell-EP */
		ret = hswep_uncore_pci_init();
		break;
	case 42: /* Sandy Bridge */
		ret = snb_uncore_pci_init();
		break;
	case 58: /* Ivy Bridge */
		ret = ivb_uncore_pci_init();
		break;
	case 60: /* Haswell */
	case 69: /* Haswell Celeron */
		ret = hsw_uncore_pci_init();
		break;
	default:
		return 0;
	}

	if (ret)
		return ret;

	ret = uncore_types_init(uncore_pci_uncores);
	if (ret)
		return ret;

	uncore_pci_driver->probe = uncore_pci_probe;
	uncore_pci_driver->remove = uncore_pci_remove;

	ret = pci_register_driver(uncore_pci_driver);
	if (ret == 0)
		pcidrv_registered = true;
	else
		uncore_types_exit(uncore_pci_uncores);

	return ret;
}

static void __init uncore_pci_exit(void)
{
	if (pcidrv_registered) {
		pcidrv_registered = false;
		pci_unregister_driver(uncore_pci_driver);
		uncore_types_exit(uncore_pci_uncores);
	}
}

/* CPU hot plug/unplug are serialized by cpu_add_remove_lock mutex */
static LIST_HEAD(boxes_to_free);

static void uncore_kfree_boxes(void)
{
	struct intel_uncore_box *box;

	while (!list_empty(&boxes_to_free)) {
		box = list_entry(boxes_to_free.next,
				 struct intel_uncore_box, list);
		list_del(&box->list);
		kfree(box);
	}
}

static void uncore_cpu_dying(int cpu)
{
	struct intel_uncore_type *type;
	struct intel_uncore_pmu *pmu;
	struct intel_uncore_box *box;
	int i, j;

	for (i = 0; uncore_msr_uncores[i]; i++) {
		type = uncore_msr_uncores[i];
		for (j = 0; j < type->num_boxes; j++) {
			pmu = &type->pmus[j];
			box = *per_cpu_ptr(pmu->box, cpu);
			*per_cpu_ptr(pmu->box, cpu) = NULL;
			if (box && atomic_dec_and_test(&box->refcnt))
				list_add(&box->list, &boxes_to_free);
		}
	}
}

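/*
 * A cpu is coming online: initialise its boxes, reusing an existing
 * box from another online cpu in the same physical package where one
 * already exists.
 */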
static int uncore_cpu_starting(int cpu)
{
	struct intel_uncore_type *type;
	struct intel_uncore_pmu *pmu;
	struct intel_uncore_box *box, *exist;
	int i, j, k, phys_id;

	phys_id = topology_physical_package_id(cpu);

	for (i = 0; uncore_msr_uncores[i]; i++) {
		type = uncore_msr_uncores[i];
		for (j = 0; j < type->num_boxes; j++) {
			pmu = &type->pmus[j];
			box = *per_cpu_ptr(pmu->box, cpu);
			/* called by uncore_cpu_init? */
			if (box && box->phys_id >= 0) {
				uncore_box_init(box);
				continue;
			}

			for_each_online_cpu(k) {
				exist = *per_cpu_ptr(pmu->box, k);
				if (exist && exist->phys_id == phys_id) {
					atomic_inc(&exist->refcnt);
					*per_cpu_ptr(pmu->box, cpu) = exist;
					if (box) {
						list_add(&box->list,
							 &boxes_to_free);
						box = NULL;
					}
					break;
				}
			}

			if (box) {
				box->phys_id = phys_id;
				uncore_box_init(box);
			}
		}
	}
	return 0;
}

static int uncore_cpu_prepare(int cpu, int phys_id)
{
	struct intel_uncore_type *type;
	struct intel_uncore_pmu *pmu;
	struct intel_uncore_box *box;
	int i, j;

	for (i = 0; uncore_msr_uncores[i]; i++) {
		type = uncore_msr_uncores[i];
		for (j = 0; j < type->num_boxes; j++) {
			pmu = &type->pmus[j];
			if (pmu->func_id < 0)
				pmu->func_id = j;

			box = uncore_alloc_box(type, cpu_to_node(cpu));
			if (!box)
				return -ENOMEM;

			box->pmu = pmu;
			box->phys_id = phys_id;
			*per_cpu_ptr(pmu->box, cpu) = box;
		}
	}
	return 0;
}

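/*
 * Move the uncore boxes' collecting cpu from @old_cpu to @new_cpu,
 * migrating any active perf context along with it. old_cpu < 0 means
 * the boxes had no owner yet; new_cpu < 0 leaves them without one.
 */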
static void
uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_cpu)
{
	struct intel_uncore_type *type;
	struct intel_uncore_pmu *pmu;
	struct intel_uncore_box *box;
	int i, j;

	for (i = 0; uncores[i]; i++) {
		type = uncores[i];
		for (j = 0; j < type->num_boxes; j++) {
			pmu = &type->pmus[j];
			if (old_cpu < 0)
				box = uncore_pmu_to_box(pmu, new_cpu);
			else
				box = uncore_pmu_to_box(pmu, old_cpu);
			if (!box)
				continue;

			if (old_cpu < 0) {
				WARN_ON_ONCE(box->cpu != -1);
				box->cpu = new_cpu;
				continue;
			}

			WARN_ON_ONCE(box->cpu != old_cpu);
			if (new_cpu >= 0) {
				uncore_pmu_cancel_hrtimer(box);
				perf_pmu_migrate_context(&pmu->pmu,
						old_cpu, new_cpu);
				box->cpu = new_cpu;
			} else {
				box->cpu = -1;
			}
		}
	}
}

static void uncore_event_exit_cpu(int cpu)
{
	int i, phys_id, target;

	/* if exiting cpu is used for collecting uncore events */
	if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask))
		return;

	/* find a new cpu to collect uncore events */
	phys_id = topology_physical_package_id(cpu);
	target = -1;
	for_each_online_cpu(i) {
		if (i == cpu)
			continue;
		if (phys_id == topology_physical_package_id(i)) {
			target = i;
			break;
		}
	}

	/* migrate uncore events to the new cpu */
	if (target >= 0)
		cpumask_set_cpu(target, &uncore_cpu_mask);

	uncore_change_context(uncore_msr_uncores, cpu, target);
	uncore_change_context(uncore_pci_uncores, cpu, target);
}

static void uncore_event_init_cpu(int cpu)
{
	int i, phys_id;

	phys_id = topology_physical_package_id(cpu);
	for_each_cpu(i, &uncore_cpu_mask) {
		if (phys_id == topology_physical_package_id(i))
			return;
	}

	cpumask_set_cpu(cpu, &uncore_cpu_mask);

	uncore_change_context(uncore_msr_uncores, -1, cpu);
	uncore_change_context(uncore_pci_uncores, -1, cpu);
}

static int uncore_cpu_notifier(struct notifier_block *self,
			       unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;

	/* allocate/free data structure for uncore box */
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		uncore_cpu_prepare(cpu, -1);
		break;
	case CPU_STARTING:
		uncore_cpu_starting(cpu);
		break;
	case CPU_UP_CANCELED:
	case CPU_DYING:
		uncore_cpu_dying(cpu);
		break;
	case CPU_ONLINE:
	case CPU_DEAD:
		uncore_kfree_boxes();
		break;
	default:
		break;
	}

	/* select the cpu that collects uncore events */
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DOWN_FAILED:
	case CPU_STARTING:
		uncore_event_init_cpu(cpu);
		break;
	case CPU_DOWN_PREPARE:
		uncore_event_exit_cpu(cpu);
		break;
	default:
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block uncore_cpu_nb = {
	.notifier_call	= uncore_cpu_notifier,
	/*
	 * to migrate uncore events, our notifier should be executed
	 * before perf core's notifier.
	 */
	.priority	= CPU_PRI_PERF + 1,
};

static void __init uncore_cpu_setup(void *dummy)
{
	uncore_cpu_starting(smp_processor_id());
}

static int __init uncore_cpu_init(void)
{
	int ret;

	switch (boot_cpu_data.x86_model) {
	case 26: /* Nehalem */
	case 30:
	case 37: /* Westmere */
	case 44:
		nhm_uncore_cpu_init();
		break;
	case 42: /* Sandy Bridge */
	case 58: /* Ivy Bridge */
		snb_uncore_cpu_init();
		break;
	case 45: /* Sandy Bridge-EP */
		snbep_uncore_cpu_init();
		break;
	case 46: /* Nehalem-EX */
	case 47: /* Westmere-EX aka. Xeon E7 */
		nhmex_uncore_cpu_init();
		break;
	case 62: /* Ivy Bridge-EP */
		ivbep_uncore_cpu_init();
		break;
	case 63: /* Haswell-EP */
		hswep_uncore_cpu_init();
		break;
	default:
		return 0;
	}

	ret = uncore_types_init(uncore_msr_uncores);
	if (ret)
		return ret;

	return 0;
}

static int __init uncore_pmus_register(void)
{
	struct intel_uncore_pmu *pmu;
	struct intel_uncore_type *type;
	int i, j;

	for (i = 0; uncore_msr_uncores[i]; i++) {
		type = uncore_msr_uncores[i];
		for (j = 0; j < type->num_boxes; j++) {
			pmu = &type->pmus[j];
			uncore_pmu_register(pmu);
		}
	}

	return 0;
}

static void __init uncore_cpumask_init(void)
{
	int cpu;

	/*
	 * only invoked once, from either the msr or pci init code
	 */
	if (!cpumask_empty(&uncore_cpu_mask))
		return;

	cpu_notifier_register_begin();

	for_each_online_cpu(cpu) {
		int i, phys_id = topology_physical_package_id(cpu);

		for_each_cpu(i, &uncore_cpu_mask) {
			if (phys_id == topology_physical_package_id(i)) {
				phys_id = -1;
				break;
			}
		}
		if (phys_id < 0)
			continue;

		uncore_cpu_prepare(cpu, phys_id);
		uncore_event_init_cpu(cpu);
	}
	on_each_cpu(uncore_cpu_setup, NULL, 1);

	__register_cpu_notifier(&uncore_cpu_nb);

	cpu_notifier_register_done();
}


static int __init intel_uncore_init(void)
{
	int ret;

	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return -ENODEV;

	if (cpu_has_hypervisor)
		return -ENODEV;

	ret = uncore_pci_init();
	if (ret)
		goto fail;
	ret = uncore_cpu_init();
	if (ret) {
		uncore_pci_exit();
		goto fail;
	}
	uncore_cpumask_init();

	uncore_pmus_register();
	return 0;
fail:
	return ret;
}
device_initcall(intel_uncore_init);