/*
 * VFIO PCI interrupt handling
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/device.h>
#include <linux/interrupt.h>
#include <linux/eventfd.h>
#include <linux/msi.h>
#include <linux/pci.h>
#include <linux/file.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/slab.h>

#include "vfio_pci_private.h"

/*
 * INTx
 */
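/*
 * INTx is virtualized: ctx[0].masked tracks the mask state presented to
 * the user, and the physical line follows it either through the PCI 2.3
 * DisINTx bit (vdev->pci_2_3) or by disabling the host IRQ.  An asserted,
 * masked interrupt is reported to userspace via the ctx[0].trigger eventfd.
 */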
static void vfio_send_intx_eventfd(void *opaque, void *unused)
{
	struct vfio_pci_device *vdev = opaque;

	if (likely(is_intx(vdev) && !vdev->virq_disabled))
		eventfd_signal(vdev->ctx[0].trigger, 1);
}

void vfio_pci_intx_mask(struct vfio_pci_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	unsigned long flags;

	spin_lock_irqsave(&vdev->irqlock, flags);

	/*
	 * Masking can come from interrupt, ioctl, or config space
	 * via INTx disable.  The latter means this can get called
	 * even when not using intx delivery.  In this case, just
	 * try to have the physical bit follow the virtual bit.
	 */
	if (unlikely(!is_intx(vdev))) {
		if (vdev->pci_2_3)
			pci_intx(pdev, 0);
	} else if (!vdev->ctx[0].masked) {
		/*
		 * Can't use check_and_mask here because we always want to
		 * mask, not just when something is pending.
		 */
		if (vdev->pci_2_3)
			pci_intx(pdev, 0);
		else
			disable_irq_nosync(pdev->irq);

		vdev->ctx[0].masked = true;
	}

	spin_unlock_irqrestore(&vdev->irqlock, flags);
}

/*
 * If this is triggered by an eventfd, we can't call eventfd_signal
 * or else we'll deadlock on the eventfd wait queue.  Return >0 when
 * a signal is necessary, which can then be handled via a work queue
 * or directly depending on the caller.
 */
static int vfio_pci_intx_unmask_handler(void *opaque, void *unused)
{
	struct vfio_pci_device *vdev = opaque;
	struct pci_dev *pdev = vdev->pdev;
	unsigned long flags;
	int ret = 0;

	spin_lock_irqsave(&vdev->irqlock, flags);

	/*
	 * Unmasking comes from ioctl or config, so again, have the
	 * physical bit follow the virtual even when not using INTx.
	 */
	if (unlikely(!is_intx(vdev))) {
		if (vdev->pci_2_3)
			pci_intx(pdev, 1);
	} else if (vdev->ctx[0].masked && !vdev->virq_disabled) {
		/*
		 * A pending interrupt here would immediately trigger,
		 * but we can avoid that overhead by just re-sending
		 * the interrupt to the user.
		 */
		if (vdev->pci_2_3) {
			if (!pci_check_and_unmask_intx(pdev))
				ret = 1;
		} else
			enable_irq(pdev->irq);

		vdev->ctx[0].masked = (ret > 0);
	}

	spin_unlock_irqrestore(&vdev->irqlock, flags);

	return ret;
}

void vfio_pci_intx_unmask(struct vfio_pci_device *vdev)
{
	if (vfio_pci_intx_unmask_handler(vdev, NULL) > 0)
		vfio_send_intx_eventfd(vdev, NULL);
}

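/*
 * INTx is level triggered, so the physical line must be masked before the
 * user is signaled: DisINTx via pci_check_and_mask_intx() for PCI 2.3
 * devices (which may share the line), disable_irq_nosync() otherwise.
 * Userspace unmasks again once it has serviced the device.
 */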
static irqreturn_t vfio_intx_handler(int irq, void *dev_id)
{
	struct vfio_pci_device *vdev = dev_id;
	unsigned long flags;
	int ret = IRQ_NONE;

	spin_lock_irqsave(&vdev->irqlock, flags);

	if (!vdev->pci_2_3) {
		disable_irq_nosync(vdev->pdev->irq);
		vdev->ctx[0].masked = true;
		ret = IRQ_HANDLED;
	} else if (!vdev->ctx[0].masked &&  /* may be shared */
		   pci_check_and_mask_intx(vdev->pdev)) {
		vdev->ctx[0].masked = true;
		ret = IRQ_HANDLED;
	}

	spin_unlock_irqrestore(&vdev->irqlock, flags);

	if (ret == IRQ_HANDLED)
		vfio_send_intx_eventfd(vdev, NULL);

	return ret;
}

static int vfio_intx_enable(struct vfio_pci_device *vdev)
{
	if (!is_irq_none(vdev))
		return -EINVAL;

	if (!vdev->pdev->irq)
		return -ENODEV;

	vdev->ctx = kzalloc(sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL);
	if (!vdev->ctx)
		return -ENOMEM;

	vdev->num_ctx = 1;

	/*
	 * If the virtual interrupt is masked, restore it.  Devices
	 * supporting DisINTx can be masked at the hardware level
	 * here, non-PCI-2.3 devices will have to wait until the
	 * interrupt is enabled.
	 */
	vdev->ctx[0].masked = vdev->virq_disabled;
	if (vdev->pci_2_3)
		pci_intx(vdev->pdev, !vdev->ctx[0].masked);

	vdev->irq_type = VFIO_PCI_INTX_IRQ_INDEX;

	return 0;
}

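/*
 * Tear down any existing trigger, then (for fd >= 0) attach a new eventfd
 * and request the host IRQ.  IRQF_SHARED is only used for PCI 2.3 devices,
 * since only those can identify and mask their own interrupt on a shared
 * line.
 */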
static int vfio_intx_set_signal(struct vfio_pci_device *vdev, int fd)
{
	struct pci_dev *pdev = vdev->pdev;
	unsigned long irqflags = IRQF_SHARED;
	struct eventfd_ctx *trigger;
	unsigned long flags;
	int ret;

	if (vdev->ctx[0].trigger) {
		free_irq(pdev->irq, vdev);
		kfree(vdev->ctx[0].name);
		eventfd_ctx_put(vdev->ctx[0].trigger);
		vdev->ctx[0].trigger = NULL;
	}

	if (fd < 0) /* Disable only */
		return 0;

	vdev->ctx[0].name = kasprintf(GFP_KERNEL, "vfio-intx(%s)",
				      pci_name(pdev));
	if (!vdev->ctx[0].name)
		return -ENOMEM;

	trigger = eventfd_ctx_fdget(fd);
	if (IS_ERR(trigger)) {
		kfree(vdev->ctx[0].name);
		return PTR_ERR(trigger);
	}

	vdev->ctx[0].trigger = trigger;

	if (!vdev->pci_2_3)
		irqflags = 0;

	ret = request_irq(pdev->irq, vfio_intx_handler,
			  irqflags, vdev->ctx[0].name, vdev);
	if (ret) {
		vdev->ctx[0].trigger = NULL;
		kfree(vdev->ctx[0].name);
		eventfd_ctx_put(trigger);
		return ret;
	}

	/*
	 * INTx disable will stick across the new irq setup,
	 * disable_irq won't.
	 */
	spin_lock_irqsave(&vdev->irqlock, flags);
	if (!vdev->pci_2_3 && vdev->ctx[0].masked)
		disable_irq_nosync(pdev->irq);
	spin_unlock_irqrestore(&vdev->irqlock, flags);

	return 0;
}

static void vfio_intx_disable(struct vfio_pci_device *vdev)
{
	vfio_intx_set_signal(vdev, -1);
	vfio_virqfd_disable(&vdev->ctx[0].unmask);
	vfio_virqfd_disable(&vdev->ctx[0].mask);
	vdev->irq_type = VFIO_PCI_NUM_IRQS;
	vdev->num_ctx = 0;
	kfree(vdev->ctx);
}

/*
 * MSI/MSI-X
 */
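/*
 * MSI and MSI-X are edge triggered and never shared, so there is no mask
 * state to virtualize here (see the XXX note on masking support below);
 * each enabled vector simply gets its own eventfd, signaled directly from
 * vfio_msihandler().
 */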
static irqreturn_t vfio_msihandler(int irq, void *arg)
{
	struct eventfd_ctx *trigger = arg;

	eventfd_signal(trigger, 1);
	return IRQ_HANDLED;
}

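/*
 * Allocate per-vector context and enable MSI or MSI-X on the device.  The
 * caller asks for exactly nvec vectors; anything less is treated as a
 * failure and the partial enable is unwound.
 */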
static int vfio_msi_enable(struct vfio_pci_device *vdev, int nvec, bool msix)
{
	struct pci_dev *pdev = vdev->pdev;
	int ret;

	if (!is_irq_none(vdev))
		return -EINVAL;

	vdev->ctx = kzalloc(nvec * sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL);
	if (!vdev->ctx)
		return -ENOMEM;

	if (msix) {
		int i;

		vdev->msix = kzalloc(nvec * sizeof(struct msix_entry),
				     GFP_KERNEL);
		if (!vdev->msix) {
			kfree(vdev->ctx);
			return -ENOMEM;
		}

		for (i = 0; i < nvec; i++)
			vdev->msix[i].entry = i;

		ret = pci_enable_msix_range(pdev, vdev->msix, 1, nvec);
		if (ret < nvec) {
			if (ret > 0)
				pci_disable_msix(pdev);
			kfree(vdev->msix);
			kfree(vdev->ctx);
			return ret;
		}
	} else {
		ret = pci_enable_msi_range(pdev, 1, nvec);
		if (ret < nvec) {
			if (ret > 0)
				pci_disable_msi(pdev);
			kfree(vdev->ctx);
			return ret;
		}
	}

	vdev->num_ctx = nvec;
	vdev->irq_type = msix ? VFIO_PCI_MSIX_IRQ_INDEX :
				VFIO_PCI_MSI_IRQ_INDEX;

	if (!msix) {
		/*
		 * Compute the virtual hardware field for max MSI vectors -
		 * it is the ceiling of log base 2 of the number of vectors.
		 */
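		/*
		 * For example, nvec = 3 gives fls(5) - 1 = 2, so the
		 * virtual max-vectors field advertises up to 2^2 = 4
		 * vectors.
		 */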
		vdev->msi_qmax = fls(nvec * 2 - 1) - 1;
	}

	return 0;
}

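/*
 * Attach or detach the eventfd trigger for a single MSI/MSI-X vector.
 * Passing fd < 0 releases the current trigger and its IRQ; otherwise the
 * vector's IRQ is requested with the eventfd context as its cookie so the
 * handler can signal it directly.
 */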
static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
				      int vector, int fd, bool msix)
{
	struct pci_dev *pdev = vdev->pdev;
	int irq = msix ? vdev->msix[vector].vector : pdev->irq + vector;
	char *name = msix ? "vfio-msix" : "vfio-msi";
	struct eventfd_ctx *trigger;
	int ret;

	if (vector >= vdev->num_ctx)
		return -EINVAL;

	if (vdev->ctx[vector].trigger) {
		free_irq(irq, vdev->ctx[vector].trigger);
		kfree(vdev->ctx[vector].name);
		eventfd_ctx_put(vdev->ctx[vector].trigger);
		vdev->ctx[vector].trigger = NULL;
	}

	if (fd < 0)
		return 0;

	vdev->ctx[vector].name = kasprintf(GFP_KERNEL, "%s[%d](%s)",
					   name, vector, pci_name(pdev));
	if (!vdev->ctx[vector].name)
		return -ENOMEM;

	trigger = eventfd_ctx_fdget(fd);
	if (IS_ERR(trigger)) {
		kfree(vdev->ctx[vector].name);
		return PTR_ERR(trigger);
	}

	/*
	 * The MSIx vector table resides in device memory which may be cleared
	 * via backdoor resets. We don't allow direct access to the vector
	 * table so even if a userspace driver attempts to save/restore around
	 * such a reset it would be unsuccessful. To avoid this, restore the
	 * cached value of the message prior to enabling.
	 */
	if (msix) {
		struct msi_msg msg;

		get_cached_msi_msg(irq, &msg);
		pci_write_msi_msg(irq, &msg);
	}

	ret = request_irq(irq, vfio_msihandler, 0,
			  vdev->ctx[vector].name, trigger);
	if (ret) {
		kfree(vdev->ctx[vector].name);
		eventfd_ctx_put(trigger);
		return ret;
	}

	vdev->ctx[vector].trigger = trigger;

	return 0;
}

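/*
 * Program a contiguous block of vectors.  On any failure, the vectors set
 * up so far in this call are rolled back (signal removed) before the error
 * is returned.
 */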
static int vfio_msi_set_block(struct vfio_pci_device *vdev, unsigned start,
			      unsigned count, int32_t *fds, bool msix)
{
	int i, j, ret = 0;

	if (start >= vdev->num_ctx || start + count > vdev->num_ctx)
		return -EINVAL;

	for (i = 0, j = start; i < count && !ret; i++, j++) {
		int fd = fds ? fds[i] : -1;
		ret = vfio_msi_set_vector_signal(vdev, j, fd, msix);
	}

	if (ret) {
		/* Unwind; cast start so the comparison stays signed. */
		for (--j; j >= (int)start; j--)
			vfio_msi_set_vector_signal(vdev, j, -1, msix);
	}

	return ret;
}

static void vfio_msi_disable(struct vfio_pci_device *vdev, bool msix)
{
	struct pci_dev *pdev = vdev->pdev;
	int i;

	vfio_msi_set_block(vdev, 0, vdev->num_ctx, NULL, msix);

	for (i = 0; i < vdev->num_ctx; i++) {
		vfio_virqfd_disable(&vdev->ctx[i].unmask);
		vfio_virqfd_disable(&vdev->ctx[i].mask);
	}

	if (msix) {
		pci_disable_msix(vdev->pdev);
		kfree(vdev->msix);
	} else
		pci_disable_msi(pdev);

	vdev->irq_type = VFIO_PCI_NUM_IRQS;
	vdev->num_ctx = 0;
	kfree(vdev->ctx);
}

/*
 * IOCTL support
 */
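/*
 * The handlers below are reached from the VFIO_DEVICE_SET_IRQS ioctl via
 * vfio_pci_set_irqs_ioctl() at the bottom of this file.  As an
 * illustrative, untested userspace sketch (device_fd is a placeholder for
 * the VFIO device file descriptor), wiring an eventfd to INTx looks
 * roughly like:
 *
 *	struct vfio_irq_set *set;
 *	int32_t fd = eventfd(0, EFD_CLOEXEC);
 *
 *	set = malloc(sizeof(*set) + sizeof(int32_t));
 *	set->argsz = sizeof(*set) + sizeof(int32_t);
 *	set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
 *	set->index = VFIO_PCI_INTX_IRQ_INDEX;
 *	set->start = 0;
 *	set->count = 1;
 *	memcpy(set->data, &fd, sizeof(fd));
 *	ioctl(device_fd, VFIO_DEVICE_SET_IRQS, set);
 */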
static int vfio_pci_set_intx_unmask(struct vfio_pci_device *vdev,
				    unsigned index, unsigned start,
				    unsigned count, uint32_t flags, void *data)
{
	if (!is_intx(vdev) || start != 0 || count != 1)
		return -EINVAL;

	if (flags & VFIO_IRQ_SET_DATA_NONE) {
		vfio_pci_intx_unmask(vdev);
	} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
		uint8_t unmask = *(uint8_t *)data;
		if (unmask)
			vfio_pci_intx_unmask(vdev);
	} else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
		int32_t fd = *(int32_t *)data;
		if (fd >= 0)
			return vfio_virqfd_enable((void *) vdev,
						  vfio_pci_intx_unmask_handler,
						  vfio_send_intx_eventfd, NULL,
						  &vdev->ctx[0].unmask, fd);

		vfio_virqfd_disable(&vdev->ctx[0].unmask);
	}

	return 0;
}

static int vfio_pci_set_intx_mask(struct vfio_pci_device *vdev,
				  unsigned index, unsigned start,
				  unsigned count, uint32_t flags, void *data)
{
	if (!is_intx(vdev) || start != 0 || count != 1)
		return -EINVAL;

	if (flags & VFIO_IRQ_SET_DATA_NONE) {
		vfio_pci_intx_mask(vdev);
	} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
		uint8_t mask = *(uint8_t *)data;
		if (mask)
			vfio_pci_intx_mask(vdev);
	} else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
		return -ENOTTY; /* XXX implement me */
	}

	return 0;
}

static int vfio_pci_set_intx_trigger(struct vfio_pci_device *vdev,
				     unsigned index, unsigned start,
				     unsigned count, uint32_t flags, void *data)
{
	if (is_intx(vdev) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) {
		vfio_intx_disable(vdev);
		return 0;
	}

	if (!(is_intx(vdev) || is_irq_none(vdev)) || start != 0 || count != 1)
		return -EINVAL;

	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
		int32_t fd = *(int32_t *)data;
		int ret;

		if (is_intx(vdev))
			return vfio_intx_set_signal(vdev, fd);

		ret = vfio_intx_enable(vdev);
		if (ret)
			return ret;

		ret = vfio_intx_set_signal(vdev, fd);
		if (ret)
			vfio_intx_disable(vdev);

		return ret;
	}

	if (!is_intx(vdev))
		return -EINVAL;

	if (flags & VFIO_IRQ_SET_DATA_NONE) {
		vfio_send_intx_eventfd(vdev, NULL);
	} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
		uint8_t trigger = *(uint8_t *)data;
		if (trigger)
			vfio_send_intx_eventfd(vdev, NULL);
	}
	return 0;
}

static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev,
				    unsigned index, unsigned start,
				    unsigned count, uint32_t flags, void *data)
{
	int i;
	bool msix = (index == VFIO_PCI_MSIX_IRQ_INDEX);

	if (irq_is(vdev, index) && !count && (flags & VFIO_IRQ_SET_DATA_NONE)) {
		vfio_msi_disable(vdev, msix);
		return 0;
	}

	if (!(irq_is(vdev, index) || is_irq_none(vdev)))
		return -EINVAL;

	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
		int32_t *fds = data;
		int ret;

		if (vdev->irq_type == index)
			return vfio_msi_set_block(vdev, start, count,
						  fds, msix);

		ret = vfio_msi_enable(vdev, start + count, msix);
		if (ret)
			return ret;

		ret = vfio_msi_set_block(vdev, start, count, fds, msix);
		if (ret)
			vfio_msi_disable(vdev, msix);

		return ret;
	}

	if (!irq_is(vdev, index) || start + count > vdev->num_ctx)
		return -EINVAL;

	for (i = start; i < start + count; i++) {
		if (!vdev->ctx[i].trigger)
			continue;
		if (flags & VFIO_IRQ_SET_DATA_NONE) {
			eventfd_signal(vdev->ctx[i].trigger, 1);
		} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
			uint8_t *bools = data;
			if (bools[i - start])
				eventfd_signal(vdev->ctx[i].trigger, 1);
		}
	}
	return 0;
}

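/*
 * Manage a single eventfd context, as used by the ERR and REQ indexes
 * below: DATA_EVENTFD attaches or (fd == -1) detaches the trigger, while
 * DATA_NONE/DATA_BOOL simply fire the current one for loopback testing.
 */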
static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx,
					   uint32_t flags, void *data)
{
	int32_t fd;

	if (!(flags & VFIO_IRQ_SET_DATA_TYPE_MASK))
		return -EINVAL;

	/* DATA_NONE/DATA_BOOL enables loopback testing */
	if (flags & VFIO_IRQ_SET_DATA_NONE) {
		if (*ctx)
			eventfd_signal(*ctx, 1);
		return 0;
	} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
		uint8_t trigger = *(uint8_t *)data;

		if (trigger && *ctx)
			eventfd_signal(*ctx, 1);
		return 0;
	}

	/*
	 * Handle SET_DATA_EVENTFD.  Only dereference the user buffer once
	 * we know an eventfd payload is present; for DATA_NONE there is no
	 * data at all.
	 */
	fd = *(int32_t *)data;
	if (fd == -1) {
		if (*ctx)
			eventfd_ctx_put(*ctx);
		*ctx = NULL;
		return 0;
	} else if (fd >= 0) {
		struct eventfd_ctx *efdctx;

		efdctx = eventfd_ctx_fdget(fd);
		if (IS_ERR(efdctx))
			return PTR_ERR(efdctx);
		if (*ctx)
			eventfd_ctx_put(*ctx);
		*ctx = efdctx;
		return 0;
	} else
		return -EINVAL;
}

static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev,
				    unsigned index, unsigned start,
				    unsigned count, uint32_t flags, void *data)
{
	if (index != VFIO_PCI_ERR_IRQ_INDEX)
		return -EINVAL;

	/*
	 * We should sanitize start & count, but that wasn't caught
	 * originally, so this IRQ index must forever ignore them :-(
	 */

	return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger, flags, data);
}

static int vfio_pci_set_req_trigger(struct vfio_pci_device *vdev,
				    unsigned index, unsigned start,
				    unsigned count, uint32_t flags, void *data)
{
	if (index != VFIO_PCI_REQ_IRQ_INDEX || start != 0 || count != 1)
		return -EINVAL;

	return vfio_pci_set_ctx_trigger_single(&vdev->req_trigger, flags, data);
}

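/*
 * Entry point for the VFIO_DEVICE_SET_IRQS ioctl: pick the handler for the
 * requested index and action, then hand it the validated arguments.
 */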
int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
			    unsigned index, unsigned start, unsigned count,
			    void *data)
{
	int (*func)(struct vfio_pci_device *vdev, unsigned index,
		    unsigned start, unsigned count, uint32_t flags,
		    void *data) = NULL;

	switch (index) {
	case VFIO_PCI_INTX_IRQ_INDEX:
		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
		case VFIO_IRQ_SET_ACTION_MASK:
			func = vfio_pci_set_intx_mask;
			break;
		case VFIO_IRQ_SET_ACTION_UNMASK:
			func = vfio_pci_set_intx_unmask;
			break;
		case VFIO_IRQ_SET_ACTION_TRIGGER:
			func = vfio_pci_set_intx_trigger;
			break;
		}
		break;
	case VFIO_PCI_MSI_IRQ_INDEX:
	case VFIO_PCI_MSIX_IRQ_INDEX:
		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
		case VFIO_IRQ_SET_ACTION_MASK:
		case VFIO_IRQ_SET_ACTION_UNMASK:
			/* XXX Need masking support exported */
			break;
		case VFIO_IRQ_SET_ACTION_TRIGGER:
			func = vfio_pci_set_msi_trigger;
			break;
		}
		break;
	case VFIO_PCI_ERR_IRQ_INDEX:
		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
		case VFIO_IRQ_SET_ACTION_TRIGGER:
			if (pci_is_pcie(vdev->pdev))
				func = vfio_pci_set_err_trigger;
			break;
		}
		break;
	case VFIO_PCI_REQ_IRQ_INDEX:
		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
		case VFIO_IRQ_SET_ACTION_TRIGGER:
			func = vfio_pci_set_req_trigger;
			break;
		}
		break;
	}

	if (!func)
		return -ENOTTY;

	return func(vdev, index, start, count, flags, data);
}