/*
 * Kernel-based Virtual Machine - device assignment support
 *
 * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates.
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/errno.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/fs.h>
#include "irq.h"
#include "assigned-dev.h"
#include "trace/events/kvm.h"

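/*
 * Per-device state for a legacy assigned PCI device: host/guest IRQ
 * bookkeeping, MSI-X entry tables, and the saved PCI state used to
 * restore the device on deassignment.
 */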
struct kvm_assigned_dev_kernel {
	struct kvm_irq_ack_notifier ack_notifier;
	struct list_head list;
	int assigned_dev_id;
	int host_segnr;
	int host_busnr;
	int host_devfn;
	unsigned int entries_nr;
	int host_irq;
	bool host_irq_disabled;
	bool pci_2_3;
	struct msix_entry *host_msix_entries;
	int guest_irq;
	struct msix_entry *guest_msix_entries;
	unsigned long irq_requested_type;
	int irq_source_id;
	int flags;
	struct pci_dev *dev;
	struct kvm *kvm;
	spinlock_t intx_lock;
	spinlock_t intx_mask_lock;
	char irq_name[32];
	struct pci_saved_state *pci_saved_state;
};

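/*
 * Look up an assigned device by its userspace-provided id.  All callers
 * hold kvm->lock, which protects the assigned_dev_head list.
 */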
static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
						      int assigned_dev_id)
{
	struct list_head *ptr;
	struct kvm_assigned_dev_kernel *match;

	list_for_each(ptr, head) {
		match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
		if (match->assigned_dev_id == assigned_dev_id)
			return match;
	}
	return NULL;
}

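/*
 * Map a host IRQ number back to the index of the MSI-X entry it was
 * allocated for, or -1 if no entry matches.
 */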
static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
				    *assigned_dev, int irq)
{
	int i, index;
	struct msix_entry *host_msix_entries;

	host_msix_entries = assigned_dev->host_msix_entries;

	index = -1;
	for (i = 0; i < assigned_dev->entries_nr; i++)
		if (irq == host_msix_entries[i].vector) {
			index = i;
			break;
		}
	if (index < 0)
		printk(KERN_WARNING "Failed to find the matching MSI-X entry!\n");

	return index;
}

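/*
 * Hard IRQ handler for PCI 2.3 INTx: mask the interrupt at device level
 * and wake the threaded handler, or report IRQ_NONE if this device was
 * not the source (the line may be shared).
 */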
static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id)
{
	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
	int ret;

	spin_lock(&assigned_dev->intx_lock);
	if (pci_check_and_mask_intx(assigned_dev->dev)) {
		assigned_dev->host_irq_disabled = true;
		ret = IRQ_WAKE_THREAD;
	} else
		ret = IRQ_NONE;
	spin_unlock(&assigned_dev->intx_lock);

	return ret;
}

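/*
 * Inject the interrupt into the guest, unless userspace has masked the
 * virtual INTx line via KVM_ASSIGN_SET_INTX_MASK.
 */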
static void
kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
				 int vector)
{
	if (unlikely(assigned_dev->irq_requested_type &
		     KVM_DEV_IRQ_GUEST_INTX)) {
		spin_lock(&assigned_dev->intx_mask_lock);
		if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
			kvm_set_irq(assigned_dev->kvm,
				    assigned_dev->irq_source_id, vector, 1,
				    false);
		spin_unlock(&assigned_dev->intx_mask_lock);
	} else
		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
			    vector, 1, false);
}

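/*
 * Threaded INTx handler: for non-PCI-2.3 devices the line cannot be
 * masked at device level, so disable the host IRQ until the guest acks.
 */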
static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
{
	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;

	if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
		spin_lock_irq(&assigned_dev->intx_lock);
		disable_irq_nosync(irq);
		assigned_dev->host_irq_disabled = true;
		spin_unlock_irq(&assigned_dev->intx_lock);
	}

	kvm_assigned_dev_raise_guest_irq(assigned_dev,
					 assigned_dev->guest_irq);

	return IRQ_HANDLED;
}

/*
 * Deliver an IRQ in an atomic context if we can, or return a failure so
 * the caller can retry from process context.
 * Return value:
 *  -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
 *  Other values - No need to retry.
 */
static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq,
				int level)
{
	struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
	struct kvm_kernel_irq_routing_entry *e;
	int ret = -EINVAL;
	int idx;

	trace_kvm_set_irq(irq, level, irq_source_id);

	/*
	 * Injection into either PIC or IOAPIC might need to scan all CPUs,
	 * which would need to be retried from thread context; when the same
	 * GSI is connected to both PIC and IOAPIC, we'd have to report a
	 * partial failure here.
	 * Since there's no easy way to do this, we only support injecting MSI,
	 * which is limited to a 1:1 GSI mapping.
	 */
	idx = srcu_read_lock(&kvm->irq_srcu);
	if (kvm_irq_map_gsi(kvm, entries, irq) > 0) {
		e = &entries[0];
		ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id,
						irq, level);
	}
	srcu_read_unlock(&kvm->irq_srcu, idx);
	return ret;
}

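/*
 * MSI hard IRQ handler: try to inject directly from atomic context and
 * fall back to the threaded handler only if that would block.
 */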
static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
{
	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
	int ret = kvm_set_irq_inatomic(assigned_dev->kvm,
				       assigned_dev->irq_source_id,
				       assigned_dev->guest_irq, 1);
	return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
}

static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
{
	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;

	kvm_assigned_dev_raise_guest_irq(assigned_dev,
					 assigned_dev->guest_irq);

	return IRQ_HANDLED;
}

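/*
 * MSI-X hard IRQ handler: translate the host vector to the guest vector
 * configured via KVM_ASSIGN_SET_MSIX_ENTRY, then inject as for MSI.
 */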
static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
{
	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
	int index = find_index_from_host_irq(assigned_dev, irq);
	u32 vector;
	int ret = 0;

	if (index >= 0) {
		vector = assigned_dev->guest_msix_entries[index].vector;
		ret = kvm_set_irq_inatomic(assigned_dev->kvm,
					   assigned_dev->irq_source_id,
					   vector, 1);
	}

	return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
}

static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
{
	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
	int index = find_index_from_host_irq(assigned_dev, irq);
	u32 vector;

	if (index >= 0) {
		vector = assigned_dev->guest_msix_entries[index].vector;
		kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
	}

	return IRQ_HANDLED;
}

/* Ack the irq line for an assigned device */
static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
{
	struct kvm_assigned_dev_kernel *dev =
		container_of(kian, struct kvm_assigned_dev_kernel,
			     ack_notifier);

	kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0, false);

	spin_lock(&dev->intx_mask_lock);

	if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
		bool reassert = false;

		spin_lock_irq(&dev->intx_lock);
		/*
		 * The guest IRQ may be shared so this ack can come from an
		 * IRQ for another guest device.
		 */
		if (dev->host_irq_disabled) {
			if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
				enable_irq(dev->host_irq);
			else if (!pci_check_and_unmask_intx(dev->dev))
				reassert = true;
			dev->host_irq_disabled = reassert;
		}
		spin_unlock_irq(&dev->intx_lock);

		if (reassert)
			kvm_set_irq(dev->kvm, dev->irq_source_id,
				    dev->guest_irq, 1, false);
	}

	spin_unlock(&dev->intx_mask_lock);
}

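/*
 * Tear down the guest side of the interrupt: lower the line, unregister
 * the ack notifier and release the IRQ source id.
 */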
static void deassign_guest_irq(struct kvm *kvm,
			       struct kvm_assigned_dev_kernel *assigned_dev)
{
	if (assigned_dev->ack_notifier.gsi != -1)
		kvm_unregister_irq_ack_notifier(kvm,
						&assigned_dev->ack_notifier);

	kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
		    assigned_dev->guest_irq, 0, false);

	if (assigned_dev->irq_source_id != -1)
		kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
	assigned_dev->irq_source_id = -1;
	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
}

/* The function implicitly holds the kvm->lock mutex due to cancel_work_sync() */
static void deassign_host_irq(struct kvm *kvm,
			      struct kvm_assigned_dev_kernel *assigned_dev)
{
	/*
	 * We disable the irq here to prevent further events.
	 *
	 * Note that this may result in a nested disable if the interrupt type
	 * is INTx, but that is fine since we are going to free it anyway.
	 *
	 * If this function is called as part of VM destruction, make sure the
	 * kvm state is still valid at this point, since we may also have to
	 * wait on a currently running IRQ handler.
	 */
	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
		int i;
		for (i = 0; i < assigned_dev->entries_nr; i++)
			disable_irq(assigned_dev->host_msix_entries[i].vector);

		for (i = 0; i < assigned_dev->entries_nr; i++)
			free_irq(assigned_dev->host_msix_entries[i].vector,
				 assigned_dev);

		assigned_dev->entries_nr = 0;
		kfree(assigned_dev->host_msix_entries);
		kfree(assigned_dev->guest_msix_entries);
		pci_disable_msix(assigned_dev->dev);
	} else {
		/* Deal with MSI and INTx */
		if ((assigned_dev->irq_requested_type &
		     KVM_DEV_IRQ_HOST_INTX) &&
		    (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
			spin_lock_irq(&assigned_dev->intx_lock);
			pci_intx(assigned_dev->dev, false);
			spin_unlock_irq(&assigned_dev->intx_lock);
			synchronize_irq(assigned_dev->host_irq);
		} else
			disable_irq(assigned_dev->host_irq);

		free_irq(assigned_dev->host_irq, assigned_dev);

		if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
			pci_disable_msi(assigned_dev->dev);
	}

	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
}

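/*
 * Deassign the host and/or guest halves of a device interrupt, as
 * selected by irq_requested_type.
 */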
static int kvm_deassign_irq(struct kvm *kvm,
			    struct kvm_assigned_dev_kernel *assigned_dev,
			    unsigned long irq_requested_type)
{
	unsigned long guest_irq_type, host_irq_type;

	if (!irqchip_in_kernel(kvm))
		return -EINVAL;
	/* no irq assignment to deassign */
	if (!assigned_dev->irq_requested_type)
		return -ENXIO;

	host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
	guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;

	if (host_irq_type)
		deassign_host_irq(kvm, assigned_dev);
	if (guest_irq_type)
		deassign_guest_irq(kvm, assigned_dev);

	return 0;
}

static void kvm_free_assigned_irq(struct kvm *kvm,
				  struct kvm_assigned_dev_kernel *assigned_dev)
{
	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
}

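/*
 * Fully release an assigned device: free its IRQs, restore the saved
 * PCI state and hand the device back to the host.
 */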
static void kvm_free_assigned_device(struct kvm *kvm,
				     struct kvm_assigned_dev_kernel
				     *assigned_dev)
{
	kvm_free_assigned_irq(kvm, assigned_dev);

	pci_reset_function(assigned_dev->dev);
	if (pci_load_and_free_saved_state(assigned_dev->dev,
					  &assigned_dev->pci_saved_state))
		printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
		       __func__, dev_name(&assigned_dev->dev->dev));
	else
		pci_restore_state(assigned_dev->dev);

	pci_clear_dev_assigned(assigned_dev->dev);

	pci_release_regions(assigned_dev->dev);
	pci_disable_device(assigned_dev->dev);
	pci_dev_put(assigned_dev->dev);

	list_del(&assigned_dev->list);
	kfree(assigned_dev);
}

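/* Release every device still assigned to the VM, e.g. on VM destruction. */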
void kvm_free_all_assigned_devices(struct kvm *kvm)
{
	struct list_head *ptr, *ptr2;
	struct kvm_assigned_dev_kernel *assigned_dev;

	list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
		assigned_dev = list_entry(ptr,
					  struct kvm_assigned_dev_kernel,
					  list);

		kvm_free_assigned_device(kvm, assigned_dev);
	}
}

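/*
 * Request the host INTx line.  For PCI 2.3 capable devices the line is
 * masked at device level in the hard IRQ handler; otherwise a oneshot
 * threaded IRQ is used so the line stays disabled until the threaded
 * handler has run.
 */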
static int assigned_device_enable_host_intx(struct kvm *kvm,
					    struct kvm_assigned_dev_kernel *dev)
{
	irq_handler_t irq_handler;
	unsigned long flags;

	dev->host_irq = dev->dev->irq;

	/*
	 * We can only share the IRQ line with other host devices if we are
	 * able to disable the IRQ source at device-level - independently of
	 * the guest driver. Otherwise host devices may suffer from unbounded
	 * IRQ latencies when the guest keeps the line asserted.
	 */
	if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
		irq_handler = kvm_assigned_dev_intx;
		flags = IRQF_SHARED;
	} else {
		irq_handler = NULL;
		flags = IRQF_ONESHOT;
	}
	if (request_threaded_irq(dev->host_irq, irq_handler,
				 kvm_assigned_dev_thread_intx, flags,
				 dev->irq_name, dev))
		return -EIO;

	if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
		spin_lock_irq(&dev->intx_lock);
		pci_intx(dev->dev, true);
		spin_unlock_irq(&dev->intx_lock);
	}
	return 0;
}

static int assigned_device_enable_host_msi(struct kvm *kvm,
					   struct kvm_assigned_dev_kernel *dev)
{
	int r;

	if (!dev->dev->msi_enabled) {
		r = pci_enable_msi(dev->dev);
		if (r)
			return r;
	}

	dev->host_irq = dev->dev->irq;
	if (request_threaded_irq(dev->host_irq, kvm_assigned_dev_msi,
				 kvm_assigned_dev_thread_msi, 0,
				 dev->irq_name, dev)) {
		pci_disable_msi(dev->dev);
		return -EIO;
	}

	return 0;
}

static int assigned_device_enable_host_msix(struct kvm *kvm,
					    struct kvm_assigned_dev_kernel *dev)
{
	int i, r = -EINVAL;

	/*
	 * host_msix_entries and guest_msix_entries should have been
	 * initialized.
	 */
	if (dev->entries_nr == 0)
		return r;

	r = pci_enable_msix_exact(dev->dev,
				  dev->host_msix_entries, dev->entries_nr);
	if (r)
		return r;

	for (i = 0; i < dev->entries_nr; i++) {
		r = request_threaded_irq(dev->host_msix_entries[i].vector,
					 kvm_assigned_dev_msix,
					 kvm_assigned_dev_thread_msix,
					 0, dev->irq_name, dev);
		if (r)
			goto err;
	}

	return 0;
err:
	for (i -= 1; i >= 0; i--)
		free_irq(dev->host_msix_entries[i].vector, dev);
	pci_disable_msix(dev->dev);
	return r;
}

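/*
 * The guest-side enable helpers only record the guest IRQ and the GSI
 * to be acked; actual injection happens from the host IRQ handlers
 * above.
 */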
static int assigned_device_enable_guest_intx(struct kvm *kvm,
				struct kvm_assigned_dev_kernel *dev,
				struct kvm_assigned_irq *irq)
{
	dev->guest_irq = irq->guest_irq;
	dev->ack_notifier.gsi = irq->guest_irq;
	return 0;
}

static int assigned_device_enable_guest_msi(struct kvm *kvm,
			struct kvm_assigned_dev_kernel *dev,
			struct kvm_assigned_irq *irq)
{
	dev->guest_irq = irq->guest_irq;
	dev->ack_notifier.gsi = -1;
	return 0;
}

static int assigned_device_enable_guest_msix(struct kvm *kvm,
			struct kvm_assigned_dev_kernel *dev,
			struct kvm_assigned_irq *irq)
{
	dev->guest_irq = irq->guest_irq;
	dev->ack_notifier.gsi = -1;
	return 0;
}

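/*
 * Set up the host side of the interrupt (INTx, MSI or MSI-X) for an
 * assigned device.  Only one host IRQ type may be active at a time.
 */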
static int assign_host_irq(struct kvm *kvm,
			   struct kvm_assigned_dev_kernel *dev,
			   __u32 host_irq_type)
{
	int r = -EEXIST;

	if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
		return r;

	snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s",
		 pci_name(dev->dev));

	switch (host_irq_type) {
	case KVM_DEV_IRQ_HOST_INTX:
		r = assigned_device_enable_host_intx(kvm, dev);
		break;
	case KVM_DEV_IRQ_HOST_MSI:
		r = assigned_device_enable_host_msi(kvm, dev);
		break;
	case KVM_DEV_IRQ_HOST_MSIX:
		r = assigned_device_enable_host_msix(kvm, dev);
		break;
	default:
		r = -EINVAL;
	}
	dev->host_irq_disabled = false;

	if (!r)
		dev->irq_requested_type |= host_irq_type;

	return r;
}

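/*
 * Set up the guest side of the interrupt: allocate an IRQ source id,
 * record the guest GSI and register the ack notifier for INTx.
 */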
static int assign_guest_irq(struct kvm *kvm,
			    struct kvm_assigned_dev_kernel *dev,
			    struct kvm_assigned_irq *irq,
			    unsigned long guest_irq_type)
{
	int id;
	int r = -EEXIST;

	if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
		return r;

	id = kvm_request_irq_source_id(kvm);
	if (id < 0)
		return id;

	dev->irq_source_id = id;

	switch (guest_irq_type) {
	case KVM_DEV_IRQ_GUEST_INTX:
		r = assigned_device_enable_guest_intx(kvm, dev, irq);
		break;
	case KVM_DEV_IRQ_GUEST_MSI:
		r = assigned_device_enable_guest_msi(kvm, dev, irq);
		break;
	case KVM_DEV_IRQ_GUEST_MSIX:
		r = assigned_device_enable_guest_msix(kvm, dev, irq);
		break;
	default:
		r = -EINVAL;
	}

	if (!r) {
		dev->irq_requested_type |= guest_irq_type;
		if (dev->ack_notifier.gsi != -1)
			kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
	} else {
		kvm_free_irq_source_id(kvm, dev->irq_source_id);
		dev->irq_source_id = -1;
	}

	return r;
}

/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
				   struct kvm_assigned_irq *assigned_irq)
{
	int r = -EINVAL;
	struct kvm_assigned_dev_kernel *match;
	unsigned long host_irq_type, guest_irq_type;

	if (!irqchip_in_kernel(kvm))
		return r;

	mutex_lock(&kvm->lock);
	r = -ENODEV;
	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      assigned_irq->assigned_dev_id);
	if (!match)
		goto out;

	host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
	guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);

	r = -EINVAL;
	/* can only assign one type at a time */
	if (hweight_long(host_irq_type) > 1)
		goto out;
	if (hweight_long(guest_irq_type) > 1)
		goto out;
	if (host_irq_type == 0 && guest_irq_type == 0)
		goto out;

	r = 0;
	if (host_irq_type)
		r = assign_host_irq(kvm, match, host_irq_type);
	if (r)
		goto out;

	if (guest_irq_type)
		r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);
out:
	mutex_unlock(&kvm->lock);
	return r;
}

static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
					 struct kvm_assigned_irq
					 *assigned_irq)
{
	int r = -ENODEV;
	struct kvm_assigned_dev_kernel *match;
	unsigned long irq_type;

	mutex_lock(&kvm->lock);

	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      assigned_irq->assigned_dev_id);
	if (!match)
		goto out;

	irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK |
					  KVM_DEV_IRQ_GUEST_MASK);
	r = kvm_deassign_irq(kvm, match, irq_type);
out:
	mutex_unlock(&kvm->lock);
	return r;
}

/*
 * We want to test whether the caller has been granted permissions to
 * use this device.  To be able to configure and control the device,
 * the user needs access to PCI configuration space and BAR resources.
 * These are accessed through PCI sysfs.  PCI config space is often
 * passed to the process calling this ioctl via file descriptor, so we
 * can't rely on access to that file.  We can check for permissions
 * on each of the BAR resource files, which is a pretty clear
 * indicator that the user has been granted access to the device.
 */
static int probe_sysfs_permissions(struct pci_dev *dev)
{
#ifdef CONFIG_SYSFS
	int i;
	bool bar_found = false;

	for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) {
		char *kpath, *syspath;
		struct path path;
		struct inode *inode;
		int r;

		if (!pci_resource_len(dev, i))
			continue;

		kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL);
		if (!kpath)
			return -ENOMEM;

		/* Per sysfs-rules, sysfs is always at /sys */
		syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i);
		kfree(kpath);
		if (!syspath)
			return -ENOMEM;

		r = kern_path(syspath, LOOKUP_FOLLOW, &path);
		kfree(syspath);
		if (r)
			return r;

		inode = d_backing_inode(path.dentry);

		r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS);
		path_put(&path);
		if (r)
			return r;

		bar_found = true;
	}

	/* If no resources, probably something special */
	if (!bar_found)
		return -EPERM;

	return 0;
#else
	return -EINVAL; /* No way to control the device without sysfs */
#endif
}

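/*
 * KVM_ASSIGN_PCI_DEVICE: take ownership of a host PCI device, save its
 * state for later restore and attach it to the VM's IOMMU domain.
 */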
static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
				      struct kvm_assigned_pci_dev *assigned_dev)
{
	int r = 0, idx;
	struct kvm_assigned_dev_kernel *match;
	struct pci_dev *dev;

	if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU))
		return -EINVAL;

	mutex_lock(&kvm->lock);
	idx = srcu_read_lock(&kvm->srcu);

	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      assigned_dev->assigned_dev_id);
	if (match) {
		/* device already assigned */
		r = -EEXIST;
		goto out;
	}

	match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
	if (match == NULL) {
		printk(KERN_INFO "%s: Couldn't allocate memory\n",
		       __func__);
		r = -ENOMEM;
		goto out;
	}
	dev = pci_get_domain_bus_and_slot(assigned_dev->segnr,
				   assigned_dev->busnr,
				   assigned_dev->devfn);
	if (!dev) {
		printk(KERN_INFO "%s: host device not found\n", __func__);
		r = -EINVAL;
		goto out_free;
	}

	/* Don't allow bridges to be assigned */
	if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) {
		r = -EPERM;
		goto out_put;
	}

	r = probe_sysfs_permissions(dev);
	if (r)
		goto out_put;

	if (pci_enable_device(dev)) {
		printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
		r = -EBUSY;
		goto out_put;
	}
	r = pci_request_regions(dev, "kvm_assigned_device");
	if (r) {
		printk(KERN_INFO "%s: Could not get access to device regions\n",
		       __func__);
		goto out_disable;
	}

	pci_reset_function(dev);
	pci_save_state(dev);
	match->pci_saved_state = pci_store_saved_state(dev);
	if (!match->pci_saved_state)
		printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
		       __func__, dev_name(&dev->dev));

	if (!pci_intx_mask_supported(dev))
		assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;

	match->assigned_dev_id = assigned_dev->assigned_dev_id;
	match->host_segnr = assigned_dev->segnr;
	match->host_busnr = assigned_dev->busnr;
	match->host_devfn = assigned_dev->devfn;
	match->flags = assigned_dev->flags;
	match->dev = dev;
	spin_lock_init(&match->intx_lock);
	spin_lock_init(&match->intx_mask_lock);
	match->irq_source_id = -1;
	match->kvm = kvm;
	match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;

	list_add(&match->list, &kvm->arch.assigned_dev_head);

	if (!kvm->arch.iommu_domain) {
		r = kvm_iommu_map_guest(kvm);
		if (r)
			goto out_list_del;
	}
	r = kvm_assign_device(kvm, match->dev);
	if (r)
		goto out_list_del;

out:
	srcu_read_unlock(&kvm->srcu, idx);
	mutex_unlock(&kvm->lock);
	return r;
out_list_del:
	if (pci_load_and_free_saved_state(dev, &match->pci_saved_state))
		printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
		       __func__, dev_name(&dev->dev));
	list_del(&match->list);
	pci_release_regions(dev);
out_disable:
	pci_disable_device(dev);
out_put:
	pci_dev_put(dev);
out_free:
	kfree(match);
	srcu_read_unlock(&kvm->srcu, idx);
	mutex_unlock(&kvm->lock);
	return r;
}

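/*
 * KVM_DEASSIGN_PCI_DEVICE: detach the device from the IOMMU domain and
 * return it to the host.
 */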
static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
		struct kvm_assigned_pci_dev *assigned_dev)
{
	int r = 0;
	struct kvm_assigned_dev_kernel *match;

	mutex_lock(&kvm->lock);

	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      assigned_dev->assigned_dev_id);
	if (!match) {
		printk(KERN_INFO "%s: device hasn't been assigned before, "
		  "so cannot be deassigned\n", __func__);
		r = -EINVAL;
		goto out;
	}

	kvm_deassign_device(kvm, match->dev);

	kvm_free_assigned_device(kvm, match);

out:
	mutex_unlock(&kvm->lock);
	return r;
}

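/*
 * KVM_ASSIGN_SET_MSIX_NR: allocate the host and guest MSI-X entry
 * tables.  The entry count can only be set once per device and is
 * capped at KVM_MAX_MSIX_PER_DEV.
 */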
static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
				    struct kvm_assigned_msix_nr *entry_nr)
{
	int r = 0;
	struct kvm_assigned_dev_kernel *adev;

	mutex_lock(&kvm->lock);

	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      entry_nr->assigned_dev_id);
	if (!adev) {
		r = -EINVAL;
		goto msix_nr_out;
	}

	if (adev->entries_nr == 0) {
		adev->entries_nr = entry_nr->entry_nr;
		if (adev->entries_nr == 0 ||
		    adev->entries_nr > KVM_MAX_MSIX_PER_DEV) {
			r = -EINVAL;
			goto msix_nr_out;
		}

		adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
						entry_nr->entry_nr,
						GFP_KERNEL);
		if (!adev->host_msix_entries) {
			r = -ENOMEM;
			goto msix_nr_out;
		}
		adev->guest_msix_entries =
			kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr,
				GFP_KERNEL);
		if (!adev->guest_msix_entries) {
			kfree(adev->host_msix_entries);
			r = -ENOMEM;
			goto msix_nr_out;
		}
	} else /* Setting the MSI-X entry count twice is not allowed */
		r = -EINVAL;
msix_nr_out:
	mutex_unlock(&kvm->lock);
	return r;
}

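/*
 * KVM_ASSIGN_SET_MSIX_ENTRY: route one MSI-X entry to a guest GSI,
 * reusing a free slot or updating an existing entry in place.
 */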
static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
				       struct kvm_assigned_msix_entry *entry)
{
	int r = 0, i;
	struct kvm_assigned_dev_kernel *adev;

	mutex_lock(&kvm->lock);

	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      entry->assigned_dev_id);

	if (!adev) {
		r = -EINVAL;
		goto msix_entry_out;
	}

	for (i = 0; i < adev->entries_nr; i++)
		if (adev->guest_msix_entries[i].vector == 0 ||
		    adev->guest_msix_entries[i].entry == entry->entry) {
			adev->guest_msix_entries[i].entry = entry->entry;
			adev->guest_msix_entries[i].vector = entry->gsi;
			adev->host_msix_entries[i].entry = entry->entry;
			break;
		}
	if (i == adev->entries_nr) {
		r = -ENOSPC;
		goto msix_entry_out;
	}

msix_entry_out:
	mutex_unlock(&kvm->lock);

	return r;
}

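/*
 * KVM_ASSIGN_SET_INTX_MASK: let userspace mask or unmask the virtual
 * INTx line, emulating the PCI 2.3 INTx disable bit.
 */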
static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
		struct kvm_assigned_pci_dev *assigned_dev)
{
	int r = 0;
	struct kvm_assigned_dev_kernel *match;

	mutex_lock(&kvm->lock);

	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      assigned_dev->assigned_dev_id);
	if (!match) {
		r = -ENODEV;
		goto out;
	}

	spin_lock(&match->intx_mask_lock);

	match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
	match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;

	if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
		if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
			kvm_set_irq(match->kvm, match->irq_source_id,
				    match->guest_irq, 0, false);
			/*
			 * Masking at hardware-level is performed on demand,
			 * i.e. when an IRQ actually arrives at the host.
			 */
		} else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
			/*
			 * Unmask the IRQ line if required. Unmasking at
			 * device level will be performed by user space.
			 */
			spin_lock_irq(&match->intx_lock);
			if (match->host_irq_disabled) {
				enable_irq(match->host_irq);
				match->host_irq_disabled = false;
			}
			spin_unlock_irq(&match->intx_lock);
		}
	}

	spin_unlock(&match->intx_mask_lock);

out:
	mutex_unlock(&kvm->lock);
	return r;
}

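/*
 * Dispatch the device assignment ioctls.  Each handler copies its
 * argument struct from userspace and takes kvm->lock itself.
 */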
long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
				  unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	int r;

	switch (ioctl) {
	case KVM_ASSIGN_PCI_DEVICE: {
		struct kvm_assigned_pci_dev assigned_dev;

		r = -EFAULT;
		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
			goto out;
		r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
		if (r)
			goto out;
		break;
	}
	case KVM_ASSIGN_IRQ: {
		r = -EOPNOTSUPP;
		break;
	}
	case KVM_ASSIGN_DEV_IRQ: {
		struct kvm_assigned_irq assigned_irq;

		r = -EFAULT;
		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
			goto out;
		r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
		if (r)
			goto out;
		break;
	}
	case KVM_DEASSIGN_DEV_IRQ: {
		struct kvm_assigned_irq assigned_irq;

		r = -EFAULT;
		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
			goto out;
		r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
		if (r)
			goto out;
		break;
	}
	case KVM_DEASSIGN_PCI_DEVICE: {
		struct kvm_assigned_pci_dev assigned_dev;

		r = -EFAULT;
		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
			goto out;
		r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
		if (r)
			goto out;
		break;
	}
	case KVM_ASSIGN_SET_MSIX_NR: {
		struct kvm_assigned_msix_nr entry_nr;
		r = -EFAULT;
		if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
			goto out;
		r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
		if (r)
			goto out;
		break;
	}
	case KVM_ASSIGN_SET_MSIX_ENTRY: {
		struct kvm_assigned_msix_entry entry;
		r = -EFAULT;
		if (copy_from_user(&entry, argp, sizeof entry))
			goto out;
		r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
		if (r)
			goto out;
		break;
	}
	case KVM_ASSIGN_SET_INTX_MASK: {
		struct kvm_assigned_pci_dev assigned_dev;

		r = -EFAULT;
		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
			goto out;
		r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
		break;
	}
	default:
		r = -ENOTTY;
		break;
	}
out:
	return r;
}