This source file includes the following definitions:
- vfio_find_dma
- vfio_link_dma
- vfio_unlink_dma
- vfio_find_vpfn
- vfio_link_pfn
- vfio_unlink_pfn
- vfio_add_to_pfn_list
- vfio_remove_from_pfn_list
- vfio_iova_get_vfio_pfn
- vfio_iova_put_vfio_pfn
- vfio_lock_acct
- is_invalid_reserved_pfn
- put_pfn
- vaddr_get_pfn
- vfio_pin_pages_remote
- vfio_unpin_pages_remote
- vfio_pin_page_external
- vfio_unpin_page_external
- vfio_iommu_type1_pin_pages
- vfio_iommu_type1_unpin_pages
- vfio_sync_unpin
- unmap_unpin_fast
- unmap_unpin_slow
- vfio_unmap_unpin
- vfio_remove_dma
- vfio_pgsize_bitmap
- vfio_dma_do_unmap
- vfio_iommu_map
- vfio_pin_map_dma
- vfio_iommu_iova_dma_valid
- vfio_dma_do_map
- vfio_bus_type
- vfio_iommu_replay
- vfio_test_domain_fgsp
- find_iommu_group
- vfio_iommu_has_sw_msi
- vfio_mdev_get_iommu_device
- vfio_mdev_attach_domain
- vfio_mdev_detach_domain
- vfio_iommu_attach_group
- vfio_iommu_detach_group
- vfio_bus_is_mdev
- vfio_mdev_iommu_device
- vfio_iommu_iova_insert
- vfio_iommu_aper_conflict
- vfio_iommu_aper_resize
- vfio_iommu_resv_conflict
- vfio_iommu_resv_exclude
- vfio_iommu_resv_free
- vfio_iommu_iova_free
- vfio_iommu_iova_get_copy
- vfio_iommu_iova_insert_copy
- vfio_iommu_type1_attach_group
- vfio_iommu_unmap_unpin_all
- vfio_iommu_unmap_unpin_reaccount
- vfio_sanity_check_pfn_list
- vfio_iommu_aper_expand
- vfio_iommu_resv_refresh
- vfio_iommu_type1_detach_group
- vfio_iommu_type1_open
- vfio_release_domain
- vfio_iommu_type1_release
- vfio_domains_have_iommu_cache
- vfio_iommu_iova_add_cap
- vfio_iommu_iova_build_caps
- vfio_iommu_type1_ioctl
- vfio_iommu_type1_register_notifier
- vfio_iommu_type1_unregister_notifier
- vfio_iommu_type1_init
- vfio_iommu_type1_cleanup
24 #include <linux/compat.h>
25 #include <linux/device.h>
26 #include <linux/fs.h>
27 #include <linux/iommu.h>
28 #include <linux/module.h>
29 #include <linux/mm.h>
30 #include <linux/rbtree.h>
31 #include <linux/sched/signal.h>
32 #include <linux/sched/mm.h>
33 #include <linux/slab.h>
34 #include <linux/uaccess.h>
35 #include <linux/vfio.h>
36 #include <linux/workqueue.h>
37 #include <linux/mdev.h>
38 #include <linux/notifier.h>
39 #include <linux/dma-iommu.h>
40 #include <linux/irqdomain.h>
41
42 #define DRIVER_VERSION "0.2"
43 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
44 #define DRIVER_DESC "Type1 IOMMU driver for VFIO"
45
46 static bool allow_unsafe_interrupts;
47 module_param_named(allow_unsafe_interrupts,
48 allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
49 MODULE_PARM_DESC(allow_unsafe_interrupts,
50 "Enable VFIO IOMMU support for on platforms without interrupt remapping support.");
51
52 static bool disable_hugepages;
53 module_param_named(disable_hugepages,
54 disable_hugepages, bool, S_IRUGO | S_IWUSR);
55 MODULE_PARM_DESC(disable_hugepages,
56 "Disable VFIO IOMMU support for IOMMU hugepages.");
57
58 static unsigned int dma_entry_limit __read_mostly = U16_MAX;
59 module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644);
60 MODULE_PARM_DESC(dma_entry_limit,
61 "Maximum number of user DMA mappings per container (65535).");
62
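/*
 * Container state.  A vfio_iommu holds the list of IOMMU-backed domains
 * (domain_list), the list of IOVA ranges userspace is allowed to map
 * (iova_list), and an rb-tree of user DMA mappings (dma_list) keyed by
 * IOVA.  external_domain tracks mdev groups that have no IOMMU backing
 * device of their own.  Each vfio_dma describes one user mapping of
 * [iova, iova + size) at vaddr; its pfn_list holds pages pinned on
 * behalf of external (mdev) users, each refcounted via a vfio_pfn.
 */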
63 struct vfio_iommu {
64 struct list_head domain_list;
65 struct list_head iova_list;
66 struct vfio_domain *external_domain;
67 struct mutex lock;
68 struct rb_root dma_list;
69 struct blocking_notifier_head notifier;
70 unsigned int dma_avail;
71 bool v2;
72 bool nesting;
73 };
74
75 struct vfio_domain {
76 struct iommu_domain *domain;
77 struct list_head next;
78 struct list_head group_list;
79 int prot;
80 bool fgsp;
81 };
82
83 struct vfio_dma {
84 struct rb_node node;
85 dma_addr_t iova;
86 unsigned long vaddr;
87 size_t size;
88 int prot;
89 bool iommu_mapped;
90 bool lock_cap;
91 struct task_struct *task;
92 struct rb_root pfn_list;
93 };
94
95 struct vfio_group {
96 struct iommu_group *iommu_group;
97 struct list_head next;
98 bool mdev_group;
99 };
100
101 struct vfio_iova {
102 struct list_head list;
103 dma_addr_t start;
104 dma_addr_t end;
105 };
106
107
108
109
110 struct vfio_pfn {
111 struct rb_node node;
112 dma_addr_t iova;
113 unsigned long pfn;
114 atomic_t ref_count;
115 };
116
117 struct vfio_regions {
118 struct list_head list;
119 dma_addr_t iova;
120 phys_addr_t phys;
121 size_t len;
122 };
123
124 #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \
125 (!list_empty(&iommu->domain_list))
126
127 static int put_pfn(unsigned long pfn, int prot);
128
129
130
131
132
133
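/*
 * dma_list helpers: the rb-tree is ordered by IOVA and entries never
 * overlap.  vfio_find_dma() returns any vfio_dma intersecting
 * [start, start + size); vfio_link_dma()/vfio_unlink_dma() insert and
 * remove entries.
 */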
134 static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
135 dma_addr_t start, size_t size)
136 {
137 struct rb_node *node = iommu->dma_list.rb_node;
138
139 while (node) {
140 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
141
142 if (start + size <= dma->iova)
143 node = node->rb_left;
144 else if (start >= dma->iova + dma->size)
145 node = node->rb_right;
146 else
147 return dma;
148 }
149
150 return NULL;
151 }
152
153 static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
154 {
155 struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
156 struct vfio_dma *dma;
157
158 while (*link) {
159 parent = *link;
160 dma = rb_entry(parent, struct vfio_dma, node);
161
162 if (new->iova + new->size <= dma->iova)
163 link = &(*link)->rb_left;
164 else
165 link = &(*link)->rb_right;
166 }
167
168 rb_link_node(&new->node, parent, link);
169 rb_insert_color(&new->node, &iommu->dma_list);
170 }
171
172 static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
173 {
174 rb_erase(&old->node, &iommu->dma_list);
175 }
176
177
178
179
180 static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
181 {
182 struct vfio_pfn *vpfn;
183 struct rb_node *node = dma->pfn_list.rb_node;
184
185 while (node) {
186 vpfn = rb_entry(node, struct vfio_pfn, node);
187
188 if (iova < vpfn->iova)
189 node = node->rb_left;
190 else if (iova > vpfn->iova)
191 node = node->rb_right;
192 else
193 return vpfn;
194 }
195 return NULL;
196 }
197
198 static void vfio_link_pfn(struct vfio_dma *dma,
199 struct vfio_pfn *new)
200 {
201 struct rb_node **link, *parent = NULL;
202 struct vfio_pfn *vpfn;
203
204 link = &dma->pfn_list.rb_node;
205 while (*link) {
206 parent = *link;
207 vpfn = rb_entry(parent, struct vfio_pfn, node);
208
209 if (new->iova < vpfn->iova)
210 link = &(*link)->rb_left;
211 else
212 link = &(*link)->rb_right;
213 }
214
215 rb_link_node(&new->node, parent, link);
216 rb_insert_color(&new->node, &dma->pfn_list);
217 }
218
219 static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old)
220 {
221 rb_erase(&old->node, &dma->pfn_list);
222 }
223
224 static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova,
225 unsigned long pfn)
226 {
227 struct vfio_pfn *vpfn;
228
229 vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
230 if (!vpfn)
231 return -ENOMEM;
232
233 vpfn->iova = iova;
234 vpfn->pfn = pfn;
235 atomic_set(&vpfn->ref_count, 1);
236 vfio_link_pfn(dma, vpfn);
237 return 0;
238 }
239
240 static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
241 struct vfio_pfn *vpfn)
242 {
243 vfio_unlink_pfn(dma, vpfn);
244 kfree(vpfn);
245 }
246
247 static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
248 unsigned long iova)
249 {
250 struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
251
252 if (vpfn)
253 atomic_inc(&vpfn->ref_count);
254 return vpfn;
255 }
256
257 static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
258 {
259 int ret = 0;
260
261 if (atomic_dec_and_test(&vpfn->ref_count)) {
262 ret = put_pfn(vpfn->pfn, dma->prot);
263 vfio_remove_from_pfn_list(dma, vpfn);
264 }
265 return ret;
266 }
267
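/*
 * Adjust the locked-memory accounting of the task that created the
 * mapping.  npage may be negative when unpinning.  With async set we
 * may not be running in that task's context, so take a reference on
 * its mm via get_task_mm().  RLIMIT_MEMLOCK is enforced unless the
 * task held CAP_IPC_LOCK when the mapping was created (dma->lock_cap).
 */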
268 static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
269 {
270 struct mm_struct *mm;
271 int ret;
272
273 if (!npage)
274 return 0;
275
276 mm = async ? get_task_mm(dma->task) : dma->task->mm;
277 if (!mm)
278 return -ESRCH;
279
280 ret = down_write_killable(&mm->mmap_sem);
281 if (!ret) {
282 ret = __account_locked_vm(mm, abs(npage), npage > 0, dma->task,
283 dma->lock_cap);
284 up_write(&mm->mmap_sem);
285 }
286
287 if (async)
288 mmput(mm);
289
290 return ret;
291 }
292
293
294
295
296
297
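/*
 * Some mappings are not backed by a normal struct page, e.g. mmap'd
 * MMIO of another device (VM_PFNMAP).  Such pfns, and reserved pages,
 * are neither accounted nor reference counted.  The PageTail/smp_rmb()
 * check tolerates a compound page being split underneath us.
 */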
298 static bool is_invalid_reserved_pfn(unsigned long pfn)
299 {
300 if (pfn_valid(pfn)) {
301 bool reserved;
302 struct page *tail = pfn_to_page(pfn);
303 struct page *head = compound_head(tail);
304 reserved = !!(PageReserved(head));
305 if (head != tail) {
306
307
308
309
310
311
312
313
314
315
316 smp_rmb();
317 if (PageTail(tail))
318 return reserved;
319 }
320 return PageReserved(tail);
321 }
322
323 return true;
324 }
325
326 static int put_pfn(unsigned long pfn, int prot)
327 {
328 if (!is_invalid_reserved_pfn(pfn)) {
329 struct page *page = pfn_to_page(pfn);
330 if (prot & IOMMU_WRITE)
331 SetPageDirty(page);
332 put_page(page);
333 return 1;
334 }
335 return 0;
336 }
337
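/*
 * Translate a user virtual address into a pfn.  Pages in current->mm
 * are pinned with get_user_pages(FOLL_LONGTERM); for a foreign mm the
 * remote variant is used and fsdax-backed pages are rejected.  If GUP
 * fails, fall back to follow_pfn() for VM_PFNMAP vmas (e.g. device
 * MMIO), accepting only invalid/reserved pfns that need no pinning.
 */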
338 static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
339 int prot, unsigned long *pfn)
340 {
341 struct page *page[1];
342 struct vm_area_struct *vma;
343 struct vm_area_struct *vmas[1];
344 unsigned int flags = 0;
345 int ret;
346
347 if (prot & IOMMU_WRITE)
348 flags |= FOLL_WRITE;
349
350 down_read(&mm->mmap_sem);
351 if (mm == current->mm) {
352 ret = get_user_pages(vaddr, 1, flags | FOLL_LONGTERM, page,
353 vmas);
354 } else {
355 ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page,
356 vmas, NULL);
357
358
359
360
361
362
363
364 if (ret > 0 && vma_is_fsdax(vmas[0])) {
365 ret = -EOPNOTSUPP;
366 put_page(page[0]);
367 }
368 }
369 up_read(&mm->mmap_sem);
370
371 if (ret == 1) {
372 *pfn = page_to_pfn(page[0]);
373 return 0;
374 }
375
376 down_read(&mm->mmap_sem);
377
378 vaddr = untagged_addr(vaddr);
379
380 vma = find_vma_intersection(mm, vaddr, vaddr + 1);
381
382 if (vma && vma->vm_flags & VM_PFNMAP) {
383 if (!follow_pfn(vma, vaddr, pfn) &&
384 is_invalid_reserved_pfn(*pfn))
385 ret = 0;
386 }
387
388 up_read(&mm->mmap_sem);
389 return ret;
390 }
391
392
393
394
395
396
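/*
 * Pin up to npage user pages starting at vaddr for an IOMMU-backed
 * mapping.  Pinning stops at the first physical discontiguity (or
 * reserved/non-reserved transition) so the caller can map the run with
 * a single iommu_map() call.  Locked-memory limits are enforced here,
 * skipping pages already accounted through the external pfn_list.
 * Returns the number of pages pinned or a negative errno.
 */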
397 static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
398 long npage, unsigned long *pfn_base,
399 unsigned long limit)
400 {
401 unsigned long pfn = 0;
402 long ret, pinned = 0, lock_acct = 0;
403 bool rsvd;
404 dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
405
406
407 if (!current->mm)
408 return -ENODEV;
409
410 ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, pfn_base);
411 if (ret)
412 return ret;
413
414 pinned++;
415 rsvd = is_invalid_reserved_pfn(*pfn_base);
416
417
418
419
420
421 if (!rsvd && !vfio_find_vpfn(dma, iova)) {
422 if (!dma->lock_cap && current->mm->locked_vm + 1 > limit) {
423 put_pfn(*pfn_base, dma->prot);
424 pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
425 limit << PAGE_SHIFT);
426 return -ENOMEM;
427 }
428 lock_acct++;
429 }
430
431 if (unlikely(disable_hugepages))
432 goto out;
433
434
435 for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
436 pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
437 ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn);
438 if (ret)
439 break;
440
441 if (pfn != *pfn_base + pinned ||
442 rsvd != is_invalid_reserved_pfn(pfn)) {
443 put_pfn(pfn, dma->prot);
444 break;
445 }
446
447 if (!rsvd && !vfio_find_vpfn(dma, iova)) {
448 if (!dma->lock_cap &&
449 current->mm->locked_vm + lock_acct + 1 > limit) {
450 put_pfn(pfn, dma->prot);
451 pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
452 __func__, limit << PAGE_SHIFT);
453 ret = -ENOMEM;
454 goto unpin_out;
455 }
456 lock_acct++;
457 }
458 }
459
460 out:
461 ret = vfio_lock_acct(dma, lock_acct, false);
462
463 unpin_out:
464 if (ret) {
465 if (!rsvd) {
466 for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
467 put_pfn(pfn, dma->prot);
468 }
469
470 return ret;
471 }
472
473 return pinned;
474 }
475
476 static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
477 unsigned long pfn, long npage,
478 bool do_accounting)
479 {
480 long unlocked = 0, locked = 0;
481 long i;
482
483 for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
484 if (put_pfn(pfn++, dma->prot)) {
485 unlocked++;
486 if (vfio_find_vpfn(dma, iova))
487 locked++;
488 }
489 }
490
491 if (do_accounting)
492 vfio_lock_acct(dma, locked - unlocked, true);
493
494 return unlocked;
495 }
496
497 static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
498 unsigned long *pfn_base, bool do_accounting)
499 {
500 struct mm_struct *mm;
501 int ret;
502
503 mm = get_task_mm(dma->task);
504 if (!mm)
505 return -ENODEV;
506
507 ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
508 if (!ret && do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
509 ret = vfio_lock_acct(dma, 1, true);
510 if (ret) {
511 put_pfn(*pfn_base, dma->prot);
512 if (ret == -ENOMEM)
513 pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK "
514 "(%ld) exceeded\n", __func__,
515 dma->task->comm, task_pid_nr(dma->task),
516 task_rlimit(dma->task, RLIMIT_MEMLOCK));
517 }
518 }
519
520 mmput(mm);
521 return ret;
522 }
523
524 static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
525 bool do_accounting)
526 {
527 int unlocked;
528 struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
529
530 if (!vpfn)
531 return 0;
532
533 unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
534
535 if (do_accounting)
536 vfio_lock_acct(dma, -unlocked, true);
537
538 return unlocked;
539 }
540
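/*
 * Pin/unpin entry points for external users (mdev vendor drivers),
 * reached through vfio_pin_pages()/vfio_unpin_pages().  Each pinned
 * page is tracked by a refcounted vfio_pfn on the owning vfio_dma.
 * Accounting happens here only when the container has no IOMMU-backed
 * domain; otherwise the pages were accounted when the mapping itself
 * was pinned.  A DMA_UNMAP notifier must be registered first so the
 * caller can be told to unpin when userspace unmaps the range.
 */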
541 static int vfio_iommu_type1_pin_pages(void *iommu_data,
542 unsigned long *user_pfn,
543 int npage, int prot,
544 unsigned long *phys_pfn)
545 {
546 struct vfio_iommu *iommu = iommu_data;
547 int i, j, ret;
548 unsigned long remote_vaddr;
549 struct vfio_dma *dma;
550 bool do_accounting;
551
552 if (!iommu || !user_pfn || !phys_pfn)
553 return -EINVAL;
554
555
556 if (!iommu->v2)
557 return -EACCES;
558
559 mutex_lock(&iommu->lock);
560
561
562 if (!iommu->notifier.head) {
563 ret = -EINVAL;
564 goto pin_done;
565 }
566
567
568
569
570
571
572 do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
573
574 for (i = 0; i < npage; i++) {
575 dma_addr_t iova;
576 struct vfio_pfn *vpfn;
577
578 iova = user_pfn[i] << PAGE_SHIFT;
579 dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
580 if (!dma) {
581 ret = -EINVAL;
582 goto pin_unwind;
583 }
584
585 if ((dma->prot & prot) != prot) {
586 ret = -EPERM;
587 goto pin_unwind;
588 }
589
590 vpfn = vfio_iova_get_vfio_pfn(dma, iova);
591 if (vpfn) {
592 phys_pfn[i] = vpfn->pfn;
593 continue;
594 }
595
596 remote_vaddr = dma->vaddr + (iova - dma->iova);
597 ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
598 do_accounting);
599 if (ret)
600 goto pin_unwind;
601
602 ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
603 if (ret) {
604 vfio_unpin_page_external(dma, iova, do_accounting);
605 goto pin_unwind;
606 }
607 }
608
609 ret = i;
610 goto pin_done;
611
612 pin_unwind:
613 phys_pfn[i] = 0;
614 for (j = 0; j < i; j++) {
615 dma_addr_t iova;
616
617 iova = user_pfn[j] << PAGE_SHIFT;
618 dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
619 vfio_unpin_page_external(dma, iova, do_accounting);
620 phys_pfn[j] = 0;
621 }
622 pin_done:
623 mutex_unlock(&iommu->lock);
624 return ret;
625 }
626
627 static int vfio_iommu_type1_unpin_pages(void *iommu_data,
628 unsigned long *user_pfn,
629 int npage)
630 {
631 struct vfio_iommu *iommu = iommu_data;
632 bool do_accounting;
633 int i;
634
635 if (!iommu || !user_pfn)
636 return -EINVAL;
637
638
639 if (!iommu->v2)
640 return -EACCES;
641
642 mutex_lock(&iommu->lock);
643
644 do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
645 for (i = 0; i < npage; i++) {
646 struct vfio_dma *dma;
647 dma_addr_t iova;
648
649 iova = user_pfn[i] << PAGE_SHIFT;
650 dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
651 if (!dma)
652 goto unpin_exit;
653 vfio_unpin_page_external(dma, iova, do_accounting);
654 }
655
656 unpin_exit:
657 mutex_unlock(&iommu->lock);
658 return i > npage ? npage : (i > 0 ? i : -EINVAL);
659 }
660
661 static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
662 struct list_head *regions,
663 struct iommu_iotlb_gather *iotlb_gather)
664 {
665 long unlocked = 0;
666 struct vfio_regions *entry, *next;
667
668 iommu_tlb_sync(domain->domain, iotlb_gather);
669
670 list_for_each_entry_safe(entry, next, regions, list) {
671 unlocked += vfio_unpin_pages_remote(dma,
672 entry->iova,
673 entry->phys >> PAGE_SHIFT,
674 entry->len >> PAGE_SHIFT,
675 false);
676 list_del(&entry->list);
677 kfree(entry);
678 }
679
680 cond_resched();
681
682 return unlocked;
683 }
684
685
686
687
688
689
690
691
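/*
 * Regions unmapped with iommu_unmap_fast() may still be referenced by
 * the IOTLB, so the corresponding pages are only unpinned after
 * iommu_tlb_sync().  Queue unmapped regions and force a sync every
 * VFIO_IOMMU_TLB_SYNC_MAX entries to bound how much memory sits
 * unmapped but still pinned.
 */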
692 #define VFIO_IOMMU_TLB_SYNC_MAX 512
693
694 static size_t unmap_unpin_fast(struct vfio_domain *domain,
695 struct vfio_dma *dma, dma_addr_t *iova,
696 size_t len, phys_addr_t phys, long *unlocked,
697 struct list_head *unmapped_list,
698 int *unmapped_cnt,
699 struct iommu_iotlb_gather *iotlb_gather)
700 {
701 size_t unmapped = 0;
702 struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
703
704 if (entry) {
705 unmapped = iommu_unmap_fast(domain->domain, *iova, len,
706 iotlb_gather);
707
708 if (!unmapped) {
709 kfree(entry);
710 } else {
711 entry->iova = *iova;
712 entry->phys = phys;
713 entry->len = unmapped;
714 list_add_tail(&entry->list, unmapped_list);
715
716 *iova += unmapped;
717 (*unmapped_cnt)++;
718 }
719 }
720
721
722
723
724
725 if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) {
726 *unlocked += vfio_sync_unpin(dma, domain, unmapped_list,
727 iotlb_gather);
728 *unmapped_cnt = 0;
729 }
730
731 return unmapped;
732 }
733
734 static size_t unmap_unpin_slow(struct vfio_domain *domain,
735 struct vfio_dma *dma, dma_addr_t *iova,
736 size_t len, phys_addr_t phys,
737 long *unlocked)
738 {
739 size_t unmapped = iommu_unmap(domain->domain, *iova, len);
740
741 if (unmapped) {
742 *unlocked += vfio_unpin_pages_remote(dma, *iova,
743 phys >> PAGE_SHIFT,
744 unmapped >> PAGE_SHIFT,
745 false);
746 *iova += unmapped;
747 cond_resched();
748 }
749 return unmapped;
750 }
751
752 static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
753 bool do_accounting)
754 {
755 dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
756 struct vfio_domain *domain, *d;
757 LIST_HEAD(unmapped_region_list);
758 struct iommu_iotlb_gather iotlb_gather;
759 int unmapped_region_cnt = 0;
760 long unlocked = 0;
761
762 if (!dma->size)
763 return 0;
764
765 if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
766 return 0;
767
768
769
770
771
772
773
774
775 domain = d = list_first_entry(&iommu->domain_list,
776 struct vfio_domain, next);
777
778 list_for_each_entry_continue(d, &iommu->domain_list, next) {
779 iommu_unmap(d->domain, dma->iova, dma->size);
780 cond_resched();
781 }
782
783 iommu_iotlb_gather_init(&iotlb_gather);
784 while (iova < end) {
785 size_t unmapped, len;
786 phys_addr_t phys, next;
787
788 phys = iommu_iova_to_phys(domain->domain, iova);
789 if (WARN_ON(!phys)) {
790 iova += PAGE_SIZE;
791 continue;
792 }
793
794
795
796
797
798
799 for (len = PAGE_SIZE;
800 !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
801 next = iommu_iova_to_phys(domain->domain, iova + len);
802 if (next != phys + len)
803 break;
804 }
805
806
807
808
809
810 unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys,
811 &unlocked, &unmapped_region_list,
812 &unmapped_region_cnt,
813 &iotlb_gather);
814 if (!unmapped) {
815 unmapped = unmap_unpin_slow(domain, dma, &iova, len,
816 phys, &unlocked);
817 if (WARN_ON(!unmapped))
818 break;
819 }
820 }
821
822 dma->iommu_mapped = false;
823
824 if (unmapped_region_cnt) {
825 unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list,
826 &iotlb_gather);
827 }
828
829 if (do_accounting) {
830 vfio_lock_acct(dma, -unlocked, true);
831 return 0;
832 }
833 return unlocked;
834 }
835
836 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
837 {
838 vfio_unmap_unpin(iommu, dma, true);
839 vfio_unlink_dma(iommu, dma);
840 put_task_struct(dma->task);
841 kfree(dma);
842 iommu->dma_avail++;
843 }
844
845 static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu)
846 {
847 struct vfio_domain *domain;
848 unsigned long bitmap = ULONG_MAX;
849
850 mutex_lock(&iommu->lock);
851 list_for_each_entry(domain, &iommu->domain_list, next)
852 bitmap &= domain->domain->pgsize_bitmap;
853 mutex_unlock(&iommu->lock);
854
855
856
857
858
859
860
861
862
863 if (bitmap & ~PAGE_MASK) {
864 bitmap &= PAGE_MASK;
865 bitmap |= PAGE_SIZE;
866 }
867
868 return bitmap;
869 }
870
871 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
872 struct vfio_iommu_type1_dma_unmap *unmap)
873 {
874 uint64_t mask;
875 struct vfio_dma *dma, *dma_last = NULL;
876 size_t unmapped = 0;
877 int ret = 0, retries = 0;
878
879 mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
880
881 if (unmap->iova & mask)
882 return -EINVAL;
883 if (!unmap->size || unmap->size & mask)
884 return -EINVAL;
885 if (unmap->iova + unmap->size - 1 < unmap->iova ||
886 unmap->size > SIZE_MAX)
887 return -EINVAL;
888
889 WARN_ON(mask & PAGE_MASK);
890 again:
891 mutex_lock(&iommu->lock);
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
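/*
 * In the v2 interface an unmap must cover whole vfio_dma entries:
 * reject requests that would split an existing mapping at either end.
 * Further down, mappings with externally pinned pages (non-empty
 * pfn_list) are handed to the DMA_UNMAP notifier with the lock
 * dropped, then retried once the pages have been released.
 */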
924 if (iommu->v2) {
925 dma = vfio_find_dma(iommu, unmap->iova, 1);
926 if (dma && dma->iova != unmap->iova) {
927 ret = -EINVAL;
928 goto unlock;
929 }
930 dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
931 if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
932 ret = -EINVAL;
933 goto unlock;
934 }
935 }
936
937 while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
938 if (!iommu->v2 && unmap->iova > dma->iova)
939 break;
940
941
942
943
944 if (dma->task->mm != current->mm)
945 break;
946
947 if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
948 struct vfio_iommu_type1_dma_unmap nb_unmap;
949
950 if (dma_last == dma) {
951 BUG_ON(++retries > 10);
952 } else {
953 dma_last = dma;
954 retries = 0;
955 }
956
957 nb_unmap.iova = dma->iova;
958 nb_unmap.size = dma->size;
959
960
961
962
963
964
965
966 mutex_unlock(&iommu->lock);
967 blocking_notifier_call_chain(&iommu->notifier,
968 VFIO_IOMMU_NOTIFY_DMA_UNMAP,
969 &nb_unmap);
970 goto again;
971 }
972 unmapped += dma->size;
973 vfio_remove_dma(iommu, dma);
974 }
975
976 unlock:
977 mutex_unlock(&iommu->lock);
978
979
980 unmap->size = unmapped;
981
982 return ret;
983 }
984
985 static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
986 unsigned long pfn, long npage, int prot)
987 {
988 struct vfio_domain *d;
989 int ret;
990
991 list_for_each_entry(d, &iommu->domain_list, next) {
992 ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
993 npage << PAGE_SHIFT, prot | d->prot);
994 if (ret)
995 goto unwind;
996
997 cond_resched();
998 }
999
1000 return 0;
1001
1002 unwind:
1003 list_for_each_entry_continue_reverse(d, &iommu->domain_list, next)
1004 iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);
1005
1006 return ret;
1007 }
1008
1009 static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
1010 size_t map_size)
1011 {
1012 dma_addr_t iova = dma->iova;
1013 unsigned long vaddr = dma->vaddr;
1014 size_t size = map_size;
1015 long npage;
1016 unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1017 int ret = 0;
1018
1019 while (size) {
1020
1021 npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
1022 size >> PAGE_SHIFT, &pfn, limit);
1023 if (npage <= 0) {
1024 WARN_ON(!npage);
1025 ret = (int)npage;
1026 break;
1027 }
1028
1029
1030 ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
1031 dma->prot);
1032 if (ret) {
1033 vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
1034 npage, true);
1035 break;
1036 }
1037
1038 size -= npage << PAGE_SHIFT;
1039 dma->size += npage << PAGE_SHIFT;
1040 }
1041
1042 dma->iommu_mapped = true;
1043
1044 if (ret)
1045 vfio_remove_dma(iommu, dma);
1046
1047 return ret;
1048 }
1049
1050
1051
1052
1053 static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
1054 dma_addr_t start, dma_addr_t end)
1055 {
1056 struct list_head *iova = &iommu->iova_list;
1057 struct vfio_iova *node;
1058
1059 list_for_each_entry(node, iova, list) {
1060 if (start >= node->start && end <= node->end)
1061 return true;
1062 }
1063
1064
1065
1066
1067
1068 return list_empty(iova);
1069 }
1070
1071 static int vfio_dma_do_map(struct vfio_iommu *iommu,
1072 struct vfio_iommu_type1_dma_map *map)
1073 {
1074 dma_addr_t iova = map->iova;
1075 unsigned long vaddr = map->vaddr;
1076 size_t size = map->size;
1077 int ret = 0, prot = 0;
1078 uint64_t mask;
1079 struct vfio_dma *dma;
1080
1081
1082 if (map->size != size || map->vaddr != vaddr || map->iova != iova)
1083 return -EINVAL;
1084
1085 mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1;
1086
1087 WARN_ON(mask & PAGE_MASK);
1088
1089
1090 if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
1091 prot |= IOMMU_WRITE;
1092 if (map->flags & VFIO_DMA_MAP_FLAG_READ)
1093 prot |= IOMMU_READ;
1094
1095 if (!prot || !size || (size | iova | vaddr) & mask)
1096 return -EINVAL;
1097
1098
1099 if (iova + size - 1 < iova || vaddr + size - 1 < vaddr)
1100 return -EINVAL;
1101
1102 mutex_lock(&iommu->lock);
1103
1104 if (vfio_find_dma(iommu, iova, size)) {
1105 ret = -EEXIST;
1106 goto out_unlock;
1107 }
1108
1109 if (!iommu->dma_avail) {
1110 ret = -ENOSPC;
1111 goto out_unlock;
1112 }
1113
1114 if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
1115 ret = -EINVAL;
1116 goto out_unlock;
1117 }
1118
1119 dma = kzalloc(sizeof(*dma), GFP_KERNEL);
1120 if (!dma) {
1121 ret = -ENOMEM;
1122 goto out_unlock;
1123 }
1124
1125 iommu->dma_avail--;
1126 dma->iova = iova;
1127 dma->vaddr = vaddr;
1128 dma->prot = prot;
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
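/*
 * Accounting is charged to the group leader of the mapping task so all
 * threads share one RLIMIT_MEMLOCK budget and the accounting target
 * outlives any individual thread; a task reference is held for the
 * lifetime of the mapping.  CAP_IPC_LOCK is sampled now because it
 * cannot be reliably re-checked later, when unpinning may run
 * asynchronously from a different context.
 */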
1155 get_task_struct(current->group_leader);
1156 dma->task = current->group_leader;
1157 dma->lock_cap = capable(CAP_IPC_LOCK);
1158
1159 dma->pfn_list = RB_ROOT;
1160
1161
1162 vfio_link_dma(iommu, dma);
1163
1164
1165 if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
1166 dma->size = size;
1167 else
1168 ret = vfio_pin_map_dma(iommu, dma, size);
1169
1170 out_unlock:
1171 mutex_unlock(&iommu->lock);
1172 return ret;
1173 }
1174
1175 static int vfio_bus_type(struct device *dev, void *data)
1176 {
1177 struct bus_type **bus = data;
1178
1179 if (*bus && *bus != dev->bus)
1180 return -EINVAL;
1181
1182 *bus = dev->bus;
1183
1184 return 0;
1185 }
1186
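/*
 * Replay every existing mapping into a newly added IOMMU domain.  For
 * entries already mapped through another domain the physical addresses
 * are read back with iommu_iova_to_phys(); otherwise the pages are
 * pinned now.
 */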
1187 static int vfio_iommu_replay(struct vfio_iommu *iommu,
1188 struct vfio_domain *domain)
1189 {
1190 struct vfio_domain *d;
1191 struct rb_node *n;
1192 unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1193 int ret;
1194
1195
1196 d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
1197 n = rb_first(&iommu->dma_list);
1198
1199 for (; n; n = rb_next(n)) {
1200 struct vfio_dma *dma;
1201 dma_addr_t iova;
1202
1203 dma = rb_entry(n, struct vfio_dma, node);
1204 iova = dma->iova;
1205
1206 while (iova < dma->iova + dma->size) {
1207 phys_addr_t phys;
1208 size_t size;
1209
1210 if (dma->iommu_mapped) {
1211 phys_addr_t p;
1212 dma_addr_t i;
1213
1214 phys = iommu_iova_to_phys(d->domain, iova);
1215
1216 if (WARN_ON(!phys)) {
1217 iova += PAGE_SIZE;
1218 continue;
1219 }
1220
1221 size = PAGE_SIZE;
1222 p = phys + size;
1223 i = iova + size;
1224 while (i < dma->iova + dma->size &&
1225 p == iommu_iova_to_phys(d->domain, i)) {
1226 size += PAGE_SIZE;
1227 p += PAGE_SIZE;
1228 i += PAGE_SIZE;
1229 }
1230 } else {
1231 unsigned long pfn;
1232 unsigned long vaddr = dma->vaddr +
1233 (iova - dma->iova);
1234 size_t n = dma->iova + dma->size - iova;
1235 long npage;
1236
1237 npage = vfio_pin_pages_remote(dma, vaddr,
1238 n >> PAGE_SHIFT,
1239 &pfn, limit);
1240 if (npage <= 0) {
1241 WARN_ON(!npage);
1242 ret = (int)npage;
1243 return ret;
1244 }
1245
1246 phys = pfn << PAGE_SHIFT;
1247 size = npage << PAGE_SHIFT;
1248 }
1249
1250 ret = iommu_map(domain->domain, iova, phys,
1251 size, dma->prot | domain->prot);
1252 if (ret)
1253 return ret;
1254
1255 iova += size;
1256 }
1257 dma->iommu_mapped = true;
1258 }
1259 return 0;
1260 }
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
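/*
 * Probe whether the IOMMU driver promotes contiguous mappings to
 * superpages and unmaps the whole superpage when asked to unmap a
 * sub-range (treated here as fine-grained superpage support).  If it
 * does, vfio_unmap_unpin() need not search for physically contiguous
 * runs itself before unmapping.
 */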
1272 static void vfio_test_domain_fgsp(struct vfio_domain *domain)
1273 {
1274 struct page *pages;
1275 int ret, order = get_order(PAGE_SIZE * 2);
1276
1277 pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
1278 if (!pages)
1279 return;
1280
1281 ret = iommu_map(domain->domain, 0, page_to_phys(pages), PAGE_SIZE * 2,
1282 IOMMU_READ | IOMMU_WRITE | domain->prot);
1283 if (!ret) {
1284 size_t unmapped = iommu_unmap(domain->domain, 0, PAGE_SIZE);
1285
1286 if (unmapped == PAGE_SIZE)
1287 iommu_unmap(domain->domain, PAGE_SIZE, PAGE_SIZE);
1288 else
1289 domain->fgsp = true;
1290 }
1291
1292 __free_pages(pages, order);
1293 }
1294
1295 static struct vfio_group *find_iommu_group(struct vfio_domain *domain,
1296 struct iommu_group *iommu_group)
1297 {
1298 struct vfio_group *g;
1299
1300 list_for_each_entry(g, &domain->group_list, next) {
1301 if (g->iommu_group == iommu_group)
1302 return g;
1303 }
1304
1305 return NULL;
1306 }
1307
1308 static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
1309 phys_addr_t *base)
1310 {
1311 struct iommu_resv_region *region;
1312 bool ret = false;
1313
1314 list_for_each_entry(region, group_resv_regions, list) {
1315
1316
1317
1318
1319
1320 if (region->type == IOMMU_RESV_MSI) {
1321 ret = false;
1322 break;
1323 }
1324
1325 if (region->type == IOMMU_RESV_SW_MSI) {
1326 *base = region->start;
1327 ret = true;
1328 }
1329 }
1330
1331 return ret;
1332 }
1333
1334 static struct device *vfio_mdev_get_iommu_device(struct device *dev)
1335 {
1336 struct device *(*fn)(struct device *dev);
1337 struct device *iommu_device;
1338
1339 fn = symbol_get(mdev_get_iommu_device);
1340 if (fn) {
1341 iommu_device = fn(dev);
1342 symbol_put(mdev_get_iommu_device);
1343
1344 return iommu_device;
1345 }
1346
1347 return NULL;
1348 }
1349
1350 static int vfio_mdev_attach_domain(struct device *dev, void *data)
1351 {
1352 struct iommu_domain *domain = data;
1353 struct device *iommu_device;
1354
1355 iommu_device = vfio_mdev_get_iommu_device(dev);
1356 if (iommu_device) {
1357 if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX))
1358 return iommu_aux_attach_device(domain, iommu_device);
1359 else
1360 return iommu_attach_device(domain, iommu_device);
1361 }
1362
1363 return -EINVAL;
1364 }
1365
1366 static int vfio_mdev_detach_domain(struct device *dev, void *data)
1367 {
1368 struct iommu_domain *domain = data;
1369 struct device *iommu_device;
1370
1371 iommu_device = vfio_mdev_get_iommu_device(dev);
1372 if (iommu_device) {
1373 if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX))
1374 iommu_aux_detach_device(domain, iommu_device);
1375 else
1376 iommu_detach_device(domain, iommu_device);
1377 }
1378
1379 return 0;
1380 }
1381
1382 static int vfio_iommu_attach_group(struct vfio_domain *domain,
1383 struct vfio_group *group)
1384 {
1385 if (group->mdev_group)
1386 return iommu_group_for_each_dev(group->iommu_group,
1387 domain->domain,
1388 vfio_mdev_attach_domain);
1389 else
1390 return iommu_attach_group(domain->domain, group->iommu_group);
1391 }
1392
1393 static void vfio_iommu_detach_group(struct vfio_domain *domain,
1394 struct vfio_group *group)
1395 {
1396 if (group->mdev_group)
1397 iommu_group_for_each_dev(group->iommu_group, domain->domain,
1398 vfio_mdev_detach_domain);
1399 else
1400 iommu_detach_group(domain->domain, group->iommu_group);
1401 }
1402
1403 static bool vfio_bus_is_mdev(struct bus_type *bus)
1404 {
1405 struct bus_type *mdev_bus;
1406 bool ret = false;
1407
1408 mdev_bus = symbol_get(mdev_bus_type);
1409 if (mdev_bus) {
1410 ret = (bus == mdev_bus);
1411 symbol_put(mdev_bus_type);
1412 }
1413
1414 return ret;
1415 }
1416
1417 static int vfio_mdev_iommu_device(struct device *dev, void *data)
1418 {
1419 struct device **old = data, *new;
1420
1421 new = vfio_mdev_get_iommu_device(dev);
1422 if (!new || (*old && *old != new))
1423 return -EINVAL;
1424
1425 *old = new;
1426
1427 return 0;
1428 }
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
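/*
 * Helpers maintaining iommu->iova_list, the IOVA ranges userspace may
 * map: ranges are clipped to each attached domain's aperture and holes
 * are punched for the groups' reserved regions.  Group attach/detach
 * operates on a copy of the list so a failure leaves the live list
 * untouched.
 */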
1439 static int vfio_iommu_iova_insert(struct list_head *head,
1440 dma_addr_t start, dma_addr_t end)
1441 {
1442 struct vfio_iova *region;
1443
1444 region = kmalloc(sizeof(*region), GFP_KERNEL);
1445 if (!region)
1446 return -ENOMEM;
1447
1448 INIT_LIST_HEAD(&region->list);
1449 region->start = start;
1450 region->end = end;
1451
1452 list_add_tail(&region->list, head);
1453 return 0;
1454 }
1455
1456
1457
1458
1459
1460 static bool vfio_iommu_aper_conflict(struct vfio_iommu *iommu,
1461 dma_addr_t start, dma_addr_t end)
1462 {
1463 struct vfio_iova *first, *last;
1464 struct list_head *iova = &iommu->iova_list;
1465
1466 if (list_empty(iova))
1467 return false;
1468
1469
1470 first = list_first_entry(iova, struct vfio_iova, list);
1471 last = list_last_entry(iova, struct vfio_iova, list);
1472 if (start > last->end || end < first->start)
1473 return true;
1474
1475
1476 if (start > first->start) {
1477 if (vfio_find_dma(iommu, first->start, start - first->start))
1478 return true;
1479 }
1480
1481
1482 if (end < last->end) {
1483 if (vfio_find_dma(iommu, end + 1, last->end - end))
1484 return true;
1485 }
1486
1487 return false;
1488 }
1489
1490
1491
1492
1493
1494 static int vfio_iommu_aper_resize(struct list_head *iova,
1495 dma_addr_t start, dma_addr_t end)
1496 {
1497 struct vfio_iova *node, *next;
1498
1499 if (list_empty(iova))
1500 return vfio_iommu_iova_insert(iova, start, end);
1501
1502
1503 list_for_each_entry_safe(node, next, iova, list) {
1504 if (start < node->start)
1505 break;
1506 if (start >= node->start && start < node->end) {
1507 node->start = start;
1508 break;
1509 }
1510
1511 list_del(&node->list);
1512 kfree(node);
1513 }
1514
1515
1516 list_for_each_entry_safe(node, next, iova, list) {
1517 if (end > node->end)
1518 continue;
1519 if (end > node->start && end <= node->end) {
1520 node->end = end;
1521 continue;
1522 }
1523
1524 list_del(&node->list);
1525 kfree(node);
1526 }
1527
1528 return 0;
1529 }
1530
1531
1532
1533
1534 static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu,
1535 struct list_head *resv_regions)
1536 {
1537 struct iommu_resv_region *region;
1538
1539
1540 list_for_each_entry(region, resv_regions, list) {
1541 if (region->type == IOMMU_RESV_DIRECT_RELAXABLE)
1542 continue;
1543
1544 if (vfio_find_dma(iommu, region->start, region->length))
1545 return true;
1546 }
1547
1548 return false;
1549 }
1550
1551
1552
1553
1554
1555 static int vfio_iommu_resv_exclude(struct list_head *iova,
1556 struct list_head *resv_regions)
1557 {
1558 struct iommu_resv_region *resv;
1559 struct vfio_iova *n, *next;
1560
1561 list_for_each_entry(resv, resv_regions, list) {
1562 phys_addr_t start, end;
1563
1564 if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
1565 continue;
1566
1567 start = resv->start;
1568 end = resv->start + resv->length - 1;
1569
1570 list_for_each_entry_safe(n, next, iova, list) {
1571 int ret = 0;
1572
1573
1574 if (start > n->end || end < n->start)
1575 continue;
1576
1577
1578
1579
1580
1581
1582
1583 if (start > n->start)
1584 ret = vfio_iommu_iova_insert(&n->list, n->start,
1585 start - 1);
1586 if (!ret && end < n->end)
1587 ret = vfio_iommu_iova_insert(&n->list, end + 1,
1588 n->end);
1589 if (ret)
1590 return ret;
1591
1592 list_del(&n->list);
1593 kfree(n);
1594 }
1595 }
1596
1597 if (list_empty(iova))
1598 return -EINVAL;
1599
1600 return 0;
1601 }
1602
1603 static void vfio_iommu_resv_free(struct list_head *resv_regions)
1604 {
1605 struct iommu_resv_region *n, *next;
1606
1607 list_for_each_entry_safe(n, next, resv_regions, list) {
1608 list_del(&n->list);
1609 kfree(n);
1610 }
1611 }
1612
1613 static void vfio_iommu_iova_free(struct list_head *iova)
1614 {
1615 struct vfio_iova *n, *next;
1616
1617 list_for_each_entry_safe(n, next, iova, list) {
1618 list_del(&n->list);
1619 kfree(n);
1620 }
1621 }
1622
1623 static int vfio_iommu_iova_get_copy(struct vfio_iommu *iommu,
1624 struct list_head *iova_copy)
1625 {
1626 struct list_head *iova = &iommu->iova_list;
1627 struct vfio_iova *n;
1628 int ret;
1629
1630 list_for_each_entry(n, iova, list) {
1631 ret = vfio_iommu_iova_insert(iova_copy, n->start, n->end);
1632 if (ret)
1633 goto out_free;
1634 }
1635
1636 return 0;
1637
1638 out_free:
1639 vfio_iommu_iova_free(iova_copy);
1640 return ret;
1641 }
1642
1643 static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
1644 struct list_head *iova_copy)
1645 {
1646 struct list_head *iova = &iommu->iova_list;
1647
1648 vfio_iommu_iova_free(iova);
1649
1650 list_splice_tail(iova_copy, iova);
1651 }
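/*
 * Attach an iommu_group to the container.  mdev groups without a
 * backing IOMMU device are parked on external_domain and only use the
 * pin/unpin interface.  Otherwise a new IOMMU domain is allocated, the
 * aperture and reserved regions are checked against existing mappings,
 * and the group is merged into an existing compatible domain (same ops
 * and protection flags) when possible; a genuinely new domain has all
 * current mappings replayed into it.
 */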
1652 static int vfio_iommu_type1_attach_group(void *iommu_data,
1653 struct iommu_group *iommu_group)
1654 {
1655 struct vfio_iommu *iommu = iommu_data;
1656 struct vfio_group *group;
1657 struct vfio_domain *domain, *d;
1658 struct bus_type *bus = NULL;
1659 int ret;
1660 bool resv_msi, msi_remap;
1661 phys_addr_t resv_msi_base = 0;
1662 struct iommu_domain_geometry geo;
1663 LIST_HEAD(iova_copy);
1664 LIST_HEAD(group_resv_regions);
1665
1666 mutex_lock(&iommu->lock);
1667
1668 list_for_each_entry(d, &iommu->domain_list, next) {
1669 if (find_iommu_group(d, iommu_group)) {
1670 mutex_unlock(&iommu->lock);
1671 return -EINVAL;
1672 }
1673 }
1674
1675 if (iommu->external_domain) {
1676 if (find_iommu_group(iommu->external_domain, iommu_group)) {
1677 mutex_unlock(&iommu->lock);
1678 return -EINVAL;
1679 }
1680 }
1681
1682 group = kzalloc(sizeof(*group), GFP_KERNEL);
1683 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1684 if (!group || !domain) {
1685 ret = -ENOMEM;
1686 goto out_free;
1687 }
1688
1689 group->iommu_group = iommu_group;
1690
1691
1692 ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type);
1693 if (ret)
1694 goto out_free;
1695
1696 if (vfio_bus_is_mdev(bus)) {
1697 struct device *iommu_device = NULL;
1698
1699 group->mdev_group = true;
1700
1701
1702 ret = iommu_group_for_each_dev(iommu_group, &iommu_device,
1703 vfio_mdev_iommu_device);
1704 if (ret || !iommu_device) {
1705 if (!iommu->external_domain) {
1706 INIT_LIST_HEAD(&domain->group_list);
1707 iommu->external_domain = domain;
1708 } else {
1709 kfree(domain);
1710 }
1711
1712 list_add(&group->next,
1713 &iommu->external_domain->group_list);
1714 mutex_unlock(&iommu->lock);
1715
1716 return 0;
1717 }
1718
1719 bus = iommu_device->bus;
1720 }
1721
1722 domain->domain = iommu_domain_alloc(bus);
1723 if (!domain->domain) {
1724 ret = -EIO;
1725 goto out_free;
1726 }
1727
1728 if (iommu->nesting) {
1729 int attr = 1;
1730
1731 ret = iommu_domain_set_attr(domain->domain, DOMAIN_ATTR_NESTING,
1732 &attr);
1733 if (ret)
1734 goto out_domain;
1735 }
1736
1737 ret = vfio_iommu_attach_group(domain, group);
1738 if (ret)
1739 goto out_domain;
1740
1741
1742 iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, &geo);
1743
1744 if (vfio_iommu_aper_conflict(iommu, geo.aperture_start,
1745 geo.aperture_end)) {
1746 ret = -EINVAL;
1747 goto out_detach;
1748 }
1749
1750 ret = iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
1751 if (ret)
1752 goto out_detach;
1753
1754 if (vfio_iommu_resv_conflict(iommu, &group_resv_regions)) {
1755 ret = -EINVAL;
1756 goto out_detach;
1757 }
1758
1759
1760
1761
1762
1763
1764 ret = vfio_iommu_iova_get_copy(iommu, &iova_copy);
1765 if (ret)
1766 goto out_detach;
1767
1768 ret = vfio_iommu_aper_resize(&iova_copy, geo.aperture_start,
1769 geo.aperture_end);
1770 if (ret)
1771 goto out_detach;
1772
1773 ret = vfio_iommu_resv_exclude(&iova_copy, &group_resv_regions);
1774 if (ret)
1775 goto out_detach;
1776
1777 resv_msi = vfio_iommu_has_sw_msi(&group_resv_regions, &resv_msi_base);
1778
1779 INIT_LIST_HEAD(&domain->group_list);
1780 list_add(&group->next, &domain->group_list);
1781
1782 msi_remap = irq_domain_check_msi_remap() ||
1783 iommu_capable(bus, IOMMU_CAP_INTR_REMAP);
1784
1785 if (!allow_unsafe_interrupts && !msi_remap) {
1786 pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
1787 __func__);
1788 ret = -EPERM;
1789 goto out_detach;
1790 }
1791
1792 if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
1793 domain->prot |= IOMMU_CACHE;
1794
1795
1796
1797
1798
1799
1800
1801
1802 list_for_each_entry(d, &iommu->domain_list, next) {
1803 if (d->domain->ops == domain->domain->ops &&
1804 d->prot == domain->prot) {
1805 vfio_iommu_detach_group(domain, group);
1806 if (!vfio_iommu_attach_group(d, group)) {
1807 list_add(&group->next, &d->group_list);
1808 iommu_domain_free(domain->domain);
1809 kfree(domain);
1810 goto done;
1811 }
1812
1813 ret = vfio_iommu_attach_group(domain, group);
1814 if (ret)
1815 goto out_domain;
1816 }
1817 }
1818
1819 vfio_test_domain_fgsp(domain);
1820
1821
1822 ret = vfio_iommu_replay(iommu, domain);
1823 if (ret)
1824 goto out_detach;
1825
1826 if (resv_msi) {
1827 ret = iommu_get_msi_cookie(domain->domain, resv_msi_base);
1828 if (ret)
1829 goto out_detach;
1830 }
1831
1832 list_add(&domain->next, &iommu->domain_list);
1833 done:
1834
1835 vfio_iommu_iova_insert_copy(iommu, &iova_copy);
1836 mutex_unlock(&iommu->lock);
1837 vfio_iommu_resv_free(&group_resv_regions);
1838
1839 return 0;
1840
1841 out_detach:
1842 vfio_iommu_detach_group(domain, group);
1843 out_domain:
1844 iommu_domain_free(domain->domain);
1845 vfio_iommu_iova_free(&iova_copy);
1846 vfio_iommu_resv_free(&group_resv_regions);
1847 out_free:
1848 kfree(domain);
1849 kfree(group);
1850 mutex_unlock(&iommu->lock);
1851 return ret;
1852 }
1853
1854 static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
1855 {
1856 struct rb_node *node;
1857
1858 while ((node = rb_first(&iommu->dma_list)))
1859 vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
1860 }
1861
1862 static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
1863 {
1864 struct rb_node *n, *p;
1865
1866 n = rb_first(&iommu->dma_list);
1867 for (; n; n = rb_next(n)) {
1868 struct vfio_dma *dma;
1869 long locked = 0, unlocked = 0;
1870
1871 dma = rb_entry(n, struct vfio_dma, node);
1872 unlocked += vfio_unmap_unpin(iommu, dma, false);
1873 p = rb_first(&dma->pfn_list);
1874 for (; p; p = rb_next(p)) {
1875 struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
1876 node);
1877
1878 if (!is_invalid_reserved_pfn(vpfn->pfn))
1879 locked++;
1880 }
1881 vfio_lock_acct(dma, locked - unlocked, true);
1882 }
1883 }
1884
1885 static void vfio_sanity_check_pfn_list(struct vfio_iommu *iommu)
1886 {
1887 struct rb_node *n;
1888
1889 n = rb_first(&iommu->dma_list);
1890 for (; n; n = rb_next(n)) {
1891 struct vfio_dma *dma;
1892
1893 dma = rb_entry(n, struct vfio_dma, node);
1894
1895 if (WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list)))
1896 break;
1897 }
1898
1899 WARN_ON(iommu->notifier.head);
1900 }
1901
1902
1903
1904
1905
1906
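/*
 * Called on detach: with a domain gone, the usable aperture may grow
 * again.  Recompute the intersection of the remaining domains'
 * geometries and widen the first/last nodes of the copied iova list.
 */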
1907 static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
1908 struct list_head *iova_copy)
1909 {
1910 struct vfio_domain *domain;
1911 struct iommu_domain_geometry geo;
1912 struct vfio_iova *node;
1913 dma_addr_t start = 0;
1914 dma_addr_t end = (dma_addr_t)~0;
1915
1916 if (list_empty(iova_copy))
1917 return;
1918
1919 list_for_each_entry(domain, &iommu->domain_list, next) {
1920 iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY,
1921 &geo);
1922 if (geo.aperture_start > start)
1923 start = geo.aperture_start;
1924 if (geo.aperture_end < end)
1925 end = geo.aperture_end;
1926 }
1927
1928
1929 node = list_first_entry(iova_copy, struct vfio_iova, list);
1930 node->start = start;
1931 node = list_last_entry(iova_copy, struct vfio_iova, list);
1932 node->end = end;
1933 }
1934
1935
1936
1937
1938
1939
1940
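/*
 * Also on detach: reserved regions may be shared between groups, so
 * the departing group's regions cannot simply be returned.  Rebuild
 * the copied iova list from the remaining domains' aperture and the
 * reserved regions re-collected from every remaining group.
 */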
1941 static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu,
1942 struct list_head *iova_copy)
1943 {
1944 struct vfio_domain *d;
1945 struct vfio_group *g;
1946 struct vfio_iova *node;
1947 dma_addr_t start, end;
1948 LIST_HEAD(resv_regions);
1949 int ret;
1950
1951 if (list_empty(iova_copy))
1952 return -EINVAL;
1953
1954 list_for_each_entry(d, &iommu->domain_list, next) {
1955 list_for_each_entry(g, &d->group_list, next) {
1956 ret = iommu_get_group_resv_regions(g->iommu_group,
1957 &resv_regions);
1958 if (ret)
1959 goto done;
1960 }
1961 }
1962
1963 node = list_first_entry(iova_copy, struct vfio_iova, list);
1964 start = node->start;
1965 node = list_last_entry(iova_copy, struct vfio_iova, list);
1966 end = node->end;
1967
1968
1969 vfio_iommu_iova_free(iova_copy);
1970
1971 ret = vfio_iommu_aper_resize(iova_copy, start, end);
1972 if (ret)
1973 goto done;
1974
1975
1976 ret = vfio_iommu_resv_exclude(iova_copy, &resv_regions);
1977 done:
1978 vfio_iommu_resv_free(&resv_regions);
1979 return ret;
1980 }
1981
1982 static void vfio_iommu_type1_detach_group(void *iommu_data,
1983 struct iommu_group *iommu_group)
1984 {
1985 struct vfio_iommu *iommu = iommu_data;
1986 struct vfio_domain *domain;
1987 struct vfio_group *group;
1988 LIST_HEAD(iova_copy);
1989
1990 mutex_lock(&iommu->lock);
1991
1992 if (iommu->external_domain) {
1993 group = find_iommu_group(iommu->external_domain, iommu_group);
1994 if (group) {
1995 list_del(&group->next);
1996 kfree(group);
1997
1998 if (list_empty(&iommu->external_domain->group_list)) {
1999 vfio_sanity_check_pfn_list(iommu);
2000
2001 if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
2002 vfio_iommu_unmap_unpin_all(iommu);
2003
2004 kfree(iommu->external_domain);
2005 iommu->external_domain = NULL;
2006 }
2007 goto detach_group_done;
2008 }
2009 }
2010
2011
2012
2013
2014
2015
2016 vfio_iommu_iova_get_copy(iommu, &iova_copy);
2017
2018 list_for_each_entry(domain, &iommu->domain_list, next) {
2019 group = find_iommu_group(domain, iommu_group);
2020 if (!group)
2021 continue;
2022
2023 vfio_iommu_detach_group(domain, group);
2024 list_del(&group->next);
2025 kfree(group);
2026
2027
2028
2029
2030
2031
2032
2033 if (list_empty(&domain->group_list)) {
2034 if (list_is_singular(&iommu->domain_list)) {
2035 if (!iommu->external_domain)
2036 vfio_iommu_unmap_unpin_all(iommu);
2037 else
2038 vfio_iommu_unmap_unpin_reaccount(iommu);
2039 }
2040 iommu_domain_free(domain->domain);
2041 list_del(&domain->next);
2042 kfree(domain);
2043 vfio_iommu_aper_expand(iommu, &iova_copy);
2044 }
2045 break;
2046 }
2047
2048 if (!vfio_iommu_resv_refresh(iommu, &iova_copy))
2049 vfio_iommu_iova_insert_copy(iommu, &iova_copy);
2050 else
2051 vfio_iommu_iova_free(&iova_copy);
2052
2053 detach_group_done:
2054 mutex_unlock(&iommu->lock);
2055 }
2056
2057 static void *vfio_iommu_type1_open(unsigned long arg)
2058 {
2059 struct vfio_iommu *iommu;
2060
2061 iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
2062 if (!iommu)
2063 return ERR_PTR(-ENOMEM);
2064
2065 switch (arg) {
2066 case VFIO_TYPE1_IOMMU:
2067 break;
2068 case VFIO_TYPE1_NESTING_IOMMU:
2069 iommu->nesting = true;
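/* fall through -- a nesting IOMMU uses the v2 semantics below */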
2070
2071 case VFIO_TYPE1v2_IOMMU:
2072 iommu->v2 = true;
2073 break;
2074 default:
2075 kfree(iommu);
2076 return ERR_PTR(-EINVAL);
2077 }
2078
2079 INIT_LIST_HEAD(&iommu->domain_list);
2080 INIT_LIST_HEAD(&iommu->iova_list);
2081 iommu->dma_list = RB_ROOT;
2082 iommu->dma_avail = dma_entry_limit;
2083 mutex_init(&iommu->lock);
2084 BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
2085
2086 return iommu;
2087 }
2088
2089 static void vfio_release_domain(struct vfio_domain *domain, bool external)
2090 {
2091 struct vfio_group *group, *group_tmp;
2092
2093 list_for_each_entry_safe(group, group_tmp,
2094 &domain->group_list, next) {
2095 if (!external)
2096 vfio_iommu_detach_group(domain, group);
2097 list_del(&group->next);
2098 kfree(group);
2099 }
2100
2101 if (!external)
2102 iommu_domain_free(domain->domain);
2103 }
2104
2105 static void vfio_iommu_type1_release(void *iommu_data)
2106 {
2107 struct vfio_iommu *iommu = iommu_data;
2108 struct vfio_domain *domain, *domain_tmp;
2109
2110 if (iommu->external_domain) {
2111 vfio_release_domain(iommu->external_domain, true);
2112 vfio_sanity_check_pfn_list(iommu);
2113 kfree(iommu->external_domain);
2114 }
2115
2116 vfio_iommu_unmap_unpin_all(iommu);
2117
2118 list_for_each_entry_safe(domain, domain_tmp,
2119 &iommu->domain_list, next) {
2120 vfio_release_domain(domain, false);
2121 list_del(&domain->next);
2122 kfree(domain);
2123 }
2124
2125 vfio_iommu_iova_free(&iommu->iova_list);
2126
2127 kfree(iommu);
2128 }
2129
2130 static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
2131 {
2132 struct vfio_domain *domain;
2133 int ret = 1;
2134
2135 mutex_lock(&iommu->lock);
2136 list_for_each_entry(domain, &iommu->domain_list, next) {
2137 if (!(domain->prot & IOMMU_CACHE)) {
2138 ret = 0;
2139 break;
2140 }
2141 }
2142 mutex_unlock(&iommu->lock);
2143
2144 return ret;
2145 }
2146
2147 static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
2148 struct vfio_iommu_type1_info_cap_iova_range *cap_iovas,
2149 size_t size)
2150 {
2151 struct vfio_info_cap_header *header;
2152 struct vfio_iommu_type1_info_cap_iova_range *iova_cap;
2153
2154 header = vfio_info_cap_add(caps, size,
2155 VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
2156 if (IS_ERR(header))
2157 return PTR_ERR(header);
2158
2159 iova_cap = container_of(header,
2160 struct vfio_iommu_type1_info_cap_iova_range,
2161 header);
2162 iova_cap->nr_iovas = cap_iovas->nr_iovas;
2163 memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges,
2164 cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges));
2165 return 0;
2166 }
2167
2168 static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
2169 struct vfio_info_cap *caps)
2170 {
2171 struct vfio_iommu_type1_info_cap_iova_range *cap_iovas;
2172 struct vfio_iova *iova;
2173 size_t size;
2174 int iovas = 0, i = 0, ret;
2175
2176 mutex_lock(&iommu->lock);
2177
2178 list_for_each_entry(iova, &iommu->iova_list, list)
2179 iovas++;
2180
2181 if (!iovas) {
2182
2183
2184
2185
2186 ret = 0;
2187 goto out_unlock;
2188 }
2189
2190 size = sizeof(*cap_iovas) + (iovas * sizeof(*cap_iovas->iova_ranges));
2191
2192 cap_iovas = kzalloc(size, GFP_KERNEL);
2193 if (!cap_iovas) {
2194 ret = -ENOMEM;
2195 goto out_unlock;
2196 }
2197
2198 cap_iovas->nr_iovas = iovas;
2199
2200 list_for_each_entry(iova, &iommu->iova_list, list) {
2201 cap_iovas->iova_ranges[i].start = iova->start;
2202 cap_iovas->iova_ranges[i].end = iova->end;
2203 i++;
2204 }
2205
2206 ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size);
2207
2208 kfree(cap_iovas);
2209 out_unlock:
2210 mutex_unlock(&iommu->lock);
2211 return ret;
2212 }
2213
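/*
 * ioctl entry point.  A minimal userspace sketch of the MAP/UNMAP path
 * handled below (illustrative only: group setup abbreviated, error
 * handling omitted; buf, len and the IOVA are placeholder values and
 * must be page aligned):
 *
 *	int container = open("/dev/vfio/vfio", O_RDWR);
 *	ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU);
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)(unsigned long)buf,
 *		.iova  = 0x100000,
 *		.size  = len,
 *	};
 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 *
 *	struct vfio_iommu_type1_dma_unmap unmap = {
 *		.argsz = sizeof(unmap),
 *		.iova  = 0x100000,
 *		.size  = len,
 *	};
 *	ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap);
 */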
2214 static long vfio_iommu_type1_ioctl(void *iommu_data,
2215 unsigned int cmd, unsigned long arg)
2216 {
2217 struct vfio_iommu *iommu = iommu_data;
2218 unsigned long minsz;
2219
2220 if (cmd == VFIO_CHECK_EXTENSION) {
2221 switch (arg) {
2222 case VFIO_TYPE1_IOMMU:
2223 case VFIO_TYPE1v2_IOMMU:
2224 case VFIO_TYPE1_NESTING_IOMMU:
2225 return 1;
2226 case VFIO_DMA_CC_IOMMU:
2227 if (!iommu)
2228 return 0;
2229 return vfio_domains_have_iommu_cache(iommu);
2230 default:
2231 return 0;
2232 }
2233 } else if (cmd == VFIO_IOMMU_GET_INFO) {
2234 struct vfio_iommu_type1_info info;
2235 struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
2236 unsigned long capsz;
2237 int ret;
2238
2239 minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
2240
2241
2242 capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
2243
2244 if (copy_from_user(&info, (void __user *)arg, minsz))
2245 return -EFAULT;
2246
2247 if (info.argsz < minsz)
2248 return -EINVAL;
2249
2250 if (info.argsz >= capsz) {
2251 minsz = capsz;
2252 info.cap_offset = 0;
2253 }
2254
2255 info.flags = VFIO_IOMMU_INFO_PGSIZES;
2256
2257 info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
2258
2259 ret = vfio_iommu_iova_build_caps(iommu, &caps);
2260 if (ret)
2261 return ret;
2262
2263 if (caps.size) {
2264 info.flags |= VFIO_IOMMU_INFO_CAPS;
2265
2266 if (info.argsz < sizeof(info) + caps.size) {
2267 info.argsz = sizeof(info) + caps.size;
2268 } else {
2269 vfio_info_cap_shift(&caps, sizeof(info));
2270 if (copy_to_user((void __user *)arg +
2271 sizeof(info), caps.buf,
2272 caps.size)) {
2273 kfree(caps.buf);
2274 return -EFAULT;
2275 }
2276 info.cap_offset = sizeof(info);
2277 }
2278
2279 kfree(caps.buf);
2280 }
2281
2282 return copy_to_user((void __user *)arg, &info, minsz) ?
2283 -EFAULT : 0;
2284
2285 } else if (cmd == VFIO_IOMMU_MAP_DMA) {
2286 struct vfio_iommu_type1_dma_map map;
2287 uint32_t mask = VFIO_DMA_MAP_FLAG_READ |
2288 VFIO_DMA_MAP_FLAG_WRITE;
2289
2290 minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
2291
2292 if (copy_from_user(&map, (void __user *)arg, minsz))
2293 return -EFAULT;
2294
2295 if (map.argsz < minsz || map.flags & ~mask)
2296 return -EINVAL;
2297
2298 return vfio_dma_do_map(iommu, &map);
2299
2300 } else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
2301 struct vfio_iommu_type1_dma_unmap unmap;
2302 long ret;
2303
2304 minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
2305
2306 if (copy_from_user(&unmap, (void __user *)arg, minsz))
2307 return -EFAULT;
2308
2309 if (unmap.argsz < minsz || unmap.flags)
2310 return -EINVAL;
2311
2312 ret = vfio_dma_do_unmap(iommu, &unmap);
2313 if (ret)
2314 return ret;
2315
2316 return copy_to_user((void __user *)arg, &unmap, minsz) ?
2317 -EFAULT : 0;
2318 }
2319
2320 return -ENOTTY;
2321 }
2322
2323 static int vfio_iommu_type1_register_notifier(void *iommu_data,
2324 unsigned long *events,
2325 struct notifier_block *nb)
2326 {
2327 struct vfio_iommu *iommu = iommu_data;
2328
2329
2330 *events &= ~VFIO_IOMMU_NOTIFY_DMA_UNMAP;
2331
2332
2333 if (*events)
2334 return -EINVAL;
2335
2336 return blocking_notifier_chain_register(&iommu->notifier, nb);
2337 }
2338
2339 static int vfio_iommu_type1_unregister_notifier(void *iommu_data,
2340 struct notifier_block *nb)
2341 {
2342 struct vfio_iommu *iommu = iommu_data;
2343
2344 return blocking_notifier_chain_unregister(&iommu->notifier, nb);
2345 }
2346
2347 static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
2348 .name = "vfio-iommu-type1",
2349 .owner = THIS_MODULE,
2350 .open = vfio_iommu_type1_open,
2351 .release = vfio_iommu_type1_release,
2352 .ioctl = vfio_iommu_type1_ioctl,
2353 .attach_group = vfio_iommu_type1_attach_group,
2354 .detach_group = vfio_iommu_type1_detach_group,
2355 .pin_pages = vfio_iommu_type1_pin_pages,
2356 .unpin_pages = vfio_iommu_type1_unpin_pages,
2357 .register_notifier = vfio_iommu_type1_register_notifier,
2358 .unregister_notifier = vfio_iommu_type1_unregister_notifier,
2359 };
2360
2361 static int __init vfio_iommu_type1_init(void)
2362 {
2363 return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
2364 }
2365
2366 static void __exit vfio_iommu_type1_cleanup(void)
2367 {
2368 vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
2369 }
2370
2371 module_init(vfio_iommu_type1_init);
2372 module_exit(vfio_iommu_type1_cleanup);
2373
2374 MODULE_VERSION(DRIVER_VERSION);
2375 MODULE_LICENSE("GPL v2");
2376 MODULE_AUTHOR(DRIVER_AUTHOR);
2377 MODULE_DESCRIPTION(DRIVER_DESC);