/*
 * VFIO: IOMMU DMA mapping support for TCE on POWER
 *
 * Copyright (C) 2013 IBM Corp.  All rights reserved.
 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio_iommu_type1.c:
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 */

#include <linux/module.h>
#include <linux/pci.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/err.h>
#include <linux/vfio.h>
#include <linux/vmalloc.h>
#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/mmu_context.h>

#define DRIVER_VERSION  "0.1"
#define DRIVER_AUTHOR   "aik@ozlabs.ru"
#define DRIVER_DESC     "VFIO IOMMU SPAPR TCE"

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group);

static long try_increment_locked_vm(long npages)
{
	long ret = 0, locked, lock_limit;

	if (!current || !current->mm)
		return -ESRCH; /* process exited */

	if (!npages)
		return 0;

	down_write(&current->mm->mmap_sem);
	locked = current->mm->locked_vm + npages;
	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
		ret = -ENOMEM;
	else
		current->mm->locked_vm += npages;

	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
			npages << PAGE_SHIFT,
			current->mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK),
			ret ? " - exceeded" : "");

	up_write(&current->mm->mmap_sem);

	return ret;
}

static void decrement_locked_vm(long npages)
{
	if (!current || !current->mm || !npages)
		return; /* process exited */

	down_write(&current->mm->mmap_sem);
	if (WARN_ON_ONCE(npages > current->mm->locked_vm))
		npages = current->mm->locked_vm;
	current->mm->locked_vm -= npages;
	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
			npages << PAGE_SHIFT,
			current->mm->locked_vm << PAGE_SHIFT,
			rlimit(RLIMIT_MEMLOCK));
	up_write(&current->mm->mmap_sem);
}

/*
 * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
 *
 * This code handles mapping and unmapping of user data buffers
 * into DMA'ble space using the IOMMU
 */
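
/*
 * A rough sketch of how userspace typically drives this driver; the exact
 * sequence depends on the userspace implementation and the paths and
 * variables below are illustrative, not prescriptive:
 *
 *	container = open("/dev/vfio/vfio", O_RDWR);
 *	group = open("/dev/vfio/$GROUP", O_RDWR);
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_IOMMU);
 *	ioctl(container, VFIO_IOMMU_ENABLE);	// v1 only, accounts locked_vm
 *
 *	struct vfio_iommu_type1_dma_map map = {
 *		.argsz = sizeof(map),
 *		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
 *		.vaddr = (__u64)buf,	// aligned to the IOMMU page size
 *		.iova  = 0,		// must fall within a DMA window
 *		.size  = bufsize,	// multiple of the IOMMU page size
 *	};
 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 */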

struct tce_iommu_group {
	struct list_head next;
	struct iommu_group *grp;
};

/*
 * The container descriptor keeps a list of attached IOMMU groups which all
 * share the same set of TCE tables. The container is created without any
 * group (the API does not supply one at initialization time); groups are
 * attached to the list later.
 */
struct tce_container {
	struct mutex lock;
	bool enabled;
	bool v2;
	unsigned long locked_pages;
	struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
	struct list_head group_list;
};

static long tce_iommu_unregister_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	struct mm_iommu_table_group_mem_t *mem;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
		return -EINVAL;

	mem = mm_iommu_find(vaddr, size >> PAGE_SHIFT);
	if (!mem)
		return -ENOENT;

	return mm_iommu_put(mem);
}

static long tce_iommu_register_pages(struct tce_container *container,
		__u64 vaddr, __u64 size)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem = NULL;
	unsigned long entries = size >> PAGE_SHIFT;

	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
			((vaddr + size) < vaddr))
		return -EINVAL;

	ret = mm_iommu_get(vaddr, entries, &mem);
	if (ret)
		return ret;

	container->enabled = true;

	return 0;
}
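
/*
 * For the v2 interface, userspace is expected to preregister guest RAM with
 * VFIO_IOMMU_SPAPR_REGISTER_MEMORY before mapping it: mm_iommu_get() above
 * takes the reference once, so the per-TCE map path (tce_iommu_build_v2())
 * only translates addresses via tce_iommu_prereg_ua_to_hpa() instead of
 * pinning pages itself. A minimal sketch (field values are illustrative):
 *
 *	struct vfio_iommu_spapr_register_memory reg = {
 *		.argsz = sizeof(reg),
 *		.flags = 0,
 *		.vaddr = (__u64)guest_ram,	// page aligned
 *		.size  = guest_ram_size,	// page aligned
 *	};
 *	ioctl(container, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
 *
 * VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY takes the same structure and drops the
 * reference taken here.
 */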

static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl)
{
	unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
			tbl->it_size, PAGE_SIZE);
	unsigned long *uas;
	long ret;

	BUG_ON(tbl->it_userspace);

	ret = try_increment_locked_vm(cb >> PAGE_SHIFT);
	if (ret)
		return ret;

	uas = vzalloc(cb);
	if (!uas) {
		decrement_locked_vm(cb >> PAGE_SHIFT);
		return -ENOMEM;
	}
	tbl->it_userspace = uas;

	return 0;
}

static void tce_iommu_userspace_view_free(struct iommu_table *tbl)
{
	unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) *
			tbl->it_size, PAGE_SIZE);

	if (!tbl->it_userspace)
		return;

	vfree(tbl->it_userspace);
	tbl->it_userspace = NULL;
	decrement_locked_vm(cb >> PAGE_SHIFT);
}

static bool tce_page_is_contained(struct page *page, unsigned page_shift)
{
	/*
	 * Check that the TCE table granularity is not bigger than the size of
	 * a page we just found. Otherwise the hardware can get access to
	 * a bigger memory chunk than it should.
	 */
	return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
}
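
/*
 * For example, with 4K system pages (PAGE_SHIFT = 12), a non-compound page
 * (compound order 0) passes the check above for a 4K TCE page size
 * (page_shift = 12) but fails it for a 64K TCE page size (page_shift = 16);
 * the latter needs at least an order-4 compound (e.g. huge) page behind it.
 * The numbers are just a worked example of the comparison above.
 */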

static inline bool tce_groups_attached(struct tce_container *container)
{
	return !list_empty(&container->group_list);
}

static long tce_iommu_find_table(struct tce_container *container,
		phys_addr_t ioba, struct iommu_table **ptbl)
{
	long i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (tbl) {
			unsigned long entry = ioba >> tbl->it_page_shift;
			unsigned long start = tbl->it_offset;
			unsigned long end = start + tbl->it_size;

			if ((start <= entry) && (entry < end)) {
				*ptbl = tbl;
				return i;
			}
		}
	}

	return -1;
}
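
/*
 * As a worked example of the lookup above: a default 32-bit window with
 * it_offset = 0, it_size = 0x80000 and it_page_shift = 12 covers
 * ioba 0..0x7fffffff, so ioba 0x10000000 selects that table (entry 0x10000
 * falls inside [0, 0x80000)). The numbers are illustrative only.
 */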

static int tce_iommu_find_free_table(struct tce_container *container)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		if (!container->tables[i])
			return i;
	}

	return -ENOSPC;
}

static int tce_iommu_enable(struct tce_container *container)
{
	int ret = 0;
	unsigned long locked;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp;

	if (!current->mm)
		return -ESRCH; /* process exited */

	if (container->enabled)
		return -EBUSY;

	/*
	 * When userspace pages are mapped into the IOMMU, they are effectively
	 * locked memory, so, theoretically, we need to update the accounting
	 * of locked pages on each map and unmap.  For powerpc, the map/unmap
	 * paths can be very hot, though, and the accounting would kill
	 * performance, especially since it would be difficult or impossible
	 * to handle the accounting in real mode only.
	 *
	 * To address that, rather than precisely accounting every page, we
	 * instead account for a worst case on locked memory when the iommu is
	 * enabled and disabled.  The worst case upper bound on locked memory
	 * is the size of the whole iommu window, which is usually relatively
	 * small (compared to total memory sizes) on POWER hardware.
	 *
	 * Also, we do not have a nice way to fail on H_PUT_TCE due to ulimits;
	 * that would effectively kill the guest at random points, so it is
	 * much better to enforce the limit based on the maximum that the
	 * guest can map.
	 *
	 * Unfortunately at the moment it counts whole tables, no matter how
	 * much memory the guest has. I.e. for a 4GB guest and 4 IOMMU groups,
	 * each with a 2GB DMA window, 8GB will be counted here. The reason for
	 * this is that we cannot tell here the amount of RAM used by the guest
	 * as this information is only available from KVM and VFIO is
	 * KVM agnostic.
	 *
	 * So we do not allow enabling a container without a group attached
	 * as there is no way to know how much we should increment
	 * the locked_vm counter.
	 */
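	/*
	 * Worked example of the accounting above, assuming a 2GB default
	 * 32-bit window and 64K system pages: locked = 0x80000000 >> 16 =
	 * 32768 pages are charged against RLIMIT_MEMLOCK when the container
	 * is enabled and credited back when it is disabled, regardless of
	 * how much of the window the guest actually maps.
	 */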
	if (!tce_groups_attached(container))
		return -ENODEV;

	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -ENODEV;

	if (!table_group->tce32_size)
		return -EPERM;

	locked = table_group->tce32_size >> PAGE_SHIFT;
	ret = try_increment_locked_vm(locked);
	if (ret)
		return ret;

	container->locked_pages = locked;

	container->enabled = true;

	return ret;
}

static void tce_iommu_disable(struct tce_container *container)
{
	if (!container->enabled)
		return;

	container->enabled = false;

	if (!current->mm)
		return;

	decrement_locked_vm(container->locked_pages);
}

static void *tce_iommu_open(unsigned long arg)
{
	struct tce_container *container;

	if ((arg != VFIO_SPAPR_TCE_IOMMU) && (arg != VFIO_SPAPR_TCE_v2_IOMMU)) {
		pr_err("tce_vfio: Wrong IOMMU type\n");
		return ERR_PTR(-EINVAL);
	}

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return ERR_PTR(-ENOMEM);

	mutex_init(&container->lock);
	INIT_LIST_HEAD_RCU(&container->group_list);

	container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU;

	return container;
}

static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages);
static void tce_iommu_free_table(struct iommu_table *tbl);

static void tce_iommu_release(void *iommu_data)
{
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp;
	long i;

	while (tce_groups_attached(container)) {
		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		table_group = iommu_group_get_iommudata(tcegrp->grp);
		tce_iommu_detach_group(iommu_data, tcegrp->grp);
	}

	/*
	 * If VFIO created a table, it was not disposed
	 * by tce_iommu_detach_group() so do it now.
	 */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		tce_iommu_free_table(tbl);
	}

	tce_iommu_disable(container);
	mutex_destroy(&container->lock);

	kfree(container);
}

static void tce_iommu_unuse_page(struct tce_container *container,
		unsigned long hpa)
{
	struct page *page;

	page = pfn_to_page(hpa >> PAGE_SHIFT);
	put_page(page);
}

static int tce_iommu_prereg_ua_to_hpa(unsigned long tce, unsigned long size,
		unsigned long *phpa, struct mm_iommu_table_group_mem_t **pmem)
{
	long ret = 0;
	struct mm_iommu_table_group_mem_t *mem;

	mem = mm_iommu_lookup(tce, size);
	if (!mem)
		return -EINVAL;

	ret = mm_iommu_ua_to_hpa(mem, tce, phpa);
	if (ret)
		return -EINVAL;

	*pmem = mem;

	return 0;
}

static void tce_iommu_unuse_page_v2(struct iommu_table *tbl,
		unsigned long entry)
{
	struct mm_iommu_table_group_mem_t *mem = NULL;
	int ret;
	unsigned long hpa = 0;
	unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);

	if (!pua || !current || !current->mm)
		return;

	ret = tce_iommu_prereg_ua_to_hpa(*pua, IOMMU_PAGE_SIZE(tbl),
			&hpa, &mem);
	if (ret)
		pr_debug("%s: tce %lx at #%lx was not cached, ret=%d\n",
				__func__, *pua, entry, ret);
	if (mem)
		mm_iommu_mapped_dec(mem);

	*pua = 0;
}

static int tce_iommu_clear(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long pages)
{
	unsigned long oldhpa;
	long ret;
	enum dma_data_direction direction;

	for ( ; pages; --pages, ++entry) {
		direction = DMA_NONE;
		oldhpa = 0;
		ret = iommu_tce_xchg(tbl, entry, &oldhpa, &direction);
		if (ret)
			continue;

		if (direction == DMA_NONE)
			continue;

		if (container->v2) {
			tce_iommu_unuse_page_v2(tbl, entry);
			continue;
		}

		tce_iommu_unuse_page(container, oldhpa);
	}

	return 0;
}

static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
{
	struct page *page = NULL;
	enum dma_data_direction direction = iommu_tce_direction(tce);

	if (get_user_pages_fast(tce & PAGE_MASK, 1,
			direction != DMA_TO_DEVICE, &page) != 1)
		return -EFAULT;

	*hpa = __pa((unsigned long) page_address(page));

	return 0;
}
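
/*
 * Note on the pinning above: iommu_tce_direction() derives the DMA direction
 * from the TCE permission bits, and the page is pinned with write access
 * whenever the direction is anything other than DMA_TO_DEVICE (i.e. whenever
 * the device may write to it). For example, a TCE carrying both read and
 * write permissions yields DMA_BIDIRECTIONAL and therefore a writable pin.
 */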

static long tce_iommu_build(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	struct page *page;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;

		ret = tce_iommu_use_page(tce, &hpa);
		if (ret)
			break;

		page = pfn_to_page(hpa >> PAGE_SHIFT);
		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		hpa |= offset;
		dirtmp = direction;
		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
		if (ret) {
			tce_iommu_unuse_page(container, hpa);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page(container, hpa);

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);

	return ret;
}

static long tce_iommu_build_v2(struct tce_container *container,
		struct iommu_table *tbl,
		unsigned long entry, unsigned long tce, unsigned long pages,
		enum dma_data_direction direction)
{
	long i, ret = 0;
	struct page *page;
	unsigned long hpa;
	enum dma_data_direction dirtmp;

	for (i = 0; i < pages; ++i) {
		struct mm_iommu_table_group_mem_t *mem = NULL;
		unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl,
				entry + i);

		ret = tce_iommu_prereg_ua_to_hpa(tce, IOMMU_PAGE_SIZE(tbl),
				&hpa, &mem);
		if (ret)
			break;

		page = pfn_to_page(hpa >> PAGE_SHIFT);
		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
			ret = -EPERM;
			break;
		}

		/* Preserve offset within IOMMU page */
		hpa |= tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
		dirtmp = direction;

		/* The registered region is being unregistered */
		if (mm_iommu_mapped_inc(mem))
			break;

		ret = iommu_tce_xchg(tbl, entry + i, &hpa, &dirtmp);
		if (ret) {
			/* dirtmp cannot be DMA_NONE here */
			tce_iommu_unuse_page_v2(tbl, entry + i);
			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
					__func__, entry << tbl->it_page_shift,
					tce, ret);
			break;
		}

		if (dirtmp != DMA_NONE)
			tce_iommu_unuse_page_v2(tbl, entry + i);

		*pua = tce;

		tce += IOMMU_PAGE_SIZE(tbl);
	}

	if (ret)
		tce_iommu_clear(container, tbl, entry, i);

	return ret;
}

static long tce_iommu_create_table(struct tce_container *container,
			struct iommu_table_group *table_group,
			int num,
			__u32 page_shift,
			__u64 window_size,
			__u32 levels,
			struct iommu_table **ptbl)
{
	long ret, table_size;

	table_size = table_group->ops->get_table_size(page_shift, window_size,
			levels);
	if (!table_size)
		return -EINVAL;

	ret = try_increment_locked_vm(table_size >> PAGE_SHIFT);
	if (ret)
		return ret;

	ret = table_group->ops->create_table(table_group, num,
			page_shift, window_size, levels, ptbl);

	WARN_ON(!ret && !(*ptbl)->it_ops->free);
	WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size));

	if (!ret && container->v2) {
		ret = tce_iommu_userspace_view_alloc(*ptbl);
		if (ret)
			(*ptbl)->it_ops->free(*ptbl);
	}

	if (ret)
		decrement_locked_vm(table_size >> PAGE_SHIFT);

	return ret;
}

static void tce_iommu_free_table(struct iommu_table *tbl)
{
	unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;

	tce_iommu_userspace_view_free(tbl);
	tbl->it_ops->free(tbl);
	decrement_locked_vm(pages);
}

static long tce_iommu_create_window(struct tce_container *container,
		__u32 page_shift, __u64 window_size, __u32 levels,
		__u64 *start_addr)
{
	struct tce_iommu_group *tcegrp;
	struct iommu_table_group *table_group;
	struct iommu_table *tbl = NULL;
	long ret, num;

	num = tce_iommu_find_free_table(container);
	if (num < 0)
		return num;

	/* Get the first group for ops::create_table */
	tcegrp = list_first_entry(&container->group_list,
			struct tce_iommu_group, next);
	table_group = iommu_group_get_iommudata(tcegrp->grp);
	if (!table_group)
		return -EFAULT;

	if (!(table_group->pgsizes & (1ULL << page_shift)))
		return -EINVAL;

	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
			!table_group->ops->get_table_size ||
			!table_group->ops->create_table)
		return -EPERM;

	/* Create TCE table */
	ret = tce_iommu_create_table(container, table_group, num,
			page_shift, window_size, levels, &tbl);
	if (ret)
		return ret;

	BUG_ON(!tbl->it_ops->free);

	/*
	 * Program the table to every group.
	 * Groups have been tested for compatibility at attach time.
	 */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		ret = table_group->ops->set_window(table_group, num, tbl);
		if (ret)
			goto unset_exit;
	}

	container->tables[num] = tbl;

	/* Return start address assigned by platform in create_table() */
	*start_addr = tbl->it_offset << tbl->it_page_shift;

	return 0;

unset_exit:
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);
		table_group->ops->unset_window(table_group, num);
	}
	tce_iommu_free_table(tbl);

	return ret;
}
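
/*
 * How userspace typically requests an additional (DDW) window via the v2
 * interface; the 64K page size and 1GB window below are only an example:
 *
 *	struct vfio_iommu_spapr_tce_create create = {
 *		.argsz = sizeof(create),
 *		.page_shift = 16,		// 64K IOMMU pages
 *		.window_size = 1ULL << 30,	// 1GB window
 *		.levels = 1,
 *	};
 *	ioctl(container, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
 *	// create.start_addr now holds the bus address of the new window
 *
 * The window can later be removed with VFIO_IOMMU_SPAPR_TCE_REMOVE, passing
 * the same start_addr in struct vfio_iommu_spapr_tce_remove.
 */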

static long tce_iommu_remove_window(struct tce_container *container,
		__u64 start_addr)
{
	struct iommu_table_group *table_group = NULL;
	struct iommu_table *tbl;
	struct tce_iommu_group *tcegrp;
	int num;

	num = tce_iommu_find_table(container, start_addr, &tbl);
	if (num < 0)
		return -EINVAL;

	BUG_ON(!tbl->it_size);

	/* Detach groups from IOMMUs */
	list_for_each_entry(tcegrp, &container->group_list, next) {
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		/*
		 * SPAPR TCE IOMMU exposes the default DMA window to
		 * the guest via dma32_window_start/size of
		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
		 * userspace to remove this window, some do not, so here
		 * we check for the platform capability.
		 */
		if (!table_group->ops || !table_group->ops->unset_window)
			return -EPERM;

		table_group->ops->unset_window(table_group, num);
	}

	/* Free table */
	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
	tce_iommu_free_table(tbl);
	container->tables[num] = NULL;

	return 0;
}

static long tce_iommu_ioctl(void *iommu_data,
				 unsigned int cmd, unsigned long arg)
{
	struct tce_container *container = iommu_data;
	unsigned long minsz, ddwsz;
	long ret;

	switch (cmd) {
	case VFIO_CHECK_EXTENSION:
		switch (arg) {
		case VFIO_SPAPR_TCE_IOMMU:
		case VFIO_SPAPR_TCE_v2_IOMMU:
			ret = 1;
			break;
		default:
			ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
			break;
		}

		return (ret < 0) ? 0 : ret;

	case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
		struct vfio_iommu_spapr_tce_info info;
		struct tce_iommu_group *tcegrp;
		struct iommu_table_group *table_group;

		if (!tce_groups_attached(container))
			return -ENXIO;

		tcegrp = list_first_entry(&container->group_list,
				struct tce_iommu_group, next);
		table_group = iommu_group_get_iommudata(tcegrp->grp);

		if (!table_group)
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
				dma32_window_size);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.dma32_window_start = table_group->tce32_start;
		info.dma32_window_size = table_group->tce32_size;
		info.flags = 0;
		memset(&info.ddw, 0, sizeof(info.ddw));

		if (table_group->max_dynamic_windows_supported &&
				container->v2) {
			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
			info.ddw.pgsizes = table_group->pgsizes;
			info.ddw.max_dynamic_windows_supported =
				table_group->max_dynamic_windows_supported;
			info.ddw.levels = table_group->max_levels;
		}

		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);

		if (info.argsz >= ddwsz)
			minsz = ddwsz;

		if (copy_to_user((void __user *)arg, &info, minsz))
			return -EFAULT;

		return 0;
	}
	case VFIO_IOMMU_MAP_DMA: {
		struct vfio_iommu_type1_dma_map param;
		struct iommu_table *tbl = NULL;
		long num;
		enum dma_data_direction direction;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		if (param.flags & ~(VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE))
			return -EINVAL;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
			return -EINVAL;

		/* iova is checked by the IOMMU API */
		if (param.flags & VFIO_DMA_MAP_FLAG_READ) {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_BIDIRECTIONAL;
			else
				direction = DMA_TO_DEVICE;
		} else {
			if (param.flags & VFIO_DMA_MAP_FLAG_WRITE)
				direction = DMA_FROM_DEVICE;
			else
				return -EINVAL;
		}

		ret = iommu_tce_put_param_check(tbl, param.iova, param.vaddr);
		if (ret)
			return ret;

		if (container->v2)
			ret = tce_iommu_build_v2(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);
		else
			ret = tce_iommu_build(container, tbl,
					param.iova >> tbl->it_page_shift,
					param.vaddr,
					param.size >> tbl->it_page_shift,
					direction);

		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_UNMAP_DMA: {
		struct vfio_iommu_type1_dma_unmap param;
		struct iommu_table *tbl = NULL;
		long num;

		if (!container->enabled)
			return -EPERM;

		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		num = tce_iommu_find_table(container, param.iova, &tbl);
		if (num < 0)
			return -ENXIO;

		if (param.size & ~IOMMU_PAGE_MASK(tbl))
			return -EINVAL;

		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
				param.size >> tbl->it_page_shift);
		if (ret)
			return ret;

		ret = tce_iommu_clear(container, tbl,
				param.iova >> tbl->it_page_shift,
				param.size >> tbl->it_page_shift);
		iommu_flush_tce(tbl);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_register_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
		struct vfio_iommu_spapr_register_memory param;

		if (!container->v2)
			break;

		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
				size);

		if (copy_from_user(&param, (void __user *)arg, minsz))
			return -EFAULT;

		if (param.argsz < minsz)
			return -EINVAL;

		/* No flag is supported now */
		if (param.flags)
			return -EINVAL;

		mutex_lock(&container->lock);
		ret = tce_iommu_unregister_pages(container, param.vaddr,
				param.size);
		mutex_unlock(&container->lock);

		return ret;
	}
	case VFIO_IOMMU_ENABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		ret = tce_iommu_enable(container);
		mutex_unlock(&container->lock);
		return ret;

	case VFIO_IOMMU_DISABLE:
		if (container->v2)
			break;

		mutex_lock(&container->lock);
		tce_iommu_disable(container);
		mutex_unlock(&container->lock);
		return 0;

	case VFIO_EEH_PE_OP: {
		struct tce_iommu_group *tcegrp;

		ret = 0;
		list_for_each_entry(tcegrp, &container->group_list, next) {
			ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
					cmd, arg);
			if (ret)
				return ret;
		}
		return ret;
	}

	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
		struct vfio_iommu_spapr_tce_create create;

		if (!container->v2)
			break;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
				start_addr);

		if (copy_from_user(&create, (void __user *)arg, minsz))
			return -EFAULT;

		if (create.argsz < minsz)
			return -EINVAL;

		if (create.flags)
			return -EINVAL;

		mutex_lock(&container->lock);

		ret = tce_iommu_create_window(container, create.page_shift,
				create.window_size, create.levels,
				&create.start_addr);

		mutex_unlock(&container->lock);

		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
			ret = -EFAULT;

		return ret;
	}
	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
		struct vfio_iommu_spapr_tce_remove remove;

		if (!container->v2)
			break;

		if (!tce_groups_attached(container))
			return -ENXIO;

		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
				start_addr);

		if (copy_from_user(&remove, (void __user *)arg, minsz))
			return -EFAULT;

		if (remove.argsz < minsz)
			return -EINVAL;

		if (remove.flags)
			return -EINVAL;

		mutex_lock(&container->lock);

		ret = tce_iommu_remove_window(container, remove.start_addr);

		mutex_unlock(&container->lock);

		return ret;
	}
	}

	return -ENOTTY;
}
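
/*
 * Putting the v2 ioctls together, a plausible overall sequence for a
 * userspace driver or hypervisor (illustrative only) is:
 *
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_SPAPR_TCE_v2_IOMMU);
 *	ioctl(container, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
 *	ioctl(container, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
 *	ioctl(container, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);  // optional DDW
 *	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
 *	...
 *	ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap);
 *	ioctl(container, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
 *	ioctl(container, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
 *
 * Note that VFIO_IOMMU_ENABLE/DISABLE apply to the v1 interface only; for v2
 * the locked memory accounting is driven by memory preregistration (in the
 * mm_iommu_* code) and by window creation above instead.
 */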

static void tce_iommu_release_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = container->tables[i];

		if (!tbl)
			continue;

		tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
		tce_iommu_userspace_view_free(tbl);
		if (tbl->it_map)
			iommu_release_ownership(tbl);

		container->tables[i] = NULL;
	}
}

static int tce_iommu_take_ownership(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	int i, j, rc = 0;

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		struct iommu_table *tbl = table_group->tables[i];

		if (!tbl || !tbl->it_map)
			continue;

		rc = tce_iommu_userspace_view_alloc(tbl);
		if (!rc)
			rc = iommu_take_ownership(tbl);

		if (rc) {
			for (j = 0; j < i; ++j)
				iommu_release_ownership(
						table_group->tables[j]);

			return rc;
		}
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		container->tables[i] = table_group->tables[i];

	return 0;
}

static void tce_iommu_release_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i;

	if (!table_group->ops->unset_window) {
		WARN_ON_ONCE(1);
		return;
	}

	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);
}

static long tce_iommu_take_ownership_ddw(struct tce_container *container,
		struct iommu_table_group *table_group)
{
	long i, ret = 0;
	struct iommu_table *tbl = NULL;

	if (!table_group->ops->create_table || !table_group->ops->set_window ||
			!table_group->ops->release_ownership) {
		WARN_ON_ONCE(1);
		return -EFAULT;
	}

	table_group->ops->take_ownership(table_group);

	/*
	 * If this is the first group attached, check if there is
	 * a default DMA window and create one if there is none, as
	 * userspace expects it to exist.
	 */
	if (!tce_groups_attached(container) && !container->tables[0]) {
		ret = tce_iommu_create_table(container,
				table_group,
				0, /* window number */
				IOMMU_PAGE_SHIFT_4K,
				table_group->tce32_size,
				1, /* default levels */
				&tbl);
		if (ret)
			goto release_exit;
		else
			container->tables[0] = tbl;
	}

	/* Set all windows to the new group */
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
		tbl = container->tables[i];

		if (!tbl)
			continue;

		/* Set the default window to a new group */
		ret = table_group->ops->set_window(table_group, i, tbl);
		if (ret)
			goto release_exit;
	}

	return 0;

release_exit:
	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
		table_group->ops->unset_window(table_group, i);

	table_group->ops->release_ownership(table_group);

	return ret;
}

static int tce_iommu_attach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	int ret;
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	struct tce_iommu_group *tcegrp = NULL, *tcegrp_tmp;

	mutex_lock(&container->lock);

	/* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
			iommu_group_id(iommu_group), iommu_group); */
	table_group = iommu_group_get_iommudata(iommu_group);

	if (tce_groups_attached(container) && (!table_group->ops ||
			!table_group->ops->take_ownership ||
			!table_group->ops->release_ownership)) {
		ret = -EBUSY;
		goto unlock_exit;
	}

	/*
	 * Check if the new group has the same iommu_ops (i.e. compatible).
	 * Iterate with a separate cursor so the error path below only ever
	 * frees the group descriptor allocated by this function, never an
	 * already attached one.
	 */
	list_for_each_entry(tcegrp_tmp, &container->group_list, next) {
		struct iommu_table_group *table_group_tmp;

		if (tcegrp_tmp->grp == iommu_group) {
			pr_warn("tce_vfio: Group %d is already attached\n",
					iommu_group_id(iommu_group));
			ret = -EBUSY;
			goto unlock_exit;
		}
		table_group_tmp = iommu_group_get_iommudata(tcegrp_tmp->grp);
		if (table_group_tmp->ops != table_group->ops) {
			pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
					iommu_group_id(iommu_group),
					iommu_group_id(tcegrp_tmp->grp));
			ret = -EPERM;
			goto unlock_exit;
		}
	}

	tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
	if (!tcegrp) {
		ret = -ENOMEM;
		goto unlock_exit;
	}

	if (!table_group->ops || !table_group->ops->take_ownership ||
			!table_group->ops->release_ownership)
		ret = tce_iommu_take_ownership(container, table_group);
	else
		ret = tce_iommu_take_ownership_ddw(container, table_group);

	if (!ret) {
		tcegrp->grp = iommu_group;
		list_add(&tcegrp->next, &container->group_list);
	}

unlock_exit:
	if (ret && tcegrp)
		kfree(tcegrp);

	mutex_unlock(&container->lock);

	return ret;
}

static void tce_iommu_detach_group(void *iommu_data,
		struct iommu_group *iommu_group)
{
	struct tce_container *container = iommu_data;
	struct iommu_table_group *table_group;
	bool found = false;
	struct tce_iommu_group *tcegrp;

	mutex_lock(&container->lock);

	list_for_each_entry(tcegrp, &container->group_list, next) {
		if (tcegrp->grp == iommu_group) {
			found = true;
			break;
		}
	}

	if (!found) {
		pr_warn("tce_vfio: detaching unattached group #%u\n",
				iommu_group_id(iommu_group));
		goto unlock_exit;
	}

	list_del(&tcegrp->next);
	kfree(tcegrp);

	table_group = iommu_group_get_iommudata(iommu_group);
	BUG_ON(!table_group);

	if (!table_group->ops || !table_group->ops->release_ownership)
		tce_iommu_release_ownership(container, table_group);
	else
		tce_iommu_release_ownership_ddw(container, table_group);

unlock_exit:
	mutex_unlock(&container->lock);
}

const struct vfio_iommu_driver_ops tce_iommu_driver_ops = {
	.name		= "iommu-vfio-powerpc",
	.owner		= THIS_MODULE,
	.open		= tce_iommu_open,
	.release	= tce_iommu_release,
	.ioctl		= tce_iommu_ioctl,
	.attach_group	= tce_iommu_attach_group,
	.detach_group	= tce_iommu_detach_group,
};

static int __init tce_iommu_init(void)
{
	return vfio_register_iommu_driver(&tce_iommu_driver_ops);
}

static void __exit tce_iommu_cleanup(void)
{
	vfio_unregister_iommu_driver(&tce_iommu_driver_ops);
}

module_init(tce_iommu_init);
module_exit(tce_iommu_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);