root/drivers/vfio/vfio.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. vfio_iommu_group_get
  2. vfio_iommu_group_put
  3. vfio_noiommu_open
  4. vfio_noiommu_release
  5. vfio_noiommu_ioctl
  6. vfio_noiommu_attach_group
  7. vfio_noiommu_detach_group
  8. vfio_register_iommu_driver
  9. vfio_unregister_iommu_driver
  10. vfio_alloc_group_minor
  11. vfio_free_group_minor
  12. vfio_container_get
  13. vfio_container_release
  14. vfio_container_put
  15. vfio_group_unlock_and_free
  16. vfio_create_group
  17. vfio_group_release
  18. vfio_group_put
  19. vfio_group_put_bg
  20. vfio_group_schedule_put
  21. vfio_group_get
  22. vfio_group_try_get
  23. vfio_group_get_from_iommu
  24. vfio_group_get_from_minor
  25. vfio_group_get_from_dev
  26. vfio_group_create_device
  27. vfio_device_release
  28. vfio_device_put
  29. vfio_device_get
  30. vfio_group_get_device
  31. vfio_dev_whitelisted
  32. vfio_dev_viable
  33. vfio_group_nb_add_dev
  34. vfio_group_nb_verify
  35. vfio_iommu_group_notifier
  36. vfio_add_group_dev
  37. vfio_device_get_from_dev
  38. vfio_device_get_from_name
  39. vfio_device_data
  40. vfio_del_group_dev
  41. vfio_ioctl_check_extension
  42. __vfio_container_attach_groups
  43. vfio_ioctl_set_iommu
  44. vfio_fops_unl_ioctl
  45. vfio_fops_compat_ioctl
  46. vfio_fops_open
  47. vfio_fops_release
  48. vfio_fops_read
  49. vfio_fops_write
  50. vfio_fops_mmap
  51. __vfio_group_unset_container
  52. vfio_group_unset_container
  53. vfio_group_try_dissolve_container
  54. vfio_group_set_container
  55. vfio_group_viable
  56. vfio_group_add_container_user
  57. vfio_group_get_device_fd
  58. vfio_group_fops_unl_ioctl
  59. vfio_group_fops_compat_ioctl
  60. vfio_group_fops_open
  61. vfio_group_fops_release
  62. vfio_device_fops_release
  63. vfio_device_fops_unl_ioctl
  64. vfio_device_fops_read
  65. vfio_device_fops_write
  66. vfio_device_fops_mmap
  67. vfio_device_fops_compat_ioctl
  68. vfio_group_get_external_user
  69. vfio_group_put_external_user
  70. vfio_external_group_match_file
  71. vfio_external_user_iommu_id
  72. vfio_external_check_extension
  73. vfio_info_cap_add
  74. vfio_info_cap_shift
  75. vfio_info_add_capability
  76. vfio_set_irqs_validate_and_prepare
  77. vfio_pin_pages
  78. vfio_unpin_pages
  79. vfio_register_iommu_notifier
  80. vfio_unregister_iommu_notifier
  81. vfio_group_set_kvm
  82. vfio_register_group_notifier
  83. vfio_unregister_group_notifier
  84. vfio_register_notifier
  85. vfio_unregister_notifier
  86. vfio_devnode
  87. vfio_init
  88. vfio_cleanup

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * VFIO core
   4  *
   5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   6  *     Author: Alex Williamson <alex.williamson@redhat.com>
   7  *
   8  * Derived from original vfio:
   9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  10  * Author: Tom Lyon, pugs@cisco.com
  11  */
  12 
  13 #include <linux/cdev.h>
  14 #include <linux/compat.h>
  15 #include <linux/device.h>
  16 #include <linux/file.h>
  17 #include <linux/anon_inodes.h>
  18 #include <linux/fs.h>
  19 #include <linux/idr.h>
  20 #include <linux/iommu.h>
  21 #include <linux/list.h>
  22 #include <linux/miscdevice.h>
  23 #include <linux/module.h>
  24 #include <linux/mutex.h>
  25 #include <linux/pci.h>
  26 #include <linux/rwsem.h>
  27 #include <linux/sched.h>
  28 #include <linux/slab.h>
  29 #include <linux/stat.h>
  30 #include <linux/string.h>
  31 #include <linux/uaccess.h>
  32 #include <linux/vfio.h>
  33 #include <linux/wait.h>
  34 #include <linux/sched/signal.h>
  35 
  /*
   * Driver version, author and description strings.  Their consumers are
   * not in this part of the file (presumably the MODULE_* macros -- TODO
   * confirm).
   */
  36 #define DRIVER_VERSION  "0.3"
  37 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
  38 #define DRIVER_DESC     "VFIO - User Level meta-driver"
  39 
  /*
   * Global VFIO core state; "vfio" is the only instance.
   *
   * @class:              device class handed to device_create() and
   *                      device_destroy() for the per-group device nodes
   * @iommu_drivers_list: registered struct vfio_iommu_driver entries
   * @iommu_drivers_lock: protects @iommu_drivers_list
   * @group_list:         every live struct vfio_group, linked via vfio_next
   * @group_idr:          maps a group minor number to its struct vfio_group
   * @group_lock:         protects @group_list and @group_idr
   * @group_cdev:         not referenced in this part of the file; presumably
   *                      the character device behind the group nodes (TODO
   *                      confirm)
   * @group_devt:         its MAJOR() is the major number of the group nodes
   * @release_q:          woken whenever a struct vfio_device is released
   */
  40 static struct vfio {
  41         struct class                    *class;
  42         struct list_head                iommu_drivers_list;
  43         struct mutex                    iommu_drivers_lock;
  44         struct list_head                group_list;
  45         struct idr                      group_idr;
  46         struct mutex                    group_lock;
  47         struct cdev                     group_cdev;
  48         dev_t                           group_devt;
  49         wait_queue_head_t               release_q;
  50 } vfio;
  51 
  /*
   * One registered IOMMU backend: the ops table supplied to
   * vfio_register_iommu_driver() plus its linkage on
   * vfio.iommu_drivers_list.
   */
  52 struct vfio_iommu_driver {
  53         const struct vfio_iommu_driver_ops      *ops;
  54         struct list_head                        vfio_next;
  55 };
  56 
  /*
   * Container object, reference counted through @kref and freed by
   * vfio_container_release() when the last reference is dropped.
   *
   * The remaining members are not used in this part of the file.  Judging
   * by their names and types, @group_list/@group_lock track the attached
   * groups, @iommu_driver/@iommu_data identify the selected IOMMU backend
   * and its private data, and @noiommu flags a no-IOMMU container -- TODO
   * confirm against the users further down.
   */
  57 struct vfio_container {
  58         struct kref                     kref;
  59         struct list_head                group_list;
  60         struct rw_semaphore             group_lock;
  61         struct vfio_iommu_driver        *iommu_driver;
  62         void                            *iommu_data;
  63         bool                            noiommu;
  64 };
  65 
  /*
   * Entry on vfio_group.unbound_list.  vfio_dev_viable() treats a device
   * found on that list as viable, and the IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER
   * notification removes the entry again.
   */
  66 struct vfio_unbound_dev {
  67         struct device                   *dev;
  68         struct list_head                unbound_next;
  69 };
  70 
  /*
   * VFIO view of one iommu_group.
   *
   * @kref:            reference count; vfio_group_release() runs on the
   *                   final put
   * @minor:           minor number obtained from vfio_alloc_group_minor()
   * @container_users: zero means the group is idle; the notifier helpers
   *                   skip their checks in that case
   * @iommu_group:     underlying IOMMU group; its reference is dropped at
   *                   the end of vfio_group_release()
   * @container:       not used in this part of the file (presumably the
   *                   container the group is attached to -- TODO confirm)
   * @device_list:     struct vfio_device entries, linked via group_next
   * @device_lock:     protects @device_list
   * @dev:             device node returned by device_create()
   * @nb:              notifier block registered on @iommu_group
   * @vfio_next:       linkage on vfio.group_list
   * @container_next:  not used in this part of the file (presumably the
   *                   linkage on vfio_container.group_list -- TODO confirm)
   * @unbound_list:    struct vfio_unbound_dev entries
   * @unbound_lock:    protects @unbound_list
   * @opened:          initialised to 0 at creation; users are further down
   * @container_q:     wait queue initialised at creation; waiters are
   *                   further down
   * @noiommu:         set when the iommu_group carries the no-IOMMU
   *                   iommudata cookie
   * @kvm:             not used in this part of the file
   * @notifier:        blocking notifier chain; expected to be empty when the
   *                   group is released
   */
  71 struct vfio_group {
  72         struct kref                     kref;
  73         int                             minor;
  74         atomic_t                        container_users;
  75         struct iommu_group              *iommu_group;
  76         struct vfio_container           *container;
  77         struct list_head                device_list;
  78         struct mutex                    device_lock;
  79         struct device                   *dev;
  80         struct notifier_block           nb;
  81         struct list_head                vfio_next;
  82         struct list_head                container_next;
  83         struct list_head                unbound_list;
  84         struct mutex                    unbound_lock;
  85         atomic_t                        opened;
  86         wait_queue_head_t               container_q;
  87         bool                            noiommu;
  88         struct kvm                      *kvm;
  89         struct blocking_notifier_head   notifier;
  90 };
  91 
  /*
   * A device registered with a vfio_group.
   *
   * @kref:        reference count; vfio_device_release() runs on the final
   *               put
   * @dev:         the underlying struct device; its drvdata points back at
   *               this structure while the vfio_device exists
   * @ops:         callbacks supplied when the device was created
   * @group:       owning group; every device reference is paired with a
   *               group reference
   * @group_next:  linkage on vfio_group.device_list
   * @device_data: opaque pointer supplied when the device was created
   */
  92 struct vfio_device {
  93         struct kref                     kref;
  94         struct device                   *dev;
  95         const struct vfio_device_ops    *ops;
  96         struct vfio_group               *group;
  97         struct list_head                group_next;
  98         void                            *device_data;
  99 };
 100 
 /*
  * Backing flag for the "enable_unsafe_noiommu_mode" module parameter.  The
  * address of this flag also serves as the iommudata cookie that marks
  * groups created by vfio_iommu_group_get() as no-IOMMU groups.
  */
 101 #ifdef CONFIG_VFIO_NOIOMMU
 102 static bool noiommu __read_mostly;
 103 module_param_named(enable_unsafe_noiommu_mode,
 104                    noiommu, bool, S_IRUGO | S_IWUSR);
 105 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
 106 #endif
 107 
 108 /*
 109  * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe
 110  * and remove functions, any use cases other than acquiring the first
 111  * reference for the purpose of calling vfio_add_group_dev() or removing
 112  * that symmetric reference after vfio_del_group_dev() should use the raw
 113  * iommu_group_{get,put} functions.  In particular, vfio_iommu_group_put()
 114  * removes the device from the dummy group and cannot be nested.
      *
      * vfio_iommu_group_get() returns whatever iommu_group_get(dev) returns
      * when that is non-NULL, when CONFIG_VFIO_NOIOMMU is off, when the
      * noiommu parameter is off, or when the device's bus has an IOMMU.
      * Otherwise it allocates a group named "vfio-noiommu", tags it with
      * &noiommu as iommudata, adds @dev to it and taints the kernel.
      *
      * Return: the group, or NULL if there is none or if allocating the
      * no-IOMMU group or adding @dev to it failed.
 115  */
 116 struct iommu_group *vfio_iommu_group_get(struct device *dev)
 117 {
 118         struct iommu_group *group;
 119         int __maybe_unused ret;
 120 
 121         group = iommu_group_get(dev);
 122 
 123 #ifdef CONFIG_VFIO_NOIOMMU
 124         /*
 125          * With noiommu enabled, an IOMMU group will be created for a device
 126          * that doesn't already have one and doesn't have an iommu_ops on their
 127          * bus.  We set iommudata simply to be able to identify these groups
 128          * as special use and for reclamation later.
 129          */
 130         if (group || !noiommu || iommu_present(dev->bus))
 131                 return group;
 132 
 133         group = iommu_group_alloc();
 134         if (IS_ERR(group))
 135                 return NULL;
 136 
 137         iommu_group_set_name(group, "vfio-noiommu");
 138         iommu_group_set_iommudata(group, &noiommu, NULL);
 139         ret = iommu_group_add_device(group, dev);
 140         if (ret) {
                     /* Drop the reference obtained from iommu_group_alloc(). */
 141                 iommu_group_put(group);
 142                 return NULL;
 143         }
 144 
 145         /*
 146          * Where to taint?  At this point we've added an IOMMU group for a
 147          * device that is not backed by iommu_ops, therefore any iommu_
 148          * callback using iommu_ops can legitimately Oops.  So, while we may
 149          * be about to give a DMA capable device to a user without IOMMU
 150          * protection, which is clearly taint-worthy, let's go ahead and do
 151          * it here.
 152          */
 153         add_taint(TAINT_USER, LOCKDEP_STILL_OK);
 154         dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
 155 #endif
 156 
 157         return group;
 158 }
 159 EXPORT_SYMBOL_GPL(vfio_iommu_group_get);
 160 
 /*
  * Counterpart of vfio_iommu_group_get(): for a group tagged with the
  * no-IOMMU iommudata cookie, first remove @dev from the group, then drop
  * the group reference in every case.
  */
 161 void vfio_iommu_group_put(struct iommu_group *group, struct device *dev)
 162 {
 163 #ifdef CONFIG_VFIO_NOIOMMU
 164         if (iommu_group_get_iommudata(group) == &noiommu)
 165                 iommu_group_remove_device(dev);
 166 #endif
 167 
 168         iommu_group_put(group);
 169 }
 170 EXPORT_SYMBOL_GPL(vfio_iommu_group_put);
 171 
 172 #ifdef CONFIG_VFIO_NOIOMMU
 /*
  * .open callback of the no-IOMMU backend.  Only VFIO_NOIOMMU_IOMMU is
  * accepted as @arg and the caller needs CAP_SYS_RAWIO.
  *
  * Return: NULL on success (this backend keeps no private data),
  * ERR_PTR(-EINVAL) for any other @arg, ERR_PTR(-EPERM) without the
  * capability.
  */
 173 static void *vfio_noiommu_open(unsigned long arg)
 174 {
 175         if (arg != VFIO_NOIOMMU_IOMMU)
 176                 return ERR_PTR(-EINVAL);
 177         if (!capable(CAP_SYS_RAWIO))
 178                 return ERR_PTR(-EPERM);
 179 
 180         return NULL;
 181 }
 182 
 /* .release callback of the no-IOMMU backend; intentionally does nothing. */
 183 static void vfio_noiommu_release(void *iommu_data)
 184 {
 185 }
 186 
 /*
  * .ioctl callback of the no-IOMMU backend.  The only command handled is
  * VFIO_CHECK_EXTENSION, which reports 1 when the noiommu parameter is set
  * and @arg is VFIO_NOIOMMU_IOMMU, and 0 otherwise.  Every other command
  * returns -ENOTTY.
  */
 187 static long vfio_noiommu_ioctl(void *iommu_data,
 188                                unsigned int cmd, unsigned long arg)
 189 {
 190         if (cmd == VFIO_CHECK_EXTENSION)
                     /* Parsed as (noiommu && arg == ...) ? 1 : 0. */
 191                 return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0;
 192 
 193         return -ENOTTY;
 194 }
 195 
 /*
  * .attach_group callback of the no-IOMMU backend: accept only groups that
  * carry the no-IOMMU iommudata cookie.  Returns 0 for those and -EINVAL
  * for any other group.
  */
 196 static int vfio_noiommu_attach_group(void *iommu_data,
 197                                      struct iommu_group *iommu_group)
 198 {
 199         return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL;
 200 }
 201 
 /* .detach_group callback of the no-IOMMU backend; intentionally empty. */
 202 static void vfio_noiommu_detach_group(void *iommu_data,
 203                                       struct iommu_group *iommu_group)
 204 {
 205 }
 206 
 /*
  * Ops table of the built-in no-IOMMU backend.  Where it gets registered is
  * outside this part of the file.
  */
 207 static const struct vfio_iommu_driver_ops vfio_noiommu_ops = {
 208         .name = "vfio-noiommu",
 209         .owner = THIS_MODULE,
 210         .open = vfio_noiommu_open,
 211         .release = vfio_noiommu_release,
 212         .ioctl = vfio_noiommu_ioctl,
 213         .attach_group = vfio_noiommu_attach_group,
 214         .detach_group = vfio_noiommu_detach_group,
 215 };
 216 #endif
 217 
 218 
 219 /*
 220  * IOMMU driver registration
 221  */
 /*
  * Add @ops to the list of available IOMMU backends.
  *
  * Return: 0 on success, -ENOMEM if the list entry cannot be allocated,
  * -EINVAL if the same @ops pointer is already registered.
  */
 222 int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 223 {
 224         struct vfio_iommu_driver *driver, *tmp;
 225 
             /* Allocate before taking the lock; freed again on a duplicate. */
 226         driver = kzalloc(sizeof(*driver), GFP_KERNEL);
 227         if (!driver)
 228                 return -ENOMEM;
 229 
 230         driver->ops = ops;
 231 
 232         mutex_lock(&vfio.iommu_drivers_lock);
 233 
 234         /* Check for duplicates */
 235         list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
 236                 if (tmp->ops == ops) {
 237                         mutex_unlock(&vfio.iommu_drivers_lock);
 238                         kfree(driver);
 239                         return -EINVAL;
 240                 }
 241         }
 242 
 243         list_add(&driver->vfio_next, &vfio.iommu_drivers_list);
 244 
 245         mutex_unlock(&vfio.iommu_drivers_lock);
 246 
 247         return 0;
 248 }
 249 EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
 250 
 /*
  * Remove the list entry whose ops pointer equals @ops and free it.  An
  * @ops that was never registered is silently ignored.
  */
 251 void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 252 {
 253         struct vfio_iommu_driver *driver;
 254 
 255         mutex_lock(&vfio.iommu_drivers_lock);
 256         list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
 257                 if (driver->ops == ops) {
                             /*
                              * Deleting inside the non-_safe iterator is
                              * fine: the loop is left right afterwards.
                              */
 258                         list_del(&driver->vfio_next);
 259                         mutex_unlock(&vfio.iommu_drivers_lock);
 260                         kfree(driver);
 261                         return;
 262                 }
 263         }
 264         mutex_unlock(&vfio.iommu_drivers_lock);
 265 }
 266 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
 267 
 268 /*
 269  * Group minor allocation/free - both called with vfio.group_lock held
 270  */
 /*
  * Reserve an id for @group in vfio.group_idr, passing 0 and MINORMASK + 1
  * as the range arguments to idr_alloc().  Returns the value idr_alloc()
  * returns; the caller treats a negative value as an error.
  */
 271 static int vfio_alloc_group_minor(struct vfio_group *group)
 272 {
 273         return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL);
 274 }
 275 
 /* Remove the entry for @minor from vfio.group_idr. */
 276 static void vfio_free_group_minor(int minor)
 277 {
 278         idr_remove(&vfio.group_idr, minor);
 279 }
 280 
 /*
  * Forward declarations: both functions are defined further down but are
  * already needed by vfio_create_group().
  */
 281 static int vfio_iommu_group_notifier(struct notifier_block *nb,
 282                                      unsigned long action, void *data);
 283 static void vfio_group_get(struct vfio_group *group);
 284 
 285 /*
 286  * Container objects - containers are created when /dev/vfio/vfio is
 287  * opened, but their lifecycle extends until the last user is done, so
 288  * it's freed via kref.  Must support container/group/device being
 289  * closed in any order.
 290  */
 /* Take an additional reference on @container. */
 291 static void vfio_container_get(struct vfio_container *container)
 292 {
 293         kref_get(&container->kref);
 294 }
 295 
 /*
  * kref release callback for a container: recover the container from its
  * embedded kref and free it.
  */
 296 static void vfio_container_release(struct kref *kref)
 297 {
 298         struct vfio_container *container;
 299         container = container_of(kref, struct vfio_container, kref);
 300 
 301         kfree(container);
 302 }
 303 
 /*
  * Drop a reference on @container; the final put frees it through
  * vfio_container_release().
  */
 304 static void vfio_container_put(struct vfio_container *container)
 305 {
 306         kref_put(&container->kref, vfio_container_release);
 307 }
 308 
 /*
  * Release vfio.group_lock, unregister @group's notifier from its IOMMU
  * group and free @group.  The function unlocks without locking, so the
  * caller must hold vfio.group_lock on entry, and @group must not be
  * touched afterwards.
  */
 309 static void vfio_group_unlock_and_free(struct vfio_group *group)
 310 {
 311         mutex_unlock(&vfio.group_lock);
 312         /*
 313          * Unregister outside of lock.  A spurious callback is harmless now
 314          * that the group is no longer in vfio.group_list.
 315          */
 316         iommu_group_unregister_notifier(group->iommu_group, &group->nb);
 317         kfree(group);
 318 }
 319 
 320 /*
 321  * Group objects - create, release, get, put, search
 322  */
 /*
  * Allocate and publish a vfio_group for @iommu_group.
  *
  * The new group starts with one reference (kref_init) and stores
  * @iommu_group without taking a reference of its own here.  If another
  * thread already published a group for the same @iommu_group, the freshly
  * built one is discarded and the existing group is returned with an extra
  * reference.
  *
  * Return: the group on success, or an ERR_PTR() carrying -ENOMEM, the
  * notifier registration error, the minor allocation error or the
  * device_create() error.
  */
 323 static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
 324 {
 325         struct vfio_group *group, *tmp;
 326         struct device *dev;
 327         int ret, minor;
 328 
 329         group = kzalloc(sizeof(*group), GFP_KERNEL);
 330         if (!group)
 331                 return ERR_PTR(-ENOMEM);
 332 
 333         kref_init(&group->kref);
 334         INIT_LIST_HEAD(&group->device_list);
 335         mutex_init(&group->device_lock);
 336         INIT_LIST_HEAD(&group->unbound_list);
 337         mutex_init(&group->unbound_lock);
 338         atomic_set(&group->container_users, 0);
 339         atomic_set(&group->opened, 0);
 340         init_waitqueue_head(&group->container_q);
 341         group->iommu_group = iommu_group;
 342 #ifdef CONFIG_VFIO_NOIOMMU
 343         group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu);
 344 #endif
 345         BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
 346 
 347         group->nb.notifier_call = vfio_iommu_group_notifier;
 348 
 349         /*
 350          * blocking notifiers acquire a rwsem around registering and hold
 351          * it around callback.  Therefore, need to register outside of
 352          * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
 353          * do anything unless it can find the group in vfio.group_list, so
 354          * no harm in registering early.
 355          */
 356         ret = iommu_group_register_notifier(iommu_group, &group->nb);
 357         if (ret) {
 358                 kfree(group);
 359                 return ERR_PTR(ret);
 360         }
 361 
 362         mutex_lock(&vfio.group_lock);
 363 
 364         /* Did we race creating this group? */
 365         list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
 366                 if (tmp->iommu_group == iommu_group) {
 367                         vfio_group_get(tmp);
 368                         vfio_group_unlock_and_free(group);
 369                         return tmp;
 370                 }
 371         }
 372 
 373         minor = vfio_alloc_group_minor(group);
 374         if (minor < 0) {
 375                 vfio_group_unlock_and_free(group);
 376                 return ERR_PTR(minor);
 377         }
 378 
             /* Node name is "<id>" or, for no-IOMMU groups, "noiommu-<id>". */
 379         dev = device_create(vfio.class, NULL,
 380                             MKDEV(MAJOR(vfio.group_devt), minor),
 381                             group, "%s%d", group->noiommu ? "noiommu-" : "",
 382                             iommu_group_id(iommu_group));
 383         if (IS_ERR(dev)) {
 384                 vfio_free_group_minor(minor);
 385                 vfio_group_unlock_and_free(group);
 386                 return ERR_CAST(dev);
 387         }
 388 
 389         group->minor = minor;
 390         group->dev = dev;
 391 
             /* From here on lookups by list or by minor can find the group. */
 392         list_add(&group->vfio_next, &vfio.group_list);
 393 
 394         mutex_unlock(&vfio.group_lock);
 395 
 396         return group;
 397 }
 398 
 399 /*
      * kref release callback for a vfio_group; called with vfio.group_lock
      * held, and the lock is dropped on the way out through
      * vfio_group_unlock_and_free().
      *
      * Warns if devices or notifier callbacks are still attached, frees any
      * leftover unbound-device entries, destroys the device node, unlinks the
      * group and returns its minor, frees the group and finally drops the
      * iommu_group reference.
      */
 400 static void vfio_group_release(struct kref *kref)
 401 {
 402         struct vfio_group *group = container_of(kref, struct vfio_group, kref);
 403         struct vfio_unbound_dev *unbound, *tmp;
             /* Saved now because @group is freed before the final put below. */
 404         struct iommu_group *iommu_group = group->iommu_group;
 405 
 406         WARN_ON(!list_empty(&group->device_list));
 407         WARN_ON(group->notifier.head);
 408 
 409         list_for_each_entry_safe(unbound, tmp,
 410                                  &group->unbound_list, unbound_next) {
 411                 list_del(&unbound->unbound_next);
 412                 kfree(unbound);
 413         }
 414 
 415         device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor));
 416         list_del(&group->vfio_next);
 417         vfio_free_group_minor(group->minor);
 418         vfio_group_unlock_and_free(group);
 419         iommu_group_put(iommu_group);
 420 }
 421 
 /*
  * Drop a reference on @group.  kref_put_mutex() is used so that the final
  * put runs vfio_group_release() with vfio.group_lock held; the release
  * path drops that lock itself.
  */
 422 static void vfio_group_put(struct vfio_group *group)
 423 {
 424         kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
 425 }
 426 
 /*
  * Work item used to drop a group reference from a workqueue; allocated by
  * vfio_group_schedule_put() and freed by vfio_group_put_bg().
  */
 427 struct vfio_group_put_work {
 428         struct work_struct work;
 429         struct vfio_group *group;
 430 };
 431 
 /*
  * Work handler: drop the group reference carried by the work item, then
  * free the work item itself.
  */
 432 static void vfio_group_put_bg(struct work_struct *work)
 433 {
 434         struct vfio_group_put_work *do_work;
 435 
 436         do_work = container_of(work, struct vfio_group_put_work, work);
 437 
 438         vfio_group_put(do_work->group);
 439         kfree(do_work);
 440 }
 441 
 /*
  * Arrange for a reference on @group to be dropped later from a workqueue
  * (see vfio_group_put_bg()).
  *
  * NOTE: if the work item cannot be allocated, this warns and returns
  * without dropping the reference.
  */
 442 static void vfio_group_schedule_put(struct vfio_group *group)
 443 {
 444         struct vfio_group_put_work *do_work;
 445 
 446         do_work = kmalloc(sizeof(*do_work), GFP_KERNEL);
 447         if (WARN_ON(!do_work))
 448                 return;
 449 
 450         INIT_WORK(&do_work->work, vfio_group_put_bg);
 451         do_work->group = group;
 452         schedule_work(&do_work->work);
 453 }
 454 
 455 /*
      * Take an additional reference on @group.
      * Assume group_lock or group reference is held
      */
 456 static void vfio_group_get(struct vfio_group *group)
 457 {
 458         kref_get(&group->kref);
 459 }
 460 
 461 /*
 462  * Not really a try as we will sleep for mutex, but we need to make
 463  * sure the group pointer is valid under lock and get a reference.
      *
      * The pointer passed in is only compared against the entries of
      * vfio.group_list and never dereferenced, so it may refer to a group
      * that has already been unlinked.
      *
      * Return: @group with an extra reference if it is still on the list,
      * NULL otherwise.
 464  */
 465 static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
 466 {
             /* Remember the candidate; "group" is reused as the list cursor. */
 467         struct vfio_group *target = group;
 468 
 469         mutex_lock(&vfio.group_lock);
 470         list_for_each_entry(group, &vfio.group_list, vfio_next) {
 471                 if (group == target) {
 472                         vfio_group_get(group);
 473                         mutex_unlock(&vfio.group_lock);
 474                         return group;
 475                 }
 476         }
 477         mutex_unlock(&vfio.group_lock);
 478 
 479         return NULL;
 480 }
 481 
 /*
  * Look up the vfio_group that wraps @iommu_group.
  *
  * Return: the group with an extra reference, or NULL if no group on
  * vfio.group_list refers to @iommu_group.
  */
 482 static
 483 struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
 484 {
 485         struct vfio_group *group;
 486 
 487         mutex_lock(&vfio.group_lock);
 488         list_for_each_entry(group, &vfio.group_list, vfio_next) {
 489                 if (group->iommu_group == iommu_group) {
 490                         vfio_group_get(group);
 491                         mutex_unlock(&vfio.group_lock);
 492                         return group;
 493                 }
 494         }
 495         mutex_unlock(&vfio.group_lock);
 496 
 497         return NULL;
 498 }
 499 
 /*
  * Look up a group by minor number in vfio.group_idr.
  *
  * Return: the group with an extra reference, or NULL if @minor has no
  * entry.
  */
 500 static struct vfio_group *vfio_group_get_from_minor(int minor)
 501 {
 502         struct vfio_group *group;
 503 
 504         mutex_lock(&vfio.group_lock);
 505         group = idr_find(&vfio.group_idr, minor);
 506         if (!group) {
 507                 mutex_unlock(&vfio.group_lock);
 508                 return NULL;
 509         }
 510         vfio_group_get(group);
 511         mutex_unlock(&vfio.group_lock);
 512 
 513         return group;
 514 }
 515 
 /*
  * Look up the vfio_group for @dev through the device's IOMMU group.
  *
  * Return: the group with an extra reference, or NULL if @dev has no IOMMU
  * group or no vfio_group exists for it.
  */
 516 static struct vfio_group *vfio_group_get_from_dev(struct device *dev)
 517 {
 518         struct iommu_group *iommu_group;
 519         struct vfio_group *group;
 520 
 521         iommu_group = iommu_group_get(dev);
 522         if (!iommu_group)
 523                 return NULL;
 524 
 525         group = vfio_group_get_from_iommu(iommu_group);
             /* The iommu_group reference was only needed for the lookup. */
 526         iommu_group_put(iommu_group);
 527 
 528         return group;
 529 }
 530 
 531 /*
 532  * Device objects - create, release, get, put, search
 533  */
 /*
  * Allocate a vfio_device for @dev, record @ops and @device_data, point the
  * drvdata of @dev at it, take a group reference on its behalf and link it
  * onto @group's device list.
  *
  * Return: the new device holding one reference, or ERR_PTR(-ENOMEM).
  */
 534 static
 535 struct vfio_device *vfio_group_create_device(struct vfio_group *group,
 536                                              struct device *dev,
 537                                              const struct vfio_device_ops *ops,
 538                                              void *device_data)
 539 {
 540         struct vfio_device *device;
 541 
 542         device = kzalloc(sizeof(*device), GFP_KERNEL);
 543         if (!device)
 544                 return ERR_PTR(-ENOMEM);
 545 
 546         kref_init(&device->kref);
 547         device->dev = dev;
 548         device->group = group;
 549         device->ops = ops;
 550         device->device_data = device_data;
 551         dev_set_drvdata(dev, device);
 552 
 553         /* No need to get group_lock, caller has group reference */
 554         vfio_group_get(group);
 555 
 556         mutex_lock(&group->device_lock);
 557         list_add(&device->group_next, &group->device_list);
 558         mutex_unlock(&group->device_lock);
 559 
 560         return device;
 561 }
 562 
 /*
  * kref release callback for a vfio_device.  vfio_device_put() invokes it
  * through kref_put_mutex() with the group's device_lock, and this function
  * releases that lock after unlinking the device.  It then clears the
  * drvdata of the underlying device, frees the vfio_device and wakes
  * vfio.release_q.
  */
 563 static void vfio_device_release(struct kref *kref)
 564 {
 565         struct vfio_device *device = container_of(kref,
 566                                                   struct vfio_device, kref);
 567         struct vfio_group *group = device->group;
 568 
 569         list_del(&device->group_next);
 570         mutex_unlock(&group->device_lock);
 571 
 572         dev_set_drvdata(device->dev, NULL);
 573 
 574         kfree(device);
 575 
 576         /* vfio_del_group_dev may be waiting for this device */
 577         wake_up(&vfio.release_q);
 578 }
 579 
 580 /*
      * Drop a device reference together with the group reference paired with
      * it.
      * Device reference always implies a group reference
      */
 581 void vfio_device_put(struct vfio_device *device)
 582 {
             /* Read the group first: @device may be freed by the put below. */
 583         struct vfio_group *group = device->group;
 584         kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
 585         vfio_group_put(group);
 586 }
 587 EXPORT_SYMBOL_GPL(vfio_device_put);
 588 
 /*
  * Take a reference on @device and, paired with it, a reference on the
  * device's group.  vfio_device_put() drops both.
  */
 589 static void vfio_device_get(struct vfio_device *device)
 590 {
 591         vfio_group_get(device->group);
 592         kref_get(&device->kref);
 593 }
 594 
 /*
  * Search @group's device list for the vfio_device that wraps @dev.
  *
  * Return: the device with an extra device and group reference (release
  * with vfio_device_put()), or NULL if @dev is not on the list.
  */
 595 static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
 596                                                  struct device *dev)
 597 {
 598         struct vfio_device *device;
 599 
 600         mutex_lock(&group->device_lock);
 601         list_for_each_entry(device, &group->device_list, group_next) {
 602                 if (device->dev == dev) {
 603                         vfio_device_get(device);
 604                         mutex_unlock(&group->device_lock);
 605                         return device;
 606                 }
 607         }
 608         mutex_unlock(&group->device_lock);
 609         return NULL;
 610 }
 611 
 612 /*
 613  * Some drivers, like pci-stub, are only used to prevent other drivers from
 614  * claiming a device and are therefore perfectly legitimate for a user owned
 615  * group.  The pci-stub driver has no dependencies on DMA or the IOVA mapping
 616  * of the device, but it does prevent the user from having direct access to
 617  * the device, which is useful in some circumstances.
 618  *
 619  * We also assume that we can include PCI interconnect devices, ie. bridges.
 620  * IOMMU grouping on PCI necessitates that if we lack isolation on a bridge
 621  * then all of the downstream devices will be part of the same IOMMU group as
 622  * the bridge.  Thus, if placing the bridge into the user owned IOVA space
 623  * breaks anything, it only does so for user owned devices downstream.  Note
 624  * that error notification via MSI can be affected for platforms that handle
 625  * MSI within the same IOVA space as DMA.
      *
      * The table below holds the whitelisted driver names;
      * vfio_dev_whitelisted() looks up drv->name in it with match_string().
 626  */
 627 static const char * const vfio_driver_whitelist[] = { "pci-stub" };
 628 
 /*
  * Decide whether @dev may remain bound to @drv inside a user owned group.
  *
  * A PCI device whose header type is not PCI_HEADER_TYPE_NORMAL is accepted
  * regardless of its driver; otherwise the driver name must appear in
  * vfio_driver_whitelist.  @drv is dereferenced unconditionally, so it must
  * not be NULL.
  */
 629 static bool vfio_dev_whitelisted(struct device *dev, struct device_driver *drv)
 630 {
 631         if (dev_is_pci(dev)) {
 632                 struct pci_dev *pdev = to_pci_dev(dev);
 633 
 634                 if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
 635                         return true;
 636         }
 637 
 638         return match_string(vfio_driver_whitelist,
 639                             ARRAY_SIZE(vfio_driver_whitelist),
 640                             drv->name) >= 0;
 641 }
 642 
 643 /*
 644  * A vfio group is viable for use by userspace if all devices are in
 645  * one of the following states:
 646  *  - driver-less
 647  *  - bound to a vfio driver
 648  *  - bound to a whitelisted driver
 649  *  - a PCI interconnect device
 650  *
 651  * We use two methods to determine whether a device is bound to a vfio
 652  * driver.  The first is to test whether the device exists in the vfio
 653  * group.  The second is to test if the device exists on the group
 654  * unbound_list, indicating it's in the middle of transitioning from
 655  * a vfio driver to driver-less.
      *
      * @data is the struct vfio_group to test @dev against.  The
      * (struct device *, void *) signature suggests use as a per-device
      * iterator callback -- TODO confirm at the call sites further down.
      *
      * Return: 0 if @dev is viable, -EINVAL otherwise.
 656  */
 657 static int vfio_dev_viable(struct device *dev, void *data)
 658 {
 659         struct vfio_group *group = data;
 660         struct vfio_device *device;
             /* Read dev->driver once so every check below sees the same value. */
 661         struct device_driver *drv = READ_ONCE(dev->driver);
 662         struct vfio_unbound_dev *unbound;
 663         int ret = -EINVAL;
 664 
 665         mutex_lock(&group->unbound_lock);
 666         list_for_each_entry(unbound, &group->unbound_list, unbound_next) {
 667                 if (dev == unbound->dev) {
 668                         ret = 0;
 669                         break;
 670                 }
 671         }
 672         mutex_unlock(&group->unbound_lock);
 673 
             /* On the unbound list, driver-less, or whitelisted: viable. */
 674         if (!ret || !drv || vfio_dev_whitelisted(dev, drv))
 675                 return 0;
 676 
             /* Otherwise viable only if it is a device of this vfio group. */
 677         device = vfio_group_get_device(group, dev);
 678         if (device) {
 679                 vfio_device_put(device);
 680                 return 0;
 681         }
 682 
 683         return ret;
 684 }
 685 
 686 /*
 687  * Async device support
 688  */
 /*
  * Handle a device being added to the IOMMU group behind @group.
  *
  * Warns once if the device is already known to the group, does nothing
  * for a group without container users, and otherwise emits a dev_WARN()
  * because a device appeared in a live group.  Always returns 0.
  */
 689 static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
 690 {
 691         struct vfio_device *device;
 692 
 693         /* Do we already know about it?  We shouldn't */
 694         device = vfio_group_get_device(group, dev);
 695         if (WARN_ON_ONCE(device)) {
 696                 vfio_device_put(device);
 697                 return 0;
 698         }
 699 
 700         /* Nothing to do for idle groups */
 701         if (!atomic_read(&group->container_users))
 702                 return 0;
 703 
 704         /* TODO Prevent device auto probing */
 705         dev_WARN(dev, "Device added to live group %d!\n",
 706                  iommu_group_id(group->iommu_group));
 707 
 708         return 0;
 709 }
 710 
 /*
  * Check that @dev is still viable for @group.  A group without container
  * users always passes.
  *
  * Return: 0 if the group is idle or the device is viable, otherwise the
  * error returned by vfio_dev_viable().
  */
 711 static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
 712 {
 713         /* We don't care what happens when the group isn't in use */
 714         if (!atomic_read(&group->container_users))
 715                 return 0;
 716 
 717         return vfio_dev_viable(dev, group);
 718 }
 719 
 /*
  * Notifier callback registered on every wrapped iommu_group by
  * vfio_create_group().  @nb is embedded in the vfio_group, @action is one
  * of the IOMMU_GROUP_NOTIFY_* events and @data is the affected
  * struct device.
  *
  * The group is first revalidated through vfio_group_try_get(); events for
  * a group that is no longer listed are ignored.  The reference taken there
  * is dropped asynchronously at the end.  Always returns NOTIFY_OK.
  */
 720 static int vfio_iommu_group_notifier(struct notifier_block *nb,
 721                                      unsigned long action, void *data)
 722 {
 723         struct vfio_group *group = container_of(nb, struct vfio_group, nb);
 724         struct device *dev = data;
 725         struct vfio_unbound_dev *unbound;
 726 
 727         /*
 728          * Need to go through a group_lock lookup to get a reference or we
 729          * risk racing a group being removed.  Ignore spurious notifies.
 730          */
 731         group = vfio_group_try_get(group);
 732         if (!group)
 733                 return NOTIFY_OK;
 734 
 735         switch (action) {
 736         case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
 737                 vfio_group_nb_add_dev(group, dev);
 738                 break;
 739         case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
 740                 /*
 741                  * Nothing to do here.  If the device is in use, then the
 742                  * vfio sub-driver should block the remove callback until
 743                  * it is unused.  If the device is unused or attached to a
 744                  * stub driver, then it should be released and we don't
 745                  * care that it will be going away.
 746                  */
 747                 break;
 748         case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
 749                 dev_dbg(dev, "%s: group %d binding to driver\n", __func__,
 750                         iommu_group_id(group->iommu_group));
 751                 break;
 752         case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
 753                 dev_dbg(dev, "%s: group %d bound to driver %s\n", __func__,
 754                         iommu_group_id(group->iommu_group), dev->driver->name);
                     /* A non-viable driver bound to a live group stops the system. */
 755                 BUG_ON(vfio_group_nb_verify(group, dev));
 756                 break;
 757         case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
 758                 dev_dbg(dev, "%s: group %d unbinding from driver %s\n",
 759                         __func__, iommu_group_id(group->iommu_group),
 760                         dev->driver->name);
 761                 break;
 762         case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
 763                 dev_dbg(dev, "%s: group %d unbound from driver\n", __func__,
 764                         iommu_group_id(group->iommu_group));
 765                 /*
 766                  * XXX An unbound device in a live group is ok, but we'd
 767                  * really like to avoid the above BUG_ON by preventing other
 768                  * drivers from binding to it.  Once that occurs, we have to
 769                  * stop the system to maintain isolation.  At a minimum, we'd
 770                  * want a toggle to disable driver auto probe for this device.
 771                  */
 772 
                     /* The unbind is complete: drop the device's unbound entry. */
 773                 mutex_lock(&group->unbound_lock);
 774                 list_for_each_entry(unbound,
 775                                     &group->unbound_list, unbound_next) {
 776                         if (dev == unbound->dev) {
 777                                 list_del(&unbound->unbound_next);
 778                                 kfree(unbound);
 779                                 break;
 780                         }
 781                 }
 782                 mutex_unlock(&group->unbound_lock);
 783                 break;
 784         }
 785 
 786         /*
 787          * If we're the last reference to the group, the group will be
 788          * released, which includes unregistering the iommu group notifier.
 789          * We hold a read-lock on that notifier list, unregistering needs
 790          * a write-lock... deadlock.  Release our reference asynchronously
 791          * to avoid that situation.
 792          */
 793         vfio_group_schedule_put(group);
 794         return NOTIFY_OK;
 795 }
 796 
 797 /**
 798  * VFIO driver API
 799  */
 800 int vfio_add_group_dev(struct device *dev,
 801                        const struct vfio_device_ops *ops, void *device_data)
 802 {
 803         struct iommu_group *iommu_group;
 804         struct vfio_group *group;
 805         struct vfio_device *device;
 806 
 807         iommu_group = iommu_group_get(dev);
 808         if (!iommu_group)
 809                 return -EINVAL;
 810 
 811         group = vfio_group_get_from_iommu(iommu_group);
 812         if (!group) {
 813                 group = vfio_create_group(iommu_group);
 814                 if (IS_ERR(group)) {
 815                         iommu_group_put(iommu_group);
 816                         return PTR_ERR(group);
 817                 }
 818         } else {
 819                 /*
 820                  * A found vfio_group already holds a reference to the
 821                  * iommu_group.  A created vfio_group keeps the reference.
 822                  */
 823                 iommu_group_put(iommu_group);
 824         }
 825 
 826         device = vfio_group_get_device(group, dev);
 827         if (device) {
 828                 dev_WARN(dev, "Device already exists on group %d\n",
 829                          iommu_group_id(iommu_group));
 830                 vfio_device_put(device);
 831                 vfio_group_put(group);
 832                 return -EBUSY;
 833         }
 834 
 835         device = vfio_group_create_device(group, dev, ops, device_data);
 836         if (IS_ERR(device)) {
 837                 vfio_group_put(group);
 838                 return PTR_ERR(device);
 839         }
 840 
 841         /*
 842          * Drop all but the vfio_device reference.  The vfio_device holds
 843          * a reference to the vfio_group, which holds a reference to the
 844          * iommu_group.
 845          */
 846         vfio_group_put(group);
 847 
 848         return 0;
 849 }
 850 EXPORT_SYMBOL_GPL(vfio_add_group_dev);
 851 
 852 /**
 853  * Get a reference to the vfio_device for a device.  Even if the
 854  * caller thinks they own the device, they could be racing with a
 855  * release call path, so we can't trust drvdata for the shortcut.
 856  * Go the long way around, from the iommu_group to the vfio_group
 857  * to the vfio_device.
 858  */
 859 struct vfio_device *vfio_device_get_from_dev(struct device *dev)
 860 {
 861         struct vfio_group *group;
 862         struct vfio_device *device;
 863 
 864         group = vfio_group_get_from_dev(dev);
 865         if (!group)
 866                 return NULL;
 867 
 868         device = vfio_group_get_device(group, dev);
 869         vfio_group_put(group);
 870 
 871         return device;
 872 }
 873 EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
 874 
 875 static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
 876                                                      char *buf)
 877 {
 878         struct vfio_device *it, *device = NULL;
 879 
 880         mutex_lock(&group->device_lock);
 881         list_for_each_entry(it, &group->device_list, group_next) {
 882                 if (!strcmp(dev_name(it->dev), buf)) {
 883                         device = it;
 884                         vfio_device_get(device);
 885                         break;
 886                 }
 887         }
 888         mutex_unlock(&group->device_lock);
 889 
 890         return device;
 891 }
 892 
 893 /*
 894  * Caller must hold a reference to the vfio_device
 895  */
 896 void *vfio_device_data(struct vfio_device *device)
 897 {
 898         return device->device_data;
 899 }
 900 EXPORT_SYMBOL_GPL(vfio_device_data);
 901 
 902 /*
 903  * Decrement the device reference count and wait for the device to be
 904  * removed.  Open file descriptors for the device... */
 905 void *vfio_del_group_dev(struct device *dev)
 906 {
 907         DEFINE_WAIT_FUNC(wait, woken_wake_function);
 908         struct vfio_device *device = dev_get_drvdata(dev);
 909         struct vfio_group *group = device->group;
 910         void *device_data = device->device_data;
 911         struct vfio_unbound_dev *unbound;
 912         unsigned int i = 0;
 913         bool interrupted = false;
 914 
 915         /*
 916          * The group exists so long as we have a device reference.  Get
 917          * a group reference and use it to scan for the device going away.
 918          */
 919         vfio_group_get(group);
 920 
 921         /*
 922          * When the device is removed from the group, the group suddenly
 923          * becomes non-viable; the device has a driver (until the unbind
 924          * completes), but it's not present in the group.  This is bad news
 925          * for any external users that need to re-acquire a group reference
 926          * in order to match and release their existing reference.  To
 927          * solve this, we track such devices on the unbound_list to bridge
 928          * the gap until they're fully unbound.
 929          */
 930         unbound = kzalloc(sizeof(*unbound), GFP_KERNEL);
 931         if (unbound) {
 932                 unbound->dev = dev;
 933                 mutex_lock(&group->unbound_lock);
 934                 list_add(&unbound->unbound_next, &group->unbound_list);
 935                 mutex_unlock(&group->unbound_lock);
 936         }
 937         WARN_ON(!unbound);
 938 
 939         vfio_device_put(device);
 940 
 941         /*
 942          * If the device is still present in the group after the above
 943          * 'put', then it is in use and we need to request it from the
 944          * bus driver.  The driver may in turn need to request the
 945          * device from the user.  We send the request on an arbitrary
 946          * interval with counter to allow the driver to take escalating
 947          * measures to release the device if it has the ability to do so.
 948          */
 949         add_wait_queue(&vfio.release_q, &wait);
 950 
 951         do {
 952                 device = vfio_group_get_device(group, dev);
 953                 if (!device)
 954                         break;
 955 
 956                 if (device->ops->request)
 957                         device->ops->request(device_data, i++);
 958 
 959                 vfio_device_put(device);
 960 
 961                 if (interrupted) {
 962                         wait_woken(&wait, TASK_UNINTERRUPTIBLE, HZ * 10);
 963                 } else {
 964                         wait_woken(&wait, TASK_INTERRUPTIBLE, HZ * 10);
 965                         if (signal_pending(current)) {
 966                                 interrupted = true;
 967                                 dev_warn(dev,
 968                                          "Device is currently in use, task"
 969                                          " \"%s\" (%d) "
 970                                          "blocked until device is released",
 971                                          current->comm, task_pid_nr(current));
 972                         }
 973                 }
 974 
 975         } while (1);
 976 
 977         remove_wait_queue(&vfio.release_q, &wait);
 978         /*
 979          * In order to support multiple devices per group, devices can be
 980          * plucked from the group while other devices in the group are still
 981          * in use.  The container persists with this group and those remaining
 982          * devices still attached.  If the user creates an isolation violation
 983          * by binding this device to another driver while the group is still in
 984          * use, that's their fault.  However, in the case of removing the last,
 985          * or potentially the only, device in the group there can be no other
 986          * in-use devices in the group.  The user has done their due diligence
 987          * and we should lay no claims to those devices.  In order to do that,
 988          * we need to make sure the group is detached from the container.
 989          * Without this stall, we're potentially racing with a user process
 990          * that may attempt to immediately bind this device to another driver.
 991          */
 992         if (list_empty(&group->device_list))
 993                 wait_event(group->container_q, !group->container);
 994 
 995         vfio_group_put(group);
 996 
 997         return device_data;
 998 }
 999 EXPORT_SYMBOL_GPL(vfio_del_group_dev);
1000 
1001 /**
1002  * VFIO base fd, /dev/vfio/vfio
1003  */
1004 static long vfio_ioctl_check_extension(struct vfio_container *container,
1005                                        unsigned long arg)
1006 {
1007         struct vfio_iommu_driver *driver;
1008         long ret = 0;
1009 
1010         down_read(&container->group_lock);
1011 
1012         driver = container->iommu_driver;
1013 
1014         switch (arg) {
1015                 /* No base extensions yet */
1016         default:
1017                 /*
1018                  * If no driver is set, poll all registered drivers for
1019                  * extensions and return the first positive result.  If
1020                  * a driver is already set, further queries will be passed
1021                  * only to that driver.
1022                  */
1023                 if (!driver) {
1024                         mutex_lock(&vfio.iommu_drivers_lock);
1025                         list_for_each_entry(driver, &vfio.iommu_drivers_list,
1026                                             vfio_next) {
1027 
1028 #ifdef CONFIG_VFIO_NOIOMMU
1029                                 if (!list_empty(&container->group_list) &&
1030                                     (container->noiommu !=
1031                                      (driver->ops == &vfio_noiommu_ops)))
1032                                         continue;
1033 #endif
1034 
1035                                 if (!try_module_get(driver->ops->owner))
1036                                         continue;
1037 
1038                                 ret = driver->ops->ioctl(NULL,
1039                                                          VFIO_CHECK_EXTENSION,
1040                                                          arg);
1041                                 module_put(driver->ops->owner);
1042                                 if (ret > 0)
1043                                         break;
1044                         }
1045                         mutex_unlock(&vfio.iommu_drivers_lock);
1046                 } else
1047                         ret = driver->ops->ioctl(container->iommu_data,
1048                                                  VFIO_CHECK_EXTENSION, arg);
1049         }
1050 
1051         up_read(&container->group_lock);
1052 
1053         return ret;
1054 }
1055 
1056 /* hold write lock on container->group_lock */
1057 static int __vfio_container_attach_groups(struct vfio_container *container,
1058                                           struct vfio_iommu_driver *driver,
1059                                           void *data)
1060 {
1061         struct vfio_group *group;
1062         int ret = -ENODEV;
1063 
1064         list_for_each_entry(group, &container->group_list, container_next) {
1065                 ret = driver->ops->attach_group(data, group->iommu_group);
1066                 if (ret)
1067                         goto unwind;
1068         }
1069 
1070         return ret;
1071 
1072 unwind:
1073         list_for_each_entry_continue_reverse(group, &container->group_list,
1074                                              container_next) {
1075                 driver->ops->detach_group(data, group->iommu_group);
1076         }
1077 
1078         return ret;
1079 }
1080 
1081 static long vfio_ioctl_set_iommu(struct vfio_container *container,
1082                                  unsigned long arg)
1083 {
1084         struct vfio_iommu_driver *driver;
1085         long ret = -ENODEV;
1086 
1087         down_write(&container->group_lock);
1088 
1089         /*
1090          * The container is designed to be an unprivileged interface while
1091          * the group can be assigned to specific users.  Therefore, only by
1092          * adding a group to a container does the user get the privilege of
1093          * enabling the iommu, which may allocate finite resources.  There
1094          * is no unset_iommu, but by removing all the groups from a container,
1095          * the container is deprivileged and returns to an unset state.
1096          */
1097         if (list_empty(&container->group_list) || container->iommu_driver) {
1098                 up_write(&container->group_lock);
1099                 return -EINVAL;
1100         }
1101 
1102         mutex_lock(&vfio.iommu_drivers_lock);
1103         list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
1104                 void *data;
1105 
1106 #ifdef CONFIG_VFIO_NOIOMMU
1107                 /*
1108                  * Only noiommu containers can use vfio-noiommu and noiommu
1109                  * containers can only use vfio-noiommu.
1110                  */
1111                 if (container->noiommu != (driver->ops == &vfio_noiommu_ops))
1112                         continue;
1113 #endif
1114 
1115                 if (!try_module_get(driver->ops->owner))
1116                         continue;
1117 
1118                 /*
1119                  * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
1120                  * so test which iommu driver reported support for this
1121                  * extension and call open on them.  We also pass them the
1122                  * magic, allowing a single driver to support multiple
1123                  * interfaces if they'd like.
1124                  */
1125                 if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
1126                         module_put(driver->ops->owner);
1127                         continue;
1128                 }
1129 
1130                 data = driver->ops->open(arg);
1131                 if (IS_ERR(data)) {
1132                         ret = PTR_ERR(data);
1133                         module_put(driver->ops->owner);
1134                         continue;
1135                 }
1136 
1137                 ret = __vfio_container_attach_groups(container, driver, data);
1138                 if (ret) {
1139                         driver->ops->release(data);
1140                         module_put(driver->ops->owner);
1141                         continue;
1142                 }
1143 
1144                 container->iommu_driver = driver;
1145                 container->iommu_data = data;
1146                 break;
1147         }
1148 
1149         mutex_unlock(&vfio.iommu_drivers_lock);
1150         up_write(&container->group_lock);
1151 
1152         return ret;
1153 }
1154 
1155 static long vfio_fops_unl_ioctl(struct file *filep,
1156                                 unsigned int cmd, unsigned long arg)
1157 {
1158         struct vfio_container *container = filep->private_data;
1159         struct vfio_iommu_driver *driver;
1160         void *data;
1161         long ret = -EINVAL;
1162 
1163         if (!container)
1164                 return ret;
1165 
1166         switch (cmd) {
1167         case VFIO_GET_API_VERSION:
1168                 ret = VFIO_API_VERSION;
1169                 break;
1170         case VFIO_CHECK_EXTENSION:
1171                 ret = vfio_ioctl_check_extension(container, arg);
1172                 break;
1173         case VFIO_SET_IOMMU:
1174                 ret = vfio_ioctl_set_iommu(container, arg);
1175                 break;
1176         default:
1177                 driver = container->iommu_driver;
1178                 data = container->iommu_data;
1179 
1180                 if (driver) /* passthrough all unrecognized ioctls */
1181                         ret = driver->ops->ioctl(data, cmd, arg);
1182         }
1183 
1184         return ret;
1185 }
1186 
1187 #ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point for the container fd: runs @arg through
 * compat_ptr() and then defers to vfio_fops_unl_ioctl().
 */
static long vfio_fops_compat_ioctl(struct file *filep,
				   unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return vfio_fops_unl_ioctl(filep, cmd, native_arg);
}
1194 #endif  /* CONFIG_COMPAT */
1195 
1196 static int vfio_fops_open(struct inode *inode, struct file *filep)
1197 {
1198         struct vfio_container *container;
1199 
1200         container = kzalloc(sizeof(*container), GFP_KERNEL);
1201         if (!container)
1202                 return -ENOMEM;
1203 
1204         INIT_LIST_HEAD(&container->group_list);
1205         init_rwsem(&container->group_lock);
1206         kref_init(&container->kref);
1207 
1208         filep->private_data = container;
1209 
1210         return 0;
1211 }
1212 
1213 static int vfio_fops_release(struct inode *inode, struct file *filep)
1214 {
1215         struct vfio_container *container = filep->private_data;
1216 
1217         filep->private_data = NULL;
1218 
1219         vfio_container_put(container);
1220 
1221         return 0;
1222 }
1223 
1224 /*
1225  * Once an iommu driver is set, we optionally pass read/write/mmap
1226  * on to the driver, allowing management interfaces beyond ioctl.
1227  */
1228 static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
1229                               size_t count, loff_t *ppos)
1230 {
1231         struct vfio_container *container = filep->private_data;
1232         struct vfio_iommu_driver *driver;
1233         ssize_t ret = -EINVAL;
1234 
1235         driver = container->iommu_driver;
1236         if (likely(driver && driver->ops->read))
1237                 ret = driver->ops->read(container->iommu_data,
1238                                         buf, count, ppos);
1239 
1240         return ret;
1241 }
1242 
1243 static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
1244                                size_t count, loff_t *ppos)
1245 {
1246         struct vfio_container *container = filep->private_data;
1247         struct vfio_iommu_driver *driver;
1248         ssize_t ret = -EINVAL;
1249 
1250         driver = container->iommu_driver;
1251         if (likely(driver && driver->ops->write))
1252                 ret = driver->ops->write(container->iommu_data,
1253                                          buf, count, ppos);
1254 
1255         return ret;
1256 }
1257 
1258 static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1259 {
1260         struct vfio_container *container = filep->private_data;
1261         struct vfio_iommu_driver *driver;
1262         int ret = -EINVAL;
1263 
1264         driver = container->iommu_driver;
1265         if (likely(driver && driver->ops->mmap))
1266                 ret = driver->ops->mmap(container->iommu_data, vma);
1267 
1268         return ret;
1269 }
1270 
1271 static const struct file_operations vfio_fops = {
1272         .owner          = THIS_MODULE,
1273         .open           = vfio_fops_open,
1274         .release        = vfio_fops_release,
1275         .read           = vfio_fops_read,
1276         .write          = vfio_fops_write,
1277         .unlocked_ioctl = vfio_fops_unl_ioctl,
1278 #ifdef CONFIG_COMPAT
1279         .compat_ioctl   = vfio_fops_compat_ioctl,
1280 #endif
1281         .mmap           = vfio_fops_mmap,
1282 };
1283 
1284 /**
1285  * VFIO Group fd, /dev/vfio/$GROUP
1286  */
1287 static void __vfio_group_unset_container(struct vfio_group *group)
1288 {
1289         struct vfio_container *container = group->container;
1290         struct vfio_iommu_driver *driver;
1291 
1292         down_write(&container->group_lock);
1293 
1294         driver = container->iommu_driver;
1295         if (driver)
1296                 driver->ops->detach_group(container->iommu_data,
1297                                           group->iommu_group);
1298 
1299         group->container = NULL;
1300         wake_up(&group->container_q);
1301         list_del(&group->container_next);
1302 
1303         /* Detaching the last group deprivileges a container, remove iommu */
1304         if (driver && list_empty(&container->group_list)) {
1305                 driver->ops->release(container->iommu_data);
1306                 module_put(driver->ops->owner);
1307                 container->iommu_driver = NULL;
1308                 container->iommu_data = NULL;
1309         }
1310 
1311         up_write(&container->group_lock);
1312 
1313         vfio_container_put(container);
1314 }
1315 
1316 /*
1317  * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
1318  * if there was no container to unset.  Since the ioctl is called on
1319  * the group, we know that still exists, therefore the only valid
1320  * transition here is 1->0.
1321  */
1322 static int vfio_group_unset_container(struct vfio_group *group)
1323 {
1324         int users = atomic_cmpxchg(&group->container_users, 1, 0);
1325 
1326         if (!users)
1327                 return -EINVAL;
1328         if (users != 1)
1329                 return -EBUSY;
1330 
1331         __vfio_group_unset_container(group);
1332 
1333         return 0;
1334 }
1335 
1336 /*
1337  * When removing container users, anything that removes the last user
1338  * implicitly removes the group from the container.  That is, if the
1339  * group file descriptor is closed, as well as any device file descriptors,
1340  * the group is free.
1341  */
1342 static void vfio_group_try_dissolve_container(struct vfio_group *group)
1343 {
1344         if (0 == atomic_dec_if_positive(&group->container_users))
1345                 __vfio_group_unset_container(group);
1346 }
1347 
1348 static int vfio_group_set_container(struct vfio_group *group, int container_fd)
1349 {
1350         struct fd f;
1351         struct vfio_container *container;
1352         struct vfio_iommu_driver *driver;
1353         int ret = 0;
1354 
1355         if (atomic_read(&group->container_users))
1356                 return -EINVAL;
1357 
1358         if (group->noiommu && !capable(CAP_SYS_RAWIO))
1359                 return -EPERM;
1360 
1361         f = fdget(container_fd);
1362         if (!f.file)
1363                 return -EBADF;
1364 
1365         /* Sanity check, is this really our fd? */
1366         if (f.file->f_op != &vfio_fops) {
1367                 fdput(f);
1368                 return -EINVAL;
1369         }
1370 
1371         container = f.file->private_data;
1372         WARN_ON(!container); /* fget ensures we don't race vfio_release */
1373 
1374         down_write(&container->group_lock);
1375 
1376         /* Real groups and fake groups cannot mix */
1377         if (!list_empty(&container->group_list) &&
1378             container->noiommu != group->noiommu) {
1379                 ret = -EPERM;
1380                 goto unlock_out;
1381         }
1382 
1383         driver = container->iommu_driver;
1384         if (driver) {
1385                 ret = driver->ops->attach_group(container->iommu_data,
1386                                                 group->iommu_group);
1387                 if (ret)
1388                         goto unlock_out;
1389         }
1390 
1391         group->container = container;
1392         container->noiommu = group->noiommu;
1393         list_add(&group->container_next, &container->group_list);
1394 
1395         /* Get a reference on the container and mark a user within the group */
1396         vfio_container_get(container);
1397         atomic_inc(&group->container_users);
1398 
1399 unlock_out:
1400         up_write(&container->group_lock);
1401         fdput(f);
1402         return ret;
1403 }
1404 
1405 static bool vfio_group_viable(struct vfio_group *group)
1406 {
1407         return (iommu_group_for_each_dev(group->iommu_group,
1408                                          group, vfio_dev_viable) == 0);
1409 }
1410 
1411 static int vfio_group_add_container_user(struct vfio_group *group)
1412 {
1413         if (!atomic_inc_not_zero(&group->container_users))
1414                 return -EINVAL;
1415 
1416         if (group->noiommu) {
1417                 atomic_dec(&group->container_users);
1418                 return -EPERM;
1419         }
1420         if (!group->container->iommu_driver || !vfio_group_viable(group)) {
1421                 atomic_dec(&group->container_users);
1422                 return -EINVAL;
1423         }
1424 
1425         return 0;
1426 }
1427 
1428 static const struct file_operations vfio_device_fops;
1429 
1430 static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
1431 {
1432         struct vfio_device *device;
1433         struct file *filep;
1434         int ret;
1435 
1436         if (0 == atomic_read(&group->container_users) ||
1437             !group->container->iommu_driver || !vfio_group_viable(group))
1438                 return -EINVAL;
1439 
1440         if (group->noiommu && !capable(CAP_SYS_RAWIO))
1441                 return -EPERM;
1442 
1443         device = vfio_device_get_from_name(group, buf);
1444         if (!device)
1445                 return -ENODEV;
1446 
1447         ret = device->ops->open(device->device_data);
1448         if (ret) {
1449                 vfio_device_put(device);
1450                 return ret;
1451         }
1452 
1453         /*
1454          * We can't use anon_inode_getfd() because we need to modify
1455          * the f_mode flags directly to allow more than just ioctls
1456          */
1457         ret = get_unused_fd_flags(O_CLOEXEC);
1458         if (ret < 0) {
1459                 device->ops->release(device->device_data);
1460                 vfio_device_put(device);
1461                 return ret;
1462         }
1463 
1464         filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
1465                                    device, O_RDWR);
1466         if (IS_ERR(filep)) {
1467                 put_unused_fd(ret);
1468                 ret = PTR_ERR(filep);
1469                 device->ops->release(device->device_data);
1470                 vfio_device_put(device);
1471                 return ret;
1472         }
1473 
1474         /*
1475          * TODO: add an anon_inode interface to do this.
1476          * Appears to be missing by lack of need rather than
1477          * explicitly prevented.  Now there's need.
1478          */
1479         filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
1480 
1481         atomic_inc(&group->container_users);
1482 
1483         fd_install(ret, filep);
1484 
1485         if (group->noiommu)
1486                 dev_warn(device->dev, "vfio-noiommu device opened by user "
1487                          "(%s:%d)\n", current->comm, task_pid_nr(current));
1488 
1489         return ret;
1490 }
1491 
1492 static long vfio_group_fops_unl_ioctl(struct file *filep,
1493                                       unsigned int cmd, unsigned long arg)
1494 {
1495         struct vfio_group *group = filep->private_data;
1496         long ret = -ENOTTY;
1497 
1498         switch (cmd) {
1499         case VFIO_GROUP_GET_STATUS:
1500         {
1501                 struct vfio_group_status status;
1502                 unsigned long minsz;
1503 
1504                 minsz = offsetofend(struct vfio_group_status, flags);
1505 
1506                 if (copy_from_user(&status, (void __user *)arg, minsz))
1507                         return -EFAULT;
1508 
1509                 if (status.argsz < minsz)
1510                         return -EINVAL;
1511 
1512                 status.flags = 0;
1513 
1514                 if (vfio_group_viable(group))
1515                         status.flags |= VFIO_GROUP_FLAGS_VIABLE;
1516 
1517                 if (group->container)
1518                         status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
1519 
1520                 if (copy_to_user((void __user *)arg, &status, minsz))
1521                         return -EFAULT;
1522 
1523                 ret = 0;
1524                 break;
1525         }
1526         case VFIO_GROUP_SET_CONTAINER:
1527         {
1528                 int fd;
1529 
1530                 if (get_user(fd, (int __user *)arg))
1531                         return -EFAULT;
1532 
1533                 if (fd < 0)
1534                         return -EINVAL;
1535 
1536                 ret = vfio_group_set_container(group, fd);
1537                 break;
1538         }
1539         case VFIO_GROUP_UNSET_CONTAINER:
1540                 ret = vfio_group_unset_container(group);
1541                 break;
1542         case VFIO_GROUP_GET_DEVICE_FD:
1543         {
1544                 char *buf;
1545 
1546                 buf = strndup_user((const char __user *)arg, PAGE_SIZE);
1547                 if (IS_ERR(buf))
1548                         return PTR_ERR(buf);
1549 
1550                 ret = vfio_group_get_device_fd(group, buf);
1551                 kfree(buf);
1552                 break;
1553         }
1554         }
1555 
1556         return ret;
1557 }
1558 
1559 #ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point for the group fd: runs @arg through
 * compat_ptr() and then defers to vfio_group_fops_unl_ioctl().
 */
static long vfio_group_fops_compat_ioctl(struct file *filep,
					 unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return vfio_group_fops_unl_ioctl(filep, cmd, native_arg);
}
1566 #endif  /* CONFIG_COMPAT */
1567 
1568 static int vfio_group_fops_open(struct inode *inode, struct file *filep)
1569 {
1570         struct vfio_group *group;
1571         int opened;
1572 
1573         group = vfio_group_get_from_minor(iminor(inode));
1574         if (!group)
1575                 return -ENODEV;
1576 
1577         if (group->noiommu && !capable(CAP_SYS_RAWIO)) {
1578                 vfio_group_put(group);
1579                 return -EPERM;
1580         }
1581 
1582         /* Do we need multiple instances of the group open?  Seems not. */
1583         opened = atomic_cmpxchg(&group->opened, 0, 1);
1584         if (opened) {
1585                 vfio_group_put(group);
1586                 return -EBUSY;
1587         }
1588 
1589         /* Is something still in use from a previous open? */
1590         if (group->container) {
1591                 atomic_dec(&group->opened);
1592                 vfio_group_put(group);
1593                 return -EBUSY;
1594         }
1595 
1596         /* Warn if previous user didn't cleanup and re-init to drop them */
1597         if (WARN_ON(group->notifier.head))
1598                 BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier);
1599 
1600         filep->private_data = group;
1601 
1602         return 0;
1603 }
1604 
1605 static int vfio_group_fops_release(struct inode *inode, struct file *filep)
1606 {
1607         struct vfio_group *group = filep->private_data;
1608 
1609         filep->private_data = NULL;
1610 
1611         vfio_group_try_dissolve_container(group);
1612 
1613         atomic_dec(&group->opened);
1614 
1615         vfio_group_put(group);
1616 
1617         return 0;
1618 }
1619 
1620 static const struct file_operations vfio_group_fops = {
1621         .owner          = THIS_MODULE,
1622         .unlocked_ioctl = vfio_group_fops_unl_ioctl,
1623 #ifdef CONFIG_COMPAT
1624         .compat_ioctl   = vfio_group_fops_compat_ioctl,
1625 #endif
1626         .open           = vfio_group_fops_open,
1627         .release        = vfio_group_fops_release,
1628 };
1629 
1630 /**
1631  * VFIO Device fd
1632  */
1633 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
1634 {
1635         struct vfio_device *device = filep->private_data;
1636 
1637         device->ops->release(device->device_data);
1638 
1639         vfio_group_try_dissolve_container(device->group);
1640 
1641         vfio_device_put(device);
1642 
1643         return 0;
1644 }
1645 
1646 static long vfio_device_fops_unl_ioctl(struct file *filep,
1647                                        unsigned int cmd, unsigned long arg)
1648 {
1649         struct vfio_device *device = filep->private_data;
1650 
1651         if (unlikely(!device->ops->ioctl))
1652                 return -EINVAL;
1653 
1654         return device->ops->ioctl(device->device_data, cmd, arg);
1655 }
1656 
1657 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1658                                      size_t count, loff_t *ppos)
1659 {
1660         struct vfio_device *device = filep->private_data;
1661 
1662         if (unlikely(!device->ops->read))
1663                 return -EINVAL;
1664 
1665         return device->ops->read(device->device_data, buf, count, ppos);
1666 }
1667 
1668 static ssize_t vfio_device_fops_write(struct file *filep,
1669                                       const char __user *buf,
1670                                       size_t count, loff_t *ppos)
1671 {
1672         struct vfio_device *device = filep->private_data;
1673 
1674         if (unlikely(!device->ops->write))
1675                 return -EINVAL;
1676 
1677         return device->ops->write(device->device_data, buf, count, ppos);
1678 }
1679 
1680 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1681 {
1682         struct vfio_device *device = filep->private_data;
1683 
1684         if (unlikely(!device->ops->mmap))
1685                 return -EINVAL;
1686 
1687         return device->ops->mmap(device->device_data, vma);
1688 }
1689 
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point for the device fd: run the argument through
 * compat_ptr() and forward the request to the native ioctl handler.
 */
static long vfio_device_fops_compat_ioctl(struct file *filep,
					  unsigned int cmd, unsigned long arg)
{
	unsigned long native_arg = (unsigned long)compat_ptr(arg);

	return vfio_device_fops_unl_ioctl(filep, cmd, native_arg);
}
#endif	/* CONFIG_COMPAT */
1698 
1699 static const struct file_operations vfio_device_fops = {
1700         .owner          = THIS_MODULE,
1701         .release        = vfio_device_fops_release,
1702         .read           = vfio_device_fops_read,
1703         .write          = vfio_device_fops_write,
1704         .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1705 #ifdef CONFIG_COMPAT
1706         .compat_ioctl   = vfio_device_fops_compat_ioctl,
1707 #endif
1708         .mmap           = vfio_device_fops_mmap,
1709 };
1710 
1711 /**
1712  * External user API, exported by symbols to be linked dynamically.
1713  *
1714  * The protocol includes:
1715  *  1. do normal VFIO init operation:
1716  *      - opening a new container;
1717  *      - attaching group(s) to it;
1718  *      - setting an IOMMU driver for a container.
1719  * When IOMMU is set for a container, all groups in it are
1720  * considered ready to use by an external user.
1721  *
1722  * 2. User space passes a group fd to an external user.
1723  * The external user calls vfio_group_get_external_user()
1724  * to verify that:
1725  *      - the group is initialized;
1726  *      - IOMMU is set for it.
1727  * If both checks passed, vfio_group_get_external_user()
1728  * increments the container user counter to prevent
1729  * the VFIO group from disposal before KVM exits.
1730  *
1731  * 3. The external user calls vfio_external_user_iommu_id()
1732  * to know an IOMMU ID.
1733  *
1734  * 4. When the external KVM finishes, it calls
1735  * vfio_group_put_external_user() to release the VFIO group.
1736  * This call decrements the container user counter.
1737  */
1738 struct vfio_group *vfio_group_get_external_user(struct file *filep)
1739 {
1740         struct vfio_group *group = filep->private_data;
1741         int ret;
1742 
1743         if (filep->f_op != &vfio_group_fops)
1744                 return ERR_PTR(-EINVAL);
1745 
1746         ret = vfio_group_add_container_user(group);
1747         if (ret)
1748                 return ERR_PTR(ret);
1749 
1750         vfio_group_get(group);
1751 
1752         return group;
1753 }
1754 EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
1755 
/*
 * Counterpart of vfio_group_get_external_user(): try to dissolve the
 * group's container, then drop the group reference.
 */
void vfio_group_put_external_user(struct vfio_group *group)
{
	vfio_group_try_dissolve_container(group);

	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
1762 
1763 bool vfio_external_group_match_file(struct vfio_group *test_group,
1764                                     struct file *filep)
1765 {
1766         struct vfio_group *group = filep->private_data;
1767 
1768         return (filep->f_op == &vfio_group_fops) && (group == test_group);
1769 }
1770 EXPORT_SYMBOL_GPL(vfio_external_group_match_file);
1771 
1772 int vfio_external_user_iommu_id(struct vfio_group *group)
1773 {
1774         return iommu_group_id(group->iommu_group);
1775 }
1776 EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
1777 
1778 long vfio_external_check_extension(struct vfio_group *group, unsigned long arg)
1779 {
1780         return vfio_ioctl_check_extension(group->container, arg);
1781 }
1782 EXPORT_SYMBOL_GPL(vfio_external_check_extension);
1783 
1784 /**
1785  * Sub-module support
1786  */
1787 /*
1788  * Helper for managing a buffer of info chain capabilities, allocate or
1789  * reallocate a buffer with additional @size, filling in @id and @version
1790  * of the capability.  A pointer to the new capability is returned.
1791  *
1792  * NB. The chain is based at the head of the buffer, so new entries are
1793  * added to the tail, vfio_info_cap_shift() should be called to fixup the
1794  * next offsets prior to copying to the user buffer.
1795  */
1796 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1797                                                size_t size, u16 id, u16 version)
1798 {
1799         void *buf;
1800         struct vfio_info_cap_header *header, *tmp;
1801 
1802         buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1803         if (!buf) {
1804                 kfree(caps->buf);
1805                 caps->size = 0;
1806                 return ERR_PTR(-ENOMEM);
1807         }
1808 
1809         caps->buf = buf;
1810         header = buf + caps->size;
1811 
1812         /* Eventually copied to user buffer, zero */
1813         memset(header, 0, size);
1814 
1815         header->id = id;
1816         header->version = version;
1817 
1818         /* Add to the end of the capability chain */
1819         for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1820                 ; /* nothing */
1821 
1822         tmp->next = caps->size;
1823         caps->size += size;
1824 
1825         return header;
1826 }
1827 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1828 
1829 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1830 {
1831         struct vfio_info_cap_header *tmp;
1832         void *buf = (void *)caps->buf;
1833 
1834         for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1835                 tmp->next += offset;
1836 }
1837 EXPORT_SYMBOL(vfio_info_cap_shift);
1838 
1839 int vfio_info_add_capability(struct vfio_info_cap *caps,
1840                              struct vfio_info_cap_header *cap, size_t size)
1841 {
1842         struct vfio_info_cap_header *header;
1843 
1844         header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1845         if (IS_ERR(header))
1846                 return PTR_ERR(header);
1847 
1848         memcpy(header + 1, cap + 1, size - sizeof(*header));
1849 
1850         return 0;
1851 }
1852 EXPORT_SYMBOL(vfio_info_add_capability);
1853 
1854 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1855                                        int max_irq_type, size_t *data_size)
1856 {
1857         unsigned long minsz;
1858         size_t size;
1859 
1860         minsz = offsetofend(struct vfio_irq_set, count);
1861 
1862         if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1863             (hdr->count >= (U32_MAX - hdr->start)) ||
1864             (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1865                                 VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1866                 return -EINVAL;
1867 
1868         if (data_size)
1869                 *data_size = 0;
1870 
1871         if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1872                 return -EINVAL;
1873 
1874         switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1875         case VFIO_IRQ_SET_DATA_NONE:
1876                 size = 0;
1877                 break;
1878         case VFIO_IRQ_SET_DATA_BOOL:
1879                 size = sizeof(uint8_t);
1880                 break;
1881         case VFIO_IRQ_SET_DATA_EVENTFD:
1882                 size = sizeof(int32_t);
1883                 break;
1884         default:
1885                 return -EINVAL;
1886         }
1887 
1888         if (size) {
1889                 if (hdr->argsz - minsz < hdr->count * size)
1890                         return -EINVAL;
1891 
1892                 if (!data_size)
1893                         return -EINVAL;
1894 
1895                 *data_size = hdr->count * size;
1896         }
1897 
1898         return 0;
1899 }
1900 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
1901 
1902 /*
1903  * Pin a set of guest PFNs and return their associated host PFNs for local
1904  * domain only.
1905  * @dev [in]     : device
1906  * @user_pfn [in]: array of user/guest PFNs to be pinned.
1907  * @npage [in]   : count of elements in user_pfn array.  This count should not
1908  *                 be greater VFIO_PIN_PAGES_MAX_ENTRIES.
1909  * @prot [in]    : protection flags
1910  * @phys_pfn[out]: array of host PFNs
1911  * Return error or number of pages pinned.
1912  */
1913 int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage,
1914                    int prot, unsigned long *phys_pfn)
1915 {
1916         struct vfio_container *container;
1917         struct vfio_group *group;
1918         struct vfio_iommu_driver *driver;
1919         int ret;
1920 
1921         if (!dev || !user_pfn || !phys_pfn || !npage)
1922                 return -EINVAL;
1923 
1924         if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1925                 return -E2BIG;
1926 
1927         group = vfio_group_get_from_dev(dev);
1928         if (!group)
1929                 return -ENODEV;
1930 
1931         ret = vfio_group_add_container_user(group);
1932         if (ret)
1933                 goto err_pin_pages;
1934 
1935         container = group->container;
1936         driver = container->iommu_driver;
1937         if (likely(driver && driver->ops->pin_pages))
1938                 ret = driver->ops->pin_pages(container->iommu_data, user_pfn,
1939                                              npage, prot, phys_pfn);
1940         else
1941                 ret = -ENOTTY;
1942 
1943         vfio_group_try_dissolve_container(group);
1944 
1945 err_pin_pages:
1946         vfio_group_put(group);
1947         return ret;
1948 }
1949 EXPORT_SYMBOL(vfio_pin_pages);
1950 
1951 /*
1952  * Unpin set of host PFNs for local domain only.
1953  * @dev [in]     : device
1954  * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest
1955  *                 PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1956  * @npage [in]   : count of elements in user_pfn array.  This count should not
1957  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1958  * Return error or number of pages unpinned.
1959  */
1960 int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage)
1961 {
1962         struct vfio_container *container;
1963         struct vfio_group *group;
1964         struct vfio_iommu_driver *driver;
1965         int ret;
1966 
1967         if (!dev || !user_pfn || !npage)
1968                 return -EINVAL;
1969 
1970         if (npage > VFIO_PIN_PAGES_MAX_ENTRIES)
1971                 return -E2BIG;
1972 
1973         group = vfio_group_get_from_dev(dev);
1974         if (!group)
1975                 return -ENODEV;
1976 
1977         ret = vfio_group_add_container_user(group);
1978         if (ret)
1979                 goto err_unpin_pages;
1980 
1981         container = group->container;
1982         driver = container->iommu_driver;
1983         if (likely(driver && driver->ops->unpin_pages))
1984                 ret = driver->ops->unpin_pages(container->iommu_data, user_pfn,
1985                                                npage);
1986         else
1987                 ret = -ENOTTY;
1988 
1989         vfio_group_try_dissolve_container(group);
1990 
1991 err_unpin_pages:
1992         vfio_group_put(group);
1993         return ret;
1994 }
1995 EXPORT_SYMBOL(vfio_unpin_pages);
1996 
1997 static int vfio_register_iommu_notifier(struct vfio_group *group,
1998                                         unsigned long *events,
1999                                         struct notifier_block *nb)
2000 {
2001         struct vfio_container *container;
2002         struct vfio_iommu_driver *driver;
2003         int ret;
2004 
2005         ret = vfio_group_add_container_user(group);
2006         if (ret)
2007                 return -EINVAL;
2008 
2009         container = group->container;
2010         driver = container->iommu_driver;
2011         if (likely(driver && driver->ops->register_notifier))
2012                 ret = driver->ops->register_notifier(container->iommu_data,
2013                                                      events, nb);
2014         else
2015                 ret = -ENOTTY;
2016 
2017         vfio_group_try_dissolve_container(group);
2018 
2019         return ret;
2020 }
2021 
2022 static int vfio_unregister_iommu_notifier(struct vfio_group *group,
2023                                           struct notifier_block *nb)
2024 {
2025         struct vfio_container *container;
2026         struct vfio_iommu_driver *driver;
2027         int ret;
2028 
2029         ret = vfio_group_add_container_user(group);
2030         if (ret)
2031                 return -EINVAL;
2032 
2033         container = group->container;
2034         driver = container->iommu_driver;
2035         if (likely(driver && driver->ops->unregister_notifier))
2036                 ret = driver->ops->unregister_notifier(container->iommu_data,
2037                                                        nb);
2038         else
2039                 ret = -ENOTTY;
2040 
2041         vfio_group_try_dissolve_container(group);
2042 
2043         return ret;
2044 }
2045 
2046 void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
2047 {
2048         group->kvm = kvm;
2049         blocking_notifier_call_chain(&group->notifier,
2050                                 VFIO_GROUP_NOTIFY_SET_KVM, kvm);
2051 }
2052 EXPORT_SYMBOL_GPL(vfio_group_set_kvm);
2053 
2054 static int vfio_register_group_notifier(struct vfio_group *group,
2055                                         unsigned long *events,
2056                                         struct notifier_block *nb)
2057 {
2058         int ret;
2059         bool set_kvm = false;
2060 
2061         if (*events & VFIO_GROUP_NOTIFY_SET_KVM)
2062                 set_kvm = true;
2063 
2064         /* clear known events */
2065         *events &= ~VFIO_GROUP_NOTIFY_SET_KVM;
2066 
2067         /* refuse to continue if still events remaining */
2068         if (*events)
2069                 return -EINVAL;
2070 
2071         ret = vfio_group_add_container_user(group);
2072         if (ret)
2073                 return -EINVAL;
2074 
2075         ret = blocking_notifier_chain_register(&group->notifier, nb);
2076 
2077         /*
2078          * The attaching of kvm and vfio_group might already happen, so
2079          * here we replay once upon registration.
2080          */
2081         if (!ret && set_kvm && group->kvm)
2082                 blocking_notifier_call_chain(&group->notifier,
2083                                         VFIO_GROUP_NOTIFY_SET_KVM, group->kvm);
2084 
2085         vfio_group_try_dissolve_container(group);
2086 
2087         return ret;
2088 }
2089 
2090 static int vfio_unregister_group_notifier(struct vfio_group *group,
2091                                          struct notifier_block *nb)
2092 {
2093         int ret;
2094 
2095         ret = vfio_group_add_container_user(group);
2096         if (ret)
2097                 return -EINVAL;
2098 
2099         ret = blocking_notifier_chain_unregister(&group->notifier, nb);
2100 
2101         vfio_group_try_dissolve_container(group);
2102 
2103         return ret;
2104 }
2105 
2106 int vfio_register_notifier(struct device *dev, enum vfio_notify_type type,
2107                            unsigned long *events, struct notifier_block *nb)
2108 {
2109         struct vfio_group *group;
2110         int ret;
2111 
2112         if (!dev || !nb || !events || (*events == 0))
2113                 return -EINVAL;
2114 
2115         group = vfio_group_get_from_dev(dev);
2116         if (!group)
2117                 return -ENODEV;
2118 
2119         switch (type) {
2120         case VFIO_IOMMU_NOTIFY:
2121                 ret = vfio_register_iommu_notifier(group, events, nb);
2122                 break;
2123         case VFIO_GROUP_NOTIFY:
2124                 ret = vfio_register_group_notifier(group, events, nb);
2125                 break;
2126         default:
2127                 ret = -EINVAL;
2128         }
2129 
2130         vfio_group_put(group);
2131         return ret;
2132 }
2133 EXPORT_SYMBOL(vfio_register_notifier);
2134 
2135 int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type,
2136                              struct notifier_block *nb)
2137 {
2138         struct vfio_group *group;
2139         int ret;
2140 
2141         if (!dev || !nb)
2142                 return -EINVAL;
2143 
2144         group = vfio_group_get_from_dev(dev);
2145         if (!group)
2146                 return -ENODEV;
2147 
2148         switch (type) {
2149         case VFIO_IOMMU_NOTIFY:
2150                 ret = vfio_unregister_iommu_notifier(group, nb);
2151                 break;
2152         case VFIO_GROUP_NOTIFY:
2153                 ret = vfio_unregister_group_notifier(group, nb);
2154                 break;
2155         default:
2156                 ret = -EINVAL;
2157         }
2158 
2159         vfio_group_put(group);
2160         return ret;
2161 }
2162 EXPORT_SYMBOL(vfio_unregister_notifier);
2163 
2164 /**
2165  * Module/class support
2166  */
2167 static char *vfio_devnode(struct device *dev, umode_t *mode)
2168 {
2169         return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
2170 }
2171 
2172 static struct miscdevice vfio_dev = {
2173         .minor = VFIO_MINOR,
2174         .name = "vfio",
2175         .fops = &vfio_fops,
2176         .nodename = "vfio/vfio",
2177         .mode = S_IRUGO | S_IWUGO,
2178 };
2179 
2180 static int __init vfio_init(void)
2181 {
2182         int ret;
2183 
2184         idr_init(&vfio.group_idr);
2185         mutex_init(&vfio.group_lock);
2186         mutex_init(&vfio.iommu_drivers_lock);
2187         INIT_LIST_HEAD(&vfio.group_list);
2188         INIT_LIST_HEAD(&vfio.iommu_drivers_list);
2189         init_waitqueue_head(&vfio.release_q);
2190 
2191         ret = misc_register(&vfio_dev);
2192         if (ret) {
2193                 pr_err("vfio: misc device register failed\n");
2194                 return ret;
2195         }
2196 
2197         /* /dev/vfio/$GROUP */
2198         vfio.class = class_create(THIS_MODULE, "vfio");
2199         if (IS_ERR(vfio.class)) {
2200                 ret = PTR_ERR(vfio.class);
2201                 goto err_class;
2202         }
2203 
2204         vfio.class->devnode = vfio_devnode;
2205 
2206         ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
2207         if (ret)
2208                 goto err_alloc_chrdev;
2209 
2210         cdev_init(&vfio.group_cdev, &vfio_group_fops);
2211         ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK + 1);
2212         if (ret)
2213                 goto err_cdev_add;
2214 
2215         pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
2216 
2217 #ifdef CONFIG_VFIO_NOIOMMU
2218         vfio_register_iommu_driver(&vfio_noiommu_ops);
2219 #endif
2220         return 0;
2221 
2222 err_cdev_add:
2223         unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2224 err_alloc_chrdev:
2225         class_destroy(vfio.class);
2226         vfio.class = NULL;
2227 err_class:
2228         misc_deregister(&vfio_dev);
2229         return ret;
2230 }
2231 
2232 static void __exit vfio_cleanup(void)
2233 {
2234         WARN_ON(!list_empty(&vfio.group_list));
2235 
2236 #ifdef CONFIG_VFIO_NOIOMMU
2237         vfio_unregister_iommu_driver(&vfio_noiommu_ops);
2238 #endif
2239         idr_destroy(&vfio.group_idr);
2240         cdev_del(&vfio.group_cdev);
2241         unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
2242         class_destroy(vfio.class);
2243         vfio.class = NULL;
2244         misc_deregister(&vfio_dev);
2245 }
2246 
2247 module_init(vfio_init);
2248 module_exit(vfio_cleanup);
2249 
2250 MODULE_VERSION(DRIVER_VERSION);
2251 MODULE_LICENSE("GPL v2");
2252 MODULE_AUTHOR(DRIVER_AUTHOR);
2253 MODULE_DESCRIPTION(DRIVER_DESC);
2254 MODULE_ALIAS_MISCDEV(VFIO_MINOR);
2255 MODULE_ALIAS("devname:vfio/vfio");
2256 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");

/* [<][>][^][v][top][bottom][index][help] */