root/drivers/misc/habanalabs/device.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. hl_device_disabled_or_in_reset
  2. hpriv_release
  3. hl_hpriv_get
  4. hl_hpriv_put
  5. hl_device_release
  6. hl_device_release_ctrl
  7. hl_mmap
  8. device_release_func
  9. device_init_cdev
  10. device_cdev_sysfs_add
  11. device_cdev_sysfs_del
  12. device_early_init
  13. device_early_fini
  14. set_freq_to_low_job
  15. hl_device_heartbeat
  16. device_late_init
  17. device_late_fini
  18. hl_device_utilization
  19. hl_device_set_frequency
  20. hl_device_set_debug_mode
  21. hl_device_suspend
  22. hl_device_resume
  23. device_kill_open_processes
  24. device_hard_reset_pending
  25. hl_device_reset
  26. hl_device_init
  27. hl_device_fini
  28. hl_rreg
  29. hl_wreg

   1 // SPDX-License-Identifier: GPL-2.0
   2 
   3 /*
   4  * Copyright 2016-2019 HabanaLabs, Ltd.
   5  * All Rights Reserved.
   6  */
   7 
   8 #define pr_fmt(fmt)                     "habanalabs: " fmt
   9 
  10 #include "habanalabs.h"
  11 
  12 #include <linux/pci.h>
  13 #include <linux/sched/signal.h>
  14 #include <linux/hwmon.h>
  15 #include <uapi/misc/habanalabs.h>
  16 
  17 #define HL_PLDM_PENDING_RESET_PER_SEC   (HL_PENDING_RESET_PER_SEC * 10)
  18 
  19 bool hl_device_disabled_or_in_reset(struct hl_device *hdev)
  20 {
  21         if ((hdev->disabled) || (atomic_read(&hdev->in_reset)))
  22                 return true;
  23         else
  24                 return false;
  25 }
  26 
  27 enum hl_device_status hl_device_status(struct hl_device *hdev)
  28 {
  29         enum hl_device_status status;
  30 
  31         if (hdev->disabled)
  32                 status = HL_DEVICE_STATUS_MALFUNCTION;
  33         else if (atomic_read(&hdev->in_reset))
  34                 status = HL_DEVICE_STATUS_IN_RESET;
  35         else
  36                 status = HL_DEVICE_STATUS_OPERATIONAL;
  37 
  38         return status;
  39 };
  40 
/*
 * hpriv_release - final teardown of a process' private device data
 *
 * @ref: the embedded kref of the hl_fpriv whose refcount dropped to zero
 *
 * Called via kref_put() from hl_hpriv_put() when the last reference is
 * dropped. Releases per-process resources and detaches the process from
 * the device.
 */
static void hpriv_release(struct kref *ref)
{
	struct hl_fpriv *hpriv;
	struct hl_device *hdev;
	struct hl_ctx *ctx;

	hpriv = container_of(ref, struct hl_fpriv, refcount);

	hdev = hpriv->hdev;
	/* NOTE(review): ctx is fetched but not used in this function as
	 * visible here — presumably kept for symmetry/history; confirm
	 * before removing.
	 */
	ctx = hpriv->ctx;

	put_pid(hpriv->taskpid);

	hl_debugfs_remove_file(hpriv);

	mutex_destroy(&hpriv->restore_phase_mutex);

	/* Unlink from the device's open-files list and clear the active
	 * compute context under fpriv_list_lock so list walkers observe a
	 * consistent state.
	 */
	mutex_lock(&hdev->fpriv_list_lock);
	list_del(&hpriv->dev_node);
	hdev->compute_ctx = NULL;
	mutex_unlock(&hdev->fpriv_list_lock);

	kfree(hpriv);
}
  65 
/* Take an additional reference on a process' private device data */
void hl_hpriv_get(struct hl_fpriv *hpriv)
{
	kref_get(&hpriv->refcount);
}
  70 
/*
 * Drop a reference on a process' private device data; frees it via
 * hpriv_release() when the count reaches zero.
 */
void hl_hpriv_put(struct hl_fpriv *hpriv)
{
	kref_put(&hpriv->refcount, hpriv_release);
}
  75 
  76 /*
  77  * hl_device_release - release function for habanalabs device
  78  *
  79  * @inode: pointer to inode structure
  80  * @filp: pointer to file structure
  81  *
  82  * Called when process closes an habanalabs device
  83  */
  84 static int hl_device_release(struct inode *inode, struct file *filp)
  85 {
  86         struct hl_fpriv *hpriv = filp->private_data;
  87 
  88         hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
  89         hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
  90 
  91         filp->private_data = NULL;
  92 
  93         hl_hpriv_put(hpriv);
  94 
  95         return 0;
  96 }
  97 
  98 static int hl_device_release_ctrl(struct inode *inode, struct file *filp)
  99 {
 100         struct hl_fpriv *hpriv = filp->private_data;
 101         struct hl_device *hdev;
 102 
 103         filp->private_data = NULL;
 104 
 105         hdev = hpriv->hdev;
 106 
 107         mutex_lock(&hdev->fpriv_list_lock);
 108         list_del(&hpriv->dev_node);
 109         mutex_unlock(&hdev->fpriv_list_lock);
 110 
 111         kfree(hpriv);
 112 
 113         return 0;
 114 }
 115 
 116 /*
 117  * hl_mmap - mmap function for habanalabs device
 118  *
 119  * @*filp: pointer to file structure
 120  * @*vma: pointer to vm_area_struct of the process
 121  *
 122  * Called when process does an mmap on habanalabs device. Call the device's mmap
 123  * function at the end of the common code.
 124  */
 125 static int hl_mmap(struct file *filp, struct vm_area_struct *vma)
 126 {
 127         struct hl_fpriv *hpriv = filp->private_data;
 128 
 129         if ((vma->vm_pgoff & HL_MMAP_CB_MASK) == HL_MMAP_CB_MASK) {
 130                 vma->vm_pgoff ^= HL_MMAP_CB_MASK;
 131                 return hl_cb_mmap(hpriv, vma);
 132         }
 133 
 134         return -EINVAL;
 135 }
 136 
/* File operations for the compute device node */
static const struct file_operations hl_ops = {
	.owner = THIS_MODULE,
	.open = hl_device_open,
	.release = hl_device_release,
	.mmap = hl_mmap,
	.unlocked_ioctl = hl_ioctl,
	.compat_ioctl = hl_ioctl
};
 145 
/* File operations for the control device node (no mmap support) */
static const struct file_operations hl_ctrl_ops = {
	.owner = THIS_MODULE,
	.open = hl_device_open_ctrl,
	.release = hl_device_release_ctrl,
	.unlocked_ioctl = hl_ioctl_control,
	.compat_ioctl = hl_ioctl_control
};
 153 
/*
 * Release callback for the device objects allocated in device_init_cdev();
 * invoked by the driver core when the device's last reference is dropped.
 */
static void device_release_func(struct device *dev)
{
	kfree(dev);
}
 158 
/*
 * device_init_cdev - Initialize cdev and device for habanalabs device
 *
 * @hdev: pointer to habanalabs device structure
 * @hclass: pointer to the class object of the device
 * @minor: minor number of the specific device
 * @fops: file operations to install for this device
 * @name: name of the device as it will appear in the filesystem
 * @cdev: pointer to the char device object that will be initialized
 * @dev: pointer to the device object that will be initialized
 *
 * Initialize a cdev and a Linux device for habanalabs's device. The
 * allocated device object is freed by device_release_func() when its
 * last reference is dropped.
 *
 * Returns 0 on success, -ENOMEM if the device object cannot be allocated.
 */
static int device_init_cdev(struct hl_device *hdev, struct class *hclass,
				int minor, const struct file_operations *fops,
				char *name, struct cdev *cdev,
				struct device **dev)
{
	cdev_init(cdev, fops);
	cdev->owner = THIS_MODULE;

	*dev = kzalloc(sizeof(**dev), GFP_KERNEL);
	if (!*dev)
		return -ENOMEM;

	device_initialize(*dev);
	(*dev)->devt = MKDEV(hdev->major, minor);
	(*dev)->class = hclass;
	(*dev)->release = device_release_func;
	dev_set_drvdata(*dev, hdev);
	dev_set_name(*dev, "%s", name);

	return 0;
}
 193 
/*
 * device_cdev_sysfs_add - expose the device to user-space
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Adds the compute and control char devices to the system and then
 * initializes sysfs. On any failure, previously-added devices are removed
 * in reverse order. Sets cdev_sysfs_created on full success so that
 * device_cdev_sysfs_del() knows how to tear down.
 *
 * Returns 0 on success, a negative error code otherwise.
 */
static int device_cdev_sysfs_add(struct hl_device *hdev)
{
	int rc;

	rc = cdev_device_add(&hdev->cdev, hdev->dev);
	if (rc) {
		dev_err(hdev->dev,
			"failed to add a char device to the system\n");
		return rc;
	}

	rc = cdev_device_add(&hdev->cdev_ctrl, hdev->dev_ctrl);
	if (rc) {
		dev_err(hdev->dev,
			"failed to add a control char device to the system\n");
		goto delete_cdev_device;
	}

	/* hl_sysfs_init() must be done after adding the device to the system */
	rc = hl_sysfs_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize sysfs\n");
		goto delete_ctrl_cdev_device;
	}

	hdev->cdev_sysfs_created = true;

	return 0;

delete_ctrl_cdev_device:
	cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl);
delete_cdev_device:
	cdev_device_del(&hdev->cdev, hdev->dev);
	return rc;
}
 229 
 230 static void device_cdev_sysfs_del(struct hl_device *hdev)
 231 {
 232         /* device_release() won't be called so must free devices explicitly */
 233         if (!hdev->cdev_sysfs_created) {
 234                 kfree(hdev->dev_ctrl);
 235                 kfree(hdev->dev);
 236                 return;
 237         }
 238 
 239         hl_sysfs_fini(hdev);
 240         cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl);
 241         cdev_device_del(&hdev->cdev, hdev->dev);
 242 }
 243 
 244 /*
 245  * device_early_init - do some early initialization for the habanalabs device
 246  *
 247  * @hdev: pointer to habanalabs device structure
 248  *
 249  * Install the relevant function pointers and call the early_init function,
 250  * if such a function exists
 251  */
 252 static int device_early_init(struct hl_device *hdev)
 253 {
 254         int rc;
 255 
 256         switch (hdev->asic_type) {
 257         case ASIC_GOYA:
 258                 goya_set_asic_funcs(hdev);
 259                 strlcpy(hdev->asic_name, "GOYA", sizeof(hdev->asic_name));
 260                 break;
 261         default:
 262                 dev_err(hdev->dev, "Unrecognized ASIC type %d\n",
 263                         hdev->asic_type);
 264                 return -EINVAL;
 265         }
 266 
 267         rc = hdev->asic_funcs->early_init(hdev);
 268         if (rc)
 269                 return rc;
 270 
 271         rc = hl_asid_init(hdev);
 272         if (rc)
 273                 goto early_fini;
 274 
 275         hdev->cq_wq = alloc_workqueue("hl-free-jobs", WQ_UNBOUND, 0);
 276         if (hdev->cq_wq == NULL) {
 277                 dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
 278                 rc = -ENOMEM;
 279                 goto asid_fini;
 280         }
 281 
 282         hdev->eq_wq = alloc_workqueue("hl-events", WQ_UNBOUND, 0);
 283         if (hdev->eq_wq == NULL) {
 284                 dev_err(hdev->dev, "Failed to allocate EQ workqueue\n");
 285                 rc = -ENOMEM;
 286                 goto free_cq_wq;
 287         }
 288 
 289         hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info),
 290                                         GFP_KERNEL);
 291         if (!hdev->hl_chip_info) {
 292                 rc = -ENOMEM;
 293                 goto free_eq_wq;
 294         }
 295 
 296         hdev->idle_busy_ts_arr = kmalloc_array(HL_IDLE_BUSY_TS_ARR_SIZE,
 297                                         sizeof(struct hl_device_idle_busy_ts),
 298                                         (GFP_KERNEL | __GFP_ZERO));
 299         if (!hdev->idle_busy_ts_arr) {
 300                 rc = -ENOMEM;
 301                 goto free_chip_info;
 302         }
 303 
 304         hl_cb_mgr_init(&hdev->kernel_cb_mgr);
 305 
 306         mutex_init(&hdev->send_cpu_message_lock);
 307         mutex_init(&hdev->debug_lock);
 308         mutex_init(&hdev->mmu_cache_lock);
 309         INIT_LIST_HEAD(&hdev->hw_queues_mirror_list);
 310         spin_lock_init(&hdev->hw_queues_mirror_lock);
 311         INIT_LIST_HEAD(&hdev->fpriv_list);
 312         mutex_init(&hdev->fpriv_list_lock);
 313         atomic_set(&hdev->in_reset, 0);
 314 
 315         return 0;
 316 
 317 free_chip_info:
 318         kfree(hdev->hl_chip_info);
 319 free_eq_wq:
 320         destroy_workqueue(hdev->eq_wq);
 321 free_cq_wq:
 322         destroy_workqueue(hdev->cq_wq);
 323 asid_fini:
 324         hl_asid_fini(hdev);
 325 early_fini:
 326         if (hdev->asic_funcs->early_fini)
 327                 hdev->asic_funcs->early_fini(hdev);
 328 
 329         return rc;
 330 }
 331 
 332 /*
 333  * device_early_fini - finalize all that was done in device_early_init
 334  *
 335  * @hdev: pointer to habanalabs device structure
 336  *
 337  */
static void device_early_fini(struct hl_device *hdev)
{
	/* Teardown is the reverse of device_early_init() */
	mutex_destroy(&hdev->mmu_cache_lock);
	mutex_destroy(&hdev->debug_lock);
	mutex_destroy(&hdev->send_cpu_message_lock);

	mutex_destroy(&hdev->fpriv_list_lock);

	hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr);

	kfree(hdev->idle_busy_ts_arr);
	kfree(hdev->hl_chip_info);

	/* Flush and destroy the workqueues before ASID/ASIC teardown so no
	 * deferred work can run against freed state
	 */
	destroy_workqueue(hdev->eq_wq);
	destroy_workqueue(hdev->cq_wq);

	hl_asid_fini(hdev);

	if (hdev->asic_funcs->early_fini)
		hdev->asic_funcs->early_fini(hdev);
}
 359 
 360 static void set_freq_to_low_job(struct work_struct *work)
 361 {
 362         struct hl_device *hdev = container_of(work, struct hl_device,
 363                                                 work_freq.work);
 364 
 365         mutex_lock(&hdev->fpriv_list_lock);
 366 
 367         if (!hdev->compute_ctx)
 368                 hl_device_set_frequency(hdev, PLL_LOW);
 369 
 370         mutex_unlock(&hdev->fpriv_list_lock);
 371 
 372         schedule_delayed_work(&hdev->work_freq,
 373                         usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC));
 374 }
 375 
 376 static void hl_device_heartbeat(struct work_struct *work)
 377 {
 378         struct hl_device *hdev = container_of(work, struct hl_device,
 379                                                 work_heartbeat.work);
 380 
 381         if (hl_device_disabled_or_in_reset(hdev))
 382                 goto reschedule;
 383 
 384         if (!hdev->asic_funcs->send_heartbeat(hdev))
 385                 goto reschedule;
 386 
 387         dev_err(hdev->dev, "Device heartbeat failed!\n");
 388         hl_device_reset(hdev, true, false);
 389 
 390         return;
 391 
 392 reschedule:
 393         schedule_delayed_work(&hdev->work_heartbeat,
 394                         usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
 395 }
 396 
 397 /*
 398  * device_late_init - do late stuff initialization for the habanalabs device
 399  *
 400  * @hdev: pointer to habanalabs device structure
 401  *
 402  * Do stuff that either needs the device H/W queues to be active or needs
 403  * to happen after all the rest of the initialization is finished
 404  */
 405 static int device_late_init(struct hl_device *hdev)
 406 {
 407         int rc;
 408 
 409         if (hdev->asic_funcs->late_init) {
 410                 rc = hdev->asic_funcs->late_init(hdev);
 411                 if (rc) {
 412                         dev_err(hdev->dev,
 413                                 "failed late initialization for the H/W\n");
 414                         return rc;
 415                 }
 416         }
 417 
 418         hdev->high_pll = hdev->asic_prop.high_pll;
 419 
 420         /* force setting to low frequency */
 421         hdev->curr_pll_profile = PLL_LOW;
 422 
 423         if (hdev->pm_mng_profile == PM_AUTO)
 424                 hdev->asic_funcs->set_pll_profile(hdev, PLL_LOW);
 425         else
 426                 hdev->asic_funcs->set_pll_profile(hdev, PLL_LAST);
 427 
 428         INIT_DELAYED_WORK(&hdev->work_freq, set_freq_to_low_job);
 429         schedule_delayed_work(&hdev->work_freq,
 430         usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC));
 431 
 432         if (hdev->heartbeat) {
 433                 INIT_DELAYED_WORK(&hdev->work_heartbeat, hl_device_heartbeat);
 434                 schedule_delayed_work(&hdev->work_heartbeat,
 435                                 usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
 436         }
 437 
 438         hdev->late_init_done = true;
 439 
 440         return 0;
 441 }
 442 
 443 /*
 444  * device_late_fini - finalize all that was done in device_late_init
 445  *
 446  * @hdev: pointer to habanalabs device structure
 447  *
 448  */
static void device_late_fini(struct hl_device *hdev)
{
	/* Nothing to undo if late init never completed */
	if (!hdev->late_init_done)
		return;

	/* Stop the periodic works before ASIC late teardown */
	cancel_delayed_work_sync(&hdev->work_freq);
	if (hdev->heartbeat)
		cancel_delayed_work_sync(&hdev->work_heartbeat);

	if (hdev->asic_funcs->late_fini)
		hdev->asic_funcs->late_fini(hdev);

	hdev->late_init_done = false;
}
 463 
/*
 * hl_device_utilization - compute device busy percentage over a window
 *
 * @hdev: pointer to habanalabs device structure
 * @period_ms: length of the sampling window, in milliseconds
 *
 * Walks the idle/busy timestamp ring buffer backwards from the current
 * index, summing busy intervals that fall inside the window, clipping the
 * oldest interval that straddles the window edge. Returns the utilization
 * as a percentage (rounded up); returns 100 if the device has been busy
 * for the entire window.
 */
uint32_t hl_device_utilization(struct hl_device *hdev, uint32_t period_ms)
{
	struct hl_device_idle_busy_ts *ts;
	ktime_t zero_ktime, curr = ktime_get();
	u32 overlap_cnt = 0, last_index = hdev->idle_busy_ts_idx;
	s64 period_us, last_start_us, last_end_us, last_busy_time_us,
		total_busy_time_us = 0, total_busy_time_ms;

	zero_ktime = ktime_set(0, 0);
	period_us = period_ms * USEC_PER_MSEC;
	ts = &hdev->idle_busy_ts_arr[last_index];

	/* check case that device is currently in idle */
	if (!ktime_compare(ts->busy_to_idle_ts, zero_ktime) &&
			!ktime_compare(ts->idle_to_busy_ts, zero_ktime)) {

		last_index--;
		/* Handle case idle_busy_ts_idx was 0: last_index is u32, so
		 * decrementing 0 wraps far above the array size and is
		 * caught by the comparison below
		 */
		if (last_index > HL_IDLE_BUSY_TS_ARR_SIZE)
			last_index = HL_IDLE_BUSY_TS_ARR_SIZE - 1;

		ts = &hdev->idle_busy_ts_arr[last_index];
	}

	/* Walk at most one full ring of samples, newest to oldest */
	while (overlap_cnt < HL_IDLE_BUSY_TS_ARR_SIZE) {
		/* Check if we are in last sample case. i.e. if the sample
		 * begun before the sampling period. This could be a real
		 * sample or 0 so need to handle both cases
		 */
		last_start_us = ktime_to_us(
				ktime_sub(curr, ts->idle_to_busy_ts));

		if (last_start_us > period_us) {

			/* First check two cases:
			 * 1. If the device is currently busy
			 * 2. If the device was idle during the whole sampling
			 *    period
			 */

			if (!ktime_compare(ts->busy_to_idle_ts, zero_ktime)) {
				/* Check if the device is currently busy */
				if (ktime_compare(ts->idle_to_busy_ts,
						zero_ktime))
					return 100;

				/* We either didn't have any activity or we
				 * reached an entry which is 0. Either way,
				 * exit and return what was accumulated so far
				 */
				break;
			}

			/* If sample has finished, check it is relevant */
			last_end_us = ktime_to_us(
					ktime_sub(curr, ts->busy_to_idle_ts));

			if (last_end_us > period_us)
				break;

			/* It is relevant so add it but with adjustment:
			 * only the part inside the window counts
			 */
			last_busy_time_us = ktime_to_us(
						ktime_sub(ts->busy_to_idle_ts,
						ts->idle_to_busy_ts));
			total_busy_time_us += last_busy_time_us -
					(last_start_us - period_us);
			break;
		}

		/* Check if the sample is finished or still open */
		if (ktime_compare(ts->busy_to_idle_ts, zero_ktime))
			last_busy_time_us = ktime_to_us(
						ktime_sub(ts->busy_to_idle_ts,
						ts->idle_to_busy_ts));
		else
			last_busy_time_us = ktime_to_us(
					ktime_sub(curr, ts->idle_to_busy_ts));

		total_busy_time_us += last_busy_time_us;

		last_index--;
		/* Handle case idle_busy_ts_idx was 0 (u32 wraparound, see
		 * above)
		 */
		if (last_index > HL_IDLE_BUSY_TS_ARR_SIZE)
			last_index = HL_IDLE_BUSY_TS_ARR_SIZE - 1;

		ts = &hdev->idle_busy_ts_arr[last_index];

		overlap_cnt++;
	}

	total_busy_time_ms = DIV_ROUND_UP_ULL(total_busy_time_us,
						USEC_PER_MSEC);

	return DIV_ROUND_UP_ULL(total_busy_time_ms * 100, period_ms);
}
 559 
 560 /*
 561  * hl_device_set_frequency - set the frequency of the device
 562  *
 563  * @hdev: pointer to habanalabs device structure
 564  * @freq: the new frequency value
 565  *
 566  * Change the frequency if needed. This function has no protection against
 567  * concurrency, therefore it is assumed that the calling function has protected
 568  * itself against the case of calling this function from multiple threads with
 569  * different values
 570  *
 571  * Returns 0 if no change was done, otherwise returns 1
 572  */
 573 int hl_device_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq)
 574 {
 575         if ((hdev->pm_mng_profile == PM_MANUAL) ||
 576                         (hdev->curr_pll_profile == freq))
 577                 return 0;
 578 
 579         dev_dbg(hdev->dev, "Changing device frequency to %s\n",
 580                 freq == PLL_HIGH ? "high" : "low");
 581 
 582         hdev->asic_funcs->set_pll_profile(hdev, freq);
 583 
 584         hdev->curr_pll_profile = freq;
 585 
 586         return 1;
 587 }
 588 
 589 int hl_device_set_debug_mode(struct hl_device *hdev, bool enable)
 590 {
 591         int rc = 0;
 592 
 593         mutex_lock(&hdev->debug_lock);
 594 
 595         if (!enable) {
 596                 if (!hdev->in_debug) {
 597                         dev_err(hdev->dev,
 598                                 "Failed to disable debug mode because device was not in debug mode\n");
 599                         rc = -EFAULT;
 600                         goto out;
 601                 }
 602 
 603                 if (!hdev->hard_reset_pending)
 604                         hdev->asic_funcs->halt_coresight(hdev);
 605 
 606                 hdev->in_debug = 0;
 607 
 608                 goto out;
 609         }
 610 
 611         if (hdev->in_debug) {
 612                 dev_err(hdev->dev,
 613                         "Failed to enable debug mode because device is already in debug mode\n");
 614                 rc = -EFAULT;
 615                 goto out;
 616         }
 617 
 618         hdev->in_debug = 1;
 619 
 620 out:
 621         mutex_unlock(&hdev->debug_lock);
 622 
 623         return rc;
 624 }
 625 
 626 /*
 627  * hl_device_suspend - initiate device suspend
 628  *
 629  * @hdev: pointer to habanalabs device structure
 630  *
 631  * Puts the hw in the suspend state (all asics).
 632  * Returns 0 for success or an error on failure.
 633  * Called at driver suspend.
 634  */
int hl_device_suspend(struct hl_device *hdev)
{
	int rc;

	pci_save_state(hdev->pdev);

	/* Block future CS/VM/JOB completion operations */
	rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
	if (rc) {
		dev_err(hdev->dev, "Can't suspend while in reset\n");
		return -EIO;
	}

	/* This blocks all other stuff that is not blocked by in_reset */
	hdev->disabled = true;

	/*
	 * Flush anyone that is inside the critical section of enqueue
	 * jobs to the H/W
	 */
	hdev->asic_funcs->hw_queues_lock(hdev);
	hdev->asic_funcs->hw_queues_unlock(hdev);

	/* Flush processes that are sending message to CPU */
	mutex_lock(&hdev->send_cpu_message_lock);
	mutex_unlock(&hdev->send_cpu_message_lock);

	/* Best effort: the suspend continues even if the ASIC callback
	 * fails, so only log the error
	 */
	rc = hdev->asic_funcs->suspend(hdev);
	if (rc)
		dev_err(hdev->dev,
			"Failed to disable PCI access of device CPU\n");

	/* Shut down the device */
	pci_disable_device(hdev->pdev);
	pci_set_power_state(hdev->pdev, PCI_D3hot);

	return 0;
}
 673 
 674 /*
 675  * hl_device_resume - initiate device resume
 676  *
 677  * @hdev: pointer to habanalabs device structure
 678  *
 679  * Bring the hw back to operating state (all asics).
 680  * Returns 0 for success or an error on failure.
 681  * Called at driver resume.
 682  */
int hl_device_resume(struct hl_device *hdev)
{
	int rc;

	/* Bring the PCI function back to D0 and restore its config space */
	pci_set_power_state(hdev->pdev, PCI_D0);
	pci_restore_state(hdev->pdev);
	rc = pci_enable_device_mem(hdev->pdev);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to enable PCI device in resume\n");
		return rc;
	}

	pci_set_master(hdev->pdev);

	rc = hdev->asic_funcs->resume(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to resume device after suspend\n");
		goto disable_device;
	}


	/* Clear the suspend-time blocking state set in hl_device_suspend()
	 * before kicking off the reset
	 */
	hdev->disabled = false;
	atomic_set(&hdev->in_reset, 0);

	/* A hard reset re-initializes the H/W after the power transition */
	rc = hl_device_reset(hdev, true, false);
	if (rc) {
		dev_err(hdev->dev, "Failed to reset device during resume\n");
		goto disable_device;
	}

	return 0;

disable_device:
	pci_clear_master(hdev->pdev);
	pci_disable_device(hdev->pdev);

	return rc;
}
 722 
/*
 * device_kill_open_processes - force-close all user processes of the device
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Sends SIGKILL to every process that still has the device open, then
 * waits (bounded by pending_total seconds) for their cleanup to finish
 * before a hard reset proceeds. Logs a critical message if contexts are
 * still open when the wait expires.
 */
static void device_kill_open_processes(struct hl_device *hdev)
{
	u16 pending_total, pending_cnt;
	struct hl_fpriv *hpriv;
	struct task_struct *task = NULL;

	/* Simulation platforms are slower — allow a longer grace period */
	if (hdev->pldm)
		pending_total = HL_PLDM_PENDING_RESET_PER_SEC;
	else
		pending_total = HL_PENDING_RESET_PER_SEC;

	/* Giving time for user to close FD, and for processes that are inside
	 * hl_device_open to finish
	 */
	if (!list_empty(&hdev->fpriv_list))
		ssleep(1);

	mutex_lock(&hdev->fpriv_list_lock);

	/* This section must be protected because we are dereferencing
	 * pointers that are freed if the process exits
	 */
	list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node) {
		task = get_pid_task(hpriv->taskpid, PIDTYPE_PID);
		if (task) {
			dev_info(hdev->dev, "Killing user process pid=%d\n",
				task_pid_nr(task));
			send_sig(SIGKILL, task, 1);
			usleep_range(1000, 10000);

			put_task_struct(task);
		}
	}

	mutex_unlock(&hdev->fpriv_list_lock);

	/* We killed the open users, but because the driver cleans up after the
	 * user contexts are closed (e.g. mmu mappings), we need to wait again
	 * to make sure the cleaning phase is finished before continuing with
	 * the reset
	 */

	pending_cnt = pending_total;

	while ((!list_empty(&hdev->fpriv_list)) && (pending_cnt)) {
		dev_info(hdev->dev,
			"Waiting for all unmap operations to finish before hard reset\n");

		pending_cnt--;

		ssleep(1);
	}

	if (!list_empty(&hdev->fpriv_list))
		dev_crit(hdev->dev,
			"Going to hard reset with open user contexts\n");
}
 780 
 781 static void device_hard_reset_pending(struct work_struct *work)
 782 {
 783         struct hl_device_reset_work *device_reset_work =
 784                 container_of(work, struct hl_device_reset_work, reset_work);
 785         struct hl_device *hdev = device_reset_work->hdev;
 786 
 787         hl_device_reset(hdev, true, true);
 788 
 789         kfree(device_reset_work);
 790 }
 791 
 792 /*
 793  * hl_device_reset - reset the device
 794  *
 795  * @hdev: pointer to habanalabs device structure
 796  * @hard_reset: should we do hard reset to all engines or just reset the
 797  *              compute/dma engines
 798  *
 799  * Block future CS and wait for pending CS to be enqueued
 800  * Call ASIC H/W fini
 801  * Flush all completions
 802  * Re-initialize all internal data structures
 803  * Call ASIC H/W init, late_init
 804  * Test queues
 805  * Enable device
 806  *
 807  * Returns 0 for success or an error on failure.
 808  */
 809 int hl_device_reset(struct hl_device *hdev, bool hard_reset,
 810                         bool from_hard_reset_thread)
 811 {
 812         int i, rc;
 813 
 814         if (!hdev->init_done) {
 815                 dev_err(hdev->dev,
 816                         "Can't reset before initialization is done\n");
 817                 return 0;
 818         }
 819 
 820         /*
 821          * Prevent concurrency in this function - only one reset should be
 822          * done at any given time. Only need to perform this if we didn't
 823          * get from the dedicated hard reset thread
 824          */
 825         if (!from_hard_reset_thread) {
 826                 /* Block future CS/VM/JOB completion operations */
 827                 rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
 828                 if (rc)
 829                         return 0;
 830 
 831                 /* This also blocks future CS/VM/JOB completion operations */
 832                 hdev->disabled = true;
 833 
 834                 /* Flush anyone that is inside the critical section of enqueue
 835                  * jobs to the H/W
 836                  */
 837                 hdev->asic_funcs->hw_queues_lock(hdev);
 838                 hdev->asic_funcs->hw_queues_unlock(hdev);
 839 
 840                 /* Flush anyone that is inside device open */
 841                 mutex_lock(&hdev->fpriv_list_lock);
 842                 mutex_unlock(&hdev->fpriv_list_lock);
 843 
 844                 dev_err(hdev->dev, "Going to RESET device!\n");
 845         }
 846 
 847 again:
 848         if ((hard_reset) && (!from_hard_reset_thread)) {
 849                 struct hl_device_reset_work *device_reset_work;
 850 
 851                 hdev->hard_reset_pending = true;
 852 
 853                 device_reset_work = kzalloc(sizeof(*device_reset_work),
 854                                                 GFP_ATOMIC);
 855                 if (!device_reset_work) {
 856                         rc = -ENOMEM;
 857                         goto out_err;
 858                 }
 859 
 860                 /*
 861                  * Because the reset function can't run from interrupt or
 862                  * from heartbeat work, we need to call the reset function
 863                  * from a dedicated work
 864                  */
 865                 INIT_WORK(&device_reset_work->reset_work,
 866                                 device_hard_reset_pending);
 867                 device_reset_work->hdev = hdev;
 868                 schedule_work(&device_reset_work->reset_work);
 869 
 870                 return 0;
 871         }
 872 
 873         if (hard_reset) {
 874                 device_late_fini(hdev);
 875 
 876                 /*
 877                  * Now that the heartbeat thread is closed, flush processes
 878                  * which are sending messages to CPU
 879                  */
 880                 mutex_lock(&hdev->send_cpu_message_lock);
 881                 mutex_unlock(&hdev->send_cpu_message_lock);
 882         }
 883 
 884         /*
 885          * Halt the engines and disable interrupts so we won't get any more
 886          * completions from H/W and we won't have any accesses from the
 887          * H/W to the host machine
 888          */
 889         hdev->asic_funcs->halt_engines(hdev, hard_reset);
 890 
 891         /* Go over all the queues, release all CS and their jobs */
 892         hl_cs_rollback_all(hdev);
 893 
 894         /* Kill processes here after CS rollback. This is because the process
 895          * can't really exit until all its CSs are done, which is what we
 896          * do in cs rollback
 897          */
 898         if (from_hard_reset_thread)
 899                 device_kill_open_processes(hdev);
 900 
 901         /* Release kernel context */
 902         if ((hard_reset) && (hl_ctx_put(hdev->kernel_ctx) == 1))
 903                 hdev->kernel_ctx = NULL;
 904 
 905         /* Reset the H/W. It will be in idle state after this returns */
 906         hdev->asic_funcs->hw_fini(hdev, hard_reset);
 907 
 908         if (hard_reset) {
 909                 hl_vm_fini(hdev);
 910                 hl_mmu_fini(hdev);
 911                 hl_eq_reset(hdev, &hdev->event_queue);
 912         }
 913 
 914         /* Re-initialize PI,CI to 0 in all queues (hw queue, cq) */
 915         hl_hw_queue_reset(hdev, hard_reset);
 916         for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
 917                 hl_cq_reset(hdev, &hdev->completion_queue[i]);
 918 
 919         hdev->idle_busy_ts_idx = 0;
 920         hdev->idle_busy_ts_arr[0].busy_to_idle_ts = ktime_set(0, 0);
 921         hdev->idle_busy_ts_arr[0].idle_to_busy_ts = ktime_set(0, 0);
 922 
 923         if (hdev->cs_active_cnt)
 924                 dev_crit(hdev->dev, "CS active cnt %d is not 0 during reset\n",
 925                         hdev->cs_active_cnt);
 926 
 927         mutex_lock(&hdev->fpriv_list_lock);
 928 
 929         /* Make sure the context switch phase will run again */
 930         if (hdev->compute_ctx) {
 931                 atomic_set(&hdev->compute_ctx->thread_ctx_switch_token, 1);
 932                 hdev->compute_ctx->thread_ctx_switch_wait_token = 0;
 933         }
 934 
 935         mutex_unlock(&hdev->fpriv_list_lock);
 936 
 937         /* Finished tear-down, starting to re-initialize */
 938 
 939         if (hard_reset) {
 940                 hdev->device_cpu_disabled = false;
 941                 hdev->hard_reset_pending = false;
 942 
 943                 if (hdev->kernel_ctx) {
 944                         dev_crit(hdev->dev,
 945                                 "kernel ctx was alive during hard reset, something is terribly wrong\n");
 946                         rc = -EBUSY;
 947                         goto out_err;
 948                 }
 949 
 950                 rc = hl_mmu_init(hdev);
 951                 if (rc) {
 952                         dev_err(hdev->dev,
 953                                 "Failed to initialize MMU S/W after hard reset\n");
 954                         goto out_err;
 955                 }
 956 
 957                 /* Allocate the kernel context */
 958                 hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx),
 959                                                 GFP_KERNEL);
 960                 if (!hdev->kernel_ctx) {
 961                         rc = -ENOMEM;
 962                         goto out_err;
 963                 }
 964 
 965                 hdev->compute_ctx = NULL;
 966 
 967                 rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
 968                 if (rc) {
 969                         dev_err(hdev->dev,
 970                                 "failed to init kernel ctx in hard reset\n");
 971                         kfree(hdev->kernel_ctx);
 972                         hdev->kernel_ctx = NULL;
 973                         goto out_err;
 974                 }
 975         }
 976 
 977         rc = hdev->asic_funcs->hw_init(hdev);
 978         if (rc) {
 979                 dev_err(hdev->dev,
 980                         "failed to initialize the H/W after reset\n");
 981                 goto out_err;
 982         }
 983 
 984         hdev->disabled = false;
 985 
 986         /* Check that the communication with the device is working */
 987         rc = hdev->asic_funcs->test_queues(hdev);
 988         if (rc) {
 989                 dev_err(hdev->dev,
 990                         "Failed to detect if device is alive after reset\n");
 991                 goto out_err;
 992         }
 993 
 994         if (hard_reset) {
 995                 rc = device_late_init(hdev);
 996                 if (rc) {
 997                         dev_err(hdev->dev,
 998                                 "Failed late init after hard reset\n");
 999                         goto out_err;
1000                 }
1001 
1002                 rc = hl_vm_init(hdev);
1003                 if (rc) {
1004                         dev_err(hdev->dev,
1005                                 "Failed to init memory module after hard reset\n");
1006                         goto out_err;
1007                 }
1008 
1009                 hl_set_max_power(hdev, hdev->max_power);
1010         } else {
1011                 rc = hdev->asic_funcs->soft_reset_late_init(hdev);
1012                 if (rc) {
1013                         dev_err(hdev->dev,
1014                                 "Failed late init after soft reset\n");
1015                         goto out_err;
1016                 }
1017         }
1018 
1019         atomic_set(&hdev->in_reset, 0);
1020 
1021         if (hard_reset)
1022                 hdev->hard_reset_cnt++;
1023         else
1024                 hdev->soft_reset_cnt++;
1025 
1026         dev_warn(hdev->dev, "Successfully finished resetting the device\n");
1027 
1028         return 0;
1029 
1030 out_err:
1031         hdev->disabled = true;
1032 
1033         if (hard_reset) {
1034                 dev_err(hdev->dev,
1035                         "Failed to reset! Device is NOT usable\n");
1036                 hdev->hard_reset_cnt++;
1037         } else {
1038                 dev_err(hdev->dev,
1039                         "Failed to do soft-reset, trying hard reset\n");
1040                 hdev->soft_reset_cnt++;
1041                 hard_reset = true;
1042                 goto again;
1043         }
1044 
1045         atomic_set(&hdev->in_reset, 0);
1046 
1047         return rc;
1048 }
1049 
1050 /*
1051  * hl_device_init - main initialization function for habanalabs device
1052  *
1053  * @hdev: pointer to habanalabs device structure
1054  *
1055  * Allocate an id for the device, do early initialization and then call the
1056  * ASIC specific initialization functions. Finally, create the cdev and the
1057  * Linux device to expose it to the user
1058  */
1059 int hl_device_init(struct hl_device *hdev, struct class *hclass)
1060 {
1061         int i, rc, cq_ready_cnt;
1062         char *name;
1063         bool add_cdev_sysfs_on_err = false;
1064 
1065         name = kasprintf(GFP_KERNEL, "hl%d", hdev->id / 2);
1066         if (!name) {
1067                 rc = -ENOMEM;
1068                 goto out_disabled;
1069         }
1070 
1071         /* Initialize cdev and device structures */
1072         rc = device_init_cdev(hdev, hclass, hdev->id, &hl_ops, name,
1073                                 &hdev->cdev, &hdev->dev);
1074 
1075         kfree(name);
1076 
1077         if (rc)
1078                 goto out_disabled;
1079 
1080         name = kasprintf(GFP_KERNEL, "hl_controlD%d", hdev->id / 2);
1081         if (!name) {
1082                 rc = -ENOMEM;
1083                 goto free_dev;
1084         }
1085 
1086         /* Initialize cdev and device structures for control device */
1087         rc = device_init_cdev(hdev, hclass, hdev->id_control, &hl_ctrl_ops,
1088                                 name, &hdev->cdev_ctrl, &hdev->dev_ctrl);
1089 
1090         kfree(name);
1091 
1092         if (rc)
1093                 goto free_dev;
1094 
1095         /* Initialize ASIC function pointers and perform early init */
1096         rc = device_early_init(hdev);
1097         if (rc)
1098                 goto free_dev_ctrl;
1099 
1100         /*
1101          * Start calling ASIC initialization. First S/W then H/W and finally
1102          * late init
1103          */
1104         rc = hdev->asic_funcs->sw_init(hdev);
1105         if (rc)
1106                 goto early_fini;
1107 
1108         /*
1109          * Initialize the H/W queues. Must be done before hw_init, because
1110          * there the addresses of the kernel queue are being written to the
1111          * registers of the device
1112          */
1113         rc = hl_hw_queues_create(hdev);
1114         if (rc) {
1115                 dev_err(hdev->dev, "failed to initialize kernel queues\n");
1116                 goto sw_fini;
1117         }
1118 
1119         /*
1120          * Initialize the completion queues. Must be done before hw_init,
1121          * because there the addresses of the completion queues are being
1122          * passed as arguments to request_irq
1123          */
1124         hdev->completion_queue =
1125                         kcalloc(hdev->asic_prop.completion_queues_count,
1126                                 sizeof(*hdev->completion_queue), GFP_KERNEL);
1127 
1128         if (!hdev->completion_queue) {
1129                 dev_err(hdev->dev, "failed to allocate completion queues\n");
1130                 rc = -ENOMEM;
1131                 goto hw_queues_destroy;
1132         }
1133 
1134         for (i = 0, cq_ready_cnt = 0;
1135                         i < hdev->asic_prop.completion_queues_count;
1136                         i++, cq_ready_cnt++) {
1137                 rc = hl_cq_init(hdev, &hdev->completion_queue[i], i);
1138                 if (rc) {
1139                         dev_err(hdev->dev,
1140                                 "failed to initialize completion queue\n");
1141                         goto cq_fini;
1142                 }
1143         }
1144 
1145         /*
1146          * Initialize the event queue. Must be done before hw_init,
1147          * because there the address of the event queue is being
1148          * passed as argument to request_irq
1149          */
1150         rc = hl_eq_init(hdev, &hdev->event_queue);
1151         if (rc) {
1152                 dev_err(hdev->dev, "failed to initialize event queue\n");
1153                 goto cq_fini;
1154         }
1155 
1156         /* MMU S/W must be initialized before kernel context is created */
1157         rc = hl_mmu_init(hdev);
1158         if (rc) {
1159                 dev_err(hdev->dev, "Failed to initialize MMU S/W structures\n");
1160                 goto eq_fini;
1161         }
1162 
1163         /* Allocate the kernel context */
1164         hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL);
1165         if (!hdev->kernel_ctx) {
1166                 rc = -ENOMEM;
1167                 goto mmu_fini;
1168         }
1169 
1170         hdev->compute_ctx = NULL;
1171 
1172         rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
1173         if (rc) {
1174                 dev_err(hdev->dev, "failed to initialize kernel context\n");
1175                 kfree(hdev->kernel_ctx);
1176                 goto mmu_fini;
1177         }
1178 
1179         rc = hl_cb_pool_init(hdev);
1180         if (rc) {
1181                 dev_err(hdev->dev, "failed to initialize CB pool\n");
1182                 goto release_ctx;
1183         }
1184 
1185         hl_debugfs_add_device(hdev);
1186 
1187         if (hdev->asic_funcs->get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
1188                 dev_info(hdev->dev,
1189                         "H/W state is dirty, must reset before initializing\n");
1190                 hdev->asic_funcs->halt_engines(hdev, true);
1191                 hdev->asic_funcs->hw_fini(hdev, true);
1192         }
1193 
1194         /*
1195          * From this point, in case of an error, add char devices and create
1196          * sysfs nodes as part of the error flow, to allow debugging.
1197          */
1198         add_cdev_sysfs_on_err = true;
1199 
1200         rc = hdev->asic_funcs->hw_init(hdev);
1201         if (rc) {
1202                 dev_err(hdev->dev, "failed to initialize the H/W\n");
1203                 rc = 0;
1204                 goto out_disabled;
1205         }
1206 
1207         hdev->disabled = false;
1208 
1209         /* Check that the communication with the device is working */
1210         rc = hdev->asic_funcs->test_queues(hdev);
1211         if (rc) {
1212                 dev_err(hdev->dev, "Failed to detect if device is alive\n");
1213                 rc = 0;
1214                 goto out_disabled;
1215         }
1216 
1217         rc = device_late_init(hdev);
1218         if (rc) {
1219                 dev_err(hdev->dev, "Failed late initialization\n");
1220                 rc = 0;
1221                 goto out_disabled;
1222         }
1223 
1224         dev_info(hdev->dev, "Found %s device with %lluGB DRAM\n",
1225                 hdev->asic_name,
1226                 hdev->asic_prop.dram_size / 1024 / 1024 / 1024);
1227 
1228         rc = hl_vm_init(hdev);
1229         if (rc) {
1230                 dev_err(hdev->dev, "Failed to initialize memory module\n");
1231                 rc = 0;
1232                 goto out_disabled;
1233         }
1234 
1235         /*
1236          * Expose devices and sysfs nodes to user.
1237          * From here there is no need to add char devices and create sysfs nodes
1238          * in case of an error.
1239          */
1240         add_cdev_sysfs_on_err = false;
1241         rc = device_cdev_sysfs_add(hdev);
1242         if (rc) {
1243                 dev_err(hdev->dev,
1244                         "Failed to add char devices and sysfs nodes\n");
1245                 rc = 0;
1246                 goto out_disabled;
1247         }
1248 
1249         /*
1250          * hl_hwmon_init() must be called after device_late_init(), because only
1251          * there we get the information from the device about which
1252          * hwmon-related sensors the device supports.
1253          * Furthermore, it must be done after adding the device to the system.
1254          */
1255         rc = hl_hwmon_init(hdev);
1256         if (rc) {
1257                 dev_err(hdev->dev, "Failed to initialize hwmon\n");
1258                 rc = 0;
1259                 goto out_disabled;
1260         }
1261 
1262         dev_notice(hdev->dev,
1263                 "Successfully added device to habanalabs driver\n");
1264 
1265         hdev->init_done = true;
1266 
1267         return 0;
1268 
1269 release_ctx:
1270         if (hl_ctx_put(hdev->kernel_ctx) != 1)
1271                 dev_err(hdev->dev,
1272                         "kernel ctx is still alive on initialization failure\n");
1273 mmu_fini:
1274         hl_mmu_fini(hdev);
1275 eq_fini:
1276         hl_eq_fini(hdev, &hdev->event_queue);
1277 cq_fini:
1278         for (i = 0 ; i < cq_ready_cnt ; i++)
1279                 hl_cq_fini(hdev, &hdev->completion_queue[i]);
1280         kfree(hdev->completion_queue);
1281 hw_queues_destroy:
1282         hl_hw_queues_destroy(hdev);
1283 sw_fini:
1284         hdev->asic_funcs->sw_fini(hdev);
1285 early_fini:
1286         device_early_fini(hdev);
1287 free_dev_ctrl:
1288         kfree(hdev->dev_ctrl);
1289 free_dev:
1290         kfree(hdev->dev);
1291 out_disabled:
1292         hdev->disabled = true;
1293         if (add_cdev_sysfs_on_err)
1294                 device_cdev_sysfs_add(hdev);
1295         if (hdev->pdev)
1296                 dev_err(&hdev->pdev->dev,
1297                         "Failed to initialize hl%d. Device is NOT usable !\n",
1298                         hdev->id / 2);
1299         else
1300                 pr_err("Failed to initialize hl%d. Device is NOT usable !\n",
1301                         hdev->id / 2);
1302 
1303         return rc;
1304 }
1305 
1306 /*
1307  * hl_device_fini - main tear-down function for habanalabs device
1308  *
1309  * @hdev: pointer to habanalabs device structure
1310  *
1311  * Destroy the device, call ASIC fini functions and release the id
1312  */
void hl_device_fini(struct hl_device *hdev)
{
	int i, rc;
	ktime_t timeout;

	dev_info(hdev->dev, "Removing device\n");

	/*
	 * This function is competing with the reset function, so try to
	 * take the reset atomic and if we are already in middle of reset,
	 * wait until reset function is finished. Reset function is designed
	 * to always finish (could take up to a few seconds in worst case).
	 */

	/* Deadline for the polling loop below: 4x HL_PENDING_RESET_PER_SEC
	 * seconds, expressed in microseconds for ktime_add_us()
	 */
	timeout = ktime_add_us(ktime_get(),
				HL_PENDING_RESET_PER_SEC * 1000 * 1000 * 4);
	rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
	while (rc) {
		/* A concurrent reset holds in_reset; poll until it drops
		 * back to 0 or we hit the deadline
		 */
		usleep_range(50, 200);
		rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
		if (ktime_compare(ktime_get(), timeout) > 0) {
			WARN(1, "Failed to remove device because reset function did not finish\n");
			return;
		}
	}

	/* Mark device as disabled */
	hdev->disabled = true;

	/* Flush anyone that is inside the critical section of enqueue
	 * jobs to the H/W
	 */
	hdev->asic_funcs->hw_queues_lock(hdev);
	hdev->asic_funcs->hw_queues_unlock(hdev);

	/* Flush anyone that is inside device open */
	mutex_lock(&hdev->fpriv_list_lock);
	mutex_unlock(&hdev->fpriv_list_lock);

	/* NOTE(review): presumably this keeps new hard-reset requests and
	 * device opens from racing the teardown below — confirm against the
	 * users of hard_reset_pending elsewhere in the driver
	 */
	hdev->hard_reset_pending = true;

	hl_hwmon_fini(hdev);

	/* Stops the heartbeat thread among other late-init resources */
	device_late_fini(hdev);

	hl_debugfs_remove_device(hdev);

	/*
	 * Halt the engines and disable interrupts so we won't get any more
	 * completions from H/W and we won't have any accesses from the
	 * H/W to the host machine
	 */
	hdev->asic_funcs->halt_engines(hdev, true);

	/* Go over all the queues, release all CS and their jobs */
	hl_cs_rollback_all(hdev);

	/* Kill processes here after CS rollback. This is because the process
	 * can't really exit until all its CSs are done, which is what we
	 * do in cs rollback
	 */
	device_kill_open_processes(hdev);

	hl_cb_pool_fini(hdev);

	/* Release kernel context */
	if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1))
		dev_err(hdev->dev, "kernel ctx is still alive\n");

	/* Reset the H/W. It will be in idle state after this returns */
	hdev->asic_funcs->hw_fini(hdev, true);

	hl_vm_fini(hdev);

	hl_mmu_fini(hdev);

	hl_eq_fini(hdev, &hdev->event_queue);

	/* Tear down all completion queues allocated in hl_device_init() */
	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
		hl_cq_fini(hdev, &hdev->completion_queue[i]);
	kfree(hdev->completion_queue);

	hl_hw_queues_destroy(hdev);

	/* Call ASIC S/W finalize function */
	hdev->asic_funcs->sw_fini(hdev);

	device_early_fini(hdev);

	/* Hide devices and sysfs nodes from user */
	device_cdev_sysfs_del(hdev);

	pr_info("removed device successfully\n");
}
1407 
1408 /*
1409  * MMIO register access helper functions.
1410  */
1411 
1412 /*
1413  * hl_rreg - Read an MMIO register
1414  *
1415  * @hdev: pointer to habanalabs device structure
1416  * @reg: MMIO register offset (in bytes)
1417  *
1418  * Returns the value of the MMIO register we are asked to read
1419  *
1420  */
1421 inline u32 hl_rreg(struct hl_device *hdev, u32 reg)
1422 {
1423         return readl(hdev->rmmio + reg);
1424 }
1425 
1426 /*
1427  * hl_wreg - Write to an MMIO register
1428  *
1429  * @hdev: pointer to habanalabs device structure
1430  * @reg: MMIO register offset (in bytes)
1431  * @val: 32-bit value
1432  *
1433  * Writes the 32-bit value into the MMIO register
1434  *
1435  */
1436 inline void hl_wreg(struct hl_device *hdev, u32 reg, u32 val)
1437 {
1438         writel(val, hdev->rmmio + reg);
1439 }

/* [<][>][^][v][top][bottom][index][help] */