root/drivers/misc/genwqe/card_base.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. genwqe_dev_alloc
  2. genwqe_dev_free
  3. genwqe_bus_reset
  4. genwqe_need_err_masking
  5. genwqe_tweak_hardware
  6. genwqe_recovery_on_fatal_gfir_required
  7. genwqe_flash_readback_fails
  8. genwqe_T_psec
  9. genwqe_setup_pf_jtimer
  10. genwqe_setup_vf_jtimer
  11. genwqe_ffdc_buffs_alloc
  12. genwqe_ffdc_buffs_free
  13. genwqe_read_ids
  14. genwqe_start
  15. genwqe_stop
  16. genwqe_recover_card
  17. genwqe_health_check_cond
  18. genwqe_fir_checking
  19. genwqe_pci_fundamental_reset
  20. genwqe_platform_recovery
  21. genwqe_reload_bistream
  22. genwqe_health_thread
  23. genwqe_health_check_start
  24. genwqe_health_thread_running
  25. genwqe_health_check_stop
  26. genwqe_pci_setup
  27. genwqe_pci_remove
  28. genwqe_probe
  29. genwqe_remove
  30. genwqe_err_error_detected
  31. genwqe_err_slot_reset
  32. genwqe_err_result_none
  33. genwqe_err_resume
  34. genwqe_sriov_configure
  35. genwqe_devnode
  36. genwqe_init_module
  37. genwqe_exit_module

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /**
   3  * IBM Accelerator Family 'GenWQE'
   4  *
   5  * (C) Copyright IBM Corp. 2013
   6  *
   7  * Author: Frank Haverkamp <haver@linux.vnet.ibm.com>
   8  * Author: Joerg-Stephan Vogt <jsvogt@de.ibm.com>
   9  * Author: Michael Jung <mijung@gmx.net>
  10  * Author: Michael Ruettger <michael@ibmra.de>
  11  */
  12 
  13 /*
  14  * Module initialization and PCIe setup. Card health monitoring and
  15  * recovery functionality. Character device creation and deletion are
  16  * controlled from here.
  17  */
  18 
  19 #include <linux/types.h>
  20 #include <linux/pci.h>
  21 #include <linux/err.h>
  22 #include <linux/aer.h>
  23 #include <linux/string.h>
  24 #include <linux/sched.h>
  25 #include <linux/wait.h>
  26 #include <linux/delay.h>
  27 #include <linux/dma-mapping.h>
  28 #include <linux/module.h>
  29 #include <linux/notifier.h>
  30 #include <linux/device.h>
  31 #include <linux/log2.h>
  32 
  33 #include "card_base.h"
  34 #include "card_ddcb.h"
  35 
  36 MODULE_AUTHOR("Frank Haverkamp <haver@linux.vnet.ibm.com>");
  37 MODULE_AUTHOR("Michael Ruettger <michael@ibmra.de>");
  38 MODULE_AUTHOR("Joerg-Stephan Vogt <jsvogt@de.ibm.com>");
  39 MODULE_AUTHOR("Michael Jung <mijung@gmx.net>");
  40 
  41 MODULE_DESCRIPTION("GenWQE Card");
  42 MODULE_VERSION(DRV_VERSION);
  43 MODULE_LICENSE("GPL");
  44 
  45 static char genwqe_driver_name[] = GENWQE_DEVNAME;
  46 static struct class *class_genwqe;
  47 static struct dentry *debugfs_genwqe;
  48 static struct genwqe_dev *genwqe_devices[GENWQE_CARD_NO_MAX];
  49 
  50 /* PCI structure for identifying device by PCI vendor and device ID */
  51 static const struct pci_device_id genwqe_device_table[] = {
  52         { .vendor      = PCI_VENDOR_ID_IBM,
  53           .device      = PCI_DEVICE_GENWQE,
  54           .subvendor   = PCI_SUBVENDOR_ID_IBM,
  55           .subdevice   = PCI_SUBSYSTEM_ID_GENWQE5,
  56           .class       = (PCI_CLASSCODE_GENWQE5 << 8),
  57           .class_mask  = ~0,
  58           .driver_data = 0 },
  59 
  60         /* Initial SR-IOV bring-up image */
  61         { .vendor      = PCI_VENDOR_ID_IBM,
  62           .device      = PCI_DEVICE_GENWQE,
  63           .subvendor   = PCI_SUBVENDOR_ID_IBM_SRIOV,
  64           .subdevice   = PCI_SUBSYSTEM_ID_GENWQE5_SRIOV,
  65           .class       = (PCI_CLASSCODE_GENWQE5_SRIOV << 8),
  66           .class_mask  = ~0,
  67           .driver_data = 0 },
  68 
  69         { .vendor      = PCI_VENDOR_ID_IBM,  /* VF Vendor ID */
  70           .device      = 0x0000,  /* VF Device ID */
  71           .subvendor   = PCI_SUBVENDOR_ID_IBM_SRIOV,
  72           .subdevice   = PCI_SUBSYSTEM_ID_GENWQE5_SRIOV,
  73           .class       = (PCI_CLASSCODE_GENWQE5_SRIOV << 8),
  74           .class_mask  = ~0,
  75           .driver_data = 0 },
  76 
  77         /* Fixed up image */
  78         { .vendor      = PCI_VENDOR_ID_IBM,
  79           .device      = PCI_DEVICE_GENWQE,
  80           .subvendor   = PCI_SUBVENDOR_ID_IBM_SRIOV,
  81           .subdevice   = PCI_SUBSYSTEM_ID_GENWQE5,
  82           .class       = (PCI_CLASSCODE_GENWQE5_SRIOV << 8),
  83           .class_mask  = ~0,
  84           .driver_data = 0 },
  85 
  86         { .vendor      = PCI_VENDOR_ID_IBM,  /* VF Vendor ID */
  87           .device      = 0x0000,  /* VF Device ID */
  88           .subvendor   = PCI_SUBVENDOR_ID_IBM_SRIOV,
  89           .subdevice   = PCI_SUBSYSTEM_ID_GENWQE5,
  90           .class       = (PCI_CLASSCODE_GENWQE5_SRIOV << 8),
  91           .class_mask  = ~0,
  92           .driver_data = 0 },
  93 
  94         /* Even one more ... */
  95         { .vendor      = PCI_VENDOR_ID_IBM,
  96           .device      = PCI_DEVICE_GENWQE,
  97           .subvendor   = PCI_SUBVENDOR_ID_IBM,
  98           .subdevice   = PCI_SUBSYSTEM_ID_GENWQE5_NEW,
  99           .class       = (PCI_CLASSCODE_GENWQE5 << 8),
 100           .class_mask  = ~0,
 101           .driver_data = 0 },
 102 
 103         { 0, }                  /* 0 terminated list. */
 104 };
 105 
 106 MODULE_DEVICE_TABLE(pci, genwqe_device_table);
 107 
 108 /**
 109  * genwqe_dev_alloc() - Create and prepare a new card descriptor
 110  *
 111  * Return: Pointer to card descriptor, or ERR_PTR(err) on error
 112  */
 113 static struct genwqe_dev *genwqe_dev_alloc(void)
 114 {
 115         unsigned int i = 0, j;
 116         struct genwqe_dev *cd;
 117 
 118         for (i = 0; i < GENWQE_CARD_NO_MAX; i++) {
 119                 if (genwqe_devices[i] == NULL)
 120                         break;
 121         }
 122         if (i >= GENWQE_CARD_NO_MAX)
 123                 return ERR_PTR(-ENODEV);
 124 
 125         cd = kzalloc(sizeof(struct genwqe_dev), GFP_KERNEL);
 126         if (!cd)
 127                 return ERR_PTR(-ENOMEM);
 128 
 129         cd->card_idx = i;
 130         cd->class_genwqe = class_genwqe;
 131         cd->debugfs_genwqe = debugfs_genwqe;
 132 
 133         /*
 134          * This comes from kernel config option and can be overritten via
 135          * debugfs.
 136          */
 137         cd->use_platform_recovery = CONFIG_GENWQE_PLATFORM_ERROR_RECOVERY;
 138 
 139         init_waitqueue_head(&cd->queue_waitq);
 140 
 141         spin_lock_init(&cd->file_lock);
 142         INIT_LIST_HEAD(&cd->file_list);
 143 
 144         cd->card_state = GENWQE_CARD_UNUSED;
 145         spin_lock_init(&cd->print_lock);
 146 
 147         cd->ddcb_software_timeout = GENWQE_DDCB_SOFTWARE_TIMEOUT;
 148         cd->kill_timeout = GENWQE_KILL_TIMEOUT;
 149 
 150         for (j = 0; j < GENWQE_MAX_VFS; j++)
 151                 cd->vf_jobtimeout_msec[j] = GENWQE_VF_JOBTIMEOUT_MSEC;
 152 
 153         genwqe_devices[i] = cd;
 154         return cd;
 155 }
 156 
 157 static void genwqe_dev_free(struct genwqe_dev *cd)
 158 {
 159         if (!cd)
 160                 return;
 161 
 162         genwqe_devices[cd->card_idx] = NULL;
 163         kfree(cd);
 164 }
 165 
 166 /**
 167  * genwqe_bus_reset() - Card recovery
 168  *
 169  * pci_reset_function() will recover the device and ensure that the
 170  * registers are accessible again when it completes with success. If
 171  * not, the card will stay dead and registers will be unaccessible
 172  * still.
 173  */
 174 static int genwqe_bus_reset(struct genwqe_dev *cd)
 175 {
 176         int rc = 0;
 177         struct pci_dev *pci_dev = cd->pci_dev;
 178         void __iomem *mmio;
 179 
 180         if (cd->err_inject & GENWQE_INJECT_BUS_RESET_FAILURE)
 181                 return -EIO;
 182 
 183         mmio = cd->mmio;
 184         cd->mmio = NULL;
 185         pci_iounmap(pci_dev, mmio);
 186 
 187         pci_release_mem_regions(pci_dev);
 188 
 189         /*
 190          * Firmware/BIOS might change memory mapping during bus reset.
 191          * Settings like enable bus-mastering, ... are backuped and
 192          * restored by the pci_reset_function().
 193          */
 194         dev_dbg(&pci_dev->dev, "[%s] pci_reset function ...\n", __func__);
 195         rc = pci_reset_function(pci_dev);
 196         if (rc) {
 197                 dev_err(&pci_dev->dev,
 198                         "[%s] err: failed reset func (rc %d)\n", __func__, rc);
 199                 return rc;
 200         }
 201         dev_dbg(&pci_dev->dev, "[%s] done with rc=%d\n", __func__, rc);
 202 
 203         /*
 204          * Here is the right spot to clear the register read
 205          * failure. pci_bus_reset() does this job in real systems.
 206          */
 207         cd->err_inject &= ~(GENWQE_INJECT_HARDWARE_FAILURE |
 208                             GENWQE_INJECT_GFIR_FATAL |
 209                             GENWQE_INJECT_GFIR_INFO);
 210 
 211         rc = pci_request_mem_regions(pci_dev, genwqe_driver_name);
 212         if (rc) {
 213                 dev_err(&pci_dev->dev,
 214                         "[%s] err: request bars failed (%d)\n", __func__, rc);
 215                 return -EIO;
 216         }
 217 
 218         cd->mmio = pci_iomap(pci_dev, 0, 0);
 219         if (cd->mmio == NULL) {
 220                 dev_err(&pci_dev->dev,
 221                         "[%s] err: mapping BAR0 failed\n", __func__);
 222                 return -ENOMEM;
 223         }
 224         return 0;
 225 }
 226 
 227 /*
 228  * Hardware circumvention section. Certain bitstreams in our test-lab
 229  * had different kinds of problems. Here is where we adjust those
  230  * bitstreams to function well with this version of our device driver.
  231  *
  232  * These circumventions are applied to the physical function only.
 233  * The magical numbers below are identifying development/manufacturing
 234  * versions of the bitstream used on the card.
 235  *
 236  * Turn off error reporting for old/manufacturing images.
 237  */
 238 
 239 bool genwqe_need_err_masking(struct genwqe_dev *cd)
 240 {
 241         return (cd->slu_unitcfg & 0xFFFF0ull) < 0x32170ull;
 242 }
 243 
 244 static void genwqe_tweak_hardware(struct genwqe_dev *cd)
 245 {
 246         struct pci_dev *pci_dev = cd->pci_dev;
 247 
 248         /* Mask FIRs for development images */
 249         if (((cd->slu_unitcfg & 0xFFFF0ull) >= 0x32000ull) &&
 250             ((cd->slu_unitcfg & 0xFFFF0ull) <= 0x33250ull)) {
 251                 dev_warn(&pci_dev->dev,
 252                          "FIRs masked due to bitstream %016llx.%016llx\n",
 253                          cd->slu_unitcfg, cd->app_unitcfg);
 254 
 255                 __genwqe_writeq(cd, IO_APP_SEC_LEM_DEBUG_OVR,
 256                                 0xFFFFFFFFFFFFFFFFull);
 257 
 258                 __genwqe_writeq(cd, IO_APP_ERR_ACT_MASK,
 259                                 0x0000000000000000ull);
 260         }
 261 }
 262 
 263 /**
 264  * genwqe_recovery_on_fatal_gfir_required() - Version depended actions
 265  *
 266  * Bitstreams older than 2013-02-17 have a bug where fatal GFIRs must
 267  * be ignored. This is e.g. true for the bitstream we gave to the card
 268  * manufacturer, but also for some old bitstreams we released to our
 269  * test-lab.
 270  */
 271 int genwqe_recovery_on_fatal_gfir_required(struct genwqe_dev *cd)
 272 {
 273         return (cd->slu_unitcfg & 0xFFFF0ull) >= 0x32170ull;
 274 }
 275 
 276 int genwqe_flash_readback_fails(struct genwqe_dev *cd)
 277 {
 278         return (cd->slu_unitcfg & 0xFFFF0ull) < 0x32170ull;
 279 }
 280 
 281 /**
 282  * genwqe_T_psec() - Calculate PF/VF timeout register content
 283  *
 284  * Note: From a design perspective it turned out to be a bad idea to
 285  * use codes here to specifiy the frequency/speed values. An old
 286  * driver cannot understand new codes and is therefore always a
 287  * problem. Better is to measure out the value or put the
 288  * speed/frequency directly into a register which is always a valid
 289  * value for old as well as for new software.
 290  */
 291 /* T = 1/f */
 292 static int genwqe_T_psec(struct genwqe_dev *cd)
 293 {
 294         u16 speed;      /* 1/f -> 250,  200,  166,  175 */
 295         static const int T[] = { 4000, 5000, 6000, 5714 };
 296 
 297         speed = (u16)((cd->slu_unitcfg >> 28) & 0x0full);
 298         if (speed >= ARRAY_SIZE(T))
 299                 return -1;      /* illegal value */
 300 
 301         return T[speed];
 302 }
 303 
 304 /**
 305  * genwqe_setup_pf_jtimer() - Setup PF hardware timeouts for DDCB execution
 306  *
 307  * Do this _after_ card_reset() is called. Otherwise the values will
 308  * vanish. The settings need to be done when the queues are inactive.
 309  *
 310  * The max. timeout value is 2^(10+x) * T (6ns for 166MHz) * 15/16.
 311  * The min. timeout value is 2^(10+x) * T (6ns for 166MHz) * 14/16.
 312  */
 313 static bool genwqe_setup_pf_jtimer(struct genwqe_dev *cd)
 314 {
 315         u32 T = genwqe_T_psec(cd);
 316         u64 x;
 317 
 318         if (GENWQE_PF_JOBTIMEOUT_MSEC == 0)
 319                 return false;
 320 
 321         /* PF: large value needed, flash update 2sec per block */
 322         x = ilog2(GENWQE_PF_JOBTIMEOUT_MSEC *
 323                   16000000000uL/(T * 15)) - 10;
 324 
 325         genwqe_write_vreg(cd, IO_SLC_VF_APPJOB_TIMEOUT,
 326                           0xff00 | (x & 0xff), 0);
 327         return true;
 328 }
 329 
 330 /**
 331  * genwqe_setup_vf_jtimer() - Setup VF hardware timeouts for DDCB execution
 332  */
 333 static bool genwqe_setup_vf_jtimer(struct genwqe_dev *cd)
 334 {
 335         struct pci_dev *pci_dev = cd->pci_dev;
 336         unsigned int vf;
 337         u32 T = genwqe_T_psec(cd);
 338         u64 x;
 339         int totalvfs;
 340 
 341         totalvfs = pci_sriov_get_totalvfs(pci_dev);
 342         if (totalvfs <= 0)
 343                 return false;
 344 
 345         for (vf = 0; vf < totalvfs; vf++) {
 346 
 347                 if (cd->vf_jobtimeout_msec[vf] == 0)
 348                         continue;
 349 
 350                 x = ilog2(cd->vf_jobtimeout_msec[vf] *
 351                           16000000000uL/(T * 15)) - 10;
 352 
 353                 genwqe_write_vreg(cd, IO_SLC_VF_APPJOB_TIMEOUT,
 354                                   0xff00 | (x & 0xff), vf + 1);
 355         }
 356         return true;
 357 }
 358 
 359 static int genwqe_ffdc_buffs_alloc(struct genwqe_dev *cd)
 360 {
 361         unsigned int type, e = 0;
 362 
 363         for (type = 0; type < GENWQE_DBG_UNITS; type++) {
 364                 switch (type) {
 365                 case GENWQE_DBG_UNIT0:
 366                         e = genwqe_ffdc_buff_size(cd, 0);
 367                         break;
 368                 case GENWQE_DBG_UNIT1:
 369                         e = genwqe_ffdc_buff_size(cd, 1);
 370                         break;
 371                 case GENWQE_DBG_UNIT2:
 372                         e = genwqe_ffdc_buff_size(cd, 2);
 373                         break;
 374                 case GENWQE_DBG_REGS:
 375                         e = GENWQE_FFDC_REGS;
 376                         break;
 377                 }
 378 
 379                 /* currently support only the debug units mentioned here */
 380                 cd->ffdc[type].entries = e;
 381                 cd->ffdc[type].regs =
 382                         kmalloc_array(e, sizeof(struct genwqe_reg),
 383                                       GFP_KERNEL);
 384                 /*
 385                  * regs == NULL is ok, the using code treats this as no regs,
 386                  * Printing warning is ok in this case.
 387                  */
 388         }
 389         return 0;
 390 }
 391 
 392 static void genwqe_ffdc_buffs_free(struct genwqe_dev *cd)
 393 {
 394         unsigned int type;
 395 
 396         for (type = 0; type < GENWQE_DBG_UNITS; type++) {
 397                 kfree(cd->ffdc[type].regs);
 398                 cd->ffdc[type].regs = NULL;
 399         }
 400 }
 401 
 402 static int genwqe_read_ids(struct genwqe_dev *cd)
 403 {
 404         int err = 0;
 405         int slu_id;
 406         struct pci_dev *pci_dev = cd->pci_dev;
 407 
 408         cd->slu_unitcfg = __genwqe_readq(cd, IO_SLU_UNITCFG);
 409         if (cd->slu_unitcfg == IO_ILLEGAL_VALUE) {
 410                 dev_err(&pci_dev->dev,
 411                         "err: SLUID=%016llx\n", cd->slu_unitcfg);
 412                 err = -EIO;
 413                 goto out_err;
 414         }
 415 
 416         slu_id = genwqe_get_slu_id(cd);
 417         if (slu_id < GENWQE_SLU_ARCH_REQ || slu_id == 0xff) {
 418                 dev_err(&pci_dev->dev,
 419                         "err: incompatible SLU Architecture %u\n", slu_id);
 420                 err = -ENOENT;
 421                 goto out_err;
 422         }
 423 
 424         cd->app_unitcfg = __genwqe_readq(cd, IO_APP_UNITCFG);
 425         if (cd->app_unitcfg == IO_ILLEGAL_VALUE) {
 426                 dev_err(&pci_dev->dev,
 427                         "err: APPID=%016llx\n", cd->app_unitcfg);
 428                 err = -EIO;
 429                 goto out_err;
 430         }
 431         genwqe_read_app_id(cd, cd->app_name, sizeof(cd->app_name));
 432 
 433         /*
 434          * Is access to all registers possible? If we are a VF the
 435          * answer is obvious. If we run fully virtualized, we need to
 436          * check if we can access all registers. If we do not have
 437          * full access we will cause an UR and some informational FIRs
 438          * in the PF, but that should not harm.
 439          */
 440         if (pci_dev->is_virtfn)
 441                 cd->is_privileged = 0;
 442         else
 443                 cd->is_privileged = (__genwqe_readq(cd, IO_SLU_BITSTREAM)
 444                                      != IO_ILLEGAL_VALUE);
 445 
 446  out_err:
 447         return err;
 448 }
 449 
 450 static int genwqe_start(struct genwqe_dev *cd)
 451 {
 452         int err;
 453         struct pci_dev *pci_dev = cd->pci_dev;
 454 
 455         err = genwqe_read_ids(cd);
 456         if (err)
 457                 return err;
 458 
 459         if (genwqe_is_privileged(cd)) {
 460                 /* do this after the tweaks. alloc fail is acceptable */
 461                 genwqe_ffdc_buffs_alloc(cd);
 462                 genwqe_stop_traps(cd);
 463 
 464                 /* Collect registers e.g. FIRs, UNITIDs, traces ... */
 465                 genwqe_read_ffdc_regs(cd, cd->ffdc[GENWQE_DBG_REGS].regs,
 466                                       cd->ffdc[GENWQE_DBG_REGS].entries, 0);
 467 
 468                 genwqe_ffdc_buff_read(cd, GENWQE_DBG_UNIT0,
 469                                       cd->ffdc[GENWQE_DBG_UNIT0].regs,
 470                                       cd->ffdc[GENWQE_DBG_UNIT0].entries);
 471 
 472                 genwqe_ffdc_buff_read(cd, GENWQE_DBG_UNIT1,
 473                                       cd->ffdc[GENWQE_DBG_UNIT1].regs,
 474                                       cd->ffdc[GENWQE_DBG_UNIT1].entries);
 475 
 476                 genwqe_ffdc_buff_read(cd, GENWQE_DBG_UNIT2,
 477                                       cd->ffdc[GENWQE_DBG_UNIT2].regs,
 478                                       cd->ffdc[GENWQE_DBG_UNIT2].entries);
 479 
 480                 genwqe_start_traps(cd);
 481 
 482                 if (cd->card_state == GENWQE_CARD_FATAL_ERROR) {
 483                         dev_warn(&pci_dev->dev,
 484                                  "[%s] chip reload/recovery!\n", __func__);
 485 
 486                         /*
 487                          * Stealth Mode: Reload chip on either hot
 488                          * reset or PERST.
 489                          */
 490                         cd->softreset = 0x7Cull;
 491                         __genwqe_writeq(cd, IO_SLC_CFGREG_SOFTRESET,
 492                                        cd->softreset);
 493 
 494                         err = genwqe_bus_reset(cd);
 495                         if (err != 0) {
 496                                 dev_err(&pci_dev->dev,
 497                                         "[%s] err: bus reset failed!\n",
 498                                         __func__);
 499                                 goto out;
 500                         }
 501 
 502                         /*
 503                          * Re-read the IDs because
 504                          * it could happen that the bitstream load
 505                          * failed!
 506                          */
 507                         err = genwqe_read_ids(cd);
 508                         if (err)
 509                                 goto out;
 510                 }
 511         }
 512 
 513         err = genwqe_setup_service_layer(cd);  /* does a reset to the card */
 514         if (err != 0) {
 515                 dev_err(&pci_dev->dev,
 516                         "[%s] err: could not setup servicelayer!\n", __func__);
 517                 err = -ENODEV;
 518                 goto out;
 519         }
 520 
 521         if (genwqe_is_privileged(cd)) {  /* code is running _after_ reset */
 522                 genwqe_tweak_hardware(cd);
 523 
 524                 genwqe_setup_pf_jtimer(cd);
 525                 genwqe_setup_vf_jtimer(cd);
 526         }
 527 
 528         err = genwqe_device_create(cd);
 529         if (err < 0) {
 530                 dev_err(&pci_dev->dev,
 531                         "err: chdev init failed! (err=%d)\n", err);
 532                 goto out_release_service_layer;
 533         }
 534         return 0;
 535 
 536  out_release_service_layer:
 537         genwqe_release_service_layer(cd);
 538  out:
 539         if (genwqe_is_privileged(cd))
 540                 genwqe_ffdc_buffs_free(cd);
 541         return -EIO;
 542 }
 543 
 544 /**
 545  * genwqe_stop() - Stop card operation
 546  *
 547  * Recovery notes:
 548  *   As long as genwqe_thread runs we might access registers during
 549  *   error data capture. Same is with the genwqe_health_thread.
 550  *   When genwqe_bus_reset() fails this function might called two times:
 551  *   first by the genwqe_health_thread() and later by genwqe_remove() to
 552  *   unbind the device. We must be able to survive that.
 553  *
 554  * This function must be robust enough to be called twice.
 555  */
 556 static int genwqe_stop(struct genwqe_dev *cd)
 557 {
 558         genwqe_finish_queue(cd);            /* no register access */
 559         genwqe_device_remove(cd);           /* device removed, procs killed */
 560         genwqe_release_service_layer(cd);   /* here genwqe_thread is stopped */
 561 
 562         if (genwqe_is_privileged(cd)) {
 563                 pci_disable_sriov(cd->pci_dev); /* access pci config space */
 564                 genwqe_ffdc_buffs_free(cd);
 565         }
 566 
 567         return 0;
 568 }
 569 
 570 /**
 571  * genwqe_recover_card() - Try to recover the card if it is possible
 572  *
 573  * If fatal_err is set no register access is possible anymore. It is
 574  * likely that genwqe_start fails in that situation. Proper error
 575  * handling is required in this case.
 576  *
 577  * genwqe_bus_reset() will cause the pci code to call genwqe_remove()
 578  * and later genwqe_probe() for all virtual functions.
 579  */
 580 static int genwqe_recover_card(struct genwqe_dev *cd, int fatal_err)
 581 {
 582         int rc;
 583         struct pci_dev *pci_dev = cd->pci_dev;
 584 
 585         genwqe_stop(cd);
 586 
 587         /*
 588          * Make sure chip is not reloaded to maintain FFDC. Write SLU
 589          * Reset Register, CPLDReset field to 0.
 590          */
 591         if (!fatal_err) {
 592                 cd->softreset = 0x70ull;
 593                 __genwqe_writeq(cd, IO_SLC_CFGREG_SOFTRESET, cd->softreset);
 594         }
 595 
 596         rc = genwqe_bus_reset(cd);
 597         if (rc != 0) {
 598                 dev_err(&pci_dev->dev,
 599                         "[%s] err: card recovery impossible!\n", __func__);
 600                 return rc;
 601         }
 602 
 603         rc = genwqe_start(cd);
 604         if (rc < 0) {
 605                 dev_err(&pci_dev->dev,
 606                         "[%s] err: failed to launch device!\n", __func__);
 607                 return rc;
 608         }
 609         return 0;
 610 }
 611 
 612 static int genwqe_health_check_cond(struct genwqe_dev *cd, u64 *gfir)
 613 {
 614         *gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
 615         return (*gfir & GFIR_ERR_TRIGGER) &&
 616                 genwqe_recovery_on_fatal_gfir_required(cd);
 617 }
 618 
 619 /**
 620  * genwqe_fir_checking() - Check the fault isolation registers of the card
 621  *
 622  * If this code works ok, can be tried out with help of the genwqe_poke tool:
 623  *   sudo ./tools/genwqe_poke 0x8 0xfefefefefef
 624  *
 625  * Now the relevant FIRs/sFIRs should be printed out and the driver should
 626  * invoke recovery (devices are removed and readded).
 627  */
 628 static u64 genwqe_fir_checking(struct genwqe_dev *cd)
 629 {
 630         int j, iterations = 0;
 631         u64 mask, fir, fec, uid, gfir, gfir_masked, sfir, sfec;
 632         u32 fir_addr, fir_clr_addr, fec_addr, sfir_addr, sfec_addr;
 633         struct pci_dev *pci_dev = cd->pci_dev;
 634 
 635  healthMonitor:
 636         iterations++;
 637         if (iterations > 16) {
 638                 dev_err(&pci_dev->dev, "* exit looping after %d times\n",
 639                         iterations);
 640                 goto fatal_error;
 641         }
 642 
 643         gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
 644         if (gfir != 0x0)
 645                 dev_err(&pci_dev->dev, "* 0x%08x 0x%016llx\n",
 646                                     IO_SLC_CFGREG_GFIR, gfir);
 647         if (gfir == IO_ILLEGAL_VALUE)
 648                 goto fatal_error;
 649 
 650         /*
 651          * Avoid printing when to GFIR bit is on prevents contignous
 652          * printout e.g. for the following bug:
 653          *   FIR set without a 2ndary FIR/FIR cannot be cleared
 654          * Comment out the following if to get the prints:
 655          */
 656         if (gfir == 0)
 657                 return 0;
 658 
 659         gfir_masked = gfir & GFIR_ERR_TRIGGER;  /* fatal errors */
 660 
 661         for (uid = 0; uid < GENWQE_MAX_UNITS; uid++) { /* 0..2 in zEDC */
 662 
 663                 /* read the primary FIR (pfir) */
 664                 fir_addr = (uid << 24) + 0x08;
 665                 fir = __genwqe_readq(cd, fir_addr);
 666                 if (fir == 0x0)
 667                         continue;  /* no error in this unit */
 668 
 669                 dev_err(&pci_dev->dev, "* 0x%08x 0x%016llx\n", fir_addr, fir);
 670                 if (fir == IO_ILLEGAL_VALUE)
 671                         goto fatal_error;
 672 
 673                 /* read primary FEC */
 674                 fec_addr = (uid << 24) + 0x18;
 675                 fec = __genwqe_readq(cd, fec_addr);
 676 
 677                 dev_err(&pci_dev->dev, "* 0x%08x 0x%016llx\n", fec_addr, fec);
 678                 if (fec == IO_ILLEGAL_VALUE)
 679                         goto fatal_error;
 680 
 681                 for (j = 0, mask = 1ULL; j < 64; j++, mask <<= 1) {
 682 
 683                         /* secondary fir empty, skip it */
 684                         if ((fir & mask) == 0x0)
 685                                 continue;
 686 
 687                         sfir_addr = (uid << 24) + 0x100 + 0x08 * j;
 688                         sfir = __genwqe_readq(cd, sfir_addr);
 689 
 690                         if (sfir == IO_ILLEGAL_VALUE)
 691                                 goto fatal_error;
 692                         dev_err(&pci_dev->dev,
 693                                 "* 0x%08x 0x%016llx\n", sfir_addr, sfir);
 694 
 695                         sfec_addr = (uid << 24) + 0x300 + 0x08 * j;
 696                         sfec = __genwqe_readq(cd, sfec_addr);
 697 
 698                         if (sfec == IO_ILLEGAL_VALUE)
 699                                 goto fatal_error;
 700                         dev_err(&pci_dev->dev,
 701                                 "* 0x%08x 0x%016llx\n", sfec_addr, sfec);
 702 
 703                         gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
 704                         if (gfir == IO_ILLEGAL_VALUE)
 705                                 goto fatal_error;
 706 
 707                         /* gfir turned on during routine! get out and
 708                            start over. */
 709                         if ((gfir_masked == 0x0) &&
 710                             (gfir & GFIR_ERR_TRIGGER)) {
 711                                 goto healthMonitor;
 712                         }
 713 
 714                         /* do not clear if we entered with a fatal gfir */
 715                         if (gfir_masked == 0x0) {
 716 
 717                                 /* NEW clear by mask the logged bits */
 718                                 sfir_addr = (uid << 24) + 0x100 + 0x08 * j;
 719                                 __genwqe_writeq(cd, sfir_addr, sfir);
 720 
 721                                 dev_dbg(&pci_dev->dev,
 722                                         "[HM] Clearing  2ndary FIR 0x%08x with 0x%016llx\n",
 723                                         sfir_addr, sfir);
 724 
 725                                 /*
 726                                  * note, these cannot be error-Firs
 727                                  * since gfir_masked is 0 after sfir
 728                                  * was read. Also, it is safe to do
 729                                  * this write if sfir=0. Still need to
 730                                  * clear the primary. This just means
 731                                  * there is no secondary FIR.
 732                                  */
 733 
 734                                 /* clear by mask the logged bit. */
 735                                 fir_clr_addr = (uid << 24) + 0x10;
 736                                 __genwqe_writeq(cd, fir_clr_addr, mask);
 737 
 738                                 dev_dbg(&pci_dev->dev,
 739                                         "[HM] Clearing primary FIR 0x%08x with 0x%016llx\n",
 740                                         fir_clr_addr, mask);
 741                         }
 742                 }
 743         }
 744         gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
 745         if (gfir == IO_ILLEGAL_VALUE)
 746                 goto fatal_error;
 747 
 748         if ((gfir_masked == 0x0) && (gfir & GFIR_ERR_TRIGGER)) {
 749                 /*
 750                  * Check once more that it didn't go on after all the
 751                  * FIRS were cleared.
 752                  */
 753                 dev_dbg(&pci_dev->dev, "ACK! Another FIR! Recursing %d!\n",
 754                         iterations);
 755                 goto healthMonitor;
 756         }
 757         return gfir_masked;
 758 
 759  fatal_error:
 760         return IO_ILLEGAL_VALUE;
 761 }
 762 
 763 /**
 764  * genwqe_pci_fundamental_reset() - trigger a PCIe fundamental reset on the slot
 765  *
 766  * Note: pci_set_pcie_reset_state() is not implemented on all archs, so this
 767  * reset method will not work in all cases.
 768  *
 769  * Return: 0 on success or error code from pci_set_pcie_reset_state()
 770  */
 771 static int genwqe_pci_fundamental_reset(struct pci_dev *pci_dev)
 772 {
 773         int rc;
 774 
 775         /*
 776          * lock pci config space access from userspace,
 777          * save state and issue PCIe fundamental reset
 778          */
 779         pci_cfg_access_lock(pci_dev);
 780         pci_save_state(pci_dev);
 781         rc = pci_set_pcie_reset_state(pci_dev, pcie_warm_reset);
 782         if (!rc) {
 783                 /* keep PCIe reset asserted for 250ms */
 784                 msleep(250);
 785                 pci_set_pcie_reset_state(pci_dev, pcie_deassert_reset);
 786                 /* Wait for 2s to reload flash and train the link */
 787                 msleep(2000);
 788         }
 789         pci_restore_state(pci_dev);
 790         pci_cfg_access_unlock(pci_dev);
 791         return rc;
 792 }
 793 
 794 
 795 static int genwqe_platform_recovery(struct genwqe_dev *cd)
 796 {
 797         struct pci_dev *pci_dev = cd->pci_dev;
 798         int rc;
 799 
 800         dev_info(&pci_dev->dev,
 801                  "[%s] resetting card for error recovery\n", __func__);
 802 
 803         /* Clear out error injection flags */
 804         cd->err_inject &= ~(GENWQE_INJECT_HARDWARE_FAILURE |
 805                             GENWQE_INJECT_GFIR_FATAL |
 806                             GENWQE_INJECT_GFIR_INFO);
 807 
 808         genwqe_stop(cd);
 809 
 810         /* Try recoverying the card with fundamental reset */
 811         rc = genwqe_pci_fundamental_reset(pci_dev);
 812         if (!rc) {
 813                 rc = genwqe_start(cd);
 814                 if (!rc)
 815                         dev_info(&pci_dev->dev,
 816                                  "[%s] card recovered\n", __func__);
 817                 else
 818                         dev_err(&pci_dev->dev,
 819                                 "[%s] err: cannot start card services! (err=%d)\n",
 820                                 __func__, rc);
 821         } else {
 822                 dev_err(&pci_dev->dev,
 823                         "[%s] card reset failed\n", __func__);
 824         }
 825 
 826         return rc;
 827 }
 828 
 829 /*
 830  * genwqe_reload_bistream() - reload card bitstream
 831  *
 832  * Set the appropriate register and call fundamental reset to reaload the card
 833  * bitstream.
 834  *
 835  * Return: 0 on success, error code otherwise
 836  */
 837 static int genwqe_reload_bistream(struct genwqe_dev *cd)
 838 {
 839         struct pci_dev *pci_dev = cd->pci_dev;
 840         int rc;
 841 
 842         dev_info(&pci_dev->dev,
 843                  "[%s] resetting card for bitstream reload\n",
 844                  __func__);
 845 
 846         genwqe_stop(cd);
 847 
 848         /*
 849          * Cause a CPLD reprogram with the 'next_bitstream'
 850          * partition on PCIe hot or fundamental reset
 851          */
 852         __genwqe_writeq(cd, IO_SLC_CFGREG_SOFTRESET,
 853                         (cd->softreset & 0xcull) | 0x70ull);
 854 
 855         rc = genwqe_pci_fundamental_reset(pci_dev);
 856         if (rc) {
 857                 /*
 858                  * A fundamental reset failure can be caused
 859                  * by lack of support on the arch, so we just
 860                  * log the error and try to start the card
 861                  * again.
 862                  */
 863                 dev_err(&pci_dev->dev,
 864                         "[%s] err: failed to reset card for bitstream reload\n",
 865                         __func__);
 866         }
 867 
 868         rc = genwqe_start(cd);
 869         if (rc) {
 870                 dev_err(&pci_dev->dev,
 871                         "[%s] err: cannot start card services! (err=%d)\n",
 872                         __func__, rc);
 873                 return rc;
 874         }
 875         dev_info(&pci_dev->dev,
 876                  "[%s] card reloaded\n", __func__);
 877         return 0;
 878 }
 879 
 880 
 881 /**
 882  * genwqe_health_thread() - Health checking thread
 883  *
 884  * This thread is only started for the PF of the card.
 885  *
 886  * This thread monitors the health of the card. A critical situation
 887  * is when we read registers which contain -1 (IO_ILLEGAL_VALUE). In
 888  * this case we need to be recovered from outside. Writing to
 889  * registers will very likely not work either.
 890  *
 891  * This thread must only exit if kthread_should_stop() becomes true.
 892  *
 893  * Condition for the health-thread to trigger:
 894  *   a) when a kthread_stop() request comes in or
 895  *   b) a critical GFIR occured
 896  *
 897  * Informational GFIRs are checked and potentially printed in
 898  * GENWQE_HEALTH_CHECK_INTERVAL seconds.
 899  */
 900 static int genwqe_health_thread(void *data)
 901 {
 902         int rc, should_stop = 0;
 903         struct genwqe_dev *cd = data;
 904         struct pci_dev *pci_dev = cd->pci_dev;
 905         u64 gfir, gfir_masked, slu_unitcfg, app_unitcfg;
 906 
 907  health_thread_begin:
 908         while (!kthread_should_stop()) {
 909                 rc = wait_event_interruptible_timeout(cd->health_waitq,
 910                          (genwqe_health_check_cond(cd, &gfir) ||
 911                           (should_stop = kthread_should_stop())),
 912                                 GENWQE_HEALTH_CHECK_INTERVAL * HZ);
 913 
 914                 if (should_stop)
 915                         break;
 916 
 917                 if (gfir == IO_ILLEGAL_VALUE) {
 918                         dev_err(&pci_dev->dev,
 919                                 "[%s] GFIR=%016llx\n", __func__, gfir);
 920                         goto fatal_error;
 921                 }
 922 
 923                 slu_unitcfg = __genwqe_readq(cd, IO_SLU_UNITCFG);
 924                 if (slu_unitcfg == IO_ILLEGAL_VALUE) {
 925                         dev_err(&pci_dev->dev,
 926                                 "[%s] SLU_UNITCFG=%016llx\n",
 927                                 __func__, slu_unitcfg);
 928                         goto fatal_error;
 929                 }
 930 
 931                 app_unitcfg = __genwqe_readq(cd, IO_APP_UNITCFG);
 932                 if (app_unitcfg == IO_ILLEGAL_VALUE) {
 933                         dev_err(&pci_dev->dev,
 934                                 "[%s] APP_UNITCFG=%016llx\n",
 935                                 __func__, app_unitcfg);
 936                         goto fatal_error;
 937                 }
 938 
 939                 gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
 940                 if (gfir == IO_ILLEGAL_VALUE) {
 941                         dev_err(&pci_dev->dev,
 942                                 "[%s] %s: GFIR=%016llx\n", __func__,
 943                                 (gfir & GFIR_ERR_TRIGGER) ? "err" : "info",
 944                                 gfir);
 945                         goto fatal_error;
 946                 }
 947 
 948                 gfir_masked = genwqe_fir_checking(cd);
 949                 if (gfir_masked == IO_ILLEGAL_VALUE)
 950                         goto fatal_error;
 951 
 952                 /*
 953                  * GFIR ErrorTrigger bits set => reset the card!
 954                  * Never do this for old/manufacturing images!
 955                  */
 956                 if ((gfir_masked) && !cd->skip_recovery &&
 957                     genwqe_recovery_on_fatal_gfir_required(cd)) {
 958 
 959                         cd->card_state = GENWQE_CARD_FATAL_ERROR;
 960 
 961                         rc = genwqe_recover_card(cd, 0);
 962                         if (rc < 0) {
 963                                 /* FIXME Card is unusable and needs unbind! */
 964                                 goto fatal_error;
 965                         }
 966                 }
 967 
 968                 if (cd->card_state == GENWQE_CARD_RELOAD_BITSTREAM) {
 969                         /* Userspace requested card bitstream reload */
 970                         rc = genwqe_reload_bistream(cd);
 971                         if (rc)
 972                                 goto fatal_error;
 973                 }
 974 
 975                 cd->last_gfir = gfir;
 976                 cond_resched();
 977         }
 978 
 979         return 0;
 980 
 981  fatal_error:
 982         if (cd->use_platform_recovery) {
 983                 /*
 984                  * Since we use raw accessors, EEH errors won't be detected
 985                  * by the platform until we do a non-raw MMIO or config space
 986                  * read
 987                  */
 988                 readq(cd->mmio + IO_SLC_CFGREG_GFIR);
 989 
 990                 /* We do nothing if the card is going over PCI recovery */
 991                 if (pci_channel_offline(pci_dev))
 992                         return -EIO;
 993 
 994                 /*
 995                  * If it's supported by the platform, we try a fundamental reset
 996                  * to recover from a fatal error. Otherwise, we continue to wait
 997                  * for an external recovery procedure to take care of it.
 998                  */
 999                 rc = genwqe_platform_recovery(cd);
1000                 if (!rc)
1001                         goto health_thread_begin;
1002         }
1003 
1004         dev_err(&pci_dev->dev,
1005                 "[%s] card unusable. Please trigger unbind!\n", __func__);
1006 
1007         /* Bring down logical devices to inform user space via udev remove. */
1008         cd->card_state = GENWQE_CARD_FATAL_ERROR;
1009         genwqe_stop(cd);
1010 
1011         /* genwqe_bus_reset failed(). Now wait for genwqe_remove(). */
1012         while (!kthread_should_stop())
1013                 cond_resched();
1014 
1015         return -EIO;
1016 }
1017 
1018 static int genwqe_health_check_start(struct genwqe_dev *cd)
1019 {
1020         int rc;
1021 
1022         if (GENWQE_HEALTH_CHECK_INTERVAL <= 0)
1023                 return 0;       /* valid for disabling the service */
1024 
1025         /* moved before request_irq() */
1026         /* init_waitqueue_head(&cd->health_waitq); */
1027 
1028         cd->health_thread = kthread_run(genwqe_health_thread, cd,
1029                                         GENWQE_DEVNAME "%d_health",
1030                                         cd->card_idx);
1031         if (IS_ERR(cd->health_thread)) {
1032                 rc = PTR_ERR(cd->health_thread);
1033                 cd->health_thread = NULL;
1034                 return rc;
1035         }
1036         return 0;
1037 }
1038 
1039 static int genwqe_health_thread_running(struct genwqe_dev *cd)
1040 {
1041         return cd->health_thread != NULL;
1042 }
1043 
1044 static int genwqe_health_check_stop(struct genwqe_dev *cd)
1045 {
1046         int rc;
1047 
1048         if (!genwqe_health_thread_running(cd))
1049                 return -EIO;
1050 
1051         rc = kthread_stop(cd->health_thread);
1052         cd->health_thread = NULL;
1053         return 0;
1054 }
1055 
1056 /**
1057  * genwqe_pci_setup() - Allocate PCIe related resources for our card
1058  */
1059 static int genwqe_pci_setup(struct genwqe_dev *cd)
1060 {
1061         int err;
1062         struct pci_dev *pci_dev = cd->pci_dev;
1063 
1064         err = pci_enable_device_mem(pci_dev);
1065         if (err) {
1066                 dev_err(&pci_dev->dev,
1067                         "err: failed to enable pci memory (err=%d)\n", err);
1068                 goto err_out;
1069         }
1070 
1071         /* Reserve PCI I/O and memory resources */
1072         err = pci_request_mem_regions(pci_dev, genwqe_driver_name);
1073         if (err) {
1074                 dev_err(&pci_dev->dev,
1075                         "[%s] err: request bars failed (%d)\n", __func__, err);
1076                 err = -EIO;
1077                 goto err_disable_device;
1078         }
1079 
1080         /* check for 64-bit DMA address supported (DAC) */
1081         if (!pci_set_dma_mask(pci_dev, DMA_BIT_MASK(64))) {
1082                 err = pci_set_consistent_dma_mask(pci_dev, DMA_BIT_MASK(64));
1083                 if (err) {
1084                         dev_err(&pci_dev->dev,
1085                                 "err: DMA64 consistent mask error\n");
1086                         err = -EIO;
1087                         goto out_release_resources;
1088                 }
1089         /* check for 32-bit DMA address supported (SAC) */
1090         } else if (!pci_set_dma_mask(pci_dev, DMA_BIT_MASK(32))) {
1091                 err = pci_set_consistent_dma_mask(pci_dev, DMA_BIT_MASK(32));
1092                 if (err) {
1093                         dev_err(&pci_dev->dev,
1094                                 "err: DMA32 consistent mask error\n");
1095                         err = -EIO;
1096                         goto out_release_resources;
1097                 }
1098         } else {
1099                 dev_err(&pci_dev->dev,
1100                         "err: neither DMA32 nor DMA64 supported\n");
1101                 err = -EIO;
1102                 goto out_release_resources;
1103         }
1104 
1105         pci_set_master(pci_dev);
1106         pci_enable_pcie_error_reporting(pci_dev);
1107 
1108         /* EEH recovery requires PCIe fundamental reset */
1109         pci_dev->needs_freset = 1;
1110 
1111         /* request complete BAR-0 space (length = 0) */
1112         cd->mmio_len = pci_resource_len(pci_dev, 0);
1113         cd->mmio = pci_iomap(pci_dev, 0, 0);
1114         if (cd->mmio == NULL) {
1115                 dev_err(&pci_dev->dev,
1116                         "[%s] err: mapping BAR0 failed\n", __func__);
1117                 err = -ENOMEM;
1118                 goto out_release_resources;
1119         }
1120 
1121         cd->num_vfs = pci_sriov_get_totalvfs(pci_dev);
1122         if (cd->num_vfs < 0)
1123                 cd->num_vfs = 0;
1124 
1125         err = genwqe_read_ids(cd);
1126         if (err)
1127                 goto out_iounmap;
1128 
1129         return 0;
1130 
1131  out_iounmap:
1132         pci_iounmap(pci_dev, cd->mmio);
1133  out_release_resources:
1134         pci_release_mem_regions(pci_dev);
1135  err_disable_device:
1136         pci_disable_device(pci_dev);
1137  err_out:
1138         return err;
1139 }
1140 
1141 /**
1142  * genwqe_pci_remove() - Free PCIe related resources for our card
1143  */
1144 static void genwqe_pci_remove(struct genwqe_dev *cd)
1145 {
1146         struct pci_dev *pci_dev = cd->pci_dev;
1147 
1148         if (cd->mmio)
1149                 pci_iounmap(pci_dev, cd->mmio);
1150 
1151         pci_release_mem_regions(pci_dev);
1152         pci_disable_device(pci_dev);
1153 }
1154 
1155 /**
1156  * genwqe_probe() - Device initialization
1157  * @pdev:       PCI device information struct
1158  *
1159  * Callable for multiple cards. This function is called on bind.
1160  *
1161  * Return: 0 if succeeded, < 0 when failed
1162  */
1163 static int genwqe_probe(struct pci_dev *pci_dev,
1164                         const struct pci_device_id *id)
1165 {
1166         int err;
1167         struct genwqe_dev *cd;
1168 
1169         genwqe_init_crc32();
1170 
1171         cd = genwqe_dev_alloc();
1172         if (IS_ERR(cd)) {
1173                 dev_err(&pci_dev->dev, "err: could not alloc mem (err=%d)!\n",
1174                         (int)PTR_ERR(cd));
1175                 return PTR_ERR(cd);
1176         }
1177 
1178         dev_set_drvdata(&pci_dev->dev, cd);
1179         cd->pci_dev = pci_dev;
1180 
1181         err = genwqe_pci_setup(cd);
1182         if (err < 0) {
1183                 dev_err(&pci_dev->dev,
1184                         "err: problems with PCI setup (err=%d)\n", err);
1185                 goto out_free_dev;
1186         }
1187 
1188         err = genwqe_start(cd);
1189         if (err < 0) {
1190                 dev_err(&pci_dev->dev,
1191                         "err: cannot start card services! (err=%d)\n", err);
1192                 goto out_pci_remove;
1193         }
1194 
1195         if (genwqe_is_privileged(cd)) {
1196                 err = genwqe_health_check_start(cd);
1197                 if (err < 0) {
1198                         dev_err(&pci_dev->dev,
1199                                 "err: cannot start health checking! (err=%d)\n",
1200                                 err);
1201                         goto out_stop_services;
1202                 }
1203         }
1204         return 0;
1205 
1206  out_stop_services:
1207         genwqe_stop(cd);
1208  out_pci_remove:
1209         genwqe_pci_remove(cd);
1210  out_free_dev:
1211         genwqe_dev_free(cd);
1212         return err;
1213 }
1214 
1215 /**
1216  * genwqe_remove() - Called when device is removed (hot-plugable)
1217  *
1218  * Or when driver is unloaded respecitively when unbind is done.
1219  */
1220 static void genwqe_remove(struct pci_dev *pci_dev)
1221 {
1222         struct genwqe_dev *cd = dev_get_drvdata(&pci_dev->dev);
1223 
1224         genwqe_health_check_stop(cd);
1225 
1226         /*
1227          * genwqe_stop() must survive if it is called twice
1228          * sequentially. This happens when the health thread calls it
1229          * and fails on genwqe_bus_reset().
1230          */
1231         genwqe_stop(cd);
1232         genwqe_pci_remove(cd);
1233         genwqe_dev_free(cd);
1234 }
1235 
1236 /*
1237  * genwqe_err_error_detected() - Error detection callback
1238  *
1239  * This callback is called by the PCI subsystem whenever a PCI bus
1240  * error is detected.
1241  */
1242 static pci_ers_result_t genwqe_err_error_detected(struct pci_dev *pci_dev,
1243                                                  enum pci_channel_state state)
1244 {
1245         struct genwqe_dev *cd;
1246 
1247         dev_err(&pci_dev->dev, "[%s] state=%d\n", __func__, state);
1248 
1249         cd = dev_get_drvdata(&pci_dev->dev);
1250         if (cd == NULL)
1251                 return PCI_ERS_RESULT_DISCONNECT;
1252 
1253         /* Stop the card */
1254         genwqe_health_check_stop(cd);
1255         genwqe_stop(cd);
1256 
1257         /*
1258          * On permanent failure, the PCI code will call device remove
1259          * after the return of this function.
1260          * genwqe_stop() can be called twice.
1261          */
1262         if (state == pci_channel_io_perm_failure) {
1263                 return PCI_ERS_RESULT_DISCONNECT;
1264         } else {
1265                 genwqe_pci_remove(cd);
1266                 return PCI_ERS_RESULT_NEED_RESET;
1267         }
1268 }
1269 
1270 static pci_ers_result_t genwqe_err_slot_reset(struct pci_dev *pci_dev)
1271 {
1272         int rc;
1273         struct genwqe_dev *cd = dev_get_drvdata(&pci_dev->dev);
1274 
1275         rc = genwqe_pci_setup(cd);
1276         if (!rc) {
1277                 return PCI_ERS_RESULT_RECOVERED;
1278         } else {
1279                 dev_err(&pci_dev->dev,
1280                         "err: problems with PCI setup (err=%d)\n", rc);
1281                 return PCI_ERS_RESULT_DISCONNECT;
1282         }
1283 }
1284 
/*
 * genwqe_err_result_none() - Error handler callback without any action
 *
 * Installed as the mmio_enabled callback of genwqe_err_handler; it does
 * nothing but report PCI_ERS_RESULT_NONE.
 */
static pci_ers_result_t genwqe_err_result_none(struct pci_dev *dev)
{
	return PCI_ERS_RESULT_NONE;
}
1289 
1290 static void genwqe_err_resume(struct pci_dev *pci_dev)
1291 {
1292         int rc;
1293         struct genwqe_dev *cd = dev_get_drvdata(&pci_dev->dev);
1294 
1295         rc = genwqe_start(cd);
1296         if (!rc) {
1297                 rc = genwqe_health_check_start(cd);
1298                 if (rc)
1299                         dev_err(&pci_dev->dev,
1300                                 "err: cannot start health checking! (err=%d)\n",
1301                                 rc);
1302         } else {
1303                 dev_err(&pci_dev->dev,
1304                         "err: cannot start card services! (err=%d)\n", rc);
1305         }
1306 }
1307 
1308 static int genwqe_sriov_configure(struct pci_dev *dev, int numvfs)
1309 {
1310         int rc;
1311         struct genwqe_dev *cd = dev_get_drvdata(&dev->dev);
1312 
1313         if (numvfs > 0) {
1314                 genwqe_setup_vf_jtimer(cd);
1315                 rc = pci_enable_sriov(dev, numvfs);
1316                 if (rc < 0)
1317                         return rc;
1318                 return numvfs;
1319         }
1320         if (numvfs == 0) {
1321                 pci_disable_sriov(dev);
1322                 return 0;
1323         }
1324         return 0;
1325 }
1326 
1327 static struct pci_error_handlers genwqe_err_handler = {
1328         .error_detected = genwqe_err_error_detected,
1329         .mmio_enabled   = genwqe_err_result_none,
1330         .slot_reset     = genwqe_err_slot_reset,
1331         .resume         = genwqe_err_resume,
1332 };
1333 
/*
 * PCI driver description; registered in genwqe_init_module() and
 * unregistered in genwqe_exit_module().
 */
static struct pci_driver genwqe_driver = {
	.name	  = genwqe_driver_name,
	.id_table = genwqe_device_table,
	.probe	  = genwqe_probe,
	.remove	  = genwqe_remove,
	.sriov_configure = genwqe_sriov_configure,
	.err_handler = &genwqe_err_handler,
};
1342 
1343 /**
1344  * genwqe_devnode() - Set default access mode for genwqe devices.
1345  *
1346  * Default mode should be rw for everybody. Do not change default
1347  * device name.
1348  */
1349 static char *genwqe_devnode(struct device *dev, umode_t *mode)
1350 {
1351         if (mode)
1352                 *mode = 0666;
1353         return NULL;
1354 }
1355 
1356 /**
1357  * genwqe_init_module() - Driver registration and initialization
1358  */
1359 static int __init genwqe_init_module(void)
1360 {
1361         int rc;
1362 
1363         class_genwqe = class_create(THIS_MODULE, GENWQE_DEVNAME);
1364         if (IS_ERR(class_genwqe)) {
1365                 pr_err("[%s] create class failed\n", __func__);
1366                 return -ENOMEM;
1367         }
1368 
1369         class_genwqe->devnode = genwqe_devnode;
1370 
1371         debugfs_genwqe = debugfs_create_dir(GENWQE_DEVNAME, NULL);
1372 
1373         rc = pci_register_driver(&genwqe_driver);
1374         if (rc != 0) {
1375                 pr_err("[%s] pci_reg_driver (rc=%d)\n", __func__, rc);
1376                 goto err_out0;
1377         }
1378 
1379         return rc;
1380 
1381  err_out0:
1382         debugfs_remove(debugfs_genwqe);
1383         class_destroy(class_genwqe);
1384         return rc;
1385 }
1386 
/**
 * genwqe_exit_module() - Driver exit
 *
 * Undoes genwqe_init_module() in reverse order: the PCI driver is
 * unregistered first, then the debugfs directory and the device class
 * are removed.
 */
static void __exit genwqe_exit_module(void)
{
	pci_unregister_driver(&genwqe_driver);
	debugfs_remove(debugfs_genwqe);
	class_destroy(class_genwqe);
}
1396 
/* Register the module entry and exit points */
module_init(genwqe_init_module);
module_exit(genwqe_exit_module);

/* [<][>][^][v][top][bottom][index][help] */