root/drivers/edac/ghes_edac.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. ghes_edac_count_dimms
  2. get_dimm_smbios_index
  3. ghes_edac_dmidecode
  4. ghes_edac_report_mem_error
  5. ghes_edac_register
  6. ghes_edac_unregister

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * GHES/EDAC Linux driver
   4  *
   5  * Copyright (c) 2013 by Mauro Carvalho Chehab
   6  *
   7  * Red Hat Inc. http://www.redhat.com
   8  */
   9 
  10 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  11 
  12 #include <acpi/ghes.h>
  13 #include <linux/edac.h>
  14 #include <linux/dmi.h>
  15 #include "edac_module.h"
  16 #include <ras/ras_event.h>
  17 
  18 struct ghes_edac_pvt {
  19         struct list_head list;
  20         struct ghes *ghes;
  21         struct mem_ctl_info *mci;
  22 
  23         /* Buffers for the error handling routine */
  24         char detail_location[240];
  25         char other_detail[160];
  26         char msg[80];
  27 };
  28 
  29 static refcount_t ghes_refcount = REFCOUNT_INIT(0);     /* successful registrations still active */
  30 
  31 /*
  32  * Access to ghes_pvt must be protected by ghes_lock. The spinlock
  33  * also provides the necessary (implicit) memory barrier for the SMP
  34  * case to make the pointer visible on another CPU.
      *
      * The pointer is NULL while no memory controller is registered;
      * ghes_edac_report_mem_error() silently drops events in that state.
  35  */
  36 static struct ghes_edac_pvt *ghes_pvt;
  37 
  38 /*
      * GHES registration mutex: serializes ghes_edac_register() and
      * ghes_edac_unregister(), and with them every update of ghes_refcount.
      */
  39 static DEFINE_MUTEX(ghes_reg_mutex);
  40 
  41 /*
  42  * Sync with other, potentially concurrent callers of
  43  * ghes_edac_report_mem_error(). We don't know what the
  44  * "inventive" firmware would do.
  45  */
  46 static DEFINE_SPINLOCK(ghes_lock);
  47 
  48 /*
      * "ghes_edac.force_load=1" skips the platform check, i.e. the plat_list
      * match that ghes_edac_register() performs when CONFIG_X86 is enabled.
      */
  49 static bool __read_mostly force_load;
  50 module_param(force_load, bool, 0);
  51 
  52 /*
      * Memory Device - Type 17 of SMBIOS spec
      *
      * ghes_edac_dmidecode() casts the raw DMI record to this structure, so it
      * is packed and the order and width of the fields must not be changed.
      * The hex values below are the resulting byte offsets into the record.
      */
  53 struct memdev_dmi_entry {
  54         u8 type;                        /* 0x00 */
  55         u8 length;                      /* 0x01 */
  56         u16 handle;                     /* 0x02, saved as the DIMM's smbios_handle */
  57         u16 phys_mem_array_handle;      /* 0x04 */
  58         u16 mem_err_info_handle;        /* 0x06 */
  59         u16 total_width;                /* 0x08, compared with data_width to guess ECC */
  60         u16 data_width;                 /* 0x0a */
  61         u16 size;                       /* 0x0c, 0xffff: unknown, 0x7fff: see extended_size */
  62         u8 form_factor;                 /* 0x0e */
  63         u8 device_set;                  /* 0x0f */
  64         u8 device_locator;              /* 0x10 */
  65         u8 bank_locator;                /* 0x11 */
  66         u8 memory_type;                 /* 0x12, selects the EDAC memory type */
  67         u16 type_detail;                /* 0x13, bit flags refining memory_type */
  68         u16 speed;                      /* 0x15 */
  69         u8 manufacturer;                /* 0x17 */
  70         u8 serial_number;               /* 0x18 */
  71         u8 asset_tag;                   /* 0x19 */
  72         u8 part_number;                 /* 0x1a */
  73         u8 attributes;                  /* 0x1b */
  74         u32 extended_size;              /* 0x1c, size in MiB when size == 0x7fff */
  75         u16 conf_mem_clk_speed;         /* 0x20 */
  76 } __attribute__((__packed__));
  77 
  78 struct ghes_edac_dimm_fill {            /* dmi_walk() argument for ghes_edac_dmidecode() */
  79         struct mem_ctl_info *mci;       /* controller whose DIMMs are being filled in */
  80         unsigned int count;             /* index of the next DIMM to fill, bumped per record */
  81 };
  82 
  83 static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
  84 {
  85         int *num_dimm = arg;
  86 
  87         if (dh->type == DMI_ENTRY_MEM_DEVICE)
  88                 (*num_dimm)++;
  89 }
  90 
  91 static int get_dimm_smbios_index(struct mem_ctl_info *mci, u16 handle)
  92 {
  93         int i;
  94 
  95         for (i = 0; i < mci->tot_dimms; i++) {
  96                 if (mci->dimms[i]->smbios_handle == handle)
  97                         return i;
  98         }
  99         return -1;
 100 }
 101 
 102 static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
 103 {
 104         struct ghes_edac_dimm_fill *dimm_fill = arg;
 105         struct mem_ctl_info *mci = dimm_fill->mci;
 106 
 107         if (dh->type == DMI_ENTRY_MEM_DEVICE) {
 108                 struct memdev_dmi_entry *entry = (struct memdev_dmi_entry *)dh;
 109                 struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
 110                                                        mci->n_layers,
 111                                                        dimm_fill->count, 0, 0);
 112                 u16 rdr_mask = BIT(7) | BIT(13);
 113 
 114                 if (entry->size == 0xffff) {
 115                         pr_info("Can't get DIMM%i size\n",
 116                                 dimm_fill->count);
 117                         dimm->nr_pages = MiB_TO_PAGES(32);/* Unknown */
 118                 } else if (entry->size == 0x7fff) {
 119                         dimm->nr_pages = MiB_TO_PAGES(entry->extended_size);
 120                 } else {
 121                         if (entry->size & BIT(15))
 122                                 dimm->nr_pages = MiB_TO_PAGES((entry->size & 0x7fff) << 10);
 123                         else
 124                                 dimm->nr_pages = MiB_TO_PAGES(entry->size);
 125                 }
 126 
 127                 switch (entry->memory_type) {
 128                 case 0x12:
 129                         if (entry->type_detail & BIT(13))
 130                                 dimm->mtype = MEM_RDDR;
 131                         else
 132                                 dimm->mtype = MEM_DDR;
 133                         break;
 134                 case 0x13:
 135                         if (entry->type_detail & BIT(13))
 136                                 dimm->mtype = MEM_RDDR2;
 137                         else
 138                                 dimm->mtype = MEM_DDR2;
 139                         break;
 140                 case 0x14:
 141                         dimm->mtype = MEM_FB_DDR2;
 142                         break;
 143                 case 0x18:
 144                         if (entry->type_detail & BIT(12))
 145                                 dimm->mtype = MEM_NVDIMM;
 146                         else if (entry->type_detail & BIT(13))
 147                                 dimm->mtype = MEM_RDDR3;
 148                         else
 149                                 dimm->mtype = MEM_DDR3;
 150                         break;
 151                 case 0x1a:
 152                         if (entry->type_detail & BIT(12))
 153                                 dimm->mtype = MEM_NVDIMM;
 154                         else if (entry->type_detail & BIT(13))
 155                                 dimm->mtype = MEM_RDDR4;
 156                         else
 157                                 dimm->mtype = MEM_DDR4;
 158                         break;
 159                 default:
 160                         if (entry->type_detail & BIT(6))
 161                                 dimm->mtype = MEM_RMBS;
 162                         else if ((entry->type_detail & rdr_mask) == rdr_mask)
 163                                 dimm->mtype = MEM_RDR;
 164                         else if (entry->type_detail & BIT(7))
 165                                 dimm->mtype = MEM_SDR;
 166                         else if (entry->type_detail & BIT(9))
 167                                 dimm->mtype = MEM_EDO;
 168                         else
 169                                 dimm->mtype = MEM_UNKNOWN;
 170                 }
 171 
 172                 /*
 173                  * Actually, we can only detect if the memory has bits for
 174                  * checksum or not
 175                  */
 176                 if (entry->total_width == entry->data_width)
 177                         dimm->edac_mode = EDAC_NONE;
 178                 else
 179                         dimm->edac_mode = EDAC_SECDED;
 180 
 181                 dimm->dtype = DEV_UNKNOWN;
 182                 dimm->grain = 128;              /* Likely, worse case */
 183 
 184                 /*
 185                  * FIXME: It shouldn't be hard to also fill the DIMM labels
 186                  */
 187 
 188                 if (dimm->nr_pages) {
 189                         edac_dbg(1, "DIMM%i: %s size = %d MB%s\n",
 190                                 dimm_fill->count, edac_mem_types[dimm->mtype],
 191                                 PAGES_TO_MiB(dimm->nr_pages),
 192                                 (dimm->edac_mode != EDAC_NONE) ? "(ECC)" : "");
 193                         edac_dbg(2, "\ttype %d, detail 0x%02x, width %d(total %d)\n",
 194                                 entry->memory_type, entry->type_detail,
 195                                 entry->total_width, entry->data_width);
 196                 }
 197 
 198                 dimm->smbios_handle = entry->handle;
 199 
 200                 dimm_fill->count++;
 201         }
 202 }
 203 
 204 void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
 205 {
 206         enum hw_event_mc_err_type type;
 207         struct edac_raw_error_desc *e;
 208         struct mem_ctl_info *mci;
 209         struct ghes_edac_pvt *pvt;
 210         unsigned long flags;
 211         char *p;
 212         u8 grain_bits;
 213 
 214         /*
 215          * We can do the locking below because GHES defers error processing
 216          * from NMI to IRQ context. Whenever that changes, we'd at least
 217          * know.
 218          */
 219         if (WARN_ON_ONCE(in_nmi()))
 220                 return;
 221 
 222         spin_lock_irqsave(&ghes_lock, flags);
 223 
 224         pvt = ghes_pvt;
 225         if (!pvt)
 226                 goto unlock;
 227 
 228         mci = pvt->mci;
 229         e = &mci->error_desc;
 230 
 231         /* Cleans the error report buffer */
 232         memset(e, 0, sizeof (*e));
 233         e->error_count = 1;
 234         e->grain = 1;
 235         strcpy(e->label, "unknown label");
 236         e->msg = pvt->msg;
 237         e->other_detail = pvt->other_detail;
 238         e->top_layer = -1;
 239         e->mid_layer = -1;
 240         e->low_layer = -1;
 241         *pvt->other_detail = '\0';
 242         *pvt->msg = '\0';
 243 
 244         switch (sev) {
 245         case GHES_SEV_CORRECTED:
 246                 type = HW_EVENT_ERR_CORRECTED;
 247                 break;
 248         case GHES_SEV_RECOVERABLE:
 249                 type = HW_EVENT_ERR_UNCORRECTED;
 250                 break;
 251         case GHES_SEV_PANIC:
 252                 type = HW_EVENT_ERR_FATAL;
 253                 break;
 254         default:
 255         case GHES_SEV_NO:
 256                 type = HW_EVENT_ERR_INFO;
 257         }
 258 
 259         edac_dbg(1, "error validation_bits: 0x%08llx\n",
 260                  (long long)mem_err->validation_bits);
 261 
 262         /* Error type, mapped on e->msg */
 263         if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_TYPE) {
 264                 p = pvt->msg;
 265                 switch (mem_err->error_type) {
 266                 case 0:
 267                         p += sprintf(p, "Unknown");
 268                         break;
 269                 case 1:
 270                         p += sprintf(p, "No error");
 271                         break;
 272                 case 2:
 273                         p += sprintf(p, "Single-bit ECC");
 274                         break;
 275                 case 3:
 276                         p += sprintf(p, "Multi-bit ECC");
 277                         break;
 278                 case 4:
 279                         p += sprintf(p, "Single-symbol ChipKill ECC");
 280                         break;
 281                 case 5:
 282                         p += sprintf(p, "Multi-symbol ChipKill ECC");
 283                         break;
 284                 case 6:
 285                         p += sprintf(p, "Master abort");
 286                         break;
 287                 case 7:
 288                         p += sprintf(p, "Target abort");
 289                         break;
 290                 case 8:
 291                         p += sprintf(p, "Parity Error");
 292                         break;
 293                 case 9:
 294                         p += sprintf(p, "Watchdog timeout");
 295                         break;
 296                 case 10:
 297                         p += sprintf(p, "Invalid address");
 298                         break;
 299                 case 11:
 300                         p += sprintf(p, "Mirror Broken");
 301                         break;
 302                 case 12:
 303                         p += sprintf(p, "Memory Sparing");
 304                         break;
 305                 case 13:
 306                         p += sprintf(p, "Scrub corrected error");
 307                         break;
 308                 case 14:
 309                         p += sprintf(p, "Scrub uncorrected error");
 310                         break;
 311                 case 15:
 312                         p += sprintf(p, "Physical Memory Map-out event");
 313                         break;
 314                 default:
 315                         p += sprintf(p, "reserved error (%d)",
 316                                      mem_err->error_type);
 317                 }
 318         } else {
 319                 strcpy(pvt->msg, "unknown error");
 320         }
 321 
 322         /* Error address */
 323         if (mem_err->validation_bits & CPER_MEM_VALID_PA) {
 324                 e->page_frame_number = mem_err->physical_addr >> PAGE_SHIFT;
 325                 e->offset_in_page = mem_err->physical_addr & ~PAGE_MASK;
 326         }
 327 
 328         /* Error grain */
 329         if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK)
 330                 e->grain = ~mem_err->physical_addr_mask + 1;
 331 
 332         /* Memory error location, mapped on e->location */
 333         p = e->location;
 334         if (mem_err->validation_bits & CPER_MEM_VALID_NODE)
 335                 p += sprintf(p, "node:%d ", mem_err->node);
 336         if (mem_err->validation_bits & CPER_MEM_VALID_CARD)
 337                 p += sprintf(p, "card:%d ", mem_err->card);
 338         if (mem_err->validation_bits & CPER_MEM_VALID_MODULE)
 339                 p += sprintf(p, "module:%d ", mem_err->module);
 340         if (mem_err->validation_bits & CPER_MEM_VALID_RANK_NUMBER)
 341                 p += sprintf(p, "rank:%d ", mem_err->rank);
 342         if (mem_err->validation_bits & CPER_MEM_VALID_BANK)
 343                 p += sprintf(p, "bank:%d ", mem_err->bank);
 344         if (mem_err->validation_bits & CPER_MEM_VALID_ROW)
 345                 p += sprintf(p, "row:%d ", mem_err->row);
 346         if (mem_err->validation_bits & CPER_MEM_VALID_COLUMN)
 347                 p += sprintf(p, "col:%d ", mem_err->column);
 348         if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION)
 349                 p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
 350         if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
 351                 const char *bank = NULL, *device = NULL;
 352                 int index = -1;
 353 
 354                 dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device);
 355                 if (bank != NULL && device != NULL)
 356                         p += sprintf(p, "DIMM location:%s %s ", bank, device);
 357                 else
 358                         p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
 359                                      mem_err->mem_dev_handle);
 360 
 361                 index = get_dimm_smbios_index(mci, mem_err->mem_dev_handle);
 362                 if (index >= 0) {
 363                         e->top_layer = index;
 364                         e->enable_per_layer_report = true;
 365                 }
 366 
 367         }
 368         if (p > e->location)
 369                 *(p - 1) = '\0';
 370 
 371         /* All other fields are mapped on e->other_detail */
 372         p = pvt->other_detail;
 373         if (mem_err->validation_bits & CPER_MEM_VALID_ERROR_STATUS) {
 374                 u64 status = mem_err->error_status;
 375 
 376                 p += sprintf(p, "status(0x%016llx): ", (long long)status);
 377                 switch ((status >> 8) & 0xff) {
 378                 case 1:
 379                         p += sprintf(p, "Error detected internal to the component ");
 380                         break;
 381                 case 16:
 382                         p += sprintf(p, "Error detected in the bus ");
 383                         break;
 384                 case 4:
 385                         p += sprintf(p, "Storage error in DRAM memory ");
 386                         break;
 387                 case 5:
 388                         p += sprintf(p, "Storage error in TLB ");
 389                         break;
 390                 case 6:
 391                         p += sprintf(p, "Storage error in cache ");
 392                         break;
 393                 case 7:
 394                         p += sprintf(p, "Error in one or more functional units ");
 395                         break;
 396                 case 8:
 397                         p += sprintf(p, "component failed self test ");
 398                         break;
 399                 case 9:
 400                         p += sprintf(p, "Overflow or undervalue of internal queue ");
 401                         break;
 402                 case 17:
 403                         p += sprintf(p, "Virtual address not found on IO-TLB or IO-PDIR ");
 404                         break;
 405                 case 18:
 406                         p += sprintf(p, "Improper access error ");
 407                         break;
 408                 case 19:
 409                         p += sprintf(p, "Access to a memory address which is not mapped to any component ");
 410                         break;
 411                 case 20:
 412                         p += sprintf(p, "Loss of Lockstep ");
 413                         break;
 414                 case 21:
 415                         p += sprintf(p, "Response not associated with a request ");
 416                         break;
 417                 case 22:
 418                         p += sprintf(p, "Bus parity error - must also set the A, C, or D Bits ");
 419                         break;
 420                 case 23:
 421                         p += sprintf(p, "Detection of a PATH_ERROR ");
 422                         break;
 423                 case 25:
 424                         p += sprintf(p, "Bus operation timeout ");
 425                         break;
 426                 case 26:
 427                         p += sprintf(p, "A read was issued to data that has been poisoned ");
 428                         break;
 429                 default:
 430                         p += sprintf(p, "reserved ");
 431                         break;
 432                 }
 433         }
 434         if (mem_err->validation_bits & CPER_MEM_VALID_REQUESTOR_ID)
 435                 p += sprintf(p, "requestorID: 0x%016llx ",
 436                              (long long)mem_err->requestor_id);
 437         if (mem_err->validation_bits & CPER_MEM_VALID_RESPONDER_ID)
 438                 p += sprintf(p, "responderID: 0x%016llx ",
 439                              (long long)mem_err->responder_id);
 440         if (mem_err->validation_bits & CPER_MEM_VALID_TARGET_ID)
 441                 p += sprintf(p, "targetID: 0x%016llx ",
 442                              (long long)mem_err->responder_id);
 443         if (p > pvt->other_detail)
 444                 *(p - 1) = '\0';
 445 
 446         /* Sanity-check driver-supplied grain value. */
 447         if (WARN_ON_ONCE(!e->grain))
 448                 e->grain = 1;
 449 
 450         grain_bits = fls_long(e->grain - 1);
 451 
 452         /* Generate the trace event */
 453         snprintf(pvt->detail_location, sizeof(pvt->detail_location),
 454                  "APEI location: %s %s", e->location, e->other_detail);
 455         trace_mc_event(type, e->msg, e->label, e->error_count,
 456                        mci->mc_idx, e->top_layer, e->mid_layer, e->low_layer,
 457                        (e->page_frame_number << PAGE_SHIFT) | e->offset_in_page,
 458                        grain_bits, e->syndrome, pvt->detail_location);
 459 
 460         edac_raw_mc_handle_error(type, mci, e);
 461 
 462 unlock:
 463         spin_unlock_irqrestore(&ghes_lock, flags);
 464 }
 465 
 466 /*
 467  * Known systems that are safe to enable this module.
      *
      * When CONFIG_X86 is enabled, ghes_edac_register() matches the running
      * system against this list and returns -ENODEV on a miss unless the
      * force_load module parameter is set. The empty entry ends the list.
 468  */
 469 static struct acpi_platform_list plat_list[] = {
 470         {"HPE   ", "Server  ", 0, ACPI_SIG_FADT, all_versions},
 471         { } /* End */
 472 };
 473 
 474 int ghes_edac_register(struct ghes *ghes, struct device *dev)
 475 {
 476         bool fake = false;
 477         int rc = 0, num_dimm = 0;
 478         struct mem_ctl_info *mci;
 479         struct ghes_edac_pvt *pvt;
 480         struct edac_mc_layer layers[1];
 481         struct ghes_edac_dimm_fill dimm_fill;
 482         unsigned long flags;
 483         int idx = -1;
 484 
 485         if (IS_ENABLED(CONFIG_X86)) {
 486                 /* Check if safe to enable on this system */
 487                 idx = acpi_match_platform_list(plat_list);
 488                 if (!force_load && idx < 0)
 489                         return -ENODEV;
 490         } else {
 491                 idx = 0;
 492         }
 493 
 494         /* finish another registration/unregistration instance first */
 495         mutex_lock(&ghes_reg_mutex);
 496 
 497         /*
 498          * We have only one logical memory controller to which all DIMMs belong.
 499          */
 500         if (refcount_inc_not_zero(&ghes_refcount))
 501                 goto unlock;
 502 
 503         /* Get the number of DIMMs */
 504         dmi_walk(ghes_edac_count_dimms, &num_dimm);
 505 
 506         /* Check if we've got a bogus BIOS */
 507         if (num_dimm == 0) {
 508                 fake = true;
 509                 num_dimm = 1;
 510         }
 511 
 512         layers[0].type = EDAC_MC_LAYER_ALL_MEM;
 513         layers[0].size = num_dimm;
 514         layers[0].is_virt_csrow = true;
 515 
 516         mci = edac_mc_alloc(0, ARRAY_SIZE(layers), layers, sizeof(struct ghes_edac_pvt));
 517         if (!mci) {
 518                 pr_info("Can't allocate memory for EDAC data\n");
 519                 rc = -ENOMEM;
 520                 goto unlock;
 521         }
 522 
 523         pvt             = mci->pvt_info;
 524         pvt->ghes       = ghes;
 525         pvt->mci        = mci;
 526 
 527         mci->pdev = dev;
 528         mci->mtype_cap = MEM_FLAG_EMPTY;
 529         mci->edac_ctl_cap = EDAC_FLAG_NONE;
 530         mci->edac_cap = EDAC_FLAG_NONE;
 531         mci->mod_name = "ghes_edac.c";
 532         mci->ctl_name = "ghes_edac";
 533         mci->dev_name = "ghes";
 534 
 535         if (fake) {
 536                 pr_info("This system has a very crappy BIOS: It doesn't even list the DIMMS.\n");
 537                 pr_info("Its SMBIOS info is wrong. It is doubtful that the error report would\n");
 538                 pr_info("work on such system. Use this driver with caution\n");
 539         } else if (idx < 0) {
 540                 pr_info("This EDAC driver relies on BIOS to enumerate memory and get error reports.\n");
 541                 pr_info("Unfortunately, not all BIOSes reflect the memory layout correctly.\n");
 542                 pr_info("So, the end result of using this driver varies from vendor to vendor.\n");
 543                 pr_info("If you find incorrect reports, please contact your hardware vendor\n");
 544                 pr_info("to correct its BIOS.\n");
 545                 pr_info("This system has %d DIMM sockets.\n", num_dimm);
 546         }
 547 
 548         if (!fake) {
 549                 dimm_fill.count = 0;
 550                 dimm_fill.mci = mci;
 551                 dmi_walk(ghes_edac_dmidecode, &dimm_fill);
 552         } else {
 553                 struct dimm_info *dimm = EDAC_DIMM_PTR(mci->layers, mci->dimms,
 554                                                        mci->n_layers, 0, 0, 0);
 555 
 556                 dimm->nr_pages = 1;
 557                 dimm->grain = 128;
 558                 dimm->mtype = MEM_UNKNOWN;
 559                 dimm->dtype = DEV_UNKNOWN;
 560                 dimm->edac_mode = EDAC_SECDED;
 561         }
 562 
 563         rc = edac_mc_add_mc(mci);
 564         if (rc < 0) {
 565                 pr_info("Can't register at EDAC core\n");
 566                 edac_mc_free(mci);
 567                 rc = -ENODEV;
 568                 goto unlock;
 569         }
 570 
 571         spin_lock_irqsave(&ghes_lock, flags);
 572         ghes_pvt = pvt;
 573         spin_unlock_irqrestore(&ghes_lock, flags);
 574 
 575         /* only set on success */
 576         refcount_set(&ghes_refcount, 1);
 577 
 578 unlock:
 579         mutex_unlock(&ghes_reg_mutex);
 580 
 581         return rc;
 582 }
 583 
 584 void ghes_edac_unregister(struct ghes *ghes)
 585 {
 586         struct mem_ctl_info *mci;
 587         unsigned long flags;
 588 
 589         mutex_lock(&ghes_reg_mutex);
 590 
 591         if (!refcount_dec_and_test(&ghes_refcount))
 592                 goto unlock;
 593 
 594         /*
 595          * Wait for the irq handler being finished.
 596          */
 597         spin_lock_irqsave(&ghes_lock, flags);
 598         mci = ghes_pvt ? ghes_pvt->mci : NULL;
 599         ghes_pvt = NULL;
 600         spin_unlock_irqrestore(&ghes_lock, flags);
 601 
 602         if (!mci)
 603                 goto unlock;
 604 
 605         mci = edac_mc_del_mc(mci->pdev);
 606         if (mci)
 607                 edac_mc_free(mci);
 608 
 609 unlock:
 610         mutex_unlock(&ghes_reg_mutex);
 611 }

/* [<][>][^][v][top][bottom][index][help] */