root/drivers/edac/mce_amd.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. amd_report_gart_errors
  2. amd_register_ecc_decoder
  3. amd_unregister_ecc_decoder
  4. f12h_mc0_mce
  5. f10h_mc0_mce
  6. k8_mc0_mce
  7. cat_mc0_mce
  8. f15h_mc0_mce
  9. decode_mc0_mce
  10. k8_mc1_mce
  11. cat_mc1_mce
  12. f15h_mc1_mce
  13. decode_mc1_mce
  14. k8_mc2_mce
  15. f15h_mc2_mce
  16. f16h_mc2_mce
  17. decode_mc2_mce
  18. decode_mc3_mce
  19. decode_mc4_mce
  20. decode_mc5_mce
  21. decode_mc6_mce
  22. decode_smca_error
  23. amd_decode_err_code
  24. ignore_mce
  25. decode_error_status
  26. amd_decode_mce
  27. mce_amd_init
  28. mce_amd_exit

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 #include <linux/module.h>
   3 #include <linux/slab.h>
   4 
   5 #include <asm/cpu.h>
   6 
   7 #include "mce_amd.h"
   8 
   /* Family-specific decode callbacks; mc0_mce/mc1_mce/mc2_mce are invoked through this pointer. */
   9 static struct amd_decoder_ops *fam_ops;
  10 
  /* Mask handed to XEC() when extracting the extended error code from m->status. */
  11 static u8 xec_mask       = 0xf;
  12 
  /* Written by amd_report_gart_errors(); its reader is not visible in this chunk. */
  13 static bool report_gart_errors;
  /* DRAM ECC decoder installed by amd_register_ecc_decoder(); NULL while none is registered. */
  14 static void (*decode_dram_ecc)(int node_id, struct mce *m);
  15 
  /*
   * Store @v in the file-level report_gart_errors flag.
   * The code that consumes the flag is not visible in this chunk.
   */
  16 void amd_report_gart_errors(bool v)
  17 {
  18         report_gart_errors = v;
  19 }
  20 EXPORT_SYMBOL_GPL(amd_report_gart_errors);
  21 
  /*
   * Install @f as the DRAM ECC decoder. decode_mc4_mce() calls it with the
   * node id and the MCE record when xec is 0x0 or 0x8. A previously
   * installed decoder is overwritten without any check.
   */
  22 void amd_register_ecc_decoder(void (*f)(int, struct mce *))
  23 {
  24         decode_dram_ecc = f;
  25 }
  26 EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
  27 
  28 void amd_unregister_ecc_decoder(void (*f)(int, struct mce *))
  29 {
  30         if (decode_dram_ecc) {
  31                 WARN_ON(decode_dram_ecc != f);
  32 
  33                 decode_dram_ecc = NULL;
  34         }
  35 }
  36 EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
  37 
  38 /*
  39  * string representation for the different MCA reported error types, see F3x48
  40  * or MSR0000_0411.
  41  */
  42 
  43 /* transaction type (4 entries) */
  44 static const char * const tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
  45 
  46 /* cache level (4 entries; index 0 is reserved) */
  47 static const char * const ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
  48 
  49 /* memory transaction type (9 named encodings) */
  50 static const char * const rrrr_msgs[] = {
  51        "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
  52 };
  53 
  54 /* participating processor; non-static and exported so other modules can use it */
  55 const char * const pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
  56 EXPORT_SYMBOL_GPL(pp_msgs);
  57 
  58 /* request timeout (index 0: no timeout, index 1: timed out) */
  59 static const char * const to_msgs[] = { "no timeout", "timed out" };
  60 
  61 /* memory or i/o (index 1 is reserved) */
  62 static const char * const ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
  63 
  64 /* internal error type; only index 2 ("HWA") is not reserved */
  65 static const char * const uu_msgs[] = { "RESV", "RESV", "HWA", "RESV" };
  66 
  /*
   * MC1 error descriptions used by f15h_mc1_mce(). The table is packed, so
   * the index is not always xec: xec 0x0-0xa index directly, xec 0xd maps to
   * [xec - 2], and xec 0x10-0x15 map to [xec - 4].
   */
  67 static const char * const f15h_mc1_mce_desc[] = {
  68         "UC during a demand linefill from L2",
  69         "Parity error during data load from IC",
  70         "Parity error for IC valid bit",
  71         "Main tag parity error",
  72         "Parity error in prediction queue",
  73         "PFB data/address parity error",
  74         "Parity error in the branch status reg",
  75         "PFB promotion address error",
  76         "Tag error during probe/victimization",
  77         "Parity error for IC probe tag valid bit",
  78         "PFB non-cacheable bit parity error",
  79         "PFB valid bit parity error",                   /* xec = 0xd */
  80         "Microcode Patch Buffer",                       /* xec = 0x10 */
  81         "uop queue",
  82         "insn buffer",
  83         "predecode buffer",
  84         "fetch address FIFO",
  85         "dispatch uop queue"
  86 };
  87 
  /*
   * MC2 memory-error descriptions used by f15h_mc2_mce(). Packed table:
   * xec 0x4-0xc map to [xec - 0x4] and xec 0x10-0x14 map to [xec - 0x7].
   */
  88 static const char * const f15h_mc2_mce_desc[] = {
  89         "Fill ECC error on data fills",                 /* xec = 0x4 */
  90         "Fill parity error on insn fills",
  91         "Prefetcher request FIFO parity error",
  92         "PRQ address parity error",
  93         "PRQ data parity error",
  94         "WCC Tag ECC error",
  95         "WCC Data ECC error",
  96         "WCB Data parity error",
  97         "VB Data ECC or parity error",
  98         "L2 Tag ECC error",                             /* xec = 0x10 */
  99         "Hard L2 Tag ECC error",
 100         "Multiple hits on L2 tag",
 101         "XAB parity error",
 102         "PRB address parity error"
 103 };
 104 
 /*
  * MC4 error descriptions used by decode_mc4_mce(). xec 0x0-0xe index
  * directly; xec 0x1c-0x1f map to [xec - 13]. xec 0xf and 0x19 are decoded
  * without this table.
  */
 105 static const char * const mc4_mce_desc[] = {
 106         "DRAM ECC error detected on the NB",
 107         "CRC error detected on HT link",
 108         "Link-defined sync error packets detected on HT link",
 109         "HT Master abort",
 110         "HT Target abort",
 111         "Invalid GART PTE entry during GART table walk",
 112         "Unsupported atomic RMW received from an IO link",
 113         "Watchdog timeout due to lack of progress",
 114         "DRAM ECC error detected on the NB",
 115         "SVM DMA Exclusion Vector error",
 116         "HT data error detected on link",
 117         "Protocol error (link, L3, probe filter)",
 118         "NB internal arrays parity error",
 119         "DRAM addr/ctl signals parity error",
 120         "IO link transmission error",
 121         "L3 data cache ECC error",                      /* xec = 0x1c */
 122         "L3 cache tag error",
 123         "L3 LRU parity bits error",
 124         "ECC Error in the Probe Filter directory"
 125 };
 126 
 /*
  * MC5 error descriptions (14 entries). Presumably consumed by
  * decode_mc5_mce(); the lookup and its index mapping are not visible in
  * this chunk - TODO confirm.
  */
 127 static const char * const mc5_mce_desc[] = {
 128         "CPU Watchdog timer expire",
 129         "Wakeup array dest tag",
 130         "AG payload array",
 131         "EX payload array",
 132         "IDRF array",
 133         "Retire dispatch queue",
 134         "Mapper checkpoint array",
 135         "Physical register file EX0 port",
 136         "Physical register file EX1 port",
 137         "Physical register file AG0 port",
 138         "Physical register file AG1 port",
 139         "Flag register file",
 140         "DE error occurred",
 141         "Retire status queue"
 142 };
 143 
 /*
  * MC6 error descriptions (6 entries). Presumably consumed by
  * decode_mc6_mce(), which is not visible in this chunk - TODO confirm.
  */
 144 static const char * const mc6_mce_desc[] = {
 145         "Hardware Assertion",
 146         "Free List",
 147         "Physical Register File",
 148         "Retire Queue",
 149         "Scheduler table",
 150         "Status Register File",
 151 };
 152 
 153 /* Scalable MCA error strings */
 /*
  * Error descriptions for SMCA_LS banks; published with its length through
  * smca_mce_descs[SMCA_LS].
  * NOTE(review): "DC Tag error type 5" appears at both index 4 and index 19 -
  * confirm against the hardware documentation whether one of them is a typo.
  */
 154 static const char * const smca_ls_mce_desc[] = {
 155         "Load queue parity error",
 156         "Store queue parity error",
 157         "Miss address buffer payload parity error",
 158         "Level 1 TLB parity error",
 159         "DC Tag error type 5",
 160         "DC Tag error type 6",
 161         "DC Tag error type 1",
 162         "Internal error type 1",
 163         "Internal error type 2",
 164         "System Read Data Error Thread 0",
 165         "System Read Data Error Thread 1",
 166         "DC Tag error type 2",
 167         "DC Data error type 1 and poison consumption",
 168         "DC Data error type 2",
 169         "DC Data error type 3",
 170         "DC Tag error type 4",
 171         "Level 2 TLB parity error",
 172         "PDC parity error",
 173         "DC Tag error type 3",
 174         "DC Tag error type 5",
 175         "L2 Fill Data error",
 176 };
 177 
 /* Error descriptions for SMCA_IF banks; published through smca_mce_descs[SMCA_IF]. */
 178 static const char * const smca_if_mce_desc[] = {
 179         "Op Cache Microtag Probe Port Parity Error",
 180         "IC Microtag or Full Tag Multi-hit Error",
 181         "IC Full Tag Parity Error",
 182         "IC Data Array Parity Error",
 183         "Decoupling Queue PhysAddr Parity Error",
 184         "L0 ITLB Parity Error",
 185         "L1 ITLB Parity Error",
 186         "L2 ITLB Parity Error",
 187         "BPQ Thread 0 Snoop Parity Error",
 188         "BPQ Thread 1 Snoop Parity Error",
 189         "L1 BTB Multi-Match Error",
 190         "L2 BTB Multi-Match Error",
 191         "L2 Cache Response Poison Error",
 192         "System Read Data Error",
 193 };
 194 
 /* Error descriptions for SMCA_L2_CACHE banks; published through smca_mce_descs[SMCA_L2_CACHE]. */
 195 static const char * const smca_l2_mce_desc[] = {
 196         "L2M Tag Multiple-Way-Hit error",
 197         "L2M Tag or State Array ECC Error",
 198         "L2M Data Array ECC Error",
 199         "Hardware Assert Error",
 200 };
 201 
 /* Error descriptions for SMCA_DE banks; published through smca_mce_descs[SMCA_DE]. */
 202 static const char * const smca_de_mce_desc[] = {
 203         "Micro-op cache tag parity error",
 204         "Micro-op cache data parity error",
 205         "Instruction buffer parity error",
 206         "Micro-op queue parity error",
 207         "Instruction dispatch queue parity error",
 208         "Fetch address FIFO parity error",
 209         "Patch RAM data parity error",
 210         "Patch RAM sequencer parity error",
 211         "Micro-op buffer parity error"
 212 };
 213 
 /* Error descriptions for SMCA_EX banks; published through smca_mce_descs[SMCA_EX]. */
 214 static const char * const smca_ex_mce_desc[] = {
 215         "Watchdog Timeout error",
 216         "Physical register file parity error",
 217         "Flag register file parity error",
 218         "Immediate displacement register file parity error",
 219         "Address generator payload parity error",
 220         "EX payload parity error",
 221         "Checkpoint queue parity error",
 222         "Retire dispatch queue parity error",
 223         "Retire status queue parity error",
 224         "Scheduling queue parity error",
 225         "Branch buffer queue parity error",
 226         "Hardware Assertion error",
 227 };
 228 
 /* Error descriptions for SMCA_FP banks; published through smca_mce_descs[SMCA_FP]. */
 229 static const char * const smca_fp_mce_desc[] = {
 230         "Physical register file (PRF) parity error",
 231         "Freelist (FL) parity error",
 232         "Schedule queue parity error",
 233         "NSQ parity error",
 234         "Retire queue (RQ) parity error",
 235         "Status register file (SRF) parity error",
 236         "Hardware assertion",
 237 };
 238 
 /* Error descriptions for SMCA_L3_CACHE banks; published through smca_mce_descs[SMCA_L3_CACHE]. */
 239 static const char * const smca_l3_mce_desc[] = {
 240         "Shadow Tag Macro ECC Error",
 241         "Shadow Tag Macro Multi-way-hit Error",
 242         "L3M Tag ECC Error",
 243         "L3M Tag Multi-way-hit Error",
 244         "L3M Data ECC Error",
 245         "SDP Parity Error or SystemReadDataError from XI",
 246         "L3 Victim Queue Parity Error",
 247         "L3 Hardware Assertion",
 248 };
 249 
 /*
  * Error descriptions for SMCA_CS banks; published through
  * smca_mce_descs[SMCA_CS]. The first eight entries match smca_cs2_mce_desc.
  */
 250 static const char * const smca_cs_mce_desc[] = {
 251         "Illegal Request",
 252         "Address Violation",
 253         "Security Violation",
 254         "Illegal Response",
 255         "Unexpected Response",
 256         "Request or Probe Parity Error",
 257         "Read Response Parity Error",
 258         "Atomic Request Parity Error",
 259         "Probe Filter ECC Error",
 260 };
 261 
 /*
  * Error descriptions for SMCA_CS_V2 banks; published through
  * smca_mce_descs[SMCA_CS_V2]. Entries from index 8 onward differ from
  * smca_cs_mce_desc.
  */
 262 static const char * const smca_cs2_mce_desc[] = {
 263         "Illegal Request",
 264         "Address Violation",
 265         "Security Violation",
 266         "Illegal Response",
 267         "Unexpected Response",
 268         "Request or Probe Parity Error",
 269         "Read Response Parity Error",
 270         "Atomic Request Parity Error",
 271         "SDP read response had no match in the CS queue",
 272         "Probe Filter Protocol Error",
 273         "Probe Filter ECC Error",
 274         "SDP read response had an unexpected RETRY error",
 275         "Counter overflow error",
 276         "Counter underflow error",
 277 };
 278 
 /* Error descriptions for SMCA_PIE banks; published through smca_mce_descs[SMCA_PIE]. */
 279 static const char * const smca_pie_mce_desc[] = {
 280         "Hardware Assert",
 281         "Register security violation",
 282         "Link Error",
 283         "Poison data consumption",
 284         "A deferred error was detected in the DF"
 285 };
 286 
 /* Error descriptions for SMCA_UMC banks; published through smca_mce_descs[SMCA_UMC]. */
 287 static const char * const smca_umc_mce_desc[] = {
 288         "DRAM ECC error",
 289         "Data poison error",
 290         "SDP parity error",
 291         "Advanced peripheral bus error",
 292         "Address/Command parity error",
 293         "Write data CRC error",
 294         "DCQ SRAM ECC error",
 295         "AES SRAM ECC error",
 296 };
 297 
 /* Single error description for SMCA_PB banks; published through smca_mce_descs[SMCA_PB]. */
 298 static const char * const smca_pb_mce_desc[] = {
 299         "An ECC error in the Parameter Block RAM array",
 300 };
 301 
 /* Single error description for SMCA_PSP banks; published through smca_mce_descs[SMCA_PSP]. */
 302 static const char * const smca_psp_mce_desc[] = {
 303         "An ECC or parity error in a PSP RAM instance",
 304 };
 305 
 /* Error descriptions for SMCA_PSP_V2 banks; published through smca_mce_descs[SMCA_PSP_V2]. */
 306 static const char * const smca_psp2_mce_desc[] = {
 307         "High SRAM ECC or parity error",
 308         "Low SRAM ECC or parity error",
 309         "Instruction Cache Bank 0 ECC or parity error",
 310         "Instruction Cache Bank 1 ECC or parity error",
 311         "Instruction Tag Ram 0 parity error",
 312         "Instruction Tag Ram 1 parity error",
 313         "Data Cache Bank 0 ECC or parity error",
 314         "Data Cache Bank 1 ECC or parity error",
 315         "Data Cache Bank 2 ECC or parity error",
 316         "Data Cache Bank 3 ECC or parity error",
 317         "Data Tag Bank 0 parity error",
 318         "Data Tag Bank 1 parity error",
 319         "Data Tag Bank 2 parity error",
 320         "Data Tag Bank 3 parity error",
 321         "Dirty Data Ram parity error",
 322         "TLB Bank 0 parity error",
 323         "TLB Bank 1 parity error",
 324         "System Hub Read Buffer ECC or parity error",
 325 };
 326 
 /* Single error description for SMCA_SMU banks; published through smca_mce_descs[SMCA_SMU]. */
 327 static const char * const smca_smu_mce_desc[] = {
 328         "An ECC or parity error in an SMU RAM instance",
 329 };
 330 
 /*
  * Error descriptions for SMCA_SMU_V2 banks; published through
  * smca_mce_descs[SMCA_SMU_V2]. Same as smca_mp5_mce_desc plus one trailing
  * "System Hub Read Buffer" entry.
  */
 331 static const char * const smca_smu2_mce_desc[] = {
 332         "High SRAM ECC or parity error",
 333         "Low SRAM ECC or parity error",
 334         "Data Cache Bank A ECC or parity error",
 335         "Data Cache Bank B ECC or parity error",
 336         "Data Tag Cache Bank A ECC or parity error",
 337         "Data Tag Cache Bank B ECC or parity error",
 338         "Instruction Cache Bank A ECC or parity error",
 339         "Instruction Cache Bank B ECC or parity error",
 340         "Instruction Tag Cache Bank A ECC or parity error",
 341         "Instruction Tag Cache Bank B ECC or parity error",
 342         "System Hub Read Buffer ECC or parity error",
 343 };
 344 
 /* Error descriptions for SMCA_MP5 banks; published through smca_mce_descs[SMCA_MP5]. */
 345 static const char * const smca_mp5_mce_desc[] = {
 346         "High SRAM ECC or parity error",
 347         "Low SRAM ECC or parity error",
 348         "Data Cache Bank A ECC or parity error",
 349         "Data Cache Bank B ECC or parity error",
 350         "Data Tag Cache Bank A ECC or parity error",
 351         "Data Tag Cache Bank B ECC or parity error",
 352         "Instruction Cache Bank A ECC or parity error",
 353         "Instruction Cache Bank B ECC or parity error",
 354         "Instruction Tag Cache Bank A ECC or parity error",
 355         "Instruction Tag Cache Bank B ECC or parity error",
 356 };
 357 
 /* Error descriptions for SMCA_NBIO banks; published through smca_mce_descs[SMCA_NBIO]. */
 358 static const char * const smca_nbio_mce_desc[] = {
 359         "ECC or Parity error",
 360         "PCIE error",
 361         "SDP ErrEvent error",
 362         "SDP Egress Poison Error",
 363         "IOHC Internal Poison Error",
 364 };
 365 
 /* Error descriptions for SMCA_PCIE banks; published through smca_mce_descs[SMCA_PCIE]. */
 366 static const char * const smca_pcie_mce_desc[] = {
 367         "CCIX PER Message logging",
 368         "CCIX Read Response with Status: Non-Data Error",
 369         "CCIX Write Response with Status: Non-Data Error",
 370         "CCIX Read Response with Status: Data Error",
 371         "CCIX Non-okay write response with data error",
 372 };
 373 
 /* One description table for an SMCA bank type, paired with its entry count. */
 374 struct smca_mce_desc {
 375         const char * const *descs;     /* array of description strings */
 376         unsigned int num_descs;        /* number of entries in descs */
 377 };
 378 
 /*
  * Maps each SMCA bank type to its description table and that table's
  * length (taken with ARRAY_SIZE so the two cannot drift apart).
  * NOTE(review): nothing visible in this chunk writes to this table, so it
  * could likely be declared const - confirm against users further down.
  */
 379 static struct smca_mce_desc smca_mce_descs[] = {
 380         [SMCA_LS]       = { smca_ls_mce_desc,   ARRAY_SIZE(smca_ls_mce_desc)    },
 381         [SMCA_IF]       = { smca_if_mce_desc,   ARRAY_SIZE(smca_if_mce_desc)    },
 382         [SMCA_L2_CACHE] = { smca_l2_mce_desc,   ARRAY_SIZE(smca_l2_mce_desc)    },
 383         [SMCA_DE]       = { smca_de_mce_desc,   ARRAY_SIZE(smca_de_mce_desc)    },
 384         [SMCA_EX]       = { smca_ex_mce_desc,   ARRAY_SIZE(smca_ex_mce_desc)    },
 385         [SMCA_FP]       = { smca_fp_mce_desc,   ARRAY_SIZE(smca_fp_mce_desc)    },
 386         [SMCA_L3_CACHE] = { smca_l3_mce_desc,   ARRAY_SIZE(smca_l3_mce_desc)    },
 387         [SMCA_CS]       = { smca_cs_mce_desc,   ARRAY_SIZE(smca_cs_mce_desc)    },
 388         [SMCA_CS_V2]    = { smca_cs2_mce_desc,  ARRAY_SIZE(smca_cs2_mce_desc)   },
 389         [SMCA_PIE]      = { smca_pie_mce_desc,  ARRAY_SIZE(smca_pie_mce_desc)   },
 390         [SMCA_UMC]      = { smca_umc_mce_desc,  ARRAY_SIZE(smca_umc_mce_desc)   },
 391         [SMCA_PB]       = { smca_pb_mce_desc,   ARRAY_SIZE(smca_pb_mce_desc)    },
 392         [SMCA_PSP]      = { smca_psp_mce_desc,  ARRAY_SIZE(smca_psp_mce_desc)   },
 393         [SMCA_PSP_V2]   = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc)  },
 394         [SMCA_SMU]      = { smca_smu_mce_desc,  ARRAY_SIZE(smca_smu_mce_desc)   },
 395         [SMCA_SMU_V2]   = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc)  },
 396         [SMCA_MP5]      = { smca_mp5_mce_desc,  ARRAY_SIZE(smca_mp5_mce_desc)   },
 397         [SMCA_NBIO]     = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc)  },
 398         [SMCA_PCIE]     = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc)  },
 399 };
 400 
 401 static bool f12h_mc0_mce(u16 ec, u8 xec)
 402 {
 403         bool ret = false;
 404 
 405         if (MEM_ERROR(ec)) {
 406                 u8 ll = LL(ec);
 407                 ret = true;
 408 
 409                 if (ll == LL_L2)
 410                         pr_cont("during L1 linefill from L2.\n");
 411                 else if (ll == LL_L1)
 412                         pr_cont("Data/Tag %s error.\n", R4_MSG(ec));
 413                 else
 414                         ret = false;
 415         }
 416         return ret;
 417 }
 418 
 419 static bool f10h_mc0_mce(u16 ec, u8 xec)
 420 {
 421         if (R4(ec) == R4_GEN && LL(ec) == LL_L1) {
 422                 pr_cont("during data scrub.\n");
 423                 return true;
 424         }
 425         return f12h_mc0_mce(ec, xec);
 426 }
 427 
 428 static bool k8_mc0_mce(u16 ec, u8 xec)
 429 {
 430         if (BUS_ERROR(ec)) {
 431                 pr_cont("during system linefill.\n");
 432                 return true;
 433         }
 434 
 435         return f10h_mc0_mce(ec, xec);
 436 }
 437 
 438 static bool cat_mc0_mce(u16 ec, u8 xec)
 439 {
 440         u8 r4    = R4(ec);
 441         bool ret = true;
 442 
 443         if (MEM_ERROR(ec)) {
 444 
 445                 if (TT(ec) != TT_DATA || LL(ec) != LL_L1)
 446                         return false;
 447 
 448                 switch (r4) {
 449                 case R4_DRD:
 450                 case R4_DWR:
 451                         pr_cont("Data/Tag parity error due to %s.\n",
 452                                 (r4 == R4_DRD ? "load/hw prf" : "store"));
 453                         break;
 454                 case R4_EVICT:
 455                         pr_cont("Copyback parity error on a tag miss.\n");
 456                         break;
 457                 case R4_SNOOP:
 458                         pr_cont("Tag parity error during snoop.\n");
 459                         break;
 460                 default:
 461                         ret = false;
 462                 }
 463         } else if (BUS_ERROR(ec)) {
 464 
 465                 if ((II(ec) != II_MEM && II(ec) != II_IO) || LL(ec) != LL_LG)
 466                         return false;
 467 
 468                 pr_cont("System read data error on a ");
 469 
 470                 switch (r4) {
 471                 case R4_RD:
 472                         pr_cont("TLB reload.\n");
 473                         break;
 474                 case R4_DWR:
 475                         pr_cont("store.\n");
 476                         break;
 477                 case R4_DRD:
 478                         pr_cont("load.\n");
 479                         break;
 480                 default:
 481                         ret = false;
 482                 }
 483         } else {
 484                 ret = false;
 485         }
 486 
 487         return ret;
 488 }
 489 
 490 static bool f15h_mc0_mce(u16 ec, u8 xec)
 491 {
 492         bool ret = true;
 493 
 494         if (MEM_ERROR(ec)) {
 495 
 496                 switch (xec) {
 497                 case 0x0:
 498                         pr_cont("Data Array access error.\n");
 499                         break;
 500 
 501                 case 0x1:
 502                         pr_cont("UC error during a linefill from L2/NB.\n");
 503                         break;
 504 
 505                 case 0x2:
 506                 case 0x11:
 507                         pr_cont("STQ access error.\n");
 508                         break;
 509 
 510                 case 0x3:
 511                         pr_cont("SCB access error.\n");
 512                         break;
 513 
 514                 case 0x10:
 515                         pr_cont("Tag error.\n");
 516                         break;
 517 
 518                 case 0x12:
 519                         pr_cont("LDQ access error.\n");
 520                         break;
 521 
 522                 default:
 523                         ret = false;
 524                 }
 525         } else if (BUS_ERROR(ec)) {
 526 
 527                 if (!xec)
 528                         pr_cont("System Read Data Error.\n");
 529                 else
 530                         pr_cont(" Internal error condition type %d.\n", xec);
 531         } else if (INT_ERROR(ec)) {
 532                 if (xec <= 0x1f)
 533                         pr_cont("Hardware Assert.\n");
 534                 else
 535                         ret = false;
 536 
 537         } else
 538                 ret = false;
 539 
 540         return ret;
 541 }
 542 
 543 static void decode_mc0_mce(struct mce *m)
 544 {
 545         u16 ec = EC(m->status);
 546         u8 xec = XEC(m->status, xec_mask);
 547 
 548         pr_emerg(HW_ERR "MC0 Error: ");
 549 
 550         /* TLB error signatures are the same across families */
 551         if (TLB_ERROR(ec)) {
 552                 if (TT(ec) == TT_DATA) {
 553                         pr_cont("%s TLB %s.\n", LL_MSG(ec),
 554                                 ((xec == 2) ? "locked miss"
 555                                             : (xec ? "multimatch" : "parity")));
 556                         return;
 557                 }
 558         } else if (fam_ops->mc0_mce(ec, xec))
 559                 ;
 560         else
 561                 pr_emerg(HW_ERR "Corrupted MC0 MCE info?\n");
 562 }
 563 
 564 static bool k8_mc1_mce(u16 ec, u8 xec)
 565 {
 566         u8 ll    = LL(ec);
 567         bool ret = true;
 568 
 569         if (!MEM_ERROR(ec))
 570                 return false;
 571 
 572         if (ll == 0x2)
 573                 pr_cont("during a linefill from L2.\n");
 574         else if (ll == 0x1) {
 575                 switch (R4(ec)) {
 576                 case R4_IRD:
 577                         pr_cont("Parity error during data load.\n");
 578                         break;
 579 
 580                 case R4_EVICT:
 581                         pr_cont("Copyback Parity/Victim error.\n");
 582                         break;
 583 
 584                 case R4_SNOOP:
 585                         pr_cont("Tag Snoop error.\n");
 586                         break;
 587 
 588                 default:
 589                         ret = false;
 590                         break;
 591                 }
 592         } else
 593                 ret = false;
 594 
 595         return ret;
 596 }
 597 
 598 static bool cat_mc1_mce(u16 ec, u8 xec)
 599 {
 600         u8 r4    = R4(ec);
 601         bool ret = true;
 602 
 603         if (!MEM_ERROR(ec))
 604                 return false;
 605 
 606         if (TT(ec) != TT_INSTR)
 607                 return false;
 608 
 609         if (r4 == R4_IRD)
 610                 pr_cont("Data/tag array parity error for a tag hit.\n");
 611         else if (r4 == R4_SNOOP)
 612                 pr_cont("Tag error during snoop/victimization.\n");
 613         else if (xec == 0x0)
 614                 pr_cont("Tag parity error from victim castout.\n");
 615         else if (xec == 0x2)
 616                 pr_cont("Microcode patch RAM parity error.\n");
 617         else
 618                 ret = false;
 619 
 620         return ret;
 621 }
 622 
 623 static bool f15h_mc1_mce(u16 ec, u8 xec)
 624 {
 625         bool ret = true;
 626 
 627         if (!MEM_ERROR(ec))
 628                 return false;
 629 
 630         switch (xec) {
 631         case 0x0 ... 0xa:
 632                 pr_cont("%s.\n", f15h_mc1_mce_desc[xec]);
 633                 break;
 634 
 635         case 0xd:
 636                 pr_cont("%s.\n", f15h_mc1_mce_desc[xec-2]);
 637                 break;
 638 
 639         case 0x10:
 640                 pr_cont("%s.\n", f15h_mc1_mce_desc[xec-4]);
 641                 break;
 642 
 643         case 0x11 ... 0x15:
 644                 pr_cont("Decoder %s parity error.\n", f15h_mc1_mce_desc[xec-4]);
 645                 break;
 646 
 647         default:
 648                 ret = false;
 649         }
 650         return ret;
 651 }
 652 
 653 static void decode_mc1_mce(struct mce *m)
 654 {
 655         u16 ec = EC(m->status);
 656         u8 xec = XEC(m->status, xec_mask);
 657 
 658         pr_emerg(HW_ERR "MC1 Error: ");
 659 
 660         if (TLB_ERROR(ec))
 661                 pr_cont("%s TLB %s.\n", LL_MSG(ec),
 662                         (xec ? "multimatch" : "parity error"));
 663         else if (BUS_ERROR(ec)) {
 664                 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
 665 
 666                 pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
 667         } else if (INT_ERROR(ec)) {
 668                 if (xec <= 0x3f)
 669                         pr_cont("Hardware Assert.\n");
 670                 else
 671                         goto wrong_mc1_mce;
 672         } else if (fam_ops->mc1_mce(ec, xec))
 673                 ;
 674         else
 675                 goto wrong_mc1_mce;
 676 
 677         return;
 678 
 679 wrong_mc1_mce:
 680         pr_emerg(HW_ERR "Corrupted MC1 MCE info?\n");
 681 }
 682 
 683 static bool k8_mc2_mce(u16 ec, u8 xec)
 684 {
 685         bool ret = true;
 686 
 687         if (xec == 0x1)
 688                 pr_cont(" in the write data buffers.\n");
 689         else if (xec == 0x3)
 690                 pr_cont(" in the victim data buffers.\n");
 691         else if (xec == 0x2 && MEM_ERROR(ec))
 692                 pr_cont(": %s error in the L2 cache tags.\n", R4_MSG(ec));
 693         else if (xec == 0x0) {
 694                 if (TLB_ERROR(ec))
 695                         pr_cont("%s error in a Page Descriptor Cache or Guest TLB.\n",
 696                                 TT_MSG(ec));
 697                 else if (BUS_ERROR(ec))
 698                         pr_cont(": %s/ECC error in data read from NB: %s.\n",
 699                                 R4_MSG(ec), PP_MSG(ec));
 700                 else if (MEM_ERROR(ec)) {
 701                         u8 r4 = R4(ec);
 702 
 703                         if (r4 >= 0x7)
 704                                 pr_cont(": %s error during data copyback.\n",
 705                                         R4_MSG(ec));
 706                         else if (r4 <= 0x1)
 707                                 pr_cont(": %s parity/ECC error during data "
 708                                         "access from L2.\n", R4_MSG(ec));
 709                         else
 710                                 ret = false;
 711                 } else
 712                         ret = false;
 713         } else
 714                 ret = false;
 715 
 716         return ret;
 717 }
 718 
 719 static bool f15h_mc2_mce(u16 ec, u8 xec)
 720 {
 721         bool ret = true;
 722 
 723         if (TLB_ERROR(ec)) {
 724                 if (xec == 0x0)
 725                         pr_cont("Data parity TLB read error.\n");
 726                 else if (xec == 0x1)
 727                         pr_cont("Poison data provided for TLB fill.\n");
 728                 else
 729                         ret = false;
 730         } else if (BUS_ERROR(ec)) {
 731                 if (xec > 2)
 732                         ret = false;
 733 
 734                 pr_cont("Error during attempted NB data read.\n");
 735         } else if (MEM_ERROR(ec)) {
 736                 switch (xec) {
 737                 case 0x4 ... 0xc:
 738                         pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x4]);
 739                         break;
 740 
 741                 case 0x10 ... 0x14:
 742                         pr_cont("%s.\n", f15h_mc2_mce_desc[xec - 0x7]);
 743                         break;
 744 
 745                 default:
 746                         ret = false;
 747                 }
 748         } else if (INT_ERROR(ec)) {
 749                 if (xec <= 0x3f)
 750                         pr_cont("Hardware Assert.\n");
 751                 else
 752                         ret = false;
 753         }
 754 
 755         return ret;
 756 }
 757 
 758 static bool f16h_mc2_mce(u16 ec, u8 xec)
 759 {
 760         u8 r4 = R4(ec);
 761 
 762         if (!MEM_ERROR(ec))
 763                 return false;
 764 
 765         switch (xec) {
 766         case 0x04 ... 0x05:
 767                 pr_cont("%cBUFF parity error.\n", (r4 == R4_RD) ? 'I' : 'O');
 768                 break;
 769 
 770         case 0x09 ... 0x0b:
 771         case 0x0d ... 0x0f:
 772                 pr_cont("ECC error in L2 tag (%s).\n",
 773                         ((r4 == R4_GEN)   ? "BankReq" :
 774                         ((r4 == R4_SNOOP) ? "Prb"     : "Fill")));
 775                 break;
 776 
 777         case 0x10 ... 0x19:
 778         case 0x1b:
 779                 pr_cont("ECC error in L2 data array (%s).\n",
 780                         (((r4 == R4_RD) && !(xec & 0x3)) ? "Hit"  :
 781                         ((r4 == R4_GEN)   ? "Attr" :
 782                         ((r4 == R4_EVICT) ? "Vict" : "Fill"))));
 783                 break;
 784 
 785         case 0x1c ... 0x1d:
 786         case 0x1f:
 787                 pr_cont("Parity error in L2 attribute bits (%s).\n",
 788                         ((r4 == R4_RD)  ? "Hit"  :
 789                         ((r4 == R4_GEN) ? "Attr" : "Fill")));
 790                 break;
 791 
 792         default:
 793                 return false;
 794         }
 795 
 796         return true;
 797 }
 798 
 799 static void decode_mc2_mce(struct mce *m)
 800 {
 801         u16 ec = EC(m->status);
 802         u8 xec = XEC(m->status, xec_mask);
 803 
 804         pr_emerg(HW_ERR "MC2 Error: ");
 805 
 806         if (!fam_ops->mc2_mce(ec, xec))
 807                 pr_cont(HW_ERR "Corrupted MC2 MCE info?\n");
 808 }
 809 
 810 static void decode_mc3_mce(struct mce *m)
 811 {
 812         u16 ec = EC(m->status);
 813         u8 xec = XEC(m->status, xec_mask);
 814 
 815         if (boot_cpu_data.x86 >= 0x14) {
 816                 pr_emerg("You shouldn't be seeing MC3 MCE on this cpu family,"
 817                          " please report on LKML.\n");
 818                 return;
 819         }
 820 
 821         pr_emerg(HW_ERR "MC3 Error");
 822 
 823         if (xec == 0x0) {
 824                 u8 r4 = R4(ec);
 825 
 826                 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
 827                         goto wrong_mc3_mce;
 828 
 829                 pr_cont(" during %s.\n", R4_MSG(ec));
 830         } else
 831                 goto wrong_mc3_mce;
 832 
 833         return;
 834 
 835  wrong_mc3_mce:
 836         pr_emerg(HW_ERR "Corrupted MC3 MCE info?\n");
 837 }
 838 
 839 static void decode_mc4_mce(struct mce *m)
 840 {
 841         unsigned int fam = x86_family(m->cpuid);
 842         int node_id = amd_get_nb_id(m->extcpu);
 843         u16 ec = EC(m->status);
 844         u8 xec = XEC(m->status, 0x1f);
 845         u8 offset = 0;
 846 
 847         pr_emerg(HW_ERR "MC4 Error (node %d): ", node_id);
 848 
 849         switch (xec) {
 850         case 0x0 ... 0xe:
 851 
 852                 /* special handling for DRAM ECCs */
 853                 if (xec == 0x0 || xec == 0x8) {
 854                         /* no ECCs on F11h */
 855                         if (fam == 0x11)
 856                                 goto wrong_mc4_mce;
 857 
 858                         pr_cont("%s.\n", mc4_mce_desc[xec]);
 859 
 860                         if (decode_dram_ecc)
 861                                 decode_dram_ecc(node_id, m);
 862                         return;
 863                 }
 864                 break;
 865 
 866         case 0xf:
 867                 if (TLB_ERROR(ec))
 868                         pr_cont("GART Table Walk data error.\n");
 869                 else if (BUS_ERROR(ec))
 870                         pr_cont("DMA Exclusion Vector Table Walk error.\n");
 871                 else
 872                         goto wrong_mc4_mce;
 873                 return;
 874 
 875         case 0x19:
 876                 if (fam == 0x15 || fam == 0x16)
 877                         pr_cont("Compute Unit Data Error.\n");
 878                 else
 879                         goto wrong_mc4_mce;
 880                 return;
 881 
 882         case 0x1c ... 0x1f:
 883                 offset = 13;
 884                 break;
 885 
 886         default:
 887                 goto wrong_mc4_mce;
 888         }
 889 
 890         pr_cont("%s.\n", mc4_mce_desc[xec - offset]);
 891         return;
 892 
 893  wrong_mc4_mce:
 894         pr_emerg(HW_ERR "Corrupted MC4 MCE info?\n");
 895 }
 896 
 897 static void decode_mc5_mce(struct mce *m)
 898 {
 899         unsigned int fam = x86_family(m->cpuid);
 900         u16 ec = EC(m->status);
 901         u8 xec = XEC(m->status, xec_mask);
 902 
 903         if (fam == 0xf || fam == 0x11)
 904                 goto wrong_mc5_mce;
 905 
 906         pr_emerg(HW_ERR "MC5 Error: ");
 907 
 908         if (INT_ERROR(ec)) {
 909                 if (xec <= 0x1f) {
 910                         pr_cont("Hardware Assert.\n");
 911                         return;
 912                 } else
 913                         goto wrong_mc5_mce;
 914         }
 915 
 916         if (xec == 0x0 || xec == 0xc)
 917                 pr_cont("%s.\n", mc5_mce_desc[xec]);
 918         else if (xec <= 0xd)
 919                 pr_cont("%s parity error.\n", mc5_mce_desc[xec]);
 920         else
 921                 goto wrong_mc5_mce;
 922 
 923         return;
 924 
 925  wrong_mc5_mce:
 926         pr_emerg(HW_ERR "Corrupted MC5 MCE info?\n");
 927 }
 928 
 929 static void decode_mc6_mce(struct mce *m)
 930 {
 931         u8 xec = XEC(m->status, xec_mask);
 932 
 933         pr_emerg(HW_ERR "MC6 Error: ");
 934 
 935         if (xec > 0x5)
 936                 goto wrong_mc6_mce;
 937 
 938         pr_cont("%s parity error.\n", mc6_mce_desc[xec]);
 939         return;
 940 
 941  wrong_mc6_mce:
 942         pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
 943 }
 944 
 945 /* Decode errors according to Scalable MCA specification */
 946 static void decode_smca_error(struct mce *m)
 947 {
 948         struct smca_hwid *hwid;
 949         enum smca_bank_types bank_type;
 950         const char *ip_name;
 951         u8 xec = XEC(m->status, xec_mask);
 952 
 953         if (m->bank >= ARRAY_SIZE(smca_banks))
 954                 return;
 955 
 956         hwid = smca_banks[m->bank].hwid;
 957         if (!hwid)
 958                 return;
 959 
 960         bank_type = hwid->bank_type;
 961 
 962         if (bank_type == SMCA_RESERVED) {
 963                 pr_emerg(HW_ERR "Bank %d is reserved.\n", m->bank);
 964                 return;
 965         }
 966 
 967         ip_name = smca_get_long_name(bank_type);
 968 
 969         pr_emerg(HW_ERR "%s Ext. Error Code: %d", ip_name, xec);
 970 
 971         /* Only print the decode of valid error codes */
 972         if (xec < smca_mce_descs[bank_type].num_descs &&
 973                         (hwid->xec_bitmap & BIT_ULL(xec))) {
 974                 pr_cont(", %s.\n", smca_mce_descs[bank_type].descs[xec]);
 975         }
 976 
 977         if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc)
 978                 decode_dram_ecc(cpu_to_node(m->extcpu), m);
 979 }
 980 
 981 static inline void amd_decode_err_code(u16 ec)
 982 {
 983         if (INT_ERROR(ec)) {
 984                 pr_emerg(HW_ERR "internal: %s\n", UU_MSG(ec));
 985                 return;
 986         }
 987 
 988         pr_emerg(HW_ERR "cache level: %s", LL_MSG(ec));
 989 
 990         if (BUS_ERROR(ec))
 991                 pr_cont(", mem/io: %s", II_MSG(ec));
 992         else
 993                 pr_cont(", tx: %s", TT_MSG(ec));
 994 
 995         if (MEM_ERROR(ec) || BUS_ERROR(ec)) {
 996                 pr_cont(", mem-tx: %s", R4_MSG(ec));
 997 
 998                 if (BUS_ERROR(ec))
 999                         pr_cont(", part-proc: %s (%s)", PP_MSG(ec), TO_MSG(ec));
1000         }
1001 
1002         pr_cont("\n");
1003 }
1004 
1005 /*
1006  * Filter out unwanted MCE signatures here.
1007  */
1008 static bool ignore_mce(struct mce *m)
1009 {
1010         /*
1011          * NB GART TLB error reporting is disabled by default.
1012          */
1013         if (m->bank == 4 && XEC(m->status, 0x1f) == 0x5 && !report_gart_errors)
1014                 return true;
1015 
1016         return false;
1017 }
1018 
1019 static const char *decode_error_status(struct mce *m)
1020 {
1021         if (m->status & MCI_STATUS_UC) {
1022                 if (m->status & MCI_STATUS_PCC)
1023                         return "System Fatal error.";
1024                 if (m->mcgstatus & MCG_STATUS_RIPV)
1025                         return "Uncorrected, software restartable error.";
1026                 return "Uncorrected, software containable error.";
1027         }
1028 
1029         if (m->status & MCI_STATUS_DEFERRED)
1030                 return "Deferred error, no action required.";
1031 
1032         return "Corrected error, no action required.";
1033 }
1034 
1035 static int
1036 amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
1037 {
1038         struct mce *m = (struct mce *)data;
1039         unsigned int fam = x86_family(m->cpuid);
1040         int ecc;
1041 
1042         if (ignore_mce(m))
1043                 return NOTIFY_STOP;
1044 
1045         pr_emerg(HW_ERR "%s\n", decode_error_status(m));
1046 
1047         pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
1048                 m->extcpu,
1049                 fam, x86_model(m->cpuid), x86_stepping(m->cpuid),
1050                 m->bank,
1051                 ((m->status & MCI_STATUS_OVER)  ? "Over"  : "-"),
1052                 ((m->status & MCI_STATUS_UC)    ? "UE"    :
1053                  (m->status & MCI_STATUS_DEFERRED) ? "-"  : "CE"),
1054                 ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"),
1055                 ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"),
1056                 ((m->status & MCI_STATUS_PCC)   ? "PCC"   : "-"));
1057 
1058         if (boot_cpu_has(X86_FEATURE_SMCA)) {
1059                 u32 low, high;
1060                 u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
1061 
1062                 if (!rdmsr_safe(addr, &low, &high) &&
1063                     (low & MCI_CONFIG_MCAX))
1064                         pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
1065 
1066                 pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-"));
1067         }
1068 
1069         /* do the two bits[14:13] together */
1070         ecc = (m->status >> 45) & 0x3;
1071         if (ecc)
1072                 pr_cont("|%sECC", ((ecc == 2) ? "C" : "U"));
1073 
1074         if (fam >= 0x15) {
1075                 pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-"));
1076 
1077                 /* F15h, bank4, bit 43 is part of McaStatSubCache. */
1078                 if (fam != 0x15 || m->bank != 4)
1079                         pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-"));
1080         }
1081 
1082         if (fam >= 0x17)
1083                 pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-"));
1084 
1085         pr_cont("]: 0x%016llx\n", m->status);
1086 
1087         if (m->status & MCI_STATUS_ADDRV)
1088                 pr_emerg(HW_ERR "Error Addr: 0x%016llx\n", m->addr);
1089 
1090         if (boot_cpu_has(X86_FEATURE_SMCA)) {
1091                 pr_emerg(HW_ERR "IPID: 0x%016llx", m->ipid);
1092 
1093                 if (m->status & MCI_STATUS_SYNDV)
1094                         pr_cont(", Syndrome: 0x%016llx", m->synd);
1095 
1096                 pr_cont("\n");
1097 
1098                 decode_smca_error(m);
1099                 goto err_code;
1100         }
1101 
1102         if (m->tsc)
1103                 pr_emerg(HW_ERR "TSC: %llu\n", m->tsc);
1104 
1105         if (!fam_ops)
1106                 goto err_code;
1107 
1108         switch (m->bank) {
1109         case 0:
1110                 decode_mc0_mce(m);
1111                 break;
1112 
1113         case 1:
1114                 decode_mc1_mce(m);
1115                 break;
1116 
1117         case 2:
1118                 decode_mc2_mce(m);
1119                 break;
1120 
1121         case 3:
1122                 decode_mc3_mce(m);
1123                 break;
1124 
1125         case 4:
1126                 decode_mc4_mce(m);
1127                 break;
1128 
1129         case 5:
1130                 decode_mc5_mce(m);
1131                 break;
1132 
1133         case 6:
1134                 decode_mc6_mce(m);
1135                 break;
1136 
1137         default:
1138                 break;
1139         }
1140 
1141  err_code:
1142         amd_decode_err_code(m->status & 0xffff);
1143 
1144         return NOTIFY_STOP;
1145 }
1146 
1147 static struct notifier_block amd_mce_dec_nb = {
1148         .notifier_call  = amd_decode_mce,
1149         .priority       = MCE_PRIO_EDAC,
1150 };
1151 
1152 static int __init mce_amd_init(void)
1153 {
1154         struct cpuinfo_x86 *c = &boot_cpu_data;
1155 
1156         if (c->x86_vendor != X86_VENDOR_AMD &&
1157             c->x86_vendor != X86_VENDOR_HYGON)
1158                 return -ENODEV;
1159 
1160         fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
1161         if (!fam_ops)
1162                 return -ENOMEM;
1163 
1164         switch (c->x86) {
1165         case 0xf:
1166                 fam_ops->mc0_mce = k8_mc0_mce;
1167                 fam_ops->mc1_mce = k8_mc1_mce;
1168                 fam_ops->mc2_mce = k8_mc2_mce;
1169                 break;
1170 
1171         case 0x10:
1172                 fam_ops->mc0_mce = f10h_mc0_mce;
1173                 fam_ops->mc1_mce = k8_mc1_mce;
1174                 fam_ops->mc2_mce = k8_mc2_mce;
1175                 break;
1176 
1177         case 0x11:
1178                 fam_ops->mc0_mce = k8_mc0_mce;
1179                 fam_ops->mc1_mce = k8_mc1_mce;
1180                 fam_ops->mc2_mce = k8_mc2_mce;
1181                 break;
1182 
1183         case 0x12:
1184                 fam_ops->mc0_mce = f12h_mc0_mce;
1185                 fam_ops->mc1_mce = k8_mc1_mce;
1186                 fam_ops->mc2_mce = k8_mc2_mce;
1187                 break;
1188 
1189         case 0x14:
1190                 fam_ops->mc0_mce = cat_mc0_mce;
1191                 fam_ops->mc1_mce = cat_mc1_mce;
1192                 fam_ops->mc2_mce = k8_mc2_mce;
1193                 break;
1194 
1195         case 0x15:
1196                 xec_mask = c->x86_model == 0x60 ? 0x3f : 0x1f;
1197 
1198                 fam_ops->mc0_mce = f15h_mc0_mce;
1199                 fam_ops->mc1_mce = f15h_mc1_mce;
1200                 fam_ops->mc2_mce = f15h_mc2_mce;
1201                 break;
1202 
1203         case 0x16:
1204                 xec_mask = 0x1f;
1205                 fam_ops->mc0_mce = cat_mc0_mce;
1206                 fam_ops->mc1_mce = cat_mc1_mce;
1207                 fam_ops->mc2_mce = f16h_mc2_mce;
1208                 break;
1209 
1210         case 0x17:
1211         case 0x18:
1212                 xec_mask = 0x3f;
1213                 if (!boot_cpu_has(X86_FEATURE_SMCA)) {
1214                         printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
1215                         goto err_out;
1216                 }
1217                 break;
1218 
1219         default:
1220                 printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
1221                 goto err_out;
1222         }
1223 
1224         pr_info("MCE: In-kernel MCE decoding enabled.\n");
1225 
1226         mce_register_decode_chain(&amd_mce_dec_nb);
1227 
1228         return 0;
1229 
1230 err_out:
1231         kfree(fam_ops);
1232         fam_ops = NULL;
1233         return -EINVAL;
1234 }
1235 early_initcall(mce_amd_init);
1236 
#ifdef MODULE
/*
 * Module unload: detach amd_mce_dec_nb from the MCE decode chain first, then
 * release the decoder ops allocated by mce_amd_init().
 */
static void __exit mce_amd_exit(void)
{
	mce_unregister_decode_chain(&amd_mce_dec_nb);
	kfree(fam_ops);
}
module_exit(mce_amd_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS("edac-mce-amd");
MODULE_DESCRIPTION("AMD MCE decoder");
#endif

/* [<][>][^][v][top][bottom][index][help] */