This source file includes the following definitions.
- mce_set_error_info
- save_mce_event
- get_mce_event
- release_mce_event
- machine_check_ue_irq_work
- machine_check_ue_event
- machine_check_queue_event
- machine_process_ue_event
- machine_check_process_queued_event
- machine_check_print_event_info
- machine_check_early
- init_debug_trig_function
- hmi_handle_debugtrig
- hmi_exception_realmode
1
2
3
4
5
6
7
8
9 #undef DEBUG
10 #define pr_fmt(fmt) "mce: " fmt
11
12 #include <linux/hardirq.h>
13 #include <linux/types.h>
14 #include <linux/ptrace.h>
15 #include <linux/percpu.h>
16 #include <linux/export.h>
17 #include <linux/irq_work.h>
18
19 #include <asm/machdep.h>
20 #include <asm/mce.h>
21 #include <asm/nmi.h>
22
23 static DEFINE_PER_CPU(int, mce_nest_count);
24 static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
25
26
27 static DEFINE_PER_CPU(int, mce_queue_count);
28 static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event_queue);
29
30
31 static DEFINE_PER_CPU(int, mce_ue_count);
32 static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
33 mce_ue_event_queue);
34
35 static void machine_check_process_queued_event(struct irq_work *work);
36 static void machine_check_ue_irq_work(struct irq_work *work);
37 static void machine_check_ue_event(struct machine_check_event *evt);
38 static void machine_process_ue_event(struct work_struct *work);
39
40 static struct irq_work mce_event_process_work = {
41 .func = machine_check_process_queued_event,
42 };
43
44 static struct irq_work mce_ue_event_irq_work = {
45 .func = machine_check_ue_irq_work,
46 };
47
48 DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
49
50 static void mce_set_error_info(struct machine_check_event *mce,
51 struct mce_error_info *mce_err)
52 {
53 mce->error_type = mce_err->error_type;
54 switch (mce_err->error_type) {
55 case MCE_ERROR_TYPE_UE:
56 mce->u.ue_error.ue_error_type = mce_err->u.ue_error_type;
57 break;
58 case MCE_ERROR_TYPE_SLB:
59 mce->u.slb_error.slb_error_type = mce_err->u.slb_error_type;
60 break;
61 case MCE_ERROR_TYPE_ERAT:
62 mce->u.erat_error.erat_error_type = mce_err->u.erat_error_type;
63 break;
64 case MCE_ERROR_TYPE_TLB:
65 mce->u.tlb_error.tlb_error_type = mce_err->u.tlb_error_type;
66 break;
67 case MCE_ERROR_TYPE_USER:
68 mce->u.user_error.user_error_type = mce_err->u.user_error_type;
69 break;
70 case MCE_ERROR_TYPE_RA:
71 mce->u.ra_error.ra_error_type = mce_err->u.ra_error_type;
72 break;
73 case MCE_ERROR_TYPE_LINK:
74 mce->u.link_error.link_error_type = mce_err->u.link_error_type;
75 break;
76 case MCE_ERROR_TYPE_UNKNOWN:
77 default:
78 break;
79 }
80 }
81
82
83
84
85
86 void save_mce_event(struct pt_regs *regs, long handled,
87 struct mce_error_info *mce_err,
88 uint64_t nip, uint64_t addr, uint64_t phys_addr)
89 {
90 int index = __this_cpu_inc_return(mce_nest_count) - 1;
91 struct machine_check_event *mce = this_cpu_ptr(&mce_event[index]);
92
93
94
95
96
97
98 if (index >= MAX_MC_EVT)
99 return;
100
101
102 mce->version = MCE_V1;
103 mce->srr0 = nip;
104 mce->srr1 = regs->msr;
105 mce->gpr3 = regs->gpr[3];
106 mce->in_use = 1;
107 mce->cpu = get_paca()->paca_index;
108
109
110 if (handled && (regs->msr & MSR_RI))
111 mce->disposition = MCE_DISPOSITION_RECOVERED;
112 else
113 mce->disposition = MCE_DISPOSITION_NOT_RECOVERED;
114
115 mce->initiator = mce_err->initiator;
116 mce->severity = mce_err->severity;
117 mce->sync_error = mce_err->sync_error;
118 mce->error_class = mce_err->error_class;
119
120
121
122
123 mce_set_error_info(mce, mce_err);
124
125 if (!addr)
126 return;
127
128 if (mce->error_type == MCE_ERROR_TYPE_TLB) {
129 mce->u.tlb_error.effective_address_provided = true;
130 mce->u.tlb_error.effective_address = addr;
131 } else if (mce->error_type == MCE_ERROR_TYPE_SLB) {
132 mce->u.slb_error.effective_address_provided = true;
133 mce->u.slb_error.effective_address = addr;
134 } else if (mce->error_type == MCE_ERROR_TYPE_ERAT) {
135 mce->u.erat_error.effective_address_provided = true;
136 mce->u.erat_error.effective_address = addr;
137 } else if (mce->error_type == MCE_ERROR_TYPE_USER) {
138 mce->u.user_error.effective_address_provided = true;
139 mce->u.user_error.effective_address = addr;
140 } else if (mce->error_type == MCE_ERROR_TYPE_RA) {
141 mce->u.ra_error.effective_address_provided = true;
142 mce->u.ra_error.effective_address = addr;
143 } else if (mce->error_type == MCE_ERROR_TYPE_LINK) {
144 mce->u.link_error.effective_address_provided = true;
145 mce->u.link_error.effective_address = addr;
146 } else if (mce->error_type == MCE_ERROR_TYPE_UE) {
147 mce->u.ue_error.effective_address_provided = true;
148 mce->u.ue_error.effective_address = addr;
149 if (phys_addr != ULONG_MAX) {
150 mce->u.ue_error.physical_address_provided = true;
151 mce->u.ue_error.physical_address = phys_addr;
152 mce->u.ue_error.ignore_event = mce_err->ignore_event;
153 machine_check_ue_event(mce);
154 }
155 }
156 return;
157 }
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176 int get_mce_event(struct machine_check_event *mce, bool release)
177 {
178 int index = __this_cpu_read(mce_nest_count) - 1;
179 struct machine_check_event *mc_evt;
180 int ret = 0;
181
182
183 if (index < 0)
184 return ret;
185
186
187 if (index < MAX_MC_EVT) {
188 mc_evt = this_cpu_ptr(&mce_event[index]);
189
190 if (mce)
191 *mce = *mc_evt;
192 if (release)
193 mc_evt->in_use = 0;
194 ret = 1;
195 }
196
197 if (release)
198 __this_cpu_dec(mce_nest_count);
199
200 return ret;
201 }
202
/*
 * Discard the most recently saved machine check event of this CPU without
 * copying it out.
 */
void release_mce_event(void)
{
	/* NULL destination: nothing is copied, the slot is only released. */
	(void)get_mce_event(NULL, true);
}
207
208 static void machine_check_ue_irq_work(struct irq_work *work)
209 {
210 schedule_work(&mce_ue_event_work);
211 }
212
213
214
215
216 static void machine_check_ue_event(struct machine_check_event *evt)
217 {
218 int index;
219
220 index = __this_cpu_inc_return(mce_ue_count) - 1;
221
222 if (index >= MAX_MC_EVT) {
223 __this_cpu_dec(mce_ue_count);
224 return;
225 }
226 memcpy(this_cpu_ptr(&mce_ue_event_queue[index]), evt, sizeof(*evt));
227
228
229 irq_work_queue(&mce_ue_event_irq_work);
230 }
231
232
233
234
235 void machine_check_queue_event(void)
236 {
237 int index;
238 struct machine_check_event evt;
239
240 if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
241 return;
242
243 index = __this_cpu_inc_return(mce_queue_count) - 1;
244
245 if (index >= MAX_MC_EVT) {
246 __this_cpu_dec(mce_queue_count);
247 return;
248 }
249 memcpy(this_cpu_ptr(&mce_event_queue[index]), &evt, sizeof(evt));
250
251
252 irq_work_queue(&mce_event_process_work);
253 }
254
255
256
257
258 static void machine_process_ue_event(struct work_struct *work)
259 {
260 int index;
261 struct machine_check_event *evt;
262
263 while (__this_cpu_read(mce_ue_count) > 0) {
264 index = __this_cpu_read(mce_ue_count) - 1;
265 evt = this_cpu_ptr(&mce_ue_event_queue[index]);
266 #ifdef CONFIG_MEMORY_FAILURE
267
268
269
270
271
272
273
274
275 if (evt->error_type == MCE_ERROR_TYPE_UE) {
276 if (evt->u.ue_error.ignore_event) {
277 __this_cpu_dec(mce_ue_count);
278 continue;
279 }
280
281 if (evt->u.ue_error.physical_address_provided) {
282 unsigned long pfn;
283
284 pfn = evt->u.ue_error.physical_address >>
285 PAGE_SHIFT;
286 memory_failure(pfn, 0);
287 } else
288 pr_warn("Failed to identify bad address from "
289 "where the uncorrectable error (UE) "
290 "was generated\n");
291 }
292 #endif
293 __this_cpu_dec(mce_ue_count);
294 }
295 }
296
297
298
299
300 static void machine_check_process_queued_event(struct irq_work *work)
301 {
302 int index;
303 struct machine_check_event *evt;
304
305 add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
306
307
308
309
310
311 while (__this_cpu_read(mce_queue_count) > 0) {
312 index = __this_cpu_read(mce_queue_count) - 1;
313 evt = this_cpu_ptr(&mce_event_queue[index]);
314
315 if (evt->error_type == MCE_ERROR_TYPE_UE &&
316 evt->u.ue_error.ignore_event) {
317 __this_cpu_dec(mce_queue_count);
318 continue;
319 }
320 machine_check_print_event_info(evt, false, false);
321 __this_cpu_dec(mce_queue_count);
322 }
323 }
324
325 void machine_check_print_event_info(struct machine_check_event *evt,
326 bool user_mode, bool in_guest)
327 {
328 const char *level, *sevstr, *subtype, *err_type, *initiator;
329 uint64_t ea = 0, pa = 0;
330 int n = 0;
331 char dar_str[50];
332 char pa_str[50];
333 static const char *mc_ue_types[] = {
334 "Indeterminate",
335 "Instruction fetch",
336 "Page table walk ifetch",
337 "Load/Store",
338 "Page table walk Load/Store",
339 };
340 static const char *mc_slb_types[] = {
341 "Indeterminate",
342 "Parity",
343 "Multihit",
344 };
345 static const char *mc_erat_types[] = {
346 "Indeterminate",
347 "Parity",
348 "Multihit",
349 };
350 static const char *mc_tlb_types[] = {
351 "Indeterminate",
352 "Parity",
353 "Multihit",
354 };
355 static const char *mc_user_types[] = {
356 "Indeterminate",
357 "tlbie(l) invalid",
358 };
359 static const char *mc_ra_types[] = {
360 "Indeterminate",
361 "Instruction fetch (bad)",
362 "Instruction fetch (foreign)",
363 "Page table walk ifetch (bad)",
364 "Page table walk ifetch (foreign)",
365 "Load (bad)",
366 "Store (bad)",
367 "Page table walk Load/Store (bad)",
368 "Page table walk Load/Store (foreign)",
369 "Load/Store (foreign)",
370 };
371 static const char *mc_link_types[] = {
372 "Indeterminate",
373 "Instruction fetch (timeout)",
374 "Page table walk ifetch (timeout)",
375 "Load (timeout)",
376 "Store (timeout)",
377 "Page table walk Load/Store (timeout)",
378 };
379 static const char *mc_error_class[] = {
380 "Unknown",
381 "Hardware error",
382 "Probable Hardware error (some chance of software cause)",
383 "Software error",
384 "Probable Software error (some chance of hardware cause)",
385 };
386
387
388 if (evt->version != MCE_V1) {
389 pr_err("Machine Check Exception, Unknown event version %d !\n",
390 evt->version);
391 return;
392 }
393 switch (evt->severity) {
394 case MCE_SEV_NO_ERROR:
395 level = KERN_INFO;
396 sevstr = "Harmless";
397 break;
398 case MCE_SEV_WARNING:
399 level = KERN_WARNING;
400 sevstr = "Warning";
401 break;
402 case MCE_SEV_SEVERE:
403 level = KERN_ERR;
404 sevstr = "Severe";
405 break;
406 case MCE_SEV_FATAL:
407 default:
408 level = KERN_ERR;
409 sevstr = "Fatal";
410 break;
411 }
412
413 switch(evt->initiator) {
414 case MCE_INITIATOR_CPU:
415 initiator = "CPU";
416 break;
417 case MCE_INITIATOR_PCI:
418 initiator = "PCI";
419 break;
420 case MCE_INITIATOR_ISA:
421 initiator = "ISA";
422 break;
423 case MCE_INITIATOR_MEMORY:
424 initiator = "Memory";
425 break;
426 case MCE_INITIATOR_POWERMGM:
427 initiator = "Power Management";
428 break;
429 case MCE_INITIATOR_UNKNOWN:
430 default:
431 initiator = "Unknown";
432 break;
433 }
434
435 switch (evt->error_type) {
436 case MCE_ERROR_TYPE_UE:
437 err_type = "UE";
438 subtype = evt->u.ue_error.ue_error_type <
439 ARRAY_SIZE(mc_ue_types) ?
440 mc_ue_types[evt->u.ue_error.ue_error_type]
441 : "Unknown";
442 if (evt->u.ue_error.effective_address_provided)
443 ea = evt->u.ue_error.effective_address;
444 if (evt->u.ue_error.physical_address_provided)
445 pa = evt->u.ue_error.physical_address;
446 break;
447 case MCE_ERROR_TYPE_SLB:
448 err_type = "SLB";
449 subtype = evt->u.slb_error.slb_error_type <
450 ARRAY_SIZE(mc_slb_types) ?
451 mc_slb_types[evt->u.slb_error.slb_error_type]
452 : "Unknown";
453 if (evt->u.slb_error.effective_address_provided)
454 ea = evt->u.slb_error.effective_address;
455 break;
456 case MCE_ERROR_TYPE_ERAT:
457 err_type = "ERAT";
458 subtype = evt->u.erat_error.erat_error_type <
459 ARRAY_SIZE(mc_erat_types) ?
460 mc_erat_types[evt->u.erat_error.erat_error_type]
461 : "Unknown";
462 if (evt->u.erat_error.effective_address_provided)
463 ea = evt->u.erat_error.effective_address;
464 break;
465 case MCE_ERROR_TYPE_TLB:
466 err_type = "TLB";
467 subtype = evt->u.tlb_error.tlb_error_type <
468 ARRAY_SIZE(mc_tlb_types) ?
469 mc_tlb_types[evt->u.tlb_error.tlb_error_type]
470 : "Unknown";
471 if (evt->u.tlb_error.effective_address_provided)
472 ea = evt->u.tlb_error.effective_address;
473 break;
474 case MCE_ERROR_TYPE_USER:
475 err_type = "User";
476 subtype = evt->u.user_error.user_error_type <
477 ARRAY_SIZE(mc_user_types) ?
478 mc_user_types[evt->u.user_error.user_error_type]
479 : "Unknown";
480 if (evt->u.user_error.effective_address_provided)
481 ea = evt->u.user_error.effective_address;
482 break;
483 case MCE_ERROR_TYPE_RA:
484 err_type = "Real address";
485 subtype = evt->u.ra_error.ra_error_type <
486 ARRAY_SIZE(mc_ra_types) ?
487 mc_ra_types[evt->u.ra_error.ra_error_type]
488 : "Unknown";
489 if (evt->u.ra_error.effective_address_provided)
490 ea = evt->u.ra_error.effective_address;
491 break;
492 case MCE_ERROR_TYPE_LINK:
493 err_type = "Link";
494 subtype = evt->u.link_error.link_error_type <
495 ARRAY_SIZE(mc_link_types) ?
496 mc_link_types[evt->u.link_error.link_error_type]
497 : "Unknown";
498 if (evt->u.link_error.effective_address_provided)
499 ea = evt->u.link_error.effective_address;
500 break;
501 case MCE_ERROR_TYPE_DCACHE:
502 err_type = "D-Cache";
503 subtype = "Unknown";
504 break;
505 case MCE_ERROR_TYPE_ICACHE:
506 err_type = "I-Cache";
507 subtype = "Unknown";
508 break;
509 default:
510 case MCE_ERROR_TYPE_UNKNOWN:
511 err_type = "Unknown";
512 subtype = "";
513 break;
514 }
515
516 dar_str[0] = pa_str[0] = '\0';
517 if (ea && evt->srr0 != ea) {
518
519 n = sprintf(dar_str, "DAR: %016llx ", ea);
520 if (pa)
521 sprintf(dar_str + n, "paddr: %016llx ", pa);
522 } else if (pa) {
523 sprintf(pa_str, " paddr: %016llx", pa);
524 }
525
526 printk("%sMCE: CPU%d: machine check (%s) %s %s %s %s[%s]\n",
527 level, evt->cpu, sevstr, in_guest ? "Guest" : "Host",
528 err_type, subtype, dar_str,
529 evt->disposition == MCE_DISPOSITION_RECOVERED ?
530 "Recovered" : "Not recovered");
531
532 if (in_guest || user_mode) {
533 printk("%sMCE: CPU%d: PID: %d Comm: %s %sNIP: [%016llx]%s\n",
534 level, evt->cpu, current->pid, current->comm,
535 in_guest ? "Guest " : "", evt->srr0, pa_str);
536 } else {
537 printk("%sMCE: CPU%d: NIP: [%016llx] %pS%s\n",
538 level, evt->cpu, evt->srr0, (void *)evt->srr0, pa_str);
539 }
540
541 printk("%sMCE: CPU%d: Initiator %s\n", level, evt->cpu, initiator);
542
543 subtype = evt->error_class < ARRAY_SIZE(mc_error_class) ?
544 mc_error_class[evt->error_class] : "Unknown";
545 printk("%sMCE: CPU%d: %s\n", level, evt->cpu, subtype);
546
547 #ifdef CONFIG_PPC_BOOK3S_64
548
549 if (evt->error_type == MCE_ERROR_TYPE_SLB)
550 slb_dump_contents(local_paca->mce_faulty_slbs);
551 #endif
552 }
553 EXPORT_SYMBOL_GPL(machine_check_print_event_info);
554
555
556
557
558
559
560 long machine_check_early(struct pt_regs *regs)
561 {
562 long handled = 0;
563
564 hv_nmi_check_nonrecoverable(regs);
565
566
567
568
569 if (ppc_md.machine_check_early)
570 handled = ppc_md.machine_check_early(regs);
571 return handled;
572 }
573
574
/*
 * What the HMER debug trigger is used for on this system.  Starts out as
 * DTRIG_UNKNOWN (zero) and is set by init_debug_trig_function().
 */
static enum {
	DTRIG_UNKNOWN = 0,		/* not determined / not used */
	DTRIG_VECTOR_CI = 1,		/* need to emulate vector CI load instr */
	DTRIG_SUSPEND_ESCAPE = 2,	/* need to escape from TM suspend mode */
} hmer_debug_trig_function;
580
581 static int init_debug_trig_function(void)
582 {
583 int pvr;
584 struct device_node *cpun;
585 struct property *prop = NULL;
586 const char *str;
587
588
589 preempt_disable();
590 cpun = of_get_cpu_node(smp_processor_id(), NULL);
591 if (cpun) {
592 of_property_for_each_string(cpun, "ibm,hmi-special-triggers",
593 prop, str) {
594 if (strcmp(str, "bit17-vector-ci-load") == 0)
595 hmer_debug_trig_function = DTRIG_VECTOR_CI;
596 else if (strcmp(str, "bit17-tm-suspend-escape") == 0)
597 hmer_debug_trig_function = DTRIG_SUSPEND_ESCAPE;
598 }
599 of_node_put(cpun);
600 }
601 preempt_enable();
602
603
604 if (prop)
605 goto out;
606
607 pvr = mfspr(SPRN_PVR);
608
609 if ((PVR_VER(pvr) == PVR_POWER9) && (pvr & 0xe000) == 0) {
610
611 if ((pvr & 0xfff) >= 0x202)
612 hmer_debug_trig_function = DTRIG_SUSPEND_ESCAPE;
613
614 else if ((pvr & 0xfff) >= 0x200)
615 hmer_debug_trig_function = DTRIG_VECTOR_CI;
616 }
617
618 out:
619 switch (hmer_debug_trig_function) {
620 case DTRIG_VECTOR_CI:
621 pr_debug("HMI debug trigger used for vector CI load\n");
622 break;
623 case DTRIG_SUSPEND_ESCAPE:
624 pr_debug("HMI debug trigger used for TM suspend escape\n");
625 break;
626 default:
627 break;
628 }
629 return 0;
630 }
631 __initcall(init_debug_trig_function);
632
633
634
635
636
637
638
639
640 long hmi_handle_debugtrig(struct pt_regs *regs)
641 {
642 unsigned long hmer = mfspr(SPRN_HMER);
643 long ret = 0;
644
645
646 if (!((hmer & HMER_DEBUG_TRIG)
647 && hmer_debug_trig_function != DTRIG_UNKNOWN))
648 return -1;
649
650 hmer &= ~HMER_DEBUG_TRIG;
651
652 mtspr(SPRN_HMER, ~HMER_DEBUG_TRIG);
653
654 switch (hmer_debug_trig_function) {
655 case DTRIG_VECTOR_CI:
656
657
658
659
660
661 if (regs && user_mode(regs))
662 ret = local_paca->hmi_p9_special_emu = 1;
663
664 break;
665
666 default:
667 break;
668 }
669
670
671
672
673 if (hmer & mfspr(SPRN_HMEER))
674 return -1;
675
676 return ret;
677 }
678
679
680
681
682 long hmi_exception_realmode(struct pt_regs *regs)
683 {
684 int ret;
685
686 __this_cpu_inc(irq_stat.hmi_exceptions);
687
688 ret = hmi_handle_debugtrig(regs);
689 if (ret >= 0)
690 return ret;
691
692 wait_for_subcore_guest_exit();
693
694 if (ppc_md.hmi_exception_early)
695 ppc_md.hmi_exception_early(regs);
696
697 wait_for_tb_resync();
698
699 return 1;
700 }