1 /*
2  * Machine check handler.
3  *
4  * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5  * Rest from unknown author(s).
6  * 2004 Andi Kleen. Rewrote most of it.
7  * Copyright 2008 Intel Corporation
8  * Author: Andi Kleen
9  */
10 
11 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12 
13 #include <linux/thread_info.h>
14 #include <linux/capability.h>
15 #include <linux/miscdevice.h>
16 #include <linux/ratelimit.h>
17 #include <linux/kallsyms.h>
18 #include <linux/rcupdate.h>
19 #include <linux/kobject.h>
20 #include <linux/uaccess.h>
21 #include <linux/kdebug.h>
22 #include <linux/kernel.h>
23 #include <linux/percpu.h>
24 #include <linux/string.h>
25 #include <linux/device.h>
26 #include <linux/syscore_ops.h>
27 #include <linux/delay.h>
28 #include <linux/ctype.h>
29 #include <linux/sched.h>
30 #include <linux/sysfs.h>
31 #include <linux/types.h>
32 #include <linux/slab.h>
33 #include <linux/init.h>
34 #include <linux/kmod.h>
35 #include <linux/poll.h>
36 #include <linux/nmi.h>
37 #include <linux/cpu.h>
38 #include <linux/smp.h>
39 #include <linux/fs.h>
40 #include <linux/mm.h>
41 #include <linux/debugfs.h>
42 #include <linux/irq_work.h>
43 #include <linux/export.h>
44 
45 #include <asm/processor.h>
46 #include <asm/traps.h>
47 #include <asm/tlbflush.h>
48 #include <asm/mce.h>
49 #include <asm/msr.h>
50 
51 #include "mce-internal.h"
52 
53 static DEFINE_MUTEX(mce_chrdev_read_mutex);
54 
55 #define rcu_dereference_check_mce(p) \
56 	rcu_dereference_index_check((p), \
57 			      rcu_read_lock_sched_held() || \
58 			      lockdep_is_held(&mce_chrdev_read_mutex))
59 
60 #define CREATE_TRACE_POINTS
61 #include <trace/events/mce.h>
62 
63 #define SPINUNIT		100	/* 100ns */
64 
65 DEFINE_PER_CPU(unsigned, mce_exception_count);
66 
67 struct mce_bank *mce_banks __read_mostly;
68 struct mce_vendor_flags mce_flags __read_mostly;
69 
70 struct mca_config mca_cfg __read_mostly = {
71 	.bootlog  = -1,
72 	/*
73 	 * Tolerant levels:
74 	 * 0: always panic on uncorrected errors, log corrected errors
75 	 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
76 	 * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
77 	 * 3: never panic or SIGBUS, log all errors (for testing only)
78 	 */
79 	.tolerant = 1,
80 	.monarch_timeout = -1
81 };
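
/*
 * Illustrative example (assumed command line, not part of the original code):
 * booting with "mce=2,500000" selects tolerant level 2 and a 500000 usec
 * (0.5 s) Monarch timeout instead of the defaults above; the accepted mce=
 * options are summarized above mcheck_enable() at the end of this file.
 */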
82 
83 /* User mode helper program triggered by machine check event */
84 static unsigned long		mce_need_notify;
85 static char			mce_helper[128];
86 static char			*mce_helper_argv[2] = { mce_helper, NULL };
87 
88 static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
89 
90 static DEFINE_PER_CPU(struct mce, mces_seen);
91 static int			cpu_missing;
92 
93 /*
94  * MCA banks polled by the periodic polling timer for corrected events.
95  * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
96  */
97 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
98 	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
99 };
100 
101 /*
102  * MCA banks controlled through firmware first for corrected errors.
103  * This is a global list of banks for which we won't enable CMCI and we
104  * won't poll. Firmware controls these banks and is responsible for
105  * reporting corrected errors through GHES. Uncorrected/recoverable
106  * errors are still notified through a machine check.
107  */
108 mce_banks_t mce_banks_ce_disabled;
109 
110 static DEFINE_PER_CPU(struct work_struct, mce_work);
111 
112 static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
113 
114 /*
115  * CPU/chipset specific EDAC code can register a notifier call here to print
116  * MCE errors in a human-readable form.
117  */
118 static ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
119 
120 /* Do initial initialization of a struct mce */
121 void mce_setup(struct mce *m)
122 {
123 	memset(m, 0, sizeof(struct mce));
124 	m->cpu = m->extcpu = smp_processor_id();
125 	rdtscll(m->tsc);
126 	/* We hope get_seconds stays lockless */
127 	m->time = get_seconds();
128 	m->cpuvendor = boot_cpu_data.x86_vendor;
129 	m->cpuid = cpuid_eax(1);
130 	m->socketid = cpu_data(m->extcpu).phys_proc_id;
131 	m->apicid = cpu_data(m->extcpu).initial_apicid;
132 	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
133 }
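
/*
 * Illustrative sketch (not part of the original code): callers let
 * mce_setup() fill in the common fields and then override only the
 * event-specific ones before logging, the way the thermal path further
 * down in this file does. Here 'status' stands for the caller-provided
 * MSR value:
 *
 *	struct mce m;
 *
 *	mce_setup(&m);
 *	m.bank   = MCE_THERMAL_BANK;
 *	m.status = status;
 *	mce_log(&m);
 */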
134 
135 DEFINE_PER_CPU(struct mce, injectm);
136 EXPORT_PER_CPU_SYMBOL_GPL(injectm);
137 
138 /*
139  * Lockless MCE logging infrastructure.
140  * This avoids deadlocks on printk locks without having to break locks. It also
141  * separates MCEs from kernel messages to avoid bogus bug reports.
142  */
143 
144 static struct mce_log mcelog = {
145 	.signature	= MCE_LOG_SIGNATURE,
146 	.len		= MCE_LOG_LEN,
147 	.recordlen	= sizeof(struct mce),
148 };
149 
150 void mce_log(struct mce *mce)
151 {
152 	unsigned next, entry;
153 
154 	/* Emit the trace record: */
155 	trace_mce_record(mce);
156 
157 	atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
158 
159 	mce->finished = 0;
160 	wmb();
161 	for (;;) {
162 		entry = rcu_dereference_check_mce(mcelog.next);
163 		for (;;) {
164 
165 			/*
166 			 * When the buffer fills up discard new entries.
167 			 * Assume that the earlier errors are the more
168 			 * interesting ones:
169 			 */
170 			if (entry >= MCE_LOG_LEN) {
171 				set_bit(MCE_OVERFLOW,
172 					(unsigned long *)&mcelog.flags);
173 				return;
174 			}
175 			/* Old left over entry. Skip: */
176 			if (mcelog.entry[entry].finished) {
177 				entry++;
178 				continue;
179 			}
180 			break;
181 		}
182 		smp_rmb();
183 		next = entry + 1;
184 		if (cmpxchg(&mcelog.next, entry, next) == entry)
185 			break;
186 	}
187 	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
188 	wmb();
189 	mcelog.entry[entry].finished = 1;
190 	wmb();
191 
192 	mce->finished = 1;
193 	set_bit(0, &mce_need_notify);
194 }
195 
196 static void drain_mcelog_buffer(void)
197 {
198 	unsigned int next, i, prev = 0;
199 
200 	next = ACCESS_ONCE(mcelog.next);
201 
202 	do {
203 		struct mce *m;
204 
205 		/* drain what was logged during boot */
206 		for (i = prev; i < next; i++) {
207 			unsigned long start = jiffies;
208 			unsigned retries = 1;
209 
210 			m = &mcelog.entry[i];
211 
212 			while (!m->finished) {
213 				if (time_after_eq(jiffies, start + 2*retries))
214 					retries++;
215 
216 				cpu_relax();
217 
218 				if (!m->finished && retries >= 4) {
219 					pr_err("skipping error being logged currently!\n");
220 					break;
221 				}
222 			}
223 			smp_rmb();
224 			atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
225 		}
226 
227 		memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
228 		prev = next;
229 		next = cmpxchg(&mcelog.next, prev, 0);
230 	} while (next != prev);
231 }
232 
233 
234 void mce_register_decode_chain(struct notifier_block *nb)
235 {
236 	atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
237 	drain_mcelog_buffer();
238 }
239 EXPORT_SYMBOL_GPL(mce_register_decode_chain);
240 
241 void mce_unregister_decode_chain(struct notifier_block *nb)
242 {
243 	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
244 }
245 EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
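
/*
 * Illustrative sketch (not part of the original code): an EDAC-style decoder
 * would hook into the chain roughly like this. The callback receives the
 * &struct mce as @data and may return NOTIFY_STOP once it has emitted a
 * human-readable description, which also suppresses the generic
 * "run mcelog --ascii" hint in print_mce():
 *
 *	static int my_mce_decode(struct notifier_block *nb,
 *				 unsigned long val, void *data)
 *	{
 *		struct mce *m = data;
 *
 *		pr_info("bank %d status 0x%llx addr 0x%llx\n",
 *			m->bank, m->status, m->addr);
 *		return NOTIFY_STOP;
 *	}
 *
 *	static struct notifier_block my_mce_nb = {
 *		.notifier_call	= my_mce_decode,
 *	};
 *
 *	mce_register_decode_chain(&my_mce_nb);
 */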
246 
247 static void print_mce(struct mce *m)
248 {
249 	int ret = 0;
250 
251 	pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
252 	       m->extcpu, m->mcgstatus, m->bank, m->status);
253 
254 	if (m->ip) {
255 		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
256 			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
257 				m->cs, m->ip);
258 
259 		if (m->cs == __KERNEL_CS)
260 			print_symbol("{%s}", m->ip);
261 		pr_cont("\n");
262 	}
263 
264 	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
265 	if (m->addr)
266 		pr_cont("ADDR %llx ", m->addr);
267 	if (m->misc)
268 		pr_cont("MISC %llx ", m->misc);
269 
270 	pr_cont("\n");
271 	/*
272 	 * Note this output is parsed by external tools and old fields
273 	 * should not be changed.
274 	 */
275 	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
276 		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
277 		cpu_data(m->extcpu).microcode);
278 
279 	/*
280 	 * Print out human-readable details about the MCE error
281 	 * (if the CPU has an implementation for that).
282 	 */
283 	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
284 	if (ret == NOTIFY_STOP)
285 		return;
286 
287 	pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
288 }
289 
290 #define PANIC_TIMEOUT 5 /* 5 seconds */
291 
292 static atomic_t mce_panicked;
293 
294 static int fake_panic;
295 static atomic_t mce_fake_panicked;
296 
297 /* Panic in progress. Enable interrupts and wait for final IPI */
298 static void wait_for_panic(void)
299 {
300 	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
301 
302 	preempt_disable();
303 	local_irq_enable();
304 	while (timeout-- > 0)
305 		udelay(1);
306 	if (panic_timeout == 0)
307 		panic_timeout = mca_cfg.panic_timeout;
308 	panic("Panicking machine check CPU died");
309 }
310 
311 static void mce_panic(const char *msg, struct mce *final, char *exp)
312 {
313 	int i, apei_err = 0;
314 
315 	if (!fake_panic) {
316 		/*
317 		 * Make sure only one CPU runs in machine check panic
318 		 */
319 		if (atomic_inc_return(&mce_panicked) > 1)
320 			wait_for_panic();
321 		barrier();
322 
323 		bust_spinlocks(1);
324 		console_verbose();
325 	} else {
326 		/* Don't log too much for fake panic */
327 		if (atomic_inc_return(&mce_fake_panicked) > 1)
328 			return;
329 	}
330 	/* First print corrected ones that are still unlogged */
331 	for (i = 0; i < MCE_LOG_LEN; i++) {
332 		struct mce *m = &mcelog.entry[i];
333 		if (!(m->status & MCI_STATUS_VAL))
334 			continue;
335 		if (!(m->status & MCI_STATUS_UC)) {
336 			print_mce(m);
337 			if (!apei_err)
338 				apei_err = apei_write_mce(m);
339 		}
340 	}
341 	/* Now print uncorrected but with the final one last */
342 	for (i = 0; i < MCE_LOG_LEN; i++) {
343 		struct mce *m = &mcelog.entry[i];
344 		if (!(m->status & MCI_STATUS_VAL))
345 			continue;
346 		if (!(m->status & MCI_STATUS_UC))
347 			continue;
348 		if (!final || memcmp(m, final, sizeof(struct mce))) {
349 			print_mce(m);
350 			if (!apei_err)
351 				apei_err = apei_write_mce(m);
352 		}
353 	}
354 	if (final) {
355 		print_mce(final);
356 		if (!apei_err)
357 			apei_err = apei_write_mce(final);
358 	}
359 	if (cpu_missing)
360 		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
361 	if (exp)
362 		pr_emerg(HW_ERR "Machine check: %s\n", exp);
363 	if (!fake_panic) {
364 		if (panic_timeout == 0)
365 			panic_timeout = mca_cfg.panic_timeout;
366 		panic(msg);
367 	} else
368 		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
369 }
370 
371 /* Support code for software error injection */
372 
373 static int msr_to_offset(u32 msr)
374 {
375 	unsigned bank = __this_cpu_read(injectm.bank);
376 
377 	if (msr == mca_cfg.rip_msr)
378 		return offsetof(struct mce, ip);
379 	if (msr == MSR_IA32_MCx_STATUS(bank))
380 		return offsetof(struct mce, status);
381 	if (msr == MSR_IA32_MCx_ADDR(bank))
382 		return offsetof(struct mce, addr);
383 	if (msr == MSR_IA32_MCx_MISC(bank))
384 		return offsetof(struct mce, misc);
385 	if (msr == MSR_IA32_MCG_STATUS)
386 		return offsetof(struct mce, mcgstatus);
387 	return -1;
388 }
389 
390 /* MSR access wrappers used for error injection */
391 static u64 mce_rdmsrl(u32 msr)
392 {
393 	u64 v;
394 
395 	if (__this_cpu_read(injectm.finished)) {
396 		int offset = msr_to_offset(msr);
397 
398 		if (offset < 0)
399 			return 0;
400 		return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
401 	}
402 
403 	if (rdmsrl_safe(msr, &v)) {
404 		WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
405 		/*
406 		 * Return zero in case the access faulted. This should
407 		 * not happen normally but can happen if the CPU does
408 		 * something weird, or if the code is buggy.
409 		 */
410 		v = 0;
411 	}
412 
413 	return v;
414 }
415 
416 static void mce_wrmsrl(u32 msr, u64 v)
417 {
418 	if (__this_cpu_read(injectm.finished)) {
419 		int offset = msr_to_offset(msr);
420 
421 		if (offset >= 0)
422 			*(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
423 		return;
424 	}
425 	wrmsrl(msr, v);
426 }
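
/*
 * Illustrative sketch (not part of the original code): an injector such as
 * the separate mce-inject module prepares a fake record in the per-CPU
 * 'injectm' slot and sets ->finished, after which the wrappers above serve
 * the corresponding MSR reads/writes from that struct instead of hardware:
 *
 *	struct mce *i = this_cpu_ptr(&injectm);
 *
 *	*i = *fake_record;
 *	i->finished = 1;
 *
 * From this point mce_rdmsrl(MSR_IA32_MCx_STATUS(i->bank)) returns
 * fake_record->status until ->finished is cleared again. 'fake_record' is
 * a hypothetical, fully prepared struct mce.
 */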
427 
428 /*
429  * Collect all global (w.r.t. this processor) status about this machine
430  * check into our "mce" struct so that we can use it later to assess
431  * the severity of the problem as we read per-bank specific details.
432  */
433 static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
434 {
435 	mce_setup(m);
436 
437 	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
438 	if (regs) {
439 		/*
440 		 * Get the address of the instruction at the time of
441 		 * the machine check error.
442 		 */
443 		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
444 			m->ip = regs->ip;
445 			m->cs = regs->cs;
446 
447 			/*
448 			 * When in VM86 mode make the cs look like ring 3
449 			 * always. This is a lie, but it's better than passing
450 			 * the additional vm86 bit around everywhere.
451 			 */
452 			if (v8086_mode(regs))
453 				m->cs |= 3;
454 		}
455 		/* Use accurate RIP reporting if available. */
456 		if (mca_cfg.rip_msr)
457 			m->ip = mce_rdmsrl(mca_cfg.rip_msr);
458 	}
459 }
460 
461 /*
462  * Simple lockless ring to communicate PFNs from the exception handler to the
463  * process context work function. This is vastly simplified because there's
464  * only a single reader and a single writer.
465  */
466 #define MCE_RING_SIZE 16	/* we use one entry less */
467 
468 struct mce_ring {
469 	unsigned short start;
470 	unsigned short end;
471 	unsigned long ring[MCE_RING_SIZE];
472 };
473 static DEFINE_PER_CPU(struct mce_ring, mce_ring);
474 
475 /* Runs with CPU affinity in workqueue */
476 static int mce_ring_empty(void)
477 {
478 	struct mce_ring *r = this_cpu_ptr(&mce_ring);
479 
480 	return r->start == r->end;
481 }
482 
483 static int mce_ring_get(unsigned long *pfn)
484 {
485 	struct mce_ring *r;
486 	int ret = 0;
487 
488 	*pfn = 0;
489 	get_cpu();
490 	r = this_cpu_ptr(&mce_ring);
491 	if (r->start == r->end)
492 		goto out;
493 	*pfn = r->ring[r->start];
494 	r->start = (r->start + 1) % MCE_RING_SIZE;
495 	ret = 1;
496 out:
497 	put_cpu();
498 	return ret;
499 }
500 
501 /* Always runs in MCE context with preempt off */
502 static int mce_ring_add(unsigned long pfn)
503 {
504 	struct mce_ring *r = this_cpu_ptr(&mce_ring);
505 	unsigned next;
506 
507 	next = (r->end + 1) % MCE_RING_SIZE;
508 	if (next == r->start)
509 		return -1;
510 	r->ring[r->end] = pfn;
511 	wmb();
512 	r->end = next;
513 	return 0;
514 }
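
/*
 * Illustrative sketch (not part of the original code): the exception handler
 * is the single producer and the workqueue callback the single consumer of
 * this ring, as done further down in this file:
 *
 *	(producer, MCE context -- see do_machine_check())
 *	mce_ring_add(m.addr >> PAGE_SHIFT);
 *
 *	(consumer, process context -- see mce_process_work())
 *	unsigned long pfn;
 *	while (mce_ring_get(&pfn))
 *		memory_failure(pfn, MCE_VECTOR, 0);
 */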
515 
516 int mce_available(struct cpuinfo_x86 *c)
517 {
518 	if (mca_cfg.disabled)
519 		return 0;
520 	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
521 }
522 
523 static void mce_schedule_work(void)
524 {
525 	if (!mce_ring_empty())
526 		schedule_work(this_cpu_ptr(&mce_work));
527 }
528 
529 static DEFINE_PER_CPU(struct irq_work, mce_irq_work);
530 
531 static void mce_irq_work_cb(struct irq_work *entry)
532 {
533 	mce_notify_irq();
534 	mce_schedule_work();
535 }
536 
537 static void mce_report_event(struct pt_regs *regs)
538 {
539 	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
540 		mce_notify_irq();
541 		/*
542 		 * Triggering the work queue here is just an insurance
543 		 * policy in case the syscall exit notify handler
544 		 * doesn't run soon enough or ends up running on the
545 		 * wrong CPU (can happen when audit sleeps)
546 		 */
547 		mce_schedule_work();
548 		return;
549 	}
550 
551 	irq_work_queue(this_cpu_ptr(&mce_irq_work));
552 }
553 
554 /*
555  * Read ADDR and MISC registers.
556  */
557 static void mce_read_aux(struct mce *m, int i)
558 {
559 	if (m->status & MCI_STATUS_MISCV)
560 		m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
561 	if (m->status & MCI_STATUS_ADDRV) {
562 		m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
563 
564 		/*
565 		 * Mask the reported address by the reported granularity.
566 		 */
567 		if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
568 			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
569 			m->addr >>= shift;
570 			m->addr <<= shift;
571 		}
572 	}
573 }
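
/*
 * Worked example (assumed values): if MCi_MISC reports a least significant
 * valid address bit of 6, i.e. 64-byte granularity, then an MCi_ADDR value
 * of 0x12345678 is rounded down by the shift pair above:
 *
 *	(0x12345678 >> 6) << 6 == 0x12345640
 */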
574 
575 static bool memory_error(struct mce *m)
576 {
577 	struct cpuinfo_x86 *c = &boot_cpu_data;
578 
579 	if (c->x86_vendor == X86_VENDOR_AMD) {
580 		/*
581 		 * coming soon
582 		 */
583 		return false;
584 	} else if (c->x86_vendor == X86_VENDOR_INTEL) {
585 		/*
586 		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
587 		 *
588 		 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
589 		 * indicating a memory error. Bit 8 is used for indicating a
590 		 * cache hierarchy error. The combination of bit 2 and bit 3
591 		 * is used for indicating a `generic' cache hierarchy error.
592 		 * But we can't just blindly check the above bits, because if
593 		 * bit 11 is set, then it is a bus/interconnect error - and
594 		 * either way the above bits just gives more detail on what
595 		 * bus/interconnect error happened. Note that bit 12 can be
596 		 * ignored, as it's the "filter" bit.
597 		 */
598 		return (m->status & 0xef80) == BIT(7) ||
599 		       (m->status & 0xef00) == BIT(8) ||
600 		       (m->status & 0xeffc) == 0xc;
601 	}
602 
603 	return false;
604 }
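
/*
 * Worked example (assumed status value): a status word whose MCACOD field is
 * 0x009f (memory read error, channel not specified) satisfies
 *
 *	(0x009f & 0xef80) == 0x0080 == BIT(7)
 *
 * and is classified as a memory error, while a bus/interconnect code with
 * bit 11 set fails this mask and is not.
 */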
605 
606 DEFINE_PER_CPU(unsigned, mce_poll_count);
607 
608 /*
609  * Poll for corrected events or events that happened before reset.
610  * Those are just logged through /dev/mcelog.
611  *
612  * This is executed in standard interrupt context.
613  *
614  * Note: spec recommends to panic for fatal unsignalled
615  * errors here. However this would be quite problematic --
616  * we would need to reimplement the Monarch handling and
617  * it would mess up the exclusion between exception handler
618  * and poll handler -- so we skip this for now.
619  * These cases should not happen anyway, or only when the CPU
620  * is already totally confused. In this case it's likely it will
621  * not fully execute the machine check handler either.
622  */
623 bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
624 {
625 	bool error_logged = false;
626 	struct mce m;
627 	int severity;
628 	int i;
629 
630 	this_cpu_inc(mce_poll_count);
631 
632 	mce_gather_info(&m, NULL);
633 
634 	for (i = 0; i < mca_cfg.banks; i++) {
635 		if (!mce_banks[i].ctl || !test_bit(i, *b))
636 			continue;
637 
638 		m.misc = 0;
639 		m.addr = 0;
640 		m.bank = i;
641 		m.tsc = 0;
642 
643 		barrier();
644 		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
645 		if (!(m.status & MCI_STATUS_VAL))
646 			continue;
647 
648 
649 		/*
650 		 * Uncorrected or signalled events are handled by the exception
651 		 * handler when it is enabled, so don't process those here.
652 		 *
653 		 * TBD do the same check for MCI_STATUS_EN here?
654 		 */
655 		if (!(flags & MCP_UC) &&
656 		    (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
657 			continue;
658 
659 		mce_read_aux(&m, i);
660 
661 		if (!(flags & MCP_TIMESTAMP))
662 			m.tsc = 0;
663 
664 		severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
665 
666 		/*
667 		 * In the cases where we don't have a valid address after all,
668 		 * do not add it into the ring buffer.
669 		 */
670 		if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) {
671 			if (m.status & MCI_STATUS_ADDRV) {
672 				mce_ring_add(m.addr >> PAGE_SHIFT);
673 				mce_schedule_work();
674 			}
675 		}
676 
677 		/*
678 		 * Don't get the IP here because it's unlikely to
679 		 * have anything to do with the actual error location.
680 		 */
681 		if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) {
682 			error_logged = true;
683 			mce_log(&m);
684 		}
685 
686 		/*
687 		 * Clear state for this bank.
688 		 */
689 		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
690 	}
691 
692 	/*
693 	 * Don't clear MCG_STATUS here because it's only defined for
694 	 * exceptions.
695 	 */
696 
697 	sync_core();
698 
699 	return error_logged;
700 }
701 EXPORT_SYMBOL_GPL(machine_check_poll);
702 
703 /*
704  * Do a quick check if any of the events requires a panic.
705  * This decides if we keep the events around or clear them.
706  */
707 static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
708 			  struct pt_regs *regs)
709 {
710 	int i, ret = 0;
711 	char *tmp;
712 
713 	for (i = 0; i < mca_cfg.banks; i++) {
714 		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
715 		if (m->status & MCI_STATUS_VAL) {
716 			__set_bit(i, validp);
717 			if (quirk_no_way_out)
718 				quirk_no_way_out(i, m, regs);
719 		}
720 
721 		if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
722 			*msg = tmp;
723 			ret = 1;
724 		}
725 	}
726 	return ret;
727 }
728 
729 /*
730  * Variable to establish order between CPUs while scanning.
731  * Each CPU spins initially until mce_executing equals its number.
732  */
733 static atomic_t mce_executing;
734 
735 /*
736  * Defines order of CPUs on entry. First CPU becomes Monarch.
737  */
738 static atomic_t mce_callin;
739 
740 /*
741  * Check if a timeout waiting for other CPUs happened.
742  */
743 static int mce_timed_out(u64 *t, const char *msg)
744 {
745 	/*
746 	 * The others already did panic for some reason.
747 	 * Bail out like in a timeout.
748 	 * rmb() to tell the compiler that system_state
749 	 * might have been modified by someone else.
750 	 */
751 	rmb();
752 	if (atomic_read(&mce_panicked))
753 		wait_for_panic();
754 	if (!mca_cfg.monarch_timeout)
755 		goto out;
756 	if ((s64)*t < SPINUNIT) {
757 		if (mca_cfg.tolerant <= 1)
758 			mce_panic(msg, NULL, NULL);
759 		cpu_missing = 1;
760 		return 1;
761 	}
762 	*t -= SPINUNIT;
763 out:
764 	touch_nmi_watchdog();
765 	return 0;
766 }
767 
768 /*
769  * The Monarch's reign.  The Monarch is the CPU who entered
770  * the machine check handler first. It waits for the others to
771  * raise the exception too and then grades them. When any
772  * error is fatal, panic. Only then let the others continue.
773  *
774  * The other CPUs entering the MCE handler will be controlled by the
775  * Monarch. They are called Subjects.
776  *
777  * This way we prevent any potential data corruption in an unrecoverable case
778  * and also make sure that all CPUs' errors are examined.
779  *
780  * Also this detects the case of a machine check event coming from outer
781  * space (not detected by any CPU). In this case some external agent wants
782  * us to shut down, so panic too.
783  *
784  * The other CPUs might still decide to panic if the handler happens
785  * in an unrecoverable place, but in this case the system is in a semi-stable
786  * state and won't corrupt anything by itself. It's ok to let the others
787  * continue for a bit first.
788  *
789  * All the spin loops have timeouts; when a timeout happens a CPU
790  * typically elects itself to be Monarch.
791  */
792 static void mce_reign(void)
793 {
794 	int cpu;
795 	struct mce *m = NULL;
796 	int global_worst = 0;
797 	char *msg = NULL;
798 	char *nmsg = NULL;
799 
800 	/*
801 	 * This CPU is the Monarch and the other CPUs have run
802 	 * through their handlers.
803 	 * Grade the severity of the errors of all the CPUs.
804 	 */
805 	for_each_possible_cpu(cpu) {
806 		int severity = mce_severity(&per_cpu(mces_seen, cpu),
807 					    mca_cfg.tolerant,
808 					    &nmsg, true);
809 		if (severity > global_worst) {
810 			msg = nmsg;
811 			global_worst = severity;
812 			m = &per_cpu(mces_seen, cpu);
813 		}
814 	}
815 
816 	/*
817 	 * Cannot recover? Panic here then.
818 	 * This dumps all the mces in the log buffer and stops the
819 	 * other CPUs.
820 	 */
821 	if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
822 		mce_panic("Fatal machine check", m, msg);
823 
824 	/*
825 	 * For UC somewhere we let the CPU who detects it handle it.
826  * Also we must let the others continue, otherwise the handling
827 	 * CPU could deadlock on a lock.
828 	 */
829 
830 	/*
831 	 * No machine check event found. Must be some external
832 	 * source or one CPU is hung. Panic.
833 	 */
834 	if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
835 		mce_panic("Fatal machine check from unknown source", NULL, NULL);
836 
837 	/*
838 	 * Now clear all the mces_seen so that they don't reappear on
839 	 * the next mce.
840 	 */
841 	for_each_possible_cpu(cpu)
842 		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
843 }
844 
845 static atomic_t global_nwo;
846 
847 /*
848  * Start of Monarch synchronization. This waits until all CPUs have
849  * entered the exception handler and then determines if any of them
850  * saw a fatal event that requires panic. Then the CPUs run their
851  * scanning loops one at a time, in the entry order.
852  * TBD double check parallel CPU hotunplug
853  */
854 static int mce_start(int *no_way_out)
855 {
856 	int order;
857 	int cpus = num_online_cpus();
858 	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
859 
860 	if (!timeout)
861 		return -1;
862 
863 	atomic_add(*no_way_out, &global_nwo);
864 	/*
865 	 * global_nwo should be updated before mce_callin
866 	 */
867 	smp_wmb();
868 	order = atomic_inc_return(&mce_callin);
869 
870 	/*
871 	 * Wait for everyone.
872 	 */
873 	while (atomic_read(&mce_callin) != cpus) {
874 		if (mce_timed_out(&timeout,
875 				  "Timeout: Not all CPUs entered broadcast exception handler")) {
876 			atomic_set(&global_nwo, 0);
877 			return -1;
878 		}
879 		ndelay(SPINUNIT);
880 	}
881 
882 	/*
883 	 * mce_callin should be read before global_nwo
884 	 */
885 	smp_rmb();
886 
887 	if (order == 1) {
888 		/*
889 		 * Monarch: Starts executing now, the others wait.
890 		 */
891 		atomic_set(&mce_executing, 1);
892 	} else {
893 		/*
894 		 * Subject: Now start the scanning loop one by one in
895 		 * the original callin order.
896 		 * This way when there are any shared banks it will be
897 		 * only seen by one CPU before cleared, avoiding duplicates.
898 		 */
899 		while (atomic_read(&mce_executing) < order) {
900 			if (mce_timed_out(&timeout,
901 					  "Timeout: Subject CPUs unable to finish machine check processing")) {
902 				atomic_set(&global_nwo, 0);
903 				return -1;
904 			}
905 			ndelay(SPINUNIT);
906 		}
907 	}
908 
909 	/*
910 	 * Cache the global no_way_out state.
911 	 */
912 	*no_way_out = atomic_read(&global_nwo);
913 
914 	return order;
915 }
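
/*
 * Illustrative example (not part of the original code): with three CPUs
 * taking a broadcast #MC, the atomic_inc_return() of mce_callin above hands
 * out order values 1, 2 and 3. The CPU with order 1 becomes the Monarch and
 * scans its banks immediately; each Subject waits until mce_executing
 * reaches its own order before scanning, so a shared bank is seen and
 * cleared by exactly one CPU.
 */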
916 
917 /*
918  * Synchronize between CPUs after main scanning loop.
919  * This invokes the bulk of the Monarch processing.
920  */
921 static int mce_end(int order)
922 {
923 	int ret = -1;
924 	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
925 
926 	if (!timeout)
927 		goto reset;
928 	if (order < 0)
929 		goto reset;
930 
931 	/*
932 	 * Allow others to run.
933 	 */
934 	atomic_inc(&mce_executing);
935 
936 	if (order == 1) {
937 		/* CHECKME: Can this race with a parallel hotplug? */
938 		int cpus = num_online_cpus();
939 
940 		/*
941 		 * Monarch: Wait for everyone to go through their scanning
942 		 * loops.
943 		 */
944 		while (atomic_read(&mce_executing) <= cpus) {
945 			if (mce_timed_out(&timeout,
946 					  "Timeout: Monarch CPU unable to finish machine check processing"))
947 				goto reset;
948 			ndelay(SPINUNIT);
949 		}
950 
951 		mce_reign();
952 		barrier();
953 		ret = 0;
954 	} else {
955 		/*
956 		 * Subject: Wait for Monarch to finish.
957 		 */
958 		while (atomic_read(&mce_executing) != 0) {
959 			if (mce_timed_out(&timeout,
960 					  "Timeout: Monarch CPU did not finish machine check processing"))
961 				goto reset;
962 			ndelay(SPINUNIT);
963 		}
964 
965 		/*
966 		 * Don't reset anything. That's done by the Monarch.
967 		 */
968 		return 0;
969 	}
970 
971 	/*
972 	 * Reset all global state.
973 	 */
974 reset:
975 	atomic_set(&global_nwo, 0);
976 	atomic_set(&mce_callin, 0);
977 	barrier();
978 
979 	/*
980 	 * Let others run again.
981 	 */
982 	atomic_set(&mce_executing, 0);
983 	return ret;
984 }
985 
986 /*
987  * Check if the address reported by the CPU is in a format we can parse.
988  * It would be possible to add code for most other cases, but all would
989  * be somewhat complicated (e.g. segment offset would require an instruction
990  * parser). So only support physical addresses up to page granularity for now.
991  */
992 static int mce_usable_address(struct mce *m)
993 {
994 	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
995 		return 0;
996 	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
997 		return 0;
998 	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
999 		return 0;
1000 	return 1;
1001 }
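
/*
 * Worked example (assumed values): an entry with MISCV and ADDRV set, an
 * address mode of MCI_MISC_ADDR_PHYS and an address LSB of 12 is precise to
 * page granularity, so do_machine_check() can hand the page off for
 * recovery:
 *
 *	if (mce_usable_address(&m))
 *		mce_ring_add(m.addr >> PAGE_SHIFT);
 */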
1002 
1003 static void mce_clear_state(unsigned long *toclear)
1004 {
1005 	int i;
1006 
1007 	for (i = 0; i < mca_cfg.banks; i++) {
1008 		if (test_bit(i, toclear))
1009 			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1010 	}
1011 }
1012 
1013 /*
1014  * The actual machine check handler. This only handles real
1015  * exceptions when something got corrupted coming in through int 18.
1016  *
1017  * This is executed in NMI context not subject to normal locking rules. This
1018  * implies that most kernel services cannot be safely used. Don't even
1019  * think about putting a printk in there!
1020  *
1021  * On Intel systems this is entered on all CPUs in parallel through
1022  * MCE broadcast. However some CPUs might be broken beyond repair,
1023  * so always be careful when synchronizing with others.
1024  */
1025 void do_machine_check(struct pt_regs *regs, long error_code)
1026 {
1027 	struct mca_config *cfg = &mca_cfg;
1028 	struct mce m, *final;
1029 	enum ctx_state prev_state;
1030 	int i;
1031 	int worst = 0;
1032 	int severity;
1033 	/*
1034 	 * Establish sequential order between the CPUs entering the machine
1035 	 * check handler.
1036 	 */
1037 	int order;
1038 	/*
1039 	 * If no_way_out gets set, there is no safe way to recover from this
1040 	 * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
1041 	 */
1042 	int no_way_out = 0;
1043 	/*
1044 	 * If kill_it gets set, there might be a way to recover from this
1045 	 * error.
1046 	 */
1047 	int kill_it = 0;
1048 	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1049 	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1050 	char *msg = "Unknown";
1051 	u64 recover_paddr = ~0ull;
1052 	int flags = MF_ACTION_REQUIRED;
1053 
1054 	prev_state = ist_enter(regs);
1055 
1056 	this_cpu_inc(mce_exception_count);
1057 
1058 	if (!cfg->banks)
1059 		goto out;
1060 
1061 	mce_gather_info(&m, regs);
1062 
1063 	final = this_cpu_ptr(&mces_seen);
1064 	*final = m;
1065 
1066 	memset(valid_banks, 0, sizeof(valid_banks));
1067 	no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
1068 
1069 	barrier();
1070 
1071 	/*
1072 	 * When there is no restart IP we might need to kill or panic.
1073 	 * Assume the worst for now, but if we find the
1074 	 * severity is MCE_AR_SEVERITY we have other options.
1075 	 */
1076 	if (!(m.mcgstatus & MCG_STATUS_RIPV))
1077 		kill_it = 1;
1078 
1079 	/*
1080 	 * Go through all the banks in exclusion of the other CPUs.
1081 	 * This way we don't report duplicated events on shared banks
1082 	 * because the first one to see it will clear it.
1083 	 */
1084 	order = mce_start(&no_way_out);
1085 	for (i = 0; i < cfg->banks; i++) {
1086 		__clear_bit(i, toclear);
1087 		if (!test_bit(i, valid_banks))
1088 			continue;
1089 		if (!mce_banks[i].ctl)
1090 			continue;
1091 
1092 		m.misc = 0;
1093 		m.addr = 0;
1094 		m.bank = i;
1095 
1096 		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
1097 		if ((m.status & MCI_STATUS_VAL) == 0)
1098 			continue;
1099 
1100 		/*
1101 		 * Errors that are neither uncorrected nor signalled are handled
1102 		 * by machine_check_poll(). Leave them alone, unless this panics.
1103 		 */
1104 		if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1105 			!no_way_out)
1106 			continue;
1107 
1108 		/*
1109 		 * Set taint even when machine check was not enabled.
1110 		 */
1111 		add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1112 
1113 		severity = mce_severity(&m, cfg->tolerant, NULL, true);
1114 
1115 		/*
1116 		 * When the machine check was for a corrected/deferred error, don't
1117 		 * touch it, unless we're panicking.
1118 		 */
1119 		if ((severity == MCE_KEEP_SEVERITY ||
1120 		     severity == MCE_UCNA_SEVERITY) && !no_way_out)
1121 			continue;
1122 		__set_bit(i, toclear);
1123 		if (severity == MCE_NO_SEVERITY) {
1124 			/*
1125 			 * Machine check event was not enabled. Clear, but
1126 			 * ignore.
1127 			 */
1128 			continue;
1129 		}
1130 
1131 		mce_read_aux(&m, i);
1132 
1133 		/*
1134 		 * Action optional error. Queue address for later processing.
1135 		 * When the ring overflows we just ignore the AO error.
1136 		 * RED-PEN add some logging mechanism when
1137 		 * mce_usable_address or mce_ring_add fails.
1138 		 * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0
1139 		 */
1140 		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
1141 			mce_ring_add(m.addr >> PAGE_SHIFT);
1142 
1143 		mce_log(&m);
1144 
1145 		if (severity > worst) {
1146 			*final = m;
1147 			worst = severity;
1148 		}
1149 	}
1150 
1151 	/* mce_clear_state will clear *final, save locally for use later */
1152 	m = *final;
1153 
1154 	if (!no_way_out)
1155 		mce_clear_state(toclear);
1156 
1157 	/*
1158 	 * Do most of the synchronization with other CPUs.
1159 	 * When there's any problem use only local no_way_out state.
1160 	 */
1161 	if (mce_end(order) < 0)
1162 		no_way_out = worst >= MCE_PANIC_SEVERITY;
1163 
1164 	/*
1165 	 * At insane "tolerant" levels we take no action. Otherwise
1166 	 * we only die if we have no other choice. For less serious
1167 	 * issues we try to recover, or limit damage to the current
1168 	 * process.
1169 	 */
1170 	if (cfg->tolerant < 3) {
1171 		if (no_way_out)
1172 			mce_panic("Fatal machine check on current CPU", &m, msg);
1173 		if (worst == MCE_AR_SEVERITY) {
1174 			recover_paddr = m.addr;
1175 			if (!(m.mcgstatus & MCG_STATUS_RIPV))
1176 				flags |= MF_MUST_KILL;
1177 		} else if (kill_it) {
1178 			force_sig(SIGBUS, current);
1179 		}
1180 	}
1181 
1182 	if (worst > 0)
1183 		mce_report_event(regs);
1184 	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1185 out:
1186 	sync_core();
1187 
1188 	if (recover_paddr == ~0ull)
1189 		goto done;
1190 
1191 	pr_err("Uncorrected hardware memory error in user-access at %llx",
1192 		 recover_paddr);
1193 	/*
1194 	 * We must call memory_failure() here even if the current process is
1195 	 * doomed. We still need to mark the page as poisoned and alert any
1196 	 * other users of the page.
1197 	 */
1198 	ist_begin_non_atomic(regs);
1199 	local_irq_enable();
1200 	if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
1201 		pr_err("Memory error not recovered");
1202 		force_sig(SIGBUS, current);
1203 	}
1204 	local_irq_disable();
1205 	ist_end_non_atomic();
1206 done:
1207 	ist_exit(regs, prev_state);
1208 }
1209 EXPORT_SYMBOL_GPL(do_machine_check);
1210 
1211 #ifndef CONFIG_MEMORY_FAILURE
1212 int memory_failure(unsigned long pfn, int vector, int flags)
1213 {
1214 	/* mce_severity() should not hand us an ACTION_REQUIRED error */
1215 	BUG_ON(flags & MF_ACTION_REQUIRED);
1216 	pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1217 	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1218 	       pfn);
1219 
1220 	return 0;
1221 }
1222 #endif
1223 
1224 /*
1225  * Action optional processing happens here (picking up
1226  * from the list of faulting pages that do_machine_check()
1227  * placed into the "ring").
1228  */
1229 static void mce_process_work(struct work_struct *dummy)
1230 {
1231 	unsigned long pfn;
1232 
1233 	while (mce_ring_get(&pfn))
1234 		memory_failure(pfn, MCE_VECTOR, 0);
1235 }
1236 
1237 #ifdef CONFIG_X86_MCE_INTEL
1238 /***
1239  * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
1240  * @cpu: The CPU on which the event occurred.
1241  * @status: Event status information
1242  *
1243  * This function should be called by the thermal interrupt after the
1244  * event has been processed and the decision was made to log the event
1245  * further.
1246  *
1247  * The status parameter will be saved to the 'status' field of 'struct mce'
1248  * and historically has been the register value of the
1249  * MSR_IA32_THERMAL_STATUS (Intel) msr.
1250  */
1251 void mce_log_therm_throt_event(__u64 status)
1252 {
1253 	struct mce m;
1254 
1255 	mce_setup(&m);
1256 	m.bank = MCE_THERMAL_BANK;
1257 	m.status = status;
1258 	mce_log(&m);
1259 }
1260 #endif /* CONFIG_X86_MCE_INTEL */
1261 
1262 /*
1263  * Periodic polling timer for "silent" machine check errors.  If the
1264  * poller finds an MCE, poll 2x faster.  When the poller finds no more
1265  * errors, poll 2x slower (up to check_interval seconds).
1266  */
1267 static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
1268 
1269 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
1270 static DEFINE_PER_CPU(struct timer_list, mce_timer);
1271 
1272 static unsigned long mce_adjust_timer_default(unsigned long interval)
1273 {
1274 	return interval;
1275 }
1276 
1277 static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
1278 
1279 static void __restart_timer(struct timer_list *t, unsigned long interval)
1280 {
1281 	unsigned long when = jiffies + interval;
1282 	unsigned long flags;
1283 
1284 	local_irq_save(flags);
1285 
1286 	if (timer_pending(t)) {
1287 		if (time_before(when, t->expires))
1288 			mod_timer_pinned(t, when);
1289 	} else {
1290 		t->expires = round_jiffies(when);
1291 		add_timer_on(t, smp_processor_id());
1292 	}
1293 
1294 	local_irq_restore(flags);
1295 }
1296 
1297 static void mce_timer_fn(unsigned long data)
1298 {
1299 	struct timer_list *t = this_cpu_ptr(&mce_timer);
1300 	int cpu = smp_processor_id();
1301 	unsigned long iv;
1302 
1303 	WARN_ON(cpu != data);
1304 
1305 	iv = __this_cpu_read(mce_next_interval);
1306 
1307 	if (mce_available(this_cpu_ptr(&cpu_info))) {
1308 		machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
1309 
1310 		if (mce_intel_cmci_poll()) {
1311 			iv = mce_adjust_timer(iv);
1312 			goto done;
1313 		}
1314 	}
1315 
1316 	/*
1317 	 * Alert userspace if needed. If we logged an MCE, reduce the polling
1318 	 * interval, otherwise increase the polling interval.
1319 	 */
1320 	if (mce_notify_irq())
1321 		iv = max(iv / 2, (unsigned long) HZ/100);
1322 	else
1323 		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1324 
1325 done:
1326 	__this_cpu_write(mce_next_interval, iv);
1327 	__restart_timer(t, iv);
1328 }
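
/*
 * Worked example (assuming the default check_interval of 5 minutes): while
 * mce_notify_irq() keeps reporting freshly logged events the poll interval
 * halves, 300s -> 150s -> 75s -> ... down to the HZ/100 floor (10ms at
 * HZ=1000); once polls come back empty it doubles again until it is back at
 * check_interval.
 */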
1329 
1330 /*
1331  * Ensure that the timer is firing in @interval from now.
1332  */
1333 void mce_timer_kick(unsigned long interval)
1334 {
1335 	struct timer_list *t = this_cpu_ptr(&mce_timer);
1336 	unsigned long iv = __this_cpu_read(mce_next_interval);
1337 
1338 	__restart_timer(t, interval);
1339 
1340 	if (interval < iv)
1341 		__this_cpu_write(mce_next_interval, interval);
1342 }
1343 
1344 /* Must not be called in IRQ context where del_timer_sync() can deadlock */
1345 static void mce_timer_delete_all(void)
1346 {
1347 	int cpu;
1348 
1349 	for_each_online_cpu(cpu)
1350 		del_timer_sync(&per_cpu(mce_timer, cpu));
1351 }
1352 
1353 static void mce_do_trigger(struct work_struct *work)
1354 {
1355 	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
1356 }
1357 
1358 static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1359 
1360 /*
1361  * Notify the user(s) about new machine check events.
1362  * Can be called from interrupt context, but not from machine check/NMI
1363  * context.
1364  */
1365 int mce_notify_irq(void)
1366 {
1367 	/* Not more than two messages every minute */
1368 	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1369 
1370 	if (test_and_clear_bit(0, &mce_need_notify)) {
1371 		/* wake processes polling /dev/mcelog */
1372 		wake_up_interruptible(&mce_chrdev_wait);
1373 
1374 		if (mce_helper[0])
1375 			schedule_work(&mce_trigger_work);
1376 
1377 		if (__ratelimit(&ratelimit))
1378 			pr_info(HW_ERR "Machine check events logged\n");
1379 
1380 		return 1;
1381 	}
1382 	return 0;
1383 }
1384 EXPORT_SYMBOL_GPL(mce_notify_irq);
1385 
1386 static int __mcheck_cpu_mce_banks_init(void)
1387 {
1388 	int i;
1389 	u8 num_banks = mca_cfg.banks;
1390 
1391 	mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
1392 	if (!mce_banks)
1393 		return -ENOMEM;
1394 
1395 	for (i = 0; i < num_banks; i++) {
1396 		struct mce_bank *b = &mce_banks[i];
1397 
1398 		b->ctl = -1ULL;
1399 		b->init = 1;
1400 	}
1401 	return 0;
1402 }
1403 
1404 /*
1405  * Initialize Machine Checks for a CPU.
1406  */
1407 static int __mcheck_cpu_cap_init(void)
1408 {
1409 	unsigned b;
1410 	u64 cap;
1411 
1412 	rdmsrl(MSR_IA32_MCG_CAP, cap);
1413 
1414 	b = cap & MCG_BANKCNT_MASK;
1415 	if (!mca_cfg.banks)
1416 		pr_info("CPU supports %d MCE banks\n", b);
1417 
1418 	if (b > MAX_NR_BANKS) {
1419 		pr_warn("Using only %u machine check banks out of %u\n",
1420 			MAX_NR_BANKS, b);
1421 		b = MAX_NR_BANKS;
1422 	}
1423 
1424 	/* Don't support asymmetric configurations today */
1425 	WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
1426 	mca_cfg.banks = b;
1427 
1428 	if (!mce_banks) {
1429 		int err = __mcheck_cpu_mce_banks_init();
1430 
1431 		if (err)
1432 			return err;
1433 	}
1434 
1435 	/* Use accurate RIP reporting if available. */
1436 	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1437 		mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1438 
1439 	if (cap & MCG_SER_P)
1440 		mca_cfg.ser = true;
1441 
1442 	return 0;
1443 }
1444 
1445 static void __mcheck_cpu_init_generic(void)
1446 {
1447 	enum mcp_flags m_fl = 0;
1448 	mce_banks_t all_banks;
1449 	u64 cap;
1450 	int i;
1451 
1452 	if (!mca_cfg.bootlog)
1453 		m_fl = MCP_DONTLOG;
1454 
1455 	/*
1456 	 * Log the machine checks left over from the previous reset.
1457 	 */
1458 	bitmap_fill(all_banks, MAX_NR_BANKS);
1459 	machine_check_poll(MCP_UC | m_fl, &all_banks);
1460 
1461 	cr4_set_bits(X86_CR4_MCE);
1462 
1463 	rdmsrl(MSR_IA32_MCG_CAP, cap);
1464 	if (cap & MCG_CTL_P)
1465 		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1466 
1467 	for (i = 0; i < mca_cfg.banks; i++) {
1468 		struct mce_bank *b = &mce_banks[i];
1469 
1470 		if (!b->init)
1471 			continue;
1472 		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1473 		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1474 	}
1475 }
1476 
1477 /*
1478  * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
1479  * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
1480  * Vol 3B Table 15-20). But this confuses both the code that determines
1481  * whether the machine check occurred in kernel or user mode, and also
1482  * the severity assessment code. Pretend that EIPV was set, and take the
1483  * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
1484  */
1485 static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
1486 {
1487 	if (bank != 0)
1488 		return;
1489 	if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
1490 		return;
1491 	if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
1492 		          MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
1493 			  MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
1494 			  MCACOD)) !=
1495 			 (MCI_STATUS_UC|MCI_STATUS_EN|
1496 			  MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
1497 			  MCI_STATUS_AR|MCACOD_INSTR))
1498 		return;
1499 
1500 	m->mcgstatus |= MCG_STATUS_EIPV;
1501 	m->ip = regs->ip;
1502 	m->cs = regs->cs;
1503 }
1504 
1505 /* Add per CPU specific workarounds here */
1506 static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1507 {
1508 	struct mca_config *cfg = &mca_cfg;
1509 
1510 	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1511 		pr_info("unknown CPU type - not enabling MCE support\n");
1512 		return -EOPNOTSUPP;
1513 	}
1514 
1515 	/* This should be disabled by the BIOS, but isn't always */
1516 	if (c->x86_vendor == X86_VENDOR_AMD) {
1517 		if (c->x86 == 15 && cfg->banks > 4) {
1518 			/*
1519 			 * disable GART TBL walk error reporting, which
1520 			 * trips off incorrectly with the IOMMU & 3ware
1521 			 * & Cerberus:
1522 			 */
1523 			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1524 		}
1525 		if (c->x86 <= 17 && cfg->bootlog < 0) {
1526 			/*
1527 			 * Lots of broken BIOSes around that don't clear them
1528 			 * by default and leave crap in there. Don't log:
1529 			 */
1530 			cfg->bootlog = 0;
1531 		}
1532 		/*
1533 		 * Various K7s with broken bank 0 around. Always disable
1534 		 * by default.
1535 		 */
1536 		if (c->x86 == 6 && cfg->banks > 0)
1537 			mce_banks[0].ctl = 0;
1538 
1539 		/*
1540 		 * overflow_recov is supported for F15h Models 00h-0fh
1541 		 * even though we don't have a CPUID bit for it.
1542 		 */
1543 		if (c->x86 == 0x15 && c->x86_model <= 0xf)
1544 			mce_flags.overflow_recov = 1;
1545 
1546 		/*
1547 		 * Turn off MC4_MISC thresholding banks on those models since
1548 		 * they're not supported there.
1549 		 */
1550 		if (c->x86 == 0x15 &&
1551 		    (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
1552 			int i;
1553 			u64 hwcr;
1554 			bool need_toggle;
1555 			u32 msrs[] = {
1556 				0x00000413, /* MC4_MISC0 */
1557 				0xc0000408, /* MC4_MISC1 */
1558 			};
1559 
1560 			rdmsrl(MSR_K7_HWCR, hwcr);
1561 
1562 			/* McStatusWrEn has to be set */
1563 			need_toggle = !(hwcr & BIT(18));
1564 
1565 			if (need_toggle)
1566 				wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
1567 
1568 			/* Clear CntP bit safely */
1569 			for (i = 0; i < ARRAY_SIZE(msrs); i++)
1570 				msr_clear_bit(msrs[i], 62);
1571 
1572 			/* restore old settings */
1573 			if (need_toggle)
1574 				wrmsrl(MSR_K7_HWCR, hwcr);
1575 		}
1576 	}
1577 
1578 	if (c->x86_vendor == X86_VENDOR_INTEL) {
1579 		/*
1580 		 * SDM documents that on family 6 bank 0 should not be written
1581 		 * because it aliases to another special BIOS controlled
1582 		 * register.
1583 		 * But it's not aliased anymore on model 0x1a+.
1584 		 * Don't ignore bank 0 completely because there could be a
1585 		 * valid event later, merely don't write CTL0.
1586 		 */
1587 
1588 		if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
1589 			mce_banks[0].init = 0;
1590 
1591 		/*
1592 		 * All newer Intel systems support MCE broadcasting. Enable
1593 		 * synchronization with a one second timeout.
1594 		 */
1595 		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1596 			cfg->monarch_timeout < 0)
1597 			cfg->monarch_timeout = USEC_PER_SEC;
1598 
1599 		/*
1600 		 * There are also broken BIOSes on some Pentium M and
1601 		 * earlier systems:
1602 		 */
1603 		if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
1604 			cfg->bootlog = 0;
1605 
1606 		if (c->x86 == 6 && c->x86_model == 45)
1607 			quirk_no_way_out = quirk_sandybridge_ifu;
1608 	}
1609 	if (cfg->monarch_timeout < 0)
1610 		cfg->monarch_timeout = 0;
1611 	if (cfg->bootlog != 0)
1612 		cfg->panic_timeout = 30;
1613 
1614 	return 0;
1615 }
1616 
1617 static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1618 {
1619 	if (c->x86 != 5)
1620 		return 0;
1621 
1622 	switch (c->x86_vendor) {
1623 	case X86_VENDOR_INTEL:
1624 		intel_p5_mcheck_init(c);
1625 		return 1;
1627 	case X86_VENDOR_CENTAUR:
1628 		winchip_mcheck_init(c);
1629 		return 1;
1631 	}
1632 
1633 	return 0;
1634 }
1635 
1636 static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1637 {
1638 	switch (c->x86_vendor) {
1639 	case X86_VENDOR_INTEL:
1640 		mce_intel_feature_init(c);
1641 		mce_adjust_timer = cmci_intel_adjust_timer;
1642 		break;
1643 	case X86_VENDOR_AMD:
1644 		mce_amd_feature_init(c);
1645 		mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
1646 		break;
1647 	default:
1648 		break;
1649 	}
1650 }
1651 
1652 static void mce_start_timer(unsigned int cpu, struct timer_list *t)
1653 {
1654 	unsigned long iv = check_interval * HZ;
1655 
1656 	if (mca_cfg.ignore_ce || !iv)
1657 		return;
1658 
1659 	per_cpu(mce_next_interval, cpu) = iv;
1660 
1661 	t->expires = round_jiffies(jiffies + iv);
1662 	add_timer_on(t, cpu);
1663 }
1664 
1665 static void __mcheck_cpu_init_timer(void)
1666 {
1667 	struct timer_list *t = this_cpu_ptr(&mce_timer);
1668 	unsigned int cpu = smp_processor_id();
1669 
1670 	setup_timer(t, mce_timer_fn, cpu);
1671 	mce_start_timer(cpu, t);
1672 }
1673 
1674 /* Handle unconfigured int18 (should never happen) */
1675 static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1676 {
1677 	pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
1678 	       smp_processor_id());
1679 }
1680 
1681 /* Call the installed machine check handler for this CPU setup. */
1682 void (*machine_check_vector)(struct pt_regs *, long error_code) =
1683 						unexpected_machine_check;
1684 
1685 /*
1686  * Called for each booted CPU to set up machine checks.
1687  * Must be called with preempt off:
1688  */
1689 void mcheck_cpu_init(struct cpuinfo_x86 *c)
1690 {
1691 	if (mca_cfg.disabled)
1692 		return;
1693 
1694 	if (__mcheck_cpu_ancient_init(c))
1695 		return;
1696 
1697 	if (!mce_available(c))
1698 		return;
1699 
1700 	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1701 		mca_cfg.disabled = true;
1702 		return;
1703 	}
1704 
1705 	machine_check_vector = do_machine_check;
1706 
1707 	__mcheck_cpu_init_generic();
1708 	__mcheck_cpu_init_vendor(c);
1709 	__mcheck_cpu_init_timer();
1710 	INIT_WORK(this_cpu_ptr(&mce_work), mce_process_work);
1711 	init_irq_work(this_cpu_ptr(&mce_irq_work), &mce_irq_work_cb);
1712 }
1713 
1714 /*
1715  * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
1716  */
1717 
1718 static DEFINE_SPINLOCK(mce_chrdev_state_lock);
1719 static int mce_chrdev_open_count;	/* #times opened */
1720 static int mce_chrdev_open_exclu;	/* already open exclusive? */
1721 
1722 static int mce_chrdev_open(struct inode *inode, struct file *file)
1723 {
1724 	spin_lock(&mce_chrdev_state_lock);
1725 
1726 	if (mce_chrdev_open_exclu ||
1727 	    (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
1728 		spin_unlock(&mce_chrdev_state_lock);
1729 
1730 		return -EBUSY;
1731 	}
1732 
1733 	if (file->f_flags & O_EXCL)
1734 		mce_chrdev_open_exclu = 1;
1735 	mce_chrdev_open_count++;
1736 
1737 	spin_unlock(&mce_chrdev_state_lock);
1738 
1739 	return nonseekable_open(inode, file);
1740 }
1741 
1742 static int mce_chrdev_release(struct inode *inode, struct file *file)
1743 {
1744 	spin_lock(&mce_chrdev_state_lock);
1745 
1746 	mce_chrdev_open_count--;
1747 	mce_chrdev_open_exclu = 0;
1748 
1749 	spin_unlock(&mce_chrdev_state_lock);
1750 
1751 	return 0;
1752 }
1753 
1754 static void collect_tscs(void *data)
1755 {
1756 	unsigned long *cpu_tsc = (unsigned long *)data;
1757 
1758 	rdtscll(cpu_tsc[smp_processor_id()]);
1759 }
1760 
1761 static int mce_apei_read_done;
1762 
1763 /* Collect MCE record of previous boot in persistent storage via APEI ERST. */
1764 static int __mce_read_apei(char __user **ubuf, size_t usize)
1765 {
1766 	int rc;
1767 	u64 record_id;
1768 	struct mce m;
1769 
1770 	if (usize < sizeof(struct mce))
1771 		return -EINVAL;
1772 
1773 	rc = apei_read_mce(&m, &record_id);
1774 	/* Error or no more MCE record */
1775 	if (rc <= 0) {
1776 		mce_apei_read_done = 1;
1777 		/*
1778 		 * When ERST is disabled, mce_chrdev_read() should return
1779 		 * "no record" instead of "no device."
1780 		 */
1781 		if (rc == -ENODEV)
1782 			return 0;
1783 		return rc;
1784 	}
1785 	rc = -EFAULT;
1786 	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1787 		return rc;
1788 	/*
1789 	 * In fact, we should clear the record only after it has
1790 	 * been flushed to disk or sent to the network by
1791 	 * /sbin/mcelog, but we have no interface to support that now,
1792 	 * so just clear it to avoid duplication.
1793 	 */
1794 	rc = apei_clear_mce(record_id);
1795 	if (rc) {
1796 		mce_apei_read_done = 1;
1797 		return rc;
1798 	}
1799 	*ubuf += sizeof(struct mce);
1800 
1801 	return 0;
1802 }
1803 
1804 static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
1805 				size_t usize, loff_t *off)
1806 {
1807 	char __user *buf = ubuf;
1808 	unsigned long *cpu_tsc;
1809 	unsigned prev, next;
1810 	int i, err;
1811 
1812 	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1813 	if (!cpu_tsc)
1814 		return -ENOMEM;
1815 
1816 	mutex_lock(&mce_chrdev_read_mutex);
1817 
1818 	if (!mce_apei_read_done) {
1819 		err = __mce_read_apei(&buf, usize);
1820 		if (err || buf != ubuf)
1821 			goto out;
1822 	}
1823 
1824 	next = rcu_dereference_check_mce(mcelog.next);
1825 
1826 	/* Only supports full reads right now */
1827 	err = -EINVAL;
1828 	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
1829 		goto out;
1830 
1831 	err = 0;
1832 	prev = 0;
1833 	do {
1834 		for (i = prev; i < next; i++) {
1835 			unsigned long start = jiffies;
1836 			struct mce *m = &mcelog.entry[i];
1837 
1838 			while (!m->finished) {
1839 				if (time_after_eq(jiffies, start + 2)) {
1840 					memset(m, 0, sizeof(*m));
1841 					goto timeout;
1842 				}
1843 				cpu_relax();
1844 			}
1845 			smp_rmb();
1846 			err |= copy_to_user(buf, m, sizeof(*m));
1847 			buf += sizeof(*m);
1848 timeout:
1849 			;
1850 		}
1851 
1852 		memset(mcelog.entry + prev, 0,
1853 		       (next - prev) * sizeof(struct mce));
1854 		prev = next;
1855 		next = cmpxchg(&mcelog.next, prev, 0);
1856 	} while (next != prev);
1857 
1858 	synchronize_sched();
1859 
1860 	/*
1861 	 * Collect entries that were still getting written before the
1862 	 * synchronize.
1863 	 */
1864 	on_each_cpu(collect_tscs, cpu_tsc, 1);
1865 
1866 	for (i = next; i < MCE_LOG_LEN; i++) {
1867 		struct mce *m = &mcelog.entry[i];
1868 
1869 		if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
1870 			err |= copy_to_user(buf, m, sizeof(*m));
1871 			smp_rmb();
1872 			buf += sizeof(*m);
1873 			memset(m, 0, sizeof(*m));
1874 		}
1875 	}
1876 
1877 	if (err)
1878 		err = -EFAULT;
1879 
1880 out:
1881 	mutex_unlock(&mce_chrdev_read_mutex);
1882 	kfree(cpu_tsc);
1883 
1884 	return err ? err : buf - ubuf;
1885 }
1886 
1887 static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
1888 {
1889 	poll_wait(file, &mce_chrdev_wait, wait);
1890 	if (rcu_access_index(mcelog.next))
1891 		return POLLIN | POLLRDNORM;
1892 	if (!mce_apei_read_done && apei_check_mce())
1893 		return POLLIN | POLLRDNORM;
1894 	return 0;
1895 }
1896 
1897 static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
1898 				unsigned long arg)
1899 {
1900 	int __user *p = (int __user *)arg;
1901 
1902 	if (!capable(CAP_SYS_ADMIN))
1903 		return -EPERM;
1904 
1905 	switch (cmd) {
1906 	case MCE_GET_RECORD_LEN:
1907 		return put_user(sizeof(struct mce), p);
1908 	case MCE_GET_LOG_LEN:
1909 		return put_user(MCE_LOG_LEN, p);
1910 	case MCE_GETCLEAR_FLAGS: {
1911 		unsigned flags;
1912 
1913 		do {
1914 			flags = mcelog.flags;
1915 		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1916 
1917 		return put_user(flags, p);
1918 	}
1919 	default:
1920 		return -ENOTTY;
1921 	}
1922 }
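/*
 * Minimal userspace sketch (illustrative only, not part of this file):
 * the /dev/mcelog read interface accepts only full-log reads, so a
 * consumer such as mcelog would size its buffer from the ioctls above
 * and read everything in a single call:
 *
 *	int fd = open("/dev/mcelog", O_RDONLY);
 *	int recsz = 0, loglen = 0;
 *	ioctl(fd, MCE_GET_RECORD_LEN, &recsz);
 *	ioctl(fd, MCE_GET_LOG_LEN, &loglen);
 *	char *buf = malloc((size_t)recsz * loglen);
 *	ssize_t n = read(fd, buf, (size_t)recsz * loglen);
 *	// n / recsz finished records were returned, each a struct mce
 */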
1923 
1924 static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
1925 			    size_t usize, loff_t *off);
1926 
1927 void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
1928 			     const char __user *ubuf,
1929 			     size_t usize, loff_t *off))
1930 {
1931 	mce_write = fn;
1932 }
1933 EXPORT_SYMBOL_GPL(register_mce_write_callback);
1934 
1935 ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
1936 			 size_t usize, loff_t *off)
1937 {
1938 	if (mce_write)
1939 		return mce_write(filp, ubuf, usize, off);
1940 	else
1941 		return -EINVAL;
1942 }
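/*
 * Hedged sketch of how a caller might hook writes to /dev/mcelog.
 * my_inject_write is a hypothetical name; an error-injection module
 * would register its own handler through the exported helper above:
 *
 *	static ssize_t my_inject_write(struct file *filp,
 *				       const char __user *ubuf,
 *				       size_t usize, loff_t *off)
 *	{
 *		struct mce m;
 *
 *		if (usize < sizeof(m) || copy_from_user(&m, ubuf, sizeof(m)))
 *			return -EFAULT;
 *		// ...queue or raise the injected error...
 *		return usize;
 *	}
 *
 *	register_mce_write_callback(my_inject_write);
 */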
1943 
1944 static const struct file_operations mce_chrdev_ops = {
1945 	.open			= mce_chrdev_open,
1946 	.release		= mce_chrdev_release,
1947 	.read			= mce_chrdev_read,
1948 	.write			= mce_chrdev_write,
1949 	.poll			= mce_chrdev_poll,
1950 	.unlocked_ioctl		= mce_chrdev_ioctl,
1951 	.llseek			= no_llseek,
1952 };
1953 
1954 static struct miscdevice mce_chrdev_device = {
1955 	MISC_MCELOG_MINOR,
1956 	"mcelog",
1957 	&mce_chrdev_ops,
1958 };
1959 
1960 static void __mce_disable_bank(void *arg)
1961 {
1962 	int bank = *((int *)arg);
1963 	__clear_bit(bank, this_cpu_ptr(mce_poll_banks));
1964 	cmci_disable_bank(bank);
1965 }
1966 
1967 void mce_disable_bank(int bank)
1968 {
1969 	if (bank >= mca_cfg.banks) {
1970 		pr_warn(FW_BUG
1971 			"Ignoring request to disable invalid MCA bank %d.\n",
1972 			bank);
1973 		return;
1974 	}
1975 	set_bit(bank, mce_banks_ce_disabled);
1976 	on_each_cpu(__mce_disable_bank, &bank, 1);
1977 }
1978 
1979 /*
1980  * mce=off Disables machine check
1981  * mce=no_cmci Disables CMCI
1982  * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1983  * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1984  * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1985  *	monarchtimeout is how long to wait for other CPUs on machine
1986  *	check, or 0 to not wait
1987  * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
1988  * mce=nobootlog Don't log MCEs from before booting.
1989  * mce=bios_cmci_threshold Don't program the CMCI threshold
1990  */
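/*
 * Illustrative command lines (example values only): each option is
 * passed as its own "mce=" parameter, e.g. "mce=no_cmci mce=dont_log_ce",
 * while a leading digit selects the numeric form handled below, e.g.
 * "mce=2,500000" would set tolerant=2 and monarch_timeout=500000.
 */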
1991 static int __init mcheck_enable(char *str)
1992 {
1993 	struct mca_config *cfg = &mca_cfg;
1994 
1995 	if (*str == 0) {
1996 		enable_p5_mce();
1997 		return 1;
1998 	}
1999 	if (*str == '=')
2000 		str++;
2001 	if (!strcmp(str, "off"))
2002 		cfg->disabled = true;
2003 	else if (!strcmp(str, "no_cmci"))
2004 		cfg->cmci_disabled = true;
2005 	else if (!strcmp(str, "dont_log_ce"))
2006 		cfg->dont_log_ce = true;
2007 	else if (!strcmp(str, "ignore_ce"))
2008 		cfg->ignore_ce = true;
2009 	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
2010 		cfg->bootlog = (str[0] == 'b');
2011 	else if (!strcmp(str, "bios_cmci_threshold"))
2012 		cfg->bios_cmci_threshold = true;
2013 	else if (isdigit(str[0])) {
2014 		get_option(&str, &(cfg->tolerant));
2015 		if (*str == ',') {
2016 			++str;
2017 			get_option(&str, &(cfg->monarch_timeout));
2018 		}
2019 	} else {
2020 		pr_info("mce argument %s ignored. Please use /sys\n", str);
2021 		return 0;
2022 	}
2023 	return 1;
2024 }
2025 __setup("mce", mcheck_enable);
2026 
2027 int __init mcheck_init(void)
2028 {
2029 	mcheck_intel_therm_init();
2030 	mcheck_vendor_init_severity();
2031 
2032 	return 0;
2033 }
2034 
2035 /*
2036  * mce_syscore: PM support
2037  */
2038 
2039 /*
2040  * Disable machine checks on suspend and shutdown. We can't really handle
2041  * them later.
2042  */
2043 static int mce_disable_error_reporting(void)
2044 {
2045 	int i;
2046 
2047 	for (i = 0; i < mca_cfg.banks; i++) {
2048 		struct mce_bank *b = &mce_banks[i];
2049 
2050 		if (b->init)
2051 			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
2052 	}
2053 	return 0;
2054 }
2055 
2056 static int mce_syscore_suspend(void)
2057 {
2058 	return mce_disable_error_reporting();
2059 }
2060 
2061 static void mce_syscore_shutdown(void)
2062 {
2063 	mce_disable_error_reporting();
2064 }
2065 
2066 /*
2067  * On resume, clear all MCE state so we don't see leftovers from the BIOS.
2068  * Only one CPU is active at this point; the others are re-added later
2069  * through CPU hotplug.
2070  */
2071 static void mce_syscore_resume(void)
2072 {
2073 	__mcheck_cpu_init_generic();
2074 	__mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
2075 }
2076 
2077 static struct syscore_ops mce_syscore_ops = {
2078 	.suspend	= mce_syscore_suspend,
2079 	.shutdown	= mce_syscore_shutdown,
2080 	.resume		= mce_syscore_resume,
2081 };
2082 
2083 /*
2084  * mce_device: Sysfs support
2085  */
2086 
2087 static void mce_cpu_restart(void *data)
2088 {
2089 	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2090 		return;
2091 	__mcheck_cpu_init_generic();
2092 	__mcheck_cpu_init_timer();
2093 }
2094 
2095 /* Reinit MCEs after user configuration changes */
2096 static void mce_restart(void)
2097 {
2098 	mce_timer_delete_all();
2099 	on_each_cpu(mce_cpu_restart, NULL, 1);
2100 }
2101 
2102 /* Toggle features for corrected errors */
2103 static void mce_disable_cmci(void *data)
2104 {
2105 	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2106 		return;
2107 	cmci_clear();
2108 }
2109 
2110 static void mce_enable_ce(void *all)
2111 {
2112 	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2113 		return;
2114 	cmci_reenable();
2115 	cmci_recheck();
2116 	if (all)
2117 		__mcheck_cpu_init_timer();
2118 }
2119 
2120 static struct bus_type mce_subsys = {
2121 	.name		= "machinecheck",
2122 	.dev_name	= "machinecheck",
2123 };
2124 
2125 DEFINE_PER_CPU(struct device *, mce_device);
2126 
2127 void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
2128 
2129 static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
2130 {
2131 	return container_of(attr, struct mce_bank, attr);
2132 }
2133 
2134 static ssize_t show_bank(struct device *s, struct device_attribute *attr,
2135 			 char *buf)
2136 {
2137 	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
2138 }
2139 
2140 static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2141 			const char *buf, size_t size)
2142 {
2143 	u64 new;
2144 
2145 	if (kstrtou64(buf, 0, &new) < 0)
2146 		return -EINVAL;
2147 
2148 	attr_to_bank(attr)->ctl = new;
2149 	mce_restart();
2150 
2151 	return size;
2152 }
2153 
2154 static ssize_t
2155 show_trigger(struct device *s, struct device_attribute *attr, char *buf)
2156 {
2157 	strcpy(buf, mce_helper);
2158 	strcat(buf, "\n");
2159 	return strlen(mce_helper) + 1;
2160 }
2161 
2162 static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
2163 				const char *buf, size_t siz)
2164 {
2165 	char *p;
2166 
2167 	strncpy(mce_helper, buf, sizeof(mce_helper));
2168 	mce_helper[sizeof(mce_helper)-1] = 0;
2169 	p = strchr(mce_helper, '\n');
2170 
2171 	if (p)
2172 		*p = 0;
2173 
2174 	return strlen(mce_helper) + !!p;
2175 }
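/*
 * Usage sketch (path assumed from the "machinecheck" subsystem name
 * registered below; verify on the running system). /usr/sbin/mce-handler
 * is an example path, not a shipped binary:
 *
 *	echo /usr/sbin/mce-handler > \
 *		/sys/devices/system/machinecheck/machinecheck0/trigger
 *
 * Writing an empty line clears the trigger again.
 */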
2176 
2177 static ssize_t set_ignore_ce(struct device *s,
2178 			     struct device_attribute *attr,
2179 			     const char *buf, size_t size)
2180 {
2181 	u64 new;
2182 
2183 	if (kstrtou64(buf, 0, &new) < 0)
2184 		return -EINVAL;
2185 
2186 	if (mca_cfg.ignore_ce ^ !!new) {
2187 		if (new) {
2188 			/* disable ce features */
2189 			mce_timer_delete_all();
2190 			on_each_cpu(mce_disable_cmci, NULL, 1);
2191 			mca_cfg.ignore_ce = true;
2192 		} else {
2193 			/* enable ce features */
2194 			mca_cfg.ignore_ce = false;
2195 			on_each_cpu(mce_enable_ce, (void *)1, 1);
2196 		}
2197 	}
2198 	return size;
2199 }
2200 
2201 static ssize_t set_cmci_disabled(struct device *s,
2202 				 struct device_attribute *attr,
2203 				 const char *buf, size_t size)
2204 {
2205 	u64 new;
2206 
2207 	if (kstrtou64(buf, 0, &new) < 0)
2208 		return -EINVAL;
2209 
2210 	if (mca_cfg.cmci_disabled ^ !!new) {
2211 		if (new) {
2212 			/* disable cmci */
2213 			on_each_cpu(mce_disable_cmci, NULL, 1);
2214 			mca_cfg.cmci_disabled = true;
2215 		} else {
2216 			/* enable cmci */
2217 			mca_cfg.cmci_disabled = false;
2218 			on_each_cpu(mce_enable_ce, NULL, 1);
2219 		}
2220 	}
2221 	return size;
2222 }
2223 
2224 static ssize_t store_int_with_restart(struct device *s,
2225 				      struct device_attribute *attr,
2226 				      const char *buf, size_t size)
2227 {
2228 	ssize_t ret = device_store_int(s, attr, buf, size);
2229 	mce_restart();
2230 	return ret;
2231 }
2232 
2233 static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
2234 static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
2235 static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
2236 static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
2237 
2238 static struct dev_ext_attribute dev_attr_check_interval = {
2239 	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
2240 	&check_interval
2241 };
2242 
2243 static struct dev_ext_attribute dev_attr_ignore_ce = {
2244 	__ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
2245 	&mca_cfg.ignore_ce
2246 };
2247 
2248 static struct dev_ext_attribute dev_attr_cmci_disabled = {
2249 	__ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
2250 	&mca_cfg.cmci_disabled
2251 };
2252 
2253 static struct device_attribute *mce_device_attrs[] = {
2254 	&dev_attr_tolerant.attr,
2255 	&dev_attr_check_interval.attr,
2256 	&dev_attr_trigger,
2257 	&dev_attr_monarch_timeout.attr,
2258 	&dev_attr_dont_log_ce.attr,
2259 	&dev_attr_ignore_ce.attr,
2260 	&dev_attr_cmci_disabled.attr,
2261 	NULL
2262 };
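/*
 * Sysfs tuning sketch (paths assumed, values illustrative): the
 * attributes above appear under each per-CPU machinecheck device, e.g.
 *
 *	echo 300 > /sys/devices/system/machinecheck/machinecheck0/check_interval
 *	echo 1   > /sys/devices/system/machinecheck/machinecheck0/ignore_ce
 *	echo 0   > /sys/devices/system/machinecheck/machinecheck0/cmci_disabled
 *
 * Writes to ignore_ce and cmci_disabled update the global mca_cfg state
 * and are propagated to all CPUs via on_each_cpu() in the handlers above.
 */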
2263 
2264 static cpumask_var_t mce_device_initialized;
2265 
2266 static void mce_device_release(struct device *dev)
2267 {
2268 	kfree(dev);
2269 }
2270 
2271 /* Per-CPU device init. All CPUs still share the same bank control settings: */
2272 static int mce_device_create(unsigned int cpu)
2273 {
2274 	struct device *dev;
2275 	int err;
2276 	int i, j;
2277 
2278 	if (!mce_available(&boot_cpu_data))
2279 		return -EIO;
2280 
2281 	dev = kzalloc(sizeof *dev, GFP_KERNEL);
2282 	if (!dev)
2283 		return -ENOMEM;
2284 	dev->id  = cpu;
2285 	dev->bus = &mce_subsys;
2286 	dev->release = &mce_device_release;
2287 
2288 	err = device_register(dev);
2289 	if (err) {
2290 		put_device(dev);
2291 		return err;
2292 	}
2293 
2294 	for (i = 0; mce_device_attrs[i]; i++) {
2295 		err = device_create_file(dev, mce_device_attrs[i]);
2296 		if (err)
2297 			goto error;
2298 	}
2299 	for (j = 0; j < mca_cfg.banks; j++) {
2300 		err = device_create_file(dev, &mce_banks[j].attr);
2301 		if (err)
2302 			goto error2;
2303 	}
2304 	cpumask_set_cpu(cpu, mce_device_initialized);
2305 	per_cpu(mce_device, cpu) = dev;
2306 
2307 	return 0;
2308 error2:
2309 	while (--j >= 0)
2310 		device_remove_file(dev, &mce_banks[j].attr);
2311 error:
2312 	while (--i >= 0)
2313 		device_remove_file(dev, mce_device_attrs[i]);
2314 
2315 	device_unregister(dev);
2316 
2317 	return err;
2318 }
2319 
2320 static void mce_device_remove(unsigned int cpu)
2321 {
2322 	struct device *dev = per_cpu(mce_device, cpu);
2323 	int i;
2324 
2325 	if (!cpumask_test_cpu(cpu, mce_device_initialized))
2326 		return;
2327 
2328 	for (i = 0; mce_device_attrs[i]; i++)
2329 		device_remove_file(dev, mce_device_attrs[i]);
2330 
2331 	for (i = 0; i < mca_cfg.banks; i++)
2332 		device_remove_file(dev, &mce_banks[i].attr);
2333 
2334 	device_unregister(dev);
2335 	cpumask_clear_cpu(cpu, mce_device_initialized);
2336 	per_cpu(mce_device, cpu) = NULL;
2337 }
2338 
2339 /* Make sure there are no machine checks on offlined CPUs. */
2340 static void mce_disable_cpu(void *h)
2341 {
2342 	unsigned long action = *(unsigned long *)h;
2343 	int i;
2344 
2345 	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2346 		return;
2347 
2348 	if (!(action & CPU_TASKS_FROZEN))
2349 		cmci_clear();
2350 	for (i = 0; i < mca_cfg.banks; i++) {
2351 		struct mce_bank *b = &mce_banks[i];
2352 
2353 		if (b->init)
2354 			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
2355 	}
2356 }
2357 
2358 static void mce_reenable_cpu(void *h)
2359 {
2360 	unsigned long action = *(unsigned long *)h;
2361 	int i;
2362 
2363 	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2364 		return;
2365 
2366 	if (!(action & CPU_TASKS_FROZEN))
2367 		cmci_reenable();
2368 	for (i = 0; i < mca_cfg.banks; i++) {
2369 		struct mce_bank *b = &mce_banks[i];
2370 
2371 		if (b->init)
2372 			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
2373 	}
2374 }
2375 
2376 /* Get notified when a cpu comes on/off. Be hotplug friendly. */
2377 static int
2378 mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2379 {
2380 	unsigned int cpu = (unsigned long)hcpu;
2381 	struct timer_list *t = &per_cpu(mce_timer, cpu);
2382 
2383 	switch (action & ~CPU_TASKS_FROZEN) {
2384 	case CPU_ONLINE:
2385 		mce_device_create(cpu);
2386 		if (threshold_cpu_callback)
2387 			threshold_cpu_callback(action, cpu);
2388 		break;
2389 	case CPU_DEAD:
2390 		if (threshold_cpu_callback)
2391 			threshold_cpu_callback(action, cpu);
2392 		mce_device_remove(cpu);
2393 		mce_intel_hcpu_update(cpu);
2394 
2395 		/* intentionally ignoring frozen here */
2396 		if (!(action & CPU_TASKS_FROZEN))
2397 			cmci_rediscover();
2398 		break;
2399 	case CPU_DOWN_PREPARE:
2400 		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2401 		del_timer_sync(t);
2402 		break;
2403 	case CPU_DOWN_FAILED:
2404 		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2405 		mce_start_timer(cpu, t);
2406 		break;
2407 	}
2408 
2409 	return NOTIFY_OK;
2410 }
2411 
2412 static struct notifier_block mce_cpu_notifier = {
2413 	.notifier_call = mce_cpu_callback,
2414 };
2415 
2416 static __init void mce_init_banks(void)
2417 {
2418 	int i;
2419 
2420 	for (i = 0; i < mca_cfg.banks; i++) {
2421 		struct mce_bank *b = &mce_banks[i];
2422 		struct device_attribute *a = &b->attr;
2423 
2424 		sysfs_attr_init(&a->attr);
2425 		a->attr.name	= b->attrname;
2426 		snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2427 
2428 		a->attr.mode	= 0644;
2429 		a->show		= show_bank;
2430 		a->store	= set_bank;
2431 	}
2432 }
2433 
2434 static __init int mcheck_init_device(void)
2435 {
2436 	int err;
2437 	int i = 0;
2438 
2439 	if (!mce_available(&boot_cpu_data)) {
2440 		err = -EIO;
2441 		goto err_out;
2442 	}
2443 
2444 	if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2445 		err = -ENOMEM;
2446 		goto err_out;
2447 	}
2448 
2449 	mce_init_banks();
2450 
2451 	err = subsys_system_register(&mce_subsys, NULL);
2452 	if (err)
2453 		goto err_out_mem;
2454 
2455 	cpu_notifier_register_begin();
2456 	for_each_online_cpu(i) {
2457 		err = mce_device_create(i);
2458 		if (err) {
2459 			/*
2460 			 * Register the notifier anyway (and never unregister
2461 			 * it) so that we don't leave stale timers behind; see
2462 			 * the notifier callback above.
2463 			 */
2464 			__register_hotcpu_notifier(&mce_cpu_notifier);
2465 			cpu_notifier_register_done();
2466 			goto err_device_create;
2467 		}
2468 	}
2469 
2470 	__register_hotcpu_notifier(&mce_cpu_notifier);
2471 	cpu_notifier_register_done();
2472 
2473 	register_syscore_ops(&mce_syscore_ops);
2474 
2475 	/* register character device /dev/mcelog */
2476 	err = misc_register(&mce_chrdev_device);
2477 	if (err)
2478 		goto err_register;
2479 
2480 	return 0;
2481 
2482 err_register:
2483 	unregister_syscore_ops(&mce_syscore_ops);
2484 
2485 err_device_create:
2486 	/*
2487 	 * We didn't keep track of which devices were created above, but
2488 	 * even if we had, the set of online cpus might have changed.
2489 	 * Play it safe and remove the device for every possible cpu, since
2490 	 * mce_device_remove() will do the right thing.
2491 	 */
2492 	for_each_possible_cpu(i)
2493 		mce_device_remove(i);
2494 
2495 err_out_mem:
2496 	free_cpumask_var(mce_device_initialized);
2497 
2498 err_out:
2499 	pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
2500 
2501 	return err;
2502 }
2503 device_initcall_sync(mcheck_init_device);
2504 
2505 /*
2506  * Old style boot options parsing. Only for compatibility.
2507  */
2508 static int __init mcheck_disable(char *str)
2509 {
2510 	mca_cfg.disabled = true;
2511 	return 1;
2512 }
2513 __setup("nomce", mcheck_disable);
2514 
2515 #ifdef CONFIG_DEBUG_FS
2516 struct dentry *mce_get_debugfs_dir(void)
2517 {
2518 	static struct dentry *dmce;
2519 
2520 	if (!dmce)
2521 		dmce = debugfs_create_dir("mce", NULL);
2522 
2523 	return dmce;
2524 }
2525 
2526 static void mce_reset(void)
2527 {
2528 	cpu_missing = 0;
2529 	atomic_set(&mce_fake_panicked, 0);
2530 	atomic_set(&mce_executing, 0);
2531 	atomic_set(&mce_callin, 0);
2532 	atomic_set(&global_nwo, 0);
2533 }
2534 
2535 static int fake_panic_get(void *data, u64 *val)
2536 {
2537 	*val = fake_panic;
2538 	return 0;
2539 }
2540 
2541 static int fake_panic_set(void *data, u64 val)
2542 {
2543 	mce_reset();
2544 	fake_panic = val;
2545 	return 0;
2546 }
2547 
2548 DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2549 			fake_panic_set, "%llu\n");
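/*
 * Debugfs sketch (assumes debugfs is mounted at its usual location,
 * /sys/kernel/debug): the file created below reports the current
 * fake_panic value, and fake_panic_set() above also resets the rendezvous
 * counters via mce_reset():
 *
 *	cat /sys/kernel/debug/mce/fake_panic
 */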
2550 
2551 static int __init mcheck_debugfs_init(void)
2552 {
2553 	struct dentry *dmce, *ffake_panic;
2554 
2555 	dmce = mce_get_debugfs_dir();
2556 	if (!dmce)
2557 		return -ENOMEM;
2558 	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
2559 					  &fake_panic_fops);
2560 	if (!ffake_panic)
2561 		return -ENOMEM;
2562 
2563 	return 0;
2564 }
2565 late_initcall(mcheck_debugfs_init);
2566 #endif
2567