1/*
2 * Machine check handler.
3 *
4 * K8 parts Copyright 2002,2003 Andi Kleen, SuSE Labs.
5 * Rest from unknown author(s).
6 * 2004 Andi Kleen. Rewrote most of it.
7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen
9 */
10
11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12
13#include <linux/thread_info.h>
14#include <linux/capability.h>
15#include <linux/miscdevice.h>
16#include <linux/ratelimit.h>
17#include <linux/kallsyms.h>
18#include <linux/rcupdate.h>
19#include <linux/kobject.h>
20#include <linux/uaccess.h>
21#include <linux/kdebug.h>
22#include <linux/kernel.h>
23#include <linux/percpu.h>
24#include <linux/string.h>
25#include <linux/device.h>
26#include <linux/syscore_ops.h>
27#include <linux/delay.h>
28#include <linux/ctype.h>
29#include <linux/sched.h>
30#include <linux/sysfs.h>
31#include <linux/types.h>
32#include <linux/slab.h>
33#include <linux/init.h>
34#include <linux/kmod.h>
35#include <linux/poll.h>
36#include <linux/nmi.h>
37#include <linux/cpu.h>
38#include <linux/smp.h>
39#include <linux/fs.h>
40#include <linux/mm.h>
41#include <linux/debugfs.h>
42#include <linux/irq_work.h>
43#include <linux/export.h>
44
45#include <asm/processor.h>
46#include <asm/traps.h>
47#include <asm/tlbflush.h>
48#include <asm/mce.h>
49#include <asm/msr.h>
50
51#include "mce-internal.h"
52
53static DEFINE_MUTEX(mce_chrdev_read_mutex);
54
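/*
 * mcelog.next is read locklessly from several contexts; this wrapper
 * documents when such a read is legal: from a sched-RCU read-side
 * section or with mce_chrdev_read_mutex held.
 */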
55#define rcu_dereference_check_mce(p) \
56	rcu_dereference_index_check((p), \
57			      rcu_read_lock_sched_held() || \
58			      lockdep_is_held(&mce_chrdev_read_mutex))
59
60#define CREATE_TRACE_POINTS
61#include <trace/events/mce.h>
62
63#define SPINUNIT		100	/* 100ns */
64
65DEFINE_PER_CPU(unsigned, mce_exception_count);
66
67struct mce_bank *mce_banks __read_mostly;
68struct mce_vendor_flags mce_flags __read_mostly;
69
70struct mca_config mca_cfg __read_mostly = {
71	.bootlog  = -1,
72	/*
73	 * Tolerant levels:
74	 * 0: always panic on uncorrected errors, log corrected errors
75	 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
76	 * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors
77	 * 3: never panic or SIGBUS, log all errors (for testing only)
78	 */
79	.tolerant = 1,
80	.monarch_timeout = -1
81};
82
83/* User mode helper program triggered by machine check event */
84static unsigned long		mce_need_notify;
85static char			mce_helper[128];
86static char			*mce_helper_argv[2] = { mce_helper, NULL };
87
88static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
89
90static DEFINE_PER_CPU(struct mce, mces_seen);
91static int			cpu_missing;
92
93/*
94 * MCA banks polled by the period polling timer for corrected events.
95 * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
96 */
97DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
98	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
99};
100
101/*
102 * MCA banks controlled through firmware first for corrected errors.
103 * This is a global list of banks for which we won't enable CMCI and we
104 * won't poll. Firmware controls these banks and is responsible for
105 * reporting corrected errors through GHES. Uncorrected/recoverable
106 * errors are still notified through a machine check.
107 */
108mce_banks_t mce_banks_ce_disabled;
109
110static DEFINE_PER_CPU(struct work_struct, mce_work);
111
112static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
113
114/*
115 * CPU/chipset specific EDAC code can register a notifier call here to print
116 * MCE errors in a human-readable form.
117 */
118static ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
119
/* Do basic initialization of a struct mce */
121void mce_setup(struct mce *m)
122{
123	memset(m, 0, sizeof(struct mce));
124	m->cpu = m->extcpu = smp_processor_id();
125	rdtscll(m->tsc);
126	/* We hope get_seconds stays lockless */
127	m->time = get_seconds();
128	m->cpuvendor = boot_cpu_data.x86_vendor;
129	m->cpuid = cpuid_eax(1);
130	m->socketid = cpu_data(m->extcpu).phys_proc_id;
131	m->apicid = cpu_data(m->extcpu).initial_apicid;
132	rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
133}
134
135DEFINE_PER_CPU(struct mce, injectm);
136EXPORT_PER_CPU_SYMBOL_GPL(injectm);
137
138/*
139 * Lockless MCE logging infrastructure.
 * This avoids deadlocks on printk locks without having to break locks. It also
 * keeps MCEs separate from ordinary kernel messages to avoid bogus bug reports.
142 */
143
144static struct mce_log mcelog = {
145	.signature	= MCE_LOG_SIGNATURE,
146	.len		= MCE_LOG_LEN,
147	.recordlen	= sizeof(struct mce),
148};
149
150void mce_log(struct mce *mce)
151{
152	unsigned next, entry;
153
154	/* Emit the trace record: */
155	trace_mce_record(mce);
156
157	atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
158
159	mce->finished = 0;
160	wmb();
161	for (;;) {
162		entry = rcu_dereference_check_mce(mcelog.next);
163		for (;;) {
164
165			/*
166			 * When the buffer fills up discard new entries.
167			 * Assume that the earlier errors are the more
168			 * interesting ones:
169			 */
170			if (entry >= MCE_LOG_LEN) {
171				set_bit(MCE_OVERFLOW,
172					(unsigned long *)&mcelog.flags);
173				return;
174			}
175			/* Old left over entry. Skip: */
176			if (mcelog.entry[entry].finished) {
177				entry++;
178				continue;
179			}
180			break;
181		}
182		smp_rmb();
183		next = entry + 1;
184		if (cmpxchg(&mcelog.next, entry, next) == entry)
185			break;
186	}
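	/*
	 * The slot at 'entry' is now ours. Copy the record in first and
	 * only then set ->finished, so readers never see a partial entry.
	 */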
187	memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
188	wmb();
189	mcelog.entry[entry].finished = 1;
190	wmb();
191
192	mce->finished = 1;
193	set_bit(0, &mce_need_notify);
194}
195
196static void drain_mcelog_buffer(void)
197{
198	unsigned int next, i, prev = 0;
199
200	next = ACCESS_ONCE(mcelog.next);
201
202	do {
203		struct mce *m;
204
205		/* drain what was logged during boot */
206		for (i = prev; i < next; i++) {
207			unsigned long start = jiffies;
208			unsigned retries = 1;
209
210			m = &mcelog.entry[i];
211
212			while (!m->finished) {
213				if (time_after_eq(jiffies, start + 2*retries))
214					retries++;
215
216				cpu_relax();
217
218				if (!m->finished && retries >= 4) {
219					pr_err("skipping error being logged currently!\n");
220					break;
221				}
222			}
223			smp_rmb();
224			atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
225		}
226
227		memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
228		prev = next;
229		next = cmpxchg(&mcelog.next, prev, 0);
230	} while (next != prev);
231}
232
233
234void mce_register_decode_chain(struct notifier_block *nb)
235{
236	atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
237	drain_mcelog_buffer();
238}
239EXPORT_SYMBOL_GPL(mce_register_decode_chain);
240
241void mce_unregister_decode_chain(struct notifier_block *nb)
242{
243	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
244}
245EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
246
247static void print_mce(struct mce *m)
248{
249	int ret = 0;
250
251	pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
252	       m->extcpu, m->mcgstatus, m->bank, m->status);
253
254	if (m->ip) {
255		pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
256			!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
257				m->cs, m->ip);
258
259		if (m->cs == __KERNEL_CS)
260			print_symbol("{%s}", m->ip);
261		pr_cont("\n");
262	}
263
264	pr_emerg(HW_ERR "TSC %llx ", m->tsc);
265	if (m->addr)
266		pr_cont("ADDR %llx ", m->addr);
267	if (m->misc)
268		pr_cont("MISC %llx ", m->misc);
269
270	pr_cont("\n");
271	/*
272	 * Note this output is parsed by external tools and old fields
273	 * should not be changed.
274	 */
275	pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
276		m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
277		cpu_data(m->extcpu).microcode);
278
279	/*
	 * Print out human-readable details about the MCE error
	 * (if the CPU has an implementation for that).
282	 */
283	ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
284	if (ret == NOTIFY_STOP)
285		return;
286
287	pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
288}
289
290#define PANIC_TIMEOUT 5 /* 5 seconds */
291
292static atomic_t mce_panicked;
293
294static int fake_panic;
295static atomic_t mce_fake_panicked;
296
297/* Panic in progress. Enable interrupts and wait for final IPI */
298static void wait_for_panic(void)
299{
300	long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
301
302	preempt_disable();
303	local_irq_enable();
304	while (timeout-- > 0)
305		udelay(1);
306	if (panic_timeout == 0)
307		panic_timeout = mca_cfg.panic_timeout;
	panic("Panicking machine check CPU died");
309}
310
311static void mce_panic(const char *msg, struct mce *final, char *exp)
312{
313	int i, apei_err = 0;
314
315	if (!fake_panic) {
316		/*
317		 * Make sure only one CPU runs in machine check panic
318		 */
319		if (atomic_inc_return(&mce_panicked) > 1)
320			wait_for_panic();
321		barrier();
322
323		bust_spinlocks(1);
324		console_verbose();
325	} else {
326		/* Don't log too much for fake panic */
327		if (atomic_inc_return(&mce_fake_panicked) > 1)
328			return;
329	}
330	/* First print corrected ones that are still unlogged */
331	for (i = 0; i < MCE_LOG_LEN; i++) {
332		struct mce *m = &mcelog.entry[i];
333		if (!(m->status & MCI_STATUS_VAL))
334			continue;
335		if (!(m->status & MCI_STATUS_UC)) {
336			print_mce(m);
337			if (!apei_err)
338				apei_err = apei_write_mce(m);
339		}
340	}
341	/* Now print uncorrected but with the final one last */
342	for (i = 0; i < MCE_LOG_LEN; i++) {
343		struct mce *m = &mcelog.entry[i];
344		if (!(m->status & MCI_STATUS_VAL))
345			continue;
346		if (!(m->status & MCI_STATUS_UC))
347			continue;
348		if (!final || memcmp(m, final, sizeof(struct mce))) {
349			print_mce(m);
350			if (!apei_err)
351				apei_err = apei_write_mce(m);
352		}
353	}
354	if (final) {
355		print_mce(final);
356		if (!apei_err)
357			apei_err = apei_write_mce(final);
358	}
359	if (cpu_missing)
360		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
361	if (exp)
362		pr_emerg(HW_ERR "Machine check: %s\n", exp);
363	if (!fake_panic) {
364		if (panic_timeout == 0)
365			panic_timeout = mca_cfg.panic_timeout;
366		panic(msg);
367	} else
368		pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
369}
370
371/* Support code for software error injection */
372
373static int msr_to_offset(u32 msr)
374{
375	unsigned bank = __this_cpu_read(injectm.bank);
376
377	if (msr == mca_cfg.rip_msr)
378		return offsetof(struct mce, ip);
379	if (msr == MSR_IA32_MCx_STATUS(bank))
380		return offsetof(struct mce, status);
381	if (msr == MSR_IA32_MCx_ADDR(bank))
382		return offsetof(struct mce, addr);
383	if (msr == MSR_IA32_MCx_MISC(bank))
384		return offsetof(struct mce, misc);
385	if (msr == MSR_IA32_MCG_STATUS)
386		return offsetof(struct mce, mcgstatus);
387	return -1;
388}
389
390/* MSR access wrappers used for error injection */
391static u64 mce_rdmsrl(u32 msr)
392{
393	u64 v;
394
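	/*
	 * If error injection is in progress on this CPU, return the
	 * injected value from the per-CPU injectm template instead of
	 * reading the real MSR.
	 */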
395	if (__this_cpu_read(injectm.finished)) {
396		int offset = msr_to_offset(msr);
397
398		if (offset < 0)
399			return 0;
400		return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
401	}
402
403	if (rdmsrl_safe(msr, &v)) {
404		WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
405		/*
406		 * Return zero in case the access faulted. This should
407		 * not happen normally but can happen if the CPU does
408		 * something weird, or if the code is buggy.
409		 */
410		v = 0;
411	}
412
413	return v;
414}
415
416static void mce_wrmsrl(u32 msr, u64 v)
417{
418	if (__this_cpu_read(injectm.finished)) {
419		int offset = msr_to_offset(msr);
420
421		if (offset >= 0)
422			*(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
423		return;
424	}
425	wrmsrl(msr, v);
426}
427
428/*
429 * Collect all global (w.r.t. this processor) status about this machine
430 * check into our "mce" struct so that we can use it later to assess
431 * the severity of the problem as we read per-bank specific details.
432 */
433static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
434{
435	mce_setup(m);
436
437	m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
438	if (regs) {
439		/*
440		 * Get the address of the instruction at the time of
441		 * the machine check error.
442		 */
443		if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
444			m->ip = regs->ip;
445			m->cs = regs->cs;
446
447			/*
448			 * When in VM86 mode make the cs look like ring 3
449			 * always. This is a lie, but it's better than passing
450			 * the additional vm86 bit around everywhere.
451			 */
452			if (v8086_mode(regs))
453				m->cs |= 3;
454		}
455		/* Use accurate RIP reporting if available. */
456		if (mca_cfg.rip_msr)
457			m->ip = mce_rdmsrl(mca_cfg.rip_msr);
458	}
459}
460
461/*
 * Simple lockless ring to communicate PFNs from the exception handler to the
 * process-context work function. This is vastly simplified because there's
464 * only a single reader and a single writer.
465 */
466#define MCE_RING_SIZE 16	/* we use one entry less */
467
468struct mce_ring {
469	unsigned short start;
470	unsigned short end;
471	unsigned long ring[MCE_RING_SIZE];
472};
473static DEFINE_PER_CPU(struct mce_ring, mce_ring);
474
475/* Runs with CPU affinity in workqueue */
476static int mce_ring_empty(void)
477{
478	struct mce_ring *r = this_cpu_ptr(&mce_ring);
479
480	return r->start == r->end;
481}
482
483static int mce_ring_get(unsigned long *pfn)
484{
485	struct mce_ring *r;
486	int ret = 0;
487
488	*pfn = 0;
489	get_cpu();
490	r = this_cpu_ptr(&mce_ring);
491	if (r->start == r->end)
492		goto out;
493	*pfn = r->ring[r->start];
494	r->start = (r->start + 1) % MCE_RING_SIZE;
495	ret = 1;
496out:
497	put_cpu();
498	return ret;
499}
500
501/* Always runs in MCE context with preempt off */
502static int mce_ring_add(unsigned long pfn)
503{
504	struct mce_ring *r = this_cpu_ptr(&mce_ring);
505	unsigned next;
506
507	next = (r->end + 1) % MCE_RING_SIZE;
508	if (next == r->start)
509		return -1;
510	r->ring[r->end] = pfn;
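	/* Make the new PFN visible before advancing ->end for the reader. */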
511	wmb();
512	r->end = next;
513	return 0;
514}
515
516int mce_available(struct cpuinfo_x86 *c)
517{
518	if (mca_cfg.disabled)
519		return 0;
520	return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
521}
522
523static void mce_schedule_work(void)
524{
525	if (!mce_ring_empty())
526		schedule_work(this_cpu_ptr(&mce_work));
527}
528
529static DEFINE_PER_CPU(struct irq_work, mce_irq_work);
530
531static void mce_irq_work_cb(struct irq_work *entry)
532{
533	mce_notify_irq();
534	mce_schedule_work();
535}
536
537static void mce_report_event(struct pt_regs *regs)
538{
539	if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) {
540		mce_notify_irq();
541		/*
542		 * Triggering the work queue here is just an insurance
543		 * policy in case the syscall exit notify handler
544		 * doesn't run soon enough or ends up running on the
545		 * wrong CPU (can happen when audit sleeps)
546		 */
547		mce_schedule_work();
548		return;
549	}
550
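	/*
	 * The interrupted context had interrupts off (and was not vm86),
	 * so defer the notification to irq_work.
	 */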
551	irq_work_queue(this_cpu_ptr(&mce_irq_work));
552}
553
554/*
555 * Read ADDR and MISC registers.
556 */
557static void mce_read_aux(struct mce *m, int i)
558{
559	if (m->status & MCI_STATUS_MISCV)
560		m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
561	if (m->status & MCI_STATUS_ADDRV) {
562		m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
563
564		/*
565		 * Mask the reported address by the reported granularity.
566		 */
567		if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
568			u8 shift = MCI_MISC_ADDR_LSB(m->misc);
569			m->addr >>= shift;
570			m->addr <<= shift;
571		}
572	}
573}
574
575static bool memory_error(struct mce *m)
576{
577	struct cpuinfo_x86 *c = &boot_cpu_data;
578
579	if (c->x86_vendor == X86_VENDOR_AMD) {
580		/*
581		 * coming soon
582		 */
583		return false;
584	} else if (c->x86_vendor == X86_VENDOR_INTEL) {
585		/*
586		 * Intel SDM Volume 3B - 15.9.2 Compound Error Codes
587		 *
588		 * Bit 7 of the MCACOD field of IA32_MCi_STATUS is used for
589		 * indicating a memory error. Bit 8 is used for indicating a
590		 * cache hierarchy error. The combination of bit 2 and bit 3
		 * is used for indicating a `generic' cache hierarchy error.
		 * But we can't just blindly check the above bits, because if
		 * bit 11 is set, then it is a bus/interconnect error - and
		 * either way the above bits just give more detail on what
595		 * bus/interconnect error happened. Note that bit 12 can be
596		 * ignored, as it's the "filter" bit.
597		 */
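		/*
		 * The masks below keep bit 11 and drop the filter bit 12, so
		 * bus/interconnect errors do not match these comparisons.
		 */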
598		return (m->status & 0xef80) == BIT(7) ||
599		       (m->status & 0xef00) == BIT(8) ||
600		       (m->status & 0xeffc) == 0xc;
601	}
602
603	return false;
604}
605
606DEFINE_PER_CPU(unsigned, mce_poll_count);
607
608/*
609 * Poll for corrected events or events that happened before reset.
610 * Those are just logged through /dev/mcelog.
611 *
612 * This is executed in standard interrupt context.
613 *
 * Note: the spec recommends panicking for fatal unsignalled
 * errors here. However this would be quite problematic --
 * we would need to reimplement the Monarch handling and
 * it would mess up the exclusion between the exception handler
 * and the poll handler -- so we skip this for now.
 * These cases should not happen anyway, or only when the CPU
 * is already totally confused. In this case it's likely it will
 * not fully execute the machine check handler either.
622 */
623bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
624{
625	bool error_logged = false;
626	struct mce m;
627	int severity;
628	int i;
629
630	this_cpu_inc(mce_poll_count);
631
632	mce_gather_info(&m, NULL);
633
634	for (i = 0; i < mca_cfg.banks; i++) {
635		if (!mce_banks[i].ctl || !test_bit(i, *b))
636			continue;
637
638		m.misc = 0;
639		m.addr = 0;
640		m.bank = i;
641		m.tsc = 0;
642
643		barrier();
644		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
645		if (!(m.status & MCI_STATUS_VAL))
646			continue;
647
648
649		/*
650		 * Uncorrected or signalled events are handled by the exception
651		 * handler when it is enabled, so don't process those here.
652		 *
653		 * TBD do the same check for MCI_STATUS_EN here?
654		 */
655		if (!(flags & MCP_UC) &&
656		    (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC)))
657			continue;
658
659		mce_read_aux(&m, i);
660
661		if (!(flags & MCP_TIMESTAMP))
662			m.tsc = 0;
663
664		severity = mce_severity(&m, mca_cfg.tolerant, NULL, false);
665
666		/*
667		 * In the cases where we don't have a valid address after all,
668		 * do not add it into the ring buffer.
669		 */
670		if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) {
671			if (m.status & MCI_STATUS_ADDRV) {
672				mce_ring_add(m.addr >> PAGE_SHIFT);
673				mce_schedule_work();
674			}
675		}
676
677		/*
678		 * Don't get the IP here because it's unlikely to
679		 * have anything to do with the actual error location.
680		 */
681		if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) {
682			error_logged = true;
683			mce_log(&m);
684		}
685
686		/*
687		 * Clear state for this bank.
688		 */
689		mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
690	}
691
692	/*
693	 * Don't clear MCG_STATUS here because it's only defined for
694	 * exceptions.
695	 */
696
697	sync_core();
698
699	return error_logged;
700}
701EXPORT_SYMBOL_GPL(machine_check_poll);
702
703/*
704 * Do a quick check if any of the events requires a panic.
705 * This decides if we keep the events around or clear them.
706 */
707static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
708			  struct pt_regs *regs)
709{
710	int i, ret = 0;
711	char *tmp;
712
713	for (i = 0; i < mca_cfg.banks; i++) {
714		m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
715		if (m->status & MCI_STATUS_VAL) {
716			__set_bit(i, validp);
717			if (quirk_no_way_out)
718				quirk_no_way_out(i, m, regs);
719		}
720
721		if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
722			*msg = tmp;
723			ret = 1;
724		}
725	}
726	return ret;
727}
728
729/*
730 * Variable to establish order between CPUs while scanning.
 * Each CPU spins initially until mce_executing equals its number.
732 */
733static atomic_t mce_executing;
734
735/*
736 * Defines order of CPUs on entry. First CPU becomes Monarch.
737 */
738static atomic_t mce_callin;
739
740/*
741 * Check if a timeout waiting for other CPUs happened.
742 */
743static int mce_timed_out(u64 *t, const char *msg)
744{
745	/*
746	 * The others already did panic for some reason.
747	 * Bail out like in a timeout.
	 * rmb() to tell the compiler that mce_panicked
	 * might have been modified by another CPU.
750	 */
751	rmb();
752	if (atomic_read(&mce_panicked))
753		wait_for_panic();
754	if (!mca_cfg.monarch_timeout)
755		goto out;
756	if ((s64)*t < SPINUNIT) {
757		if (mca_cfg.tolerant <= 1)
758			mce_panic(msg, NULL, NULL);
759		cpu_missing = 1;
760		return 1;
761	}
762	*t -= SPINUNIT;
763out:
764	touch_nmi_watchdog();
765	return 0;
766}
767
768/*
 * The Monarch's reign.  The Monarch is the CPU that entered
 * the machine check handler first. It waits for the others to
 * raise the exception too and then grades them. If any
 * error is fatal, it panics. Only then does it let the others continue.
773 *
774 * The other CPUs entering the MCE handler will be controlled by the
775 * Monarch. They are called Subjects.
776 *
 * This way we prevent any potential data corruption in an unrecoverable case
 * and also make sure that all CPUs' errors are always examined.
779 *
 * This also detects the case of a machine check event coming from outer
 * space (not detected by any CPU). In this case some external agent wants
 * us to shut down, so panic too.
783 *
 * The other CPUs might still decide to panic if the handler happens
 * in an unrecoverable place, but in this case the system is in a semi-stable
786 * state and won't corrupt anything by itself. It's ok to let the others
787 * continue for a bit first.
788 *
789 * All the spin loops have timeouts; when a timeout happens a CPU
790 * typically elects itself to be Monarch.
791 */
792static void mce_reign(void)
793{
794	int cpu;
795	struct mce *m = NULL;
796	int global_worst = 0;
797	char *msg = NULL;
798	char *nmsg = NULL;
799
800	/*
801	 * This CPU is the Monarch and the other CPUs have run
802	 * through their handlers.
803	 * Grade the severity of the errors of all the CPUs.
804	 */
805	for_each_possible_cpu(cpu) {
806		int severity = mce_severity(&per_cpu(mces_seen, cpu),
807					    mca_cfg.tolerant,
808					    &nmsg, true);
809		if (severity > global_worst) {
810			msg = nmsg;
811			global_worst = severity;
812			m = &per_cpu(mces_seen, cpu);
813		}
814	}
815
816	/*
817	 * Cannot recover? Panic here then.
818	 * This dumps all the mces in the log buffer and stops the
819	 * other CPUs.
820	 */
821	if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
822		mce_panic("Fatal machine check", m, msg);
823
824	/*
	 * For a UC error somewhere we let the CPU that detects it handle it.
	 * We also must let the others continue, otherwise the handling
	 * CPU could deadlock on a lock.
828	 */
829
830	/*
831	 * No machine check event found. Must be some external
832	 * source or one CPU is hung. Panic.
833	 */
834	if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
835		mce_panic("Fatal machine check from unknown source", NULL, NULL);
836
837	/*
838	 * Now clear all the mces_seen so that they don't reappear on
839	 * the next mce.
840	 */
841	for_each_possible_cpu(cpu)
842		memset(&per_cpu(mces_seen, cpu), 0, sizeof(struct mce));
843}
844
845static atomic_t global_nwo;
846
847/*
848 * Start of Monarch synchronization. This waits until all CPUs have
849 * entered the exception handler and then determines if any of them
 * saw a fatal event that requires a panic. Then it lets them execute
 * one by one, in the entry order.
852 * TBD double check parallel CPU hotunplug
853 */
854static int mce_start(int *no_way_out)
855{
856	int order;
857	int cpus = num_online_cpus();
858	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
859
860	if (!timeout)
861		return -1;
862
863	atomic_add(*no_way_out, &global_nwo);
864	/*
865	 * global_nwo should be updated before mce_callin
866	 */
867	smp_wmb();
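	/*
	 * The callin order is 1-based: the first CPU to arrive (order == 1)
	 * becomes the Monarch, all later CPUs are Subjects.
	 */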
868	order = atomic_inc_return(&mce_callin);
869
870	/*
871	 * Wait for everyone.
872	 */
873	while (atomic_read(&mce_callin) != cpus) {
874		if (mce_timed_out(&timeout,
875				  "Timeout: Not all CPUs entered broadcast exception handler")) {
876			atomic_set(&global_nwo, 0);
877			return -1;
878		}
879		ndelay(SPINUNIT);
880	}
881
882	/*
883	 * mce_callin should be read before global_nwo
884	 */
885	smp_rmb();
886
887	if (order == 1) {
888		/*
889		 * Monarch: Starts executing now, the others wait.
890		 */
891		atomic_set(&mce_executing, 1);
892	} else {
893		/*
894		 * Subject: Now start the scanning loop one by one in
895		 * the original callin order.
		 * This way, any shared bank is seen by only one CPU before
		 * it is cleared, avoiding duplicate reports.
898		 */
899		while (atomic_read(&mce_executing) < order) {
900			if (mce_timed_out(&timeout,
901					  "Timeout: Subject CPUs unable to finish machine check processing")) {
902				atomic_set(&global_nwo, 0);
903				return -1;
904			}
905			ndelay(SPINUNIT);
906		}
907	}
908
909	/*
910	 * Cache the global no_way_out state.
911	 */
912	*no_way_out = atomic_read(&global_nwo);
913
914	return order;
915}
916
917/*
918 * Synchronize between CPUs after main scanning loop.
919 * This invokes the bulk of the Monarch processing.
920 */
921static int mce_end(int order)
922{
923	int ret = -1;
924	u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC;
925
926	if (!timeout)
927		goto reset;
928	if (order < 0)
929		goto reset;
930
931	/*
932	 * Allow others to run.
933	 */
934	atomic_inc(&mce_executing);
935
936	if (order == 1) {
937		/* CHECKME: Can this race with a parallel hotplug? */
938		int cpus = num_online_cpus();
939
940		/*
941		 * Monarch: Wait for everyone to go through their scanning
942		 * loops.
943		 */
944		while (atomic_read(&mce_executing) <= cpus) {
945			if (mce_timed_out(&timeout,
946					  "Timeout: Monarch CPU unable to finish machine check processing"))
947				goto reset;
948			ndelay(SPINUNIT);
949		}
950
951		mce_reign();
952		barrier();
953		ret = 0;
954	} else {
955		/*
956		 * Subject: Wait for Monarch to finish.
957		 */
958		while (atomic_read(&mce_executing) != 0) {
959			if (mce_timed_out(&timeout,
960					  "Timeout: Monarch CPU did not finish machine check processing"))
961				goto reset;
962			ndelay(SPINUNIT);
963		}
964
965		/*
966		 * Don't reset anything. That's done by the Monarch.
967		 */
968		return 0;
969	}
970
971	/*
972	 * Reset all global state.
973	 */
974reset:
975	atomic_set(&global_nwo, 0);
976	atomic_set(&mce_callin, 0);
977	barrier();
978
979	/*
980	 * Let others run again.
981	 */
982	atomic_set(&mce_executing, 0);
983	return ret;
984}
985
986/*
987 * Check if the address reported by the CPU is in a format we can parse.
988 * It would be possible to add code for most other cases, but all would
989 * be somewhat complicated (e.g. segment offset would require an instruction
 * parser). So only support physical addresses up to page granularity for now.
991 */
992static int mce_usable_address(struct mce *m)
993{
994	if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV))
995		return 0;
996	if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT)
997		return 0;
998	if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS)
999		return 0;
1000	return 1;
1001}
1002
1003static void mce_clear_state(unsigned long *toclear)
1004{
1005	int i;
1006
1007	for (i = 0; i < mca_cfg.banks; i++) {
1008		if (test_bit(i, toclear))
1009			mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1010	}
1011}
1012
1013/*
1014 * The actual machine check handler. This only handles real
1015 * exceptions when something got corrupted coming in through int 18.
1016 *
1017 * This is executed in NMI context not subject to normal locking rules. This
1018 * implies that most kernel services cannot be safely used. Don't even
1019 * think about putting a printk in there!
1020 *
1021 * On Intel systems this is entered on all CPUs in parallel through
1022 * MCE broadcast. However some CPUs might be broken beyond repair,
1023 * so be always careful when synchronizing with others.
1024 */
1025void do_machine_check(struct pt_regs *regs, long error_code)
1026{
1027	struct mca_config *cfg = &mca_cfg;
1028	struct mce m, *final;
1029	enum ctx_state prev_state;
1030	int i;
1031	int worst = 0;
1032	int severity;
1033	/*
1034	 * Establish sequential order between the CPUs entering the machine
1035	 * check handler.
1036	 */
1037	int order;
1038	/*
1039	 * If no_way_out gets set, there is no safe way to recover from this
1040	 * MCE.  If mca_cfg.tolerant is cranked up, we'll try anyway.
1041	 */
1042	int no_way_out = 0;
1043	/*
1044	 * If kill_it gets set, there might be a way to recover from this
1045	 * error.
1046	 */
1047	int kill_it = 0;
1048	DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1049	DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1050	char *msg = "Unknown";
1051	u64 recover_paddr = ~0ull;
1052	int flags = MF_ACTION_REQUIRED;
1053
1054	prev_state = ist_enter(regs);
1055
1056	this_cpu_inc(mce_exception_count);
1057
1058	if (!cfg->banks)
1059		goto out;
1060
1061	mce_gather_info(&m, regs);
1062
1063	final = this_cpu_ptr(&mces_seen);
1064	*final = m;
1065
1066	memset(valid_banks, 0, sizeof(valid_banks));
1067	no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
1068
1069	barrier();
1070
1071	/*
	 * If there is no restart IP we might need to kill or panic.
1073	 * Assume the worst for now, but if we find the
1074	 * severity is MCE_AR_SEVERITY we have other options.
1075	 */
1076	if (!(m.mcgstatus & MCG_STATUS_RIPV))
1077		kill_it = 1;
1078
1079	/*
1080	 * Go through all the banks in exclusion of the other CPUs.
1081	 * This way we don't report duplicated events on shared banks
1082	 * because the first one to see it will clear it.
1083	 */
1084	order = mce_start(&no_way_out);
1085	for (i = 0; i < cfg->banks; i++) {
1086		__clear_bit(i, toclear);
1087		if (!test_bit(i, valid_banks))
1088			continue;
1089		if (!mce_banks[i].ctl)
1090			continue;
1091
1092		m.misc = 0;
1093		m.addr = 0;
1094		m.bank = i;
1095
1096		m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
1097		if ((m.status & MCI_STATUS_VAL) == 0)
1098			continue;
1099
1100		/*
		 * Non-uncorrected or non-signaled errors are handled by
		 * machine_check_poll(). Leave them alone, unless this panics.
1103		 */
1104		if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1105			!no_way_out)
1106			continue;
1107
1108		/*
1109		 * Set taint even when machine check was not enabled.
1110		 */
1111		add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
1112
1113		severity = mce_severity(&m, cfg->tolerant, NULL, true);
1114
1115		/*
		 * When the machine check was for a corrected/deferred error,
		 * don't touch it, unless we're panicking.
1118		 */
1119		if ((severity == MCE_KEEP_SEVERITY ||
1120		     severity == MCE_UCNA_SEVERITY) && !no_way_out)
1121			continue;
1122		__set_bit(i, toclear);
1123		if (severity == MCE_NO_SEVERITY) {
1124			/*
1125			 * Machine check event was not enabled. Clear, but
1126			 * ignore.
1127			 */
1128			continue;
1129		}
1130
1131		mce_read_aux(&m, i);
1132
1133		/*
1134		 * Action optional error. Queue address for later processing.
1135		 * When the ring overflows we just ignore the AO error.
1136		 * RED-PEN add some logging mechanism when
		 * mce_usable_address() or mce_ring_add() fails.
1138		 * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0
1139		 */
1140		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
1141			mce_ring_add(m.addr >> PAGE_SHIFT);
1142
1143		mce_log(&m);
1144
1145		if (severity > worst) {
1146			*final = m;
1147			worst = severity;
1148		}
1149	}
1150
1151	/* mce_clear_state will clear *final, save locally for use later */
1152	m = *final;
1153
1154	if (!no_way_out)
1155		mce_clear_state(toclear);
1156
1157	/*
1158	 * Do most of the synchronization with other CPUs.
1159	 * When there's any problem use only local no_way_out state.
1160	 */
1161	if (mce_end(order) < 0)
1162		no_way_out = worst >= MCE_PANIC_SEVERITY;
1163
1164	/*
1165	 * At insane "tolerant" levels we take no action. Otherwise
1166	 * we only die if we have no other choice. For less serious
1167	 * issues we try to recover, or limit damage to the current
1168	 * process.
1169	 */
1170	if (cfg->tolerant < 3) {
1171		if (no_way_out)
1172			mce_panic("Fatal machine check on current CPU", &m, msg);
1173		if (worst == MCE_AR_SEVERITY) {
1174			recover_paddr = m.addr;
1175			if (!(m.mcgstatus & MCG_STATUS_RIPV))
1176				flags |= MF_MUST_KILL;
1177		} else if (kill_it) {
1178			force_sig(SIGBUS, current);
1179		}
1180	}
1181
1182	if (worst > 0)
1183		mce_report_event(regs);
1184	mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
1185out:
1186	sync_core();
1187
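	/* recover_paddr stays ~0ull unless an action-required fault address was recorded above. */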
1188	if (recover_paddr == ~0ull)
1189		goto done;
1190
1191	pr_err("Uncorrected hardware memory error in user-access at %llx",
1192		 recover_paddr);
1193	/*
1194	 * We must call memory_failure() here even if the current process is
1195	 * doomed. We still need to mark the page as poisoned and alert any
1196	 * other users of the page.
1197	 */
1198	ist_begin_non_atomic(regs);
1199	local_irq_enable();
1200	if (memory_failure(recover_paddr >> PAGE_SHIFT, MCE_VECTOR, flags) < 0) {
1201		pr_err("Memory error not recovered");
1202		force_sig(SIGBUS, current);
1203	}
1204	local_irq_disable();
1205	ist_end_non_atomic();
1206done:
1207	ist_exit(regs, prev_state);
1208}
1209EXPORT_SYMBOL_GPL(do_machine_check);
1210
1211#ifndef CONFIG_MEMORY_FAILURE
1212int memory_failure(unsigned long pfn, int vector, int flags)
1213{
1214	/* mce_severity() should not hand us an ACTION_REQUIRED error */
1215	BUG_ON(flags & MF_ACTION_REQUIRED);
1216	pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1217	       "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1218	       pfn);
1219
1220	return 0;
1221}
1222#endif
1223
1224/*
1225 * Action optional processing happens here (picking up
1226 * from the list of faulting pages that do_machine_check()
1227 * placed into the "ring").
1228 */
1229static void mce_process_work(struct work_struct *dummy)
1230{
1231	unsigned long pfn;
1232
1233	while (mce_ring_get(&pfn))
1234		memory_failure(pfn, MCE_VECTOR, 0);
1235}
1236
1237#ifdef CONFIG_X86_MCE_INTEL
/**
 * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
 * @status: Event status information
 *
 * This function should be called by the thermal interrupt after the
 * event has been processed and the decision was made to log the event
 * further.
 *
 * The status parameter will be saved to the 'status' field of 'struct mce'
 * and historically has been the register value of the
 * MSR_IA32_THERMAL_STATUS (Intel) MSR.
 */
1251void mce_log_therm_throt_event(__u64 status)
1252{
1253	struct mce m;
1254
1255	mce_setup(&m);
1256	m.bank = MCE_THERMAL_BANK;
1257	m.status = status;
1258	mce_log(&m);
1259}
1260#endif /* CONFIG_X86_MCE_INTEL */
1261
1262/*
1263 * Periodic polling timer for "silent" machine check errors.  If the
1264 * poller finds an MCE, poll 2x faster.  When the poller finds no more
1265 * errors, poll 2x slower (up to check_interval seconds).
1266 */
1267static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
1268
1269static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
1270static DEFINE_PER_CPU(struct timer_list, mce_timer);
1271
1272static unsigned long mce_adjust_timer_default(unsigned long interval)
1273{
1274	return interval;
1275}
1276
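/*
 * Vendor code may override this hook; __mcheck_cpu_init_vendor() installs
 * cmci_intel_adjust_timer on Intel CPUs.
 */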
1277static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
1278
1279static void __restart_timer(struct timer_list *t, unsigned long interval)
1280{
1281	unsigned long when = jiffies + interval;
1282	unsigned long flags;
1283
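	/* Keep the pending check and the re-arm atomic on this CPU. */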
1284	local_irq_save(flags);
1285
1286	if (timer_pending(t)) {
1287		if (time_before(when, t->expires))
1288			mod_timer_pinned(t, when);
1289	} else {
1290		t->expires = round_jiffies(when);
1291		add_timer_on(t, smp_processor_id());
1292	}
1293
1294	local_irq_restore(flags);
1295}
1296
1297static void mce_timer_fn(unsigned long data)
1298{
1299	struct timer_list *t = this_cpu_ptr(&mce_timer);
1300	int cpu = smp_processor_id();
1301	unsigned long iv;
1302
1303	WARN_ON(cpu != data);
1304
1305	iv = __this_cpu_read(mce_next_interval);
1306
1307	if (mce_available(this_cpu_ptr(&cpu_info))) {
1308		machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
1309
1310		if (mce_intel_cmci_poll()) {
1311			iv = mce_adjust_timer(iv);
1312			goto done;
1313		}
1314	}
1315
1316	/*
1317	 * Alert userspace if needed. If we logged an MCE, reduce the polling
1318	 * interval, otherwise increase the polling interval.
1319	 */
1320	if (mce_notify_irq())
1321		iv = max(iv / 2, (unsigned long) HZ/100);
1322	else
1323		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
1324
1325done:
1326	__this_cpu_write(mce_next_interval, iv);
1327	__restart_timer(t, iv);
1328}
1329
1330/*
1331 * Ensure that the timer is firing in @interval from now.
1332 */
1333void mce_timer_kick(unsigned long interval)
1334{
1335	struct timer_list *t = this_cpu_ptr(&mce_timer);
1336	unsigned long iv = __this_cpu_read(mce_next_interval);
1337
1338	__restart_timer(t, interval);
1339
1340	if (interval < iv)
1341		__this_cpu_write(mce_next_interval, interval);
1342}
1343
1344/* Must not be called in IRQ context where del_timer_sync() can deadlock */
1345static void mce_timer_delete_all(void)
1346{
1347	int cpu;
1348
1349	for_each_online_cpu(cpu)
1350		del_timer_sync(&per_cpu(mce_timer, cpu));
1351}
1352
1353static void mce_do_trigger(struct work_struct *work)
1354{
1355	call_usermodehelper(mce_helper, mce_helper_argv, NULL, UMH_NO_WAIT);
1356}
1357
1358static DECLARE_WORK(mce_trigger_work, mce_do_trigger);
1359
1360/*
1361 * Notify the user(s) about new machine check events.
1362 * Can be called from interrupt context, but not from machine check/NMI
1363 * context.
1364 */
1365int mce_notify_irq(void)
1366{
1367	/* Not more than two messages every minute */
1368	static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1369
1370	if (test_and_clear_bit(0, &mce_need_notify)) {
1371		/* wake processes polling /dev/mcelog */
1372		wake_up_interruptible(&mce_chrdev_wait);
1373
1374		if (mce_helper[0])
1375			schedule_work(&mce_trigger_work);
1376
1377		if (__ratelimit(&ratelimit))
1378			pr_info(HW_ERR "Machine check events logged\n");
1379
1380		return 1;
1381	}
1382	return 0;
1383}
1384EXPORT_SYMBOL_GPL(mce_notify_irq);
1385
1386static int __mcheck_cpu_mce_banks_init(void)
1387{
1388	int i;
1389	u8 num_banks = mca_cfg.banks;
1390
1391	mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL);
1392	if (!mce_banks)
1393		return -ENOMEM;
1394
1395	for (i = 0; i < num_banks; i++) {
1396		struct mce_bank *b = &mce_banks[i];
1397
1398		b->ctl = -1ULL;
1399		b->init = 1;
1400	}
1401	return 0;
1402}
1403
1404/*
1405 * Initialize Machine Checks for a CPU.
1406 */
1407static int __mcheck_cpu_cap_init(void)
1408{
1409	unsigned b;
1410	u64 cap;
1411
1412	rdmsrl(MSR_IA32_MCG_CAP, cap);
1413
1414	b = cap & MCG_BANKCNT_MASK;
1415	if (!mca_cfg.banks)
1416		pr_info("CPU supports %d MCE banks\n", b);
1417
1418	if (b > MAX_NR_BANKS) {
1419		pr_warn("Using only %u machine check banks out of %u\n",
1420			MAX_NR_BANKS, b);
1421		b = MAX_NR_BANKS;
1422	}
1423
1424	/* Don't support asymmetric configurations today */
1425	WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks);
1426	mca_cfg.banks = b;
1427
1428	if (!mce_banks) {
1429		int err = __mcheck_cpu_mce_banks_init();
1430
1431		if (err)
1432			return err;
1433	}
1434
1435	/* Use accurate RIP reporting if available. */
1436	if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1437		mca_cfg.rip_msr = MSR_IA32_MCG_EIP;
1438
1439	if (cap & MCG_SER_P)
1440		mca_cfg.ser = true;
1441
1442	return 0;
1443}
1444
1445static void __mcheck_cpu_init_generic(void)
1446{
1447	enum mcp_flags m_fl = 0;
1448	mce_banks_t all_banks;
1449	u64 cap;
1450	int i;
1451
1452	if (!mca_cfg.bootlog)
1453		m_fl = MCP_DONTLOG;
1454
1455	/*
1456	 * Log the machine checks left over from the previous reset.
1457	 */
1458	bitmap_fill(all_banks, MAX_NR_BANKS);
1459	machine_check_poll(MCP_UC | m_fl, &all_banks);
1460
1461	cr4_set_bits(X86_CR4_MCE);
1462
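	/* If a global MCG_CTL register is present, enable all of its features. */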
1463	rdmsrl(MSR_IA32_MCG_CAP, cap);
1464	if (cap & MCG_CTL_P)
1465		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1466
1467	for (i = 0; i < mca_cfg.banks; i++) {
1468		struct mce_bank *b = &mce_banks[i];
1469
1470		if (!b->init)
1471			continue;
1472		wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1473		wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1474	}
1475}
1476
1477/*
 * During IFU recovery, Sandy Bridge-EP 4S processors set the RIPV and
1479 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
1480 * Vol 3B Table 15-20). But this confuses both the code that determines
1481 * whether the machine check occurred in kernel or user mode, and also
1482 * the severity assessment code. Pretend that EIPV was set, and take the
1483 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
1484 */
1485static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
1486{
1487	if (bank != 0)
1488		return;
1489	if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
1490		return;
1491	if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
1492		          MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
1493			  MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
1494			  MCACOD)) !=
1495			 (MCI_STATUS_UC|MCI_STATUS_EN|
1496			  MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
1497			  MCI_STATUS_AR|MCACOD_INSTR))
1498		return;
1499
1500	m->mcgstatus |= MCG_STATUS_EIPV;
1501	m->ip = regs->ip;
1502	m->cs = regs->cs;
1503}
1504
1505/* Add per CPU specific workarounds here */
1506static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1507{
1508	struct mca_config *cfg = &mca_cfg;
1509
1510	if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1511		pr_info("unknown CPU type - not enabling MCE support\n");
1512		return -EOPNOTSUPP;
1513	}
1514
1515	/* This should be disabled by the BIOS, but isn't always */
1516	if (c->x86_vendor == X86_VENDOR_AMD) {
1517		if (c->x86 == 15 && cfg->banks > 4) {
1518			/*
1519			 * disable GART TBL walk error reporting, which
1520			 * trips off incorrectly with the IOMMU & 3ware
1521			 * & Cerberus:
1522			 */
1523			clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1524		}
1525		if (c->x86 <= 17 && cfg->bootlog < 0) {
1526			/*
			 * Lots of broken BIOSes around that don't clear them
1528			 * by default and leave crap in there. Don't log:
1529			 */
1530			cfg->bootlog = 0;
1531		}
1532		/*
1533		 * Various K7s with broken bank 0 around. Always disable
1534		 * by default.
1535		 */
1536		if (c->x86 == 6 && cfg->banks > 0)
1537			mce_banks[0].ctl = 0;
1538
1539		/*
1540		 * overflow_recov is supported for F15h Models 00h-0fh
1541		 * even though we don't have a CPUID bit for it.
1542		 */
1543		if (c->x86 == 0x15 && c->x86_model <= 0xf)
1544			mce_flags.overflow_recov = 1;
1545
1546		/*
1547		 * Turn off MC4_MISC thresholding banks on those models since
1548		 * they're not supported there.
1549		 */
1550		if (c->x86 == 0x15 &&
1551		    (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
1552			int i;
1553			u64 hwcr;
1554			bool need_toggle;
1555			u32 msrs[] = {
1556				0x00000413, /* MC4_MISC0 */
1557				0xc0000408, /* MC4_MISC1 */
1558			};
1559
1560			rdmsrl(MSR_K7_HWCR, hwcr);
1561
1562			/* McStatusWrEn has to be set */
1563			need_toggle = !(hwcr & BIT(18));
1564
1565			if (need_toggle)
1566				wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
1567
1568			/* Clear CntP bit safely */
1569			for (i = 0; i < ARRAY_SIZE(msrs); i++)
1570				msr_clear_bit(msrs[i], 62);
1571
1572			/* restore old settings */
1573			if (need_toggle)
1574				wrmsrl(MSR_K7_HWCR, hwcr);
1575		}
1576	}
1577
1578	if (c->x86_vendor == X86_VENDOR_INTEL) {
1579		/*
1580		 * SDM documents that on family 6 bank 0 should not be written
1581		 * because it aliases to another special BIOS controlled
1582		 * register.
		 * But it's not aliased anymore on model 0x1a+.
1584		 * Don't ignore bank 0 completely because there could be a
1585		 * valid event later, merely don't write CTL0.
1586		 */
1587
1588		if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0)
1589			mce_banks[0].init = 0;
1590
1591		/*
1592		 * All newer Intel systems support MCE broadcasting. Enable
1593		 * synchronization with a one second timeout.
1594		 */
1595		if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1596			cfg->monarch_timeout < 0)
1597			cfg->monarch_timeout = USEC_PER_SEC;
1598
1599		/*
1600		 * There are also broken BIOSes on some Pentium M and
1601		 * earlier systems:
1602		 */
1603		if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0)
1604			cfg->bootlog = 0;
1605
1606		if (c->x86 == 6 && c->x86_model == 45)
1607			quirk_no_way_out = quirk_sandybridge_ifu;
1608	}
1609	if (cfg->monarch_timeout < 0)
1610		cfg->monarch_timeout = 0;
1611	if (cfg->bootlog != 0)
1612		cfg->panic_timeout = 30;
1613
1614	return 0;
1615}
1616
1617static int __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1618{
1619	if (c->x86 != 5)
1620		return 0;
1621
1622	switch (c->x86_vendor) {
1623	case X86_VENDOR_INTEL:
1624		intel_p5_mcheck_init(c);
		return 1;
	case X86_VENDOR_CENTAUR:
		winchip_mcheck_init(c);
		return 1;
1631	}
1632
1633	return 0;
1634}
1635
1636static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1637{
1638	switch (c->x86_vendor) {
1639	case X86_VENDOR_INTEL:
1640		mce_intel_feature_init(c);
1641		mce_adjust_timer = cmci_intel_adjust_timer;
1642		break;
1643	case X86_VENDOR_AMD:
1644		mce_amd_feature_init(c);
1645		mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
1646		break;
1647	default:
1648		break;
1649	}
1650}
1651
1652static void mce_start_timer(unsigned int cpu, struct timer_list *t)
1653{
1654	unsigned long iv = check_interval * HZ;
1655
1656	if (mca_cfg.ignore_ce || !iv)
1657		return;
1658
1659	per_cpu(mce_next_interval, cpu) = iv;
1660
1661	t->expires = round_jiffies(jiffies + iv);
1662	add_timer_on(t, cpu);
1663}
1664
1665static void __mcheck_cpu_init_timer(void)
1666{
1667	struct timer_list *t = this_cpu_ptr(&mce_timer);
1668	unsigned int cpu = smp_processor_id();
1669
1670	setup_timer(t, mce_timer_fn, cpu);
1671	mce_start_timer(cpu, t);
1672}
1673
1674/* Handle unconfigured int18 (should never happen) */
1675static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1676{
1677	pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
1678	       smp_processor_id());
1679}
1680
1681/* Call the installed machine check handler for this CPU setup. */
1682void (*machine_check_vector)(struct pt_regs *, long error_code) =
1683						unexpected_machine_check;
1684
1685/*
1686 * Called for each booted CPU to set up machine checks.
1687 * Must be called with preempt off:
1688 */
1689void mcheck_cpu_init(struct cpuinfo_x86 *c)
1690{
1691	if (mca_cfg.disabled)
1692		return;
1693
1694	if (__mcheck_cpu_ancient_init(c))
1695		return;
1696
1697	if (!mce_available(c))
1698		return;
1699
1700	if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1701		mca_cfg.disabled = true;
1702		return;
1703	}
1704
1705	machine_check_vector = do_machine_check;
1706
1707	__mcheck_cpu_init_generic();
1708	__mcheck_cpu_init_vendor(c);
1709	__mcheck_cpu_init_timer();
1710	INIT_WORK(this_cpu_ptr(&mce_work), mce_process_work);
1711	init_irq_work(this_cpu_ptr(&mce_irq_work), &mce_irq_work_cb);
1712}
1713
1714/*
1715 * mce_chrdev: Character device /dev/mcelog to read and clear the MCE log.
1716 */
1717
1718static DEFINE_SPINLOCK(mce_chrdev_state_lock);
1719static int mce_chrdev_open_count;	/* #times opened */
1720static int mce_chrdev_open_exclu;	/* already open exclusive? */
1721
1722static int mce_chrdev_open(struct inode *inode, struct file *file)
1723{
1724	spin_lock(&mce_chrdev_state_lock);
1725
1726	if (mce_chrdev_open_exclu ||
1727	    (mce_chrdev_open_count && (file->f_flags & O_EXCL))) {
1728		spin_unlock(&mce_chrdev_state_lock);
1729
1730		return -EBUSY;
1731	}
1732
1733	if (file->f_flags & O_EXCL)
1734		mce_chrdev_open_exclu = 1;
1735	mce_chrdev_open_count++;
1736
1737	spin_unlock(&mce_chrdev_state_lock);
1738
1739	return nonseekable_open(inode, file);
1740}
1741
1742static int mce_chrdev_release(struct inode *inode, struct file *file)
1743{
1744	spin_lock(&mce_chrdev_state_lock);
1745
1746	mce_chrdev_open_count--;
1747	mce_chrdev_open_exclu = 0;
1748
1749	spin_unlock(&mce_chrdev_state_lock);
1750
1751	return 0;
1752}
1753
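/*
 * Snapshot every CPU's TSC; mce_chrdev_read() compares entry timestamps
 * against these to pick up records written before the flush.
 */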
1754static void collect_tscs(void *data)
1755{
1756	unsigned long *cpu_tsc = (unsigned long *)data;
1757
1758	rdtscll(cpu_tsc[smp_processor_id()]);
1759}
1760
1761static int mce_apei_read_done;
1762
/* Collect the MCE records of the previous boot from persistent storage via APEI ERST. */
1764static int __mce_read_apei(char __user **ubuf, size_t usize)
1765{
1766	int rc;
1767	u64 record_id;
1768	struct mce m;
1769
1770	if (usize < sizeof(struct mce))
1771		return -EINVAL;
1772
1773	rc = apei_read_mce(&m, &record_id);
1774	/* Error or no more MCE record */
1775	if (rc <= 0) {
1776		mce_apei_read_done = 1;
1777		/*
1778		 * When ERST is disabled, mce_chrdev_read() should return
1779		 * "no record" instead of "no device."
1780		 */
1781		if (rc == -ENODEV)
1782			return 0;
1783		return rc;
1784	}
1785	rc = -EFAULT;
1786	if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
1787		return rc;
1788	/*
	 * In fact, we should clear the record only after it has
	 * been flushed to disk or sent over the network by
	 * /sbin/mcelog, but we have no interface to support that now,
	 * so just clear it here to avoid duplication.
1793	 */
1794	rc = apei_clear_mce(record_id);
1795	if (rc) {
1796		mce_apei_read_done = 1;
1797		return rc;
1798	}
1799	*ubuf += sizeof(struct mce);
1800
1801	return 0;
1802}
1803
1804static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
1805				size_t usize, loff_t *off)
1806{
1807	char __user *buf = ubuf;
1808	unsigned long *cpu_tsc;
1809	unsigned prev, next;
1810	int i, err;
1811
1812	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
1813	if (!cpu_tsc)
1814		return -ENOMEM;
1815
1816	mutex_lock(&mce_chrdev_read_mutex);
1817
1818	if (!mce_apei_read_done) {
1819		err = __mce_read_apei(&buf, usize);
1820		if (err || buf != ubuf)
1821			goto out;
1822	}
1823
1824	next = rcu_dereference_check_mce(mcelog.next);
1825
1826	/* Only supports full reads right now */
1827	err = -EINVAL;
1828	if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
1829		goto out;
1830
1831	err = 0;
1832	prev = 0;
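	/*
	 * Copy out everything logged up to the snapshot of mcelog.next,
	 * then try to reset ->next to 0; if new entries arrived in the
	 * meantime, loop and copy those as well.
	 */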
1833	do {
1834		for (i = prev; i < next; i++) {
1835			unsigned long start = jiffies;
1836			struct mce *m = &mcelog.entry[i];
1837
1838			while (!m->finished) {
1839				if (time_after_eq(jiffies, start + 2)) {
1840					memset(m, 0, sizeof(*m));
1841					goto timeout;
1842				}
1843				cpu_relax();
1844			}
1845			smp_rmb();
1846			err |= copy_to_user(buf, m, sizeof(*m));
1847			buf += sizeof(*m);
1848timeout:
1849			;
1850		}
1851
1852		memset(mcelog.entry + prev, 0,
1853		       (next - prev) * sizeof(struct mce));
1854		prev = next;
1855		next = cmpxchg(&mcelog.next, prev, 0);
1856	} while (next != prev);
1857
1858	synchronize_sched();
1859
1860	/*
	 * Collect entries that were still being written before the
	 * synchronize_sched() above completed.
1863	 */
1864	on_each_cpu(collect_tscs, cpu_tsc, 1);
1865
1866	for (i = next; i < MCE_LOG_LEN; i++) {
1867		struct mce *m = &mcelog.entry[i];
1868
1869		if (m->finished && m->tsc < cpu_tsc[m->cpu]) {
1870			err |= copy_to_user(buf, m, sizeof(*m));
1871			smp_rmb();
1872			buf += sizeof(*m);
1873			memset(m, 0, sizeof(*m));
1874		}
1875	}
1876
1877	if (err)
1878		err = -EFAULT;
1879
1880out:
1881	mutex_unlock(&mce_chrdev_read_mutex);
1882	kfree(cpu_tsc);
1883
1884	return err ? err : buf - ubuf;
1885}
1886
1887static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait)
1888{
1889	poll_wait(file, &mce_chrdev_wait, wait);
1890	if (rcu_access_index(mcelog.next))
1891		return POLLIN | POLLRDNORM;
1892	if (!mce_apei_read_done && apei_check_mce())
1893		return POLLIN | POLLRDNORM;
1894	return 0;
1895}
1896
1897static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
1898				unsigned long arg)
1899{
1900	int __user *p = (int __user *)arg;
1901
1902	if (!capable(CAP_SYS_ADMIN))
1903		return -EPERM;
1904
1905	switch (cmd) {
1906	case MCE_GET_RECORD_LEN:
1907		return put_user(sizeof(struct mce), p);
1908	case MCE_GET_LOG_LEN:
1909		return put_user(MCE_LOG_LEN, p);
1910	case MCE_GETCLEAR_FLAGS: {
1911		unsigned flags;
1912
1913		do {
1914			flags = mcelog.flags;
1915		} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
1916
1917		return put_user(flags, p);
1918	}
1919	default:
1920		return -ENOTTY;
1921	}
1922}
1923
1924static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf,
1925			    size_t usize, loff_t *off);
1926
1927void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
1928			     const char __user *ubuf,
1929			     size_t usize, loff_t *off))
1930{
1931	mce_write = fn;
1932}
1933EXPORT_SYMBOL_GPL(register_mce_write_callback);
1934
1935ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
1936			 size_t usize, loff_t *off)
1937{
1938	if (mce_write)
1939		return mce_write(filp, ubuf, usize, off);
1940	else
1941		return -EINVAL;
1942}
1943
1944static const struct file_operations mce_chrdev_ops = {
1945	.open			= mce_chrdev_open,
1946	.release		= mce_chrdev_release,
1947	.read			= mce_chrdev_read,
1948	.write			= mce_chrdev_write,
1949	.poll			= mce_chrdev_poll,
1950	.unlocked_ioctl		= mce_chrdev_ioctl,
1951	.llseek			= no_llseek,
1952};
1953
1954static struct miscdevice mce_chrdev_device = {
1955	MISC_MCELOG_MINOR,
1956	"mcelog",
1957	&mce_chrdev_ops,
1958};
1959
1960static void __mce_disable_bank(void *arg)
1961{
1962	int bank = *((int *)arg);
1963	__clear_bit(bank, this_cpu_ptr(mce_poll_banks));
1964	cmci_disable_bank(bank);
1965}
1966
1967void mce_disable_bank(int bank)
1968{
1969	if (bank >= mca_cfg.banks) {
1970		pr_warn(FW_BUG
1971			"Ignoring request to disable invalid MCA bank %d.\n",
1972			bank);
1973		return;
1974	}
1975	set_bit(bank, mce_banks_ce_disabled);
1976	on_each_cpu(__mce_disable_bank, &bank, 1);
1977}
1978
1979/*
1980 * mce=off Disables machine check
1981 * mce=no_cmci Disables CMCI
1982 * mce=dont_log_ce Clears corrected events silently, no log created for CEs.
1983 * mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
1984 * mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
1985 *	monarchtimeout is how long to wait for other CPUs on machine
1986 *	check, or 0 to not wait
1987 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
1988 * mce=nobootlog Don't log MCEs from before booting.
1989 * mce=bios_cmci_threshold Don't program the CMCI threshold
1990 */
1991static int __init mcheck_enable(char *str)
1992{
1993	struct mca_config *cfg = &mca_cfg;
1994
1995	if (*str == 0) {
1996		enable_p5_mce();
1997		return 1;
1998	}
1999	if (*str == '=')
2000		str++;
2001	if (!strcmp(str, "off"))
2002		cfg->disabled = true;
2003	else if (!strcmp(str, "no_cmci"))
2004		cfg->cmci_disabled = true;
2005	else if (!strcmp(str, "dont_log_ce"))
2006		cfg->dont_log_ce = true;
2007	else if (!strcmp(str, "ignore_ce"))
2008		cfg->ignore_ce = true;
2009	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
2010		cfg->bootlog = (str[0] == 'b');
2011	else if (!strcmp(str, "bios_cmci_threshold"))
2012		cfg->bios_cmci_threshold = true;
2013	else if (isdigit(str[0])) {
2014		get_option(&str, &(cfg->tolerant));
2015		if (*str == ',') {
2016			++str;
2017			get_option(&str, &(cfg->monarch_timeout));
2018		}
2019	} else {
2020		pr_info("mce argument %s ignored. Please use /sys\n", str);
2021		return 0;
2022	}
2023	return 1;
2024}
2025__setup("mce", mcheck_enable);
2026
2027int __init mcheck_init(void)
2028{
2029	mcheck_intel_therm_init();
2030	mcheck_vendor_init_severity();
2031
2032	return 0;
2033}
2034
2035/*
2036 * mce_syscore: PM support
2037 */
2038
2039/*
2040 * Disable machine checks on suspend and shutdown. We can't really handle
2041 * them later.
2042 */
2043static int mce_disable_error_reporting(void)
2044{
2045	int i;
2046
2047	for (i = 0; i < mca_cfg.banks; i++) {
2048		struct mce_bank *b = &mce_banks[i];
2049
2050		if (b->init)
2051			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
2052	}
2053	return 0;
2054}
2055
2056static int mce_syscore_suspend(void)
2057{
2058	return mce_disable_error_reporting();
2059}
2060
2061static void mce_syscore_shutdown(void)
2062{
2063	mce_disable_error_reporting();
2064}
2065
2066/*
2067 * On resume clear all MCE state. Don't want to see leftovers from the BIOS.
2068 * Only one CPU is active at this time, the others get re-added later using
2069 * CPU hotplug:
2070 */
2071static void mce_syscore_resume(void)
2072{
2073	__mcheck_cpu_init_generic();
2074	__mcheck_cpu_init_vendor(raw_cpu_ptr(&cpu_info));
2075}
2076
2077static struct syscore_ops mce_syscore_ops = {
2078	.suspend	= mce_syscore_suspend,
2079	.shutdown	= mce_syscore_shutdown,
2080	.resume		= mce_syscore_resume,
2081};
2082
2083/*
2084 * mce_device: Sysfs support
2085 */
2086
2087static void mce_cpu_restart(void *data)
2088{
2089	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2090		return;
2091	__mcheck_cpu_init_generic();
2092	__mcheck_cpu_init_timer();
2093}
2094
2095/* Reinit machine checks on all CPUs after a user configuration change */
2096static void mce_restart(void)
2097{
2098	mce_timer_delete_all();
2099	on_each_cpu(mce_cpu_restart, NULL, 1);
2100}
2101
2102/* Toggle CMCI and the corrected-error polling timer */
2103static void mce_disable_cmci(void *data)
2104{
2105	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2106		return;
2107	cmci_clear();
2108}
2109
2110static void mce_enable_ce(void *all)
2111{
2112	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2113		return;
2114	cmci_reenable();
2115	cmci_recheck();
2116	if (all)
2117		__mcheck_cpu_init_timer();
2118}
2119
2120static struct bus_type mce_subsys = {
2121	.name		= "machinecheck",
2122	.dev_name	= "machinecheck",
2123};
2124
2125DEFINE_PER_CPU(struct device *, mce_device);
2126
2127void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
2128
2129static inline struct mce_bank *attr_to_bank(struct device_attribute *attr)
2130{
2131	return container_of(attr, struct mce_bank, attr);
2132}
2133
2134static ssize_t show_bank(struct device *s, struct device_attribute *attr,
2135			 char *buf)
2136{
2137	return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
2138}
2139
2140static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2141			const char *buf, size_t size)
2142{
2143	u64 new;
2144
2145	if (kstrtou64(buf, 0, &new) < 0)
2146		return -EINVAL;
2147
2148	attr_to_bank(attr)->ctl = new;
2149	mce_restart();
2150
2151	return size;
2152}
2153
2154static ssize_t
2155show_trigger(struct device *s, struct device_attribute *attr, char *buf)
2156{
2157	strcpy(buf, mce_helper);
2158	strcat(buf, "\n");
2159	return strlen(mce_helper) + 1;
2160}
2161
2162static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
2163				const char *buf, size_t siz)
2164{
2165	char *p;
2166
2167	strncpy(mce_helper, buf, sizeof(mce_helper));
2168	mce_helper[sizeof(mce_helper)-1] = 0;
2169	p = strchr(mce_helper, '\n');
2170
2171	if (p)
2172		*p = 0;
2173
2174	return strlen(mce_helper) + !!p;
2175}
2176
2177static ssize_t set_ignore_ce(struct device *s,
2178			     struct device_attribute *attr,
2179			     const char *buf, size_t size)
2180{
2181	u64 new;
2182
2183	if (kstrtou64(buf, 0, &new) < 0)
2184		return -EINVAL;
2185
2186	if (mca_cfg.ignore_ce ^ !!new) {
2187		if (new) {
2188			/* disable ce features */
2189			mce_timer_delete_all();
2190			on_each_cpu(mce_disable_cmci, NULL, 1);
2191			mca_cfg.ignore_ce = true;
2192		} else {
2193			/* enable ce features */
2194			mca_cfg.ignore_ce = false;
2195			on_each_cpu(mce_enable_ce, (void *)1, 1);
2196		}
2197	}
2198	return size;
2199}
2200
2201static ssize_t set_cmci_disabled(struct device *s,
2202				 struct device_attribute *attr,
2203				 const char *buf, size_t size)
2204{
2205	u64 new;
2206
2207	if (kstrtou64(buf, 0, &new) < 0)
2208		return -EINVAL;
2209
2210	if (mca_cfg.cmci_disabled ^ !!new) {
2211		if (new) {
2212			/* disable cmci */
2213			on_each_cpu(mce_disable_cmci, NULL, 1);
2214			mca_cfg.cmci_disabled = true;
2215		} else {
2216			/* enable cmci */
2217			mca_cfg.cmci_disabled = false;
2218			on_each_cpu(mce_enable_ce, NULL, 1);
2219		}
2220	}
2221	return size;
2222}
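
/*
 * Note on the two toggles above: ignore_ce and cmci_disabled live in the
 * global mca_cfg, so even though the attributes appear under every per-CPU
 * machinecheck device, writing "1" to any one instance disables the feature
 * machine-wide via on_each_cpu(), and writing "0" re-enables it everywhere.
 * set_ignore_ce() additionally stops or re-arms the corrected-error polling
 * timers (mce_timer_delete_all() / __mcheck_cpu_init_timer()).
 */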
2223
2224static ssize_t store_int_with_restart(struct device *s,
2225				      struct device_attribute *attr,
2226				      const char *buf, size_t size)
2227{
2228	ssize_t ret = device_store_int(s, attr, buf, size);
2229	mce_restart();
2230	return ret;
2231}
2232
2233static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger);
2234static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant);
2235static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout);
2236static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce);
2237
2238static struct dev_ext_attribute dev_attr_check_interval = {
2239	__ATTR(check_interval, 0644, device_show_int, store_int_with_restart),
2240	&check_interval
2241};
2242
2243static struct dev_ext_attribute dev_attr_ignore_ce = {
2244	__ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce),
2245	&mca_cfg.ignore_ce
2246};
2247
2248static struct dev_ext_attribute dev_attr_cmci_disabled = {
2249	__ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled),
2250	&mca_cfg.cmci_disabled
2251};
2252
2253static struct device_attribute *mce_device_attrs[] = {
2254	&dev_attr_tolerant.attr,
2255	&dev_attr_check_interval.attr,
2256	&dev_attr_trigger,
2257	&dev_attr_monarch_timeout.attr,
2258	&dev_attr_dont_log_ce.attr,
2259	&dev_attr_ignore_ce.attr,
2260	&dev_attr_cmci_disabled.attr,
2261	NULL
2262};
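
/*
 * Resulting sysfs layout (illustrative, assuming sysfs is mounted at /sys):
 * subsys_system_register(&mce_subsys, ...) together with the per-CPU devices
 * created by mce_device_create() below exposes the attributes above as, e.g.,
 *
 *	/sys/devices/system/machinecheck/machinecheck0/tolerant
 *	/sys/devices/system/machinecheck/machinecheck0/check_interval
 *	/sys/devices/system/machinecheck/machinecheck0/bank<N>
 *
 * Writes to check_interval or to a bank file go through
 * store_int_with_restart() or set_bank() and therefore end up in
 * mce_restart(), which reprograms every CPU.
 */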
2263
2264static cpumask_var_t mce_device_initialized;
2265
2266static void mce_device_release(struct device *dev)
2267{
2268	kfree(dev);
2269}
2270
2271/* Per-CPU device init. All CPUs still share the same bank control values: */
2272static int mce_device_create(unsigned int cpu)
2273{
2274	struct device *dev;
2275	int err;
2276	int i, j;
2277
2278	if (!mce_available(&boot_cpu_data))
2279		return -EIO;
2280
2281	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
2282	if (!dev)
2283		return -ENOMEM;
2284	dev->id  = cpu;
2285	dev->bus = &mce_subsys;
2286	dev->release = &mce_device_release;
2287
2288	err = device_register(dev);
2289	if (err) {
2290		put_device(dev);
2291		return err;
2292	}
2293
2294	for (i = 0; mce_device_attrs[i]; i++) {
2295		err = device_create_file(dev, mce_device_attrs[i]);
2296		if (err)
2297			goto error;
2298	}
2299	for (j = 0; j < mca_cfg.banks; j++) {
2300		err = device_create_file(dev, &mce_banks[j].attr);
2301		if (err)
2302			goto error2;
2303	}
2304	cpumask_set_cpu(cpu, mce_device_initialized);
2305	per_cpu(mce_device, cpu) = dev;
2306
2307	return 0;
2308error2:
2309	while (--j >= 0)
2310		device_remove_file(dev, &mce_banks[j].attr);
2311error:
2312	while (--i >= 0)
2313		device_remove_file(dev, mce_device_attrs[i]);
2314
2315	device_unregister(dev);
2316
2317	return err;
2318}
2319
2320static void mce_device_remove(unsigned int cpu)
2321{
2322	struct device *dev = per_cpu(mce_device, cpu);
2323	int i;
2324
2325	if (!cpumask_test_cpu(cpu, mce_device_initialized))
2326		return;
2327
2328	for (i = 0; mce_device_attrs[i]; i++)
2329		device_remove_file(dev, mce_device_attrs[i]);
2330
2331	for (i = 0; i < mca_cfg.banks; i++)
2332		device_remove_file(dev, &mce_banks[i].attr);
2333
2334	device_unregister(dev);
2335	cpumask_clear_cpu(cpu, mce_device_initialized);
2336	per_cpu(mce_device, cpu) = NULL;
2337}
2338
2339/* Make sure there are no machine checks on offlined CPUs. */
2340static void mce_disable_cpu(void *h)
2341{
2342	unsigned long action = *(unsigned long *)h;
2343	int i;
2344
2345	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2346		return;
2347
2348	if (!(action & CPU_TASKS_FROZEN))
2349		cmci_clear();
2350	for (i = 0; i < mca_cfg.banks; i++) {
2351		struct mce_bank *b = &mce_banks[i];
2352
2353		if (b->init)
2354			wrmsrl(MSR_IA32_MCx_CTL(i), 0);
2355	}
2356}
2357
2358static void mce_reenable_cpu(void *h)
2359{
2360	unsigned long action = *(unsigned long *)h;
2361	int i;
2362
2363	if (!mce_available(raw_cpu_ptr(&cpu_info)))
2364		return;
2365
2366	if (!(action & CPU_TASKS_FROZEN))
2367		cmci_reenable();
2368	for (i = 0; i < mca_cfg.banks; i++) {
2369		struct mce_bank *b = &mce_banks[i];
2370
2371		if (b->init)
2372			wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
2373	}
2374}
2375
2376/* Get notified when a CPU comes online or goes offline. Be hotplug friendly. */
2377static int
2378mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2379{
2380	unsigned int cpu = (unsigned long)hcpu;
2381	struct timer_list *t = &per_cpu(mce_timer, cpu);
2382
2383	switch (action & ~CPU_TASKS_FROZEN) {
2384	case CPU_ONLINE:
2385		mce_device_create(cpu);
2386		if (threshold_cpu_callback)
2387			threshold_cpu_callback(action, cpu);
2388		break;
2389	case CPU_DEAD:
2390		if (threshold_cpu_callback)
2391			threshold_cpu_callback(action, cpu);
2392		mce_device_remove(cpu);
2393		mce_intel_hcpu_update(cpu);
2394
2395		/* intentionally ignoring frozen here */
2396		if (!(action & CPU_TASKS_FROZEN))
2397			cmci_rediscover();
2398		break;
2399	case CPU_DOWN_PREPARE:
2400		smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2401		del_timer_sync(t);
2402		break;
2403	case CPU_DOWN_FAILED:
2404		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2405		mce_start_timer(cpu, t);
2406		break;
2407	}
2408
2409	return NOTIFY_OK;
2410}
2411
2412static struct notifier_block mce_cpu_notifier = {
2413	.notifier_call = mce_cpu_callback,
2414};
2415
2416static __init void mce_init_banks(void)
2417{
2418	int i;
2419
2420	for (i = 0; i < mca_cfg.banks; i++) {
2421		struct mce_bank *b = &mce_banks[i];
2422		struct device_attribute *a = &b->attr;
2423
2424		sysfs_attr_init(&a->attr);
2425		a->attr.name	= b->attrname;
2426		snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2427
2428		a->attr.mode	= 0644;
2429		a->show		= show_bank;
2430		a->store	= set_bank;
2431	}
2432}
2433
2434static __init int mcheck_init_device(void)
2435{
2436	int err;
2437	int i = 0;
2438
2439	if (!mce_available(&boot_cpu_data)) {
2440		err = -EIO;
2441		goto err_out;
2442	}
2443
2444	if (!zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL)) {
2445		err = -ENOMEM;
2446		goto err_out;
2447	}
2448
2449	mce_init_banks();
2450
2451	err = subsys_system_register(&mce_subsys, NULL);
2452	if (err)
2453		goto err_out_mem;
2454
2455	cpu_notifier_register_begin();
2456	for_each_online_cpu(i) {
2457		err = mce_device_create(i);
2458		if (err) {
2459			/*
2460			 * Register notifier anyway (and do not unreg it) so
2461			 * that we don't leave undeleted timers, see notifier
2462			 * callback above.
2463			 */
2464			__register_hotcpu_notifier(&mce_cpu_notifier);
2465			cpu_notifier_register_done();
2466			goto err_device_create;
2467		}
2468	}
2469
2470	__register_hotcpu_notifier(&mce_cpu_notifier);
2471	cpu_notifier_register_done();
2472
2473	register_syscore_ops(&mce_syscore_ops);
2474
2475	/* register character device /dev/mcelog */
2476	err = misc_register(&mce_chrdev_device);
2477	if (err)
2478		goto err_register;
2479
2480	return 0;
2481
2482err_register:
2483	unregister_syscore_ops(&mce_syscore_ops);
2484
2485err_device_create:
2486	/*
2487	 * We didn't keep track of which devices were created above, but
2488	 * even if we had, the set of online cpus might have changed.
2489	 * Play safe and remove for every possible cpu, since
2490	 * mce_device_remove() will do the right thing.
2491	 */
2492	for_each_possible_cpu(i)
2493		mce_device_remove(i);
2494
2495err_out_mem:
2496	free_cpumask_var(mce_device_initialized);
2497
2498err_out:
2499	pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
2500
2501	return err;
2502}
2503device_initcall_sync(mcheck_init_device);
2504
2505/*
2506 * Old-style "nomce" boot option, kept for compatibility. Equivalent to "mce=off".
2507 */
2508static int __init mcheck_disable(char *str)
2509{
2510	mca_cfg.disabled = true;
2511	return 1;
2512}
2513__setup("nomce", mcheck_disable);
2514
2515#ifdef CONFIG_DEBUG_FS
2516struct dentry *mce_get_debugfs_dir(void)
2517{
2518	static struct dentry *dmce;
2519
2520	if (!dmce)
2521		dmce = debugfs_create_dir("mce", NULL);
2522
2523	return dmce;
2524}
2525
2526static void mce_reset(void)
2527{
2528	cpu_missing = 0;
2529	atomic_set(&mce_fake_panicked, 0);
2530	atomic_set(&mce_executing, 0);
2531	atomic_set(&mce_callin, 0);
2532	atomic_set(&global_nwo, 0);
2533}
2534
2535static int fake_panic_get(void *data, u64 *val)
2536{
2537	*val = fake_panic;
2538	return 0;
2539}
2540
2541static int fake_panic_set(void *data, u64 val)
2542{
2543	mce_reset();
2544	fake_panic = val;
2545	return 0;
2546}
2547
2548DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2549			fake_panic_set, "%llu\n");
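
/*
 * Testing aid (illustrative, assuming debugfs is mounted at /sys/kernel/debug):
 * mcheck_debugfs_init() below exposes this attribute as
 * /sys/kernel/debug/mce/fake_panic.  Setting it resets the rendezvous
 * counters via mce_reset(), and the panic paths elsewhere in this file then
 * log a fake panic instead of calling panic(), which is mainly useful
 * together with error injection.
 */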
2550
2551static int __init mcheck_debugfs_init(void)
2552{
2553	struct dentry *dmce, *ffake_panic;
2554
2555	dmce = mce_get_debugfs_dir();
2556	if (!dmce)
2557		return -ENOMEM;
2558	ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
2559					  &fake_panic_fops);
2560	if (!ffake_panic)
2561		return -ENOMEM;
2562
2563	return 0;
2564}
2565late_initcall(mcheck_debugfs_init);
2566#endif
2567