/* Pseudo NMI support on sparc64 systems.
 *
 * Copyright (C) 2009 David S. Miller <davem@davemloft.net>
 *
 * The NMI watchdog support and infrastructure is based almost
 * entirely upon the x86 NMI support code.
 */
#include <linux/kernel.h>
#include <linux/param.h>
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/nmi.h>
#include <linux/export.h>
#include <linux/kprobes.h>
#include <linux/kernel_stat.h>
#include <linux/reboot.h>
#include <linux/slab.h>
#include <linux/kdebug.h>
#include <linux/delay.h>
#include <linux/smp.h>

#include <asm/perf_event.h>
#include <asm/ptrace.h>
#include <asm/pcr.h>

#include "kstack.h"

/* We don't have a real NMI on sparc64, but we can fake one
 * up using profiling counter overflow interrupts and interrupt
 * levels.
 *
 * The profile overflow interrupts at level 15, so we use
 * level 14 as our IRQ off level.
 */

static int panic_on_timeout;

/* nmi_active:
 * >0: the NMI watchdog is active, but can be disabled
 * <0: the NMI watchdog has not been set up, and cannot be enabled
 *  0: the NMI watchdog is disabled, but can be enabled
 */
atomic_t nmi_active = ATOMIC_INIT(0);		/* oprofile uses this */
EXPORT_SYMBOL(nmi_active);

static unsigned int nmi_hz = HZ;
static DEFINE_PER_CPU(short, wd_enabled);
static int endflag __initdata;

static DEFINE_PER_CPU(unsigned int, last_irq_sum);
static DEFINE_PER_CPU(long, alert_counter);
static DEFINE_PER_CPU(int, nmi_touch);

void touch_nmi_watchdog(void)
{
	if (atomic_read(&nmi_active)) {
		int cpu;

		for_each_present_cpu(cpu) {
			if (per_cpu(nmi_touch, cpu) != 1)
				per_cpu(nmi_touch, cpu) = 1;
		}
	}

	touch_softlockup_watchdog();
}
EXPORT_SYMBOL(touch_nmi_watchdog);

static void die_nmi(const char *str, struct pt_regs *regs, int do_panic)
{
	int this_cpu = smp_processor_id();

	if (notify_die(DIE_NMIWATCHDOG, str, regs, 0,
		       pt_regs_trap_type(regs), SIGINT) == NOTIFY_STOP)
		return;

	if (do_panic || panic_on_oops)
		panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
	else
		WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
}

notrace __kprobes void perfctr_irq(int irq, struct pt_regs *regs)
{
	unsigned int sum, touched = 0;
	void *orig_sp;

	clear_softint(1 << irq);

	local_cpu_data().__nmi_count++;

	nmi_enter();

	orig_sp = set_hardirq_stack();

	if (notify_die(DIE_NMI, "nmi", regs, 0,
		       pt_regs_trap_type(regs), SIGINT) == NOTIFY_STOP)
		touched = 1;
	else
		pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);

	sum = local_cpu_data().irq0_irqs;
	if (__this_cpu_read(nmi_touch)) {
		__this_cpu_write(nmi_touch, 0);
		touched = 1;
	}
	if (!touched && __this_cpu_read(last_irq_sum) == sum) {
		__this_cpu_inc(alert_counter);
		if (__this_cpu_read(alert_counter) == 30 * nmi_hz)
			die_nmi("BUG: NMI Watchdog detected LOCKUP",
				regs, panic_on_timeout);
	} else {
		__this_cpu_write(last_irq_sum, sum);
		__this_cpu_write(alert_counter, 0);
	}
	if (__this_cpu_read(wd_enabled)) {
		pcr_ops->write_pic(0, pcr_ops->nmi_picl_value(nmi_hz));
		pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_enable);
	}

	restore_hardirq_stack(orig_sp);

	nmi_exit();
}

static inline unsigned int get_nmi_count(int cpu)
{
	return cpu_data(cpu).__nmi_count;
}

static __init void nmi_cpu_busy(void *data)
{
	while (endflag == 0)
		mb();
}
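/* Illustrative sketch, not part of the original file: code that busy-waits
 * with interrupts disabled for a long time would call touch_nmi_watchdog()
 * periodically so the perfctr_irq() handler above sees nmi_touch set and
 * resets alert_counter instead of reporting a hard lockup.  The function
 * name and the fixed one-millisecond step are hypothetical.
 */
static void __maybe_unused example_long_busy_wait(unsigned int ms)
{
	while (ms--) {
		udelay(1000);		/* burn one millisecond */
		touch_nmi_watchdog();	/* tell the watchdog we are alive */
	}
}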
static void report_broken_nmi(int cpu, int *prev_nmi_count)
{
	printk(KERN_CONT "\n");

	printk(KERN_WARNING
		"WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n",
			cpu, prev_nmi_count[cpu], get_nmi_count(cpu));

	printk(KERN_WARNING
		"Please report this to bugzilla.kernel.org,\n");
	printk(KERN_WARNING
		"and attach the output of the 'dmesg' command.\n");

	per_cpu(wd_enabled, cpu) = 0;
	atomic_dec(&nmi_active);
}

void stop_nmi_watchdog(void *unused)
{
	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);
	__this_cpu_write(wd_enabled, 0);
	atomic_dec(&nmi_active);
}

static int __init check_nmi_watchdog(void)
{
	unsigned int *prev_nmi_count;
	int cpu, err;

	if (!atomic_read(&nmi_active))
		return 0;

	prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(unsigned int), GFP_KERNEL);
	if (!prev_nmi_count) {
		err = -ENOMEM;
		goto error;
	}

	printk(KERN_INFO "Testing NMI watchdog ... ");

	smp_call_function(nmi_cpu_busy, (void *)&endflag, 0);

	for_each_possible_cpu(cpu)
		prev_nmi_count[cpu] = get_nmi_count(cpu);
	local_irq_enable();
	mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */

	for_each_online_cpu(cpu) {
		if (!per_cpu(wd_enabled, cpu))
			continue;
		if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5)
			report_broken_nmi(cpu, prev_nmi_count);
	}
	endflag = 1;
	if (!atomic_read(&nmi_active)) {
		kfree(prev_nmi_count);
		atomic_set(&nmi_active, -1);
		err = -ENODEV;
		goto error;
	}
	printk("OK.\n");

	nmi_hz = 1;

	kfree(prev_nmi_count);
	return 0;
error:
	on_each_cpu(stop_nmi_watchdog, NULL, 1);
	return err;
}

void start_nmi_watchdog(void *unused)
{
	__this_cpu_write(wd_enabled, 1);
	atomic_inc(&nmi_active);

	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);
	pcr_ops->write_pic(0, pcr_ops->nmi_picl_value(nmi_hz));

	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_enable);
}

static void nmi_adjust_hz_one(void *unused)
{
	if (!__this_cpu_read(wd_enabled))
		return;

	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_disable);
	pcr_ops->write_pic(0, pcr_ops->nmi_picl_value(nmi_hz));

	pcr_ops->write_pcr(0, pcr_ops->pcr_nmi_enable);
}

void nmi_adjust_hz(unsigned int new_hz)
{
	nmi_hz = new_hz;
	on_each_cpu(nmi_adjust_hz_one, NULL, 1);
}
EXPORT_SYMBOL_GPL(nmi_adjust_hz);

static int nmi_shutdown(struct notifier_block *nb, unsigned long cmd, void *p)
{
	on_each_cpu(stop_nmi_watchdog, NULL, 1);
	return 0;
}

static struct notifier_block nmi_reboot_notifier = {
	.notifier_call = nmi_shutdown,
};

int __init nmi_init(void)
{
	int err;

	on_each_cpu(start_nmi_watchdog, NULL, 1);

	err = check_nmi_watchdog();
	if (!err) {
		err = register_reboot_notifier(&nmi_reboot_notifier);
		if (err) {
			on_each_cpu(stop_nmi_watchdog, NULL, 1);
			atomic_set(&nmi_active, -1);
		}
	}

	return err;
}

static int __init setup_nmi_watchdog(char *str)
{
	if (!strncmp(str, "panic", 5))
		panic_on_timeout = 1;

	return 0;
}
__setup("nmi_watchdog=", setup_nmi_watchdog);
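
/* Illustrative sketch, not part of the original file: a subsystem that wants
 * exclusive use of the performance counters (a profiler, for example) could
 * quiesce the watchdog on every CPU before programming the PCRs, then re-arm
 * it and pick a sample rate when it is done.  The two function names below
 * are hypothetical; stop_nmi_watchdog(), start_nmi_watchdog() and
 * nmi_adjust_hz() are the interfaces defined above.
 */
static void __maybe_unused example_profiler_grab_counters(void)
{
	on_each_cpu(stop_nmi_watchdog, NULL, 1);	/* release the PCRs */
}

static void __maybe_unused example_profiler_release_counters(void)
{
	on_each_cpu(start_nmi_watchdog, NULL, 1);	/* re-arm the watchdog */
	nmi_adjust_hz(1);				/* roughly one check per second */
}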