1/*
 * OPAL hypervisor Maintenance interrupt handling support in PowerNV.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; If not, see <http://www.gnu.org/licenses/>.
16 *
17 * Copyright 2014 IBM Corporation
18 * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
19 */
20
21#undef DEBUG
22
23#include <linux/kernel.h>
24#include <linux/init.h>
25#include <linux/of.h>
26#include <linux/mm.h>
27#include <linux/slab.h>
28
29#include <asm/opal.h>
30#include <asm/cputable.h>
31#include <asm/machdep.h>
32
33static int opal_hmi_handler_nb_init;
34struct OpalHmiEvtNode {
35	struct list_head list;
36	struct OpalHMIEvent hmi_evt;
37};
38
39struct xstop_reason {
40	uint32_t xstop_reason;
41	const char *unit_failed;
42	const char *description;
43};
44
45static LIST_HEAD(opal_hmi_evt_list);
46static DEFINE_SPINLOCK(opal_hmi_evt_lock);
47
48static void print_core_checkstop_reason(const char *level,
49					struct OpalHMIEvent *hmi_evt)
50{
51	int i;
52	static const struct xstop_reason xstop_reason[] = {
53		{ CORE_CHECKSTOP_IFU_REGFILE, "IFU",
54				"RegFile core check stop" },
55		{ CORE_CHECKSTOP_IFU_LOGIC, "IFU", "Logic core check stop" },
56		{ CORE_CHECKSTOP_PC_DURING_RECOV, "PC",
57				"Core checkstop during recovery" },
58		{ CORE_CHECKSTOP_ISU_REGFILE, "ISU",
59				"RegFile core check stop (mapper error)" },
60		{ CORE_CHECKSTOP_ISU_LOGIC, "ISU", "Logic core check stop" },
61		{ CORE_CHECKSTOP_FXU_LOGIC, "FXU", "Logic core check stop" },
62		{ CORE_CHECKSTOP_VSU_LOGIC, "VSU", "Logic core check stop" },
63		{ CORE_CHECKSTOP_PC_RECOV_IN_MAINT_MODE, "PC",
64				"Recovery in maintenance mode" },
65		{ CORE_CHECKSTOP_LSU_REGFILE, "LSU",
66				"RegFile core check stop" },
67		{ CORE_CHECKSTOP_PC_FWD_PROGRESS, "PC",
68				"Forward Progress Error" },
69		{ CORE_CHECKSTOP_LSU_LOGIC, "LSU", "Logic core check stop" },
70		{ CORE_CHECKSTOP_PC_LOGIC, "PC", "Logic core check stop" },
71		{ CORE_CHECKSTOP_PC_HYP_RESOURCE, "PC",
72				"Hypervisor Resource error - core check stop" },
73		{ CORE_CHECKSTOP_PC_HANG_RECOV_FAILED, "PC",
74				"Hang Recovery Failed (core check stop)" },
75		{ CORE_CHECKSTOP_PC_AMBI_HANG_DETECTED, "PC",
76				"Ambiguous Hang Detected (unknown source)" },
77		{ CORE_CHECKSTOP_PC_DEBUG_TRIG_ERR_INJ, "PC",
78				"Debug Trigger Error inject" },
79		{ CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ, "PC",
80				"Hypervisor check stop via SPRC/SPRD" },
81	};
82
83	/* Validity check */
84	if (!hmi_evt->u.xstop_error.xstop_reason) {
85		printk("%s	Unknown Core check stop.\n", level);
86		return;
87	}
88
89	printk("%s	CPU PIR: %08x\n", level,
90			be32_to_cpu(hmi_evt->u.xstop_error.u.pir));
91	for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
92		if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
93					xstop_reason[i].xstop_reason)
94			printk("%s	[Unit: %-3s] %s\n", level,
95					xstop_reason[i].unit_failed,
96					xstop_reason[i].description);
97}
98
99static void print_nx_checkstop_reason(const char *level,
100					struct OpalHMIEvent *hmi_evt)
101{
102	int i;
103	static const struct xstop_reason xstop_reason[] = {
104		{ NX_CHECKSTOP_SHM_INVAL_STATE_ERR, "DMA & Engine",
105					"SHM invalid state error" },
106		{ NX_CHECKSTOP_DMA_INVAL_STATE_ERR_1, "DMA & Engine",
107					"DMA invalid state error bit 15" },
108		{ NX_CHECKSTOP_DMA_INVAL_STATE_ERR_2, "DMA & Engine",
109					"DMA invalid state error bit 16" },
110		{ NX_CHECKSTOP_DMA_CH0_INVAL_STATE_ERR, "DMA & Engine",
111					"Channel 0 invalid state error" },
112		{ NX_CHECKSTOP_DMA_CH1_INVAL_STATE_ERR, "DMA & Engine",
113					"Channel 1 invalid state error" },
114		{ NX_CHECKSTOP_DMA_CH2_INVAL_STATE_ERR, "DMA & Engine",
115					"Channel 2 invalid state error" },
116		{ NX_CHECKSTOP_DMA_CH3_INVAL_STATE_ERR, "DMA & Engine",
117					"Channel 3 invalid state error" },
118		{ NX_CHECKSTOP_DMA_CH4_INVAL_STATE_ERR, "DMA & Engine",
119					"Channel 4 invalid state error" },
120		{ NX_CHECKSTOP_DMA_CH5_INVAL_STATE_ERR, "DMA & Engine",
121					"Channel 5 invalid state error" },
122		{ NX_CHECKSTOP_DMA_CH6_INVAL_STATE_ERR, "DMA & Engine",
123					"Channel 6 invalid state error" },
124		{ NX_CHECKSTOP_DMA_CH7_INVAL_STATE_ERR, "DMA & Engine",
125					"Channel 7 invalid state error" },
126		{ NX_CHECKSTOP_DMA_CRB_UE, "DMA & Engine",
127					"UE error on CRB(CSB address, CCB)" },
128		{ NX_CHECKSTOP_DMA_CRB_SUE, "DMA & Engine",
129					"SUE error on CRB(CSB address, CCB)" },
130		{ NX_CHECKSTOP_PBI_ISN_UE, "PowerBus Interface",
131		"CRB Kill ISN received while holding ISN with UE error" },
132	};
133
134	/* Validity check */
135	if (!hmi_evt->u.xstop_error.xstop_reason) {
136		printk("%s	Unknown NX check stop.\n", level);
137		return;
138	}
139
140	printk("%s	NX checkstop on CHIP ID: %x\n", level,
141			be32_to_cpu(hmi_evt->u.xstop_error.u.chip_id));
142	for (i = 0; i < ARRAY_SIZE(xstop_reason); i++)
143		if (be32_to_cpu(hmi_evt->u.xstop_error.xstop_reason) &
144					xstop_reason[i].xstop_reason)
145			printk("%s	[Unit: %-3s] %s\n", level,
146					xstop_reason[i].unit_failed,
147					xstop_reason[i].description);
148}
149
150static void print_checkstop_reason(const char *level,
151					struct OpalHMIEvent *hmi_evt)
152{
153	switch (hmi_evt->u.xstop_error.xstop_type) {
154	case CHECKSTOP_TYPE_CORE:
155		print_core_checkstop_reason(level, hmi_evt);
156		break;
157	case CHECKSTOP_TYPE_NX:
158		print_nx_checkstop_reason(level, hmi_evt);
159		break;
160	case CHECKSTOP_TYPE_UNKNOWN:
161		printk("%s	Unknown Malfunction Alert.\n", level);
162		break;
163	}
164}
165
166static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt)
167{
168	const char *level, *sevstr, *error_info;
169	static const char *hmi_error_types[] = {
170		"Malfunction Alert",
171		"Processor Recovery done",
172		"Processor recovery occurred again",
173		"Processor recovery occurred for masked error",
174		"Timer facility experienced an error",
175		"TFMR SPR is corrupted",
176		"UPS (Uniterrupted Power System) Overflow indication",
177		"An XSCOM operation failure",
178		"An XSCOM operation completed",
179		"SCOM has set a reserved FIR bit to cause recovery",
180		"Debug trigger has set a reserved FIR bit to cause recovery",
181		"A hypervisor resource error occurred"
182	};
183
184	/* Print things out */
185	if (hmi_evt->version < OpalHMIEvt_V1) {
186		pr_err("HMI Interrupt, Unknown event version %d !\n",
187			hmi_evt->version);
188		return;
189	}
190	switch (hmi_evt->severity) {
191	case OpalHMI_SEV_NO_ERROR:
192		level = KERN_INFO;
193		sevstr = "Harmless";
194		break;
195	case OpalHMI_SEV_WARNING:
196		level = KERN_WARNING;
197		sevstr = "";
198		break;
199	case OpalHMI_SEV_ERROR_SYNC:
200		level = KERN_ERR;
201		sevstr = "Severe";
202		break;
203	case OpalHMI_SEV_FATAL:
204	default:
205		level = KERN_ERR;
206		sevstr = "Fatal";
207		break;
208	}
209
210	printk("%s%s Hypervisor Maintenance interrupt [%s]\n",
211		level, sevstr,
212		hmi_evt->disposition == OpalHMI_DISPOSITION_RECOVERED ?
213		"Recovered" : "Not recovered");
214	error_info = hmi_evt->type < ARRAY_SIZE(hmi_error_types) ?
215			hmi_error_types[hmi_evt->type]
216			: "Unknown";
217	printk("%s Error detail: %s\n", level, error_info);
218	printk("%s	HMER: %016llx\n", level, be64_to_cpu(hmi_evt->hmer));
219	if ((hmi_evt->type == OpalHMI_ERROR_TFAC) ||
220		(hmi_evt->type == OpalHMI_ERROR_TFMR_PARITY))
221		printk("%s	TFMR: %016llx\n", level,
222						be64_to_cpu(hmi_evt->tfmr));
223
224	if (hmi_evt->version < OpalHMIEvt_V2)
225		return;
226
227	/* OpalHMIEvt_V2 and above provides reason for malfunction alert. */
228	if (hmi_evt->type == OpalHMI_ERROR_MALFUNC_ALERT)
229		print_checkstop_reason(level, hmi_evt);
230}
231
232static void hmi_event_handler(struct work_struct *work)
233{
234	unsigned long flags;
235	struct OpalHMIEvent *hmi_evt;
236	struct OpalHmiEvtNode *msg_node;
237	uint8_t disposition;
238	struct opal_msg msg;
239	int unrecoverable = 0;
240
241	spin_lock_irqsave(&opal_hmi_evt_lock, flags);
242	while (!list_empty(&opal_hmi_evt_list)) {
243		msg_node = list_entry(opal_hmi_evt_list.next,
244					   struct OpalHmiEvtNode, list);
245		list_del(&msg_node->list);
246		spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
247
248		hmi_evt = (struct OpalHMIEvent *) &msg_node->hmi_evt;
249		print_hmi_event_info(hmi_evt);
250		disposition = hmi_evt->disposition;
251		kfree(msg_node);
252
253		/*
254		 * Check if HMI event has been recovered or not. If not
255		 * then kernel can't continue, we need to panic.
256		 * But before we do that, display all the HMI event
257		 * available on the list and set unrecoverable flag to 1.
258		 */
259		if (disposition != OpalHMI_DISPOSITION_RECOVERED)
260			unrecoverable = 1;
261
262		spin_lock_irqsave(&opal_hmi_evt_lock, flags);
263	}
264	spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
265
266	if (unrecoverable) {
267		int ret;
268
269		/* Pull all HMI events from OPAL before we panic. */
270		while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) {
271			u32 type;
272
273			type = be32_to_cpu(msg.msg_type);
274
275			/* skip if not HMI event */
276			if (type != OPAL_MSG_HMI_EVT)
277				continue;
278
279			/* HMI event info starts from param[0] */
280			hmi_evt = (struct OpalHMIEvent *)&msg.params[0];
281			print_hmi_event_info(hmi_evt);
282		}
283
284		/*
285		 * Unrecoverable HMI exception. We need to inform BMC/OCC
286		 * about this error so that it can collect relevant data
287		 * for error analysis before rebooting.
288		 */
289		ret = opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR,
290			"Unrecoverable HMI exception");
291		if (ret == OPAL_UNSUPPORTED) {
292			pr_emerg("Reboot type %d not supported\n",
293						OPAL_REBOOT_PLATFORM_ERROR);
294		}
295
296		/*
297		 * Fall through and panic if opal_cec_reboot2() returns
298		 * OPAL_UNSUPPORTED.
299		 */
300		panic("Unrecoverable HMI exception");
301	}
302}
303
304static DECLARE_WORK(hmi_event_work, hmi_event_handler);
305/*
306 * opal_handle_hmi_event - notifier handler that queues up HMI events
307 * to be preocessed later.
308 */
309static int opal_handle_hmi_event(struct notifier_block *nb,
310			  unsigned long msg_type, void *msg)
311{
312	unsigned long flags;
313	struct OpalHMIEvent *hmi_evt;
314	struct opal_msg *hmi_msg = msg;
315	struct OpalHmiEvtNode *msg_node;
316
317	/* Sanity Checks */
318	if (msg_type != OPAL_MSG_HMI_EVT)
319		return 0;
320
321	/* HMI event info starts from param[0] */
322	hmi_evt = (struct OpalHMIEvent *)&hmi_msg->params[0];
323
324	/* Delay the logging of HMI events to workqueue. */
325	msg_node = kzalloc(sizeof(*msg_node), GFP_ATOMIC);
326	if (!msg_node) {
327		pr_err("HMI: out of memory, Opal message event not handled\n");
328		return -ENOMEM;
329	}
330	memcpy(&msg_node->hmi_evt, hmi_evt, sizeof(struct OpalHMIEvent));
331
332	spin_lock_irqsave(&opal_hmi_evt_lock, flags);
333	list_add(&msg_node->list, &opal_hmi_evt_list);
334	spin_unlock_irqrestore(&opal_hmi_evt_lock, flags);
335
336	schedule_work(&hmi_event_work);
337	return 0;
338}
339
340static struct notifier_block opal_hmi_handler_nb = {
341	.notifier_call	= opal_handle_hmi_event,
342	.next		= NULL,
343	.priority	= 0,
344};
345
346int __init opal_hmi_handler_init(void)
347{
348	int ret;
349
350	if (!opal_hmi_handler_nb_init) {
351		ret = opal_message_notifier_register(
352				OPAL_MSG_HMI_EVT, &opal_hmi_handler_nb);
353		if (ret) {
354			pr_err("%s: Can't register OPAL event notifier (%d)\n",
355			       __func__, ret);
356			return ret;
357		}
358		opal_hmi_handler_nb_init = 1;
359	}
360	return 0;
361}
362