/*
 * Architecture specific (i386/x86_64) functions for kexec based crash dumps.
 *
 * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
 *
 * Copyright (C) IBM Corporation, 2004. All rights reserved.
 * Copyright (C) Red Hat Inc., 2014. All rights reserved.
 * Authors:
 *      Vivek Goyal <vgoyal@redhat.com>
 *
 */

#define pr_fmt(fmt)	"kexec: " fmt

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/reboot.h>
#include <linux/kexec.h>
#include <linux/delay.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#include <asm/processor.h>
#include <asm/hardirq.h>
#include <asm/nmi.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/hpet.h>
#include <linux/kdebug.h>
#include <asm/cpu.h>
#include <asm/reboot.h>
#include <asm/virtext.h>

/* Alignment required for elf header segment */
#define ELF_CORE_HEADER_ALIGN   4096

/* This primarily represents the number of split ranges due to exclusion */
#define CRASH_MAX_RANGES	16

struct crash_mem_range {
	u64 start, end;
};

struct crash_mem {
	unsigned int nr_ranges;
	struct crash_mem_range ranges[CRASH_MAX_RANGES];
};

/* Misc data about ram ranges needed to prepare elf headers */
struct crash_elf_data {
	struct kimage *image;
	/*
	 * Total number of ram ranges we have after various adjustments for
	 * GART, crash reserved region etc.
	 */
	unsigned int max_nr_ranges;
	unsigned long gart_start, gart_end;

	/* Pointer to elf header */
	void *ehdr;
	/* Pointer to next phdr */
	void *bufp;
	struct crash_mem mem;
};

/* Used while preparing memory map entries for second kernel */
struct crash_memmap_data {
	struct boot_params *params;
	/* Type of memory */
	unsigned int type;
};

/*
 * This is used to VMCLEAR all VMCSs loaded on the
 * processor. When the kvm_intel module is loaded, the
 * callback function pointer is assigned.
 *
 * Protected by RCU.
 */
crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL;
EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
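/* Zero-filled source data for the backup segment added in crash_load_segments() */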
unsigned long crash_zero_bytes;

static inline void cpu_crash_vmclear_loaded_vmcss(void)
{
	crash_vmclear_fn *do_vmclear_operation = NULL;

	rcu_read_lock();
	do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss);
	if (do_vmclear_operation)
		do_vmclear_operation();
	rcu_read_unlock();
}

#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)

static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
{
#ifdef CONFIG_X86_32
	struct pt_regs fixed_regs;

	if (!user_mode(regs)) {
		crash_fixup_ss_esp(&fixed_regs, regs);
		regs = &fixed_regs;
	}
#endif
	crash_save_cpu(regs, cpu);

	/*
	 * VMCLEAR VMCSs loaded on all cpus if needed.
	 */
	cpu_crash_vmclear_loaded_vmcss();

	/* Disable VMX or SVM if needed.
	 *
	 * We need to disable virtualization on all CPUs.
	 * Having VMX or SVM enabled on any CPU may break rebooting
	 * after the kdump kernel has finished its task.
	 */
	cpu_emergency_vmxoff();
	cpu_emergency_svm_disable();

	disable_local_APIC();
}

static void kdump_nmi_shootdown_cpus(void)
{
	nmi_shootdown_cpus(kdump_nmi_callback);

	disable_local_APIC();
}

#else
static void kdump_nmi_shootdown_cpus(void)
{
	/* There are no cpus to shootdown */
}
#endif

void native_machine_crash_shutdown(struct pt_regs *regs)
{
	/* This function is only called after the system
	 * has panicked or is otherwise in a critical state.
	 * The minimum amount of code to allow a kexec'd kernel
	 * to run successfully needs to happen here.
	 *
	 * In practice this means shooting down the other cpus in
	 * an SMP system.
	 */
	/* The kernel is broken so disable interrupts */
	local_irq_disable();

	kdump_nmi_shootdown_cpus();

	/*
	 * VMCLEAR VMCSs loaded on this cpu if needed.
	 */
	cpu_crash_vmclear_loaded_vmcss();

	/* Booting kdump kernel with VMX or SVM enabled won't work,
	 * because (among other limitations) we can't disable paging
	 * with the virt flags.
	 */
	cpu_emergency_vmxoff();
	cpu_emergency_svm_disable();

#ifdef CONFIG_X86_IO_APIC
	/* Prevent crash_kexec() from deadlocking on ioapic_lock. */
	ioapic_zap_locks();
	disable_IO_APIC();
#endif
	lapic_shutdown();
#ifdef CONFIG_HPET_TIMER
	hpet_disable();
#endif
	crash_save_cpu(regs, safe_smp_processor_id());
}

#ifdef CONFIG_KEXEC_FILE
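/* Callback for walk_system_ram_res(): count the System RAM ranges */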
static int get_nr_ram_ranges_callback(u64 start, u64 end, void *arg)
{
	unsigned int *nr_ranges = arg;

	(*nr_ranges)++;
	return 0;
}

static int get_gart_ranges_callback(u64 start, u64 end, void *arg)
{
	struct crash_elf_data *ced = arg;

	ced->gart_start = start;
	ced->gart_end = end;

	/* Not expecting more than 1 gart aperture */
	return 1;
}


/* Gather all the required information to prepare elf headers for ram regions */
static void fill_up_crash_elf_data(struct crash_elf_data *ced,
				   struct kimage *image)
{
	unsigned int nr_ranges = 0;

	ced->image = image;

	walk_system_ram_res(0, -1, &nr_ranges,
				get_nr_ram_ranges_callback);

	ced->max_nr_ranges = nr_ranges;

	/*
	 * We don't create ELF headers for the GART aperture, as an attempt
	 * to dump this memory in the second kernel leads to a hang/crash.
	 * If a GART aperture is present, it needs to be excluded, which
	 * could require an extra phdr.
	 */
	walk_iomem_res("GART", IORESOURCE_MEM, 0, -1,
				ced, get_gart_ranges_callback);

	/*
	 * If we have a GART region, excluding it could split a memory
	 * range, resulting in an extra header. Account for that.
	 */
	if (ced->gart_end)
		ced->max_nr_ranges++;

	/* Exclusion of the crash region could split memory ranges */
	ced->max_nr_ranges++;

	/* If crashk_low_res is not 0, another range split is possible */
	if (crashk_low_res.end)
		ced->max_nr_ranges++;
}

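/*
 * Remove the region [mstart, mend] from the ranges in @mem. Ranges that are
 * only partially covered get truncated; a range that fully contains the
 * excluded region is split in two, growing the array by one entry.
 */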
static int exclude_mem_range(struct crash_mem *mem,
		unsigned long long mstart, unsigned long long mend)
{
	int i, j;
	unsigned long long start, end;
	struct crash_mem_range temp_range = {0, 0};

	for (i = 0; i < mem->nr_ranges; i++) {
		start = mem->ranges[i].start;
		end = mem->ranges[i].end;

		if (mstart > end || mend < start)
			continue;

		/* Truncate any area outside of range */
		if (mstart < start)
			mstart = start;
		if (mend > end)
			mend = end;

		/* Found completely overlapping range */
		if (mstart == start && mend == end) {
			mem->ranges[i].start = 0;
			mem->ranges[i].end = 0;
			if (i < mem->nr_ranges - 1) {
				/* Shift rest of the ranges to left */
				for (j = i; j < mem->nr_ranges - 1; j++) {
					mem->ranges[j].start =
						mem->ranges[j+1].start;
					mem->ranges[j].end =
							mem->ranges[j+1].end;
				}
			}
			mem->nr_ranges--;
			return 0;
		}

		if (mstart > start && mend < end) {
			/* Split original range */
			mem->ranges[i].end = mstart - 1;
			temp_range.start = mend + 1;
			temp_range.end = end;
		} else if (mstart != start)
			mem->ranges[i].end = mstart - 1;
		else
			mem->ranges[i].start = mend + 1;
		break;
	}

	/* If a split happened, add the split range to the array */
	if (!temp_range.end)
		return 0;

	/* Split happened */
	if (i == CRASH_MAX_RANGES - 1) {
		pr_err("Too many crash ranges after split\n");
		return -ENOMEM;
	}

	/* Location where new range should go */
	j = i + 1;
	if (j < mem->nr_ranges) {
		/* Move over all ranges one slot towards the end */
		for (i = mem->nr_ranges - 1; i >= j; i--)
			mem->ranges[i + 1] = mem->ranges[i];
	}

	mem->ranges[j].start = temp_range.start;
	mem->ranges[j].end = temp_range.end;
	mem->nr_ranges++;
	return 0;
}

/*
 * Look for any unwanted ranges between mstart and mend and remove them. This
 * might split ranges; the resulting ranges are put in the ced->mem.ranges[]
 * array.
 */
static int elf_header_exclude_ranges(struct crash_elf_data *ced,
		unsigned long long mstart, unsigned long long mend)
{
	struct crash_mem *cmem = &ced->mem;
	int ret = 0;

	memset(cmem->ranges, 0, sizeof(cmem->ranges));

	cmem->ranges[0].start = mstart;
	cmem->ranges[0].end = mend;
	cmem->nr_ranges = 1;

	/* Exclude crashkernel region */
	ret = exclude_mem_range(cmem, crashk_res.start, crashk_res.end);
	if (ret)
		return ret;

	if (crashk_low_res.end) {
		ret = exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end);
		if (ret)
			return ret;
	}

	/* Exclude GART region */
	if (ced->gart_end) {
		ret = exclude_mem_range(cmem, ced->gart_start, ced->gart_end);
		if (ret)
			return ret;
	}

	return ret;
}

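/*
 * Callback for walk_system_ram_res(): carve the excluded regions out of this
 * RAM range and emit one PT_LOAD program header per remaining piece.
 */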
static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg)
{
	struct crash_elf_data *ced = arg;
	Elf64_Ehdr *ehdr;
	Elf64_Phdr *phdr;
	unsigned long mstart, mend;
	struct kimage *image = ced->image;
	struct crash_mem *cmem;
	int ret, i;

	ehdr = ced->ehdr;

	/* Exclude unwanted mem ranges */
	ret = elf_header_exclude_ranges(ced, start, end);
	if (ret)
		return ret;

	/* Go through all the ranges in ced->mem.ranges[] and prepare phdr */
	cmem = &ced->mem;

	for (i = 0; i < cmem->nr_ranges; i++) {
		mstart = cmem->ranges[i].start;
		mend = cmem->ranges[i].end;

		phdr = ced->bufp;
		ced->bufp += sizeof(Elf64_Phdr);

		phdr->p_type = PT_LOAD;
		phdr->p_flags = PF_R|PF_W|PF_X;
		phdr->p_offset = mstart;

		/*
		 * If a range matches backup region, adjust offset to backup
		 * segment.
		 */
		if (mstart == image->arch.backup_src_start &&
		    (mend - mstart + 1) == image->arch.backup_src_sz)
			phdr->p_offset = image->arch.backup_load_addr;

		phdr->p_paddr = mstart;
		phdr->p_vaddr = (unsigned long long) __va(mstart);
		phdr->p_filesz = phdr->p_memsz = mend - mstart + 1;
		phdr->p_align = 0;
		ehdr->e_phnum++;
		pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n",
			phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz,
			ehdr->e_phnum, phdr->p_offset);
	}

	return ret;
}

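/*
 * Build the ELF64 core headers in a single vzalloc'd buffer: one Ehdr,
 * a PT_NOTE phdr per present cpu, a PT_NOTE phdr for vmcoreinfo, and
 * PT_LOAD phdrs for the kernel text (on x86_64) and System RAM ranges.
 */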
static int prepare_elf64_headers(struct crash_elf_data *ced,
		void **addr, unsigned long *sz)
{
	Elf64_Ehdr *ehdr;
	Elf64_Phdr *phdr;
	unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz;
	unsigned char *buf, *bufp;
	unsigned int cpu;
	unsigned long long notes_addr;
	int ret;

	/* extra phdr for vmcoreinfo elf note */
	nr_phdr = nr_cpus + 1;
	nr_phdr += ced->max_nr_ranges;

	/*
	 * kexec-tools creates an extra PT_LOAD phdr for the kernel text
	 * mapping area on x86_64 (ffffffff80000000 - ffffffffa0000000).
	 * This seems to be required by tools like gdb. So the same physical
	 * memory will be mapped in two ELF headers: one will contain kernel
	 * text virtual addresses and the other will have __va(physical)
	 * addresses.
	 */

	nr_phdr++;
	elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr);
	elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN);

	buf = vzalloc(elf_sz);
	if (!buf)
		return -ENOMEM;

	bufp = buf;
	ehdr = (Elf64_Ehdr *)bufp;
	bufp += sizeof(Elf64_Ehdr);
	memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
	ehdr->e_ident[EI_CLASS] = ELFCLASS64;
	ehdr->e_ident[EI_DATA] = ELFDATA2LSB;
	ehdr->e_ident[EI_VERSION] = EV_CURRENT;
	ehdr->e_ident[EI_OSABI] = ELF_OSABI;
	memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD);
	ehdr->e_type = ET_CORE;
	ehdr->e_machine = ELF_ARCH;
	ehdr->e_version = EV_CURRENT;
	ehdr->e_phoff = sizeof(Elf64_Ehdr);
	ehdr->e_ehsize = sizeof(Elf64_Ehdr);
	ehdr->e_phentsize = sizeof(Elf64_Phdr);

	/* Prepare one phdr of type PT_NOTE for each present cpu */
	for_each_present_cpu(cpu) {
		phdr = (Elf64_Phdr *)bufp;
		bufp += sizeof(Elf64_Phdr);
		phdr->p_type = PT_NOTE;
		notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu));
		phdr->p_offset = phdr->p_paddr = notes_addr;
		phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t);
		(ehdr->e_phnum)++;
	}

	/* Prepare one PT_NOTE header for vmcoreinfo */
	phdr = (Elf64_Phdr *)bufp;
	bufp += sizeof(Elf64_Phdr);
	phdr->p_type = PT_NOTE;
	phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note();
	phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note);
	(ehdr->e_phnum)++;

#ifdef CONFIG_X86_64
	/* Prepare PT_LOAD type program header for kernel text region */
	phdr = (Elf64_Phdr *)bufp;
	bufp += sizeof(Elf64_Phdr);
	phdr->p_type = PT_LOAD;
	phdr->p_flags = PF_R|PF_W|PF_X;
	phdr->p_vaddr = (Elf64_Addr)_text;
	phdr->p_filesz = phdr->p_memsz = _end - _text;
	phdr->p_offset = phdr->p_paddr = __pa_symbol(_text);
	(ehdr->e_phnum)++;
#endif

	/* Prepare PT_LOAD headers for system ram chunks. */
	ced->ehdr = ehdr;
	ced->bufp = bufp;
	ret = walk_system_ram_res(0, -1, ced,
			prepare_elf64_ram_headers_callback);
	if (ret < 0)
		return ret;

	*addr = buf;
	*sz = elf_sz;
	return 0;
}

/* Prepare elf headers. Return addr and size */
static int prepare_elf_headers(struct kimage *image, void **addr,
					unsigned long *sz)
{
	struct crash_elf_data *ced;
	int ret;

	ced = kzalloc(sizeof(*ced), GFP_KERNEL);
	if (!ced)
		return -ENOMEM;

	fill_up_crash_elf_data(ced, image);

	/* By default prepare 64bit headers */
	ret = prepare_elf64_headers(ced, addr, sz);
	kfree(ced);
	return ret;
}

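/* Append an entry to the e820 map that is passed to the crash kernel */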
static int add_e820_entry(struct boot_params *params, struct e820entry *entry)
{
	unsigned int nr_e820_entries;

	nr_e820_entries = params->e820_entries;
	if (nr_e820_entries >= E820MAX)
		return 1;

	memcpy(&params->e820_map[nr_e820_entries], entry,
			sizeof(struct e820entry));
	params->e820_entries++;
	return 0;
}

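/* Callback for walk_iomem_res(): add the range to the memmap as cmd->type */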
static int memmap_entry_callback(u64 start, u64 end, void *arg)
{
	struct crash_memmap_data *cmd = arg;
	struct boot_params *params = cmd->params;
	struct e820entry ei;

	ei.addr = start;
	ei.size = end - start + 1;
	ei.type = cmd->type;
	add_e820_entry(params, &ei);

	return 0;
}

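/*
 * Carve the backup region and the ELF header region out of [mstart, mend]
 * so that only the remaining pieces of the crashkernel range are added to
 * the memory map.
 */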
static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem,
				 unsigned long long mstart,
				 unsigned long long mend)
{
	unsigned long start, end;
	int ret = 0;

	cmem->ranges[0].start = mstart;
	cmem->ranges[0].end = mend;
	cmem->nr_ranges = 1;

	/* Exclude Backup region */
	start = image->arch.backup_load_addr;
	end = start + image->arch.backup_src_sz - 1;
	ret = exclude_mem_range(cmem, start, end);
	if (ret)
		return ret;

	/* Exclude elf header region */
	start = image->arch.elf_load_addr;
	end = start + image->arch.elf_headers_sz - 1;
	return exclude_mem_range(cmem, start, end);
}

/* Prepare memory map for crash dump kernel */
int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params)
{
	int i, ret = 0;
	unsigned long flags;
	struct e820entry ei;
	struct crash_memmap_data cmd;
	struct crash_mem *cmem;

	cmem = vzalloc(sizeof(struct crash_mem));
	if (!cmem)
		return -ENOMEM;

	memset(&cmd, 0, sizeof(struct crash_memmap_data));
	cmd.params = params;

	/* Add first 640K segment */
	ei.addr = image->arch.backup_src_start;
	ei.size = image->arch.backup_src_sz;
	ei.type = E820_RAM;
	add_e820_entry(params, &ei);

	/* Add ACPI tables */
	cmd.type = E820_ACPI;
	flags = IORESOURCE_MEM | IORESOURCE_BUSY;
	walk_iomem_res("ACPI Tables", flags, 0, -1, &cmd,
		       memmap_entry_callback);

	/* Add ACPI Non-volatile Storage */
	cmd.type = E820_NVS;
	walk_iomem_res("ACPI Non-volatile Storage", flags, 0, -1, &cmd,
			memmap_entry_callback);

	/* Add crashk_low_res region */
	if (crashk_low_res.end) {
		ei.addr = crashk_low_res.start;
		ei.size = crashk_low_res.end - crashk_low_res.start + 1;
		ei.type = E820_RAM;
		add_e820_entry(params, &ei);
	}

	/* Exclude some ranges from crashk_res and add the rest to memmap */
	ret = memmap_exclude_ranges(image, cmem, crashk_res.start,
						crashk_res.end);
	if (ret)
		goto out;

	for (i = 0; i < cmem->nr_ranges; i++) {
		ei.size = cmem->ranges[i].end - cmem->ranges[i].start + 1;

		/* If entry is less than a page, skip it */
		if (ei.size < PAGE_SIZE)
			continue;
		ei.addr = cmem->ranges[i].start;
		ei.type = E820_RAM;
		add_e820_entry(params, &ei);
	}

out:
	vfree(cmem);
	return ret;
}

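/*
 * Callback for walk_system_ram_res(): record the first RAM range found as
 * the backup source and stop walking.
 */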
static int determine_backup_region(u64 start, u64 end, void *arg)
{
	struct kimage *image = arg;

	image->arch.backup_src_start = start;
	image->arch.backup_src_sz = end - start + 1;

	/* Expecting only one range for backup region */
	return 1;
}

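/*
 * Load the crash dump specific segments: the backup region and the ELF
 * core headers.
 */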
int crash_load_segments(struct kimage *image)
{
	unsigned long src_start, src_sz, elf_sz;
	void *elf_addr;
	int ret;

	/*
	 * Determine and load a segment for the backup area. The first 640K
	 * of RAM is the backup source.
	 */

	ret = walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END,
				image, determine_backup_region);

	/* Zero or positive return values are ok */
	if (ret < 0)
		return ret;

	src_start = image->arch.backup_src_start;
	src_sz = image->arch.backup_src_sz;

	/* Add backup segment. */
	if (src_sz) {
		/*
		 * Ideally there is no source for the backup segment; it is
		 * copied in purgatory after the crash. Just add a zero-filled
		 * segment for now so that the checksum logic works fine.
		 */
		ret = kexec_add_buffer(image, (char *)&crash_zero_bytes,
				       sizeof(crash_zero_bytes), src_sz,
				       PAGE_SIZE, 0, -1, 0,
				       &image->arch.backup_load_addr);
		if (ret)
			return ret;
		pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n",
			 image->arch.backup_load_addr, src_start, src_sz);
	}

	/* Prepare elf headers and add a segment */
	ret = prepare_elf_headers(image, &elf_addr, &elf_sz);
	if (ret)
		return ret;

	image->arch.elf_headers = elf_addr;
	image->arch.elf_headers_sz = elf_sz;

	ret = kexec_add_buffer(image, (char *)elf_addr, elf_sz, elf_sz,
			ELF_CORE_HEADER_ALIGN, 0, -1, 0,
			&image->arch.elf_load_addr);
	if (ret) {
		vfree((void *)image->arch.elf_headers);
		return ret;
	}
	pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n",
		 image->arch.elf_load_addr, elf_sz, elf_sz);

	return ret;
}
#endif /* CONFIG_KEXEC_FILE */