1/*
2 * page-types: Tool for querying page flags
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of the GNU General Public License as published by the Free
6 * Software Foundation; version 2.
7 *
8 * This program is distributed in the hope that it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11 * more details.
12 *
13 * You should find a copy of v2 of the GNU General Public License somewhere on
14 * your Linux system; if not, write to the Free Software Foundation, Inc., 59
15 * Temple Place, Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) 2009 Intel corporation
18 *
19 * Authors: Wu Fengguang <fengguang.wu@intel.com>
20 */
21
22#define _FILE_OFFSET_BITS 64
23#define _GNU_SOURCE
24#include <stdio.h>
25#include <stdlib.h>
26#include <unistd.h>
27#include <stdint.h>
28#include <stdarg.h>
29#include <string.h>
30#include <getopt.h>
31#include <limits.h>
32#include <assert.h>
33#include <ftw.h>
34#include <time.h>
35#include <setjmp.h>
36#include <signal.h>
37#include <sys/types.h>
38#include <sys/errno.h>
39#include <sys/fcntl.h>
40#include <sys/mount.h>
41#include <sys/statfs.h>
42#include <sys/mman.h>
43#include "../../include/uapi/linux/magic.h"
44#include "../../include/uapi/linux/kernel-page-flags.h"
45#include <api/fs/fs.h>
46
/* Fallback size for path buffers when no other header has defined it. */
#ifndef MAX_PATH
# define MAX_PATH 256
#endif

/*
 * STR(x) turns x into a string literal after macro expansion;
 * _STR(x) is the raw stringification step it relies on.
 */
#ifndef STR
# define _STR(x) #x
# define STR(x) _STR(x)
#endif
55
56/*
57 * pagemap kernel ABI bits
58 */
59
/* Size in bytes of one pagemap entry (entries are read as uint64_t). */
#define PM_ENTRY_BYTES		8
/* The low PM_PFRAME_BITS bits of an entry carry the page frame number. */
#define PM_PFRAME_BITS		55
#define PM_PFRAME_MASK		((1LL << PM_PFRAME_BITS) - 1)
/* Extract the page-frame field from a raw pagemap entry. */
#define PM_PFRAME(x)		((x) & PM_PFRAME_MASK)
/* Per-entry status bits located above the page-frame field. */
#define PM_SOFT_DIRTY		(1ULL << 55)
#define PM_MMAP_EXCLUSIVE	(1ULL << 56)
#define PM_FILE			(1ULL << 61)
#define PM_SWAP			(1ULL << 62)
#define PM_PRESENT		(1ULL << 63)
69
70/*
71 * kernel page flags
72 */
73
/* presumably the size in bytes of one kpageflags entry -- TODO confirm */
#define KPF_BYTES		8
/* File from which per-page-frame flag words are read. */
#define PROC_KPAGEFLAGS		"/proc/kpageflags"

/* [32-] kernel hacking assistances */
#define KPF_RESERVED		32
#define KPF_MLOCKED		33
#define KPF_MAPPEDTODISK	34
#define KPF_PRIVATE		35
#define KPF_PRIVATE_2		36
#define KPF_OWNER_PRIVATE	37
#define KPF_ARCH		38
#define KPF_UNCACHED		39
#define KPF_SOFTDIRTY		40

/* [48-] take some arbitrary free slots for expanding overloaded flags
 * not part of kernel API
 */
#define KPF_READAHEAD		48
#define KPF_SLOB_FREE		49
#define KPF_SLUB_FROZEN		50
#define KPF_SLUB_DEBUG		51
/* The next two are set by this tool from pagemap entry bits (PM_FILE,
 * PM_MMAP_EXCLUSIVE) in expand_overloaded_flags().
 */
#define KPF_FILE		62
#define KPF_MMAP_EXCLUSIVE	63

/* All 64 flag bits; also used as a sentinel in the bit filters. */
#define KPF_ALL_BITS		((uint64_t)~0ULL)
/* Bits 32..47: cleared by well_known_flags() when not in raw mode. */
#define KPF_HACKERS_BITS	(0xffffULL << 32)
/* Bits 48..63: slots used for the expanded (overloaded) flags above. */
#define KPF_OVERLOADED_BITS	(0xffffULL << 48)
/* Single-bit mask for flag KPF_<name>. */
#define BIT(name)		(1ULL << KPF_##name)
/* Mask covering both compound-page flags. */
#define BITS_COMPOUND		(BIT(COMPOUND_HEAD) | BIT(COMPOUND_TAIL))
103
/*
 * Flag name table, indexed by flag bit number.
 *
 * Each entry has the form "<c>:<long name>".  page_flag_name() emits the
 * single character <c>; page_flag_longname(), usage() and parse_flag_name()
 * use the text after the colon (entry + 2).  Bits without an entry are NULL.
 * Note that some expanded flags reuse the letter of the flag they overload
 * (e.g. 'I' for both reclaim and readahead).
 */
static const char * const page_flag_names[] = {
	[KPF_LOCKED]		= "L:locked",
	[KPF_ERROR]		= "E:error",
	[KPF_REFERENCED]	= "R:referenced",
	[KPF_UPTODATE]		= "U:uptodate",
	[KPF_DIRTY]		= "D:dirty",
	[KPF_LRU]		= "l:lru",
	[KPF_ACTIVE]		= "A:active",
	[KPF_SLAB]		= "S:slab",
	[KPF_WRITEBACK]		= "W:writeback",
	[KPF_RECLAIM]		= "I:reclaim",
	[KPF_BUDDY]		= "B:buddy",

	[KPF_MMAP]		= "M:mmap",
	[KPF_ANON]		= "a:anonymous",
	[KPF_SWAPCACHE]		= "s:swapcache",
	[KPF_SWAPBACKED]	= "b:swapbacked",
	[KPF_COMPOUND_HEAD]	= "H:compound_head",
	[KPF_COMPOUND_TAIL]	= "T:compound_tail",
	[KPF_HUGE]		= "G:huge",
	[KPF_UNEVICTABLE]	= "u:unevictable",
	[KPF_HWPOISON]		= "X:hwpoison",
	[KPF_NOPAGE]		= "n:nopage",
	[KPF_KSM]		= "x:ksm",
	[KPF_THP]		= "t:thp",
	[KPF_BALLOON]		= "o:balloon",
	[KPF_ZERO_PAGE]		= "z:zero_page",
	[KPF_IDLE]              = "i:idle_page",

	/* raw-mode bits (KPF_HACKERS_BITS range) */
	[KPF_RESERVED]		= "r:reserved",
	[KPF_MLOCKED]		= "m:mlocked",
	[KPF_MAPPEDTODISK]	= "d:mappedtodisk",
	[KPF_PRIVATE]		= "P:private",
	[KPF_PRIVATE_2]		= "p:private_2",
	[KPF_OWNER_PRIVATE]	= "O:owner_private",
	[KPF_ARCH]		= "h:arch",
	[KPF_UNCACHED]		= "c:uncached",
	[KPF_SOFTDIRTY]		= "f:softdirty",

	/* expanded overloaded flags (KPF_OVERLOADED_BITS range) */
	[KPF_READAHEAD]		= "I:readahead",
	[KPF_SLOB_FREE]		= "P:slob_free",
	[KPF_SLUB_FROZEN]	= "A:slub_frozen",
	[KPF_SLUB_DEBUG]	= "E:slub_debug",

	[KPF_FILE]		= "F:file",
	[KPF_MMAP_EXCLUSIVE]	= "1:mmap_exclusive",
};
151
152
/*
 * Candidate debugfs mount points, terminated by a null entry.
 * NOTE(review): nothing in the visible part of this file references this
 * table (prepare_hwpoison_fd() calls debugfs__mount() instead) -- confirm
 * whether it is still needed.
 */
static const char * const debugfs_known_mountpoints[] = {
	"/sys/kernel/debug",
	"/debug",
	0,
};
158
159/*
160 * data structures
161 */
162
/* Command-line options, filled in by main() and the parse_*() helpers. */
static int		opt_raw;	/* for kernel developers */
static int		opt_list;	/* list pages (in ranges) */
static int		opt_no_summary;	/* don't show summary */
static pid_t		opt_pid;	/* process to walk */
const char *		opt_file;	/* file or directory to walk (-f) */

/* Page ranges requested with -a: parallel arrays of start offset and size. */
#define MAX_ADDR_RANGES	1024
static int		nr_addr_ranges;
static unsigned long	opt_offset[MAX_ADDR_RANGES];
static unsigned long	opt_size[MAX_ADDR_RANGES];

/* VMAs of the target process, in page units: [pg_start[i], pg_end[i]). */
#define MAX_VMAS	10240
static int		nr_vmas;
static unsigned long	pg_start[MAX_VMAS];
static unsigned long	pg_end[MAX_VMAS];

/* Flag filters requested with -b; all of them must accept a page. */
#define MAX_BIT_FILTERS	64
static int		nr_bit_filters;
static uint64_t		opt_mask[MAX_BIT_FILTERS];
static uint64_t		opt_bits[MAX_BIT_FILTERS];

/* System page size, set from getpagesize() in main(). */
static int		page_size;

/* Open descriptors for the pagemap file and /proc/kpageflags. */
static int		pagemap_fd;
static int		kpageflags_fd;

/* Set by -X / -x: poison or unpoison every page that passes the filters. */
static int		opt_hwpoison;
static int		opt_unpoison;

/* debugfs mount point and the hwpoison control files opened beneath it. */
static const char	*hwpoison_debug_fs;
static int		hwpoison_inject_fd;
static int		hwpoison_forget_fd;

/* Hash table geometry for the per-flag-combination page counters. */
#define HASH_SHIFT	13
#define HASH_SIZE	(1 << HASH_SHIFT)
#define HASH_MASK	(HASH_SIZE - 1)
#define HASH_KEY(flags)	(flags & HASH_MASK)

/* Statistics: page_flags[k] is the flag word counted in nr_pages[k]. */
static unsigned long	total_pages;
static unsigned long	nr_pages[HASH_SIZE];
static uint64_t		page_flags[HASH_SIZE];
204
205
206/*
207 * helper functions
208 */
209
/* Element count of a true array (do not use on a pointer). */
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

/*
 * Minimum of x and y after converting both to 'type'.
 * Each argument is evaluated exactly once.
 */
#define min_t(type, x, y) ({				\
	type min_lhs_ = (x);				\
	type min_rhs_ = (y);				\
	min_lhs_ < min_rhs_ ? min_lhs_ : min_rhs_;	\
})

/*
 * Maximum of x and y after converting both to 'type'.
 * Each argument is evaluated exactly once.
 */
#define max_t(type, x, y) ({				\
	type max_lhs_ = (x);				\
	type max_rhs_ = (y);				\
	max_lhs_ > max_rhs_ ? max_lhs_ : max_rhs_;	\
})
221
222static unsigned long pages2mb(unsigned long pages)
223{
224	return (pages * page_size) >> 20;
225}
226
/*
 * Report an unrecoverable error: format the printf-style message 'x' to
 * stderr, then terminate the process with EXIT_FAILURE.  Does not return.
 */
static void fatal(const char *x, ...)
{
	va_list args;

	va_start(args, x);
	vfprintf(stderr, x, args);
	va_end(args);

	exit(EXIT_FAILURE);
}
236
/*
 * open(2) wrapper that never returns a failure: on error the reason is
 * reported with perror(pathname) and the process exits with EXIT_FAILURE.
 *
 * Returns the newly opened file descriptor.
 */
static int checked_open(const char *pathname, int flags)
{
	const int fd = open(pathname, flags);

	if (fd >= 0)
		return fd;

	perror(pathname);
	exit(EXIT_FAILURE);
}
248
249/*
250 * pagemap/kpageflags routines
251 */
252
/*
 * Read up to 'count' 64-bit entries starting at entry 'index' from 'fd'.
 *
 * @fd:    descriptor of a file made of 8-byte entries
 * @name:  label passed to perror() if the read fails
 * @buf:   destination for the entries
 * @index: first entry to read (byte offset is index * 8)
 * @count: maximum number of entries to read
 *
 * Returns the number of complete entries read, which may be smaller than
 * 'count' (0 at end of file).  Exits the process on a read error, on an
 * index whose byte offset would overflow, or on a read that is not a
 * multiple of 8 bytes.
 */
static unsigned long do_u64_read(int fd, char *name,
				 uint64_t *buf,
				 unsigned long index,
				 unsigned long count)
{
	long bytes;

	if (index > ULONG_MAX / 8)
		fatal("index overflow: %lu\n", index);

	bytes = pread(fd, buf, count * 8, (off_t)index * 8);
	if (bytes < 0) {
		perror(name);
		exit(EXIT_FAILURE);
	}
	/* 'bytes' is a signed long, so it must be printed with %ld */
	if (bytes % 8)
		fatal("partial read: %ld bytes\n", bytes);

	return bytes / 8;
}
273
/*
 * Read the flag words of 'pages' page frames starting at frame 'index'
 * from kpageflags_fd into 'buf'.  Returns the number of entries read;
 * error handling is that of do_u64_read().
 */
static unsigned long kpageflags_read(uint64_t *buf,
				     unsigned long index,
				     unsigned long pages)
{
	return do_u64_read(kpageflags_fd, PROC_KPAGEFLAGS, buf, index, pages);
}
280
/*
 * Read 'pages' pagemap entries starting at virtual page 'index' from
 * pagemap_fd into 'buf'.  Returns the number of entries read; error
 * handling is that of do_u64_read().  The name string is only used as the
 * perror() label.
 */
static unsigned long pagemap_read(uint64_t *buf,
				  unsigned long index,
				  unsigned long pages)
{
	return do_u64_read(pagemap_fd, "/proc/pid/pagemap", buf, index, pages);
}
287
288static unsigned long pagemap_pfn(uint64_t val)
289{
290	unsigned long pfn;
291
292	if (val & PM_PRESENT)
293		pfn = PM_PFRAME(val);
294	else
295		pfn = 0;
296
297	return pfn;
298}
299
300
301/*
302 * page flag names
303 */
304
305static char *page_flag_name(uint64_t flags)
306{
307	static char buf[65];
308	int present;
309	size_t i, j;
310
311	for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) {
312		present = (flags >> i) & 1;
313		if (!page_flag_names[i]) {
314			if (present)
315				fatal("unknown flag bit %d\n", i);
316			continue;
317		}
318		buf[j++] = present ? page_flag_names[i][0] : '_';
319	}
320
321	return buf;
322}
323
324static char *page_flag_longname(uint64_t flags)
325{
326	static char buf[1024];
327	size_t i, n;
328
329	for (i = 0, n = 0; i < ARRAY_SIZE(page_flag_names); i++) {
330		if (!page_flag_names[i])
331			continue;
332		if ((flags >> i) & 1)
333			n += snprintf(buf + n, sizeof(buf) - n, "%s,",
334					page_flag_names[i] + 2);
335	}
336	if (n)
337		n--;
338	buf[n] = '\0';
339
340	return buf;
341}
342
343
344/*
345 * page list and summary
346 */
347
348static void show_page_range(unsigned long voffset, unsigned long offset,
349			    unsigned long size, uint64_t flags)
350{
351	static uint64_t      flags0;
352	static unsigned long voff;
353	static unsigned long index;
354	static unsigned long count;
355
356	if (flags == flags0 && offset == index + count &&
357	    size && voffset == voff + count) {
358		count += size;
359		return;
360	}
361
362	if (count) {
363		if (opt_pid)
364			printf("%lx\t", voff);
365		if (opt_file)
366			printf("%lu\t", voff);
367		printf("%lx\t%lx\t%s\n",
368				index, count, page_flag_name(flags0));
369	}
370
371	flags0 = flags;
372	index  = offset;
373	voff   = voffset;
374	count  = size;
375}
376
/*
 * Print the run currently pending in show_page_range(), if any, by
 * submitting an empty (size 0) range.
 */
static void flush_page_range(void)
{
	show_page_range(0, 0, 0, 0);
}
381
/*
 * Print one line for a single page (used by the -L listing): the virtual
 * offset in hex when walking a process, or in decimal when walking a file,
 * followed by the page offset in hex and the compact flag string.
 */
static void show_page(unsigned long voffset,
		      unsigned long offset, uint64_t flags)
{
	if (opt_pid)
		printf("%lx\t", voffset);
	if (opt_file)
		printf("%lu\t", voffset);
	printf("%lx\t%s\n", offset, page_flag_name(flags));
}
391
392static void show_summary(void)
393{
394	size_t i;
395
396	printf("             flags\tpage-count       MB"
397		"  symbolic-flags\t\t\tlong-symbolic-flags\n");
398
399	for (i = 0; i < ARRAY_SIZE(nr_pages); i++) {
400		if (nr_pages[i])
401			printf("0x%016llx\t%10lu %8lu  %s\t%s\n",
402				(unsigned long long)page_flags[i],
403				nr_pages[i],
404				pages2mb(nr_pages[i]),
405				page_flag_name(page_flags[i]),
406				page_flag_longname(page_flags[i]));
407	}
408
409	printf("             total\t%10lu %8lu\n",
410			total_pages, pages2mb(total_pages));
411}
412
413
414/*
415 * page flag filters
416 */
417
418static int bit_mask_ok(uint64_t flags)
419{
420	int i;
421
422	for (i = 0; i < nr_bit_filters; i++) {
423		if (opt_bits[i] == KPF_ALL_BITS) {
424			if ((flags & opt_mask[i]) == 0)
425				return 0;
426		} else {
427			if ((flags & opt_mask[i]) != opt_bits[i])
428				return 0;
429		}
430	}
431
432	return 1;
433}
434
435static uint64_t expand_overloaded_flags(uint64_t flags, uint64_t pme)
436{
437	/* SLOB/SLUB overload several page flags */
438	if (flags & BIT(SLAB)) {
439		if (flags & BIT(PRIVATE))
440			flags ^= BIT(PRIVATE) | BIT(SLOB_FREE);
441		if (flags & BIT(ACTIVE))
442			flags ^= BIT(ACTIVE) | BIT(SLUB_FROZEN);
443		if (flags & BIT(ERROR))
444			flags ^= BIT(ERROR) | BIT(SLUB_DEBUG);
445	}
446
447	/* PG_reclaim is overloaded as PG_readahead in the read path */
448	if ((flags & (BIT(RECLAIM) | BIT(WRITEBACK))) == BIT(RECLAIM))
449		flags ^= BIT(RECLAIM) | BIT(READAHEAD);
450
451	if (pme & PM_SOFT_DIRTY)
452		flags |= BIT(SOFTDIRTY);
453	if (pme & PM_FILE)
454		flags |= BIT(FILE);
455	if (pme & PM_MMAP_EXCLUSIVE)
456		flags |= BIT(MMAP_EXCLUSIVE);
457
458	return flags;
459}
460
461static uint64_t well_known_flags(uint64_t flags)
462{
463	/* hide flags intended only for kernel hacker */
464	flags &= ~KPF_HACKERS_BITS;
465
466	/* hide non-hugeTLB compound pages */
467	if ((flags & BITS_COMPOUND) && !(flags & BIT(HUGE)))
468		flags &= ~BITS_COMPOUND;
469
470	return flags;
471}
472
473static uint64_t kpageflags_flags(uint64_t flags, uint64_t pme)
474{
475	if (opt_raw)
476		flags = expand_overloaded_flags(flags, pme);
477	else
478		flags = well_known_flags(flags);
479
480	return flags;
481}
482
483/*
484 * page actions
485 */
486
487static void prepare_hwpoison_fd(void)
488{
489	char buf[MAX_PATH + 1];
490
491	hwpoison_debug_fs = debugfs__mount();
492	if (!hwpoison_debug_fs) {
493		perror("mount debugfs");
494		exit(EXIT_FAILURE);
495	}
496
497	if (opt_hwpoison && !hwpoison_inject_fd) {
498		snprintf(buf, MAX_PATH, "%s/hwpoison/corrupt-pfn",
499			hwpoison_debug_fs);
500		hwpoison_inject_fd = checked_open(buf, O_WRONLY);
501	}
502
503	if (opt_unpoison && !hwpoison_forget_fd) {
504		snprintf(buf, MAX_PATH, "%s/hwpoison/unpoison-pfn",
505			hwpoison_debug_fs);
506		hwpoison_forget_fd = checked_open(buf, O_WRONLY);
507	}
508}
509
510static int hwpoison_page(unsigned long offset)
511{
512	char buf[100];
513	int len;
514
515	len = sprintf(buf, "0x%lx\n", offset);
516	len = write(hwpoison_inject_fd, buf, len);
517	if (len < 0) {
518		perror("hwpoison inject");
519		return len;
520	}
521	return 0;
522}
523
524static int unpoison_page(unsigned long offset)
525{
526	char buf[100];
527	int len;
528
529	len = sprintf(buf, "0x%lx\n", offset);
530	len = write(hwpoison_forget_fd, buf, len);
531	if (len < 0) {
532		perror("hwpoison forget");
533		return len;
534	}
535	return 0;
536}
537
538/*
539 * page frame walker
540 */
541
/*
 * Find (or claim) the slot of page_flags[]/nr_pages[] used to count pages
 * whose flag word is 'flags'.
 *
 * Open addressing with linear probing: the probe starts at HASH_KEY(flags)
 * and wraps around within slots 1..HASH_SIZE-1.  The first empty slot met
 * is claimed for 'flags'.  Exits the process when no slot is available.
 */
static size_t hash_slot(uint64_t flags)
{
	size_t k = HASH_KEY(flags);
	size_t i;

	/* Explicitly reserve slot 0 for flags 0: the following logic
	 * cannot distinguish an unoccupied slot from slot (flags==0).
	 */
	if (flags == 0)
		return 0;

	/* search through the remaining (HASH_SIZE-1) slots */
	for (i = 1; i < ARRAY_SIZE(page_flags); i++, k++) {
		/* skip the reserved slot 0 and wrap at the end of the table */
		if (!k || k >= ARRAY_SIZE(page_flags))
			k = 1;
		if (page_flags[k] == 0) {
			page_flags[k] = flags;
			return k;
		}
		if (page_flags[k] == flags)
			return k;
	}

	fatal("hash table full: bump up HASH_SHIFT?\n");
	/* not reached: fatal() exits; keeps the non-void return path closed */
	exit(EXIT_FAILURE);
}
568
569static void add_page(unsigned long voffset,
570		     unsigned long offset, uint64_t flags, uint64_t pme)
571{
572	flags = kpageflags_flags(flags, pme);
573
574	if (!bit_mask_ok(flags))
575		return;
576
577	if (opt_hwpoison)
578		hwpoison_page(offset);
579	if (opt_unpoison)
580		unpoison_page(offset);
581
582	if (opt_list == 1)
583		show_page_range(voffset, offset, 1, flags);
584	else if (opt_list == 2)
585		show_page(voffset, offset, flags);
586
587	nr_pages[hash_slot(flags)]++;
588	total_pages++;
589}
590
591#define KPAGEFLAGS_BATCH	(64 << 10)	/* 64k pages */
592static void walk_pfn(unsigned long voffset,
593		     unsigned long index,
594		     unsigned long count,
595		     uint64_t pme)
596{
597	uint64_t buf[KPAGEFLAGS_BATCH];
598	unsigned long batch;
599	unsigned long pages;
600	unsigned long i;
601
602	while (count) {
603		batch = min_t(unsigned long, count, KPAGEFLAGS_BATCH);
604		pages = kpageflags_read(buf, index, batch);
605		if (pages == 0)
606			break;
607
608		for (i = 0; i < pages; i++)
609			add_page(voffset + i, index + i, buf[i], pme);
610
611		index += pages;
612		count -= pages;
613	}
614}
615
616#define PAGEMAP_BATCH	(64 << 10)
617static void walk_vma(unsigned long index, unsigned long count)
618{
619	uint64_t buf[PAGEMAP_BATCH];
620	unsigned long batch;
621	unsigned long pages;
622	unsigned long pfn;
623	unsigned long i;
624
625	while (count) {
626		batch = min_t(unsigned long, count, PAGEMAP_BATCH);
627		pages = pagemap_read(buf, index, batch);
628		if (pages == 0)
629			break;
630
631		for (i = 0; i < pages; i++) {
632			pfn = pagemap_pfn(buf[i]);
633			if (pfn)
634				walk_pfn(index + i, pfn, 1, buf[i]);
635		}
636
637		index += pages;
638		count -= pages;
639	}
640}
641
/*
 * Walk the part of the virtual page range [index, index + count) that is
 * covered by the VMAs recorded in pg_start[]/pg_end[].
 *
 * The VMA cursor 'i' only moves forward, so the table is presumably
 * expected to be sorted by address -- TODO confirm against parse_pid().
 */
static void walk_task(unsigned long index, unsigned long count)
{
	const unsigned long end = index + count;
	unsigned long start;
	int i = 0;

	while (index < end) {

		/* skip VMAs that end at or before the current position */
		while (pg_end[i] <= index)
			if (++i >= nr_vmas)
				return;
		/* the next VMA starts beyond the requested range */
		if (pg_start[i] >= end)
			return;

		/* clip the VMA to the requested range */
		start = max_t(unsigned long, pg_start[i], index);
		index = min_t(unsigned long, pg_end[i], end);

		assert(start < index);
		walk_vma(start, index - start);
	}
}
663
664static void add_addr_range(unsigned long offset, unsigned long size)
665{
666	if (nr_addr_ranges >= MAX_ADDR_RANGES)
667		fatal("too many addr ranges\n");
668
669	opt_offset[nr_addr_ranges] = offset;
670	opt_size[nr_addr_ranges] = min_t(unsigned long, size, ULONG_MAX-offset);
671	nr_addr_ranges++;
672}
673
674static void walk_addr_ranges(void)
675{
676	int i;
677
678	kpageflags_fd = checked_open(PROC_KPAGEFLAGS, O_RDONLY);
679
680	if (!nr_addr_ranges)
681		add_addr_range(0, ULONG_MAX);
682
683	for (i = 0; i < nr_addr_ranges; i++)
684		if (!opt_pid)
685			walk_pfn(opt_offset[i], opt_offset[i], opt_size[i], 0);
686		else
687			walk_task(opt_offset[i], opt_size[i]);
688
689	close(kpageflags_fd);
690}
691
692
693/*
694 * user interface
695 */
696
697static const char *page_flag_type(uint64_t flag)
698{
699	if (flag & KPF_HACKERS_BITS)
700		return "(r)";
701	if (flag & KPF_OVERLOADED_BITS)
702		return "(o)";
703	return "   ";
704}
705
/*
 * Print the command-line help to stdout: the option summary, the syntax of
 * the flags, addr-spec and bits-spec arguments, and the list of known flag
 * names (four per line), each tagged by page_flag_type().
 */
static void usage(void)
{
	size_t i, j;

	printf(
"page-types [options]\n"
"            -r|--raw                   Raw mode, for kernel developers\n"
"            -d|--describe flags        Describe flags\n"
"            -a|--addr    addr-spec     Walk a range of pages\n"
"            -b|--bits    bits-spec     Walk pages with specified bits\n"
"            -p|--pid     pid           Walk process address space\n"
"            -f|--file    filename      Walk file address space\n"
"            -l|--list                  Show page details in ranges\n"
"            -L|--list-each             Show page details one by one\n"
"            -N|--no-summary            Don't show summary info\n"
"            -X|--hwpoison              hwpoison pages\n"
"            -x|--unpoison              unpoison pages\n"
"            -h|--help                  Show this usage message\n"
"flags:\n"
"            0x10                       bitfield format, e.g.\n"
"            anon                       bit-name, e.g.\n"
"            0x10,anon                  comma-separated list, e.g.\n"
"addr-spec:\n"
"            N                          one page at offset N (unit: pages)\n"
"            N+M                        pages range from N to N+M-1\n"
"            N,M                        pages range from N to M-1\n"
"            N,                         pages range from N to end\n"
"            ,M                         pages range from 0 to M-1\n"
"bits-spec:\n"
"            bit1,bit2                  (flags & (bit1|bit2)) != 0\n"
"            bit1,bit2=bit1             (flags & (bit1|bit2)) == bit1\n"
"            bit1,~bit2                 (flags & (bit1|bit2)) == bit1\n"
"            =bit1,bit2                 flags == (bit1|bit2)\n"
"bit-names:\n"
	);

	/* j counts the names printed on the current output line */
	for (i = 0, j = 0; i < ARRAY_SIZE(page_flag_names); i++) {
		if (!page_flag_names[i])
			continue;
		/* + 2 skips the "<c>:" prefix of the table entry */
		printf("%16s%s", page_flag_names[i] + 2,
				 page_flag_type(1ULL << i));
		if (++j > 3) {
			j = 0;
			putchar('\n');
		}
	}
	printf("\n                                   "
		"(r) raw mode bits  (o) overloaded bits\n");
}
755
/*
 * Parse the number at the start of 'str' (decimal, 0x-prefixed hex or
 * 0-prefixed octal).  Parsing stops at the first character that cannot be
 * part of the number, so callers may pass a string in which the number is
 * followed by a separator such as ',' or '='.
 *
 * The conversion is unsigned so that values with bit 63 set (for example
 * the mask of the highest flag bit) are returned intact instead of being
 * clamped to LLONG_MAX.
 *
 * Exits the process when 'str' does not begin with a number.
 */
static unsigned long long parse_number(const char *str)
{
	unsigned long long n;
	char *end;

	n = strtoull(str, &end, 0);

	/* no digits consumed: not a number at all */
	if (end == str)
		fatal("invalid name or number: %s\n", str);

	return n;
}
767
768static void parse_pid(const char *str)
769{
770	FILE *file;
771	char buf[5000];
772
773	opt_pid = parse_number(str);
774
775	sprintf(buf, "/proc/%d/pagemap", opt_pid);
776	pagemap_fd = checked_open(buf, O_RDONLY);
777
778	sprintf(buf, "/proc/%d/maps", opt_pid);
779	file = fopen(buf, "r");
780	if (!file) {
781		perror(buf);
782		exit(EXIT_FAILURE);
783	}
784
785	while (fgets(buf, sizeof(buf), file) != NULL) {
786		unsigned long vm_start;
787		unsigned long vm_end;
788		unsigned long long pgoff;
789		int major, minor;
790		char r, w, x, s;
791		unsigned long ino;
792		int n;
793
794		n = sscanf(buf, "%lx-%lx %c%c%c%c %llx %x:%x %lu",
795			   &vm_start,
796			   &vm_end,
797			   &r, &w, &x, &s,
798			   &pgoff,
799			   &major, &minor,
800			   &ino);
801		if (n < 10) {
802			fprintf(stderr, "unexpected line: %s\n", buf);
803			continue;
804		}
805		pg_start[nr_vmas] = vm_start / page_size;
806		pg_end[nr_vmas] = vm_end / page_size;
807		if (++nr_vmas >= MAX_VMAS) {
808			fprintf(stderr, "too many VMAs\n");
809			break;
810		}
811	}
812	fclose(file);
813}
814
815static void show_file(const char *name, const struct stat *st)
816{
817	unsigned long long size = st->st_size;
818	char atime[64], mtime[64];
819	long now = time(NULL);
820
821	printf("%s\tInode: %u\tSize: %llu (%llu pages)\n",
822			name, (unsigned)st->st_ino,
823			size, (size + page_size - 1) / page_size);
824
825	strftime(atime, sizeof(atime), "%c", localtime(&st->st_atime));
826	strftime(mtime, sizeof(mtime), "%c", localtime(&st->st_mtime));
827
828	printf("Modify: %s (%ld seconds ago)\nAccess: %s (%ld seconds ago)\n",
829			mtime, now - st->st_mtime,
830			atime, now - st->st_atime);
831}
832
/* Jump context armed with sigsetjmp() in walk_file(). */
static sigjmp_buf sigbus_jmp;

/* Fault address reported with the last SIGBUS, or NULL if none was given. */
static void * volatile sigbus_addr;

/*
 * SIGBUS handler: record the faulting address from 'info' (NULL when no
 * siginfo is available) and jump back to the sigsetjmp() point.
 */
static void sigbus_handler(int sig, siginfo_t *info, void *ucontex)
{
	(void)sig;
	(void)ucontex;
	sigbus_addr = info ? info->si_addr : NULL;
	siglongjmp(sigbus_jmp, 1);
}

/* Action installed for SIGBUS by walk_page_cache(). */
static struct sigaction sigbus_action = {
	.sa_sigaction = sigbus_handler,
	.sa_flags = SA_SIGINFO,
};
849
850static void walk_file(const char *name, const struct stat *st)
851{
852	uint8_t vec[PAGEMAP_BATCH];
853	uint64_t buf[PAGEMAP_BATCH], flags;
854	unsigned long nr_pages, pfn, i;
855	off_t off, end = st->st_size;
856	int fd;
857	ssize_t len;
858	void *ptr;
859	int first = 1;
860
861	fd = checked_open(name, O_RDONLY|O_NOATIME|O_NOFOLLOW);
862
863	for (off = 0; off < end; off += len) {
864		nr_pages = (end - off + page_size - 1) / page_size;
865		if (nr_pages > PAGEMAP_BATCH)
866			nr_pages = PAGEMAP_BATCH;
867		len = nr_pages * page_size;
868
869		ptr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, off);
870		if (ptr == MAP_FAILED)
871			fatal("mmap failed: %s", name);
872
873		/* determine cached pages */
874		if (mincore(ptr, len, vec))
875			fatal("mincore failed: %s", name);
876
877		/* turn off readahead */
878		if (madvise(ptr, len, MADV_RANDOM))
879			fatal("madvice failed: %s", name);
880
881		if (sigsetjmp(sigbus_jmp, 1)) {
882			end = off + sigbus_addr ? sigbus_addr - ptr : 0;
883			fprintf(stderr, "got sigbus at offset %lld: %s\n",
884					(long long)end, name);
885			goto got_sigbus;
886		}
887
888		/* populate ptes */
889		for (i = 0; i < nr_pages ; i++) {
890			if (vec[i] & 1)
891				(void)*(volatile int *)(ptr + i * page_size);
892		}
893got_sigbus:
894
895		/* turn off harvesting reference bits */
896		if (madvise(ptr, len, MADV_SEQUENTIAL))
897			fatal("madvice failed: %s", name);
898
899		if (pagemap_read(buf, (unsigned long)ptr / page_size,
900					nr_pages) != nr_pages)
901			fatal("cannot read pagemap");
902
903		munmap(ptr, len);
904
905		for (i = 0; i < nr_pages; i++) {
906			pfn = pagemap_pfn(buf[i]);
907			if (!pfn)
908				continue;
909			if (!kpageflags_read(&flags, pfn, 1))
910				continue;
911			if (first && opt_list) {
912				first = 0;
913				flush_page_range();
914				show_file(name, st);
915			}
916			add_page(off / page_size + i, pfn, flags, buf[i]);
917		}
918	}
919
920	close(fd);
921}
922
/*
 * nftw() callback: walk the page cache of every regular file visited and
 * report directories that cannot be read.  Always returns 0 so that the
 * tree walk continues.
 */
int walk_tree(const char *name, const struct stat *st, int type, struct FTW *f)
{
	(void)f;

	if (type == FTW_F) {
		if (S_ISREG(st->st_mode))
			walk_file(name, st);
	} else if (type == FTW_DNR) {
		fprintf(stderr, "cannot read dir: %s\n", name);
	}

	return 0;
}
937
938static void walk_page_cache(void)
939{
940	struct stat st;
941
942	kpageflags_fd = checked_open(PROC_KPAGEFLAGS, O_RDONLY);
943	pagemap_fd = checked_open("/proc/self/pagemap", O_RDONLY);
944	sigaction(SIGBUS, &sigbus_action, NULL);
945
946	if (stat(opt_file, &st))
947		fatal("stat failed: %s\n", opt_file);
948
949	if (S_ISREG(st.st_mode)) {
950		walk_file(opt_file, &st);
951	} else if (S_ISDIR(st.st_mode)) {
952		/* do not follow symlinks and mountpoints */
953		if (nftw(opt_file, walk_tree, 64, FTW_MOUNT | FTW_PHYS) < 0)
954			fatal("nftw failed: %s\n", opt_file);
955	} else
956		fatal("unhandled file type: %s\n", opt_file);
957
958	close(kpageflags_fd);
959	close(pagemap_fd);
960	signal(SIGBUS, SIG_DFL);
961}
962
/*
 * Handle the -f option: remember the path to walk.  The pointer itself is
 * stored, not a copy of the string.
 */
static void parse_file(const char *name)
{
	opt_file = name;
}
967
/*
 * Handle the -a option: parse an addr-spec and register the range.
 *
 *   N     one page at offset N
 *   N+M   M pages starting at N
 *   N,M   pages N .. M-1 (M must not be smaller than N)
 *   N,    from N to the end
 *   ,M    M pages starting at 0
 *
 * Exits the process on an unparsable number or an inverted N,M range.
 */
static void parse_addr_range(const char *optarg)
{
	unsigned long offset;
	unsigned long size;
	const char *sep;

	/* ',' takes precedence over '+' as the separator */
	sep = strchr(optarg, ',');
	if (sep == NULL)
		sep = strchr(optarg, '+');

	if (sep == NULL) {
		/* plain "N" */
		offset = parse_number(optarg);
		size   = 1;
	} else if (sep == optarg) {
		/* leading separator: start at 0 */
		offset = 0;
		size   = parse_number(sep + 1);
	} else {
		offset = parse_number(optarg);
		if (sep[1] == '\0') {
			/* "N," : open-ended */
			size = ULONG_MAX;
		} else {
			size = parse_number(sep + 1);
			if (*sep == ',') {
				/* "N,M" gives an end, convert to a length */
				if (size < offset)
					fatal("invalid range: %lu,%lu\n",
							offset, size);
				size -= offset;
			}
		}
	}

	add_addr_range(offset, size);
}
1001
1002static void add_bits_filter(uint64_t mask, uint64_t bits)
1003{
1004	if (nr_bit_filters >= MAX_BIT_FILTERS)
1005		fatal("too much bit filters\n");
1006
1007	opt_mask[nr_bit_filters] = mask;
1008	opt_bits[nr_bit_filters] = bits;
1009	nr_bit_filters++;
1010}
1011
1012static uint64_t parse_flag_name(const char *str, int len)
1013{
1014	size_t i;
1015
1016	if (!*str || !len)
1017		return 0;
1018
1019	if (len <= 8 && !strncmp(str, "compound", len))
1020		return BITS_COMPOUND;
1021
1022	for (i = 0; i < ARRAY_SIZE(page_flag_names); i++) {
1023		if (!page_flag_names[i])
1024			continue;
1025		if (!strncmp(str, page_flag_names[i] + 2, len))
1026			return 1ULL << i;
1027	}
1028
1029	return parse_number(str);
1030}
1031
/*
 * Parse a comma-separated list of flag tokens and OR their masks together.
 * Parsing stops at the first '=' or at the end of the string.
 *
 * @str: start of the list
 * @all: how to treat tokens prefixed with '~': when zero they are skipped;
 *       when non-zero the '~' is stripped and the token is included
 *       (provided something follows the '~')
 *
 * Returns the combined mask.
 */
static uint64_t parse_flag_names(const char *str, int all)
{
	const char *p    = str;
	uint64_t   flags = 0;

	while (1) {
		/* p has reached the end of the token that starts at str */
		if (*p == ',' || *p == '=' || *p == '\0') {
			/* note: ++str advances past the '~' as a side effect */
			if ((*str != '~') || (*str == '~' && all && *++str))
				flags |= parse_flag_name(str, p - str);
			if (*p != ',')
				break;
			str = p + 1;
		}
		p++;
	}

	return flags;
}
1050
1051static void parse_bits_mask(const char *optarg)
1052{
1053	uint64_t mask;
1054	uint64_t bits;
1055	const char *p;
1056
1057	p = strchr(optarg, '=');
1058	if (p == optarg) {
1059		mask = KPF_ALL_BITS;
1060		bits = parse_flag_names(p + 1, 0);
1061	} else if (p) {
1062		mask = parse_flag_names(optarg, 0);
1063		bits = parse_flag_names(p + 1, 0);
1064	} else if (strchr(optarg, '~')) {
1065		mask = parse_flag_names(optarg, 1);
1066		bits = parse_flag_names(optarg, 0);
1067	} else {
1068		mask = parse_flag_names(optarg, 0);
1069		bits = KPF_ALL_BITS;
1070	}
1071
1072	add_bits_filter(mask, bits);
1073}
1074
/*
 * Handle the -d option: parse the flag list in 'optarg' and print its
 * numeric value, compact flag string and long flag names on one line.
 */
static void describe_flags(const char *optarg)
{
	const uint64_t parsed = parse_flag_names(optarg, 0);
	const char *brief = page_flag_name(parsed);
	const char *verbose = page_flag_longname(parsed);

	printf("0x%016llx\t%s\t%s\n",
		(unsigned long long)parsed, brief, verbose);
}
1084
/*
 * Long options for getopt_long().  Each entry maps to the short option
 * character handled in main(); the second field is 1 for options that take
 * an argument.  The all-zero entry terminates the table.
 */
static const struct option opts[] = {
	{ "raw"       , 0, NULL, 'r' },
	{ "pid"       , 1, NULL, 'p' },
	{ "file"      , 1, NULL, 'f' },
	{ "addr"      , 1, NULL, 'a' },
	{ "bits"      , 1, NULL, 'b' },
	{ "describe"  , 1, NULL, 'd' },
	{ "list"      , 0, NULL, 'l' },
	{ "list-each" , 0, NULL, 'L' },
	{ "no-summary", 0, NULL, 'N' },
	{ "hwpoison"  , 0, NULL, 'X' },
	{ "unpoison"  , 0, NULL, 'x' },
	{ "help"      , 0, NULL, 'h' },
	{ NULL        , 0, NULL, 0 }
};
1100
/*
 * Entry point: parse the options, walk either the page cache of a
 * file/directory (-f) or page/address ranges, then print the optional
 * listing and the summary.
 *
 * Options are acted on in the order given; -d and -h exit immediately.
 * Returns 0 on success; exits with status 1 on an unknown option.
 */
int main(int argc, char *argv[])
{
	int c;

	/* needed by parse_pid() and the walkers, so set it before parsing */
	page_size = getpagesize();

	while ((c = getopt_long(argc, argv,
				"rp:f:a:b:d:lLNXxh", opts, NULL)) != -1) {
		switch (c) {
		case 'r':
			opt_raw = 1;
			break;
		case 'p':
			parse_pid(optarg);
			break;
		case 'f':
			parse_file(optarg);
			break;
		case 'a':
			parse_addr_range(optarg);
			break;
		case 'b':
			parse_bits_mask(optarg);
			break;
		case 'd':
			describe_flags(optarg);
			exit(0);
		case 'l':
			opt_list = 1;
			break;
		case 'L':
			opt_list = 2;
			break;
		case 'N':
			opt_no_summary = 1;
			break;
		case 'X':
			/* the flag must be set before the files are opened */
			opt_hwpoison = 1;
			prepare_hwpoison_fd();
			break;
		case 'x':
			opt_unpoison = 1;
			prepare_hwpoison_fd();
			break;
		case 'h':
			usage();
			exit(0);
		default:
			usage();
			exit(1);
		}
	}

	/* column headers for the listing modes */
	if (opt_list && opt_pid)
		printf("voffset\t");
	if (opt_list && opt_file)
		printf("foffset\t");
	if (opt_list == 1)
		printf("offset\tlen\tflags\n");
	if (opt_list == 2)
		printf("offset\tflags\n");

	if (opt_file)
		walk_page_cache();
	else
		walk_addr_ranges();

	/* print the last pending range of the -l listing */
	if (opt_list == 1)
		flush_page_range();

	if (opt_no_summary)
		return 0;

	/* separate the listing from the summary */
	if (opt_list)
		printf("\n\n");

	show_summary();

	return 0;
}
1181