1#include <linux/mm.h>
2#include <linux/vmacache.h>
3#include <linux/hugetlb.h>
4#include <linux/huge_mm.h>
5#include <linux/mount.h>
6#include <linux/seq_file.h>
7#include <linux/highmem.h>
8#include <linux/ptrace.h>
9#include <linux/slab.h>
10#include <linux/pagemap.h>
11#include <linux/mempolicy.h>
12#include <linux/rmap.h>
13#include <linux/swap.h>
14#include <linux/swapops.h>
15#include <linux/mmu_notifier.h>
16#include <linux/page_idle.h>
17
18#include <asm/elf.h>
19#include <asm/uaccess.h>
20#include <asm/tlbflush.h>
21#include "internal.h"
22
23void task_mem(struct seq_file *m, struct mm_struct *mm)
24{
25	unsigned long data, text, lib, swap, ptes, pmds;
26	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
27
28	/*
29	 * Note: to minimize their overhead, mm maintains hiwater_vm and
30	 * hiwater_rss only when about to *lower* total_vm or rss.  Any
31	 * collector of these hiwater stats must therefore get total_vm
32	 * and rss too, which will usually be the higher.  Barriers? Not
33	 * worth the effort; such snapshots can always be inconsistent.
34	 */
35	hiwater_vm = total_vm = mm->total_vm;
36	if (hiwater_vm < mm->hiwater_vm)
37		hiwater_vm = mm->hiwater_vm;
38	hiwater_rss = total_rss = get_mm_rss(mm);
39	if (hiwater_rss < mm->hiwater_rss)
40		hiwater_rss = mm->hiwater_rss;
41
42	data = mm->total_vm - mm->shared_vm - mm->stack_vm;
43	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
44	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
45	swap = get_mm_counter(mm, MM_SWAPENTS);
46	ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes);
47	pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
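	/*
	 * Convert everything to kB for output below: quantities counted in
	 * pages (vm/rss/data/stack/swap) are shifted by (PAGE_SHIFT - 10),
	 * text and lib were already computed in kB above, and the pte/pmd
	 * byte counts are simply shifted right by 10.
	 */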
48	seq_printf(m,
49		"VmPeak:\t%8lu kB\n"
50		"VmSize:\t%8lu kB\n"
51		"VmLck:\t%8lu kB\n"
52		"VmPin:\t%8lu kB\n"
53		"VmHWM:\t%8lu kB\n"
54		"VmRSS:\t%8lu kB\n"
55		"VmData:\t%8lu kB\n"
56		"VmStk:\t%8lu kB\n"
57		"VmExe:\t%8lu kB\n"
58		"VmLib:\t%8lu kB\n"
59		"VmPTE:\t%8lu kB\n"
60		"VmPMD:\t%8lu kB\n"
61		"VmSwap:\t%8lu kB\n",
62		hiwater_vm << (PAGE_SHIFT-10),
63		total_vm << (PAGE_SHIFT-10),
64		mm->locked_vm << (PAGE_SHIFT-10),
65		mm->pinned_vm << (PAGE_SHIFT-10),
66		hiwater_rss << (PAGE_SHIFT-10),
67		total_rss << (PAGE_SHIFT-10),
68		data << (PAGE_SHIFT-10),
69		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
70		ptes >> 10,
71		pmds >> 10,
72		swap << (PAGE_SHIFT-10));
73	hugetlb_report_usage(m, mm);
74}
75
76unsigned long task_vsize(struct mm_struct *mm)
77{
78	return PAGE_SIZE * mm->total_vm;
79}
80
81unsigned long task_statm(struct mm_struct *mm,
82			 unsigned long *shared, unsigned long *text,
83			 unsigned long *data, unsigned long *resident)
84{
85	*shared = get_mm_counter(mm, MM_FILEPAGES);
86	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
87								>> PAGE_SHIFT;
88	*data = mm->total_vm - mm->shared_vm;
89	*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
90	return mm->total_vm;
91}
92
93#ifdef CONFIG_NUMA
94/*
95 * Save get_task_policy() for show_numa_map().
96 */
97static void hold_task_mempolicy(struct proc_maps_private *priv)
98{
99	struct task_struct *task = priv->task;
100
101	task_lock(task);
102	priv->task_mempolicy = get_task_policy(task);
103	mpol_get(priv->task_mempolicy);
104	task_unlock(task);
105}
106static void release_task_mempolicy(struct proc_maps_private *priv)
107{
108	mpol_put(priv->task_mempolicy);
109}
110#else
111static void hold_task_mempolicy(struct proc_maps_private *priv)
112{
113}
114static void release_task_mempolicy(struct proc_maps_private *priv)
115{
116}
117#endif
118
119static void vma_stop(struct proc_maps_private *priv)
120{
121	struct mm_struct *mm = priv->mm;
122
123	release_task_mempolicy(priv);
124	up_read(&mm->mmap_sem);
125	mmput(mm);
126}
127
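/*
 * Return the vma following @vma.  The gate vma, cached in priv->tail_vma,
 * is reported once after the last vma on the mm's list; after that the
 * walk is finished and NULL is returned.
 */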
128static struct vm_area_struct *
129m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma)
130{
131	if (vma == priv->tail_vma)
132		return NULL;
133	return vma->vm_next ?: priv->tail_vma;
134}
135
136static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma)
137{
138	if (m->count < m->size)	/* vma is copied successfully */
139		m->version = m_next_vma(m->private, vma) ? vma->vm_start : -1UL;
140}
141
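/*
 * m->version, maintained by m_cache_vma() above, caches the start address
 * of the vma that was emitted last, so that the next read() can resume
 * with a single find_vma() instead of rescanning the list; -1UL means the
 * previous pass already reached the end of the map.
 */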
142static void *m_start(struct seq_file *m, loff_t *ppos)
143{
144	struct proc_maps_private *priv = m->private;
145	unsigned long last_addr = m->version;
146	struct mm_struct *mm;
147	struct vm_area_struct *vma;
148	unsigned int pos = *ppos;
149
150	/* See m_cache_vma(). Zero at the start or after lseek. */
151	if (last_addr == -1UL)
152		return NULL;
153
154	priv->task = get_proc_task(priv->inode);
155	if (!priv->task)
156		return ERR_PTR(-ESRCH);
157
158	mm = priv->mm;
159	if (!mm || !atomic_inc_not_zero(&mm->mm_users))
160		return NULL;
161
162	down_read(&mm->mmap_sem);
163	hold_task_mempolicy(priv);
164	priv->tail_vma = get_gate_vma(mm);
165
166	if (last_addr) {
167		vma = find_vma(mm, last_addr);
168		if (vma && (vma = m_next_vma(priv, vma)))
169			return vma;
170	}
171
172	m->version = 0;
173	if (pos < mm->map_count) {
174		for (vma = mm->mmap; pos; pos--) {
175			m->version = vma->vm_start;
176			vma = vma->vm_next;
177		}
178		return vma;
179	}
180
181	/* we do not bother to update m->version in this case */
182	if (pos == mm->map_count && priv->tail_vma)
183		return priv->tail_vma;
184
185	vma_stop(priv);
186	return NULL;
187}
188
189static void *m_next(struct seq_file *m, void *v, loff_t *pos)
190{
191	struct proc_maps_private *priv = m->private;
192	struct vm_area_struct *next;
193
194	(*pos)++;
195	next = m_next_vma(priv, v);
196	if (!next)
197		vma_stop(priv);
198	return next;
199}
200
201static void m_stop(struct seq_file *m, void *v)
202{
203	struct proc_maps_private *priv = m->private;
204
205	if (!IS_ERR_OR_NULL(v))
206		vma_stop(priv);
207	if (priv->task) {
208		put_task_struct(priv->task);
209		priv->task = NULL;
210	}
211}
212
213static int proc_maps_open(struct inode *inode, struct file *file,
214			const struct seq_operations *ops, int psize)
215{
216	struct proc_maps_private *priv = __seq_open_private(file, ops, psize);
217
218	if (!priv)
219		return -ENOMEM;
220
221	priv->inode = inode;
222	priv->mm = proc_mem_open(inode, PTRACE_MODE_READ);
223	if (IS_ERR(priv->mm)) {
224		int err = PTR_ERR(priv->mm);
225
226		seq_release_private(inode, file);
227		return err;
228	}
229
230	return 0;
231}
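/*
 * priv->mm as returned by proc_mem_open() is pinned only by mm_count
 * (released with mmdrop() in proc_map_release() below), so the address
 * space itself may already have been torn down; m_start() therefore
 * grabs mm_users with atomic_inc_not_zero() around each walk and
 * vma_stop() drops it again with mmput().
 */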
232
233static int proc_map_release(struct inode *inode, struct file *file)
234{
235	struct seq_file *seq = file->private_data;
236	struct proc_maps_private *priv = seq->private;
237
238	if (priv->mm)
239		mmdrop(priv->mm);
240
241	return seq_release_private(inode, file);
242}
243
244static int do_maps_open(struct inode *inode, struct file *file,
245			const struct seq_operations *ops)
246{
247	return proc_maps_open(inode, file, ops,
248				sizeof(struct proc_maps_private));
249}
250
251static pid_t pid_of_stack(struct proc_maps_private *priv,
252				struct vm_area_struct *vma, bool is_pid)
253{
254	struct inode *inode = priv->inode;
255	struct task_struct *task;
256	pid_t ret = 0;
257
258	rcu_read_lock();
259	task = pid_task(proc_pid(inode), PIDTYPE_PID);
260	if (task) {
261		task = task_of_stack(task, vma, is_pid);
262		if (task)
263			ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
264	}
265	rcu_read_unlock();
266
267	return ret;
268}
269
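/*
 * Emit one /proc/PID/maps line for @vma, e.g. (illustrative values):
 *
 *   00400000-0040b000 r-xp 00000000 08:01 1234    /bin/cat
 *
 * i.e. address range, permissions, file offset, device, inode and,
 * padded out to a fixed column, the backing file or a [heap]/[stack]
 * style tag.
 */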
270static void
271show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
272{
273	struct mm_struct *mm = vma->vm_mm;
274	struct file *file = vma->vm_file;
275	struct proc_maps_private *priv = m->private;
276	vm_flags_t flags = vma->vm_flags;
277	unsigned long ino = 0;
278	unsigned long long pgoff = 0;
279	unsigned long start, end;
280	dev_t dev = 0;
281	const char *name = NULL;
282
283	if (file) {
284		struct inode *inode = file_inode(vma->vm_file);
285		dev = inode->i_sb->s_dev;
286		ino = inode->i_ino;
287		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
288	}
289
290	/* We don't show the stack guard page in /proc/maps */
291	start = vma->vm_start;
292	if (stack_guard_page_start(vma, start))
293		start += PAGE_SIZE;
294	end = vma->vm_end;
295	if (stack_guard_page_end(vma, end))
296		end -= PAGE_SIZE;
297
298	seq_setwidth(m, 25 + sizeof(void *) * 6 - 1);
299	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ",
300			start,
301			end,
302			flags & VM_READ ? 'r' : '-',
303			flags & VM_WRITE ? 'w' : '-',
304			flags & VM_EXEC ? 'x' : '-',
305			flags & VM_MAYSHARE ? 's' : 'p',
306			pgoff,
307			MAJOR(dev), MINOR(dev), ino);
308
309	/*
310	 * Print the dentry name for named mappings, and a
311	 * special [heap] marker for the heap:
312	 */
313	if (file) {
314		seq_pad(m, ' ');
315		seq_file_path(m, file, "\n");
316		goto done;
317	}
318
319	if (vma->vm_ops && vma->vm_ops->name) {
320		name = vma->vm_ops->name(vma);
321		if (name)
322			goto done;
323	}
324
325	name = arch_vma_name(vma);
326	if (!name) {
327		pid_t tid;
328
329		if (!mm) {
330			name = "[vdso]";
331			goto done;
332		}
333
334		if (vma->vm_start <= mm->brk &&
335		    vma->vm_end >= mm->start_brk) {
336			name = "[heap]";
337			goto done;
338		}
339
340		tid = pid_of_stack(priv, vma, is_pid);
341		if (tid != 0) {
342			/*
343			 * Thread stack in /proc/PID/task/TID/maps or
344			 * the main process stack.
345			 */
346			if (!is_pid || (vma->vm_start <= mm->start_stack &&
347			    vma->vm_end >= mm->start_stack)) {
348				name = "[stack]";
349			} else {
350				/* Thread stack in /proc/PID/maps */
351				seq_pad(m, ' ');
352				seq_printf(m, "[stack:%d]", tid);
353			}
354		}
355	}
356
357done:
358	if (name) {
359		seq_pad(m, ' ');
360		seq_puts(m, name);
361	}
362	seq_putc(m, '\n');
363}
364
365static int show_map(struct seq_file *m, void *v, int is_pid)
366{
367	show_map_vma(m, v, is_pid);
368	m_cache_vma(m, v);
369	return 0;
370}
371
372static int show_pid_map(struct seq_file *m, void *v)
373{
374	return show_map(m, v, 1);
375}
376
377static int show_tid_map(struct seq_file *m, void *v)
378{
379	return show_map(m, v, 0);
380}
381
382static const struct seq_operations proc_pid_maps_op = {
383	.start	= m_start,
384	.next	= m_next,
385	.stop	= m_stop,
386	.show	= show_pid_map
387};
388
389static const struct seq_operations proc_tid_maps_op = {
390	.start	= m_start,
391	.next	= m_next,
392	.stop	= m_stop,
393	.show	= show_tid_map
394};
395
396static int pid_maps_open(struct inode *inode, struct file *file)
397{
398	return do_maps_open(inode, file, &proc_pid_maps_op);
399}
400
401static int tid_maps_open(struct inode *inode, struct file *file)
402{
403	return do_maps_open(inode, file, &proc_tid_maps_op);
404}
405
406const struct file_operations proc_pid_maps_operations = {
407	.open		= pid_maps_open,
408	.read		= seq_read,
409	.llseek		= seq_lseek,
410	.release	= proc_map_release,
411};
412
413const struct file_operations proc_tid_maps_operations = {
414	.open		= tid_maps_open,
415	.read		= seq_read,
416	.llseek		= seq_lseek,
417	.release	= proc_map_release,
418};
419
420/*
421 * Proportional Set Size (PSS): my share of RSS.
422 *
423 * PSS of a process is the count of pages it has in memory, where each
424 * page is divided by the number of processes sharing it.  So if a
425 * process has 1000 pages all to itself, and 1000 shared with one other
426 * process, its PSS will be 1500.
427 *
428 * To keep (accumulated) division errors low, we adopt a 64-bit
429 * fixed-point pss counter, so (pss >> PSS_SHIFT) is the real byte
430 * count.
431 *
432 * A shift of 12 before division means (assuming 4K page size):
433 * 	- 1M 3-user-pages add up to 8KB errors;
434 * 	- supports mapcount up to 2^24, or 16M;
435 * 	- supports PSS up to 2^52 bytes, or 4PB.
436 */
437#define PSS_SHIFT 12
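/*
 * Example (illustrative, assuming 4K pages): a page shared by three
 * processes contributes (4096 << PSS_SHIFT) / 3 to pss in smaps_account()
 * below, and show_smap() converts the accumulated value back to kB with
 * ">> (10 + PSS_SHIFT)".
 */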
438
439#ifdef CONFIG_PROC_PAGE_MONITOR
440struct mem_size_stats {
441	unsigned long resident;
442	unsigned long shared_clean;
443	unsigned long shared_dirty;
444	unsigned long private_clean;
445	unsigned long private_dirty;
446	unsigned long referenced;
447	unsigned long anonymous;
448	unsigned long anonymous_thp;
449	unsigned long swap;
450	unsigned long shared_hugetlb;
451	unsigned long private_hugetlb;
452	u64 pss;
453	u64 swap_pss;
454};
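/*
 * All of the unsigned long fields above are byte counts; pss and swap_pss
 * are byte counts scaled up by PSS_SHIFT as described above.
 */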
455
456static void smaps_account(struct mem_size_stats *mss, struct page *page,
457		unsigned long size, bool young, bool dirty)
458{
459	int mapcount;
460
461	if (PageAnon(page))
462		mss->anonymous += size;
463
464	mss->resident += size;
465	/* Accumulate the size in pages that have been accessed. */
466	if (young || page_is_young(page) || PageReferenced(page))
467		mss->referenced += size;
468	mapcount = page_mapcount(page);
469	if (mapcount >= 2) {
470		u64 pss_delta;
471
472		if (dirty || PageDirty(page))
473			mss->shared_dirty += size;
474		else
475			mss->shared_clean += size;
476		pss_delta = (u64)size << PSS_SHIFT;
477		do_div(pss_delta, mapcount);
478		mss->pss += pss_delta;
479	} else {
480		if (dirty || PageDirty(page))
481			mss->private_dirty += size;
482		else
483			mss->private_clean += size;
484		mss->pss += (u64)size << PSS_SHIFT;
485	}
486}
487
488static void smaps_pte_entry(pte_t *pte, unsigned long addr,
489		struct mm_walk *walk)
490{
491	struct mem_size_stats *mss = walk->private;
492	struct vm_area_struct *vma = walk->vma;
493	struct page *page = NULL;
494
495	if (pte_present(*pte)) {
496		page = vm_normal_page(vma, addr, *pte);
497	} else if (is_swap_pte(*pte)) {
498		swp_entry_t swpent = pte_to_swp_entry(*pte);
499
500		if (!non_swap_entry(swpent)) {
501			int mapcount;
502
503			mss->swap += PAGE_SIZE;
504			mapcount = swp_swapcount(swpent);
505			if (mapcount >= 2) {
506				u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
507
508				do_div(pss_delta, mapcount);
509				mss->swap_pss += pss_delta;
510			} else {
511				mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
512			}
513		} else if (is_migration_entry(swpent))
514			page = migration_entry_to_page(swpent);
515	}
516
517	if (!page)
518		return;
519	smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte));
520}
521
522#ifdef CONFIG_TRANSPARENT_HUGEPAGE
523static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
524		struct mm_walk *walk)
525{
526	struct mem_size_stats *mss = walk->private;
527	struct vm_area_struct *vma = walk->vma;
528	struct page *page;
529
530	/* FOLL_DUMP will return -EFAULT on huge zero page */
531	page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP);
532	if (IS_ERR_OR_NULL(page))
533		return;
534	mss->anonymous_thp += HPAGE_PMD_SIZE;
535	smaps_account(mss, page, HPAGE_PMD_SIZE,
536			pmd_young(*pmd), pmd_dirty(*pmd));
537}
538#else
539static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
540		struct mm_walk *walk)
541{
542}
543#endif
544
545static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
546			   struct mm_walk *walk)
547{
548	struct vm_area_struct *vma = walk->vma;
549	pte_t *pte;
550	spinlock_t *ptl;
551
552	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
553		smaps_pmd_entry(pmd, addr, walk);
554		spin_unlock(ptl);
555		return 0;
556	}
557
558	if (pmd_trans_unstable(pmd))
559		return 0;
560	/*
561	 * The mmap_sem held all the way back in m_start() is what
562	 * keeps khugepaged out of here and prevents it from collapsing
563	 * things under us.
564	 */
565	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
566	for (; addr != end; pte++, addr += PAGE_SIZE)
567		smaps_pte_entry(pte, addr, walk);
568	pte_unmap_unlock(pte - 1, ptl);
569	cond_resched();
570	return 0;
571}
572
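/*
 * Print the two-letter mnemonic for every flag set on the vma, e.g.
 * (illustrative) "VmFlags: rd ex mr mw me dw".
 */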
573static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
574{
575	/*
576	 * Don't forget to update Documentation/ on changes.
577	 */
578	static const char mnemonics[BITS_PER_LONG][2] = {
579		/*
580		 * In case we meet a flag we don't know about.
581		 */
582		[0 ... (BITS_PER_LONG-1)] = "??",
583
584		[ilog2(VM_READ)]	= "rd",
585		[ilog2(VM_WRITE)]	= "wr",
586		[ilog2(VM_EXEC)]	= "ex",
587		[ilog2(VM_SHARED)]	= "sh",
588		[ilog2(VM_MAYREAD)]	= "mr",
589		[ilog2(VM_MAYWRITE)]	= "mw",
590		[ilog2(VM_MAYEXEC)]	= "me",
591		[ilog2(VM_MAYSHARE)]	= "ms",
592		[ilog2(VM_GROWSDOWN)]	= "gd",
593		[ilog2(VM_PFNMAP)]	= "pf",
594		[ilog2(VM_DENYWRITE)]	= "dw",
595#ifdef CONFIG_X86_INTEL_MPX
596		[ilog2(VM_MPX)]		= "mp",
597#endif
598		[ilog2(VM_LOCKED)]	= "lo",
599		[ilog2(VM_IO)]		= "io",
600		[ilog2(VM_SEQ_READ)]	= "sr",
601		[ilog2(VM_RAND_READ)]	= "rr",
602		[ilog2(VM_DONTCOPY)]	= "dc",
603		[ilog2(VM_DONTEXPAND)]	= "de",
604		[ilog2(VM_ACCOUNT)]	= "ac",
605		[ilog2(VM_NORESERVE)]	= "nr",
606		[ilog2(VM_HUGETLB)]	= "ht",
607		[ilog2(VM_ARCH_1)]	= "ar",
608		[ilog2(VM_DONTDUMP)]	= "dd",
609#ifdef CONFIG_MEM_SOFT_DIRTY
610		[ilog2(VM_SOFTDIRTY)]	= "sd",
611#endif
612		[ilog2(VM_MIXEDMAP)]	= "mm",
613		[ilog2(VM_HUGEPAGE)]	= "hg",
614		[ilog2(VM_NOHUGEPAGE)]	= "nh",
615		[ilog2(VM_MERGEABLE)]	= "mg",
616		[ilog2(VM_UFFD_MISSING)]= "um",
617		[ilog2(VM_UFFD_WP)]	= "uw",
618	};
619	size_t i;
620
621	seq_puts(m, "VmFlags: ");
622	for (i = 0; i < BITS_PER_LONG; i++) {
623		if (vma->vm_flags & (1UL << i)) {
624			seq_printf(m, "%c%c ",
625				   mnemonics[i][0], mnemonics[i][1]);
626		}
627	}
628	seq_putc(m, '\n');
629}
630
631#ifdef CONFIG_HUGETLB_PAGE
632static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
633				 unsigned long addr, unsigned long end,
634				 struct mm_walk *walk)
635{
636	struct mem_size_stats *mss = walk->private;
637	struct vm_area_struct *vma = walk->vma;
638	struct page *page = NULL;
639
640	if (pte_present(*pte)) {
641		page = vm_normal_page(vma, addr, *pte);
642	} else if (is_swap_pte(*pte)) {
643		swp_entry_t swpent = pte_to_swp_entry(*pte);
644
645		if (is_migration_entry(swpent))
646			page = migration_entry_to_page(swpent);
647	}
648	if (page) {
649		int mapcount = page_mapcount(page);
650
651		if (mapcount >= 2)
652			mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
653		else
654			mss->private_hugetlb += huge_page_size(hstate_vma(vma));
655	}
656	return 0;
657}
658#endif /* CONFIG_HUGETLB_PAGE */
659
660static int show_smap(struct seq_file *m, void *v, int is_pid)
661{
662	struct vm_area_struct *vma = v;
663	struct mem_size_stats mss;
664	struct mm_walk smaps_walk = {
665		.pmd_entry = smaps_pte_range,
666#ifdef CONFIG_HUGETLB_PAGE
667		.hugetlb_entry = smaps_hugetlb_range,
668#endif
669		.mm = vma->vm_mm,
670		.private = &mss,
671	};
672
673	memset(&mss, 0, sizeof mss);
674	/* mmap_sem is held in m_start */
675	walk_page_vma(vma, &smaps_walk);
676
677	show_map_vma(m, vma, is_pid);
678
679	seq_printf(m,
680		   "Size:           %8lu kB\n"
681		   "Rss:            %8lu kB\n"
682		   "Pss:            %8lu kB\n"
683		   "Shared_Clean:   %8lu kB\n"
684		   "Shared_Dirty:   %8lu kB\n"
685		   "Private_Clean:  %8lu kB\n"
686		   "Private_Dirty:  %8lu kB\n"
687		   "Referenced:     %8lu kB\n"
688		   "Anonymous:      %8lu kB\n"
689		   "AnonHugePages:  %8lu kB\n"
690		   "Shared_Hugetlb: %8lu kB\n"
691		   "Private_Hugetlb: %7lu kB\n"
692		   "Swap:           %8lu kB\n"
693		   "SwapPss:        %8lu kB\n"
694		   "KernelPageSize: %8lu kB\n"
695		   "MMUPageSize:    %8lu kB\n"
696		   "Locked:         %8lu kB\n",
697		   (vma->vm_end - vma->vm_start) >> 10,
698		   mss.resident >> 10,
699		   (unsigned long)(mss.pss >> (10 + PSS_SHIFT)),
700		   mss.shared_clean  >> 10,
701		   mss.shared_dirty  >> 10,
702		   mss.private_clean >> 10,
703		   mss.private_dirty >> 10,
704		   mss.referenced >> 10,
705		   mss.anonymous >> 10,
706		   mss.anonymous_thp >> 10,
707		   mss.shared_hugetlb >> 10,
708		   mss.private_hugetlb >> 10,
709		   mss.swap >> 10,
710		   (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
711		   vma_kernel_pagesize(vma) >> 10,
712		   vma_mmu_pagesize(vma) >> 10,
713		   (vma->vm_flags & VM_LOCKED) ?
714			(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
715
716	show_smap_vma_flags(m, vma);
717	m_cache_vma(m, vma);
718	return 0;
719}
720
721static int show_pid_smap(struct seq_file *m, void *v)
722{
723	return show_smap(m, v, 1);
724}
725
726static int show_tid_smap(struct seq_file *m, void *v)
727{
728	return show_smap(m, v, 0);
729}
730
731static const struct seq_operations proc_pid_smaps_op = {
732	.start	= m_start,
733	.next	= m_next,
734	.stop	= m_stop,
735	.show	= show_pid_smap
736};
737
738static const struct seq_operations proc_tid_smaps_op = {
739	.start	= m_start,
740	.next	= m_next,
741	.stop	= m_stop,
742	.show	= show_tid_smap
743};
744
745static int pid_smaps_open(struct inode *inode, struct file *file)
746{
747	return do_maps_open(inode, file, &proc_pid_smaps_op);
748}
749
750static int tid_smaps_open(struct inode *inode, struct file *file)
751{
752	return do_maps_open(inode, file, &proc_tid_smaps_op);
753}
754
755const struct file_operations proc_pid_smaps_operations = {
756	.open		= pid_smaps_open,
757	.read		= seq_read,
758	.llseek		= seq_lseek,
759	.release	= proc_map_release,
760};
761
762const struct file_operations proc_tid_smaps_operations = {
763	.open		= tid_smaps_open,
764	.read		= seq_read,
765	.llseek		= seq_lseek,
766	.release	= proc_map_release,
767};
768
769enum clear_refs_types {
770	CLEAR_REFS_ALL = 1,
771	CLEAR_REFS_ANON,
772	CLEAR_REFS_MAPPED,
773	CLEAR_REFS_SOFT_DIRTY,
774	CLEAR_REFS_MM_HIWATER_RSS,
775	CLEAR_REFS_LAST,
776};
777
778struct clear_refs_private {
779	enum clear_refs_types type;
780};
781
782#ifdef CONFIG_MEM_SOFT_DIRTY
783static inline void clear_soft_dirty(struct vm_area_struct *vma,
784		unsigned long addr, pte_t *pte)
785{
786	/*
787	 * The soft-dirty tracker uses page faults (#PF) to catch writes
788	 * to pages, so write-protect the pte as well. See
789	 * Documentation/vm/soft-dirty.txt for a full description
790	 * of how soft-dirty works.
791	 */
792	pte_t ptent = *pte;
793
794	if (pte_present(ptent)) {
795		ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte);
796		ptent = pte_wrprotect(ptent);
797		ptent = pte_clear_soft_dirty(ptent);
798		ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent);
799	} else if (is_swap_pte(ptent)) {
800		ptent = pte_swp_clear_soft_dirty(ptent);
801		set_pte_at(vma->vm_mm, addr, pte, ptent);
802	}
803}
804#else
805static inline void clear_soft_dirty(struct vm_area_struct *vma,
806		unsigned long addr, pte_t *pte)
807{
808}
809#endif
810
811#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
812static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
813		unsigned long addr, pmd_t *pmdp)
814{
815	pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
816
817	pmd = pmd_wrprotect(pmd);
818	pmd = pmd_clear_soft_dirty(pmd);
819
820	if (vma->vm_flags & VM_SOFTDIRTY)
821		vma->vm_flags &= ~VM_SOFTDIRTY;
822
823	set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
824}
825#else
826static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
827		unsigned long addr, pmd_t *pmdp)
828{
829}
830#endif
831
832static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
833				unsigned long end, struct mm_walk *walk)
834{
835	struct clear_refs_private *cp = walk->private;
836	struct vm_area_struct *vma = walk->vma;
837	pte_t *pte, ptent;
838	spinlock_t *ptl;
839	struct page *page;
840
841	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
842		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
843			clear_soft_dirty_pmd(vma, addr, pmd);
844			goto out;
845		}
846
847		page = pmd_page(*pmd);
848
849		/* Clear accessed and referenced bits. */
850		pmdp_test_and_clear_young(vma, addr, pmd);
851		test_and_clear_page_young(page);
852		ClearPageReferenced(page);
853out:
854		spin_unlock(ptl);
855		return 0;
856	}
857
858	if (pmd_trans_unstable(pmd))
859		return 0;
860
861	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
862	for (; addr != end; pte++, addr += PAGE_SIZE) {
863		ptent = *pte;
864
865		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
866			clear_soft_dirty(vma, addr, pte);
867			continue;
868		}
869
870		if (!pte_present(ptent))
871			continue;
872
873		page = vm_normal_page(vma, addr, ptent);
874		if (!page)
875			continue;
876
877		/* Clear accessed and referenced bits. */
878		ptep_test_and_clear_young(vma, addr, pte);
879		test_and_clear_page_young(page);
880		ClearPageReferenced(page);
881	}
882	pte_unmap_unlock(pte - 1, ptl);
883	cond_resched();
884	return 0;
885}
886
887static int clear_refs_test_walk(unsigned long start, unsigned long end,
888				struct mm_walk *walk)
889{
890	struct clear_refs_private *cp = walk->private;
891	struct vm_area_struct *vma = walk->vma;
892
893	if (vma->vm_flags & VM_PFNMAP)
894		return 1;
895
896	/*
897	 * Writing 1 to /proc/pid/clear_refs affects all pages.
898	 * Writing 2 to /proc/pid/clear_refs only affects anonymous pages.
899	 * Writing 3 to /proc/pid/clear_refs only affects file mapped pages.
900	 * Writing 4 to /proc/pid/clear_refs affects all pages.
901	 */
902	if (cp->type == CLEAR_REFS_ANON && vma->vm_file)
903		return 1;
904	if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file)
905		return 1;
906	return 0;
907}
908
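/*
 * Typical use from userspace: writing "1" clears the referenced/young
 * bits on all mappings, "2"/"3" restrict that to anonymous or file-backed
 * mappings, "4" clears the soft-dirty bits so that later writes show up
 * in /proc/PID/pagemap, and "5" resets the peak RSS (VmHWM) accounting.
 * See Documentation/vm/soft-dirty.txt.
 */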
909static ssize_t clear_refs_write(struct file *file, const char __user *buf,
910				size_t count, loff_t *ppos)
911{
912	struct task_struct *task;
913	char buffer[PROC_NUMBUF];
914	struct mm_struct *mm;
915	struct vm_area_struct *vma;
916	enum clear_refs_types type;
917	int itype;
918	int rv;
919
920	memset(buffer, 0, sizeof(buffer));
921	if (count > sizeof(buffer) - 1)
922		count = sizeof(buffer) - 1;
923	if (copy_from_user(buffer, buf, count))
924		return -EFAULT;
925	rv = kstrtoint(strstrip(buffer), 10, &itype);
926	if (rv < 0)
927		return rv;
928	type = (enum clear_refs_types)itype;
929	if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
930		return -EINVAL;
931
932	task = get_proc_task(file_inode(file));
933	if (!task)
934		return -ESRCH;
935	mm = get_task_mm(task);
936	if (mm) {
937		struct clear_refs_private cp = {
938			.type = type,
939		};
940		struct mm_walk clear_refs_walk = {
941			.pmd_entry = clear_refs_pte_range,
942			.test_walk = clear_refs_test_walk,
943			.mm = mm,
944			.private = &cp,
945		};
946
947		if (type == CLEAR_REFS_MM_HIWATER_RSS) {
948			/*
949			 * Writing 5 to /proc/pid/clear_refs resets the peak
950			 * resident set size to this mm's current rss value.
951			 */
952			down_write(&mm->mmap_sem);
953			reset_mm_hiwater_rss(mm);
954			up_write(&mm->mmap_sem);
955			goto out_mm;
956		}
957
958		down_read(&mm->mmap_sem);
959		if (type == CLEAR_REFS_SOFT_DIRTY) {
960			for (vma = mm->mmap; vma; vma = vma->vm_next) {
961				if (!(vma->vm_flags & VM_SOFTDIRTY))
962					continue;
963				up_read(&mm->mmap_sem);
964				down_write(&mm->mmap_sem);
965				for (vma = mm->mmap; vma; vma = vma->vm_next) {
966					vma->vm_flags &= ~VM_SOFTDIRTY;
967					vma_set_page_prot(vma);
968				}
969				downgrade_write(&mm->mmap_sem);
970				break;
971			}
972			mmu_notifier_invalidate_range_start(mm, 0, -1);
973		}
974		walk_page_range(0, ~0UL, &clear_refs_walk);
975		if (type == CLEAR_REFS_SOFT_DIRTY)
976			mmu_notifier_invalidate_range_end(mm, 0, -1);
977		flush_tlb_mm(mm);
978		up_read(&mm->mmap_sem);
979out_mm:
980		mmput(mm);
981	}
982	put_task_struct(task);
983
984	return count;
985}
986
987const struct file_operations proc_clear_refs_operations = {
988	.write		= clear_refs_write,
989	.llseek		= noop_llseek,
990};
991
992typedef struct {
993	u64 pme;
994} pagemap_entry_t;
995
996struct pagemapread {
997	int pos, len;		/* units: PM_ENTRY_BYTES, not bytes */
998	pagemap_entry_t *buffer;
999	bool show_pfn;
1000};
1001
1002#define PAGEMAP_WALK_SIZE	(PMD_SIZE)
1003#define PAGEMAP_WALK_MASK	(PMD_MASK)
1004
1005#define PM_ENTRY_BYTES		sizeof(pagemap_entry_t)
1006#define PM_PFRAME_BITS		55
1007#define PM_PFRAME_MASK		GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
1008#define PM_SOFT_DIRTY		BIT_ULL(55)
1009#define PM_MMAP_EXCLUSIVE	BIT_ULL(56)
1010#define PM_FILE			BIT_ULL(61)
1011#define PM_SWAP			BIT_ULL(62)
1012#define PM_PRESENT		BIT_ULL(63)
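/* Bits 0-54 hold the pfn or swap entry; the flag bits start at bit 55. */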
1013
1014#define PM_END_OF_BUFFER    1
1015
1016static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
1017{
1018	return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
1019}
1020
1021static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
1022			  struct pagemapread *pm)
1023{
1024	pm->buffer[pm->pos++] = *pme;
1025	if (pm->pos >= pm->len)
1026		return PM_END_OF_BUFFER;
1027	return 0;
1028}
1029
1030static int pagemap_pte_hole(unsigned long start, unsigned long end,
1031				struct mm_walk *walk)
1032{
1033	struct pagemapread *pm = walk->private;
1034	unsigned long addr = start;
1035	int err = 0;
1036
1037	while (addr < end) {
1038		struct vm_area_struct *vma = find_vma(walk->mm, addr);
1039		pagemap_entry_t pme = make_pme(0, 0);
1040		/* End of address space hole, which we mark as non-present. */
1041		unsigned long hole_end;
1042
1043		if (vma)
1044			hole_end = min(end, vma->vm_start);
1045		else
1046			hole_end = end;
1047
1048		for (; addr < hole_end; addr += PAGE_SIZE) {
1049			err = add_to_pagemap(addr, &pme, pm);
1050			if (err)
1051				goto out;
1052		}
1053
1054		if (!vma)
1055			break;
1056
1057		/* Addresses in the VMA. */
1058		if (vma->vm_flags & VM_SOFTDIRTY)
1059			pme = make_pme(0, PM_SOFT_DIRTY);
1060		for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
1061			err = add_to_pagemap(addr, &pme, pm);
1062			if (err)
1063				goto out;
1064		}
1065	}
1066out:
1067	return err;
1068}
1069
1070static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
1071		struct vm_area_struct *vma, unsigned long addr, pte_t pte)
1072{
1073	u64 frame = 0, flags = 0;
1074	struct page *page = NULL;
1075
1076	if (pte_present(pte)) {
1077		if (pm->show_pfn)
1078			frame = pte_pfn(pte);
1079		flags |= PM_PRESENT;
1080		page = vm_normal_page(vma, addr, pte);
1081		if (pte_soft_dirty(pte))
1082			flags |= PM_SOFT_DIRTY;
1083	} else if (is_swap_pte(pte)) {
1084		swp_entry_t entry;
1085		if (pte_swp_soft_dirty(pte))
1086			flags |= PM_SOFT_DIRTY;
1087		entry = pte_to_swp_entry(pte);
1088		frame = swp_type(entry) |
1089			(swp_offset(entry) << MAX_SWAPFILES_SHIFT);
1090		flags |= PM_SWAP;
1091		if (is_migration_entry(entry))
1092			page = migration_entry_to_page(entry);
1093	}
1094
1095	if (page && !PageAnon(page))
1096		flags |= PM_FILE;
1097	if (page && page_mapcount(page) == 1)
1098		flags |= PM_MMAP_EXCLUSIVE;
1099	if (vma->vm_flags & VM_SOFTDIRTY)
1100		flags |= PM_SOFT_DIRTY;
1101
1102	return make_pme(frame, flags);
1103}
1104
1105static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
1106			     struct mm_walk *walk)
1107{
1108	struct vm_area_struct *vma = walk->vma;
1109	struct pagemapread *pm = walk->private;
1110	spinlock_t *ptl;
1111	pte_t *pte, *orig_pte;
1112	int err = 0;
1113
1114#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1115	if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) {
1116		u64 flags = 0, frame = 0;
1117		pmd_t pmd = *pmdp;
1118
1119		if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd))
1120			flags |= PM_SOFT_DIRTY;
1121
1122		/*
1123		 * Currently the pmd for a thp is always present because a thp
1124		 * cannot be swapped out, migrated, or HWPOISONed (it is split
1125		 * in such cases instead).
1126		 * This if-check is just to prepare for a future implementation.
1127		 */
1128		if (pmd_present(pmd)) {
1129			struct page *page = pmd_page(pmd);
1130
1131			if (page_mapcount(page) == 1)
1132				flags |= PM_MMAP_EXCLUSIVE;
1133
1134			flags |= PM_PRESENT;
1135			if (pm->show_pfn)
1136				frame = pmd_pfn(pmd) +
1137					((addr & ~PMD_MASK) >> PAGE_SHIFT);
1138		}
1139
1140		for (; addr != end; addr += PAGE_SIZE) {
1141			pagemap_entry_t pme = make_pme(frame, flags);
1142
1143			err = add_to_pagemap(addr, &pme, pm);
1144			if (err)
1145				break;
1146			if (pm->show_pfn && (flags & PM_PRESENT))
1147				frame++;
1148		}
1149		spin_unlock(ptl);
1150		return err;
1151	}
1152
1153	if (pmd_trans_unstable(pmdp))
1154		return 0;
1155#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1156
1157	/*
1158	 * We can assume that @vma always points to a valid VMA and that @end
1159	 * never goes beyond vma->vm_end.
1160	 */
1161	orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
1162	for (; addr < end; pte++, addr += PAGE_SIZE) {
1163		pagemap_entry_t pme;
1164
1165		pme = pte_to_pagemap_entry(pm, vma, addr, *pte);
1166		err = add_to_pagemap(addr, &pme, pm);
1167		if (err)
1168			break;
1169	}
1170	pte_unmap_unlock(orig_pte, ptl);
1171
1172	cond_resched();
1173
1174	return err;
1175}
1176
1177#ifdef CONFIG_HUGETLB_PAGE
1178/* This function walks within one hugetlb entry in a single call */
1179static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
1180				 unsigned long addr, unsigned long end,
1181				 struct mm_walk *walk)
1182{
1183	struct pagemapread *pm = walk->private;
1184	struct vm_area_struct *vma = walk->vma;
1185	u64 flags = 0, frame = 0;
1186	int err = 0;
1187	pte_t pte;
1188
1189	if (vma->vm_flags & VM_SOFTDIRTY)
1190		flags |= PM_SOFT_DIRTY;
1191
1192	pte = huge_ptep_get(ptep);
1193	if (pte_present(pte)) {
1194		struct page *page = pte_page(pte);
1195
1196		if (!PageAnon(page))
1197			flags |= PM_FILE;
1198
1199		if (page_mapcount(page) == 1)
1200			flags |= PM_MMAP_EXCLUSIVE;
1201
1202		flags |= PM_PRESENT;
1203		if (pm->show_pfn)
1204			frame = pte_pfn(pte) +
1205				((addr & ~hmask) >> PAGE_SHIFT);
1206	}
1207
1208	for (; addr != end; addr += PAGE_SIZE) {
1209		pagemap_entry_t pme = make_pme(frame, flags);
1210
1211		err = add_to_pagemap(addr, &pme, pm);
1212		if (err)
1213			return err;
1214		if (pm->show_pfn && (flags & PM_PRESENT))
1215			frame++;
1216	}
1217
1218	cond_resched();
1219
1220	return err;
1221}
1222#endif /* CONFIG_HUGETLB_PAGE */
1223
1224/*
1225 * /proc/pid/pagemap - an array mapping virtual pages to pfns
1226 *
1227 * For each page in the address space, this file contains one 64-bit entry
1228 * consisting of the following:
1229 *
1230 * Bits 0-54  page frame number (PFN) if present
1231 * Bits 0-4   swap type if swapped
1232 * Bits 5-54  swap offset if swapped
1233 * Bit  55    pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
1234 * Bit  56    page exclusively mapped
1235 * Bits 57-60 zero
1236 * Bit  61    page is file-page or shared-anon
1237 * Bit  62    page swapped
1238 * Bit  63    page present
1239 *
1240 * If the page is not present but in swap, then the PFN contains an
1241 * encoding of the swap file number and the page's offset into the
1242 * swap. Unmapped pages return a null PFN. This allows determining
1243 * precisely which pages are mapped (or in swap) and comparing mapped
1244 * pages between processes.
1245 *
1246 * Efficient users of this interface will use /proc/pid/maps to
1247 * determine which areas of memory are actually mapped and llseek to
1248 * skip over unmapped regions.
1249 */
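/*
 * Illustrative userspace usage (not part of this file): to query virtual
 * address vaddr, seek to (vaddr / PAGE_SIZE) * sizeof(__u64) and read one
 * entry; bit 63 set means the page is present, and for readers with
 * CAP_SYS_ADMIN the low 55 bits then hold the page frame number.
 */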
1250static ssize_t pagemap_read(struct file *file, char __user *buf,
1251			    size_t count, loff_t *ppos)
1252{
1253	struct mm_struct *mm = file->private_data;
1254	struct pagemapread pm;
1255	struct mm_walk pagemap_walk = {};
1256	unsigned long src;
1257	unsigned long svpfn;
1258	unsigned long start_vaddr;
1259	unsigned long end_vaddr;
1260	int ret = 0, copied = 0;
1261
1262	if (!mm || !atomic_inc_not_zero(&mm->mm_users))
1263		goto out;
1264
1265	ret = -EINVAL;
1266	/* file position must be aligned */
1267	if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
1268		goto out_mm;
1269
1270	ret = 0;
1271	if (!count)
1272		goto out_mm;
1273
1274	/* do not disclose physical addresses: attack vector */
1275	pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
1276
1277	pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
1278	pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
1279	ret = -ENOMEM;
1280	if (!pm.buffer)
1281		goto out_mm;
1282
1283	pagemap_walk.pmd_entry = pagemap_pmd_range;
1284	pagemap_walk.pte_hole = pagemap_pte_hole;
1285#ifdef CONFIG_HUGETLB_PAGE
1286	pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
1287#endif
1288	pagemap_walk.mm = mm;
1289	pagemap_walk.private = &pm;
1290
1291	src = *ppos;
1292	svpfn = src / PM_ENTRY_BYTES;
1293	start_vaddr = svpfn << PAGE_SHIFT;
1294	end_vaddr = mm->task_size;
1295
1296	/* watch out for wraparound */
1297	if (svpfn > mm->task_size >> PAGE_SHIFT)
1298		start_vaddr = end_vaddr;
1299
1300	/*
1301	 * The odds are that this will stop walking way
1302	 * before end_vaddr, because the length of the
1303	 * user buffer is tracked in "pm", and the walk
1304	 * will stop when we hit the end of the buffer.
1305	 */
1306	ret = 0;
1307	while (count && (start_vaddr < end_vaddr)) {
1308		int len;
1309		unsigned long end;
1310
1311		pm.pos = 0;
1312		end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
1313		/* overflow ? */
1314		if (end < start_vaddr || end > end_vaddr)
1315			end = end_vaddr;
1316		down_read(&mm->mmap_sem);
1317		ret = walk_page_range(start_vaddr, end, &pagemap_walk);
1318		up_read(&mm->mmap_sem);
1319		start_vaddr = end;
1320
1321		len = min(count, PM_ENTRY_BYTES * pm.pos);
1322		if (copy_to_user(buf, pm.buffer, len)) {
1323			ret = -EFAULT;
1324			goto out_free;
1325		}
1326		copied += len;
1327		buf += len;
1328		count -= len;
1329	}
1330	*ppos += copied;
1331	if (!ret || ret == PM_END_OF_BUFFER)
1332		ret = copied;
1333
1334out_free:
1335	kfree(pm.buffer);
1336out_mm:
1337	mmput(mm);
1338out:
1339	return ret;
1340}
1341
1342static int pagemap_open(struct inode *inode, struct file *file)
1343{
1344	struct mm_struct *mm;
1345
1346	mm = proc_mem_open(inode, PTRACE_MODE_READ);
1347	if (IS_ERR(mm))
1348		return PTR_ERR(mm);
1349	file->private_data = mm;
1350	return 0;
1351}
1352
1353static int pagemap_release(struct inode *inode, struct file *file)
1354{
1355	struct mm_struct *mm = file->private_data;
1356
1357	if (mm)
1358		mmdrop(mm);
1359	return 0;
1360}
1361
1362const struct file_operations proc_pagemap_operations = {
1363	.llseek		= mem_lseek, /* borrow this */
1364	.read		= pagemap_read,
1365	.open		= pagemap_open,
1366	.release	= pagemap_release,
1367};
1368#endif /* CONFIG_PROC_PAGE_MONITOR */
1369
1370#ifdef CONFIG_NUMA
1371
1372struct numa_maps {
1373	unsigned long pages;
1374	unsigned long anon;
1375	unsigned long active;
1376	unsigned long writeback;
1377	unsigned long mapcount_max;
1378	unsigned long dirty;
1379	unsigned long swapcache;
1380	unsigned long node[MAX_NUMNODES];
1381};
1382
1383struct numa_maps_private {
1384	struct proc_maps_private proc_maps;
1385	struct numa_maps md;
1386};
1387
1388static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
1389			unsigned long nr_pages)
1390{
1391	int count = page_mapcount(page);
1392
1393	md->pages += nr_pages;
1394	if (pte_dirty || PageDirty(page))
1395		md->dirty += nr_pages;
1396
1397	if (PageSwapCache(page))
1398		md->swapcache += nr_pages;
1399
1400	if (PageActive(page) || PageUnevictable(page))
1401		md->active += nr_pages;
1402
1403	if (PageWriteback(page))
1404		md->writeback += nr_pages;
1405
1406	if (PageAnon(page))
1407		md->anon += nr_pages;
1408
1409	if (count > md->mapcount_max)
1410		md->mapcount_max = count;
1411
1412	md->node[page_to_nid(page)] += nr_pages;
1413}
1414
1415static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
1416		unsigned long addr)
1417{
1418	struct page *page;
1419	int nid;
1420
1421	if (!pte_present(pte))
1422		return NULL;
1423
1424	page = vm_normal_page(vma, addr, pte);
1425	if (!page)
1426		return NULL;
1427
1428	if (PageReserved(page))
1429		return NULL;
1430
1431	nid = page_to_nid(page);
1432	if (!node_isset(nid, node_states[N_MEMORY]))
1433		return NULL;
1434
1435	return page;
1436}
1437
1438#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1439static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
1440					      struct vm_area_struct *vma,
1441					      unsigned long addr)
1442{
1443	struct page *page;
1444	int nid;
1445
1446	if (!pmd_present(pmd))
1447		return NULL;
1448
1449	page = vm_normal_page_pmd(vma, addr, pmd);
1450	if (!page)
1451		return NULL;
1452
1453	if (PageReserved(page))
1454		return NULL;
1455
1456	nid = page_to_nid(page);
1457	if (!node_isset(nid, node_states[N_MEMORY]))
1458		return NULL;
1459
1460	return page;
1461}
1462#endif
1463
1464static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1465		unsigned long end, struct mm_walk *walk)
1466{
1467	struct numa_maps *md = walk->private;
1468	struct vm_area_struct *vma = walk->vma;
1469	spinlock_t *ptl;
1470	pte_t *orig_pte;
1471	pte_t *pte;
1472
1473#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1474	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1475		struct page *page;
1476
1477		page = can_gather_numa_stats_pmd(*pmd, vma, addr);
1478		if (page)
1479			gather_stats(page, md, pmd_dirty(*pmd),
1480				     HPAGE_PMD_SIZE/PAGE_SIZE);
1481		spin_unlock(ptl);
1482		return 0;
1483	}
1484
1485	if (pmd_trans_unstable(pmd))
1486		return 0;
1487#endif
1488	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
1489	do {
1490		struct page *page = can_gather_numa_stats(*pte, vma, addr);
1491		if (!page)
1492			continue;
1493		gather_stats(page, md, pte_dirty(*pte), 1);
1494
1495	} while (pte++, addr += PAGE_SIZE, addr != end);
1496	pte_unmap_unlock(orig_pte, ptl);
1497	return 0;
1498}
1499#ifdef CONFIG_HUGETLB_PAGE
1500static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
1501		unsigned long addr, unsigned long end, struct mm_walk *walk)
1502{
1503	pte_t huge_pte = huge_ptep_get(pte);
1504	struct numa_maps *md;
1505	struct page *page;
1506
1507	if (!pte_present(huge_pte))
1508		return 0;
1509
1510	page = pte_page(huge_pte);
1511	if (!page)
1512		return 0;
1513
1514	md = walk->private;
1515	gather_stats(page, md, pte_dirty(huge_pte), 1);
1516	return 0;
1517}
1518
1519#else
1520static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
1521		unsigned long addr, unsigned long end, struct mm_walk *walk)
1522{
1523	return 0;
1524}
1525#endif
1526
1527/*
1528 * Display pages allocated per node and memory policy via /proc.
1529 */
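/*
 * An output line might look like (illustrative values):
 *
 *   00400000 default file=/bin/cat mapped=2 active=1 N0=2 kernelpagesize_kB=4
 */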
1530static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1531{
1532	struct numa_maps_private *numa_priv = m->private;
1533	struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
1534	struct vm_area_struct *vma = v;
1535	struct numa_maps *md = &numa_priv->md;
1536	struct file *file = vma->vm_file;
1537	struct mm_struct *mm = vma->vm_mm;
1538	struct mm_walk walk = {
1539		.hugetlb_entry = gather_hugetlb_stats,
1540		.pmd_entry = gather_pte_stats,
1541		.private = md,
1542		.mm = mm,
1543	};
1544	struct mempolicy *pol;
1545	char buffer[64];
1546	int nid;
1547
1548	if (!mm)
1549		return 0;
1550
1551	/* Ensure we start with an empty set of numa_maps statistics. */
1552	memset(md, 0, sizeof(*md));
1553
1554	pol = __get_vma_policy(vma, vma->vm_start);
1555	if (pol) {
1556		mpol_to_str(buffer, sizeof(buffer), pol);
1557		mpol_cond_put(pol);
1558	} else {
1559		mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy);
1560	}
1561
1562	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1563
1564	if (file) {
1565		seq_puts(m, " file=");
1566		seq_file_path(m, file, "\n\t= ");
1567	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1568		seq_puts(m, " heap");
1569	} else {
1570		pid_t tid = pid_of_stack(proc_priv, vma, is_pid);
1571		if (tid != 0) {
1572			/*
1573			 * Thread stack in /proc/PID/task/TID/maps or
1574			 * the main process stack.
1575			 */
1576			if (!is_pid || (vma->vm_start <= mm->start_stack &&
1577			    vma->vm_end >= mm->start_stack))
1578				seq_puts(m, " stack");
1579			else
1580				seq_printf(m, " stack:%d", tid);
1581		}
1582	}
1583
1584	if (is_vm_hugetlb_page(vma))
1585		seq_puts(m, " huge");
1586
1587	/* mmap_sem is held by m_start */
1588	walk_page_vma(vma, &walk);
1589
1590	if (!md->pages)
1591		goto out;
1592
1593	if (md->anon)
1594		seq_printf(m, " anon=%lu", md->anon);
1595
1596	if (md->dirty)
1597		seq_printf(m, " dirty=%lu", md->dirty);
1598
1599	if (md->pages != md->anon && md->pages != md->dirty)
1600		seq_printf(m, " mapped=%lu", md->pages);
1601
1602	if (md->mapcount_max > 1)
1603		seq_printf(m, " mapmax=%lu", md->mapcount_max);
1604
1605	if (md->swapcache)
1606		seq_printf(m, " swapcache=%lu", md->swapcache);
1607
1608	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1609		seq_printf(m, " active=%lu", md->active);
1610
1611	if (md->writeback)
1612		seq_printf(m, " writeback=%lu", md->writeback);
1613
1614	for_each_node_state(nid, N_MEMORY)
1615		if (md->node[nid])
1616			seq_printf(m, " N%d=%lu", nid, md->node[nid]);
1617
1618	seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
1619out:
1620	seq_putc(m, '\n');
1621	m_cache_vma(m, vma);
1622	return 0;
1623}
1624
1625static int show_pid_numa_map(struct seq_file *m, void *v)
1626{
1627	return show_numa_map(m, v, 1);
1628}
1629
1630static int show_tid_numa_map(struct seq_file *m, void *v)
1631{
1632	return show_numa_map(m, v, 0);
1633}
1634
1635static const struct seq_operations proc_pid_numa_maps_op = {
1636	.start  = m_start,
1637	.next   = m_next,
1638	.stop   = m_stop,
1639	.show   = show_pid_numa_map,
1640};
1641
1642static const struct seq_operations proc_tid_numa_maps_op = {
1643	.start  = m_start,
1644	.next   = m_next,
1645	.stop   = m_stop,
1646	.show   = show_tid_numa_map,
1647};
1648
1649static int numa_maps_open(struct inode *inode, struct file *file,
1650			  const struct seq_operations *ops)
1651{
1652	return proc_maps_open(inode, file, ops,
1653				sizeof(struct numa_maps_private));
1654}
1655
1656static int pid_numa_maps_open(struct inode *inode, struct file *file)
1657{
1658	return numa_maps_open(inode, file, &proc_pid_numa_maps_op);
1659}
1660
1661static int tid_numa_maps_open(struct inode *inode, struct file *file)
1662{
1663	return numa_maps_open(inode, file, &proc_tid_numa_maps_op);
1664}
1665
1666const struct file_operations proc_pid_numa_maps_operations = {
1667	.open		= pid_numa_maps_open,
1668	.read		= seq_read,
1669	.llseek		= seq_lseek,
1670	.release	= proc_map_release,
1671};
1672
1673const struct file_operations proc_tid_numa_maps_operations = {
1674	.open		= tid_numa_maps_open,
1675	.read		= seq_read,
1676	.llseek		= seq_lseek,
1677	.release	= proc_map_release,
1678};
1679#endif /* CONFIG_NUMA */
1680