/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 * Subject to the GNU Public License, version 2.
 *
 * NUMA policy allows the user to give hints in which node(s) memory should
 * be allocated.
 *
 * Support four policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a per-process
 *                counter is used.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead.
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *		  in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non-interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */
55
56/* Notebook:
57   fix mmap readahead to honour policy and enable policy for any page cache
58   object
59   statistics for bigpages
60   global policy for page cache? currently it uses process policy. Requires
61   first item above.
62   handle mremap for shared memory (currently ignored for the policy)
63   grows down?
64   make bind policy root only? It can trigger oom much faster and the
65   kernel is not always grateful with that.
66*/
67
68#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
69
70#include <linux/mempolicy.h>
71#include <linux/mm.h>
72#include <linux/highmem.h>
73#include <linux/hugetlb.h>
74#include <linux/kernel.h>
75#include <linux/sched.h>
76#include <linux/nodemask.h>
77#include <linux/cpuset.h>
78#include <linux/slab.h>
79#include <linux/string.h>
80#include <linux/export.h>
81#include <linux/nsproxy.h>
82#include <linux/interrupt.h>
83#include <linux/init.h>
84#include <linux/compat.h>
85#include <linux/swap.h>
86#include <linux/seq_file.h>
87#include <linux/proc_fs.h>
88#include <linux/migrate.h>
89#include <linux/ksm.h>
90#include <linux/rmap.h>
91#include <linux/security.h>
92#include <linux/syscalls.h>
93#include <linux/ctype.h>
94#include <linux/mm_inline.h>
95#include <linux/mmu_notifier.h>
96#include <linux/printk.h>
97
98#include <asm/tlbflush.h>
99#include <asm/uaccess.h>
100#include <linux/random.h>
101
102#include "internal.h"
103
104/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)	/* Skip checks for contiguous vmas */
106#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
107
108static struct kmem_cache *policy_cache;
109static struct kmem_cache *sn_cache;
110
/* Highest zone. A specific allocation for a zone below that is not
   policied. */
113enum zone_type policy_zone = 0;
114
115/*
116 * run-time system-wide default policy => local allocation
117 */
118static struct mempolicy default_policy = {
119	.refcnt = ATOMIC_INIT(1), /* never free it */
120	.mode = MPOL_PREFERRED,
121	.flags = MPOL_F_LOCAL,
122};
123
124static struct mempolicy preferred_node_policy[MAX_NUMNODES];
125
126struct mempolicy *get_task_policy(struct task_struct *p)
127{
128	struct mempolicy *pol = p->mempolicy;
129	int node;
130
131	if (pol)
132		return pol;
133
134	node = numa_node_id();
135	if (node != NUMA_NO_NODE) {
136		pol = &preferred_node_policy[node];
137		/* preferred_node_policy is not initialised early in boot */
138		if (pol->mode)
139			return pol;
140	}
141
142	return &default_policy;
143}
144
145static const struct mempolicy_operations {
146	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	/*
	 * If the read-side task has no lock to protect task->mempolicy, the
	 * write-side task will rebind task->mempolicy in two steps.  The
	 * first step sets all the newly allowed nodes, and the second step
	 * clears all the now-disallowed nodes.  This way there is never a
	 * point where no node is available to allocate a page from.
	 * If we do have a lock to protect task->mempolicy on the read side,
	 * we rebind directly in one pass.
	 *
	 * step:
	 * 	MPOL_REBIND_ONCE  - do the rebind work in one pass
	 * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
	 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
	 */
161	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
162			enum mpol_rebind_step step);
163} mpol_ops[MPOL_MAX];
164
165static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
166{
167	return pol->flags & MPOL_MODE_FLAGS;
168}
169
170static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
171				   const nodemask_t *rel)
172{
173	nodemask_t tmp;
174	nodes_fold(tmp, *orig, nodes_weight(*rel));
175	nodes_onto(*ret, tmp, *rel);
176}
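
/*
 * Worked example (illustrative): a relative user nodemask of {0,2} applied
 * to an allowed mask of {4,5,6} is first folded onto the 3-node weight
 * (still {0,2}) and then mapped onto the allowed set, bit 0 -> node 4 and
 * bit 2 -> node 6, giving an effective mask of {4,6}.
 */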
177
178static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
179{
180	if (nodes_empty(*nodes))
181		return -EINVAL;
182	pol->v.nodes = *nodes;
183	return 0;
184}
185
186static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
187{
188	if (!nodes)
189		pol->flags |= MPOL_F_LOCAL;	/* local allocation */
190	else if (nodes_empty(*nodes))
191		return -EINVAL;			/*  no allowed nodes */
192	else
193		pol->v.preferred_node = first_node(*nodes);
194	return 0;
195}
196
197static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
198{
199	if (nodes_empty(*nodes))
200		return -EINVAL;
201	pol->v.nodes = *nodes;
202	return 0;
203}
204
/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy.  mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags.  But, we need to
 * handle an empty nodemask with MPOL_PREFERRED here.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy.  May also be called holding mmap_sem for write.
 */
214static int mpol_set_nodemask(struct mempolicy *pol,
215		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
216{
217	int ret;
218
219	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
220	if (pol == NULL)
221		return 0;
222	/* Check N_MEMORY */
223	nodes_and(nsc->mask1,
224		  cpuset_current_mems_allowed, node_states[N_MEMORY]);
225
226	VM_BUG_ON(!nodes);
227	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
228		nodes = NULL;	/* explicit local allocation */
229	else {
230		if (pol->flags & MPOL_F_RELATIVE_NODES)
231			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
232		else
233			nodes_and(nsc->mask2, *nodes, nsc->mask1);
234
235		if (mpol_store_user_nodemask(pol))
236			pol->w.user_nodemask = *nodes;
237		else
238			pol->w.cpuset_mems_allowed =
239						cpuset_current_mems_allowed;
240	}
241
242	if (nodes)
243		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
244	else
245		ret = mpol_ops[pol->mode].create(pol, NULL);
246	return ret;
247}
248
/*
 * This function just creates a new policy, does some sanity checking and
 * simple initialization.  You must invoke mpol_set_nodemask() to set nodes.
 */
253static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
254				  nodemask_t *nodes)
255{
256	struct mempolicy *policy;
257
258	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
259		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
260
261	if (mode == MPOL_DEFAULT) {
262		if (nodes && !nodes_empty(*nodes))
263			return ERR_PTR(-EINVAL);
264		return NULL;
265	}
266	VM_BUG_ON(!nodes);
267
268	/*
269	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
270	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
271	 * All other modes require a valid pointer to a non-empty nodemask.
272	 */
273	if (mode == MPOL_PREFERRED) {
274		if (nodes_empty(*nodes)) {
275			if (((flags & MPOL_F_STATIC_NODES) ||
276			     (flags & MPOL_F_RELATIVE_NODES)))
277				return ERR_PTR(-EINVAL);
278		}
279	} else if (mode == MPOL_LOCAL) {
280		if (!nodes_empty(*nodes))
281			return ERR_PTR(-EINVAL);
282		mode = MPOL_PREFERRED;
283	} else if (nodes_empty(*nodes))
284		return ERR_PTR(-EINVAL);
285	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
286	if (!policy)
287		return ERR_PTR(-ENOMEM);
288	atomic_set(&policy->refcnt, 1);
289	policy->mode = mode;
290	policy->flags = flags;
291
292	return policy;
293}
294
295/* Slow path of a mpol destructor. */
296void __mpol_put(struct mempolicy *p)
297{
298	if (!atomic_dec_and_test(&p->refcnt))
299		return;
300	kmem_cache_free(policy_cache, p);
301}
302
303static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
304				enum mpol_rebind_step step)
305{
306}
307
/*
 * step:
 * 	MPOL_REBIND_ONCE  - do the rebind work in one pass
 * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
 */
314static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
315				 enum mpol_rebind_step step)
316{
317	nodemask_t tmp;
318
319	if (pol->flags & MPOL_F_STATIC_NODES)
320		nodes_and(tmp, pol->w.user_nodemask, *nodes);
321	else if (pol->flags & MPOL_F_RELATIVE_NODES)
322		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
323	else {
324		/*
325		 * if step == 1, we use ->w.cpuset_mems_allowed to cache the
326		 * result
327		 */
328		if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
329			nodes_remap(tmp, pol->v.nodes,
330					pol->w.cpuset_mems_allowed, *nodes);
331			pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
332		} else if (step == MPOL_REBIND_STEP2) {
333			tmp = pol->w.cpuset_mems_allowed;
334			pol->w.cpuset_mems_allowed = *nodes;
335		} else
336			BUG();
337	}
338
339	if (nodes_empty(tmp))
340		tmp = *nodes;
341
342	if (step == MPOL_REBIND_STEP1)
343		nodes_or(pol->v.nodes, pol->v.nodes, tmp);
344	else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
345		pol->v.nodes = tmp;
346	else
347		BUG();
348
349	if (!node_isset(current->il_next, tmp)) {
350		current->il_next = next_node(current->il_next, tmp);
351		if (current->il_next >= MAX_NUMNODES)
352			current->il_next = first_node(tmp);
353		if (current->il_next >= MAX_NUMNODES)
354			current->il_next = numa_node_id();
355	}
356}
357
358static void mpol_rebind_preferred(struct mempolicy *pol,
359				  const nodemask_t *nodes,
360				  enum mpol_rebind_step step)
361{
362	nodemask_t tmp;
363
364	if (pol->flags & MPOL_F_STATIC_NODES) {
365		int node = first_node(pol->w.user_nodemask);
366
367		if (node_isset(node, *nodes)) {
368			pol->v.preferred_node = node;
369			pol->flags &= ~MPOL_F_LOCAL;
370		} else
371			pol->flags |= MPOL_F_LOCAL;
372	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
373		mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
374		pol->v.preferred_node = first_node(tmp);
375	} else if (!(pol->flags & MPOL_F_LOCAL)) {
376		pol->v.preferred_node = node_remap(pol->v.preferred_node,
377						   pol->w.cpuset_mems_allowed,
378						   *nodes);
379		pol->w.cpuset_mems_allowed = *nodes;
380	}
381}
382
/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * If the read-side task has no lock to protect task->mempolicy, the
 * write-side task will rebind task->mempolicy in two steps.  The first
 * step sets all the newly allowed nodes, and the second step clears all
 * the now-disallowed nodes.  This way there is never a point where no
 * node is available to allocate a page from.
 * If we have a lock to protect task->mempolicy on the read side, we
 * rebind directly in one pass.
 *
 * step:
 * 	MPOL_REBIND_ONCE  - do the rebind work in one pass
 * 	MPOL_REBIND_STEP1 - set all the newly allowed nodes
 * 	MPOL_REBIND_STEP2 - clear all the disallowed nodes
 */
399static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
400				enum mpol_rebind_step step)
401{
402	if (!pol)
403		return;
404	if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
405	    nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
406		return;
407
408	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
409		return;
410
411	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
412		BUG();
413
414	if (step == MPOL_REBIND_STEP1)
415		pol->flags |= MPOL_F_REBINDING;
416	else if (step == MPOL_REBIND_STEP2)
417		pol->flags &= ~MPOL_F_REBINDING;
418	else if (step >= MPOL_REBIND_NSTEP)
419		BUG();
420
421	mpol_ops[pol->mode].rebind(pol, newmask, step);
422}
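
/*
 * Illustrative walk-through (the values are assumptions for the example):
 * a task with an MPOL_INTERLEAVE policy over nodes {0,1} whose cpuset moves
 * from mems_allowed {0,1} to {2,3}.  Without a read-side lock the update is
 * driven as
 *
 *	mpol_rebind_task(tsk, &newmems, MPOL_REBIND_STEP1);
 *	// v.nodes becomes {0,1,2,3}: old nodes plus their remapped targets
 *	mpol_rebind_task(tsk, &newmems, MPOL_REBIND_STEP2);
 *	// v.nodes shrinks to {2,3}: the disallowed nodes are cleared
 *
 * so an allocator racing with the rebind always sees a non-empty nodemask.
 */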
423
424/*
425 * Wrapper for mpol_rebind_policy() that just requires task
426 * pointer, and updates task mempolicy.
427 *
428 * Called with task's alloc_lock held.
429 */
430
431void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
432			enum mpol_rebind_step step)
433{
434	mpol_rebind_policy(tsk->mempolicy, new, step);
435}
436
437/*
438 * Rebind each vma in mm to new nodemask.
439 *
440 * Call holding a reference to mm.  Takes mm->mmap_sem during call.
441 */
442
443void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
444{
445	struct vm_area_struct *vma;
446
447	down_write(&mm->mmap_sem);
448	for (vma = mm->mmap; vma; vma = vma->vm_next)
449		mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
450	up_write(&mm->mmap_sem);
451}
452
453static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
454	[MPOL_DEFAULT] = {
455		.rebind = mpol_rebind_default,
456	},
457	[MPOL_INTERLEAVE] = {
458		.create = mpol_new_interleave,
459		.rebind = mpol_rebind_nodemask,
460	},
461	[MPOL_PREFERRED] = {
462		.create = mpol_new_preferred,
463		.rebind = mpol_rebind_preferred,
464	},
465	[MPOL_BIND] = {
466		.create = mpol_new_bind,
467		.rebind = mpol_rebind_nodemask,
468	},
469};
470
471static void migrate_page_add(struct page *page, struct list_head *pagelist,
472				unsigned long flags);
473
474struct queue_pages {
475	struct list_head *pagelist;
476	unsigned long flags;
477	nodemask_t *nmask;
478	struct vm_area_struct *prev;
479};
480
/*
 * Scan through the pages in a PTE range, checking whether each page
 * matches the nodemask/flags criteria, and add those that do to the
 * pagelist for migration.
 */
485static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
486			unsigned long end, struct mm_walk *walk)
487{
488	struct vm_area_struct *vma = walk->vma;
489	struct page *page;
490	struct queue_pages *qp = walk->private;
491	unsigned long flags = qp->flags;
492	int nid;
493	pte_t *pte;
494	spinlock_t *ptl;
495
496	split_huge_page_pmd(vma, addr, pmd);
497	if (pmd_trans_unstable(pmd))
498		return 0;
499
500	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
501	for (; addr != end; pte++, addr += PAGE_SIZE) {
502		if (!pte_present(*pte))
503			continue;
504		page = vm_normal_page(vma, addr, *pte);
505		if (!page)
506			continue;
507		/*
508		 * vm_normal_page() filters out zero pages, but there might
509		 * still be PageReserved pages to skip, perhaps in a VDSO.
510		 */
511		if (PageReserved(page))
512			continue;
513		nid = page_to_nid(page);
514		if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
515			continue;
516
517		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
518			migrate_page_add(page, qp->pagelist, flags);
519	}
520	pte_unmap_unlock(pte - 1, ptl);
521	cond_resched();
522	return 0;
523}
524
525static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
526			       unsigned long addr, unsigned long end,
527			       struct mm_walk *walk)
528{
529#ifdef CONFIG_HUGETLB_PAGE
530	struct queue_pages *qp = walk->private;
531	unsigned long flags = qp->flags;
532	int nid;
533	struct page *page;
534	spinlock_t *ptl;
535	pte_t entry;
536
537	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
538	entry = huge_ptep_get(pte);
539	if (!pte_present(entry))
540		goto unlock;
541	page = pte_page(entry);
542	nid = page_to_nid(page);
543	if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
544		goto unlock;
545	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
546	if (flags & (MPOL_MF_MOVE_ALL) ||
547	    (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
548		isolate_huge_page(page, qp->pagelist);
549unlock:
550	spin_unlock(ptl);
551#else
552	BUG();
553#endif
554	return 0;
555}
556
557#ifdef CONFIG_NUMA_BALANCING
/*
 * This is used to mark a range of virtual addresses as inaccessible.
 * The protections are later cleared by a NUMA hinting fault; depending
 * on these faults, pages may be migrated for better NUMA placement.
 *
 * This assumes that NUMA faults are handled using PROT_NONE.  If an
 * architecture makes a different choice, it will need further changes
 * to the core.
 */
567unsigned long change_prot_numa(struct vm_area_struct *vma,
568			unsigned long addr, unsigned long end)
569{
570	int nr_updated;
571
572	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
573	if (nr_updated)
574		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
575
576	return nr_updated;
577}
578#else
579static unsigned long change_prot_numa(struct vm_area_struct *vma,
580			unsigned long addr, unsigned long end)
581{
582	return 0;
583}
584#endif /* CONFIG_NUMA_BALANCING */
585
586static int queue_pages_test_walk(unsigned long start, unsigned long end,
587				struct mm_walk *walk)
588{
589	struct vm_area_struct *vma = walk->vma;
590	struct queue_pages *qp = walk->private;
591	unsigned long endvma = vma->vm_end;
592	unsigned long flags = qp->flags;
593
594	if (vma->vm_flags & VM_PFNMAP)
595		return 1;
596
597	if (endvma > end)
598		endvma = end;
599	if (vma->vm_start > start)
600		start = vma->vm_start;
601
602	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
603		if (!vma->vm_next && vma->vm_end < end)
604			return -EFAULT;
605		if (qp->prev && qp->prev->vm_end < vma->vm_start)
606			return -EFAULT;
607	}
608
609	qp->prev = vma;
610
614	if (flags & MPOL_MF_LAZY) {
615		/* Similar to task_numa_work, skip inaccessible VMAs */
616		if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
617			change_prot_numa(vma, start, endvma);
618		return 1;
619	}
620
621	if ((flags & MPOL_MF_STRICT) ||
622	    ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
623	     vma_migratable(vma)))
624		/* queue pages from current vma */
625		return 0;
626	return 1;
627}
628
/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are on the set of nodes determined by
 * @nodes and @flags, they are isolated and queued to the pagelist passed
 * via @pagelist.
 */
636static int
637queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
638		nodemask_t *nodes, unsigned long flags,
639		struct list_head *pagelist)
640{
641	struct queue_pages qp = {
642		.pagelist = pagelist,
643		.flags = flags,
644		.nmask = nodes,
645		.prev = NULL,
646	};
647	struct mm_walk queue_pages_walk = {
648		.hugetlb_entry = queue_pages_hugetlb,
649		.pmd_entry = queue_pages_pte_range,
650		.test_walk = queue_pages_test_walk,
651		.mm = mm,
652		.private = &qp,
653	};
654
655	return walk_page_range(start, end, &queue_pages_walk);
656}
657
658/*
659 * Apply policy to a single VMA
660 * This must be called with the mmap_sem held for writing.
661 */
662static int vma_replace_policy(struct vm_area_struct *vma,
663						struct mempolicy *pol)
664{
665	int err;
666	struct mempolicy *old;
667	struct mempolicy *new;
668
669	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
670		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
671		 vma->vm_ops, vma->vm_file,
672		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
673
674	new = mpol_dup(pol);
675	if (IS_ERR(new))
676		return PTR_ERR(new);
677
678	if (vma->vm_ops && vma->vm_ops->set_policy) {
679		err = vma->vm_ops->set_policy(vma, new);
680		if (err)
681			goto err_out;
682	}
683
684	old = vma->vm_policy;
685	vma->vm_policy = new; /* protected by mmap_sem */
686	mpol_put(old);
687
688	return 0;
689 err_out:
690	mpol_put(new);
691	return err;
692}
693
694/* Step 2: apply policy to a range and do splits. */
695static int mbind_range(struct mm_struct *mm, unsigned long start,
696		       unsigned long end, struct mempolicy *new_pol)
697{
698	struct vm_area_struct *next;
699	struct vm_area_struct *prev;
700	struct vm_area_struct *vma;
701	int err = 0;
702	pgoff_t pgoff;
703	unsigned long vmstart;
704	unsigned long vmend;
705
706	vma = find_vma(mm, start);
707	if (!vma || vma->vm_start > start)
708		return -EFAULT;
709
710	prev = vma->vm_prev;
711	if (start > vma->vm_start)
712		prev = vma;
713
714	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
715		next = vma->vm_next;
716		vmstart = max(start, vma->vm_start);
717		vmend   = min(end, vma->vm_end);
718
719		if (mpol_equal(vma_policy(vma), new_pol))
720			continue;
721
722		pgoff = vma->vm_pgoff +
723			((vmstart - vma->vm_start) >> PAGE_SHIFT);
724		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
725				  vma->anon_vma, vma->vm_file, pgoff,
726				  new_pol);
727		if (prev) {
728			vma = prev;
729			next = vma->vm_next;
730			if (mpol_equal(vma_policy(vma), new_pol))
731				continue;
732			/* vma_merge() joined vma && vma->next, case 8 */
733			goto replace;
734		}
735		if (vma->vm_start != vmstart) {
736			err = split_vma(vma->vm_mm, vma, vmstart, 1);
737			if (err)
738				goto out;
739		}
740		if (vma->vm_end != vmend) {
741			err = split_vma(vma->vm_mm, vma, vmend, 0);
742			if (err)
743				goto out;
744		}
745 replace:
746		err = vma_replace_policy(vma, new_pol);
747		if (err)
748			goto out;
749	}
750
751 out:
752	return err;
753}
754
755/* Set the process memory policy */
756static long do_set_mempolicy(unsigned short mode, unsigned short flags,
757			     nodemask_t *nodes)
758{
759	struct mempolicy *new, *old;
760	NODEMASK_SCRATCH(scratch);
761	int ret;
762
763	if (!scratch)
764		return -ENOMEM;
765
766	new = mpol_new(mode, flags, nodes);
767	if (IS_ERR(new)) {
768		ret = PTR_ERR(new);
769		goto out;
770	}
771
772	task_lock(current);
773	ret = mpol_set_nodemask(new, nodes, scratch);
774	if (ret) {
775		task_unlock(current);
776		mpol_put(new);
777		goto out;
778	}
779	old = current->mempolicy;
780	current->mempolicy = new;
781	if (new && new->mode == MPOL_INTERLEAVE &&
782	    nodes_weight(new->v.nodes))
783		current->il_next = first_node(new->v.nodes);
784	task_unlock(current);
785	mpol_put(old);
786	ret = 0;
787out:
788	NODEMASK_SCRATCH_FREE(scratch);
789	return ret;
790}
791
792/*
793 * Return nodemask for policy for get_mempolicy() query
794 *
795 * Called with task's alloc_lock held
796 */
797static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
798{
799	nodes_clear(*nodes);
800	if (p == &default_policy)
801		return;
802
803	switch (p->mode) {
804	case MPOL_BIND:
805		/* Fall through */
806	case MPOL_INTERLEAVE:
807		*nodes = p->v.nodes;
808		break;
809	case MPOL_PREFERRED:
810		if (!(p->flags & MPOL_F_LOCAL))
811			node_set(p->v.preferred_node, *nodes);
812		/* else return empty node mask for local allocation */
813		break;
814	default:
815		BUG();
816	}
817}
818
819static int lookup_node(struct mm_struct *mm, unsigned long addr)
820{
821	struct page *p;
822	int err;
823
824	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
825	if (err >= 0) {
826		err = page_to_nid(p);
827		put_page(p);
828	}
829	return err;
830}
831
832/* Retrieve NUMA policy */
833static long do_get_mempolicy(int *policy, nodemask_t *nmask,
834			     unsigned long addr, unsigned long flags)
835{
836	int err;
837	struct mm_struct *mm = current->mm;
838	struct vm_area_struct *vma = NULL;
839	struct mempolicy *pol = current->mempolicy;
840
841	if (flags &
842		~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
843		return -EINVAL;
844
845	if (flags & MPOL_F_MEMS_ALLOWED) {
846		if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
847			return -EINVAL;
848		*policy = 0;	/* just so it's initialized */
849		task_lock(current);
850		*nmask  = cpuset_current_mems_allowed;
851		task_unlock(current);
852		return 0;
853	}
854
855	if (flags & MPOL_F_ADDR) {
856		/*
857		 * Do NOT fall back to task policy if the
858		 * vma/shared policy at addr is NULL.  We
859		 * want to return MPOL_DEFAULT in this case.
860		 */
861		down_read(&mm->mmap_sem);
862		vma = find_vma_intersection(mm, addr, addr+1);
863		if (!vma) {
864			up_read(&mm->mmap_sem);
865			return -EFAULT;
866		}
867		if (vma->vm_ops && vma->vm_ops->get_policy)
868			pol = vma->vm_ops->get_policy(vma, addr);
869		else
870			pol = vma->vm_policy;
871	} else if (addr)
872		return -EINVAL;
873
874	if (!pol)
875		pol = &default_policy;	/* indicates default behavior */
876
877	if (flags & MPOL_F_NODE) {
878		if (flags & MPOL_F_ADDR) {
879			err = lookup_node(mm, addr);
880			if (err < 0)
881				goto out;
882			*policy = err;
883		} else if (pol == current->mempolicy &&
884				pol->mode == MPOL_INTERLEAVE) {
885			*policy = current->il_next;
886		} else {
887			err = -EINVAL;
888			goto out;
889		}
890	} else {
891		*policy = pol == &default_policy ? MPOL_DEFAULT :
892						pol->mode;
893		/*
894		 * Internal mempolicy flags must be masked off before exposing
895		 * the policy to userspace.
896		 */
897		*policy |= (pol->flags & MPOL_MODE_FLAGS);
898	}
899
900	if (vma) {
901		up_read(&current->mm->mmap_sem);
902		vma = NULL;
903	}
904
905	err = 0;
906	if (nmask) {
907		if (mpol_store_user_nodemask(pol)) {
908			*nmask = pol->w.user_nodemask;
909		} else {
910			task_lock(current);
911			get_policy_nodemask(pol, nmask);
912			task_unlock(current);
913		}
914	}
915
916 out:
917	mpol_cond_put(pol);
918	if (vma)
919		up_read(&current->mm->mmap_sem);
920	return err;
921}
922
923#ifdef CONFIG_MIGRATION
924/*
925 * page migration
926 */
927static void migrate_page_add(struct page *page, struct list_head *pagelist,
928				unsigned long flags)
929{
930	/*
931	 * Avoid migrating a page that is shared with others.
932	 */
933	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
934		if (!isolate_lru_page(page)) {
935			list_add_tail(&page->lru, pagelist);
936			inc_zone_page_state(page, NR_ISOLATED_ANON +
937					    page_is_file_cache(page));
938		}
939	}
940}
941
942static struct page *new_node_page(struct page *page, unsigned long node, int **x)
943{
944	if (PageHuge(page))
945		return alloc_huge_page_node(page_hstate(compound_head(page)),
946					node);
947	else
948		return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE |
949						    __GFP_THISNODE, 0);
950}
951
952/*
953 * Migrate pages from one node to a target node.
954 * Returns error or the number of pages not migrated.
955 */
956static int migrate_to_node(struct mm_struct *mm, int source, int dest,
957			   int flags)
958{
959	nodemask_t nmask;
960	LIST_HEAD(pagelist);
961	int err = 0;
962
963	nodes_clear(nmask);
964	node_set(source, nmask);
965
966	/*
967	 * This does not "check" the range but isolates all pages that
968	 * need migration.  Between passing in the full user address
969	 * space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
970	 */
971	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
972	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
973			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
974
975	if (!list_empty(&pagelist)) {
976		err = migrate_pages(&pagelist, new_node_page, NULL, dest,
977					MIGRATE_SYNC, MR_SYSCALL);
978		if (err)
979			putback_movable_pages(&pagelist);
980	}
981
982	return err;
983}
984
985/*
986 * Move pages between the two nodesets so as to preserve the physical
987 * layout as much as possible.
988 *
 * Returns the number of pages that could not be moved.
990 */
991int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
992		     const nodemask_t *to, int flags)
993{
994	int busy = 0;
995	int err;
996	nodemask_t tmp;
997
998	err = migrate_prep();
999	if (err)
1000		return err;
1001
1002	down_read(&mm->mmap_sem);
1003
1004	/*
1005	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
1006	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
1007	 * bit in 'tmp', and return that <source, dest> pair for migration.
1008	 * The pair of nodemasks 'to' and 'from' define the map.
1009	 *
1010	 * If no pair of bits is found that way, fallback to picking some
1011	 * pair of 'source' and 'dest' bits that are not the same.  If the
1012	 * 'source' and 'dest' bits are the same, this represents a node
1013	 * that will be migrating to itself, so no pages need move.
1014	 *
1015	 * If no bits are left in 'tmp', or if all remaining bits left
1016	 * in 'tmp' correspond to the same bit in 'to', return false
1017	 * (nothing left to migrate).
1018	 *
1019	 * This lets us pick a pair of nodes to migrate between, such that
1020	 * if possible the dest node is not already occupied by some other
1021	 * source node, minimizing the risk of overloading the memory on a
	 * node that would happen if we migrated incoming memory to a node
	 * before migrating the outgoing memory off that same node.
1024	 *
1025	 * A single scan of tmp is sufficient.  As we go, we remember the
1026	 * most recent <s, d> pair that moved (s != d).  If we find a pair
1027	 * that not only moved, but what's better, moved to an empty slot
1028	 * (d is not set in tmp), then we break out then, with that pair.
1029	 * Otherwise when we finish scanning from_tmp, we at least have the
1030	 * most recent <s, d> pair that moved.  If we get all the way through
1031	 * the scan of tmp without finding any node that moved, much less
1032	 * moved to an empty node, then there is nothing left worth migrating.
1033	 */
1034
1035	tmp = *from;
1036	while (!nodes_empty(tmp)) {
1037		int s,d;
1038		int source = NUMA_NO_NODE;
1039		int dest = 0;
1040
1041		for_each_node_mask(s, tmp) {
1042
			/*
			 * do_migrate_pages() tries to maintain the relative
			 * node relationship of the pages established between
			 * threads and memory areas.
			 *
			 * However, if the number of source nodes is not equal
			 * to the number of destination nodes we cannot
			 * preserve this node relative relationship.  In that
			 * case, skip copying memory from a node that is in
			 * the destination mask.
			 *
			 * Example: [2,3,4] -> [3,4,5] moves everything.
			 *          [0-7]   -> [3,4,5] moves only 0,1,2,6,7.
			 */
1057
1058			if ((nodes_weight(*from) != nodes_weight(*to)) &&
1059						(node_isset(s, *to)))
1060				continue;
1061
1062			d = node_remap(s, *from, *to);
1063			if (s == d)
1064				continue;
1065
1066			source = s;	/* Node moved. Memorize */
1067			dest = d;
1068
1069			/* dest not in remaining from nodes? */
1070			if (!node_isset(dest, tmp))
1071				break;
1072		}
1073		if (source == NUMA_NO_NODE)
1074			break;
1075
1076		node_clear(source, tmp);
1077		err = migrate_to_node(mm, source, dest, flags);
1078		if (err > 0)
1079			busy += err;
1080		if (err < 0)
1081			break;
1082	}
1083	up_read(&mm->mmap_sem);
1084	if (err < 0)
1085		return err;
1086	return busy;
1087
1088}
1089
1090/*
1091 * Allocate a new page for page migration based on vma policy.
1092 * Start by assuming the page is mapped by the same vma as contains @start.
1093 * Search forward from there, if not.  N.B., this assumes that the
1094 * list of pages handed to migrate_pages()--which is how we get here--
1095 * is in virtual address order.
1096 */
1097static struct page *new_page(struct page *page, unsigned long start, int **x)
1098{
1099	struct vm_area_struct *vma;
1100	unsigned long uninitialized_var(address);
1101
1102	vma = find_vma(current->mm, start);
1103	while (vma) {
1104		address = page_address_in_vma(page, vma);
1105		if (address != -EFAULT)
1106			break;
1107		vma = vma->vm_next;
1108	}
1109
1110	if (PageHuge(page)) {
1111		BUG_ON(!vma);
1112		return alloc_huge_page_noerr(vma, address, 1);
1113	}
1114	/*
1115	 * if !vma, alloc_page_vma() will use task or system default policy
1116	 */
1117	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1118}
1119#else
1120
1121static void migrate_page_add(struct page *page, struct list_head *pagelist,
1122				unsigned long flags)
1123{
1124}
1125
1126int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1127		     const nodemask_t *to, int flags)
1128{
1129	return -ENOSYS;
1130}
1131
1132static struct page *new_page(struct page *page, unsigned long start, int **x)
1133{
1134	return NULL;
1135}
1136#endif
1137
1138static long do_mbind(unsigned long start, unsigned long len,
1139		     unsigned short mode, unsigned short mode_flags,
1140		     nodemask_t *nmask, unsigned long flags)
1141{
1142	struct mm_struct *mm = current->mm;
1143	struct mempolicy *new;
1144	unsigned long end;
1145	int err;
1146	LIST_HEAD(pagelist);
1147
1148	if (flags & ~(unsigned long)MPOL_MF_VALID)
1149		return -EINVAL;
1150	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
1151		return -EPERM;
1152
1153	if (start & ~PAGE_MASK)
1154		return -EINVAL;
1155
1156	if (mode == MPOL_DEFAULT)
1157		flags &= ~MPOL_MF_STRICT;
1158
1159	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
1160	end = start + len;
1161
1162	if (end < start)
1163		return -EINVAL;
1164	if (end == start)
1165		return 0;
1166
1167	new = mpol_new(mode, mode_flags, nmask);
1168	if (IS_ERR(new))
1169		return PTR_ERR(new);
1170
1171	if (flags & MPOL_MF_LAZY)
1172		new->flags |= MPOL_F_MOF;
1173
	/*
	 * If we are using the default policy then operation
	 * on discontiguous address spaces is okay after all.
	 */
1178	if (!new)
1179		flags |= MPOL_MF_DISCONTIG_OK;
1180
1181	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1182		 start, start + len, mode, mode_flags,
1183		 nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
1184
1185	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1186
1187		err = migrate_prep();
1188		if (err)
1189			goto mpol_out;
1190	}
1191	{
1192		NODEMASK_SCRATCH(scratch);
1193		if (scratch) {
1194			down_write(&mm->mmap_sem);
1195			task_lock(current);
1196			err = mpol_set_nodemask(new, nmask, scratch);
1197			task_unlock(current);
1198			if (err)
1199				up_write(&mm->mmap_sem);
1200		} else
1201			err = -ENOMEM;
1202		NODEMASK_SCRATCH_FREE(scratch);
1203	}
1204	if (err)
1205		goto mpol_out;
1206
1207	err = queue_pages_range(mm, start, end, nmask,
1208			  flags | MPOL_MF_INVERT, &pagelist);
1209	if (!err)
1210		err = mbind_range(mm, start, end, new);
1211
1212	if (!err) {
1213		int nr_failed = 0;
1214
1215		if (!list_empty(&pagelist)) {
1216			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
1217			nr_failed = migrate_pages(&pagelist, new_page, NULL,
1218				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
1219			if (nr_failed)
1220				putback_movable_pages(&pagelist);
1221		}
1222
1223		if (nr_failed && (flags & MPOL_MF_STRICT))
1224			err = -EIO;
1225	} else
1226		putback_movable_pages(&pagelist);
1227
1228	up_write(&mm->mmap_sem);
1229 mpol_out:
1230	mpol_put(new);
1231	return err;
1232}
1233
1234/*
1235 * User space interface with variable sized bitmaps for nodelists.
1236 */
1237
1238/* Copy a node mask from user space. */
1239static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1240		     unsigned long maxnode)
1241{
1242	unsigned long k;
1243	unsigned long nlongs;
1244	unsigned long endmask;
1245
1246	--maxnode;
1247	nodes_clear(*nodes);
1248	if (maxnode == 0 || !nmask)
1249		return 0;
1250	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1251		return -EINVAL;
1252
1253	nlongs = BITS_TO_LONGS(maxnode);
1254	if ((maxnode % BITS_PER_LONG) == 0)
1255		endmask = ~0UL;
1256	else
1257		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1258
	/* When the user specified more nodes than supported just check
	   that the unsupported part is all zero. */
1261	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1262		if (nlongs > PAGE_SIZE/sizeof(long))
1263			return -EINVAL;
1264		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1265			unsigned long t;
1266			if (get_user(t, nmask + k))
1267				return -EFAULT;
1268			if (k == nlongs - 1) {
1269				if (t & endmask)
1270					return -EINVAL;
1271			} else if (t)
1272				return -EINVAL;
1273		}
1274		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1275		endmask = ~0UL;
1276	}
1277
1278	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1279		return -EFAULT;
1280	nodes_addr(*nodes)[nlongs-1] &= endmask;
1281	return 0;
1282}
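
/*
 * Illustrative arithmetic (assuming 64-bit longs): a caller passing
 * maxnode == 65 yields maxnode == 64 after the decrement, so nlongs == 1
 * and endmask == ~0UL and one full long is copied from user space.  With
 * maxnode == 17, endmask == (1UL << 16) - 1, so any bits above node 15 in
 * the copied word are cleared.
 */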
1283
1284/* Copy a kernel node mask to user space */
1285static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1286			      nodemask_t *nodes)
1287{
1288	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1289	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1290
1291	if (copy > nbytes) {
1292		if (copy > PAGE_SIZE)
1293			return -EINVAL;
1294		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1295			return -EFAULT;
1296		copy = nbytes;
1297	}
1298	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1299}
1300
1301SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1302		unsigned long, mode, const unsigned long __user *, nmask,
1303		unsigned long, maxnode, unsigned, flags)
1304{
1305	nodemask_t nodes;
1306	int err;
1307	unsigned short mode_flags;
1308
1309	mode_flags = mode & MPOL_MODE_FLAGS;
1310	mode &= ~MPOL_MODE_FLAGS;
1311	if (mode >= MPOL_MAX)
1312		return -EINVAL;
1313	if ((mode_flags & MPOL_F_STATIC_NODES) &&
1314	    (mode_flags & MPOL_F_RELATIVE_NODES))
1315		return -EINVAL;
1316	err = get_nodes(&nodes, nmask, maxnode);
1317	if (err)
1318		return err;
1319	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1320}
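
/*
 * Illustrative userspace usage (not part of this file): binding a freshly
 * mapped region to node 0 via the raw mbind() syscall.  The mmap() length
 * and flags below are assumptions chosen for the example.
 *
 *	#include <numaif.h>
 *	#include <sys/mman.h>
 *
 *	size_t len = 4UL << 20;
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	unsigned long mask = 1UL << 0;
 *	if (mbind(p, len, MPOL_BIND, &mask, sizeof(mask) * 8 + 1,
 *		  MPOL_MF_STRICT | MPOL_MF_MOVE))
 *		perror("mbind");
 */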
1321
1322/* Set the process memory policy */
1323SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
1324		unsigned long, maxnode)
1325{
1326	int err;
1327	nodemask_t nodes;
1328	unsigned short flags;
1329
1330	flags = mode & MPOL_MODE_FLAGS;
1331	mode &= ~MPOL_MODE_FLAGS;
1332	if ((unsigned int)mode >= MPOL_MAX)
1333		return -EINVAL;
1334	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1335		return -EINVAL;
1336	err = get_nodes(&nodes, nmask, maxnode);
1337	if (err)
1338		return err;
1339	return do_set_mempolicy(mode, flags, &nodes);
1340}
1341
1342SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1343		const unsigned long __user *, old_nodes,
1344		const unsigned long __user *, new_nodes)
1345{
1346	const struct cred *cred = current_cred(), *tcred;
1347	struct mm_struct *mm = NULL;
1348	struct task_struct *task;
1349	nodemask_t task_nodes;
1350	int err;
1351	nodemask_t *old;
1352	nodemask_t *new;
1353	NODEMASK_SCRATCH(scratch);
1354
1355	if (!scratch)
1356		return -ENOMEM;
1357
1358	old = &scratch->mask1;
1359	new = &scratch->mask2;
1360
1361	err = get_nodes(old, old_nodes, maxnode);
1362	if (err)
1363		goto out;
1364
1365	err = get_nodes(new, new_nodes, maxnode);
1366	if (err)
1367		goto out;
1368
1369	/* Find the mm_struct */
1370	rcu_read_lock();
1371	task = pid ? find_task_by_vpid(pid) : current;
1372	if (!task) {
1373		rcu_read_unlock();
1374		err = -ESRCH;
1375		goto out;
1376	}
1377	get_task_struct(task);
1378
1379	err = -EINVAL;
1380
1381	/*
1382	 * Check if this process has the right to modify the specified
1383	 * process. The right exists if the process has administrative
1384	 * capabilities, superuser privileges or the same
1385	 * userid as the target process.
1386	 */
1387	tcred = __task_cred(task);
1388	if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1389	    !uid_eq(cred->uid,  tcred->suid) && !uid_eq(cred->uid,  tcred->uid) &&
1390	    !capable(CAP_SYS_NICE)) {
1391		rcu_read_unlock();
1392		err = -EPERM;
1393		goto out_put;
1394	}
1395	rcu_read_unlock();
1396
1397	task_nodes = cpuset_mems_allowed(task);
1398	/* Is the user allowed to access the target nodes? */
1399	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1400		err = -EPERM;
1401		goto out_put;
1402	}
1403
1404	if (!nodes_subset(*new, node_states[N_MEMORY])) {
1405		err = -EINVAL;
1406		goto out_put;
1407	}
1408
1409	err = security_task_movememory(task);
1410	if (err)
1411		goto out_put;
1412
1413	mm = get_task_mm(task);
1414	put_task_struct(task);
1415
1416	if (!mm) {
1417		err = -EINVAL;
1418		goto out;
1419	}
1420
1421	err = do_migrate_pages(mm, old, new,
1422		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1423
1424	mmput(mm);
1425out:
1426	NODEMASK_SCRATCH_FREE(scratch);
1427
1428	return err;
1429
1430out_put:
1431	put_task_struct(task);
1432	goto out;
1433
1434}
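
/*
 * Illustrative userspace usage (not part of this file): asking the kernel
 * to move the calling process' pages from node 0 to node 1.  The node
 * numbers are assumptions chosen for the example; pid 0 means "current".
 *
 *	#include <numaif.h>
 *
 *	unsigned long from = 1UL << 0, to = 1UL << 1;
 *	long ret = migrate_pages(0, sizeof(from) * 8 + 1, &from, &to);
 *	if (ret < 0)
 *		perror("migrate_pages");
 */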
1435
1436
1437/* Retrieve NUMA policy */
1438SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1439		unsigned long __user *, nmask, unsigned long, maxnode,
1440		unsigned long, addr, unsigned long, flags)
1441{
1442	int err;
1443	int uninitialized_var(pval);
1444	nodemask_t nodes;
1445
1446	if (nmask != NULL && maxnode < MAX_NUMNODES)
1447		return -EINVAL;
1448
1449	err = do_get_mempolicy(&pval, &nodes, addr, flags);
1450
1451	if (err)
1452		return err;
1453
1454	if (policy && put_user(pval, policy))
1455		return -EFAULT;
1456
1457	if (nmask)
1458		err = copy_nodes_to_user(nmask, maxnode, &nodes);
1459
1460	return err;
1461}
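
/*
 * Illustrative userspace usage (not part of this file): querying which
 * policy governs an address "p" in the caller's address space.  "p" and
 * the 1024-node buffer size are assumptions for the example; when a
 * nodemask is requested, maxnode must be at least the kernel's
 * MAX_NUMNODES.
 *
 *	#include <numaif.h>
 *	#include <stdio.h>
 *
 *	int mode;
 *	unsigned long mask[16] = { 0 };		// room for 1024 nodes
 *	if (get_mempolicy(&mode, mask, sizeof(mask) * 8, p,
 *			  MPOL_F_ADDR) == 0)
 *		printf("mode %d, first nodemask word %lx\n", mode, mask[0]);
 */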
1462
1463#ifdef CONFIG_COMPAT
1464
1465COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1466		       compat_ulong_t __user *, nmask,
1467		       compat_ulong_t, maxnode,
1468		       compat_ulong_t, addr, compat_ulong_t, flags)
1469{
1470	long err;
1471	unsigned long __user *nm = NULL;
1472	unsigned long nr_bits, alloc_size;
1473	DECLARE_BITMAP(bm, MAX_NUMNODES);
1474
1475	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1476	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1477
1478	if (nmask)
1479		nm = compat_alloc_user_space(alloc_size);
1480
1481	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1482
1483	if (!err && nmask) {
1484		unsigned long copy_size;
1485		copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1486		err = copy_from_user(bm, nm, copy_size);
1487		/* ensure entire bitmap is zeroed */
1488		err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1489		err |= compat_put_bitmap(nmask, bm, nr_bits);
1490	}
1491
1492	return err;
1493}
1494
1495COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
1496		       compat_ulong_t, maxnode)
1497{
1498	long err = 0;
1499	unsigned long __user *nm = NULL;
1500	unsigned long nr_bits, alloc_size;
1501	DECLARE_BITMAP(bm, MAX_NUMNODES);
1502
1503	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1504	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1505
1506	if (nmask) {
1507		err = compat_get_bitmap(bm, nmask, nr_bits);
1508		nm = compat_alloc_user_space(alloc_size);
1509		err |= copy_to_user(nm, bm, alloc_size);
1510	}
1511
1512	if (err)
1513		return -EFAULT;
1514
1515	return sys_set_mempolicy(mode, nm, nr_bits+1);
1516}
1517
1518COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
1519		       compat_ulong_t, mode, compat_ulong_t __user *, nmask,
1520		       compat_ulong_t, maxnode, compat_ulong_t, flags)
1521{
1522	long err = 0;
1523	unsigned long __user *nm = NULL;
1524	unsigned long nr_bits, alloc_size;
1525	nodemask_t bm;
1526
1527	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1528	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1529
1530	if (nmask) {
1531		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1532		nm = compat_alloc_user_space(alloc_size);
1533		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1534	}
1535
1536	if (err)
1537		return -EFAULT;
1538
1539	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1540}
1541
1542#endif
1543
1544struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
1545						unsigned long addr)
1546{
1547	struct mempolicy *pol = NULL;
1548
1549	if (vma) {
1550		if (vma->vm_ops && vma->vm_ops->get_policy) {
1551			pol = vma->vm_ops->get_policy(vma, addr);
1552		} else if (vma->vm_policy) {
1553			pol = vma->vm_policy;
1554
1555			/*
1556			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
1557			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
1558			 * count on these policies which will be dropped by
1559			 * mpol_cond_put() later
1560			 */
1561			if (mpol_needs_cond_ref(pol))
1562				mpol_get(pol);
1563		}
1564	}
1565
1566	return pol;
1567}
1568
1569/*
1570 * get_vma_policy(@vma, @addr)
1571 * @vma: virtual memory area whose policy is sought
1572 * @addr: address in @vma for shared policy lookup
1573 *
1574 * Returns effective policy for a VMA at specified address.
1575 * Falls back to current->mempolicy or system default policy, as necessary.
1576 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1577 * count--added by the get_policy() vm_op, as appropriate--to protect against
1578 * freeing by another task.  It is the caller's responsibility to free the
1579 * extra reference for shared policies.
1580 */
1581static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
1582						unsigned long addr)
1583{
1584	struct mempolicy *pol = __get_vma_policy(vma, addr);
1585
1586	if (!pol)
1587		pol = get_task_policy(current);
1588
1589	return pol;
1590}
1591
1592bool vma_policy_mof(struct vm_area_struct *vma)
1593{
1594	struct mempolicy *pol;
1595
1596	if (vma->vm_ops && vma->vm_ops->get_policy) {
1597		bool ret = false;
1598
1599		pol = vma->vm_ops->get_policy(vma, vma->vm_start);
1600		if (pol && (pol->flags & MPOL_F_MOF))
1601			ret = true;
1602		mpol_cond_put(pol);
1603
1604		return ret;
1605	}
1606
1607	pol = vma->vm_policy;
1608	if (!pol)
1609		pol = get_task_policy(current);
1610
1611	return pol->flags & MPOL_F_MOF;
1612}
1613
1614static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
1615{
1616	enum zone_type dynamic_policy_zone = policy_zone;
1617
1618	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
1619
	/*
	 * If policy->v.nodes has movable memory only, we apply policy
	 * only when gfp_zone(gfp) == ZONE_MOVABLE.
	 *
	 * policy->v.nodes has already been intersected with
	 * node_states[N_MEMORY], so if the following test fails, it
	 * implies policy->v.nodes contains movable memory only.
	 */
1628	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
1629		dynamic_policy_zone = ZONE_MOVABLE;
1630
1631	return zone >= dynamic_policy_zone;
1632}
1633
1634/*
1635 * Return a nodemask representing a mempolicy for filtering nodes for
1636 * page allocation
1637 */
1638static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1639{
1640	/* Lower zones don't get a nodemask applied for MPOL_BIND */
1641	if (unlikely(policy->mode == MPOL_BIND) &&
1642			apply_policy_zone(policy, gfp_zone(gfp)) &&
1643			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1644		return &policy->v.nodes;
1645
1646	return NULL;
1647}
1648
1649/* Return a zonelist indicated by gfp for node representing a mempolicy */
1650static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1651	int nd)
1652{
1653	switch (policy->mode) {
1654	case MPOL_PREFERRED:
1655		if (!(policy->flags & MPOL_F_LOCAL))
1656			nd = policy->v.preferred_node;
1657		break;
1658	case MPOL_BIND:
1659		/*
1660		 * Normally, MPOL_BIND allocations are node-local within the
1661		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
1662		 * current node isn't part of the mask, we use the zonelist for
1663		 * the first node in the mask instead.
1664		 */
1665		if (unlikely(gfp & __GFP_THISNODE) &&
1666				unlikely(!node_isset(nd, policy->v.nodes)))
1667			nd = first_node(policy->v.nodes);
1668		break;
1669	default:
1670		BUG();
1671	}
1672	return node_zonelist(nd, gfp);
1673}
1674
1675/* Do dynamic interleaving for a process */
1676static unsigned interleave_nodes(struct mempolicy *policy)
1677{
1678	unsigned nid, next;
1679	struct task_struct *me = current;
1680
1681	nid = me->il_next;
1682	next = next_node(nid, policy->v.nodes);
1683	if (next >= MAX_NUMNODES)
1684		next = first_node(policy->v.nodes);
1685	if (next < MAX_NUMNODES)
1686		me->il_next = next;
1687	return nid;
1688}
1689
1690/*
1691 * Depending on the memory policy provide a node from which to allocate the
1692 * next slab entry.
1693 */
1694unsigned int mempolicy_slab_node(void)
1695{
1696	struct mempolicy *policy;
1697	int node = numa_mem_id();
1698
1699	if (in_interrupt())
1700		return node;
1701
1702	policy = current->mempolicy;
1703	if (!policy || policy->flags & MPOL_F_LOCAL)
1704		return node;
1705
1706	switch (policy->mode) {
1707	case MPOL_PREFERRED:
1708		/*
1709		 * handled MPOL_F_LOCAL above
1710		 */
1711		return policy->v.preferred_node;
1712
1713	case MPOL_INTERLEAVE:
1714		return interleave_nodes(policy);
1715
1716	case MPOL_BIND: {
1717		/*
1718		 * Follow bind policy behavior and start allocation at the
1719		 * first node.
1720		 */
1721		struct zonelist *zonelist;
1722		struct zone *zone;
1723		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1724		zonelist = &NODE_DATA(node)->node_zonelists[0];
1725		(void)first_zones_zonelist(zonelist, highest_zoneidx,
1726							&policy->v.nodes,
1727							&zone);
1728		return zone ? zone->node : node;
1729	}
1730
1731	default:
1732		BUG();
1733	}
1734}
1735
1736/* Do static interleaving for a VMA with known offset. */
1737static unsigned offset_il_node(struct mempolicy *pol,
1738		struct vm_area_struct *vma, unsigned long off)
1739{
1740	unsigned nnodes = nodes_weight(pol->v.nodes);
1741	unsigned target;
1742	int c;
1743	int nid = NUMA_NO_NODE;
1744
1745	if (!nnodes)
1746		return numa_node_id();
1747	target = (unsigned int)off % nnodes;
1748	c = 0;
1749	do {
1750		nid = next_node(nid, pol->v.nodes);
1751		c++;
1752	} while (c <= target);
1753	return nid;
1754}
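
/*
 * Worked example (illustrative): with pol->v.nodes == {1,3,5} and off == 7,
 * nnodes == 3 and target == 7 % 3 == 1, so the do/while loop stops on the
 * second set node and the page lands on node 3.
 */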
1755
1756/* Determine a node number for interleave */
1757static inline unsigned interleave_nid(struct mempolicy *pol,
1758		 struct vm_area_struct *vma, unsigned long addr, int shift)
1759{
1760	if (vma) {
1761		unsigned long off;
1762
1763		/*
1764		 * for small pages, there is no difference between
1765		 * shift and PAGE_SHIFT, so the bit-shift is safe.
1766		 * for huge pages, since vm_pgoff is in units of small
1767		 * pages, we need to shift off the always 0 bits to get
1768		 * a useful offset.
1769		 */
1770		BUG_ON(shift < PAGE_SHIFT);
1771		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1772		off += (addr - vma->vm_start) >> shift;
1773		return offset_il_node(pol, vma, off);
1774	} else
1775		return interleave_nodes(pol);
1776}
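
/*
 * Worked example (illustrative): for a 2MB huge page with PAGE_SHIFT == 12
 * and shift == 21, the low 9 bits of vm_pgoff are always zero, so
 * off == (vm_pgoff >> 9) + ((addr - vm_start) >> 21) counts whole huge
 * pages from the start of the backing object before offset_il_node()
 * takes it modulo the number of interleave nodes.
 */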
1777
1778/*
1779 * Return the bit number of a random bit set in the nodemask.
1780 * (returns NUMA_NO_NODE if nodemask is empty)
1781 */
1782int node_random(const nodemask_t *maskp)
1783{
1784	int w, bit = NUMA_NO_NODE;
1785
1786	w = nodes_weight(*maskp);
1787	if (w)
1788		bit = bitmap_ord_to_pos(maskp->bits,
1789			get_random_int() % w, MAX_NUMNODES);
1790	return bit;
1791}
1792
1793#ifdef CONFIG_HUGETLBFS
1794/*
1795 * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1796 * @vma: virtual memory area whose policy is sought
1797 * @addr: address in @vma for shared policy lookup and interleave policy
1798 * @gfp_flags: for requested zone
1799 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
1800 * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
1801 *
1802 * Returns a zonelist suitable for a huge page allocation and a pointer
1803 * to the struct mempolicy for conditional unref after allocation.
 * If the effective policy is 'BIND', returns a pointer to the mempolicy's
1805 * @nodemask for filtering the zonelist.
1806 *
1807 * Must be protected by read_mems_allowed_begin()
1808 */
1809struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1810				gfp_t gfp_flags, struct mempolicy **mpol,
1811				nodemask_t **nodemask)
1812{
1813	struct zonelist *zl;
1814
1815	*mpol = get_vma_policy(vma, addr);
1816	*nodemask = NULL;	/* assume !MPOL_BIND */
1817
1818	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1819		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1820				huge_page_shift(hstate_vma(vma))), gfp_flags);
1821	} else {
1822		zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1823		if ((*mpol)->mode == MPOL_BIND)
1824			*nodemask = &(*mpol)->v.nodes;
1825	}
1826	return zl;
1827}
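
/*
 * Typical use (illustrative sketch; h, vma and addr are assumed to come
 * from the caller's context in the hugetlb fault path):
 *
 *	struct mempolicy *mpol;
 *	nodemask_t *nodemask;
 *	struct zonelist *zl;
 *
 *	zl = huge_zonelist(vma, addr, htlb_alloc_mask(h), &mpol, &nodemask);
 *	// ... walk zl, filtering with nodemask when it is non-NULL ...
 *	mpol_cond_put(mpol);
 */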
1828
1829/*
1830 * init_nodemask_of_mempolicy
1831 *
1832 * If the current task's mempolicy is "default" [NULL], return 'false'
1833 * to indicate default policy.  Otherwise, extract the policy nodemask
1834 * for 'bind' or 'interleave' policy into the argument nodemask, or
1835 * initialize the argument nodemask to contain the single node for
1836 * 'preferred' or 'local' policy and return 'true' to indicate presence
1837 * of non-default mempolicy.
1838 *
1839 * We don't bother with reference counting the mempolicy [mpol_get/put]
 * because the current task is examining its own mempolicy and a task's
1841 * mempolicy is only ever changed by the task itself.
1842 *
1843 * N.B., it is the caller's responsibility to free a returned nodemask.
1844 */
1845bool init_nodemask_of_mempolicy(nodemask_t *mask)
1846{
1847	struct mempolicy *mempolicy;
1848	int nid;
1849
1850	if (!(mask && current->mempolicy))
1851		return false;
1852
1853	task_lock(current);
1854	mempolicy = current->mempolicy;
1855	switch (mempolicy->mode) {
1856	case MPOL_PREFERRED:
1857		if (mempolicy->flags & MPOL_F_LOCAL)
1858			nid = numa_node_id();
1859		else
1860			nid = mempolicy->v.preferred_node;
1861		init_nodemask_of_node(mask, nid);
1862		break;
1863
1864	case MPOL_BIND:
1865		/* Fall through */
1866	case MPOL_INTERLEAVE:
1867		*mask =  mempolicy->v.nodes;
1868		break;
1869
1870	default:
1871		BUG();
1872	}
1873	task_unlock(current);
1874
1875	return true;
1876}
1877#endif
1878
1879/*
1880 * mempolicy_nodemask_intersects
1881 *
1882 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1883 * policy.  Otherwise, check for intersection between mask and the policy
 * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
1885 * policy, always return true since it may allocate elsewhere on fallback.
1886 *
1887 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1888 */
1889bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1890					const nodemask_t *mask)
1891{
1892	struct mempolicy *mempolicy;
1893	bool ret = true;
1894
1895	if (!mask)
1896		return ret;
1897	task_lock(tsk);
1898	mempolicy = tsk->mempolicy;
1899	if (!mempolicy)
1900		goto out;
1901
1902	switch (mempolicy->mode) {
1903	case MPOL_PREFERRED:
		/*
		 * MPOL_PREFERRED and MPOL_F_LOCAL only express preferred
		 * nodes to allocate from; they may fall back to other nodes
		 * under OOM pressure.  Thus, it's possible for tsk to have
		 * allocated memory from nodes in mask.
		 */
1910		break;
1911	case MPOL_BIND:
1912	case MPOL_INTERLEAVE:
1913		ret = nodes_intersects(mempolicy->v.nodes, *mask);
1914		break;
1915	default:
1916		BUG();
1917	}
1918out:
1919	task_unlock(tsk);
1920	return ret;
1921}
1922
1923/* Allocate a page in interleaved policy.
1924   Own path because it needs to do special accounting. */
1925static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1926					unsigned nid)
1927{
1928	struct zonelist *zl;
1929	struct page *page;
1930
1931	zl = node_zonelist(nid, gfp);
1932	page = __alloc_pages(gfp, order, zl);
1933	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1934		inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1935	return page;
1936}
1937
1938/**
1939 * alloc_pages_vma - Allocate a page for a VMA.
1940 *
1941 * @gfp:
1942 *      %GFP_USER    user allocation.
1943 *      %GFP_KERNEL  kernel allocations,
1944 *      %GFP_HIGHMEM highmem/user allocations,
1945 *      %GFP_FS      allocation should not call back into a file system.
1946 *      %GFP_ATOMIC  don't sleep.
1947 *
1948 * @order: Order of the GFP allocation.
1949 * @vma: Pointer to VMA or NULL if not available.
1950 * @addr: Virtual address of the allocation. Must be inside the VMA.
1951 * @node: Which node to prefer for allocation (modulo policy).
1952 * @hugepage: For hugepages, try only the preferred node if possible.
1953 *
1954 * This function allocates a page from the kernel page pool and applies
1955 * a NUMA policy associated with the VMA or the current process.
1956 * When @vma is not NULL, the caller must hold down_read() on the mmap_sem
1957 * of the VMA's mm_struct to prevent it from going away. Should be used for
1958 * all allocations of pages that will be mapped into user space. Returns
1959 * NULL when no page can be allocated.
1960 */
1961struct page *
1962alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1963		unsigned long addr, int node, bool hugepage)
1964{
1965	struct mempolicy *pol;
1966	struct page *page;
1967	unsigned int cpuset_mems_cookie;
1968	struct zonelist *zl;
1969	nodemask_t *nmask;
1970
1971retry_cpuset:
1972	pol = get_vma_policy(vma, addr);
1973	cpuset_mems_cookie = read_mems_allowed_begin();
1974
1975	if (pol->mode == MPOL_INTERLEAVE) {
1976		unsigned nid;
1977
1978		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1979		mpol_cond_put(pol);
1980		page = alloc_page_interleave(gfp, order, nid);
1981		goto out;
1982	}
1983
1984	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
1985		int hpage_node = node;
1986
1987		/*
1988		 * For hugepage allocation and non-interleave policy which
1989		 * allows the current node (or other explicitly preferred
1990		 * node) we only try to allocate from the current/preferred
1991		 * node and don't fall back to other nodes, as the cost of
1992		 * remote accesses would likely offset THP benefits.
1993		 *
1994		 * If the policy is interleave, or does not allow the current
1995		 * node in its nodemask, we allocate the standard way.
1996		 */
1997		if (pol->mode == MPOL_PREFERRED &&
1998						!(pol->flags & MPOL_F_LOCAL))
1999			hpage_node = pol->v.preferred_node;
2000
2001		nmask = policy_nodemask(gfp, pol);
2002		if (!nmask || node_isset(hpage_node, *nmask)) {
2003			mpol_cond_put(pol);
2004			page = alloc_pages_exact_node(hpage_node,
2005						gfp | __GFP_THISNODE, order);
2006			goto out;
2007		}
2008	}
2009
2010	nmask = policy_nodemask(gfp, pol);
2011	zl = policy_zonelist(gfp, pol, node);
2012	mpol_cond_put(pol);
2013	page = __alloc_pages_nodemask(gfp, order, zl, nmask);
2014out:
2015	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2016		goto retry_cpuset;
2017	return page;
2018}
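/*
 * Illustrative sketch (assumption): the contract a fault-path style caller
 * follows.  The helper name is hypothetical; real callers normally reach
 * this function through wrappers such as alloc_page_vma().  mmap_sem of
 * vma->vm_mm is assumed to be held for read and @addr to lie inside the vma.
 */
static struct page * __maybe_unused
example_alloc_user_page(struct vm_area_struct *vma, unsigned long addr)
{
	/* order 0, local node preferred, not a THP allocation */
	return alloc_pages_vma(GFP_HIGHUSER_MOVABLE, 0, vma, addr,
			       numa_node_id(), false);
}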
2019
2020/**
2021 * alloc_pages_current - Allocate pages.
2022 *
2023 * @gfp:
2024 *      %GFP_USER    user allocation,
2025 *      %GFP_KERNEL  kernel allocation,
2026 *      %GFP_HIGHMEM highmem allocation,
2027 *      %GFP_FS      don't call back into a file system.
2028 *      %GFP_ATOMIC  don't sleep.
2029 * @order: Order of the allocation (2^@order contiguous pages); 0 is a single page.
2030 *
2031 * Allocate a page from the kernel page pool.  When not in interrupt
2032 * context, apply the current process' NUMA policy.
2033 * Returns NULL when no page can be allocated.
2034 *
2035 * Don't call cpuset_update_task_memory_state() unless
2036 * 1) it's ok to take cpuset_sem (can WAIT), and
2037 * 2) allocating for current task (not interrupt).
2038 */
2039struct page *alloc_pages_current(gfp_t gfp, unsigned order)
2040{
2041	struct mempolicy *pol = &default_policy;
2042	struct page *page;
2043	unsigned int cpuset_mems_cookie;
2044
2045	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
2046		pol = get_task_policy(current);
2047
2048retry_cpuset:
2049	cpuset_mems_cookie = read_mems_allowed_begin();
2050
2051	/*
2052	 * No reference counting needed for current->mempolicy
2053	 * nor system default_policy
2054	 */
2055	if (pol->mode == MPOL_INTERLEAVE)
2056		page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
2057	else
2058		page = __alloc_pages_nodemask(gfp, order,
2059				policy_zonelist(gfp, pol, numa_node_id()),
2060				policy_nodemask(gfp, pol));
2061
2062	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
2063		goto retry_cpuset;
2064
2065	return page;
2066}
2067EXPORT_SYMBOL(alloc_pages_current);
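/*
 * Illustrative sketch (assumption): in-kernel users normally reach
 * alloc_pages_current() through the alloc_pages()/alloc_page() wrappers in
 * <linux/gfp.h>; a direct call for one order-0 kernel page looks like this.
 */
static struct page * __maybe_unused example_alloc_kernel_page(void)
{
	return alloc_pages_current(GFP_KERNEL, 0);
}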
2068
2069int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
2070{
2071	struct mempolicy *pol = mpol_dup(vma_policy(src));
2072
2073	if (IS_ERR(pol))
2074		return PTR_ERR(pol);
2075	dst->vm_policy = pol;
2076	return 0;
2077}
2078
2079/*
2080 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
2081 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
2082 * with the mems_allowed returned by cpuset_mems_allowed().  This
2083 * keeps mempolicies cpuset relative after its cpuset moves.  See
2084 * further kernel/cpuset.c update_nodemask().
2085 *
2086 * current's mempolicy may be rebound by another task (the task that changes
2087 * the cpuset's mems), so we needn't do rebind work for the current task.
2088 */
2089
2090/* Slow path of a mempolicy duplicate */
2091struct mempolicy *__mpol_dup(struct mempolicy *old)
2092{
2093	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2094
2095	if (!new)
2096		return ERR_PTR(-ENOMEM);
2097
2098	/* task's mempolicy is protected by alloc_lock */
2099	if (old == current->mempolicy) {
2100		task_lock(current);
2101		*new = *old;
2102		task_unlock(current);
2103	} else
2104		*new = *old;
2105
2106	if (current_cpuset_is_being_rebound()) {
2107		nodemask_t mems = cpuset_mems_allowed(current);
2108		if (new->flags & MPOL_F_REBINDING)
2109			mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
2110		else
2111			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
2112	}
2113	atomic_set(&new->refcnt, 1);
2114	return new;
2115}
2116
2117/* Slow path of a mempolicy comparison */
2118bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
2119{
2120	if (!a || !b)
2121		return false;
2122	if (a->mode != b->mode)
2123		return false;
2124	if (a->flags != b->flags)
2125		return false;
2126	if (mpol_store_user_nodemask(a))
2127		if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
2128			return false;
2129
2130	switch (a->mode) {
2131	case MPOL_BIND:
2132		/* Fall through */
2133	case MPOL_INTERLEAVE:
2134		return !!nodes_equal(a->v.nodes, b->v.nodes);
2135	case MPOL_PREFERRED:
2136		return a->v.preferred_node == b->v.preferred_node;
2137	default:
2138		BUG();
2139		return false;
2140	}
2141}
2142
2143/*
2144 * Shared memory backing store policy support.
2145 *
2146 * Remember policies even when nobody has shared memory mapped.
2147 * The policies are kept in a Red-Black tree linked from the inode.
2148 * They are protected by the sp->lock spinlock, which should be held
2149 * for any accesses to the tree.
2150 */
2151
2152/* lookup first element intersecting start-end */
2153/* Caller holds sp->lock */
2154static struct sp_node *
2155sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
2156{
2157	struct rb_node *n = sp->root.rb_node;
2158
2159	while (n) {
2160		struct sp_node *p = rb_entry(n, struct sp_node, nd);
2161
2162		if (start >= p->end)
2163			n = n->rb_right;
2164		else if (end <= p->start)
2165			n = n->rb_left;
2166		else
2167			break;
2168	}
2169	if (!n)
2170		return NULL;
2171	for (;;) {
2172		struct sp_node *w = NULL;
2173		struct rb_node *prev = rb_prev(n);
2174		if (!prev)
2175			break;
2176		w = rb_entry(prev, struct sp_node, nd);
2177		if (w->end <= start)
2178			break;
2179		n = prev;
2180	}
2181	return rb_entry(n, struct sp_node, nd);
2182}
2183
2184/* Insert a new shared policy into the list. */
2185/* Caller holds sp->lock */
2186static void sp_insert(struct shared_policy *sp, struct sp_node *new)
2187{
2188	struct rb_node **p = &sp->root.rb_node;
2189	struct rb_node *parent = NULL;
2190	struct sp_node *nd;
2191
2192	while (*p) {
2193		parent = *p;
2194		nd = rb_entry(parent, struct sp_node, nd);
2195		if (new->start < nd->start)
2196			p = &(*p)->rb_left;
2197		else if (new->end > nd->end)
2198			p = &(*p)->rb_right;
2199		else
2200			BUG();
2201	}
2202	rb_link_node(&new->nd, parent, p);
2203	rb_insert_color(&new->nd, &sp->root);
2204	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
2205		 new->policy ? new->policy->mode : 0);
2206}
2207
2208/* Find shared policy intersecting idx */
2209struct mempolicy *
2210mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
2211{
2212	struct mempolicy *pol = NULL;
2213	struct sp_node *sn;
2214
2215	if (!sp->root.rb_node)
2216		return NULL;
2217	spin_lock(&sp->lock);
2218	sn = sp_lookup(sp, idx, idx+1);
2219	if (sn) {
2220		mpol_get(sn->policy);
2221		pol = sn->policy;
2222	}
2223	spin_unlock(&sp->lock);
2224	return pol;
2225}
2226
2227static void sp_free(struct sp_node *n)
2228{
2229	mpol_put(n->policy);
2230	kmem_cache_free(sn_cache, n);
2231}
2232
2233/**
2234 * mpol_misplaced - check whether the current page node is valid in policy
2235 *
2236 * @page: page to be checked
2237 * @vma: vm area where page is mapped
2238 * @addr: virtual address where page is mapped
2239 *
2240 * Lookup the current policy node id for vma,addr and compare it to the
2241 * page's node id.
2242 *
2243 * Returns:
2244 *	-1	- not misplaced, page is in the right node
2245 *	node	- node id where the page should be
2246 *
2247 * Policy determination "mimics" alloc_page_vma().
2248 * Called from the fault path where we know the vma and the faulting address.
2249 */
2250int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
2251{
2252	struct mempolicy *pol;
2253	struct zone *zone;
2254	int curnid = page_to_nid(page);
2255	unsigned long pgoff;
2256	int thiscpu = raw_smp_processor_id();
2257	int thisnid = cpu_to_node(thiscpu);
2258	int polnid = -1;
2259	int ret = -1;
2260
2261	BUG_ON(!vma);
2262
2263	pol = get_vma_policy(vma, addr);
2264	if (!(pol->flags & MPOL_F_MOF))
2265		goto out;
2266
2267	switch (pol->mode) {
2268	case MPOL_INTERLEAVE:
2269		BUG_ON(addr >= vma->vm_end);
2270		BUG_ON(addr < vma->vm_start);
2271
2272		pgoff = vma->vm_pgoff;
2273		pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
2274		polnid = offset_il_node(pol, vma, pgoff);
2275		break;
2276
2277	case MPOL_PREFERRED:
2278		if (pol->flags & MPOL_F_LOCAL)
2279			polnid = numa_node_id();
2280		else
2281			polnid = pol->v.preferred_node;
2282		break;
2283
2284	case MPOL_BIND:
2285		/*
2286		 * MPOL_BIND allows binding to multiple nodes.
2287		 * Use the current page's node if it is in the policy nodemask,
2288		 * else select the nearest allowed node, if any.
2289		 * If there are no allowed nodes, use the current node [!misplaced].
2290		 */
2291		if (node_isset(curnid, pol->v.nodes))
2292			goto out;
2293		(void)first_zones_zonelist(
2294				node_zonelist(numa_node_id(), GFP_HIGHUSER),
2295				gfp_zone(GFP_HIGHUSER),
2296				&pol->v.nodes, &zone);
2297		polnid = zone->node;
2298		break;
2299
2300	default:
2301		BUG();
2302	}
2303
2304	/* Migrate the page towards the node whose CPU is referencing it */
2305	if (pol->flags & MPOL_F_MORON) {
2306		polnid = thisnid;
2307
2308		if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2309			goto out;
2310	}
2311
2312	if (curnid != polnid)
2313		ret = polnid;
2314out:
2315	mpol_cond_put(pol);
2316
2317	return ret;
2318}
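/*
 * Illustrative sketch (assumption): how a NUMA-hinting fault handler
 * consumes the return value above.  The helper name is hypothetical; the
 * in-tree caller additionally migrates the page when a better node is
 * reported.
 */
static int __maybe_unused example_numa_target_node(struct page *page,
		struct vm_area_struct *vma, unsigned long addr)
{
	int target_nid = mpol_misplaced(page, vma, addr);

	if (target_nid == -1)
		return page_to_nid(page);	/* already on an allowed node */
	return target_nid;			/* node the page should move to */
}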
2319
2320static void sp_delete(struct shared_policy *sp, struct sp_node *n)
2321{
2322	pr_debug("deleting %lx-%lx\n", n->start, n->end);
2323	rb_erase(&n->nd, &sp->root);
2324	sp_free(n);
2325}
2326
2327static void sp_node_init(struct sp_node *node, unsigned long start,
2328			unsigned long end, struct mempolicy *pol)
2329{
2330	node->start = start;
2331	node->end = end;
2332	node->policy = pol;
2333}
2334
2335static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
2336				struct mempolicy *pol)
2337{
2338	struct sp_node *n;
2339	struct mempolicy *newpol;
2340
2341	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2342	if (!n)
2343		return NULL;
2344
2345	newpol = mpol_dup(pol);
2346	if (IS_ERR(newpol)) {
2347		kmem_cache_free(sn_cache, n);
2348		return NULL;
2349	}
2350	newpol->flags |= MPOL_F_SHARED;
2351	sp_node_init(n, start, end, newpol);
2352
2353	return n;
2354}
2355
2356/* Replace a policy range. */
2357static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
2358				 unsigned long end, struct sp_node *new)
2359{
2360	struct sp_node *n;
2361	struct sp_node *n_new = NULL;
2362	struct mempolicy *mpol_new = NULL;
2363	int ret = 0;
2364
2365restart:
2366	spin_lock(&sp->lock);
2367	n = sp_lookup(sp, start, end);
2368	/* Take care of old policies in the same range. */
2369	while (n && n->start < end) {
2370		struct rb_node *next = rb_next(&n->nd);
2371		if (n->start >= start) {
2372			if (n->end <= end)
2373				sp_delete(sp, n);
2374			else
2375				n->start = end;
2376		} else {
2377			/* Old policy spanning whole new range. */
2378			if (n->end > end) {
2379				if (!n_new)
2380					goto alloc_new;
2381
2382				*mpol_new = *n->policy;
2383				atomic_set(&mpol_new->refcnt, 1);
2384				sp_node_init(n_new, end, n->end, mpol_new);
2385				n->end = start;
2386				sp_insert(sp, n_new);
2387				n_new = NULL;
2388				mpol_new = NULL;
2389				break;
2390			} else
2391				n->end = start;
2392		}
2393		if (!next)
2394			break;
2395		n = rb_entry(next, struct sp_node, nd);
2396	}
2397	if (new)
2398		sp_insert(sp, new);
2399	spin_unlock(&sp->lock);
2400	ret = 0;
2401
2402err_out:
2403	if (mpol_new)
2404		mpol_put(mpol_new);
2405	if (n_new)
2406		kmem_cache_free(sn_cache, n_new);
2407
2408	return ret;
2409
2410alloc_new:
2411	spin_unlock(&sp->lock);
2412	ret = -ENOMEM;
2413	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
2414	if (!n_new)
2415		goto err_out;
2416	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
2417	if (!mpol_new)
2418		goto err_out;
2419	goto restart;
2420}
2421
2422/**
2423 * mpol_shared_policy_init - initialize shared policy for inode
2424 * @sp: pointer to inode shared policy
2425 * @mpol:  struct mempolicy to install
2426 *
2427 * Install a non-NULL @mpol in the inode's shared policy rb-tree.
2428 * On entry, the current task has a reference on a non-NULL @mpol.
2429 * That reference is consumed (dropped) before this function returns.
2430 * This is called at get_inode() time, so GFP_KERNEL may be used.
2431 */
2432void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
2433{
2434	int ret;
2435
2436	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
2437	spin_lock_init(&sp->lock);
2438
2439	if (mpol) {
2440		struct vm_area_struct pvma;
2441		struct mempolicy *new;
2442		NODEMASK_SCRATCH(scratch);
2443
2444		if (!scratch)
2445			goto put_mpol;
2446		/* contextualize the tmpfs mount point mempolicy */
2447		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
2448		if (IS_ERR(new))
2449			goto free_scratch; /* no valid nodemask intersection */
2450
2451		task_lock(current);
2452		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
2453		task_unlock(current);
2454		if (ret)
2455			goto put_new;
2456
2457		/* Create pseudo-vma that contains just the policy */
2458		memset(&pvma, 0, sizeof(struct vm_area_struct));
2459		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
2460		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2461
2462put_new:
2463		mpol_put(new);			/* drop initial ref */
2464free_scratch:
2465		NODEMASK_SCRATCH_FREE(scratch);
2466put_mpol:
2467		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
2468	}
2469}
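/*
 * Illustrative sketch (assumption): a tmpfs-style inode creation path handing
 * the superblock's mount mempolicy to a new inode's shared policy.  The
 * helper name is hypothetical.  mpol_get() provides the reference that
 * mpol_shared_policy_init() consumes; a NULL @sb_mpol simply leaves the tree
 * empty (default policy).
 */
static void __maybe_unused example_init_inode_policy(struct shared_policy *sp,
					struct mempolicy *sb_mpol)
{
	mpol_get(sb_mpol);			/* ref dropped by the callee */
	mpol_shared_policy_init(sp, sb_mpol);
}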
2470
2471int mpol_set_shared_policy(struct shared_policy *info,
2472			struct vm_area_struct *vma, struct mempolicy *npol)
2473{
2474	int err;
2475	struct sp_node *new = NULL;
2476	unsigned long sz = vma_pages(vma);
2477
2478	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2479		 vma->vm_pgoff,
2480		 sz, npol ? npol->mode : -1,
2481		 npol ? npol->flags : -1,
2482		 npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
2483
2484	if (npol) {
2485		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2486		if (!new)
2487			return -ENOMEM;
2488	}
2489	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2490	if (err && new)
2491		sp_free(new);
2492	return err;
2493}
2494
2495/* Free a backing policy store on inode delete. */
2496void mpol_free_shared_policy(struct shared_policy *p)
2497{
2498	struct sp_node *n;
2499	struct rb_node *next;
2500
2501	if (!p->root.rb_node)
2502		return;
2503	spin_lock(&p->lock);
2504	next = rb_first(&p->root);
2505	while (next) {
2506		n = rb_entry(next, struct sp_node, nd);
2507		next = rb_next(&n->nd);
2508		sp_delete(p, n);
2509	}
2510	spin_unlock(&p->lock);
2511}
2512
2513#ifdef CONFIG_NUMA_BALANCING
2514static int __initdata numabalancing_override;
2515
2516static void __init check_numabalancing_enable(void)
2517{
2518	bool numabalancing_default = false;
2519
2520	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
2521		numabalancing_default = true;
2522
2523	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
2524	if (numabalancing_override)
2525		set_numabalancing_state(numabalancing_override == 1);
2526
2527	if (num_online_nodes() > 1 && !numabalancing_override) {
2528		pr_info("%s automatic NUMA balancing. "
2529			"Configure with numa_balancing= or the "
2530			"kernel.numa_balancing sysctl\n",
2531			numabalancing_default ? "Enabling" : "Disabling");
2532		set_numabalancing_state(numabalancing_default);
2533	}
2534}
2535
2536static int __init setup_numabalancing(char *str)
2537{
2538	int ret = 0;
2539	if (!str)
2540		goto out;
2541
2542	if (!strcmp(str, "enable")) {
2543		numabalancing_override = 1;
2544		ret = 1;
2545	} else if (!strcmp(str, "disable")) {
2546		numabalancing_override = -1;
2547		ret = 1;
2548	}
2549out:
2550	if (!ret)
2551		pr_warn("Unable to parse numa_balancing=\n");
2552
2553	return ret;
2554}
2555__setup("numa_balancing=", setup_numabalancing);
2556#else
2557static inline void __init check_numabalancing_enable(void)
2558{
2559}
2560#endif /* CONFIG_NUMA_BALANCING */
2561
2562/* assumes fs == KERNEL_DS */
2563void __init numa_policy_init(void)
2564{
2565	nodemask_t interleave_nodes;
2566	unsigned long largest = 0;
2567	int nid, prefer = 0;
2568
2569	policy_cache = kmem_cache_create("numa_policy",
2570					 sizeof(struct mempolicy),
2571					 0, SLAB_PANIC, NULL);
2572
2573	sn_cache = kmem_cache_create("shared_policy_node",
2574				     sizeof(struct sp_node),
2575				     0, SLAB_PANIC, NULL);
2576
2577	for_each_node(nid) {
2578		preferred_node_policy[nid] = (struct mempolicy) {
2579			.refcnt = ATOMIC_INIT(1),
2580			.mode = MPOL_PREFERRED,
2581			.flags = MPOL_F_MOF | MPOL_F_MORON,
2582			.v = { .preferred_node = nid, },
2583		};
2584	}
2585
2586	/*
2587	 * Set interleaving policy for system init. Interleaving is only
2588	 * enabled across suitably sized nodes (>= 16MB of present memory);
2589	 * if they are all smaller, fall back to the largest node.
2590	 */
2591	nodes_clear(interleave_nodes);
2592	for_each_node_state(nid, N_MEMORY) {
2593		unsigned long total_pages = node_present_pages(nid);
2594
2595		/* Preserve the largest node */
2596		if (largest < total_pages) {
2597			largest = total_pages;
2598			prefer = nid;
2599		}
2600
2601		/* Interleave this node? */
2602		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2603			node_set(nid, interleave_nodes);
2604	}
2605
2606	/* All too small, use the largest */
2607	if (unlikely(nodes_empty(interleave_nodes)))
2608		node_set(prefer, interleave_nodes);
2609
2610	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2611		pr_err("%s: interleaving failed\n", __func__);
2612
2613	check_numabalancing_enable();
2614}
2615
2616/* Reset policy of current process to default */
2617void numa_default_policy(void)
2618{
2619	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2620}
2621
2622/*
2623 * Parse and format mempolicy from/to strings
2624 */
2625
2626/*
2627 * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
2628 */
2629static const char * const policy_modes[] =
2630{
2631	[MPOL_DEFAULT]    = "default",
2632	[MPOL_PREFERRED]  = "prefer",
2633	[MPOL_BIND]       = "bind",
2634	[MPOL_INTERLEAVE] = "interleave",
2635	[MPOL_LOCAL]      = "local",
2636};
2637
2638
2639#ifdef CONFIG_TMPFS
2640/**
2641 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
2642 * @str:  string containing mempolicy to parse
2643 * @mpol:  pointer to struct mempolicy pointer, returned on success.
2644 *
2645 * Format of input:
2646 *	<mode>[=<flags>][:<nodelist>]
2647 *
2648 * On success, returns 0, else 1
2649 */
2650int mpol_parse_str(char *str, struct mempolicy **mpol)
2651{
2652	struct mempolicy *new = NULL;
2653	unsigned short mode;
2654	unsigned short mode_flags;
2655	nodemask_t nodes;
2656	char *nodelist = strchr(str, ':');
2657	char *flags = strchr(str, '=');
2658	int err = 1;
2659
2660	if (nodelist) {
2661		/* NUL-terminate mode or flags string */
2662		*nodelist++ = '\0';
2663		if (nodelist_parse(nodelist, nodes))
2664			goto out;
2665		if (!nodes_subset(nodes, node_states[N_MEMORY]))
2666			goto out;
2667	} else
2668		nodes_clear(nodes);
2669
2670	if (flags)
2671		*flags++ = '\0';	/* terminate mode string */
2672
2673	for (mode = 0; mode < MPOL_MAX; mode++) {
2674		if (!strcmp(str, policy_modes[mode])) {
2675			break;
2676		}
2677	}
2678	if (mode >= MPOL_MAX)
2679		goto out;
2680
2681	switch (mode) {
2682	case MPOL_PREFERRED:
2683		/*
2684		 * Insist on a nodelist of one node only
2685		 */
2686		if (nodelist) {
2687			char *rest = nodelist;
2688			while (isdigit(*rest))
2689				rest++;
2690			if (*rest)
2691				goto out;
2692		}
2693		break;
2694	case MPOL_INTERLEAVE:
2695		/*
2696		 * Default to online nodes with memory if no nodelist
2697		 */
2698		if (!nodelist)
2699			nodes = node_states[N_MEMORY];
2700		break;
2701	case MPOL_LOCAL:
2702		/*
2703		 * Don't allow a nodelist;  mpol_new() checks flags
2704		 */
2705		if (nodelist)
2706			goto out;
2707		mode = MPOL_PREFERRED;
2708		break;
2709	case MPOL_DEFAULT:
2710		/*
2711		 * Insist on an empty nodelist
2712		 */
2713		if (!nodelist)
2714			err = 0;
2715		goto out;
2716	case MPOL_BIND:
2717		/*
2718		 * Insist on a nodelist
2719		 */
2720		if (!nodelist)
2721			goto out;
2722	}
2723
2724	mode_flags = 0;
2725	if (flags) {
2726		/*
2727		 * Currently, we only support two mutually exclusive
2728		 * mode flags.
2729		 */
2730		if (!strcmp(flags, "static"))
2731			mode_flags |= MPOL_F_STATIC_NODES;
2732		else if (!strcmp(flags, "relative"))
2733			mode_flags |= MPOL_F_RELATIVE_NODES;
2734		else
2735			goto out;
2736	}
2737
2738	new = mpol_new(mode, mode_flags, &nodes);
2739	if (IS_ERR(new))
2740		goto out;
2741
2742	/*
2743	 * Save nodes for mpol_to_str() to show the tmpfs mount options
2744	 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
2745	 */
2746	if (mode != MPOL_PREFERRED)
2747		new->v.nodes = nodes;
2748	else if (nodelist)
2749		new->v.preferred_node = first_node(nodes);
2750	else
2751		new->flags |= MPOL_F_LOCAL;
2752
2753	/*
2754	 * Save nodes for contextualization: this will be used to "clone"
2755	 * the mempolicy in a specific context [cpuset] at a later time.
2756	 */
2757	new->w.user_nodemask = nodes;
2758
2759	err = 0;
2760
2761out:
2762	/* Restore string for error message */
2763	if (nodelist)
2764		*--nodelist = ':';
2765	if (flags)
2766		*--flags = '=';
2767	if (!err)
2768		*mpol = new;
2769	return err;
2770}
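/*
 * Illustrative sketch (assumption): parsing a tmpfs-style "mpol=" mount
 * option.  The helper name and the option string are illustrative only.
 * mpol_parse_str() temporarily writes NULs into @str while parsing and
 * restores them before returning, so the buffer must be writable.
 */
static int __maybe_unused example_parse_mpol_option(struct mempolicy **mpolp)
{
	char buf[] = "interleave=static:0";

	if (mpol_parse_str(buf, mpolp))
		return -EINVAL;		/* mpol_parse_str() returns 1 on error */
	/* caller now owns a reference on *mpolp; drop it with mpol_put() */
	return 0;
}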
2771#endif /* CONFIG_TMPFS */
2772
2773/**
2774 * mpol_to_str - format a mempolicy structure for printing
2775 * @buffer:  to contain formatted mempolicy string
2776 * @maxlen:  length of @buffer
2777 * @pol:  pointer to mempolicy to be formatted
2778 *
2779 * Convert @pol into a string.  If @buffer is too short, truncate the string.
2780 * Recommend a @maxlen of at least 32 to fit the longest mode ("interleave"),
2781 * the longest flag ("relative"), and at least a few node ids.
2782 */
2783void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
2784{
2785	char *p = buffer;
2786	nodemask_t nodes = NODE_MASK_NONE;
2787	unsigned short mode = MPOL_DEFAULT;
2788	unsigned short flags = 0;
2789
2790	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
2791		mode = pol->mode;
2792		flags = pol->flags;
2793	}
2794
2795	switch (mode) {
2796	case MPOL_DEFAULT:
2797		break;
2798	case MPOL_PREFERRED:
2799		if (flags & MPOL_F_LOCAL)
2800			mode = MPOL_LOCAL;
2801		else
2802			node_set(pol->v.preferred_node, nodes);
2803		break;
2804	case MPOL_BIND:
2805	case MPOL_INTERLEAVE:
2806		nodes = pol->v.nodes;
2807		break;
2808	default:
2809		WARN_ON_ONCE(1);
2810		snprintf(p, maxlen, "unknown");
2811		return;
2812	}
2813
2814	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
2815
2816	if (flags & MPOL_MODE_FLAGS) {
2817		p += snprintf(p, buffer + maxlen - p, "=");
2818
2819		/*
2820		 * Currently, the only defined flags are mutually exclusive
2821		 */
2822		if (flags & MPOL_F_STATIC_NODES)
2823			p += snprintf(p, buffer + maxlen - p, "static");
2824		else if (flags & MPOL_F_RELATIVE_NODES)
2825			p += snprintf(p, buffer + maxlen - p, "relative");
2826	}
2827
2828	if (!nodes_empty(nodes))
2829		p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
2830			       nodemask_pr_args(&nodes));
2831}
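/*
 * Illustrative sketch (assumption): formatting a policy for procfs-style
 * output, in the manner of /proc/<pid>/numa_maps.  The helper name is
 * hypothetical; a 64-byte buffer comfortably exceeds the recommended
 * 32-byte minimum for mode, flag and a few node ids.
 */
static void __maybe_unused example_show_mempolicy(struct mempolicy *pol)
{
	char buf[64];

	mpol_to_str(buf, sizeof(buf), pol);
	pr_debug("mempolicy: %s\n", buf);
}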
2832