Blame - src/kernel/linux/v4.19/mm/mempolicy.c - T800

blob: 69f8d447bb9030302716365ef0860cd5696281e6 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	/*
				2	* Simple NUMA memory policy for the Linux kernel.
				3	*
				4	* Copyright 2003,2004 Andi Kleen, SuSE Labs.
				5	* (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
				6	* Subject to the GNU Public License, version 2.
				7	*
				8	* NUMA policy allows the user to give hints in which node(s) memory should
				9	* be allocated.
				10	*
				11	* Support four policies per VMA and per process:
				12	*
				13	* The VMA policy has priority over the process policy for a page fault.
				14	*
				15	* interleave Allocate memory interleaved over a set of nodes,
				16	* with normal fallback if it fails.
				17	* For VMA based allocations this interleaves based on the
				18	* offset into the backing object or offset into the mapping
				19	* for anonymous memory. For process policy a process counter
				20	* is used.
				21	*
				22	* bind Only allocate memory on a specific set of nodes,
				23	* no fallback.
				24	* FIXME: memory is allocated starting with the first node
				25	* to the last. It would be better if bind would truly restrict
				26	* the allocation to memory nodes instead
				27	*
				28	* preferred Try a specific node first before normal fallback.
				29	* As a special case NUMA_NO_NODE here means do the allocation
				30	* on the local CPU. This is normally identical to default,
				31	* but useful to set in a VMA when you have a non default
				32	* process policy.
				33	*
				34	* default Allocate on the local node first, or when on a VMA
				35	* use the process policy. This is what Linux always did
				36	* in a NUMA aware kernel and still does by, ahem, default.
				37	*
				38	* The process policy is applied for most non interrupt memory allocations
				39	* in that process' context. Interrupts ignore the policies and always
				40	* try to allocate on the local CPU. The VMA policy is only applied for memory
				41	* allocations for a VMA in the VM.
				42	*
				43	* Currently there are a few corner cases in swapping where the policy
				44	* is not applied, but the majority should be handled. When process policy
				45	* is used it is not remembered over swap outs/swap ins.
				46	*
				47	* Only the highest zone in the zone hierarchy gets policied. Allocations
				48	* requesting a lower zone just use default policy. This implies that
				49	* on systems with highmem kernel lowmem allocation don't get policied.
				50	* Same with GFP_DMA allocations.
				51	*
				52	* For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
				53	* all users and remembered even when nobody has memory mapped.
				54	*/
				55
				56	/* Notebook:
				57	fix mmap readahead to honour policy and enable policy for any page cache
				58	object
				59	statistics for bigpages
				60	global policy for page cache? currently it uses process policy. Requires
				61	first item above.
				62	handle mremap for shared memory (currently ignored for the policy)
				63	grows down?
				64	make bind policy root only? It can trigger oom much faster and the
				65	kernel is not always graceful with that.
				66	*/
				67
				68	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
				69
				70	#include <linux/mempolicy.h>
				71	#include <linux/mm.h>
				72	#include <linux/highmem.h>
				73	#include <linux/hugetlb.h>
				74	#include <linux/kernel.h>
				75	#include <linux/sched.h>
				76	#include <linux/sched/mm.h>
				77	#include <linux/sched/numa_balancing.h>
				78	#include <linux/sched/task.h>
				79	#include <linux/nodemask.h>
				80	#include <linux/cpuset.h>
				81	#include <linux/slab.h>
				82	#include <linux/string.h>
				83	#include <linux/export.h>
				84	#include <linux/nsproxy.h>
				85	#include <linux/interrupt.h>
				86	#include <linux/init.h>
				87	#include <linux/compat.h>
				88	#include <linux/ptrace.h>
				89	#include <linux/swap.h>
				90	#include <linux/seq_file.h>
				91	#include <linux/proc_fs.h>
				92	#include <linux/migrate.h>
				93	#include <linux/ksm.h>
				94	#include <linux/rmap.h>
				95	#include <linux/security.h>
				96	#include <linux/syscalls.h>
				97	#include <linux/ctype.h>
				98	#include <linux/mm_inline.h>
				99	#include <linux/mmu_notifier.h>
				100	#include <linux/printk.h>
				101	#include <linux/swapops.h>
				102
				103	#include <asm/tlbflush.h>
				104	#include <linux/uaccess.h>
				105
				106	#include "internal.h"
				107
				108	/* Internal flags */
				109	#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
				110	#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
				111
				112	static struct kmem_cache *policy_cache;
				113	static struct kmem_cache *sn_cache;
				114
				115	/* Highest zone. An specific allocation for a zone below that is not
				116	policied. */
				117	enum zone_type policy_zone = 0;
				118
				119	/*
				120	* run-time system-wide default policy => local allocation
				121	*/
				122	static struct mempolicy default_policy = {
				123	.refcnt = ATOMIC_INIT(1), /* never free it */
				124	.mode = MPOL_PREFERRED,
				125	.flags = MPOL_F_LOCAL,
				126	};
				127
				128	static struct mempolicy preferred_node_policy[MAX_NUMNODES];
				129
				130	struct mempolicy get_task_policy(struct task_struct p)
				131	{
				132	struct mempolicy *pol = p->mempolicy;
				133	int node;
				134
				135	if (pol)
				136	return pol;
				137
				138	node = numa_node_id();
				139	if (node != NUMA_NO_NODE) {
				140	pol = &preferred_node_policy[node];
				141	/* preferred_node_policy is not initialised early in boot */
				142	if (pol->mode)
				143	return pol;
				144	}
				145
				146	return &default_policy;
				147	}
				148
				149	static const struct mempolicy_operations {
				150	int (create)(struct mempolicy pol, const nodemask_t *nodes);
				151	void (rebind)(struct mempolicy pol, const nodemask_t *nodes);
				152	} mpol_ops[MPOL_MAX];
				153
				154	static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
				155	{
				156	return pol->flags & MPOL_MODE_FLAGS;
				157	}
				158
				159	static void mpol_relative_nodemask(nodemask_t ret, const nodemask_t orig,
				160	const nodemask_t *rel)
				161	{
				162	nodemask_t tmp;
				163	nodes_fold(tmp, orig, nodes_weight(rel));
				164	nodes_onto(ret, tmp, rel);
				165	}
				166
				167	static int mpol_new_interleave(struct mempolicy pol, const nodemask_t nodes)
				168	{
				169	if (nodes_empty(*nodes))
				170	return -EINVAL;
				171	pol->v.nodes = *nodes;
				172	return 0;
				173	}
				174
				175	static int mpol_new_preferred(struct mempolicy pol, const nodemask_t nodes)
				176	{
				177	if (!nodes)
				178	pol->flags \|= MPOL_F_LOCAL; /* local allocation */
				179	else if (nodes_empty(*nodes))
				180	return -EINVAL; /* no allowed nodes */
				181	else
				182	pol->v.preferred_node = first_node(*nodes);
				183	return 0;
				184	}
				185
				186	static int mpol_new_bind(struct mempolicy pol, const nodemask_t nodes)
				187	{
				188	if (nodes_empty(*nodes))
				189	return -EINVAL;
				190	pol->v.nodes = *nodes;
				191	return 0;
				192	}
				193
				194	/*
				195	* mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
				196	* any, for the new policy. mpol_new() has already validated the nodes
				197	* parameter with respect to the policy mode and flags. But, we need to
				198	* handle an empty nodemask with MPOL_PREFERRED here.
				199	*
				200	* Must be called holding task's alloc_lock to protect task's mems_allowed
				201	* and mempolicy. May also be called holding the mmap_semaphore for write.
				202	*/
				203	static int mpol_set_nodemask(struct mempolicy *pol,
				204	const nodemask_t nodes, struct nodemask_scratch nsc)
				205	{
				206	int ret;
				207
				208	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
				209	if (pol == NULL)
				210	return 0;
				211	/* Check N_MEMORY */
				212	nodes_and(nsc->mask1,
				213	cpuset_current_mems_allowed, node_states[N_MEMORY]);
				214
				215	VM_BUG_ON(!nodes);
				216	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
				217	nodes = NULL; /* explicit local allocation */
				218	else {
				219	if (pol->flags & MPOL_F_RELATIVE_NODES)
				220	mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
				221	else
				222	nodes_and(nsc->mask2, *nodes, nsc->mask1);
				223
				224	if (mpol_store_user_nodemask(pol))
				225	pol->w.user_nodemask = *nodes;
				226	else
				227	pol->w.cpuset_mems_allowed =
				228	cpuset_current_mems_allowed;
				229	}
				230
				231	if (nodes)
				232	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
				233	else
				234	ret = mpol_ops[pol->mode].create(pol, NULL);
				235	return ret;
				236	}
				237
				238	/*
				239	* This function just creates a new policy, does some check and simple
				240	* initialization. You must invoke mpol_set_nodemask() to set nodes.
				241	*/
				242	static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				243	nodemask_t *nodes)
				244	{
				245	struct mempolicy *policy;
				246
				247	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
				248	mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
				249
				250	if (mode == MPOL_DEFAULT) {
				251	if (nodes && !nodes_empty(*nodes))
				252	return ERR_PTR(-EINVAL);
				253	return NULL;
				254	}
				255	VM_BUG_ON(!nodes);
				256
				257	/*
				258	* MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
				259	* MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
				260	* All other modes require a valid pointer to a non-empty nodemask.
				261	*/
				262	if (mode == MPOL_PREFERRED) {
				263	if (nodes_empty(*nodes)) {
				264	if (((flags & MPOL_F_STATIC_NODES) \|\|
				265	(flags & MPOL_F_RELATIVE_NODES)))
				266	return ERR_PTR(-EINVAL);
				267	}
				268	} else if (mode == MPOL_LOCAL) {
				269	if (!nodes_empty(*nodes) \|\|
				270	(flags & MPOL_F_STATIC_NODES) \|\|
				271	(flags & MPOL_F_RELATIVE_NODES))
				272	return ERR_PTR(-EINVAL);
				273	mode = MPOL_PREFERRED;
				274	} else if (nodes_empty(*nodes))
				275	return ERR_PTR(-EINVAL);
				276	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
				277	if (!policy)
				278	return ERR_PTR(-ENOMEM);
				279	atomic_set(&policy->refcnt, 1);
				280	policy->mode = mode;
				281	policy->flags = flags;
				282
				283	return policy;
				284	}
				285
				286	/* Slow path of a mpol destructor. */
				287	void __mpol_put(struct mempolicy *p)
				288	{
				289	if (!atomic_dec_and_test(&p->refcnt))
				290	return;
				291	kmem_cache_free(policy_cache, p);
				292	}
				293
				294	static void mpol_rebind_default(struct mempolicy pol, const nodemask_t nodes)
				295	{
				296	}
				297
				298	static void mpol_rebind_nodemask(struct mempolicy pol, const nodemask_t nodes)
				299	{
				300	nodemask_t tmp;
				301
				302	if (pol->flags & MPOL_F_STATIC_NODES)
				303	nodes_and(tmp, pol->w.user_nodemask, *nodes);
				304	else if (pol->flags & MPOL_F_RELATIVE_NODES)
				305	mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
				306	else {
				307	nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
				308	*nodes);
				309	pol->w.cpuset_mems_allowed = *nodes;
				310	}
				311
				312	if (nodes_empty(tmp))
				313	tmp = *nodes;
				314
				315	pol->v.nodes = tmp;
				316	}
				317
				318	static void mpol_rebind_preferred(struct mempolicy *pol,
				319	const nodemask_t *nodes)
				320	{
				321	nodemask_t tmp;
				322
				323	if (pol->flags & MPOL_F_STATIC_NODES) {
				324	int node = first_node(pol->w.user_nodemask);
				325
				326	if (node_isset(node, *nodes)) {
				327	pol->v.preferred_node = node;
				328	pol->flags &= ~MPOL_F_LOCAL;
				329	} else
				330	pol->flags \|= MPOL_F_LOCAL;
				331	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
				332	mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
				333	pol->v.preferred_node = first_node(tmp);
				334	} else if (!(pol->flags & MPOL_F_LOCAL)) {
				335	pol->v.preferred_node = node_remap(pol->v.preferred_node,
				336	pol->w.cpuset_mems_allowed,
				337	*nodes);
				338	pol->w.cpuset_mems_allowed = *nodes;
				339	}
				340	}
				341
				342	/*
				343	* mpol_rebind_policy - Migrate a policy to a different set of nodes
				344	*
				345	* Per-vma policies are protected by mmap_sem. Allocations using per-task
				346	* policies are protected by task->mems_allowed_seq to prevent a premature
				347	* OOM/allocation failure due to parallel nodemask modification.
				348	*/
				349	static void mpol_rebind_policy(struct mempolicy pol, const nodemask_t newmask)
				350	{
				351	if (!pol)
				352	return;
				353	if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
				354	nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
				355	return;
				356
				357	mpol_ops[pol->mode].rebind(pol, newmask);
				358	}
				359
				360	/*
				361	* Wrapper for mpol_rebind_policy() that just requires task
				362	* pointer, and updates task mempolicy.
				363	*
				364	* Called with task's alloc_lock held.
				365	*/
				366
				367	void mpol_rebind_task(struct task_struct tsk, const nodemask_t new)
				368	{
				369	mpol_rebind_policy(tsk->mempolicy, new);
				370	}
				371
				372	/*
				373	* Rebind each vma in mm to new nodemask.
				374	*
				375	* Call holding a reference to mm. Takes mm->mmap_sem during call.
				376	*/
				377
				378	void mpol_rebind_mm(struct mm_struct mm, nodemask_t new)
				379	{
				380	struct vm_area_struct *vma;
				381
				382	down_write(&mm->mmap_sem);
				383	for (vma = mm->mmap; vma; vma = vma->vm_next)
				384	mpol_rebind_policy(vma->vm_policy, new);
				385	up_write(&mm->mmap_sem);
				386	}
				387
				388	static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
				389	[MPOL_DEFAULT] = {
				390	.rebind = mpol_rebind_default,
				391	},
				392	[MPOL_INTERLEAVE] = {
				393	.create = mpol_new_interleave,
				394	.rebind = mpol_rebind_nodemask,
				395	},
				396	[MPOL_PREFERRED] = {
				397	.create = mpol_new_preferred,
				398	.rebind = mpol_rebind_preferred,
				399	},
				400	[MPOL_BIND] = {
				401	.create = mpol_new_bind,
				402	.rebind = mpol_rebind_nodemask,
				403	},
				404	};
				405
				406	static int migrate_page_add(struct page page, struct list_head pagelist,
				407	unsigned long flags);
				408
				409	struct queue_pages {
				410	struct list_head *pagelist;
				411	unsigned long flags;
				412	nodemask_t *nmask;
				413	struct vm_area_struct *prev;
				414	};
				415
				416	/*
				417	* Check if the page's nid is in qp->nmask.
				418	*
				419	* If MPOL_MF_INVERT is set in qp->flags, check if the nid is
				420	* in the invert of qp->nmask.
				421	*/
				422	static inline bool queue_pages_required(struct page *page,
				423	struct queue_pages *qp)
				424	{
				425	int nid = page_to_nid(page);
				426	unsigned long flags = qp->flags;
				427
				428	return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
				429	}
				430
				431	/*
				432	* queue_pages_pmd() has four possible return values:
				433	* 0 - pages are placed on the right node or queued successfully.
				434	* 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
				435	* specified.
				436	* 2 - THP was split.
				437	* -EIO - is migration entry or only MPOL_MF_STRICT was specified and an
				438	* existing page was already on a node that does not follow the
				439	* policy.
				440	*/
				441	static int queue_pages_pmd(pmd_t pmd, spinlock_t ptl, unsigned long addr,
				442	unsigned long end, struct mm_walk *walk)
				443	{
				444	int ret = 0;
				445	struct page *page;
				446	struct queue_pages *qp = walk->private;
				447	unsigned long flags;
				448
				449	if (unlikely(is_pmd_migration_entry(*pmd))) {
				450	ret = -EIO;
				451	goto unlock;
				452	}
				453	page = pmd_page(*pmd);
				454	if (is_huge_zero_page(page)) {
				455	spin_unlock(ptl);
				456	__split_huge_pmd(walk->vma, pmd, addr, false, NULL);
				457	ret = 2;
				458	goto out;
				459	}
				460	if (!queue_pages_required(page, qp))
				461	goto unlock;
				462
				463	flags = qp->flags;
				464	/* go to thp migration */
				465	if (flags & (MPOL_MF_MOVE \| MPOL_MF_MOVE_ALL)) {
				466	if (!vma_migratable(walk->vma) \|\|
				467	migrate_page_add(page, qp->pagelist, flags)) {
				468	ret = 1;
				469	goto unlock;
				470	}
				471	} else
				472	ret = -EIO;
				473	unlock:
				474	spin_unlock(ptl);
				475	out:
				476	return ret;
				477	}
				478
				479	/*
				480	* Scan through pages checking if pages follow certain conditions,
				481	* and move them to the pagelist if they do.
				482	*
				483	* queue_pages_pte_range() has three possible return values:
				484	* 0 - pages are placed on the right node or queued successfully.
				485	* 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
				486	* specified.
				487	* -EIO - only MPOL_MF_STRICT was specified and an existing page was already
				488	* on a node that does not follow the policy.
				489	*/
				490	static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
				491	unsigned long end, struct mm_walk *walk)
				492	{
				493	struct vm_area_struct *vma = walk->vma;
				494	struct page *page;
				495	struct queue_pages *qp = walk->private;
				496	unsigned long flags = qp->flags;
				497	int ret;
				498	bool has_unmovable = false;
				499	pte_t *pte;
				500	spinlock_t *ptl;
				501
				502	ptl = pmd_trans_huge_lock(pmd, vma);
				503	if (ptl) {
				504	ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
				505	if (ret != 2)
				506	return ret;
				507	}
				508	/* THP was split, fall through to pte walk */
				509
				510	if (pmd_trans_unstable(pmd))
				511	return 0;
				512
				513	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
				514	for (; addr != end; pte++, addr += PAGE_SIZE) {
				515	if (!pte_present(*pte))
				516	continue;
				517	page = vm_normal_page(vma, addr, *pte);
				518	if (!page)
				519	continue;
				520	/*
				521	* vm_normal_page() filters out zero pages, but there might
				522	* still be PageReserved pages to skip, perhaps in a VDSO.
				523	*/
				524	if (PageReserved(page))
				525	continue;
				526	if (!queue_pages_required(page, qp))
				527	continue;
				528	if (flags & (MPOL_MF_MOVE \| MPOL_MF_MOVE_ALL)) {
				529	/* MPOL_MF_STRICT must be specified if we get here */
				530	if (!vma_migratable(vma)) {
				531	has_unmovable = true;
				532	break;
				533	}
				534
				535	/*
				536	* Do not abort immediately since there may be
				537	* temporary off LRU pages in the range. Still
				538	* need migrate other LRU pages.
				539	*/
				540	if (migrate_page_add(page, qp->pagelist, flags))
				541	has_unmovable = true;
				542	} else
				543	break;
				544	}
				545	pte_unmap_unlock(pte - 1, ptl);
				546	cond_resched();
				547
				548	if (has_unmovable)
				549	return 1;
				550
				551	return addr != end ? -EIO : 0;
				552	}
				553
				554	static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
				555	unsigned long addr, unsigned long end,
				556	struct mm_walk *walk)
				557	{
				558	#ifdef CONFIG_HUGETLB_PAGE
				559	struct queue_pages *qp = walk->private;
				560	unsigned long flags = qp->flags;
				561	struct page *page;
				562	spinlock_t *ptl;
				563	pte_t entry;
				564
				565	ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
				566	entry = huge_ptep_get(pte);
				567	if (!pte_present(entry))
				568	goto unlock;
				569	page = pte_page(entry);
				570	if (!queue_pages_required(page, qp))
				571	goto unlock;
				572	/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
				573	if (flags & (MPOL_MF_MOVE_ALL) \|\|
				574	(flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
				575	isolate_huge_page(page, qp->pagelist);
				576	unlock:
				577	spin_unlock(ptl);
				578	#else
				579	BUG();
				580	#endif
				581	return 0;
				582	}
				583
				584	#ifdef CONFIG_NUMA_BALANCING
				585	/*
				586	* This is used to mark a range of virtual addresses to be inaccessible.
				587	* These are later cleared by a NUMA hinting fault. Depending on these
				588	* faults, pages may be migrated for better NUMA placement.
				589	*
				590	* This is assuming that NUMA faults are handled using PROT_NONE. If
				591	* an architecture makes a different choice, it will need further
				592	* changes to the core.
				593	*/
				594	unsigned long change_prot_numa(struct vm_area_struct *vma,
				595	unsigned long addr, unsigned long end)
				596	{
				597	int nr_updated;
				598
				599	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
				600	if (nr_updated)
				601	count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
				602
				603	return nr_updated;
				604	}
				605	#else
				606	static unsigned long change_prot_numa(struct vm_area_struct *vma,
				607	unsigned long addr, unsigned long end)
				608	{
				609	return 0;
				610	}
				611	#endif /* CONFIG_NUMA_BALANCING */
				612
				613	static int queue_pages_test_walk(unsigned long start, unsigned long end,
				614	struct mm_walk *walk)
				615	{
				616	struct vm_area_struct *vma = walk->vma;
				617	struct queue_pages *qp = walk->private;
				618	unsigned long endvma = vma->vm_end;
				619	unsigned long flags = qp->flags;
				620
				621	/*
				622	* Need check MPOL_MF_STRICT to return -EIO if possible
				623	* regardless of vma_migratable
				624	*/
				625	if (!vma_migratable(vma) &&
				626	!(flags & MPOL_MF_STRICT))
				627	return 1;
				628
				629	if (endvma > end)
				630	endvma = end;
				631	if (vma->vm_start > start)
				632	start = vma->vm_start;
				633
				634	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
				635	if (!vma->vm_next && vma->vm_end < end)
				636	return -EFAULT;
				637	if (qp->prev && qp->prev->vm_end < vma->vm_start)
				638	return -EFAULT;
				639	}
				640
				641	qp->prev = vma;
				642
				643	if (flags & MPOL_MF_LAZY) {
				644	/* Similar to task_numa_work, skip inaccessible VMAs */
				645	if (!is_vm_hugetlb_page(vma) &&
				646	(vma->vm_flags & (VM_READ \| VM_EXEC \| VM_WRITE)) &&
				647	!(vma->vm_flags & VM_MIXEDMAP))
				648	change_prot_numa(vma, start, endvma);
				649	return 1;
				650	}
				651
				652	/* queue pages from current vma */
				653	if (flags & MPOL_MF_VALID)
				654	return 0;
				655	return 1;
				656	}
				657
				658	/*
				659	* Walk through page tables and collect pages to be migrated.
				660	*
				661	* If pages found in a given range are on a set of nodes (determined by
				662	* @nodes and @flags,) it's isolated and queued to the pagelist which is
				663	* passed via @private.
				664	*
				665	* queue_pages_range() has three possible return values:
				666	* 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
				667	* specified.
				668	* 0 - queue pages successfully or no misplaced page.
				669	* errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
				670	* memory range specified by nodemask and maxnode points outside
				671	* your accessible address space (-EFAULT)
				672	*/
				673	static int
				674	queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
				675	nodemask_t *nodes, unsigned long flags,
				676	struct list_head *pagelist)
				677	{
				678	struct queue_pages qp = {
				679	.pagelist = pagelist,
				680	.flags = flags,
				681	.nmask = nodes,
				682	.prev = NULL,
				683	};
				684	struct mm_walk queue_pages_walk = {
				685	.hugetlb_entry = queue_pages_hugetlb,
				686	.pmd_entry = queue_pages_pte_range,
				687	.test_walk = queue_pages_test_walk,
				688	.mm = mm,
				689	.private = &qp,
				690	};
				691
				692	return walk_page_range(start, end, &queue_pages_walk);
				693	}
				694
				695	/*
				696	* Apply policy to a single VMA
				697	* This must be called with the mmap_sem held for writing.
				698	*/
				699	static int vma_replace_policy(struct vm_area_struct *vma,
				700	struct mempolicy *pol)
				701	{
				702	int err;
				703	struct mempolicy *old;
				704	struct mempolicy *new;
				705
				706	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
				707	vma->vm_start, vma->vm_end, vma->vm_pgoff,
				708	vma->vm_ops, vma->vm_file,
				709	vma->vm_ops ? vma->vm_ops->set_policy : NULL);
				710
				711	new = mpol_dup(pol);
				712	if (IS_ERR(new))
				713	return PTR_ERR(new);
				714
				715	if (vma->vm_ops && vma->vm_ops->set_policy) {
				716	err = vma->vm_ops->set_policy(vma, new);
				717	if (err)
				718	goto err_out;
				719	}
				720
				721	old = vma->vm_policy;
				722	vma->vm_policy = new; /* protected by mmap_sem */
				723	mpol_put(old);
				724
				725	return 0;
				726	err_out:
				727	mpol_put(new);
				728	return err;
				729	}
				730
				731	/* Step 2: apply policy to a range and do splits. */
				732	static int mbind_range(struct mm_struct *mm, unsigned long start,
				733	unsigned long end, struct mempolicy *new_pol)
				734	{
				735	struct vm_area_struct *next;
				736	struct vm_area_struct *prev;
				737	struct vm_area_struct *vma;
				738	int err = 0;
				739	pgoff_t pgoff;
				740	unsigned long vmstart;
				741	unsigned long vmend;
				742
				743	vma = find_vma(mm, start);
				744	if (!vma \|\| vma->vm_start > start)
				745	return -EFAULT;
				746
				747	prev = vma->vm_prev;
				748	if (start > vma->vm_start)
				749	prev = vma;
				750
				751	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
				752	next = vma->vm_next;
				753	vmstart = max(start, vma->vm_start);
				754	vmend = min(end, vma->vm_end);
				755
				756	if (mpol_equal(vma_policy(vma), new_pol))
				757	continue;
				758
				759	pgoff = vma->vm_pgoff +
				760	((vmstart - vma->vm_start) >> PAGE_SHIFT);
				761	prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
				762	vma->anon_vma, vma->vm_file, pgoff,
				763	new_pol, vma->vm_userfaultfd_ctx,
				764	vma_get_anon_name(vma));
				765	if (prev) {
				766	vma = prev;
				767	next = vma->vm_next;
				768	if (mpol_equal(vma_policy(vma), new_pol))
				769	continue;
				770	/* vma_merge() joined vma && vma->next, case 8 */
				771	goto replace;
				772	}
				773	if (vma->vm_start != vmstart) {
				774	err = split_vma(vma->vm_mm, vma, vmstart, 1);
				775	if (err)
				776	goto out;
				777	}
				778	if (vma->vm_end != vmend) {
				779	err = split_vma(vma->vm_mm, vma, vmend, 0);
				780	if (err)
				781	goto out;
				782	}
				783	replace:
				784	err = vma_replace_policy(vma, new_pol);
				785	if (err)
				786	goto out;
				787	}
				788
				789	out:
				790	return err;
				791	}
				792
				793	/* Set the process memory policy */
				794	static long do_set_mempolicy(unsigned short mode, unsigned short flags,
				795	nodemask_t *nodes)
				796	{
				797	struct mempolicy new, old;
				798	NODEMASK_SCRATCH(scratch);
				799	int ret;
				800
				801	if (!scratch)
				802	return -ENOMEM;
				803
				804	new = mpol_new(mode, flags, nodes);
				805	if (IS_ERR(new)) {
				806	ret = PTR_ERR(new);
				807	goto out;
				808	}
				809
				810	task_lock(current);
				811	ret = mpol_set_nodemask(new, nodes, scratch);
				812	if (ret) {
				813	task_unlock(current);
				814	mpol_put(new);
				815	goto out;
				816	}
				817	old = current->mempolicy;
				818	current->mempolicy = new;
				819	if (new && new->mode == MPOL_INTERLEAVE)
				820	current->il_prev = MAX_NUMNODES-1;
				821	task_unlock(current);
				822	mpol_put(old);
				823	ret = 0;
				824	out:
				825	NODEMASK_SCRATCH_FREE(scratch);
				826	return ret;
				827	}
				828
				829	/*
				830	* Return nodemask for policy for get_mempolicy() query
				831	*
				832	* Called with task's alloc_lock held
				833	*/
				834	static void get_policy_nodemask(struct mempolicy p, nodemask_t nodes)
				835	{
				836	nodes_clear(*nodes);
				837	if (p == &default_policy)
				838	return;
				839
				840	switch (p->mode) {
				841	case MPOL_BIND:
				842	/* Fall through */
				843	case MPOL_INTERLEAVE:
				844	*nodes = p->v.nodes;
				845	break;
				846	case MPOL_PREFERRED:
				847	if (!(p->flags & MPOL_F_LOCAL))
				848	node_set(p->v.preferred_node, *nodes);
				849	/* else return empty node mask for local allocation */
				850	break;
				851	default:
				852	BUG();
				853	}
				854	}
				855
				856	static int lookup_node(unsigned long addr)
				857	{
				858	struct page *p;
				859	int err;
				860
				861	err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
				862	if (err >= 0) {
				863	err = page_to_nid(p);
				864	put_page(p);
				865	}
				866	return err;
				867	}
				868
				869	/* Retrieve NUMA policy */
				870	static long do_get_mempolicy(int policy, nodemask_t nmask,
				871	unsigned long addr, unsigned long flags)
				872	{
				873	int err;
				874	struct mm_struct *mm = current->mm;
				875	struct vm_area_struct *vma = NULL;
				876	struct mempolicy *pol = current->mempolicy;
				877
				878	if (flags &
				879	~(unsigned long)(MPOL_F_NODE\|MPOL_F_ADDR\|MPOL_F_MEMS_ALLOWED))
				880	return -EINVAL;
				881
				882	if (flags & MPOL_F_MEMS_ALLOWED) {
				883	if (flags & (MPOL_F_NODE\|MPOL_F_ADDR))
				884	return -EINVAL;
				885	policy = 0; / just so it's initialized */
				886	task_lock(current);
				887	*nmask = cpuset_current_mems_allowed;
				888	task_unlock(current);
				889	return 0;
				890	}
				891
				892	if (flags & MPOL_F_ADDR) {
				893	/*
				894	* Do NOT fall back to task policy if the
				895	* vma/shared policy at addr is NULL. We
				896	* want to return MPOL_DEFAULT in this case.
				897	*/
				898	down_read(&mm->mmap_sem);
				899	vma = find_vma_intersection(mm, addr, addr+1);
				900	if (!vma) {
				901	up_read(&mm->mmap_sem);
				902	return -EFAULT;
				903	}
				904	if (vma->vm_ops && vma->vm_ops->get_policy)
				905	pol = vma->vm_ops->get_policy(vma, addr);
				906	else
				907	pol = vma->vm_policy;
				908	} else if (addr)
				909	return -EINVAL;
				910
				911	if (!pol)
				912	pol = &default_policy; /* indicates default behavior */
				913
				914	if (flags & MPOL_F_NODE) {
				915	if (flags & MPOL_F_ADDR) {
				916	err = lookup_node(addr);
				917	if (err < 0)
				918	goto out;
				919	*policy = err;
				920	} else if (pol == current->mempolicy &&
				921	pol->mode == MPOL_INTERLEAVE) {
				922	*policy = next_node_in(current->il_prev, pol->v.nodes);
				923	} else {
				924	err = -EINVAL;
				925	goto out;
				926	}
				927	} else {
				928	*policy = pol == &default_policy ? MPOL_DEFAULT :
				929	pol->mode;
				930	/*
				931	* Internal mempolicy flags must be masked off before exposing
				932	* the policy to userspace.
				933	*/
				934	*policy \|= (pol->flags & MPOL_MODE_FLAGS);
				935	}
				936
				937	err = 0;
				938	if (nmask) {
				939	if (mpol_store_user_nodemask(pol)) {
				940	*nmask = pol->w.user_nodemask;
				941	} else {
				942	task_lock(current);
				943	get_policy_nodemask(pol, nmask);
				944	task_unlock(current);
				945	}
				946	}
				947
				948	out:
				949	mpol_cond_put(pol);
				950	if (vma)
				951	up_read(&current->mm->mmap_sem);
				952	return err;
				953	}
				954
				955	#ifdef CONFIG_MIGRATION
				956	/*
				957	* page migration, thp tail pages can be passed.
				958	*/
				959	static int migrate_page_add(struct page page, struct list_head pagelist,
				960	unsigned long flags)
				961	{
				962	struct page *head = compound_head(page);
				963	/*
				964	* Avoid migrating a page that is shared with others.
				965	*/
				966	if ((flags & MPOL_MF_MOVE_ALL) \|\| page_mapcount(head) == 1) {
				967	if (!isolate_lru_page(head)) {
				968	list_add_tail(&head->lru, pagelist);
				969	mod_node_page_state(page_pgdat(head),
				970	NR_ISOLATED_ANON + page_is_file_cache(head),
				971	hpage_nr_pages(head));
				972	} else if (flags & MPOL_MF_STRICT) {
				973	/*
				974	* Non-movable page may reach here. And, there may be
				975	* temporary off LRU pages or non-LRU movable pages.
				976	* Treat them as unmovable pages since they can't be
				977	* isolated, so they can't be moved at the moment. It
				978	* should return -EIO for this case too.
				979	*/
				980	return -EIO;
				981	}
				982	}
				983
				984	return 0;
				985	}
				986
				987	/* page allocation callback for NUMA node migration */
				988	struct page alloc_new_node_page(struct page page, unsigned long node)
				989	{
				990	if (PageHuge(page))
				991	return alloc_huge_page_node(page_hstate(compound_head(page)),
				992	node);
				993	else if (PageTransHuge(page)) {
				994	struct page *thp;
				995
				996	thp = alloc_pages_node(node,
				997	(GFP_TRANSHUGE \| __GFP_THISNODE),
				998	HPAGE_PMD_ORDER);
				999	if (!thp)
				1000	return NULL;
				1001	prep_transhuge_page(thp);
				1002	return thp;
				1003	} else
				1004	return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE \|
				1005	__GFP_THISNODE, 0);
				1006	}
				1007
				1008	/*
				1009	* Migrate pages from one node to a target node.
				1010	* Returns error or the number of pages not migrated.
				1011	*/
				1012	static int migrate_to_node(struct mm_struct *mm, int source, int dest,
				1013	int flags)
				1014	{
				1015	nodemask_t nmask;
				1016	LIST_HEAD(pagelist);
				1017	int err = 0;
				1018
				1019	nodes_clear(nmask);
				1020	node_set(source, nmask);
				1021
				1022	/*
				1023	* This does not "check" the range but isolates all pages that
				1024	* need migration. Between passing in the full user address
				1025	* space range and MPOL_MF_DISCONTIG_OK, this call can not fail.
				1026	*/
				1027	VM_BUG_ON(!(flags & (MPOL_MF_MOVE \| MPOL_MF_MOVE_ALL)));
				1028	queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
				1029	flags \| MPOL_MF_DISCONTIG_OK, &pagelist);
				1030
				1031	if (!list_empty(&pagelist)) {
				1032	err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
				1033	MIGRATE_SYNC, MR_SYSCALL);
				1034	if (err)
				1035	putback_movable_pages(&pagelist);
				1036	}
				1037
				1038	return err;
				1039	}
				1040
				1041	/*
				1042	* Move pages between the two nodesets so as to preserve the physical
				1043	* layout as much as possible.
				1044	*
				1045	* Returns the number of page that could not be moved.
				1046	*/
				1047	int do_migrate_pages(struct mm_struct mm, const nodemask_t from,
				1048	const nodemask_t *to, int flags)
				1049	{
				1050	int busy = 0;
				1051	int err;
				1052	nodemask_t tmp;
				1053
				1054	err = migrate_prep();
				1055	if (err)
				1056	return err;
				1057
				1058	down_read(&mm->mmap_sem);
				1059
				1060	/*
				1061	* Find a 'source' bit set in 'tmp' whose corresponding 'dest'
				1062	* bit in 'to' is not also set in 'tmp'. Clear the found 'source'
				1063	* bit in 'tmp', and return that <source, dest> pair for migration.
				1064	* The pair of nodemasks 'to' and 'from' define the map.
				1065	*
				1066	* If no pair of bits is found that way, fallback to picking some
				1067	* pair of 'source' and 'dest' bits that are not the same. If the
				1068	* 'source' and 'dest' bits are the same, this represents a node
				1069	* that will be migrating to itself, so no pages need move.
				1070	*
				1071	* If no bits are left in 'tmp', or if all remaining bits left
				1072	* in 'tmp' correspond to the same bit in 'to', return false
				1073	* (nothing left to migrate).
				1074	*
				1075	* This lets us pick a pair of nodes to migrate between, such that
				1076	* if possible the dest node is not already occupied by some other
				1077	* source node, minimizing the risk of overloading the memory on a
				1078	* node that would happen if we migrated incoming memory to a node
				1079	* before migrating outgoing memory source that same node.
				1080	*
				1081	* A single scan of tmp is sufficient. As we go, we remember the
				1082	* most recent <s, d> pair that moved (s != d). If we find a pair
				1083	* that not only moved, but what's better, moved to an empty slot
				1084	* (d is not set in tmp), then we break out then, with that pair.
				1085	* Otherwise when we finish scanning from_tmp, we at least have the
				1086	* most recent <s, d> pair that moved. If we get all the way through
				1087	* the scan of tmp without finding any node that moved, much less
				1088	* moved to an empty node, then there is nothing left worth migrating.
				1089	*/
				1090
				1091	tmp = *from;
				1092	while (!nodes_empty(tmp)) {
				1093	int s,d;
				1094	int source = NUMA_NO_NODE;
				1095	int dest = 0;
				1096
				1097	for_each_node_mask(s, tmp) {
				1098
				1099	/*
				1100	* do_migrate_pages() tries to maintain the relative
				1101	* node relationship of the pages established between
				1102	* threads and memory areas.
				1103	*
				1104	* However if the number of source nodes is not equal to
				1105	* the number of destination nodes we can not preserve
				1106	* this node relative relationship. In that case, skip
				1107	* copying memory from a node that is in the destination
				1108	* mask.
				1109	*
				1110	* Example: [2,3,4] -> [3,4,5] moves everything.
				1111	* [0-7] - > [3,4,5] moves only 0,1,2,6,7.
				1112	*/
				1113
				1114	if ((nodes_weight(from) != nodes_weight(to)) &&
				1115	(node_isset(s, *to)))
				1116	continue;
				1117
				1118	d = node_remap(s, from, to);
				1119	if (s == d)
				1120	continue;
				1121
				1122	source = s; /* Node moved. Memorize */
				1123	dest = d;
				1124
				1125	/* dest not in remaining from nodes? */
				1126	if (!node_isset(dest, tmp))
				1127	break;
				1128	}
				1129	if (source == NUMA_NO_NODE)
				1130	break;
				1131
				1132	node_clear(source, tmp);
				1133	err = migrate_to_node(mm, source, dest, flags);
				1134	if (err > 0)
				1135	busy += err;
				1136	if (err < 0)
				1137	break;
				1138	}
				1139	up_read(&mm->mmap_sem);
				1140	if (err < 0)
				1141	return err;
				1142	return busy;
				1143
				1144	}
				1145
				1146	/*
				1147	* Allocate a new page for page migration based on vma policy.
				1148	* Start by assuming the page is mapped by the same vma as contains @start.
				1149	* Search forward from there, if not. N.B., this assumes that the
				1150	* list of pages handed to migrate_pages()--which is how we get here--
				1151	* is in virtual address order.
				1152	*/
				1153	static struct page new_page(struct page page, unsigned long start)
				1154	{
				1155	struct vm_area_struct *vma;
				1156	unsigned long uninitialized_var(address);
				1157
				1158	vma = find_vma(current->mm, start);
				1159	while (vma) {
				1160	address = page_address_in_vma(page, vma);
				1161	if (address != -EFAULT)
				1162	break;
				1163	vma = vma->vm_next;
				1164	}
				1165
				1166	if (PageHuge(page)) {
				1167	return alloc_huge_page_vma(page_hstate(compound_head(page)),
				1168	vma, address);
				1169	} else if (PageTransHuge(page)) {
				1170	struct page *thp;
				1171
				1172	thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
				1173	HPAGE_PMD_ORDER);
				1174	if (!thp)
				1175	return NULL;
				1176	prep_transhuge_page(thp);
				1177	return thp;
				1178	}
				1179	/*
				1180	* if !vma, alloc_page_vma() will use task or system default policy
				1181	*/
				1182	return alloc_page_vma(GFP_HIGHUSER_MOVABLE \| __GFP_RETRY_MAYFAIL,
				1183	vma, address);
				1184	}
				1185	#else
				1186
				1187	static int migrate_page_add(struct page page, struct list_head pagelist,
				1188	unsigned long flags)
				1189	{
				1190	return -EIO;
				1191	}
				1192
				1193	int do_migrate_pages(struct mm_struct mm, const nodemask_t from,
				1194	const nodemask_t *to, int flags)
				1195	{
				1196	return -ENOSYS;
				1197	}
				1198
				1199	static struct page new_page(struct page page, unsigned long start)
				1200	{
				1201	return NULL;
				1202	}
				1203	#endif
				1204
				1205	static long do_mbind(unsigned long start, unsigned long len,
				1206	unsigned short mode, unsigned short mode_flags,
				1207	nodemask_t *nmask, unsigned long flags)
				1208	{
				1209	struct mm_struct *mm = current->mm;
				1210	struct mempolicy *new;
				1211	unsigned long end;
				1212	int err;
				1213	int ret;
				1214	LIST_HEAD(pagelist);
				1215
				1216	if (flags & ~(unsigned long)MPOL_MF_VALID)
				1217	return -EINVAL;
				1218	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
				1219	return -EPERM;
				1220
				1221	if (start & ~PAGE_MASK)
				1222	return -EINVAL;
				1223
				1224	if (mode == MPOL_DEFAULT)
				1225	flags &= ~MPOL_MF_STRICT;
				1226
				1227	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
				1228	end = start + len;
				1229
				1230	if (end < start)
				1231	return -EINVAL;
				1232	if (end == start)
				1233	return 0;
				1234
				1235	new = mpol_new(mode, mode_flags, nmask);
				1236	if (IS_ERR(new))
				1237	return PTR_ERR(new);
				1238
				1239	if (flags & MPOL_MF_LAZY)
				1240	new->flags \|= MPOL_F_MOF;
				1241
				1242	/*
				1243	* If we are using the default policy then operation
				1244	* on discontinuous address spaces is okay after all
				1245	*/
				1246	if (!new)
				1247	flags \|= MPOL_MF_DISCONTIG_OK;
				1248
				1249	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
				1250	start, start + len, mode, mode_flags,
				1251	nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
				1252
				1253	if (flags & (MPOL_MF_MOVE \| MPOL_MF_MOVE_ALL)) {
				1254
				1255	err = migrate_prep();
				1256	if (err)
				1257	goto mpol_out;
				1258	}
				1259	{
				1260	NODEMASK_SCRATCH(scratch);
				1261	if (scratch) {
				1262	down_write(&mm->mmap_sem);
				1263	task_lock(current);
				1264	err = mpol_set_nodemask(new, nmask, scratch);
				1265	task_unlock(current);
				1266	if (err)
				1267	up_write(&mm->mmap_sem);
				1268	} else
				1269	err = -ENOMEM;
				1270	NODEMASK_SCRATCH_FREE(scratch);
				1271	}
				1272	if (err)
				1273	goto mpol_out;
				1274
				1275	ret = queue_pages_range(mm, start, end, nmask,
				1276	flags \| MPOL_MF_INVERT, &pagelist);
				1277
				1278	if (ret < 0) {
				1279	err = ret;
				1280	goto up_out;
				1281	}
				1282
				1283	err = mbind_range(mm, start, end, new);
				1284
				1285	if (!err) {
				1286	int nr_failed = 0;
				1287
				1288	if (!list_empty(&pagelist)) {
				1289	WARN_ON_ONCE(flags & MPOL_MF_LAZY);
				1290	nr_failed = migrate_pages(&pagelist, new_page, NULL,
				1291	start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
				1292	if (nr_failed)
				1293	putback_movable_pages(&pagelist);
				1294	}
				1295
				1296	if ((ret > 0) \|\| (nr_failed && (flags & MPOL_MF_STRICT)))
				1297	err = -EIO;
				1298	} else {
				1299	up_out:
				1300	if (!list_empty(&pagelist))
				1301	putback_movable_pages(&pagelist);
				1302	}
				1303
				1304	up_write(&mm->mmap_sem);
				1305	mpol_out:
				1306	mpol_put(new);
				1307	return err;
				1308	}
				1309
				1310	/*
				1311	* User space interface with variable sized bitmaps for nodelists.
				1312	*/
				1313
				1314	/* Copy a node mask from user space. */
				1315	static int get_nodes(nodemask_t nodes, const unsigned long __user nmask,
				1316	unsigned long maxnode)
				1317	{
				1318	unsigned long k;
				1319	unsigned long t;
				1320	unsigned long nlongs;
				1321	unsigned long endmask;
				1322
				1323	--maxnode;
				1324	nodes_clear(*nodes);
				1325	if (maxnode == 0 \|\| !nmask)
				1326	return 0;
				1327	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
				1328	return -EINVAL;
				1329
				1330	nlongs = BITS_TO_LONGS(maxnode);
				1331	if ((maxnode % BITS_PER_LONG) == 0)
				1332	endmask = ~0UL;
				1333	else
				1334	endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
				1335
				1336	/*
				1337	* When the user specified more nodes than supported just check
				1338	* if the non supported part is all zero.
				1339	*
				1340	* If maxnode have more longs than MAX_NUMNODES, check
				1341	* the bits in that area first. And then go through to
				1342	* check the rest bits which equal or bigger than MAX_NUMNODES.
				1343	* Otherwise, just check bits [MAX_NUMNODES, maxnode).
				1344	*/
				1345	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
				1346	for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
				1347	if (get_user(t, nmask + k))
				1348	return -EFAULT;
				1349	if (k == nlongs - 1) {
				1350	if (t & endmask)
				1351	return -EINVAL;
				1352	} else if (t)
				1353	return -EINVAL;
				1354	}
				1355	nlongs = BITS_TO_LONGS(MAX_NUMNODES);
				1356	endmask = ~0UL;
				1357	}
				1358
				1359	if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) {
				1360	unsigned long valid_mask = endmask;
				1361
				1362	valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
				1363	if (get_user(t, nmask + nlongs - 1))
				1364	return -EFAULT;
				1365	if (t & valid_mask)
				1366	return -EINVAL;
				1367	}
				1368
				1369	if (copy_from_user(nodes_addr(nodes), nmask, nlongssizeof(unsigned long)))
				1370	return -EFAULT;
				1371	nodes_addr(*nodes)[nlongs-1] &= endmask;
				1372	return 0;
				1373	}
				1374
				1375	/* Copy a kernel node mask to user space */
				1376	static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
				1377	nodemask_t *nodes)
				1378	{
				1379	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
				1380	unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
				1381
				1382	if (copy > nbytes) {
				1383	if (copy > PAGE_SIZE)
				1384	return -EINVAL;
				1385	if (clear_user((char __user *)mask + nbytes, copy - nbytes))
				1386	return -EFAULT;
				1387	copy = nbytes;
				1388	}
				1389	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
				1390	}
				1391
				1392	static long kernel_mbind(unsigned long start, unsigned long len,
				1393	unsigned long mode, const unsigned long __user *nmask,
				1394	unsigned long maxnode, unsigned int flags)
				1395	{
				1396	nodemask_t nodes;
				1397	int err;
				1398	unsigned short mode_flags;
				1399
				1400	start = untagged_addr(start);
				1401	mode_flags = mode & MPOL_MODE_FLAGS;
				1402	mode &= ~MPOL_MODE_FLAGS;
				1403	if (mode >= MPOL_MAX)
				1404	return -EINVAL;
				1405	if ((mode_flags & MPOL_F_STATIC_NODES) &&
				1406	(mode_flags & MPOL_F_RELATIVE_NODES))
				1407	return -EINVAL;
				1408	err = get_nodes(&nodes, nmask, maxnode);
				1409	if (err)
				1410	return err;
				1411	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
				1412	}
				1413
				1414	SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
				1415	unsigned long, mode, const unsigned long __user *, nmask,
				1416	unsigned long, maxnode, unsigned int, flags)
				1417	{
				1418	return kernel_mbind(start, len, mode, nmask, maxnode, flags);
				1419	}
				1420
				1421	/* Set the process memory policy */
				1422	static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
				1423	unsigned long maxnode)
				1424	{
				1425	int err;
				1426	nodemask_t nodes;
				1427	unsigned short flags;
				1428
				1429	flags = mode & MPOL_MODE_FLAGS;
				1430	mode &= ~MPOL_MODE_FLAGS;
				1431	if ((unsigned int)mode >= MPOL_MAX)
				1432	return -EINVAL;
				1433	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
				1434	return -EINVAL;
				1435	err = get_nodes(&nodes, nmask, maxnode);
				1436	if (err)
				1437	return err;
				1438	return do_set_mempolicy(mode, flags, &nodes);
				1439	}
				1440
				1441	SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
				1442	unsigned long, maxnode)
				1443	{
				1444	return kernel_set_mempolicy(mode, nmask, maxnode);
				1445	}
				1446
				1447	static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
				1448	const unsigned long __user *old_nodes,
				1449	const unsigned long __user *new_nodes)
				1450	{
				1451	struct mm_struct *mm = NULL;
				1452	struct task_struct *task;
				1453	nodemask_t task_nodes;
				1454	int err;
				1455	nodemask_t *old;
				1456	nodemask_t *new;
				1457	NODEMASK_SCRATCH(scratch);
				1458
				1459	if (!scratch)
				1460	return -ENOMEM;
				1461
				1462	old = &scratch->mask1;
				1463	new = &scratch->mask2;
				1464
				1465	err = get_nodes(old, old_nodes, maxnode);
				1466	if (err)
				1467	goto out;
				1468
				1469	err = get_nodes(new, new_nodes, maxnode);
				1470	if (err)
				1471	goto out;
				1472
				1473	/* Find the mm_struct */
				1474	rcu_read_lock();
				1475	task = pid ? find_task_by_vpid(pid) : current;
				1476	if (!task) {
				1477	rcu_read_unlock();
				1478	err = -ESRCH;
				1479	goto out;
				1480	}
				1481	get_task_struct(task);
				1482
				1483	err = -EINVAL;
				1484
				1485	/*
				1486	* Check if this process has the right to modify the specified process.
				1487	* Use the regular "ptrace_may_access()" checks.
				1488	*/
				1489	if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
				1490	rcu_read_unlock();
				1491	err = -EPERM;
				1492	goto out_put;
				1493	}
				1494	rcu_read_unlock();
				1495
				1496	task_nodes = cpuset_mems_allowed(task);
				1497	/* Is the user allowed to access the target nodes? */
				1498	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
				1499	err = -EPERM;
				1500	goto out_put;
				1501	}
				1502
				1503	task_nodes = cpuset_mems_allowed(current);
				1504	nodes_and(new, new, task_nodes);
				1505	if (nodes_empty(*new))
				1506	goto out_put;
				1507
				1508	nodes_and(new, new, node_states[N_MEMORY]);
				1509	if (nodes_empty(*new))
				1510	goto out_put;
				1511
				1512	err = security_task_movememory(task);
				1513	if (err)
				1514	goto out_put;
				1515
				1516	mm = get_task_mm(task);
				1517	put_task_struct(task);
				1518
				1519	if (!mm) {
				1520	err = -EINVAL;
				1521	goto out;
				1522	}
				1523
				1524	err = do_migrate_pages(mm, old, new,
				1525	capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
				1526
				1527	mmput(mm);
				1528	out:
				1529	NODEMASK_SCRATCH_FREE(scratch);
				1530
				1531	return err;
				1532
				1533	out_put:
				1534	put_task_struct(task);
				1535	goto out;
				1536
				1537	}
				1538
				1539	SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
				1540	const unsigned long __user *, old_nodes,
				1541	const unsigned long __user *, new_nodes)
				1542	{
				1543	return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
				1544	}
				1545
				1546
				1547	/* Retrieve NUMA policy */
				1548	static int kernel_get_mempolicy(int __user *policy,
				1549	unsigned long __user *nmask,
				1550	unsigned long maxnode,
				1551	unsigned long addr,
				1552	unsigned long flags)
				1553	{
				1554	int err;
				1555	int uninitialized_var(pval);
				1556	nodemask_t nodes;
				1557
				1558	addr = untagged_addr(addr);
				1559
				1560	if (nmask != NULL && maxnode < nr_node_ids)
				1561	return -EINVAL;
				1562
				1563	err = do_get_mempolicy(&pval, &nodes, addr, flags);
				1564
				1565	if (err)
				1566	return err;
				1567
				1568	if (policy && put_user(pval, policy))
				1569	return -EFAULT;
				1570
				1571	if (nmask)
				1572	err = copy_nodes_to_user(nmask, maxnode, &nodes);
				1573
				1574	return err;
				1575	}
				1576
				1577	SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
				1578	unsigned long __user *, nmask, unsigned long, maxnode,
				1579	unsigned long, addr, unsigned long, flags)
				1580	{
				1581	return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
				1582	}
				1583
				1584	#ifdef CONFIG_COMPAT
				1585
				1586	COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
				1587	compat_ulong_t __user *, nmask,
				1588	compat_ulong_t, maxnode,
				1589	compat_ulong_t, addr, compat_ulong_t, flags)
				1590	{
				1591	long err;
				1592	unsigned long __user *nm = NULL;
				1593	unsigned long nr_bits, alloc_size;
				1594	DECLARE_BITMAP(bm, MAX_NUMNODES);
				1595
				1596	nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
				1597	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
				1598
				1599	if (nmask)
				1600	nm = compat_alloc_user_space(alloc_size);
				1601
				1602	err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
				1603
				1604	if (!err && nmask) {
				1605	unsigned long copy_size;
				1606	copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
				1607	err = copy_from_user(bm, nm, copy_size);
				1608	/* ensure entire bitmap is zeroed */
				1609	err \|= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
				1610	err \|= compat_put_bitmap(nmask, bm, nr_bits);
				1611	}
				1612
				1613	return err;
				1614	}
				1615
				1616	COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask,
				1617	compat_ulong_t, maxnode)
				1618	{
				1619	unsigned long __user *nm = NULL;
				1620	unsigned long nr_bits, alloc_size;
				1621	DECLARE_BITMAP(bm, MAX_NUMNODES);
				1622
				1623	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
				1624	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
				1625
				1626	if (nmask) {
				1627	if (compat_get_bitmap(bm, nmask, nr_bits))
				1628	return -EFAULT;
				1629	nm = compat_alloc_user_space(alloc_size);
				1630	if (copy_to_user(nm, bm, alloc_size))
				1631	return -EFAULT;
				1632	}
				1633
				1634	return kernel_set_mempolicy(mode, nm, nr_bits+1);
				1635	}
				1636
				1637	COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
				1638	compat_ulong_t, mode, compat_ulong_t __user *, nmask,
				1639	compat_ulong_t, maxnode, compat_ulong_t, flags)
				1640	{
				1641	unsigned long __user *nm = NULL;
				1642	unsigned long nr_bits, alloc_size;
				1643	nodemask_t bm;
				1644
				1645	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
				1646	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
				1647
				1648	if (nmask) {
				1649	if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits))
				1650	return -EFAULT;
				1651	nm = compat_alloc_user_space(alloc_size);
				1652	if (copy_to_user(nm, nodes_addr(bm), alloc_size))
				1653	return -EFAULT;
				1654	}
				1655
				1656	return kernel_mbind(start, len, mode, nm, nr_bits+1, flags);
				1657	}
				1658
				1659	COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
				1660	compat_ulong_t, maxnode,
				1661	const compat_ulong_t __user *, old_nodes,
				1662	const compat_ulong_t __user *, new_nodes)
				1663	{
				1664	unsigned long __user *old = NULL;
				1665	unsigned long __user *new = NULL;
				1666	nodemask_t tmp_mask;
				1667	unsigned long nr_bits;
				1668	unsigned long size;
				1669
				1670	nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
				1671	size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
				1672	if (old_nodes) {
				1673	if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
				1674	return -EFAULT;
				1675	old = compat_alloc_user_space(new_nodes ? size * 2 : size);
				1676	if (new_nodes)
				1677	new = old + size / sizeof(unsigned long);
				1678	if (copy_to_user(old, nodes_addr(tmp_mask), size))
				1679	return -EFAULT;
				1680	}
				1681	if (new_nodes) {
				1682	if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
				1683	return -EFAULT;
				1684	if (new == NULL)
				1685	new = compat_alloc_user_space(size);
				1686	if (copy_to_user(new, nodes_addr(tmp_mask), size))
				1687	return -EFAULT;
				1688	}
				1689	return kernel_migrate_pages(pid, nr_bits + 1, old, new);
				1690	}
				1691
				1692	#endif /* CONFIG_COMPAT */
				1693
				1694	struct mempolicy __get_vma_policy(struct vm_area_struct vma,
				1695	unsigned long addr)
				1696	{
				1697	struct mempolicy *pol = NULL;
				1698
				1699	if (vma) {
				1700	if (vma->vm_ops && vma->vm_ops->get_policy) {
				1701	pol = vma->vm_ops->get_policy(vma, addr);
				1702	} else if (vma->vm_policy) {
				1703	pol = vma->vm_policy;
				1704
				1705	/*
				1706	* shmem_alloc_page() passes MPOL_F_SHARED policy with
				1707	* a pseudo vma whose vma->vm_ops=NULL. Take a reference
				1708	* count on these policies which will be dropped by
				1709	* mpol_cond_put() later
				1710	*/
				1711	if (mpol_needs_cond_ref(pol))
				1712	mpol_get(pol);
				1713	}
				1714	}
				1715
				1716	return pol;
				1717	}
				1718
				1719	/*
				1720	* get_vma_policy(@vma, @addr)
				1721	* @vma: virtual memory area whose policy is sought
				1722	* @addr: address in @vma for shared policy lookup
				1723	*
				1724	* Returns effective policy for a VMA at specified address.
				1725	* Falls back to current->mempolicy or system default policy, as necessary.
				1726	* Shared policies [those marked as MPOL_F_SHARED] require an extra reference
				1727	* count--added by the get_policy() vm_op, as appropriate--to protect against
				1728	* freeing by another task. It is the caller's responsibility to free the
				1729	* extra reference for shared policies.
				1730	*/
				1731	static struct mempolicy get_vma_policy(struct vm_area_struct vma,
				1732	unsigned long addr)
				1733	{
				1734	struct mempolicy *pol = __get_vma_policy(vma, addr);
				1735
				1736	if (!pol)
				1737	pol = get_task_policy(current);
				1738
				1739	return pol;
				1740	}
				1741
				1742	bool vma_policy_mof(struct vm_area_struct *vma)
				1743	{
				1744	struct mempolicy *pol;
				1745
				1746	if (vma->vm_ops && vma->vm_ops->get_policy) {
				1747	bool ret = false;
				1748
				1749	pol = vma->vm_ops->get_policy(vma, vma->vm_start);
				1750	if (pol && (pol->flags & MPOL_F_MOF))
				1751	ret = true;
				1752	mpol_cond_put(pol);
				1753
				1754	return ret;
				1755	}
				1756
				1757	pol = vma->vm_policy;
				1758	if (!pol)
				1759	pol = get_task_policy(current);
				1760
				1761	return pol->flags & MPOL_F_MOF;
				1762	}
				1763
				1764	static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
				1765	{
				1766	enum zone_type dynamic_policy_zone = policy_zone;
				1767
				1768	BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
				1769
				1770	/*
				1771	* if policy->v.nodes has movable memory only,
				1772	* we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
				1773	*
				1774	* policy->v.nodes is intersect with node_states[N_MEMORY].
				1775	* so if the following test faile, it implies
				1776	* policy->v.nodes has movable memory only.
				1777	*/
				1778	if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
				1779	dynamic_policy_zone = ZONE_MOVABLE;
				1780
				1781	return zone >= dynamic_policy_zone;
				1782	}
				1783
				1784	/*
				1785	* Return a nodemask representing a mempolicy for filtering nodes for
				1786	* page allocation
				1787	*/
				1788	static nodemask_t policy_nodemask(gfp_t gfp, struct mempolicy policy)
				1789	{
				1790	/* Lower zones don't get a nodemask applied for MPOL_BIND */
				1791	if (unlikely(policy->mode == MPOL_BIND) &&
				1792	apply_policy_zone(policy, gfp_zone(gfp)) &&
				1793	cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
				1794	return &policy->v.nodes;
				1795
				1796	return NULL;
				1797	}
				1798
				1799	/* Return the node id preferred by the given mempolicy, or the given id */
				1800	static int policy_node(gfp_t gfp, struct mempolicy *policy,
				1801	int nd)
				1802	{
				1803	if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
				1804	nd = policy->v.preferred_node;
				1805	else {
				1806	/*
				1807	* __GFP_THISNODE shouldn't even be used with the bind policy
				1808	* because we might easily break the expectation to stay on the
				1809	* requested node and not break the policy.
				1810	*/
				1811	WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
				1812	}
				1813
				1814	return nd;
				1815	}
				1816
				1817	/* Do dynamic interleaving for a process */
				1818	static unsigned interleave_nodes(struct mempolicy *policy)
				1819	{
				1820	unsigned next;
				1821	struct task_struct *me = current;
				1822
				1823	next = next_node_in(me->il_prev, policy->v.nodes);
				1824	if (next < MAX_NUMNODES)
				1825	me->il_prev = next;
				1826	return next;
				1827	}
				1828
				1829	/*
				1830	* Depending on the memory policy provide a node from which to allocate the
				1831	* next slab entry.
				1832	*/
				1833	unsigned int mempolicy_slab_node(void)
				1834	{
				1835	struct mempolicy *policy;
				1836	int node = numa_mem_id();
				1837
				1838	if (in_interrupt())
				1839	return node;
				1840
				1841	policy = current->mempolicy;
				1842	if (!policy \|\| policy->flags & MPOL_F_LOCAL)
				1843	return node;
				1844
				1845	switch (policy->mode) {
				1846	case MPOL_PREFERRED:
				1847	/*
				1848	* handled MPOL_F_LOCAL above
				1849	*/
				1850	return policy->v.preferred_node;
				1851
				1852	case MPOL_INTERLEAVE:
				1853	return interleave_nodes(policy);
				1854
				1855	case MPOL_BIND: {
				1856	struct zoneref *z;
				1857
				1858	/*
				1859	* Follow bind policy behavior and start allocation at the
				1860	* first node.
				1861	*/
				1862	struct zonelist *zonelist;
				1863	enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
				1864	zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
				1865	z = first_zones_zonelist(zonelist, highest_zoneidx,
				1866	&policy->v.nodes);
				1867	return z->zone ? zone_to_nid(z->zone) : node;
				1868	}
				1869
				1870	default:
				1871	BUG();
				1872	}
				1873	}
				1874
				1875	/*
				1876	* Do static interleaving for a VMA with known offset @n. Returns the n'th
				1877	* node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
				1878	* number of present nodes.
				1879	*/
				1880	static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
				1881	{
				1882	unsigned nnodes = nodes_weight(pol->v.nodes);
				1883	unsigned target;
				1884	int i;
				1885	int nid;
				1886
				1887	if (!nnodes)
				1888	return numa_node_id();
				1889	target = (unsigned int)n % nnodes;
				1890	nid = first_node(pol->v.nodes);
				1891	for (i = 0; i < target; i++)
				1892	nid = next_node(nid, pol->v.nodes);
				1893	return nid;
				1894	}
				1895
				1896	/* Determine a node number for interleave */
				1897	static inline unsigned interleave_nid(struct mempolicy *pol,
				1898	struct vm_area_struct *vma, unsigned long addr, int shift)
				1899	{
				1900	if (vma) {
				1901	unsigned long off;
				1902
				1903	/*
				1904	* for small pages, there is no difference between
				1905	* shift and PAGE_SHIFT, so the bit-shift is safe.
				1906	* for huge pages, since vm_pgoff is in units of small
				1907	* pages, we need to shift off the always 0 bits to get
				1908	* a useful offset.
				1909	*/
				1910	BUG_ON(shift < PAGE_SHIFT);
				1911	off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
				1912	off += (addr - vma->vm_start) >> shift;
				1913	return offset_il_node(pol, off);
				1914	} else
				1915	return interleave_nodes(pol);
				1916	}
				1917
				1918	#ifdef CONFIG_HUGETLBFS
				1919	/*
				1920	* huge_node(@vma, @addr, @gfp_flags, @mpol)
				1921	* @vma: virtual memory area whose policy is sought
				1922	* @addr: address in @vma for shared policy lookup and interleave policy
				1923	* @gfp_flags: for requested zone
				1924	* @mpol: pointer to mempolicy pointer for reference counted mempolicy
				1925	* @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask
				1926	*
				1927	* Returns a nid suitable for a huge page allocation and a pointer
				1928	* to the struct mempolicy for conditional unref after allocation.
				1929	* If the effective policy is 'BIND, returns a pointer to the mempolicy's
				1930	* @nodemask for filtering the zonelist.
				1931	*
				1932	* Must be protected by read_mems_allowed_begin()
				1933	*/
				1934	int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
				1935	struct mempolicy mpol, nodemask_t nodemask)
				1936	{
				1937	int nid;
				1938
				1939	*mpol = get_vma_policy(vma, addr);
				1940	nodemask = NULL; / assume !MPOL_BIND */
				1941
				1942	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
				1943	nid = interleave_nid(*mpol, vma, addr,
				1944	huge_page_shift(hstate_vma(vma)));
				1945	} else {
				1946	nid = policy_node(gfp_flags, *mpol, numa_node_id());
				1947	if ((*mpol)->mode == MPOL_BIND)
				1948	nodemask = &(mpol)->v.nodes;
				1949	}
				1950	return nid;
				1951	}
				1952
				1953	/*
				1954	* init_nodemask_of_mempolicy
				1955	*
				1956	* If the current task's mempolicy is "default" [NULL], return 'false'
				1957	* to indicate default policy. Otherwise, extract the policy nodemask
				1958	* for 'bind' or 'interleave' policy into the argument nodemask, or
				1959	* initialize the argument nodemask to contain the single node for
				1960	* 'preferred' or 'local' policy and return 'true' to indicate presence
				1961	* of non-default mempolicy.
				1962	*
				1963	* We don't bother with reference counting the mempolicy [mpol_get/put]
				1964	* because the current task is examining it's own mempolicy and a task's
				1965	* mempolicy is only ever changed by the task itself.
				1966	*
				1967	* N.B., it is the caller's responsibility to free a returned nodemask.
				1968	*/
				1969	bool init_nodemask_of_mempolicy(nodemask_t *mask)
				1970	{
				1971	struct mempolicy *mempolicy;
				1972	int nid;
				1973
				1974	if (!(mask && current->mempolicy))
				1975	return false;
				1976
				1977	task_lock(current);
				1978	mempolicy = current->mempolicy;
				1979	switch (mempolicy->mode) {
				1980	case MPOL_PREFERRED:
				1981	if (mempolicy->flags & MPOL_F_LOCAL)
				1982	nid = numa_node_id();
				1983	else
				1984	nid = mempolicy->v.preferred_node;
				1985	init_nodemask_of_node(mask, nid);
				1986	break;
				1987
				1988	case MPOL_BIND:
				1989	/* Fall through */
				1990	case MPOL_INTERLEAVE:
				1991	*mask = mempolicy->v.nodes;
				1992	break;
				1993
				1994	default:
				1995	BUG();
				1996	}
				1997	task_unlock(current);
				1998
				1999	return true;
				2000	}
				2001	#endif
				2002
				2003	/*
				2004	* mempolicy_nodemask_intersects
				2005	*
				2006	* If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
				2007	* policy. Otherwise, check for intersection between mask and the policy
				2008	* nodemask for 'bind' or 'interleave' policy. For 'perferred' or 'local'
				2009	* policy, always return true since it may allocate elsewhere on fallback.
				2010	*
				2011	* Takes task_lock(tsk) to prevent freeing of its mempolicy.
				2012	*/
				2013	bool mempolicy_nodemask_intersects(struct task_struct *tsk,
				2014	const nodemask_t *mask)
				2015	{
				2016	struct mempolicy *mempolicy;
				2017	bool ret = true;
				2018
				2019	if (!mask)
				2020	return ret;
				2021	task_lock(tsk);
				2022	mempolicy = tsk->mempolicy;
				2023	if (!mempolicy)
				2024	goto out;
				2025
				2026	switch (mempolicy->mode) {
				2027	case MPOL_PREFERRED:
				2028	/*
				2029	* MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
				2030	* allocate from, they may fallback to other nodes when oom.
				2031	* Thus, it's possible for tsk to have allocated memory from
				2032	* nodes in mask.
				2033	*/
				2034	break;
				2035	case MPOL_BIND:
				2036	case MPOL_INTERLEAVE:
				2037	ret = nodes_intersects(mempolicy->v.nodes, *mask);
				2038	break;
				2039	default:
				2040	BUG();
				2041	}
				2042	out:
				2043	task_unlock(tsk);
				2044	return ret;
				2045	}
				2046
				2047	/* Allocate a page in interleaved policy.
				2048	Own path because it needs to do special accounting. */
				2049	static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
				2050	unsigned nid)
				2051	{
				2052	struct page *page;
				2053
				2054	page = __alloc_pages(gfp, order, nid);
				2055	/* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */
				2056	if (!static_branch_likely(&vm_numa_stat_key))
				2057	return page;
				2058	if (page && page_to_nid(page) == nid) {
				2059	preempt_disable();
				2060	__inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT);
				2061	preempt_enable();
				2062	}
				2063	return page;
				2064	}
				2065
				2066	/**
				2067	* alloc_pages_vma - Allocate a page for a VMA.
				2068	*
				2069	* @gfp:
				2070	* %GFP_USER user allocation.
				2071	* %GFP_KERNEL kernel allocations,
				2072	* %GFP_HIGHMEM highmem/user allocations,
				2073	* %GFP_FS allocation should not call back into a file system.
				2074	* %GFP_ATOMIC don't sleep.
				2075	*
				2076	* @order:Order of the GFP allocation.
				2077	* @vma: Pointer to VMA or NULL if not available.
				2078	* @addr: Virtual Address of the allocation. Must be inside the VMA.
				2079	* @node: Which node to prefer for allocation (modulo policy).
				2080	* @hugepage: for hugepages try only the preferred node if possible
				2081	*
				2082	* This function allocates a page from the kernel page pool and applies
				2083	* a NUMA policy associated with the VMA or the current process.
				2084	* When VMA is not NULL caller must hold down_read on the mmap_sem of the
				2085	* mm_struct of the VMA to prevent it from going away. Should be used for
				2086	* all allocations for pages that will be mapped into user space. Returns
				2087	* NULL when no page can be allocated.
				2088	*/
				2089	struct page *
				2090	alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
				2091	unsigned long addr, int node, bool hugepage)
				2092	{
				2093	struct mempolicy *pol;
				2094	struct page *page;
				2095	int preferred_nid;
				2096	nodemask_t *nmask;
				2097
				2098	pol = get_vma_policy(vma, addr);
				2099
				2100	if (pol->mode == MPOL_INTERLEAVE) {
				2101	unsigned nid;
				2102
				2103	nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
				2104	mpol_cond_put(pol);
				2105	page = alloc_page_interleave(gfp, order, nid);
				2106	goto out;
				2107	}
				2108
				2109	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
				2110	int hpage_node = node;
				2111
				2112	/*
				2113	* For hugepage allocation and non-interleave policy which
				2114	* allows the current node (or other explicitly preferred
				2115	* node) we only try to allocate from the current/preferred
				2116	* node and don't fall back to other nodes, as the cost of
				2117	* remote accesses would likely offset THP benefits.
				2118	*
				2119	* If the policy is interleave, or does not allow the current
				2120	* node in its nodemask, we allocate the standard way.
				2121	*/
				2122	if (pol->mode == MPOL_PREFERRED &&
				2123	!(pol->flags & MPOL_F_LOCAL))
				2124	hpage_node = pol->v.preferred_node;
				2125
				2126	nmask = policy_nodemask(gfp, pol);
				2127	if (!nmask \|\| node_isset(hpage_node, *nmask)) {
				2128	mpol_cond_put(pol);
				2129	/*
				2130	* We cannot invoke reclaim if __GFP_THISNODE
				2131	* is set. Invoking reclaim with
				2132	* __GFP_THISNODE set, would cause THP
				2133	* allocations to trigger heavy swapping
				2134	* despite there may be tons of free memory
				2135	* (including potentially plenty of THP
				2136	* already available in the buddy) on all the
				2137	* other NUMA nodes.
				2138	*
				2139	* At most we could invoke compaction when
				2140	* __GFP_THISNODE is set (but we would need to
				2141	* refrain from invoking reclaim even if
				2142	* compaction returned COMPACT_SKIPPED because
				2143	* there wasn't not enough memory to succeed
				2144	* compaction). For now just avoid
				2145	* __GFP_THISNODE instead of limiting the
				2146	* allocation path to a strict and single
				2147	* compaction invocation.
				2148	*
				2149	* Supposedly if direct reclaim was enabled by
				2150	* the caller, the app prefers THP regardless
				2151	* of the node it comes from so this would be
				2152	* more desiderable behavior than only
				2153	* providing THP originated from the local
				2154	* node in such case.
				2155	*/
				2156	if (!(gfp & __GFP_DIRECT_RECLAIM))
				2157	gfp \|= __GFP_THISNODE;
				2158	page = __alloc_pages_node(hpage_node, gfp, order);
				2159	goto out;
				2160	}
				2161	}
				2162
				2163	nmask = policy_nodemask(gfp, pol);
				2164	preferred_nid = policy_node(gfp, pol, node);
				2165	page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
				2166	mpol_cond_put(pol);
				2167	out:
				2168	return page;
				2169	}
				2170
				2171	/**
				2172	* alloc_pages_current - Allocate pages.
				2173	*
				2174	* @gfp:
				2175	* %GFP_USER user allocation,
				2176	* %GFP_KERNEL kernel allocation,
				2177	* %GFP_HIGHMEM highmem allocation,
				2178	* %GFP_FS don't call back into a file system.
				2179	* %GFP_ATOMIC don't sleep.
				2180	* @order: Power of two of allocation size in pages. 0 is a single page.
				2181	*
				2182	* Allocate a page from the kernel page pool. When not in
				2183	* interrupt context and apply the current process NUMA policy.
				2184	* Returns NULL when no page can be allocated.
				2185	*/
				2186	struct page *alloc_pages_current(gfp_t gfp, unsigned order)
				2187	{
				2188	struct mempolicy *pol = &default_policy;
				2189	struct page *page;
				2190
				2191	if (!in_interrupt() && !(gfp & __GFP_THISNODE))
				2192	pol = get_task_policy(current);
				2193
				2194	/*
				2195	* No reference counting needed for current->mempolicy
				2196	* nor system default_policy
				2197	*/
				2198	if (pol->mode == MPOL_INTERLEAVE)
				2199	page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
				2200	else
				2201	page = __alloc_pages_nodemask(gfp, order,
				2202	policy_node(gfp, pol, numa_node_id()),
				2203	policy_nodemask(gfp, pol));
				2204
				2205	return page;
				2206	}
				2207	EXPORT_SYMBOL(alloc_pages_current);
				2208
				2209	int vma_dup_policy(struct vm_area_struct src, struct vm_area_struct dst)
				2210	{
				2211	struct mempolicy *pol = mpol_dup(vma_policy(src));
				2212
				2213	if (IS_ERR(pol))
				2214	return PTR_ERR(pol);
				2215	dst->vm_policy = pol;
				2216	return 0;
				2217	}
				2218
				2219	/*
				2220	* If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
				2221	* rebinds the mempolicy its copying by calling mpol_rebind_policy()
				2222	* with the mems_allowed returned by cpuset_mems_allowed(). This
				2223	* keeps mempolicies cpuset relative after its cpuset moves. See
				2224	* further kernel/cpuset.c update_nodemask().
				2225	*
				2226	* current's mempolicy may be rebinded by the other task(the task that changes
				2227	* cpuset's mems), so we needn't do rebind work for current task.
				2228	*/
				2229
				2230	/* Slow path of a mempolicy duplicate */
				2231	struct mempolicy __mpol_dup(struct mempolicy old)
				2232	{
				2233	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
				2234
				2235	if (!new)
				2236	return ERR_PTR(-ENOMEM);
				2237
				2238	/* task's mempolicy is protected by alloc_lock */
				2239	if (old == current->mempolicy) {
				2240	task_lock(current);
				2241	new = old;
				2242	task_unlock(current);
				2243	} else
				2244	new = old;
				2245
				2246	if (current_cpuset_is_being_rebound()) {
				2247	nodemask_t mems = cpuset_mems_allowed(current);
				2248	mpol_rebind_policy(new, &mems);
				2249	}
				2250	atomic_set(&new->refcnt, 1);
				2251	return new;
				2252	}
				2253
				2254	/* Slow path of a mempolicy comparison */
				2255	bool __mpol_equal(struct mempolicy a, struct mempolicy b)
				2256	{
				2257	if (!a \|\| !b)
				2258	return false;
				2259	if (a->mode != b->mode)
				2260	return false;
				2261	if (a->flags != b->flags)
				2262	return false;
				2263	if (mpol_store_user_nodemask(a))
				2264	if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
				2265	return false;
				2266
				2267	switch (a->mode) {
				2268	case MPOL_BIND:
				2269	/* Fall through */
				2270	case MPOL_INTERLEAVE:
				2271	return !!nodes_equal(a->v.nodes, b->v.nodes);
				2272	case MPOL_PREFERRED:
				2273	/* a's ->flags is the same as b's */
				2274	if (a->flags & MPOL_F_LOCAL)
				2275	return true;
				2276	return a->v.preferred_node == b->v.preferred_node;
				2277	default:
				2278	BUG();
				2279	return false;
				2280	}
				2281	}
				2282
				2283	/*
				2284	* Shared memory backing store policy support.
				2285	*
				2286	* Remember policies even when nobody has shared memory mapped.
				2287	* The policies are kept in Red-Black tree linked from the inode.
				2288	* They are protected by the sp->lock rwlock, which should be held
				2289	* for any accesses to the tree.
				2290	*/
				2291
				2292	/*
				2293	* lookup first element intersecting start-end. Caller holds sp->lock for
				2294	* reading or for writing
				2295	*/
				2296	static struct sp_node *
				2297	sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
				2298	{
				2299	struct rb_node *n = sp->root.rb_node;
				2300
				2301	while (n) {
				2302	struct sp_node *p = rb_entry(n, struct sp_node, nd);
				2303
				2304	if (start >= p->end)
				2305	n = n->rb_right;
				2306	else if (end <= p->start)
				2307	n = n->rb_left;
				2308	else
				2309	break;
				2310	}
				2311	if (!n)
				2312	return NULL;
				2313	for (;;) {
				2314	struct sp_node *w = NULL;
				2315	struct rb_node *prev = rb_prev(n);
				2316	if (!prev)
				2317	break;
				2318	w = rb_entry(prev, struct sp_node, nd);
				2319	if (w->end <= start)
				2320	break;
				2321	n = prev;
				2322	}
				2323	return rb_entry(n, struct sp_node, nd);
				2324	}
				2325
				2326	/*
				2327	* Insert a new shared policy into the list. Caller holds sp->lock for
				2328	* writing.
				2329	*/
				2330	static void sp_insert(struct shared_policy sp, struct sp_node new)
				2331	{
				2332	struct rb_node **p = &sp->root.rb_node;
				2333	struct rb_node *parent = NULL;
				2334	struct sp_node *nd;
				2335
				2336	while (*p) {
				2337	parent = *p;
				2338	nd = rb_entry(parent, struct sp_node, nd);
				2339	if (new->start < nd->start)
				2340	p = &(*p)->rb_left;
				2341	else if (new->end > nd->end)
				2342	p = &(*p)->rb_right;
				2343	else
				2344	BUG();
				2345	}
				2346	rb_link_node(&new->nd, parent, p);
				2347	rb_insert_color(&new->nd, &sp->root);
				2348	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
				2349	new->policy ? new->policy->mode : 0);
				2350	}
				2351
				2352	/* Find shared policy intersecting idx */
				2353	struct mempolicy *
				2354	mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
				2355	{
				2356	struct mempolicy *pol = NULL;
				2357	struct sp_node *sn;
				2358
				2359	if (!sp->root.rb_node)
				2360	return NULL;
				2361	read_lock(&sp->lock);
				2362	sn = sp_lookup(sp, idx, idx+1);
				2363	if (sn) {
				2364	mpol_get(sn->policy);
				2365	pol = sn->policy;
				2366	}
				2367	read_unlock(&sp->lock);
				2368	return pol;
				2369	}
				2370
				2371	static void sp_free(struct sp_node *n)
				2372	{
				2373	mpol_put(n->policy);
				2374	kmem_cache_free(sn_cache, n);
				2375	}
				2376
				2377	/**
				2378	* mpol_misplaced - check whether current page node is valid in policy
				2379	*
				2380	* @page: page to be checked
				2381	* @vma: vm area where page mapped
				2382	* @addr: virtual address where page mapped
				2383	*
				2384	* Lookup current policy node id for vma,addr and "compare to" page's
				2385	* node id.
				2386	*
				2387	* Returns:
				2388	* -1 - not misplaced, page is in the right node
				2389	* node - node id where the page should be
				2390	*
				2391	* Policy determination "mimics" alloc_page_vma().
				2392	* Called from fault path where we know the vma and faulting address.
				2393	*/
				2394	int mpol_misplaced(struct page page, struct vm_area_struct vma, unsigned long addr)
				2395	{
				2396	struct mempolicy *pol;
				2397	struct zoneref *z;
				2398	int curnid = page_to_nid(page);
				2399	unsigned long pgoff;
				2400	int thiscpu = raw_smp_processor_id();
				2401	int thisnid = cpu_to_node(thiscpu);
				2402	int polnid = -1;
				2403	int ret = -1;
				2404
				2405	pol = get_vma_policy(vma, addr);
				2406	if (!(pol->flags & MPOL_F_MOF))
				2407	goto out;
				2408
				2409	switch (pol->mode) {
				2410	case MPOL_INTERLEAVE:
				2411	pgoff = vma->vm_pgoff;
				2412	pgoff += (addr - vma->vm_start) >> PAGE_SHIFT;
				2413	polnid = offset_il_node(pol, pgoff);
				2414	break;
				2415
				2416	case MPOL_PREFERRED:
				2417	if (pol->flags & MPOL_F_LOCAL)
				2418	polnid = numa_node_id();
				2419	else
				2420	polnid = pol->v.preferred_node;
				2421	break;
				2422
				2423	case MPOL_BIND:
				2424
				2425	/*
				2426	* allows binding to multiple nodes.
				2427	* use current page if in policy nodemask,
				2428	* else select nearest allowed node, if any.
				2429	* If no allowed nodes, use current [!misplaced].
				2430	*/
				2431	if (node_isset(curnid, pol->v.nodes))
				2432	goto out;
				2433	z = first_zones_zonelist(
				2434	node_zonelist(numa_node_id(), GFP_HIGHUSER),
				2435	gfp_zone(GFP_HIGHUSER),
				2436	&pol->v.nodes);
				2437	polnid = zone_to_nid(z->zone);
				2438	break;
				2439
				2440	default:
				2441	BUG();
				2442	}
				2443
				2444	/* Migrate the page towards the node whose CPU is referencing it */
				2445	if (pol->flags & MPOL_F_MORON) {
				2446	polnid = thisnid;
				2447
				2448	if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
				2449	goto out;
				2450	}
				2451
				2452	if (curnid != polnid)
				2453	ret = polnid;
				2454	out:
				2455	mpol_cond_put(pol);
				2456
				2457	return ret;
				2458	}
				2459
				2460	/*
				2461	* Drop the (possibly final) reference to task->mempolicy. It needs to be
				2462	* dropped after task->mempolicy is set to NULL so that any allocation done as
				2463	* part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed
				2464	* policy.
				2465	*/
				2466	void mpol_put_task_policy(struct task_struct *task)
				2467	{
				2468	struct mempolicy *pol;
				2469
				2470	task_lock(task);
				2471	pol = task->mempolicy;
				2472	task->mempolicy = NULL;
				2473	task_unlock(task);
				2474	mpol_put(pol);
				2475	}
				2476
				2477	static void sp_delete(struct shared_policy sp, struct sp_node n)
				2478	{
				2479	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
				2480	rb_erase(&n->nd, &sp->root);
				2481	sp_free(n);
				2482	}
				2483
				2484	static void sp_node_init(struct sp_node *node, unsigned long start,
				2485	unsigned long end, struct mempolicy *pol)
				2486	{
				2487	node->start = start;
				2488	node->end = end;
				2489	node->policy = pol;
				2490	}
				2491
				2492	static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
				2493	struct mempolicy *pol)
				2494	{
				2495	struct sp_node *n;
				2496	struct mempolicy *newpol;
				2497
				2498	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
				2499	if (!n)
				2500	return NULL;
				2501
				2502	newpol = mpol_dup(pol);
				2503	if (IS_ERR(newpol)) {
				2504	kmem_cache_free(sn_cache, n);
				2505	return NULL;
				2506	}
				2507	newpol->flags \|= MPOL_F_SHARED;
				2508	sp_node_init(n, start, end, newpol);
				2509
				2510	return n;
				2511	}
				2512
				2513	/* Replace a policy range. */
				2514	static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				2515	unsigned long end, struct sp_node *new)
				2516	{
				2517	struct sp_node *n;
				2518	struct sp_node *n_new = NULL;
				2519	struct mempolicy *mpol_new = NULL;
				2520	int ret = 0;
				2521
				2522	restart:
				2523	write_lock(&sp->lock);
				2524	n = sp_lookup(sp, start, end);
				2525	/* Take care of old policies in the same range. */
				2526	while (n && n->start < end) {
				2527	struct rb_node *next = rb_next(&n->nd);
				2528	if (n->start >= start) {
				2529	if (n->end <= end)
				2530	sp_delete(sp, n);
				2531	else
				2532	n->start = end;
				2533	} else {
				2534	/* Old policy spanning whole new range. */
				2535	if (n->end > end) {
				2536	if (!n_new)
				2537	goto alloc_new;
				2538
				2539	mpol_new = n->policy;
				2540	atomic_set(&mpol_new->refcnt, 1);
				2541	sp_node_init(n_new, end, n->end, mpol_new);
				2542	n->end = start;
				2543	sp_insert(sp, n_new);
				2544	n_new = NULL;
				2545	mpol_new = NULL;
				2546	break;
				2547	} else
				2548	n->end = start;
				2549	}
				2550	if (!next)
				2551	break;
				2552	n = rb_entry(next, struct sp_node, nd);
				2553	}
				2554	if (new)
				2555	sp_insert(sp, new);
				2556	write_unlock(&sp->lock);
				2557	ret = 0;
				2558
				2559	err_out:
				2560	if (mpol_new)
				2561	mpol_put(mpol_new);
				2562	if (n_new)
				2563	kmem_cache_free(sn_cache, n_new);
				2564
				2565	return ret;
				2566
				2567	alloc_new:
				2568	write_unlock(&sp->lock);
				2569	ret = -ENOMEM;
				2570	n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
				2571	if (!n_new)
				2572	goto err_out;
				2573	mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
				2574	if (!mpol_new)
				2575	goto err_out;
				2576	goto restart;
				2577	}
				2578
				2579	/**
				2580	* mpol_shared_policy_init - initialize shared policy for inode
				2581	* @sp: pointer to inode shared policy
				2582	* @mpol: struct mempolicy to install
				2583	*
				2584	* Install non-NULL @mpol in inode's shared policy rb-tree.
				2585	* On entry, the current task has a reference on a non-NULL @mpol.
				2586	* This must be released on exit.
				2587	* This is called at get_inode() calls and we can use GFP_KERNEL.
				2588	*/
				2589	void mpol_shared_policy_init(struct shared_policy sp, struct mempolicy mpol)
				2590	{
				2591	int ret;
				2592
				2593	sp->root = RB_ROOT; /* empty tree == default mempolicy */
				2594	rwlock_init(&sp->lock);
				2595
				2596	if (mpol) {
				2597	struct vm_area_struct pvma;
				2598	struct mempolicy *new;
				2599	NODEMASK_SCRATCH(scratch);
				2600
				2601	if (!scratch)
				2602	goto put_mpol;
				2603	/* contextualize the tmpfs mount point mempolicy */
				2604	new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
				2605	if (IS_ERR(new))
				2606	goto free_scratch; /* no valid nodemask intersection */
				2607
				2608	task_lock(current);
				2609	ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
				2610	task_unlock(current);
				2611	if (ret)
				2612	goto put_new;
				2613
				2614	/* Create pseudo-vma that contains just the policy */
				2615	vma_init(&pvma, NULL);
				2616	pvma.vm_end = TASK_SIZE; /* policy covers entire file */
				2617	mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
				2618
				2619	put_new:
				2620	mpol_put(new); /* drop initial ref */
				2621	free_scratch:
				2622	NODEMASK_SCRATCH_FREE(scratch);
				2623	put_mpol:
				2624	mpol_put(mpol); /* drop our incoming ref on sb mpol */
				2625	}
				2626	}
				2627
				2628	int mpol_set_shared_policy(struct shared_policy *info,
				2629	struct vm_area_struct vma, struct mempolicy npol)
				2630	{
				2631	int err;
				2632	struct sp_node *new = NULL;
				2633	unsigned long sz = vma_pages(vma);
				2634
				2635	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
				2636	vma->vm_pgoff,
				2637	sz, npol ? npol->mode : -1,
				2638	npol ? npol->flags : -1,
				2639	npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
				2640
				2641	if (npol) {
				2642	new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
				2643	if (!new)
				2644	return -ENOMEM;
				2645	}
				2646	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
				2647	if (err && new)
				2648	sp_free(new);
				2649	return err;
				2650	}
				2651
				2652	/* Free a backing policy store on inode delete. */
				2653	void mpol_free_shared_policy(struct shared_policy *p)
				2654	{
				2655	struct sp_node *n;
				2656	struct rb_node *next;
				2657
				2658	if (!p->root.rb_node)
				2659	return;
				2660	write_lock(&p->lock);
				2661	next = rb_first(&p->root);
				2662	while (next) {
				2663	n = rb_entry(next, struct sp_node, nd);
				2664	next = rb_next(&n->nd);
				2665	sp_delete(p, n);
				2666	}
				2667	write_unlock(&p->lock);
				2668	}
				2669
				2670	#ifdef CONFIG_NUMA_BALANCING
				2671	static int __initdata numabalancing_override;
				2672
				2673	static void __init check_numabalancing_enable(void)
				2674	{
				2675	bool numabalancing_default = false;
				2676
				2677	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
				2678	numabalancing_default = true;
				2679
				2680	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
				2681	if (numabalancing_override)
				2682	set_numabalancing_state(numabalancing_override == 1);
				2683
				2684	if (num_online_nodes() > 1 && !numabalancing_override) {
				2685	pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
				2686	numabalancing_default ? "Enabling" : "Disabling");
				2687	set_numabalancing_state(numabalancing_default);
				2688	}
				2689	}
				2690
				2691	static int __init setup_numabalancing(char *str)
				2692	{
				2693	int ret = 0;
				2694	if (!str)
				2695	goto out;
				2696
				2697	if (!strcmp(str, "enable")) {
				2698	numabalancing_override = 1;
				2699	ret = 1;
				2700	} else if (!strcmp(str, "disable")) {
				2701	numabalancing_override = -1;
				2702	ret = 1;
				2703	}
				2704	out:
				2705	if (!ret)
				2706	pr_warn("Unable to parse numa_balancing=\n");
				2707
				2708	return ret;
				2709	}
				2710	__setup("numa_balancing=", setup_numabalancing);
				2711	#else
				2712	static inline void __init check_numabalancing_enable(void)
				2713	{
				2714	}
				2715	#endif /* CONFIG_NUMA_BALANCING */
				2716
				2717	/* assumes fs == KERNEL_DS */
				2718	void __init numa_policy_init(void)
				2719	{
				2720	nodemask_t interleave_nodes;
				2721	unsigned long largest = 0;
				2722	int nid, prefer = 0;
				2723
				2724	policy_cache = kmem_cache_create("numa_policy",
				2725	sizeof(struct mempolicy),
				2726	0, SLAB_PANIC, NULL);
				2727
				2728	sn_cache = kmem_cache_create("shared_policy_node",
				2729	sizeof(struct sp_node),
				2730	0, SLAB_PANIC, NULL);
				2731
				2732	for_each_node(nid) {
				2733	preferred_node_policy[nid] = (struct mempolicy) {
				2734	.refcnt = ATOMIC_INIT(1),
				2735	.mode = MPOL_PREFERRED,
				2736	.flags = MPOL_F_MOF \| MPOL_F_MORON,
				2737	.v = { .preferred_node = nid, },
				2738	};
				2739	}
				2740
				2741	/*
				2742	* Set interleaving policy for system init. Interleaving is only
				2743	* enabled across suitably sized nodes (default is >= 16MB), or
				2744	* fall back to the largest node if they're all smaller.
				2745	*/
				2746	nodes_clear(interleave_nodes);
				2747	for_each_node_state(nid, N_MEMORY) {
				2748	unsigned long total_pages = node_present_pages(nid);
				2749
				2750	/* Preserve the largest node */
				2751	if (largest < total_pages) {
				2752	largest = total_pages;
				2753	prefer = nid;
				2754	}
				2755
				2756	/* Interleave this node? */
				2757	if ((total_pages << PAGE_SHIFT) >= (16 << 20))
				2758	node_set(nid, interleave_nodes);
				2759	}
				2760
				2761	/* All too small, use the largest */
				2762	if (unlikely(nodes_empty(interleave_nodes)))
				2763	node_set(prefer, interleave_nodes);
				2764
				2765	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
				2766	pr_err("%s: interleaving failed\n", __func__);
				2767
				2768	check_numabalancing_enable();
				2769	}
				2770
				2771	/* Reset policy of current process to default */
				2772	void numa_default_policy(void)
				2773	{
				2774	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
				2775	}
				2776
				2777	/*
				2778	* Parse and format mempolicy from/to strings
				2779	*/
				2780
				2781	/*
				2782	* "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
				2783	*/
				2784	static const char * const policy_modes[] =
				2785	{
				2786	[MPOL_DEFAULT] = "default",
				2787	[MPOL_PREFERRED] = "prefer",
				2788	[MPOL_BIND] = "bind",
				2789	[MPOL_INTERLEAVE] = "interleave",
				2790	[MPOL_LOCAL] = "local",
				2791	};
				2792
				2793
				2794	#ifdef CONFIG_TMPFS
				2795	/**
				2796	* mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
				2797	* @str: string containing mempolicy to parse
				2798	* @mpol: pointer to struct mempolicy pointer, returned on success.
				2799	*
				2800	* Format of input:
				2801	* <mode>[=<flags>][:<nodelist>]
				2802	*
				2803	* On success, returns 0, else 1
				2804	*/
				2805	int mpol_parse_str(char str, struct mempolicy *mpol)
				2806	{
				2807	struct mempolicy *new = NULL;
				2808	unsigned short mode;
				2809	unsigned short mode_flags;
				2810	nodemask_t nodes;
				2811	char *nodelist = strchr(str, ':');
				2812	char *flags = strchr(str, '=');
				2813	int err = 1;
				2814
				2815	if (nodelist) {
				2816	/* NUL-terminate mode or flags string */
				2817	*nodelist++ = '\0';
				2818	if (nodelist_parse(nodelist, nodes))
				2819	goto out;
				2820	if (!nodes_subset(nodes, node_states[N_MEMORY]))
				2821	goto out;
				2822	} else
				2823	nodes_clear(nodes);
				2824
				2825	if (flags)
				2826	flags++ = '\0'; / terminate mode string */
				2827
				2828	for (mode = 0; mode < MPOL_MAX; mode++) {
				2829	if (!strcmp(str, policy_modes[mode])) {
				2830	break;
				2831	}
				2832	}
				2833	if (mode >= MPOL_MAX)
				2834	goto out;
				2835
				2836	switch (mode) {
				2837	case MPOL_PREFERRED:
				2838	/*
				2839	* Insist on a nodelist of one node only
				2840	*/
				2841	if (nodelist) {
				2842	char *rest = nodelist;
				2843	while (isdigit(*rest))
				2844	rest++;
				2845	if (*rest)
				2846	goto out;
				2847	}
				2848	break;
				2849	case MPOL_INTERLEAVE:
				2850	/*
				2851	* Default to online nodes with memory if no nodelist
				2852	*/
				2853	if (!nodelist)
				2854	nodes = node_states[N_MEMORY];
				2855	break;
				2856	case MPOL_LOCAL:
				2857	/*
				2858	* Don't allow a nodelist; mpol_new() checks flags
				2859	*/
				2860	if (nodelist)
				2861	goto out;
				2862	mode = MPOL_PREFERRED;
				2863	break;
				2864	case MPOL_DEFAULT:
				2865	/*
				2866	* Insist on a empty nodelist
				2867	*/
				2868	if (!nodelist)
				2869	err = 0;
				2870	goto out;
				2871	case MPOL_BIND:
				2872	/*
				2873	* Insist on a nodelist
				2874	*/
				2875	if (!nodelist)
				2876	goto out;
				2877	}
				2878
				2879	mode_flags = 0;
				2880	if (flags) {
				2881	/*
				2882	* Currently, we only support two mutually exclusive
				2883	* mode flags.
				2884	*/
				2885	if (!strcmp(flags, "static"))
				2886	mode_flags \|= MPOL_F_STATIC_NODES;
				2887	else if (!strcmp(flags, "relative"))
				2888	mode_flags \|= MPOL_F_RELATIVE_NODES;
				2889	else
				2890	goto out;
				2891	}
				2892
				2893	new = mpol_new(mode, mode_flags, &nodes);
				2894	if (IS_ERR(new))
				2895	goto out;
				2896
				2897	/*
				2898	* Save nodes for mpol_to_str() to show the tmpfs mount options
				2899	* for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
				2900	*/
				2901	if (mode != MPOL_PREFERRED)
				2902	new->v.nodes = nodes;
				2903	else if (nodelist)
				2904	new->v.preferred_node = first_node(nodes);
				2905	else
				2906	new->flags \|= MPOL_F_LOCAL;
				2907
				2908	/*
				2909	* Save nodes for contextualization: this will be used to "clone"
				2910	* the mempolicy in a specific context [cpuset] at a later time.
				2911	*/
				2912	new->w.user_nodemask = nodes;
				2913
				2914	err = 0;
				2915
				2916	out:
				2917	/* Restore string for error message */
				2918	if (nodelist)
				2919	*--nodelist = ':';
				2920	if (flags)
				2921	*--flags = '=';
				2922	if (!err)
				2923	*mpol = new;
				2924	return err;
				2925	}
				2926	#endif /* CONFIG_TMPFS */
				2927
				2928	/**
				2929	* mpol_to_str - format a mempolicy structure for printing
				2930	* @buffer: to contain formatted mempolicy string
				2931	* @maxlen: length of @buffer
				2932	* @pol: pointer to mempolicy to be formatted
				2933	*
				2934	* Convert @pol into a string. If @buffer is too short, truncate the string.
				2935	* Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
				2936	* longest flag, "relative", and to display at least a few node ids.
				2937	*/
				2938	void mpol_to_str(char buffer, int maxlen, struct mempolicy pol)
				2939	{
				2940	char *p = buffer;
				2941	nodemask_t nodes = NODE_MASK_NONE;
				2942	unsigned short mode = MPOL_DEFAULT;
				2943	unsigned short flags = 0;
				2944
				2945	if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) {
				2946	mode = pol->mode;
				2947	flags = pol->flags;
				2948	}
				2949
				2950	switch (mode) {
				2951	case MPOL_DEFAULT:
				2952	break;
				2953	case MPOL_PREFERRED:
				2954	if (flags & MPOL_F_LOCAL)
				2955	mode = MPOL_LOCAL;
				2956	else
				2957	node_set(pol->v.preferred_node, nodes);
				2958	break;
				2959	case MPOL_BIND:
				2960	case MPOL_INTERLEAVE:
				2961	nodes = pol->v.nodes;
				2962	break;
				2963	default:
				2964	WARN_ON_ONCE(1);
				2965	snprintf(p, maxlen, "unknown");
				2966	return;
				2967	}
				2968
				2969	p += snprintf(p, maxlen, "%s", policy_modes[mode]);
				2970
				2971	if (flags & MPOL_MODE_FLAGS) {
				2972	p += snprintf(p, buffer + maxlen - p, "=");
				2973
				2974	/*
				2975	* Currently, the only defined flags are mutually exclusive
				2976	*/
				2977	if (flags & MPOL_F_STATIC_NODES)
				2978	p += snprintf(p, buffer + maxlen - p, "static");
				2979	else if (flags & MPOL_F_RELATIVE_NODES)
				2980	p += snprintf(p, buffer + maxlen - p, "relative");
				2981	}
				2982
				2983	if (!nodes_empty(nodes))
				2984	p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
				2985	nodemask_pr_args(&nodes));
				2986	}