Blame - ap/os/linux/linux-3.4.x/mm/mempolicy.c - R306

blob: 87a43cce8d59dfdc01877e66c3780bf978efc6a0 [file] [log] [blame]

yuezonghe	824eb0c	2024-06-27 02:32:26 -0700	[diff] [blame]	1	/*
				2	* Simple NUMA memory policy for the Linux kernel.
				3	*
				4	* Copyright 2003,2004 Andi Kleen, SuSE Labs.
				5	* (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
				6	* Subject to the GNU Public License, version 2.
				7	*
				8	* NUMA policy allows the user to give hints in which node(s) memory should
				9	* be allocated.
				10	*
				11	* Support four policies per VMA and per process:
				12	*
				13	* The VMA policy has priority over the process policy for a page fault.
				14	*
				15	* interleave Allocate memory interleaved over a set of nodes,
				16	* with normal fallback if it fails.
				17	* For VMA based allocations this interleaves based on the
				18	* offset into the backing object or offset into the mapping
				19	* for anonymous memory. For process policy a process counter
				20	* is used.
				21	*
				22	* bind Only allocate memory on a specific set of nodes,
				23	* no fallback.
				24	* FIXME: memory is allocated starting with the first node
				25	* to the last. It would be better if bind would truly restrict
				26	* the allocation to memory nodes instead
				27	*
				28	* preferred Try a specific node first before normal fallback.
				29	* As a special case node -1 here means do the allocation
				30	* on the local CPU. This is normally identical to default,
				31	* but useful to set in a VMA when you have a non default
				32	* process policy.
				33	*
				34	* default Allocate on the local node first, or when on a VMA
				35	* use the process policy. This is what Linux always did
				36	* in a NUMA aware kernel and still does by, ahem, default.
				37	*
				38	* The process policy is applied for most non interrupt memory allocations
				39	* in that process' context. Interrupts ignore the policies and always
				40	* try to allocate on the local CPU. The VMA policy is only applied for memory
				41	* allocations for a VMA in the VM.
				42	*
				43	* Currently there are a few corner cases in swapping where the policy
				44	* is not applied, but the majority should be handled. When process policy
				45	* is used it is not remembered over swap outs/swap ins.
				46	*
				47	* Only the highest zone in the zone hierarchy gets policied. Allocations
				48	* requesting a lower zone just use default policy. This implies that
				49	* on systems with highmem kernel lowmem allocations don't get policied.
				50	* Same with GFP_DMA allocations.
				51	*
				52	* For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
				53	* all users and remembered even when nobody has memory mapped.
				54	*/
				55
				56	/* Notebook:
				57	fix mmap readahead to honour policy and enable policy for any page cache
				58	object
				59	statistics for bigpages
				60	global policy for page cache? currently it uses process policy. Requires
				61	first item above.
				62	handle mremap for shared memory (currently ignored for the policy)
				63	grows down?
				64	make bind policy root only? It can trigger oom much faster and the
				65	kernel is not always grateful with that.
				66	*/
				67
				68	#include <linux/mempolicy.h>
				69	#include <linux/mm.h>
				70	#include <linux/highmem.h>
				71	#include <linux/hugetlb.h>
				72	#include <linux/kernel.h>
				73	#include <linux/sched.h>
				74	#include <linux/nodemask.h>
				75	#include <linux/cpuset.h>
				76	#include <linux/slab.h>
				77	#include <linux/string.h>
				78	#include <linux/export.h>
				79	#include <linux/nsproxy.h>
				80	#include <linux/interrupt.h>
				81	#include <linux/init.h>
				82	#include <linux/compat.h>
				83	#include <linux/swap.h>
				84	#include <linux/seq_file.h>
				85	#include <linux/proc_fs.h>
				86	#include <linux/migrate.h>
				87	#include <linux/ksm.h>
				88	#include <linux/rmap.h>
				89	#include <linux/security.h>
				90	#include <linux/syscalls.h>
				91	#include <linux/ctype.h>
				92	#include <linux/mm_inline.h>
				93
				94	#include <asm/tlbflush.h>
				95	#include <asm/uaccess.h>
				96	#include <linux/random.h>
				97
				98	#include "internal.h"
				99
				100	/* Internal flags */
				101	#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
				102	#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
				103
				104	static struct kmem_cache *policy_cache;
				105	static struct kmem_cache *sn_cache;
				106
				107	/* Highest zone. An specific allocation for a zone below that is not
				108	policied. */
				109	enum zone_type policy_zone = 0;
				110
				111	/*
				112	* run-time system-wide default policy => local allocation
				113	*/
				114	static struct mempolicy default_policy = {
				115	.refcnt = ATOMIC_INIT(1), /* never free it */
				116	.mode = MPOL_PREFERRED,
				117	.flags = MPOL_F_LOCAL,
				118	};
				119
				120	static const struct mempolicy_operations {
				121	int (create)(struct mempolicy pol, const nodemask_t *nodes);
				122	/*
				123	* If read-side task has no lock to protect task->mempolicy, write-side
				124	* task will rebind the task->mempolicy by two step. The first step is
				125	* setting all the newly nodes, and the second step is cleaning all the
				126	* disallowed nodes. In this way, we can avoid finding no node to alloc
				127	* page.
				128	* If we have a lock to protect task->mempolicy in read-side, we do
				129	* rebind directly.
				130	*
				131	* step:
				132	* MPOL_REBIND_ONCE - do rebind work at once
				133	* MPOL_REBIND_STEP1 - set all the newly nodes
				134	* MPOL_REBIND_STEP2 - clean all the disallowed nodes
				135	*/
				136	void (rebind)(struct mempolicy pol, const nodemask_t *nodes,
				137	enum mpol_rebind_step step);
				138	} mpol_ops[MPOL_MAX];
				139
				140	/* Check that the nodemask contains at least one populated zone */
				141	static int is_valid_nodemask(const nodemask_t *nodemask)
				142	{
				143	int nd, k;
				144
				145	for_each_node_mask(nd, *nodemask) {
				146	struct zone *z;
				147
				148	for (k = 0; k <= policy_zone; k++) {
				149	z = &NODE_DATA(nd)->node_zones[k];
				150	if (z->present_pages > 0)
				151	return 1;
				152	}
				153	}
				154
				155	return 0;
				156	}
				157
				158	static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
				159	{
				160	return pol->flags & MPOL_MODE_FLAGS;
				161	}
				162
				163	static void mpol_relative_nodemask(nodemask_t ret, const nodemask_t orig,
				164	const nodemask_t *rel)
				165	{
				166	nodemask_t tmp;
				167	nodes_fold(tmp, orig, nodes_weight(rel));
				168	nodes_onto(ret, tmp, rel);
				169	}
				170
				171	static int mpol_new_interleave(struct mempolicy pol, const nodemask_t nodes)
				172	{
				173	if (nodes_empty(*nodes))
				174	return -EINVAL;
				175	pol->v.nodes = *nodes;
				176	return 0;
				177	}
				178
				179	static int mpol_new_preferred(struct mempolicy pol, const nodemask_t nodes)
				180	{
				181	if (!nodes)
				182	pol->flags \|= MPOL_F_LOCAL; /* local allocation */
				183	else if (nodes_empty(*nodes))
				184	return -EINVAL; /* no allowed nodes */
				185	else
				186	pol->v.preferred_node = first_node(*nodes);
				187	return 0;
				188	}
				189
				190	static int mpol_new_bind(struct mempolicy pol, const nodemask_t nodes)
				191	{
				192	if (!is_valid_nodemask(nodes))
				193	return -EINVAL;
				194	pol->v.nodes = *nodes;
				195	return 0;
				196	}
				197
				198	/*
				199	* mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
				200	* any, for the new policy. mpol_new() has already validated the nodes
				201	* parameter with respect to the policy mode and flags. But, we need to
				202	* handle an empty nodemask with MPOL_PREFERRED here.
				203	*
				204	* Must be called holding task's alloc_lock to protect task's mems_allowed
				205	* and mempolicy. May also be called holding the mmap_semaphore for write.
				206	*/
				207	static int mpol_set_nodemask(struct mempolicy *pol,
				208	const nodemask_t nodes, struct nodemask_scratch nsc)
				209	{
				210	int ret;
				211
				212	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
				213	if (pol == NULL)
				214	return 0;
				215	/* Check N_HIGH_MEMORY */
				216	nodes_and(nsc->mask1,
				217	cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
				218
				219	VM_BUG_ON(!nodes);
				220	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
				221	nodes = NULL; /* explicit local allocation */
				222	else {
				223	if (pol->flags & MPOL_F_RELATIVE_NODES)
				224	mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
				225	else
				226	nodes_and(nsc->mask2, *nodes, nsc->mask1);
				227
				228	if (mpol_store_user_nodemask(pol))
				229	pol->w.user_nodemask = *nodes;
				230	else
				231	pol->w.cpuset_mems_allowed =
				232	cpuset_current_mems_allowed;
				233	}
				234
				235	if (nodes)
				236	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
				237	else
				238	ret = mpol_ops[pol->mode].create(pol, NULL);
				239	return ret;
				240	}
				241
				242	/*
				243	* This function just creates a new policy, does some check and simple
				244	* initialization. You must invoke mpol_set_nodemask() to set nodes.
				245	*/
				246	static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				247	nodemask_t *nodes)
				248	{
				249	struct mempolicy *policy;
				250
				251	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
				252	mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
				253
				254	if (mode == MPOL_DEFAULT) {
				255	if (nodes && !nodes_empty(*nodes))
				256	return ERR_PTR(-EINVAL);
				257	return NULL; /* simply delete any existing policy */
				258	}
				259	VM_BUG_ON(!nodes);
				260
				261	/*
				262	* MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
				263	* MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
				264	* All other modes require a valid pointer to a non-empty nodemask.
				265	*/
				266	if (mode == MPOL_PREFERRED) {
				267	if (nodes_empty(*nodes)) {
				268	if (((flags & MPOL_F_STATIC_NODES) \|\|
				269	(flags & MPOL_F_RELATIVE_NODES)))
				270	return ERR_PTR(-EINVAL);
				271	}
				272	} else if (nodes_empty(*nodes))
				273	return ERR_PTR(-EINVAL);
				274	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
				275	if (!policy)
				276	return ERR_PTR(-ENOMEM);
				277	atomic_set(&policy->refcnt, 1);
				278	policy->mode = mode;
				279	policy->flags = flags;
				280
				281	return policy;
				282	}
				283
				284	/* Slow path of a mpol destructor. */
				285	void __mpol_put(struct mempolicy *p)
				286	{
				287	if (!atomic_dec_and_test(&p->refcnt))
				288	return;
				289	kmem_cache_free(policy_cache, p);
				290	}
				291
				292	static void mpol_rebind_default(struct mempolicy pol, const nodemask_t nodes,
				293	enum mpol_rebind_step step)
				294	{
				295	}
				296
				297	/*
				298	* step:
				299	* MPOL_REBIND_ONCE - do rebind work at once
				300	* MPOL_REBIND_STEP1 - set all the newly nodes
				301	* MPOL_REBIND_STEP2 - clean all the disallowed nodes
				302	*/
				303	static void mpol_rebind_nodemask(struct mempolicy pol, const nodemask_t nodes,
				304	enum mpol_rebind_step step)
				305	{
				306	nodemask_t tmp;
				307
				308	if (pol->flags & MPOL_F_STATIC_NODES)
				309	nodes_and(tmp, pol->w.user_nodemask, *nodes);
				310	else if (pol->flags & MPOL_F_RELATIVE_NODES)
				311	mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
				312	else {
				313	/*
				314	* if step == 1, we use ->w.cpuset_mems_allowed to cache the
				315	* result
				316	*/
				317	if (step == MPOL_REBIND_ONCE \|\| step == MPOL_REBIND_STEP1) {
				318	nodes_remap(tmp, pol->v.nodes,
				319	pol->w.cpuset_mems_allowed, *nodes);
				320	pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
				321	} else if (step == MPOL_REBIND_STEP2) {
				322	tmp = pol->w.cpuset_mems_allowed;
				323	pol->w.cpuset_mems_allowed = *nodes;
				324	} else
				325	BUG();
				326	}
				327
				328	if (nodes_empty(tmp))
				329	tmp = *nodes;
				330
				331	if (step == MPOL_REBIND_STEP1)
				332	nodes_or(pol->v.nodes, pol->v.nodes, tmp);
				333	else if (step == MPOL_REBIND_ONCE \|\| step == MPOL_REBIND_STEP2)
				334	pol->v.nodes = tmp;
				335	else
				336	BUG();
				337
				338	if (!node_isset(current->il_next, tmp)) {
				339	current->il_next = next_node(current->il_next, tmp);
				340	if (current->il_next >= MAX_NUMNODES)
				341	current->il_next = first_node(tmp);
				342	if (current->il_next >= MAX_NUMNODES)
				343	current->il_next = numa_node_id();
				344	}
				345	}
				346
				347	static void mpol_rebind_preferred(struct mempolicy *pol,
				348	const nodemask_t *nodes,
				349	enum mpol_rebind_step step)
				350	{
				351	nodemask_t tmp;
				352
				353	if (pol->flags & MPOL_F_STATIC_NODES) {
				354	int node = first_node(pol->w.user_nodemask);
				355
				356	if (node_isset(node, *nodes)) {
				357	pol->v.preferred_node = node;
				358	pol->flags &= ~MPOL_F_LOCAL;
				359	} else
				360	pol->flags \|= MPOL_F_LOCAL;
				361	} else if (pol->flags & MPOL_F_RELATIVE_NODES) {
				362	mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
				363	pol->v.preferred_node = first_node(tmp);
				364	} else if (!(pol->flags & MPOL_F_LOCAL)) {
				365	pol->v.preferred_node = node_remap(pol->v.preferred_node,
				366	pol->w.cpuset_mems_allowed,
				367	*nodes);
				368	pol->w.cpuset_mems_allowed = *nodes;
				369	}
				370	}
				371
				372	/*
				373	* mpol_rebind_policy - Migrate a policy to a different set of nodes
				374	*
				375	* If read-side task has no lock to protect task->mempolicy, write-side
				376	* task will rebind the task->mempolicy by two step. The first step is
				377	* setting all the newly nodes, and the second step is cleaning all the
				378	* disallowed nodes. In this way, we can avoid finding no node to alloc
				379	* page.
				380	* If we have a lock to protect task->mempolicy in read-side, we do
				381	* rebind directly.
				382	*
				383	* step:
				384	* MPOL_REBIND_ONCE - do rebind work at once
				385	* MPOL_REBIND_STEP1 - set all the newly nodes
				386	* MPOL_REBIND_STEP2 - clean all the disallowed nodes
				387	*/
				388	static void mpol_rebind_policy(struct mempolicy pol, const nodemask_t newmask,
				389	enum mpol_rebind_step step)
				390	{
				391	if (!pol)
				392	return;
				393	if (!mpol_store_user_nodemask(pol) && step == 0 &&
				394	nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
				395	return;
				396
				397	if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
				398	return;
				399
				400	if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
				401	BUG();
				402
				403	if (step == MPOL_REBIND_STEP1)
				404	pol->flags \|= MPOL_F_REBINDING;
				405	else if (step == MPOL_REBIND_STEP2)
				406	pol->flags &= ~MPOL_F_REBINDING;
				407	else if (step >= MPOL_REBIND_NSTEP)
				408	BUG();
				409
				410	mpol_ops[pol->mode].rebind(pol, newmask, step);
				411	}
				412
				413	/*
				414	* Wrapper for mpol_rebind_policy() that just requires task
				415	* pointer, and updates task mempolicy.
				416	*
				417	* Called with task's alloc_lock held.
				418	*/
				419
				420	void mpol_rebind_task(struct task_struct tsk, const nodemask_t new,
				421	enum mpol_rebind_step step)
				422	{
				423	mpol_rebind_policy(tsk->mempolicy, new, step);
				424	}
				425
				426	/*
				427	* Rebind each vma in mm to new nodemask.
				428	*
				429	* Call holding a reference to mm. Takes mm->mmap_sem during call.
				430	*/
				431
				432	void mpol_rebind_mm(struct mm_struct mm, nodemask_t new)
				433	{
				434	struct vm_area_struct *vma;
				435
				436	down_write(&mm->mmap_sem);
				437	for (vma = mm->mmap; vma; vma = vma->vm_next)
				438	mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
				439	up_write(&mm->mmap_sem);
				440	}
				441
				442	static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
				443	[MPOL_DEFAULT] = {
				444	.rebind = mpol_rebind_default,
				445	},
				446	[MPOL_INTERLEAVE] = {
				447	.create = mpol_new_interleave,
				448	.rebind = mpol_rebind_nodemask,
				449	},
				450	[MPOL_PREFERRED] = {
				451	.create = mpol_new_preferred,
				452	.rebind = mpol_rebind_preferred,
				453	},
				454	[MPOL_BIND] = {
				455	.create = mpol_new_bind,
				456	.rebind = mpol_rebind_nodemask,
				457	},
				458	};
				459
				/* Forward declaration; used by check_pte_range() to queue pages for migration. */
				static void migrate_page_add(struct page *page, struct list_head *pagelist,
								unsigned long flags);
				462
				463	/* Scan through pages checking if pages follow certain conditions. */
				464	static int check_pte_range(struct vm_area_struct vma, pmd_t pmd,
				465	unsigned long addr, unsigned long end,
				466	const nodemask_t *nodes, unsigned long flags,
				467	void *private)
				468	{
				469	pte_t *orig_pte;
				470	pte_t *pte;
				471	spinlock_t *ptl;
				472
				473	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
				474	do {
				475	struct page *page;
				476	int nid;
				477
				478	if (!pte_present(*pte))
				479	continue;
				480	page = vm_normal_page(vma, addr, *pte);
				481	if (!page)
				482	continue;
				483	/*
				484	* vm_normal_page() filters out zero pages, but there might
				485	* still be PageReserved pages to skip, perhaps in a VDSO.
				486	* And we cannot move PageKsm pages sensibly or safely yet.
				487	*/
				488	if (PageReserved(page) \|\| PageKsm(page))
				489	continue;
				490	nid = page_to_nid(page);
				491	if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
				492	continue;
				493
				494	if (flags & (MPOL_MF_MOVE \| MPOL_MF_MOVE_ALL))
				495	migrate_page_add(page, private, flags);
				496	else
				497	break;
				498	} while (pte++, addr += PAGE_SIZE, addr != end);
				499	pte_unmap_unlock(orig_pte, ptl);
				500	return addr != end;
				501	}
				502
				503	static inline int check_pmd_range(struct vm_area_struct vma, pud_t pud,
				504	unsigned long addr, unsigned long end,
				505	const nodemask_t *nodes, unsigned long flags,
				506	void *private)
				507	{
				508	pmd_t *pmd;
				509	unsigned long next;
				510
				511	pmd = pmd_offset(pud, addr);
				512	do {
				513	next = pmd_addr_end(addr, end);
				514	split_huge_page_pmd(vma->vm_mm, pmd);
				515	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
				516	continue;
				517	if (check_pte_range(vma, pmd, addr, next, nodes,
				518	flags, private))
				519	return -EIO;
				520	} while (pmd++, addr = next, addr != end);
				521	return 0;
				522	}
				523
				524	static inline int check_pud_range(struct vm_area_struct vma, pgd_t pgd,
				525	unsigned long addr, unsigned long end,
				526	const nodemask_t *nodes, unsigned long flags,
				527	void *private)
				528	{
				529	pud_t *pud;
				530	unsigned long next;
				531
				532	pud = pud_offset(pgd, addr);
				533	do {
				534	next = pud_addr_end(addr, end);
				535	if (pud_none_or_clear_bad(pud))
				536	continue;
				537	if (check_pmd_range(vma, pud, addr, next, nodes,
				538	flags, private))
				539	return -EIO;
				540	} while (pud++, addr = next, addr != end);
				541	return 0;
				542	}
				543
				544	static inline int check_pgd_range(struct vm_area_struct *vma,
				545	unsigned long addr, unsigned long end,
				546	const nodemask_t *nodes, unsigned long flags,
				547	void *private)
				548	{
				549	pgd_t *pgd;
				550	unsigned long next;
				551
				552	pgd = pgd_offset(vma->vm_mm, addr);
				553	do {
				554	next = pgd_addr_end(addr, end);
				555	if (pgd_none_or_clear_bad(pgd))
				556	continue;
				557	if (check_pud_range(vma, pgd, addr, next, nodes,
				558	flags, private))
				559	return -EIO;
				560	} while (pgd++, addr = next, addr != end);
				561	return 0;
				562	}
				563
				564	/*
				565	* Check if all pages in a range are on a set of nodes.
				566	* If pagelist != NULL then isolate pages from the LRU and
				567	* put them on the pagelist.
				568	*/
				569	static int
				570	check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
				571	const nodemask_t nodes, unsigned long flags, void private)
				572	{
				573	int err = 0;
				574	struct vm_area_struct vma, prev;
				575
				576
				577	vma = find_vma(mm, start);
				578	if (!vma)
				579	return -EFAULT;
				580	prev = NULL;
				581	for (; vma && vma->vm_start < end; vma = vma->vm_next) {
				582	if (!(flags & MPOL_MF_DISCONTIG_OK)) {
				583	if (!vma->vm_next && vma->vm_end < end)
				584	return -EFAULT;
				585	if (prev && prev->vm_end < vma->vm_start)
				586	return -EFAULT;
				587	}
				588	if (!is_vm_hugetlb_page(vma) &&
				589	((flags & MPOL_MF_STRICT) \|\|
				590	((flags & (MPOL_MF_MOVE \| MPOL_MF_MOVE_ALL)) &&
				591	vma_migratable(vma)))) {
				592	unsigned long endvma = vma->vm_end;
				593
				594	if (endvma > end)
				595	endvma = end;
				596	if (vma->vm_start > start)
				597	start = vma->vm_start;
				598	err = check_pgd_range(vma, start, endvma, nodes,
				599	flags, private);
				600	if (err)
				601	break;
				602	}
				603	prev = vma;
				604	}
				605	return err;
				606	}
				607
				608	/*
				609	* Apply policy to a single VMA
				610	* This must be called with the mmap_sem held for writing.
				611	*/
				612	static int vma_replace_policy(struct vm_area_struct *vma,
				613	struct mempolicy *pol)
				614	{
				615	int err;
				616	struct mempolicy *old;
				617	struct mempolicy *new;
				618
				619	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
				620	vma->vm_start, vma->vm_end, vma->vm_pgoff,
				621	vma->vm_ops, vma->vm_file,
				622	vma->vm_ops ? vma->vm_ops->set_policy : NULL);
				623
				624	new = mpol_dup(pol);
				625	if (IS_ERR(new))
				626	return PTR_ERR(new);
				627
				628	if (vma->vm_ops && vma->vm_ops->set_policy) {
				629	err = vma->vm_ops->set_policy(vma, new);
				630	if (err)
				631	goto err_out;
				632	}
				633
				634	old = vma->vm_policy;
				635	vma->vm_policy = new; /* protected by mmap_sem */
				636	mpol_put(old);
				637
				638	return 0;
				639	err_out:
				640	mpol_put(new);
				641	return err;
				642	}
				643
				644	/* Step 2: apply policy to a range and do splits. */
				645	static int mbind_range(struct mm_struct *mm, unsigned long start,
				646	unsigned long end, struct mempolicy *new_pol)
				647	{
				648	struct vm_area_struct *next;
				649	struct vm_area_struct *prev;
				650	struct vm_area_struct *vma;
				651	int err = 0;
				652	pgoff_t pgoff;
				653	unsigned long vmstart;
				654	unsigned long vmend;
				655
				656	vma = find_vma(mm, start);
				657	if (!vma \|\| vma->vm_start > start)
				658	return -EFAULT;
				659
				660	prev = vma->vm_prev;
				661	if (start > vma->vm_start)
				662	prev = vma;
				663
				664	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
				665	next = vma->vm_next;
				666	vmstart = max(start, vma->vm_start);
				667	vmend = min(end, vma->vm_end);
				668
				669	if (mpol_equal(vma_policy(vma), new_pol))
				670	continue;
				671
				672	pgoff = vma->vm_pgoff +
				673	((vmstart - vma->vm_start) >> PAGE_SHIFT);
				674	prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
				675	vma->anon_vma, vma->vm_file, pgoff,
				676	new_pol);
				677	if (prev) {
				678	vma = prev;
				679	next = vma->vm_next;
				680	continue;
				681	}
				682	if (vma->vm_start != vmstart) {
				683	err = split_vma(vma->vm_mm, vma, vmstart, 1);
				684	if (err)
				685	goto out;
				686	}
				687	if (vma->vm_end != vmend) {
				688	err = split_vma(vma->vm_mm, vma, vmend, 0);
				689	if (err)
				690	goto out;
				691	}
				692	err = vma_replace_policy(vma, new_pol);
				693	if (err)
				694	goto out;
				695	}
				696
				697	out:
				698	return err;
				699	}
				700
				701	/*
				702	* Update task->flags PF_MEMPOLICY bit: set iff non-default
				703	* mempolicy. Allows more rapid checking of this (combined perhaps
				704	* with other PF_* flag bits) on memory allocation hot code paths.
				705	*
				706	* If called from outside this file, the task 'p' should -only- be
				707	* a newly forked child not yet visible on the task list, because
				708	* manipulating the task flags of a visible task is not safe.
				709	*
				710	* The above limitation is why this routine has the funny name
				711	* mpol_fix_fork_child_flag().
				712	*
				713	* It is also safe to call this with a task pointer of current,
				714	* which the static wrapper mpol_set_task_struct_flag() does,
				715	* for use within this file.
				716	*/
				717
				718	void mpol_fix_fork_child_flag(struct task_struct *p)
				719	{
				720	if (p->mempolicy)
				721	p->flags \|= PF_MEMPOLICY;
				722	else
				723	p->flags &= ~PF_MEMPOLICY;
				724	}
				725
				726	static void mpol_set_task_struct_flag(void)
				727	{
				728	mpol_fix_fork_child_flag(current);
				729	}
				730
				731	/* Set the process memory policy */
				732	static long do_set_mempolicy(unsigned short mode, unsigned short flags,
				733	nodemask_t *nodes)
				734	{
				735	struct mempolicy new, old;
				736	struct mm_struct *mm = current->mm;
				737	NODEMASK_SCRATCH(scratch);
				738	int ret;
				739
				740	if (!scratch)
				741	return -ENOMEM;
				742
				743	new = mpol_new(mode, flags, nodes);
				744	if (IS_ERR(new)) {
				745	ret = PTR_ERR(new);
				746	goto out;
				747	}
				748	/*
				749	* prevent changing our mempolicy while show_numa_maps()
				750	* is using it.
				751	* Note: do_set_mempolicy() can be called at init time
				752	* with no 'mm'.
				753	*/
				754	if (mm)
				755	down_write(&mm->mmap_sem);
				756	task_lock(current);
				757	ret = mpol_set_nodemask(new, nodes, scratch);
				758	if (ret) {
				759	task_unlock(current);
				760	if (mm)
				761	up_write(&mm->mmap_sem);
				762	mpol_put(new);
				763	goto out;
				764	}
				765	old = current->mempolicy;
				766	current->mempolicy = new;
				767	mpol_set_task_struct_flag();
				768	if (new && new->mode == MPOL_INTERLEAVE &&
				769	nodes_weight(new->v.nodes))
				770	current->il_next = first_node(new->v.nodes);
				771	task_unlock(current);
				772	if (mm)
				773	up_write(&mm->mmap_sem);
				774
				775	mpol_put(old);
				776	ret = 0;
				777	out:
				778	NODEMASK_SCRATCH_FREE(scratch);
				779	return ret;
				780	}
				781
				782	/*
				783	* Return nodemask for policy for get_mempolicy() query
				784	*
				785	* Called with task's alloc_lock held
				786	*/
				787	static void get_policy_nodemask(struct mempolicy p, nodemask_t nodes)
				788	{
				789	nodes_clear(*nodes);
				790	if (p == &default_policy)
				791	return;
				792
				793	switch (p->mode) {
				794	case MPOL_BIND:
				795	/* Fall through */
				796	case MPOL_INTERLEAVE:
				797	*nodes = p->v.nodes;
				798	break;
				799	case MPOL_PREFERRED:
				800	if (!(p->flags & MPOL_F_LOCAL))
				801	node_set(p->v.preferred_node, *nodes);
				802	/* else return empty node mask for local allocation */
				803	break;
				804	default:
				805	BUG();
				806	}
				807	}
				808
				809	static int lookup_node(struct mm_struct *mm, unsigned long addr)
				810	{
				811	struct page *p;
				812	int err;
				813
				814	err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
				815	if (err >= 0) {
				816	err = page_to_nid(p);
				817	put_page(p);
				818	}
				819	return err;
				820	}
				821
				822	/* Retrieve NUMA policy */
				823	static long do_get_mempolicy(int policy, nodemask_t nmask,
				824	unsigned long addr, unsigned long flags)
				825	{
				826	int err;
				827	struct mm_struct *mm = current->mm;
				828	struct vm_area_struct *vma = NULL;
				829	struct mempolicy *pol = current->mempolicy;
				830
				831	if (flags &
				832	~(unsigned long)(MPOL_F_NODE\|MPOL_F_ADDR\|MPOL_F_MEMS_ALLOWED))
				833	return -EINVAL;
				834
				835	if (flags & MPOL_F_MEMS_ALLOWED) {
				836	if (flags & (MPOL_F_NODE\|MPOL_F_ADDR))
				837	return -EINVAL;
				838	policy = 0; / just so it's initialized */
				839	task_lock(current);
				840	*nmask = cpuset_current_mems_allowed;
				841	task_unlock(current);
				842	return 0;
				843	}
				844
				845	if (flags & MPOL_F_ADDR) {
				846	/*
				847	* Do NOT fall back to task policy if the
				848	* vma/shared policy at addr is NULL. We
				849	* want to return MPOL_DEFAULT in this case.
				850	*/
				851	down_read(&mm->mmap_sem);
				852	vma = find_vma_intersection(mm, addr, addr+1);
				853	if (!vma) {
				854	up_read(&mm->mmap_sem);
				855	return -EFAULT;
				856	}
				857	if (vma->vm_ops && vma->vm_ops->get_policy)
				858	pol = vma->vm_ops->get_policy(vma, addr);
				859	else
				860	pol = vma->vm_policy;
				861	} else if (addr)
				862	return -EINVAL;
				863
				864	if (!pol)
				865	pol = &default_policy; /* indicates default behavior */
				866
				867	if (flags & MPOL_F_NODE) {
				868	if (flags & MPOL_F_ADDR) {
				869	err = lookup_node(mm, addr);
				870	if (err < 0)
				871	goto out;
				872	*policy = err;
				873	} else if (pol == current->mempolicy &&
				874	pol->mode == MPOL_INTERLEAVE) {
				875	*policy = current->il_next;
				876	} else {
				877	err = -EINVAL;
				878	goto out;
				879	}
				880	} else {
				881	*policy = pol == &default_policy ? MPOL_DEFAULT :
				882	pol->mode;
				883	/*
				884	* Internal mempolicy flags must be masked off before exposing
				885	* the policy to userspace.
				886	*/
				887	*policy \|= (pol->flags & MPOL_MODE_FLAGS);
				888	}
				889
				890	if (vma) {
				891	up_read(&current->mm->mmap_sem);
				892	vma = NULL;
				893	}
				894
				895	err = 0;
				896	if (nmask) {
				897	if (mpol_store_user_nodemask(pol)) {
				898	*nmask = pol->w.user_nodemask;
				899	} else {
				900	task_lock(current);
				901	get_policy_nodemask(pol, nmask);
				902	task_unlock(current);
				903	}
				904	}
				905
				906	out:
				907	mpol_cond_put(pol);
				908	if (vma)
				909	up_read(&current->mm->mmap_sem);
				910	return err;
				911	}
				912
				913	#ifdef CONFIG_MIGRATION
				914	/*
				915	* page migration
				916	*/
				917	static void migrate_page_add(struct page page, struct list_head pagelist,
				918	unsigned long flags)
				919	{
				920	/*
				921	* Avoid migrating a page that is shared with others.
				922	*/
				923	if ((flags & MPOL_MF_MOVE_ALL) \|\| page_mapcount(page) == 1) {
				924	if (!isolate_lru_page(page)) {
				925	list_add_tail(&page->lru, pagelist);
				926	inc_zone_page_state(page, NR_ISOLATED_ANON +
				927	page_is_file_cache(page));
				928	}
				929	}
				930	}
				931
				932	static struct page new_node_page(struct page page, unsigned long node, int **x)
				933	{
				934	return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
				935	}
				936
				937	/*
				938	* Migrate pages from one node to a target node.
				939	* Returns error or the number of pages not migrated.
				940	*/
				941	static int migrate_to_node(struct mm_struct *mm, int source, int dest,
				942	int flags)
				943	{
				944	nodemask_t nmask;
				945	LIST_HEAD(pagelist);
				946	int err;
				947
				948	nodes_clear(nmask);
				949	node_set(source, nmask);
				950
				951	err = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
				952	flags \| MPOL_MF_DISCONTIG_OK, &pagelist);
				953	if (err)
				954	return err;
				955
				956	if (!list_empty(&pagelist)) {
				957	err = migrate_pages(&pagelist, new_node_page, dest,
				958	false, MIGRATE_SYNC);
				959	if (err)
				960	putback_lru_pages(&pagelist);
				961	}
				962
				963	return err;
				964	}
				965
				966	/*
				967	* Move pages between the two nodesets so as to preserve the physical
				968	* layout as much as possible.
				969	*
				970	* Returns the number of page that could not be moved.
				971	*/
				972	int do_migrate_pages(struct mm_struct *mm,
				973	const nodemask_t from_nodes, const nodemask_t to_nodes, int flags)
				974	{
				975	int busy = 0;
				976	int err;
				977	nodemask_t tmp;
				978
				979	err = migrate_prep();
				980	if (err)
				981	return err;
				982
				983	down_read(&mm->mmap_sem);
				984
				985	err = migrate_vmas(mm, from_nodes, to_nodes, flags);
				986	if (err)
				987	goto out;
				988
				989	/*
				990	* Find a 'source' bit set in 'tmp' whose corresponding 'dest'
				991	* bit in 'to' is not also set in 'tmp'. Clear the found 'source'
				992	* bit in 'tmp', and return that <source, dest> pair for migration.
				993	* The pair of nodemasks 'to' and 'from' define the map.
				994	*
				995	* If no pair of bits is found that way, fallback to picking some
				996	* pair of 'source' and 'dest' bits that are not the same. If the
				997	* 'source' and 'dest' bits are the same, this represents a node
				998	* that will be migrating to itself, so no pages need move.
				999	*
				1000	* If no bits are left in 'tmp', or if all remaining bits left
				1001	* in 'tmp' correspond to the same bit in 'to', return false
				1002	* (nothing left to migrate).
				1003	*
				1004	* This lets us pick a pair of nodes to migrate between, such that
				1005	* if possible the dest node is not already occupied by some other
				1006	* source node, minimizing the risk of overloading the memory on a
				1007	* node that would happen if we migrated incoming memory to a node
				1008	* before migrating outgoing memory source that same node.
				1009	*
				1010	* A single scan of tmp is sufficient. As we go, we remember the
				1011	* most recent <s, d> pair that moved (s != d). If we find a pair
				1012	* that not only moved, but what's better, moved to an empty slot
				1013	* (d is not set in tmp), then we break out then, with that pair.
				1014	* Otherwise when we finish scanning from_tmp, we at least have the
				1015	* most recent <s, d> pair that moved. If we get all the way through
				1016	* the scan of tmp without finding any node that moved, much less
				1017	* moved to an empty node, then there is nothing left worth migrating.
				1018	*/
				1019
				1020	tmp = *from_nodes;
				1021	while (!nodes_empty(tmp)) {
				1022	int s,d;
				1023	int source = -1;
				1024	int dest = 0;
				1025
				1026	for_each_node_mask(s, tmp) {
				1027	d = node_remap(s, from_nodes, to_nodes);
				1028	if (s == d)
				1029	continue;
				1030
				1031	source = s; /* Node moved. Memorize */
				1032	dest = d;
				1033
				1034	/* dest not in remaining from nodes? */
				1035	if (!node_isset(dest, tmp))
				1036	break;
				1037	}
				1038	if (source == -1)
				1039	break;
				1040
				1041	node_clear(source, tmp);
				1042	err = migrate_to_node(mm, source, dest, flags);
				1043	if (err > 0)
				1044	busy += err;
				1045	if (err < 0)
				1046	break;
				1047	}
				1048	out:
				1049	up_read(&mm->mmap_sem);
				1050	if (err < 0)
				1051	return err;
				1052	return busy;
				1053
				1054	}
				1055
				1056	/*
				1057	* Allocate a new page for page migration based on vma policy.
				1058	* Start by assuming the page is mapped by the same vma as contains @start.
				1059	* Search forward from there, if not. N.B., this assumes that the
				1060	* list of pages handed to migrate_pages()--which is how we get here--
				1061	* is in virtual address order.
				1062	*/
				1063	static struct page new_page(struct page page, unsigned long start, int **x)
				1064	{
				1065	struct vm_area_struct *vma;
				1066	unsigned long uninitialized_var(address);
				1067
				1068	vma = find_vma(current->mm, start);
				1069	while (vma) {
				1070	address = page_address_in_vma(page, vma);
				1071	if (address != -EFAULT)
				1072	break;
				1073	vma = vma->vm_next;
				1074	}
				1075
				1076	/*
				1077	* if !vma, alloc_page_vma() will use task or system default policy
				1078	*/
				1079	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
				1080	}
				1081	#else
				1082
				1083	static void migrate_page_add(struct page page, struct list_head pagelist,
				1084	unsigned long flags)
				1085	{
				1086	}
				1087
				1088	int do_migrate_pages(struct mm_struct *mm,
				1089	const nodemask_t from_nodes, const nodemask_t to_nodes, int flags)
				1090	{
				1091	return -ENOSYS;
				1092	}
				1093
				1094	static struct page new_page(struct page page, unsigned long start, int **x)
				1095	{
				1096	return NULL;
				1097	}
				1098	#endif
				1099
				1100	static long do_mbind(unsigned long start, unsigned long len,
				1101	unsigned short mode, unsigned short mode_flags,
				1102	nodemask_t *nmask, unsigned long flags)
				1103	{
				1104	struct mm_struct *mm = current->mm;
				1105	struct mempolicy *new;
				1106	unsigned long end;
				1107	int err;
				1108	LIST_HEAD(pagelist);
				1109
				1110	if (flags & ~(unsigned long)(MPOL_MF_STRICT \|
				1111	MPOL_MF_MOVE \| MPOL_MF_MOVE_ALL))
				1112	return -EINVAL;
				1113	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
				1114	return -EPERM;
				1115
				1116	if (start & ~PAGE_MASK)
				1117	return -EINVAL;
				1118
				1119	if (mode == MPOL_DEFAULT)
				1120	flags &= ~MPOL_MF_STRICT;
				1121
				1122	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
				1123	end = start + len;
				1124
				1125	if (end < start)
				1126	return -EINVAL;
				1127	if (end == start)
				1128	return 0;
				1129
				1130	new = mpol_new(mode, mode_flags, nmask);
				1131	if (IS_ERR(new))
				1132	return PTR_ERR(new);
				1133
				1134	/*
				1135	* If we are using the default policy then operation
				1136	* on discontinuous address spaces is okay after all
				1137	*/
				1138	if (!new)
				1139	flags \|= MPOL_MF_DISCONTIG_OK;
				1140
				1141	pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
				1142	start, start + len, mode, mode_flags,
				1143	nmask ? nodes_addr(*nmask)[0] : -1);
				1144
				1145	if (flags & (MPOL_MF_MOVE \| MPOL_MF_MOVE_ALL)) {
				1146
				1147	err = migrate_prep();
				1148	if (err)
				1149	goto mpol_out;
				1150	}
				1151	{
				1152	NODEMASK_SCRATCH(scratch);
				1153	if (scratch) {
				1154	down_write(&mm->mmap_sem);
				1155	task_lock(current);
				1156	err = mpol_set_nodemask(new, nmask, scratch);
				1157	task_unlock(current);
				1158	if (err)
				1159	up_write(&mm->mmap_sem);
				1160	} else
				1161	err = -ENOMEM;
				1162	NODEMASK_SCRATCH_FREE(scratch);
				1163	}
				1164	if (err)
				1165	goto mpol_out;
				1166
				1167	err = check_range(mm, start, end, nmask,
				1168	flags \| MPOL_MF_INVERT, &pagelist);
				1169
				1170	if (!err) {
				1171	int nr_failed = 0;
				1172
				1173	err = mbind_range(mm, start, end, new);
				1174
				1175	if (!list_empty(&pagelist)) {
				1176	nr_failed = migrate_pages(&pagelist, new_page,
				1177	start, false, true);
				1178	if (nr_failed)
				1179	putback_lru_pages(&pagelist);
				1180	}
				1181
				1182	if (!err && nr_failed && (flags & MPOL_MF_STRICT))
				1183	err = -EIO;
				1184	} else
				1185	putback_lru_pages(&pagelist);
				1186
				1187	up_write(&mm->mmap_sem);
				1188	mpol_out:
				1189	mpol_put(new);
				1190	return err;
				1191	}
				1192
				1193	/*
				1194	* User space interface with variable sized bitmaps for nodelists.
				1195	*/
				1196
				1197	/* Copy a node mask from user space. */
				1198	static int get_nodes(nodemask_t nodes, const unsigned long __user nmask,
				1199	unsigned long maxnode)
				1200	{
				1201	unsigned long k;
				1202	unsigned long nlongs;
				1203	unsigned long endmask;
				1204
				1205	--maxnode;
				1206	nodes_clear(*nodes);
				1207	if (maxnode == 0 \|\| !nmask)
				1208	return 0;
				1209	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
				1210	return -EINVAL;
				1211
				1212	nlongs = BITS_TO_LONGS(maxnode);
				1213	if ((maxnode % BITS_PER_LONG) == 0)
				1214	endmask = ~0UL;
				1215	else
				1216	endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
				1217
				1218	/* When the user specified more nodes than supported just check
				1219	if the non supported part is all zero. */
				1220	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
				1221	if (nlongs > PAGE_SIZE/sizeof(long))
				1222	return -EINVAL;
				1223	for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
				1224	unsigned long t;
				1225	if (get_user(t, nmask + k))
				1226	return -EFAULT;
				1227	if (k == nlongs - 1) {
				1228	if (t & endmask)
				1229	return -EINVAL;
				1230	} else if (t)
				1231	return -EINVAL;
				1232	}
				1233	nlongs = BITS_TO_LONGS(MAX_NUMNODES);
				1234	endmask = ~0UL;
				1235	}
				1236
				1237	if (copy_from_user(nodes_addr(nodes), nmask, nlongssizeof(unsigned long)))
				1238	return -EFAULT;
				1239	nodes_addr(*nodes)[nlongs-1] &= endmask;
				1240	return 0;
				1241	}
				1242
				1243	/* Copy a kernel node mask to user space */
				1244	static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
				1245	nodemask_t *nodes)
				1246	{
				1247	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
				1248	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
				1249
				1250	if (copy > nbytes) {
				1251	if (copy > PAGE_SIZE)
				1252	return -EINVAL;
				1253	if (clear_user((char __user *)mask + nbytes, copy - nbytes))
				1254	return -EFAULT;
				1255	copy = nbytes;
				1256	}
				1257	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
				1258	}
				1259
				1260	SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
				1261	unsigned long, mode, unsigned long __user *, nmask,
				1262	unsigned long, maxnode, unsigned, flags)
				1263	{
				1264	nodemask_t nodes;
				1265	int err;
				1266	unsigned short mode_flags;
				1267
				1268	mode_flags = mode & MPOL_MODE_FLAGS;
				1269	mode &= ~MPOL_MODE_FLAGS;
				1270	if (mode >= MPOL_MAX)
				1271	return -EINVAL;
				1272	if ((mode_flags & MPOL_F_STATIC_NODES) &&
				1273	(mode_flags & MPOL_F_RELATIVE_NODES))
				1274	return -EINVAL;
				1275	err = get_nodes(&nodes, nmask, maxnode);
				1276	if (err)
				1277	return err;
				1278	return do_mbind(start, len, mode, mode_flags, &nodes, flags);
				1279	}
				1280
				1281	/* Set the process memory policy */
				1282	SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
				1283	unsigned long, maxnode)
				1284	{
				1285	int err;
				1286	nodemask_t nodes;
				1287	unsigned short flags;
				1288
				1289	flags = mode & MPOL_MODE_FLAGS;
				1290	mode &= ~MPOL_MODE_FLAGS;
				1291	if ((unsigned int)mode >= MPOL_MAX)
				1292	return -EINVAL;
				1293	if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
				1294	return -EINVAL;
				1295	err = get_nodes(&nodes, nmask, maxnode);
				1296	if (err)
				1297	return err;
				1298	return do_set_mempolicy(mode, flags, &nodes);
				1299	}
				1300
				1301	SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
				1302	const unsigned long __user *, old_nodes,
				1303	const unsigned long __user *, new_nodes)
				1304	{
				1305	const struct cred cred = current_cred(), tcred;
				1306	struct mm_struct *mm = NULL;
				1307	struct task_struct *task;
				1308	nodemask_t task_nodes;
				1309	int err;
				1310	nodemask_t *old;
				1311	nodemask_t *new;
				1312	NODEMASK_SCRATCH(scratch);
				1313
				1314	if (!scratch)
				1315	return -ENOMEM;
				1316
				1317	old = &scratch->mask1;
				1318	new = &scratch->mask2;
				1319
				1320	err = get_nodes(old, old_nodes, maxnode);
				1321	if (err)
				1322	goto out;
				1323
				1324	err = get_nodes(new, new_nodes, maxnode);
				1325	if (err)
				1326	goto out;
				1327
				1328	/* Find the mm_struct */
				1329	rcu_read_lock();
				1330	task = pid ? find_task_by_vpid(pid) : current;
				1331	if (!task) {
				1332	rcu_read_unlock();
				1333	err = -ESRCH;
				1334	goto out;
				1335	}
				1336	get_task_struct(task);
				1337
				1338	err = -EINVAL;
				1339
				1340	/*
				1341	* Check if this process has the right to modify the specified
				1342	* process. The right exists if the process has administrative
				1343	* capabilities, superuser privileges or the same
				1344	* userid as the target process.
				1345	*/
				1346	tcred = __task_cred(task);
				1347	if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
				1348	cred->uid != tcred->suid && cred->uid != tcred->uid &&
				1349	!capable(CAP_SYS_NICE)) {
				1350	rcu_read_unlock();
				1351	err = -EPERM;
				1352	goto out_put;
				1353	}
				1354	rcu_read_unlock();
				1355
				1356	task_nodes = cpuset_mems_allowed(task);
				1357	/* Is the user allowed to access the target nodes? */
				1358	if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
				1359	err = -EPERM;
				1360	goto out_put;
				1361	}
				1362
				1363	if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
				1364	err = -EINVAL;
				1365	goto out_put;
				1366	}
				1367
				1368	err = security_task_movememory(task);
				1369	if (err)
				1370	goto out_put;
				1371
				1372	mm = get_task_mm(task);
				1373	put_task_struct(task);
				1374
				1375	if (!mm) {
				1376	err = -EINVAL;
				1377	goto out;
				1378	}
				1379
				1380	err = do_migrate_pages(mm, old, new,
				1381	capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
				1382
				1383	mmput(mm);
				1384	out:
				1385	NODEMASK_SCRATCH_FREE(scratch);
				1386
				1387	return err;
				1388
				1389	out_put:
				1390	put_task_struct(task);
				1391	goto out;
				1392
				1393	}
				1394
				1395
				1396	/* Retrieve NUMA policy */
				1397	SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
				1398	unsigned long __user *, nmask, unsigned long, maxnode,
				1399	unsigned long, addr, unsigned long, flags)
				1400	{
				1401	int err;
				1402	int uninitialized_var(pval);
				1403	nodemask_t nodes;
				1404
				1405	if (nmask != NULL && maxnode < MAX_NUMNODES)
				1406	return -EINVAL;
				1407
				1408	err = do_get_mempolicy(&pval, &nodes, addr, flags);
				1409
				1410	if (err)
				1411	return err;
				1412
				1413	if (policy && put_user(pval, policy))
				1414	return -EFAULT;
				1415
				1416	if (nmask)
				1417	err = copy_nodes_to_user(nmask, maxnode, &nodes);
				1418
				1419	return err;
				1420	}
				1421
				1422	#ifdef CONFIG_COMPAT
				1423
				1424	asmlinkage long compat_sys_get_mempolicy(int __user *policy,
				1425	compat_ulong_t __user *nmask,
				1426	compat_ulong_t maxnode,
				1427	compat_ulong_t addr, compat_ulong_t flags)
				1428	{
				1429	long err;
				1430	unsigned long __user *nm = NULL;
				1431	unsigned long nr_bits, alloc_size;
				1432	DECLARE_BITMAP(bm, MAX_NUMNODES);
				1433
				1434	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
				1435	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
				1436
				1437	if (nmask)
				1438	nm = compat_alloc_user_space(alloc_size);
				1439
				1440	err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
				1441
				1442	if (!err && nmask) {
				1443	unsigned long copy_size;
				1444	copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
				1445	err = copy_from_user(bm, nm, copy_size);
				1446	/* ensure entire bitmap is zeroed */
				1447	err \|= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
				1448	err \|= compat_put_bitmap(nmask, bm, nr_bits);
				1449	}
				1450
				1451	return err;
				1452	}
				1453
				1454	asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
				1455	compat_ulong_t maxnode)
				1456	{
				1457	long err = 0;
				1458	unsigned long __user *nm = NULL;
				1459	unsigned long nr_bits, alloc_size;
				1460	DECLARE_BITMAP(bm, MAX_NUMNODES);
				1461
				1462	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
				1463	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
				1464
				1465	if (nmask) {
				1466	err = compat_get_bitmap(bm, nmask, nr_bits);
				1467	nm = compat_alloc_user_space(alloc_size);
				1468	err \|= copy_to_user(nm, bm, alloc_size);
				1469	}
				1470
				1471	if (err)
				1472	return -EFAULT;
				1473
				1474	return sys_set_mempolicy(mode, nm, nr_bits+1);
				1475	}
				1476
				1477	asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
				1478	compat_ulong_t mode, compat_ulong_t __user *nmask,
				1479	compat_ulong_t maxnode, compat_ulong_t flags)
				1480	{
				1481	long err = 0;
				1482	unsigned long __user *nm = NULL;
				1483	unsigned long nr_bits, alloc_size;
				1484	nodemask_t bm;
				1485
				1486	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
				1487	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
				1488
				1489	if (nmask) {
				1490	err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
				1491	nm = compat_alloc_user_space(alloc_size);
				1492	err \|= copy_to_user(nm, nodes_addr(bm), alloc_size);
				1493	}
				1494
				1495	if (err)
				1496	return -EFAULT;
				1497
				1498	return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
				1499	}
				1500
				1501	#endif
				1502
				1503	/*
				1504	* get_vma_policy(@task, @vma, @addr)
				1505	* @task - task for fallback if vma policy == default
				1506	* @vma - virtual memory area whose policy is sought
				1507	* @addr - address in @vma for shared policy lookup
				1508	*
				1509	* Returns effective policy for a VMA at specified address.
				1510	* Falls back to @task or system default policy, as necessary.
				1511	* Current or other task's task mempolicy and non-shared vma policies
				1512	* are protected by the task's mmap_sem, which must be held for read by
				1513	* the caller.
				1514	* Shared policies [those marked as MPOL_F_SHARED] require an extra reference
				1515	* count--added by the get_policy() vm_op, as appropriate--to protect against
				1516	* freeing by another task. It is the caller's responsibility to free the
				1517	* extra reference for shared policies.
				1518	*/
				1519	struct mempolicy get_vma_policy(struct task_struct task,
				1520	struct vm_area_struct *vma, unsigned long addr)
				1521	{
				1522	struct mempolicy *pol = task->mempolicy;
				1523
				1524	if (vma) {
				1525	if (vma->vm_ops && vma->vm_ops->get_policy) {
				1526	struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
				1527	addr);
				1528	if (vpol)
				1529	pol = vpol;
				1530	} else if (vma->vm_policy) {
				1531	pol = vma->vm_policy;
				1532
				1533	/*
				1534	* shmem_alloc_page() passes MPOL_F_SHARED policy with
				1535	* a pseudo vma whose vma->vm_ops=NULL. Take a reference
				1536	* count on these policies which will be dropped by
				1537	* mpol_cond_put() later
				1538	*/
				1539	if (mpol_needs_cond_ref(pol))
				1540	mpol_get(pol);
				1541	}
				1542	}
				1543	if (!pol)
				1544	pol = &default_policy;
				1545	return pol;
				1546	}
				1547
				1548	/*
				1549	* Return a nodemask representing a mempolicy for filtering nodes for
				1550	* page allocation
				1551	*/
				1552	static nodemask_t policy_nodemask(gfp_t gfp, struct mempolicy policy)
				1553	{
				1554	/* Lower zones don't get a nodemask applied for MPOL_BIND */
				1555	if (unlikely(policy->mode == MPOL_BIND) &&
				1556	gfp_zone(gfp) >= policy_zone &&
				1557	cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
				1558	return &policy->v.nodes;
				1559
				1560	return NULL;
				1561	}
				1562
				1563	/* Return a zonelist indicated by gfp for node representing a mempolicy */
				1564	static struct zonelist policy_zonelist(gfp_t gfp, struct mempolicy policy,
				1565	int nd)
				1566	{
				1567	switch (policy->mode) {
				1568	case MPOL_PREFERRED:
				1569	if (!(policy->flags & MPOL_F_LOCAL))
				1570	nd = policy->v.preferred_node;
				1571	break;
				1572	case MPOL_BIND:
				1573	/*
				1574	* Normally, MPOL_BIND allocations are node-local within the
				1575	* allowed nodemask. However, if __GFP_THISNODE is set and the
				1576	* current node isn't part of the mask, we use the zonelist for
				1577	* the first node in the mask instead.
				1578	*/
				1579	if (unlikely(gfp & __GFP_THISNODE) &&
				1580	unlikely(!node_isset(nd, policy->v.nodes)))
				1581	nd = first_node(policy->v.nodes);
				1582	break;
				1583	default:
				1584	BUG();
				1585	}
				1586	return node_zonelist(nd, gfp);
				1587	}
				1588
				1589	/* Do dynamic interleaving for a process */
				1590	static unsigned interleave_nodes(struct mempolicy *policy)
				1591	{
				1592	unsigned nid, next;
				1593	struct task_struct *me = current;
				1594
				1595	nid = me->il_next;
				1596	next = next_node(nid, policy->v.nodes);
				1597	if (next >= MAX_NUMNODES)
				1598	next = first_node(policy->v.nodes);
				1599	if (next < MAX_NUMNODES)
				1600	me->il_next = next;
				1601	return nid;
				1602	}
				1603
				1604	/*
				1605	* Depending on the memory policy provide a node from which to allocate the
				1606	* next slab entry.
				1607	* @policy must be protected by freeing by the caller. If @policy is
				1608	* the current task's mempolicy, this protection is implicit, as only the
				1609	* task can change it's policy. The system default policy requires no
				1610	* such protection.
				1611	*/
				1612	unsigned slab_node(void)
				1613	{
				1614	struct mempolicy *policy;
				1615
				1616	if (in_interrupt())
				1617	return numa_node_id();
				1618
				1619	policy = current->mempolicy;
				1620	if (!policy \|\| policy->flags & MPOL_F_LOCAL)
				1621	return numa_node_id();
				1622
				1623	switch (policy->mode) {
				1624	case MPOL_PREFERRED:
				1625	/*
				1626	* handled MPOL_F_LOCAL above
				1627	*/
				1628	return policy->v.preferred_node;
				1629
				1630	case MPOL_INTERLEAVE:
				1631	return interleave_nodes(policy);
				1632
				1633	case MPOL_BIND: {
				1634	/*
				1635	* Follow bind policy behavior and start allocation at the
				1636	* first node.
				1637	*/
				1638	struct zonelist *zonelist;
				1639	struct zone *zone;
				1640	enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
				1641	zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
				1642	(void)first_zones_zonelist(zonelist, highest_zoneidx,
				1643	&policy->v.nodes,
				1644	&zone);
				1645	return zone ? zone->node : numa_node_id();
				1646	}
				1647
				1648	default:
				1649	BUG();
				1650	}
				1651	}
				1652
				1653	/* Do static interleaving for a VMA with known offset. */
				1654	static unsigned offset_il_node(struct mempolicy *pol,
				1655	struct vm_area_struct *vma, unsigned long off)
				1656	{
				1657	unsigned nnodes = nodes_weight(pol->v.nodes);
				1658	unsigned target;
				1659	int c;
				1660	int nid = -1;
				1661
				1662	if (!nnodes)
				1663	return numa_node_id();
				1664	target = (unsigned int)off % nnodes;
				1665	c = 0;
				1666	do {
				1667	nid = next_node(nid, pol->v.nodes);
				1668	c++;
				1669	} while (c <= target);
				1670	return nid;
				1671	}
				1672
				1673	/* Determine a node number for interleave */
				1674	static inline unsigned interleave_nid(struct mempolicy *pol,
				1675	struct vm_area_struct *vma, unsigned long addr, int shift)
				1676	{
				1677	if (vma) {
				1678	unsigned long off;
				1679
				1680	/*
				1681	* for small pages, there is no difference between
				1682	* shift and PAGE_SHIFT, so the bit-shift is safe.
				1683	* for huge pages, since vm_pgoff is in units of small
				1684	* pages, we need to shift off the always 0 bits to get
				1685	* a useful offset.
				1686	*/
				1687	BUG_ON(shift < PAGE_SHIFT);
				1688	off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
				1689	off += (addr - vma->vm_start) >> shift;
				1690	return offset_il_node(pol, vma, off);
				1691	} else
				1692	return interleave_nodes(pol);
				1693	}
				1694
				1695	/*
				1696	* Return the bit number of a random bit set in the nodemask.
				1697	* (returns -1 if nodemask is empty)
				1698	*/
				1699	int node_random(const nodemask_t *maskp)
				1700	{
				1701	int w, bit = -1;
				1702
				1703	w = nodes_weight(*maskp);
				1704	if (w)
				1705	bit = bitmap_ord_to_pos(maskp->bits,
				1706	get_random_int() % w, MAX_NUMNODES);
				1707	return bit;
				1708	}
				1709
				1710	#ifdef CONFIG_HUGETLBFS
				1711	/*
				1712	* huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
				1713	* @vma = virtual memory area whose policy is sought
				1714	* @addr = address in @vma for shared policy lookup and interleave policy
				1715	* @gfp_flags = for requested zone
				1716	* @mpol = pointer to mempolicy pointer for reference counted mempolicy
				1717	* @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
				1718	*
				1719	* Returns a zonelist suitable for a huge page allocation and a pointer
				1720	* to the struct mempolicy for conditional unref after allocation.
				1721	* If the effective policy is 'BIND, returns a pointer to the mempolicy's
				1722	* @nodemask for filtering the zonelist.
				1723	*
				1724	* Must be protected by get_mems_allowed()
				1725	*/
				1726	struct zonelist huge_zonelist(struct vm_area_struct vma, unsigned long addr,
				1727	gfp_t gfp_flags, struct mempolicy **mpol,
				1728	nodemask_t **nodemask)
				1729	{
				1730	struct zonelist *zl;
				1731
				1732	*mpol = get_vma_policy(current, vma, addr);
				1733	nodemask = NULL; / assume !MPOL_BIND */
				1734
				1735	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
				1736	zl = node_zonelist(interleave_nid(*mpol, vma, addr,
				1737	huge_page_shift(hstate_vma(vma))), gfp_flags);
				1738	} else {
				1739	zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
				1740	if ((*mpol)->mode == MPOL_BIND)
				1741	nodemask = &(mpol)->v.nodes;
				1742	}
				1743	return zl;
				1744	}
				1745
				1746	/*
				1747	* init_nodemask_of_mempolicy
				1748	*
				1749	* If the current task's mempolicy is "default" [NULL], return 'false'
				1750	* to indicate default policy. Otherwise, extract the policy nodemask
				1751	* for 'bind' or 'interleave' policy into the argument nodemask, or
				1752	* initialize the argument nodemask to contain the single node for
				1753	* 'preferred' or 'local' policy and return 'true' to indicate presence
				1754	* of non-default mempolicy.
				1755	*
				1756	* We don't bother with reference counting the mempolicy [mpol_get/put]
				1757	* because the current task is examining it's own mempolicy and a task's
				1758	* mempolicy is only ever changed by the task itself.
				1759	*
				1760	* N.B., it is the caller's responsibility to free a returned nodemask.
				1761	*/
				1762	bool init_nodemask_of_mempolicy(nodemask_t *mask)
				1763	{
				1764	struct mempolicy *mempolicy;
				1765	int nid;
				1766
				1767	if (!(mask && current->mempolicy))
				1768	return false;
				1769
				1770	task_lock(current);
				1771	mempolicy = current->mempolicy;
				1772	switch (mempolicy->mode) {
				1773	case MPOL_PREFERRED:
				1774	if (mempolicy->flags & MPOL_F_LOCAL)
				1775	nid = numa_node_id();
				1776	else
				1777	nid = mempolicy->v.preferred_node;
				1778	init_nodemask_of_node(mask, nid);
				1779	break;
				1780
				1781	case MPOL_BIND:
				1782	/* Fall through */
				1783	case MPOL_INTERLEAVE:
				1784	*mask = mempolicy->v.nodes;
				1785	break;
				1786
				1787	default:
				1788	BUG();
				1789	}
				1790	task_unlock(current);
				1791
				1792	return true;
				1793	}
				1794	#endif
				1795
				1796	/*
				1797	* mempolicy_nodemask_intersects
				1798	*
				1799	* If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
				1800	* policy. Otherwise, check for intersection between mask and the policy
				1801	* nodemask for 'bind' or 'interleave' policy. For 'perferred' or 'local'
				1802	* policy, always return true since it may allocate elsewhere on fallback.
				1803	*
				1804	* Takes task_lock(tsk) to prevent freeing of its mempolicy.
				1805	*/
				1806	bool mempolicy_nodemask_intersects(struct task_struct *tsk,
				1807	const nodemask_t *mask)
				1808	{
				1809	struct mempolicy *mempolicy;
				1810	bool ret = true;
				1811
				1812	if (!mask)
				1813	return ret;
				1814	task_lock(tsk);
				1815	mempolicy = tsk->mempolicy;
				1816	if (!mempolicy)
				1817	goto out;
				1818
				1819	switch (mempolicy->mode) {
				1820	case MPOL_PREFERRED:
				1821	/*
				1822	* MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
				1823	* allocate from, they may fallback to other nodes when oom.
				1824	* Thus, it's possible for tsk to have allocated memory from
				1825	* nodes in mask.
				1826	*/
				1827	break;
				1828	case MPOL_BIND:
				1829	case MPOL_INTERLEAVE:
				1830	ret = nodes_intersects(mempolicy->v.nodes, *mask);
				1831	break;
				1832	default:
				1833	BUG();
				1834	}
				1835	out:
				1836	task_unlock(tsk);
				1837	return ret;
				1838	}
				1839
				1840	/* Allocate a page in interleaved policy.
				1841	Own path because it needs to do special accounting. */
				1842	static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
				1843	unsigned nid)
				1844	{
				1845	struct zonelist *zl;
				1846	struct page *page;
				1847
				1848	zl = node_zonelist(nid, gfp);
				1849	page = __alloc_pages(gfp, order, zl);
				1850	if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
				1851	inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
				1852	return page;
				1853	}
				1854
				1855	/**
				1856	* alloc_pages_vma - Allocate a page for a VMA.
				1857	*
				1858	* @gfp:
				1859	* %GFP_USER user allocation.
				1860	* %GFP_KERNEL kernel allocations,
				1861	* %GFP_HIGHMEM highmem/user allocations,
				1862	* %GFP_FS allocation should not call back into a file system.
				1863	* %GFP_ATOMIC don't sleep.
				1864	*
				1865	* @order:Order of the GFP allocation.
				1866	* @vma: Pointer to VMA or NULL if not available.
				1867	* @addr: Virtual Address of the allocation. Must be inside the VMA.
				1868	*
				1869	* This function allocates a page from the kernel page pool and applies
				1870	* a NUMA policy associated with the VMA or the current process.
				1871	* When VMA is not NULL caller must hold down_read on the mmap_sem of the
				1872	* mm_struct of the VMA to prevent it from going away. Should be used for
				1873	* all allocations for pages that will be mapped into
				1874	* user space. Returns NULL when no page can be allocated.
				1875	*
				1876	* Should be called with the mm_sem of the vma hold.
				1877	*/
				1878	struct page *
				1879	alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
				1880	unsigned long addr, int node)
				1881	{
				1882	struct mempolicy *pol;
				1883	struct zonelist *zl;
				1884	struct page *page;
				1885	unsigned int cpuset_mems_cookie;
				1886
				1887	retry_cpuset:
				1888	pol = get_vma_policy(current, vma, addr);
				1889	cpuset_mems_cookie = get_mems_allowed();
				1890
				1891	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
				1892	unsigned nid;
				1893
				1894	nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
				1895	mpol_cond_put(pol);
				1896	page = alloc_page_interleave(gfp, order, nid);
				1897	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
				1898	goto retry_cpuset;
				1899
				1900	return page;
				1901	}
				1902	zl = policy_zonelist(gfp, pol, node);
				1903	if (unlikely(mpol_needs_cond_ref(pol))) {
				1904	/*
				1905	* slow path: ref counted shared policy
				1906	*/
				1907	struct page *page = __alloc_pages_nodemask(gfp, order,
				1908	zl, policy_nodemask(gfp, pol));
				1909	__mpol_put(pol);
				1910	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
				1911	goto retry_cpuset;
				1912	return page;
				1913	}
				1914	/*
				1915	* fast path: default or task policy
				1916	*/
				1917	page = __alloc_pages_nodemask(gfp, order, zl,
				1918	policy_nodemask(gfp, pol));
				1919	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
				1920	goto retry_cpuset;
				1921	return page;
				1922	}
				1923
				1924	/**
				1925	* alloc_pages_current - Allocate pages.
				1926	*
				1927	* @gfp:
				1928	* %GFP_USER user allocation,
				1929	* %GFP_KERNEL kernel allocation,
				1930	* %GFP_HIGHMEM highmem allocation,
				1931	* %GFP_FS don't call back into a file system.
				1932	* %GFP_ATOMIC don't sleep.
				1933	* @order: Power of two of allocation size in pages. 0 is a single page.
				1934	*
				1935	* Allocate a page from the kernel page pool. When not in
				1936	* interrupt context and apply the current process NUMA policy.
				1937	* Returns NULL when no page can be allocated.
				1938	*
				1939	* Don't call cpuset_update_task_memory_state() unless
				1940	* 1) it's ok to take cpuset_sem (can WAIT), and
				1941	* 2) allocating for current task (not interrupt).
				1942	*/
				1943	struct page *alloc_pages_current(gfp_t gfp, unsigned order)
				1944	{
				1945	struct mempolicy *pol = current->mempolicy;
				1946	struct page *page;
				1947	unsigned int cpuset_mems_cookie;
				1948
				1949	if (!pol \|\| in_interrupt() \|\| (gfp & __GFP_THISNODE))
				1950	pol = &default_policy;
				1951
				1952	retry_cpuset:
				1953	cpuset_mems_cookie = get_mems_allowed();
				1954
				1955	/*
				1956	* No reference counting needed for current->mempolicy
				1957	* nor system default_policy
				1958	*/
				1959	if (pol->mode == MPOL_INTERLEAVE)
				1960	page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
				1961	else
				1962	page = __alloc_pages_nodemask(gfp, order,
				1963	policy_zonelist(gfp, pol, numa_node_id()),
				1964	policy_nodemask(gfp, pol));
				1965
				1966	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
				1967	goto retry_cpuset;
				1968
				1969	return page;
				1970	}
				1971	EXPORT_SYMBOL(alloc_pages_current);
				1972
				/*
				 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
				 * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
				 * with the mems_allowed returned by cpuset_mems_allowed().  This
				 * keeps mempolicies cpuset relative after its cpuset moves.  See
				 * further kernel/cpuset.c update_nodemask().
				 *
				 * current's mempolicy may be rebound by another task (the task that
				 * changes the cpuset's mems), so we needn't do rebind work for the
				 * current task.
				 */
				1983
				1984	/* Slow path of a mempolicy duplicate */
				1985	struct mempolicy __mpol_dup(struct mempolicy old)
				1986	{
				1987	struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
				1988
				1989	if (!new)
				1990	return ERR_PTR(-ENOMEM);
				1991
				1992	/* task's mempolicy is protected by alloc_lock */
				1993	if (old == current->mempolicy) {
				1994	task_lock(current);
				1995	new = old;
				1996	task_unlock(current);
				1997	} else
				1998	new = old;
				1999
				2000	if (current_cpuset_is_being_rebound()) {
				2001	nodemask_t mems = cpuset_mems_allowed(current);
				2002	if (new->flags & MPOL_F_REBINDING)
				2003	mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
				2004	else
				2005	mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
				2006	}
				2007	atomic_set(&new->refcnt, 1);
				2008	return new;
				2009	}
				2010
				2011	/* Slow path of a mempolicy comparison */
				2012	bool __mpol_equal(struct mempolicy a, struct mempolicy b)
				2013	{
				2014	if (!a \|\| !b)
				2015	return false;
				2016	if (a->mode != b->mode)
				2017	return false;
				2018	if (a->flags != b->flags)
				2019	return false;
				2020	if (mpol_store_user_nodemask(a))
				2021	if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
				2022	return false;
				2023
				2024	switch (a->mode) {
				2025	case MPOL_BIND:
				2026	/* Fall through */
				2027	case MPOL_INTERLEAVE:
				2028	return !!nodes_equal(a->v.nodes, b->v.nodes);
				2029	case MPOL_PREFERRED:
				2030	return a->v.preferred_node == b->v.preferred_node;
				2031	default:
				2032	BUG();
				2033	return false;
				2034	}
				2035	}
				2036
				/*
				 * Shared memory backing store policy support.
				 *
				 * Remember policies even when nobody has shared memory mapped.
				 * The policies are kept in a Red-Black tree linked from the inode.
				 * They are protected by the sp->mutex mutex, which should be held
				 * for any accesses to the tree.
				 */
				2045
				2046	/* lookup first element intersecting start-end */
				2047	/* Caller holds sp->mutex */
				2048	static struct sp_node *
				2049	sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
				2050	{
				2051	struct rb_node *n = sp->root.rb_node;
				2052
				2053	while (n) {
				2054	struct sp_node *p = rb_entry(n, struct sp_node, nd);
				2055
				2056	if (start >= p->end)
				2057	n = n->rb_right;
				2058	else if (end <= p->start)
				2059	n = n->rb_left;
				2060	else
				2061	break;
				2062	}
				2063	if (!n)
				2064	return NULL;
				2065	for (;;) {
				2066	struct sp_node *w = NULL;
				2067	struct rb_node *prev = rb_prev(n);
				2068	if (!prev)
				2069	break;
				2070	w = rb_entry(prev, struct sp_node, nd);
				2071	if (w->end <= start)
				2072	break;
				2073	n = prev;
				2074	}
				2075	return rb_entry(n, struct sp_node, nd);
				2076	}
				2077
				2078	/* Insert a new shared policy into the list. */
				2079	/* Caller holds sp->lock */
				2080	static void sp_insert(struct shared_policy sp, struct sp_node new)
				2081	{
				2082	struct rb_node **p = &sp->root.rb_node;
				2083	struct rb_node *parent = NULL;
				2084	struct sp_node *nd;
				2085
				2086	while (*p) {
				2087	parent = *p;
				2088	nd = rb_entry(parent, struct sp_node, nd);
				2089	if (new->start < nd->start)
				2090	p = &(*p)->rb_left;
				2091	else if (new->end > nd->end)
				2092	p = &(*p)->rb_right;
				2093	else
				2094	BUG();
				2095	}
				2096	rb_link_node(&new->nd, parent, p);
				2097	rb_insert_color(&new->nd, &sp->root);
				2098	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
				2099	new->policy ? new->policy->mode : 0);
				2100	}
				2101
				2102	/* Find shared policy intersecting idx */
				2103	struct mempolicy *
				2104	mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
				2105	{
				2106	struct mempolicy *pol = NULL;
				2107	struct sp_node *sn;
				2108
				2109	if (!sp->root.rb_node)
				2110	return NULL;
				2111	mutex_lock(&sp->mutex);
				2112	sn = sp_lookup(sp, idx, idx+1);
				2113	if (sn) {
				2114	mpol_get(sn->policy);
				2115	pol = sn->policy;
				2116	}
				2117	mutex_unlock(&sp->mutex);
				2118	return pol;
				2119	}
				2120
				2121	static void sp_free(struct sp_node *n)
				2122	{
				2123	mpol_put(n->policy);
				2124	kmem_cache_free(sn_cache, n);
				2125	}
				2126
				2127	static void sp_delete(struct shared_policy sp, struct sp_node n)
				2128	{
				2129	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
				2130	rb_erase(&n->nd, &sp->root);
				2131	sp_free(n);
				2132	}
				2133
				2134	static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
				2135	struct mempolicy *pol)
				2136	{
				2137	struct sp_node *n;
				2138	struct mempolicy *newpol;
				2139
				2140	n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
				2141	if (!n)
				2142	return NULL;
				2143
				2144	newpol = mpol_dup(pol);
				2145	if (IS_ERR(newpol)) {
				2146	kmem_cache_free(sn_cache, n);
				2147	return NULL;
				2148	}
				2149	newpol->flags \|= MPOL_F_SHARED;
				2150
				2151	n->start = start;
				2152	n->end = end;
				2153	n->policy = newpol;
				2154
				2155	return n;
				2156	}
				2157
				2158	/* Replace a policy range. */
				2159	static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
				2160	unsigned long end, struct sp_node *new)
				2161	{
				2162	struct sp_node *n;
				2163	int ret = 0;
				2164
				2165	mutex_lock(&sp->mutex);
				2166	n = sp_lookup(sp, start, end);
				2167	/* Take care of old policies in the same range. */
				2168	while (n && n->start < end) {
				2169	struct rb_node *next = rb_next(&n->nd);
				2170	if (n->start >= start) {
				2171	if (n->end <= end)
				2172	sp_delete(sp, n);
				2173	else
				2174	n->start = end;
				2175	} else {
				2176	/* Old policy spanning whole new range. */
				2177	if (n->end > end) {
				2178	struct sp_node *new2;
				2179	new2 = sp_alloc(end, n->end, n->policy);
				2180	if (!new2) {
				2181	ret = -ENOMEM;
				2182	goto out;
				2183	}
				2184	n->end = start;
				2185	sp_insert(sp, new2);
				2186	break;
				2187	} else
				2188	n->end = start;
				2189	}
				2190	if (!next)
				2191	break;
				2192	n = rb_entry(next, struct sp_node, nd);
				2193	}
				2194	if (new)
				2195	sp_insert(sp, new);
				2196	out:
				2197	mutex_unlock(&sp->mutex);
				2198	return ret;
				2199	}
				2200
				2201	/**
				2202	* mpol_shared_policy_init - initialize shared policy for inode
				2203	* @sp: pointer to inode shared policy
				2204	* @mpol: struct mempolicy to install
				2205	*
				2206	* Install non-NULL @mpol in inode's shared policy rb-tree.
				2207	* On entry, the current task has a reference on a non-NULL @mpol.
				2208	* This must be released on exit.
				2209	* This is called at get_inode() calls and we can use GFP_KERNEL.
				2210	*/
				2211	void mpol_shared_policy_init(struct shared_policy sp, struct mempolicy mpol)
				2212	{
				2213	int ret;
				2214
				2215	sp->root = RB_ROOT; /* empty tree == default mempolicy */
				2216	mutex_init(&sp->mutex);
				2217
				2218	if (mpol) {
				2219	struct vm_area_struct pvma;
				2220	struct mempolicy *new;
				2221	NODEMASK_SCRATCH(scratch);
				2222
				2223	if (!scratch)
				2224	goto put_mpol;
				2225	/* contextualize the tmpfs mount point mempolicy */
				2226	new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
				2227	if (IS_ERR(new))
				2228	goto free_scratch; /* no valid nodemask intersection */
				2229
				2230	task_lock(current);
				2231	ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
				2232	task_unlock(current);
				2233	if (ret)
				2234	goto put_new;
				2235
				2236	/* Create pseudo-vma that contains just the policy */
				2237	memset(&pvma, 0, sizeof(struct vm_area_struct));
				2238	pvma.vm_end = TASK_SIZE; /* policy covers entire file */
				2239	mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
				2240
				2241	put_new:
				2242	mpol_put(new); /* drop initial ref */
				2243	free_scratch:
				2244	NODEMASK_SCRATCH_FREE(scratch);
				2245	put_mpol:
				2246	mpol_put(mpol); /* drop our incoming ref on sb mpol */
				2247	}
				2248	}
				2249
				2250	int mpol_set_shared_policy(struct shared_policy *info,
				2251	struct vm_area_struct vma, struct mempolicy npol)
				2252	{
				2253	int err;
				2254	struct sp_node *new = NULL;
				2255	unsigned long sz = vma_pages(vma);
				2256
				2257	pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
				2258	vma->vm_pgoff,
				2259	sz, npol ? npol->mode : -1,
				2260	npol ? npol->flags : -1,
				2261	npol ? nodes_addr(npol->v.nodes)[0] : -1);
				2262
				2263	if (npol) {
				2264	new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
				2265	if (!new)
				2266	return -ENOMEM;
				2267	}
				2268	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
				2269	if (err && new)
				2270	sp_free(new);
				2271	return err;
				2272	}
				2273
				2274	/* Free a backing policy store on inode delete. */
				2275	void mpol_free_shared_policy(struct shared_policy *p)
				2276	{
				2277	struct sp_node *n;
				2278	struct rb_node *next;
				2279
				2280	if (!p->root.rb_node)
				2281	return;
				2282	mutex_lock(&p->mutex);
				2283	next = rb_first(&p->root);
				2284	while (next) {
				2285	n = rb_entry(next, struct sp_node, nd);
				2286	next = rb_next(&n->nd);
				2287	sp_delete(p, n);
				2288	}
				2289	mutex_unlock(&p->mutex);
				2290	}
				2291
				2292	/* assumes fs == KERNEL_DS */
				2293	void __init numa_policy_init(void)
				2294	{
				2295	nodemask_t interleave_nodes;
				2296	unsigned long largest = 0;
				2297	int nid, prefer = 0;
				2298
				2299	policy_cache = kmem_cache_create("numa_policy",
				2300	sizeof(struct mempolicy),
				2301	0, SLAB_PANIC, NULL);
				2302
				2303	sn_cache = kmem_cache_create("shared_policy_node",
				2304	sizeof(struct sp_node),
				2305	0, SLAB_PANIC, NULL);
				2306
				2307	/*
				2308	* Set interleaving policy for system init. Interleaving is only
				2309	* enabled across suitably sized nodes (default is >= 16MB), or
				2310	* fall back to the largest node if they're all smaller.
				2311	*/
				2312	nodes_clear(interleave_nodes);
				2313	for_each_node_state(nid, N_HIGH_MEMORY) {
				2314	unsigned long total_pages = node_present_pages(nid);
				2315
				2316	/* Preserve the largest node */
				2317	if (largest < total_pages) {
				2318	largest = total_pages;
				2319	prefer = nid;
				2320	}
				2321
				2322	/* Interleave this node? */
				2323	if ((total_pages << PAGE_SHIFT) >= (16 << 20))
				2324	node_set(nid, interleave_nodes);
				2325	}
				2326
				2327	/* All too small, use the largest */
				2328	if (unlikely(nodes_empty(interleave_nodes)))
				2329	node_set(prefer, interleave_nodes);
				2330
				2331	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
				2332	printk("numa_policy_init: interleaving failed\n");
				2333	}
				2334
				2335	/* Reset policy of current process to default */
				2336	void numa_default_policy(void)
				2337	{
				2338	do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
				2339	}
				2340
				2341	/*
				2342	* Parse and format mempolicy from/to strings
				2343	*/
				2344
				2345	/*
				2346	* "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
				2347	*/
				2348	#define MPOL_LOCAL MPOL_MAX
				2349	static const char * const policy_modes[] =
				2350	{
				2351	[MPOL_DEFAULT] = "default",
				2352	[MPOL_PREFERRED] = "prefer",
				2353	[MPOL_BIND] = "bind",
				2354	[MPOL_INTERLEAVE] = "interleave",
				2355	[MPOL_LOCAL] = "local"
				2356	};
				2357
				2358
				#ifdef CONFIG_TMPFS
				/**
				 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
				 * @str:  string containing mempolicy to parse
				 * @mpol:  pointer to struct mempolicy pointer, returned on success.
				 * @unused:  redundant argument, to be removed later.
				 *
				 * Format of input:
				 *	<mode>[=<flags>][:<nodelist>]
				 *
				 * @str is split in place while parsing (the ':' and '=' separators are
				 * temporarily overwritten with NUL) and restored before returning.
				 * For mode "default", *@mpol is set to NULL.
				 *
				 * On success, returns 0, else 1
				 */
				int mpol_parse_str(char *str, struct mempolicy **mpol, int unused)
				{
					struct mempolicy *new = NULL;
					unsigned short mode;
					unsigned short mode_flags;
					nodemask_t nodes;
					char *nodelist = strchr(str, ':');
					char *flags = strchr(str, '=');
					int err = 1;

					if (nodelist) {
						/* NUL-terminate mode or flags string */
						*nodelist++ = '\0';
						if (nodelist_parse(nodelist, nodes))
							goto out;
						if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
							goto out;
					} else
						nodes_clear(nodes);

					if (flags)
						*flags++ = '\0';	/* terminate mode string */

					for (mode = 0; mode <= MPOL_LOCAL; mode++) {
						if (!strcmp(str, policy_modes[mode])) {
							break;
						}
					}
					if (mode > MPOL_LOCAL)
						goto out;

					switch (mode) {
					case MPOL_PREFERRED:
						/*
						 * Insist on a nodelist of one node only
						 */
						if (nodelist) {
							char *rest = nodelist;
							while (isdigit(*rest))
								rest++;
							if (*rest)
								goto out;
						}
						break;
					case MPOL_INTERLEAVE:
						/*
						 * Default to online nodes with memory if no nodelist
						 */
						if (!nodelist)
							nodes = node_states[N_HIGH_MEMORY];
						break;
					case MPOL_LOCAL:
						/*
						 * Don't allow a nodelist;  mpol_new() checks flags
						 */
						if (nodelist)
							goto out;
						mode = MPOL_PREFERRED;
						break;
					case MPOL_DEFAULT:
						/*
						 * Insist on a empty nodelist
						 */
						if (!nodelist)
							err = 0;
						goto out;
					case MPOL_BIND:
						/*
						 * Insist on a nodelist
						 */
						if (!nodelist)
							goto out;
					}

					mode_flags = 0;
					if (flags) {
						/*
						 * Currently, we only support two mutually exclusive
						 * mode flags.
						 */
						if (!strcmp(flags, "static"))
							mode_flags |= MPOL_F_STATIC_NODES;
						else if (!strcmp(flags, "relative"))
							mode_flags |= MPOL_F_RELATIVE_NODES;
						else
							goto out;
					}

					new = mpol_new(mode, mode_flags, &nodes);
					if (IS_ERR(new))
						goto out;

					/*
					 * Save nodes for mpol_to_str() to show the tmpfs mount options
					 * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
					 */
					if (mode != MPOL_PREFERRED)
						new->v.nodes = nodes;
					else if (nodelist)
						new->v.preferred_node = first_node(nodes);
					else
						new->flags |= MPOL_F_LOCAL;

					/*
					 * Save nodes for contextualization: this will be used to "clone"
					 * the mempolicy in a specific context [cpuset] at a later time.
					 */
					new->w.user_nodemask = nodes;

					err = 0;

				out:
					/* Restore string for error message */
					if (nodelist)
						*--nodelist = ':';
					if (flags)
						*--flags = '=';
					if (!err)
						*mpol = new;
					return err;
				}
				#endif /* CONFIG_TMPFS */
				2493
				2494	/**
				2495	* mpol_to_str - format a mempolicy structure for printing
				2496	* @buffer: to contain formatted mempolicy string
				2497	* @maxlen: length of @buffer
				2498	* @pol: pointer to mempolicy to be formatted
				2499	* @unused: redundant argument, to be removed later.
				2500	*
				2501	* Convert a mempolicy into a string.
				2502	* Returns the number of characters in buffer (if positive)
				2503	* or an error (negative)
				2504	*/
				2505	int mpol_to_str(char buffer, int maxlen, struct mempolicy pol, int unused)
				2506	{
				2507	char *p = buffer;
				2508	int l;
				2509	nodemask_t nodes;
				2510	unsigned short mode;
				2511	unsigned short flags = pol ? pol->flags : 0;
				2512
				2513	/*
				2514	* Sanity check: room for longest mode, flag and some nodes
				2515	*/
				2516	VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
				2517
				2518	if (!pol \|\| pol == &default_policy)
				2519	mode = MPOL_DEFAULT;
				2520	else
				2521	mode = pol->mode;
				2522
				2523	switch (mode) {
				2524	case MPOL_DEFAULT:
				2525	nodes_clear(nodes);
				2526	break;
				2527
				2528	case MPOL_PREFERRED:
				2529	nodes_clear(nodes);
				2530	if (flags & MPOL_F_LOCAL)
				2531	mode = MPOL_LOCAL;
				2532	else
				2533	node_set(pol->v.preferred_node, nodes);
				2534	break;
				2535
				2536	case MPOL_BIND:
				2537	/* Fall through */
				2538	case MPOL_INTERLEAVE:
				2539	nodes = pol->v.nodes;
				2540	break;
				2541
				2542	default:
				2543	return -EINVAL;
				2544	}
				2545
				2546	l = strlen(policy_modes[mode]);
				2547	if (buffer + maxlen < p + l + 1)
				2548	return -ENOSPC;
				2549
				2550	strcpy(p, policy_modes[mode]);
				2551	p += l;
				2552
				2553	if (flags & MPOL_MODE_FLAGS) {
				2554	if (buffer + maxlen < p + 2)
				2555	return -ENOSPC;
				2556	*p++ = '=';
				2557
				2558	/*
				2559	* Currently, the only defined flags are mutually exclusive
				2560	*/
				2561	if (flags & MPOL_F_STATIC_NODES)
				2562	p += snprintf(p, buffer + maxlen - p, "static");
				2563	else if (flags & MPOL_F_RELATIVE_NODES)
				2564	p += snprintf(p, buffer + maxlen - p, "relative");
				2565	}
				2566
				2567	if (!nodes_empty(nodes)) {
				2568	if (buffer + maxlen < p + 2)
				2569	return -ENOSPC;
				2570	*p++ = ':';
				2571	p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
				2572	}
				2573	return p - buffer;
				2574	}