Blame - ap/os/linux/linux-3.4.x/kernel/cpuset.c - T106_DC

blob: 7f3bde5c50fe3ac32784bf89c976349fff7b4835 [file] [log] [blame]

lh	9ed821d	2023-04-07 01:36:19 -0700	[diff] [blame]	1	/*
				2	* kernel/cpuset.c
				3	*
				4	* Processor and Memory placement constraints for sets of tasks.
				5	*
				6	* Copyright (C) 2003 BULL SA.
				7	* Copyright (C) 2004-2007 Silicon Graphics, Inc.
				8	* Copyright (C) 2006 Google, Inc
				9	*
				10	* Portions derived from Patrick Mochel's sysfs code.
				11	* sysfs is Copyright (c) 2001-3 Patrick Mochel
				12	*
				13	* 2003-10-10 Written by Simon Derr.
				14	* 2003-10-22 Updates by Stephen Hemminger.
				15	* 2004 May-July Rework by Paul Jackson.
				16	* 2006 Rework by Paul Menage to use generic cgroups
				17	* 2008 Rework of the scheduler domains and CPU hotplug handling
				18	* by Max Krasnyansky
				19	*
				20	* This file is subject to the terms and conditions of the GNU General Public
				21	* License. See the file COPYING in the main directory of the Linux
				22	* distribution for more details.
				23	*/
				24
				25	#include <linux/cpu.h>
				26	#include <linux/cpumask.h>
				27	#include <linux/cpuset.h>
				28	#include <linux/err.h>
				29	#include <linux/errno.h>
				30	#include <linux/file.h>
				31	#include <linux/fs.h>
				32	#include <linux/init.h>
				33	#include <linux/interrupt.h>
				34	#include <linux/kernel.h>
				35	#include <linux/kmod.h>
				36	#include <linux/list.h>
				37	#include <linux/mempolicy.h>
				38	#include <linux/mm.h>
				39	#include <linux/memory.h>
				40	#include <linux/export.h>
				41	#include <linux/mount.h>
				42	#include <linux/namei.h>
				43	#include <linux/pagemap.h>
				44	#include <linux/proc_fs.h>
				45	#include <linux/rcupdate.h>
				46	#include <linux/sched.h>
				47	#include <linux/seq_file.h>
				48	#include <linux/security.h>
				49	#include <linux/slab.h>
				50	#include <linux/spinlock.h>
				51	#include <linux/stat.h>
				52	#include <linux/string.h>
				53	#include <linux/time.h>
				54	#include <linux/backing-dev.h>
				55	#include <linux/sort.h>
				56
				57	#include <asm/uaccess.h>
				58	#include <linux/atomic.h>
				59	#include <linux/mutex.h>
				60	#include <linux/workqueue.h>
				61	#include <linux/cgroup.h>
				62
				63	/*
				64	* Workqueue for cpuset related tasks.
				65	*
				66	* Using kevent workqueue may cause deadlock when memory_migrate
				67	* is set. So we create a separate workqueue thread for cpuset.
				68	*/
				69	static struct workqueue_struct *cpuset_wq;
				70
				71	/*
				72	* Tracks how many cpusets are currently defined in system.
				73	* When there is only one cpuset (the root cpuset) we can
				74	* short circuit some hooks.
				75	*/
				76	int number_of_cpusets __read_mostly;
				77
				78	/* Forward declare cgroup structures */
				79	struct cgroup_subsys cpuset_subsys;
				80	struct cpuset;
				81
				82	/* See "Frequency meter" comments, below. */
				83
				84	struct fmeter {
				85	int cnt; /* unprocessed events count */
				86	int val; /* most recent output value */
				87	time_t time; /* clock (secs) when val computed */
				88	spinlock_t lock; /* guards read or write of above */
				89	};
				90
				91	struct cpuset {
				92	struct cgroup_subsys_state css;
				93
				94	unsigned long flags; /* "unsigned long" so bitops work */
				95	cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
				96	nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
				97
				98	struct cpuset *parent; /* my parent */
				99
				100	struct fmeter fmeter; /* memory_pressure filter */
				101
				102	/* partition number for rebuild_sched_domains() */
				103	int pn;
				104
				105	/* for custom sched domain */
				106	int relax_domain_level;
				107
				108	/* used for walking a cpuset hierarchy */
				109	struct list_head stack_list;
				110	};
				111
				112	/* Retrieve the cpuset for a cgroup */
				113	static inline struct cpuset *cgroup_cs(struct cgroup *cont)
				114	{
				115	return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
				116	struct cpuset, css);
				117	}
				118
				119	/* Retrieve the cpuset for a task */
				120	static inline struct cpuset *task_cs(struct task_struct *task)
				121	{
				122	return container_of(task_subsys_state(task, cpuset_subsys_id),
				123	struct cpuset, css);
				124	}
				125
				126	#ifdef CONFIG_NUMA
				127	static inline bool task_has_mempolicy(struct task_struct *task)
				128	{
				129	return task->mempolicy;
				130	}
				131	#else
				132	static inline bool task_has_mempolicy(struct task_struct *task)
				133	{
				134	return false;
				135	}
				136	#endif
				137
				138
				139	/* bits in struct cpuset flags field */
				140	typedef enum {
				141	CS_CPU_EXCLUSIVE,
				142	CS_MEM_EXCLUSIVE,
				143	CS_MEM_HARDWALL,
				144	CS_MEMORY_MIGRATE,
				145	CS_SCHED_LOAD_BALANCE,
				146	CS_SPREAD_PAGE,
				147	CS_SPREAD_SLAB,
				148	} cpuset_flagbits_t;
				149
				150	/* convenient tests for these bits */
				151	static inline int is_cpu_exclusive(const struct cpuset *cs)
				152	{
				153	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
				154	}
				155
				156	static inline int is_mem_exclusive(const struct cpuset *cs)
				157	{
				158	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
				159	}
				160
				161	static inline int is_mem_hardwall(const struct cpuset *cs)
				162	{
				163	return test_bit(CS_MEM_HARDWALL, &cs->flags);
				164	}
				165
				166	static inline int is_sched_load_balance(const struct cpuset *cs)
				167	{
				168	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
				169	}
				170
				171	static inline int is_memory_migrate(const struct cpuset *cs)
				172	{
				173	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
				174	}
				175
				176	static inline int is_spread_page(const struct cpuset *cs)
				177	{
				178	return test_bit(CS_SPREAD_PAGE, &cs->flags);
				179	}
				180
				181	static inline int is_spread_slab(const struct cpuset *cs)
				182	{
				183	return test_bit(CS_SPREAD_SLAB, &cs->flags);
				184	}
				185
				186	static struct cpuset top_cpuset = {
				187	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
				188	};
				189
				190	/*
				191	* There are two global mutexes guarding cpuset structures. The first
				192	* is the main control groups cgroup_mutex, accessed via
				193	* cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific
				194	* callback_mutex, below. They can nest. It is ok to first take
				195	* cgroup_mutex, then nest callback_mutex. We also require taking
				196	* task_lock() when dereferencing a task's cpuset pointer. See "The
				197	* task_lock() exception", at the end of this comment.
				198	*
				199	* A task must hold both mutexes to modify cpusets. If a task
				200	* holds cgroup_mutex, then it blocks others wanting that mutex,
				201	* ensuring that it is the only task able to also acquire callback_mutex
				202	* and be able to modify cpusets. It can perform various checks on
				203	* the cpuset structure first, knowing nothing will change. It can
				204	* also allocate memory while just holding cgroup_mutex. While it is
				205	* performing these checks, various callback routines can briefly
				206	* acquire callback_mutex to query cpusets. Once it is ready to make
				207	* the changes, it takes callback_mutex, blocking everyone else.
				208	*
				209	* Calls to the kernel memory allocator can not be made while holding
				210	* callback_mutex, as that would risk double tripping on callback_mutex
				211	* from one of the callbacks into the cpuset code from within
				212	* __alloc_pages().
				213	*
				214	* If a task is only holding callback_mutex, then it has read-only
				215	* access to cpusets.
				216	*
				217	* The task_struct fields mems_allowed and mempolicy may be changed
				218	* by another task, so we use alloc_lock in the task_struct to protect
				219	* them.
				220	*
				221	* The cpuset_common_file_read() handlers only hold callback_mutex across
				222	* small pieces of code, such as when reading out possibly multi-word
				223	* cpumasks and nodemasks.
				224	*
				225	* Accessing a task's cpuset should be done in accordance with the
				226	* guidelines for accessing subsystem state in kernel/cgroup.c
				227	*/
				228
				229	static DEFINE_MUTEX(callback_mutex);
				230
				231	/*
				232	* cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
				233	* buffers. They are statically allocated to prevent using excess stack
				234	* when calling cpuset_print_task_mems_allowed().
				235	*/
				236	#define CPUSET_NAME_LEN (128)
				237	#define CPUSET_NODELIST_LEN (256)
				238	static char cpuset_name[CPUSET_NAME_LEN];
				239	static char cpuset_nodelist[CPUSET_NODELIST_LEN];
				240	static DEFINE_SPINLOCK(cpuset_buffer_lock);
				241
				242	/*
				243	* This is ugly, but preserves the userspace API for existing cpuset
				244	* users. If someone tries to mount the "cpuset" filesystem, we
				245	* silently switch it to mount "cgroup" instead
				246	*/
				247	static struct dentry *cpuset_mount(struct file_system_type *fs_type,
				248	int flags, const char *unused_dev_name, void *data)
				249	{
				250	struct file_system_type *cgroup_fs = get_fs_type("cgroup");
				251	struct dentry *ret = ERR_PTR(-ENODEV);
				252	if (cgroup_fs) {
				253	char mountopts[] =
				254	"cpuset,noprefix,"
				255	"release_agent=/sbin/cpuset_release_agent";
				256	ret = cgroup_fs->mount(cgroup_fs, flags,
				257	unused_dev_name, mountopts);
				258	put_filesystem(cgroup_fs);
				259	}
				260	return ret;
				261	}
				262
				263	static struct file_system_type cpuset_fs_type = {
				264	.name = "cpuset",
				265	.mount = cpuset_mount,
				266	};
				267
				268	/*
				269	* Return in pmask the portion of a cpuset's cpus_allowed that
				270	* are online. If none are online, walk up the cpuset hierarchy
				271	* until we find one that does have some online cpus. If we get
				272	* all the way to the top and still haven't found any online cpus,
				273	* return cpu_online_mask. Or if passed a NULL cs from an exit'ing
				274	* task, return cpu_online_mask.
				275	*
				276	* One way or another, we guarantee to return some non-empty subset
				277	* of cpu_online_mask.
				278	*
				279	* Call with callback_mutex held.
				280	*/
				281
				282	static void guarantee_online_cpus(const struct cpuset *cs,
				283	struct cpumask *pmask)
				284	{
				285	while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
				286	cs = cs->parent;
				287	if (cs)
				288	cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
				289	else
				290	cpumask_copy(pmask, cpu_online_mask);
				291	BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
				292	}
				293
				294	/*
				295	* Return in *pmask the portion of a cpuset's mems_allowed that
				296	* are online, with memory. If none are online with memory, walk
				297	* up the cpuset hierarchy until we find one that does have some
				298	* online mems. If we get all the way to the top and still haven't
				299	* found any online mems, return node_states[N_HIGH_MEMORY].
				300	*
				301	* One way or another, we guarantee to return some non-empty subset
				302	* of node_states[N_HIGH_MEMORY].
				303	*
				304	* Call with callback_mutex held.
				305	*/
				306
				307	static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
				308	{
				309	while (cs && !nodes_intersects(cs->mems_allowed,
				310	node_states[N_HIGH_MEMORY]))
				311	cs = cs->parent;
				312	if (cs)
				313	nodes_and(*pmask, cs->mems_allowed,
				314	node_states[N_HIGH_MEMORY]);
				315	else
				316	*pmask = node_states[N_HIGH_MEMORY];
				317	BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
				318	}
				319
				320	/*
				321	* update task's spread flag if cpuset's page/slab spread flag is set
				322	*
				323	* Called with callback_mutex/cgroup_mutex held
				324	*/
				325	static void cpuset_update_task_spread_flag(struct cpuset *cs,
				326	struct task_struct *tsk)
				327	{
				328	if (is_spread_page(cs))
				329	task_set_spread_page(tsk);
				330	else
				331	task_clear_spread_page(tsk);
				332
				333	if (is_spread_slab(cs))
				334	task_set_spread_slab(tsk);
				335	else
				336	task_clear_spread_slab(tsk);
				337	}
				338
				339	/*
				340	* is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
				341	*
				342	* One cpuset is a subset of another if all its allowed CPUs and
				343	* Memory Nodes are a subset of the other, and its exclusive flags
				344	* are only set if the other's are set. Call holding cgroup_mutex.
				345	*/
				346
				347	static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
				348	{
				349	return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
				350	nodes_subset(p->mems_allowed, q->mems_allowed) &&
				351	is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
				352	is_mem_exclusive(p) <= is_mem_exclusive(q);
				353	}
				354
				355	/**
				356	* alloc_trial_cpuset - allocate a trial cpuset
				357	* @cs: the cpuset that the trial cpuset duplicates
				358	*/
				359	static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
				360	{
				361	struct cpuset *trial;
				362
				363	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
				364	if (!trial)
				365	return NULL;
				366
				367	if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
				368	kfree(trial);
				369	return NULL;
				370	}
				371	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
				372
				373	return trial;
				374	}
				375
				376	/**
				377	* free_trial_cpuset - free the trial cpuset
				378	* @trial: the trial cpuset to be freed
				379	*/
				380	static void free_trial_cpuset(struct cpuset *trial)
				381	{
				382	free_cpumask_var(trial->cpus_allowed);
				383	kfree(trial);
				384	}
				385
				386	/*
				387	* validate_change() - Used to validate that any proposed cpuset change
				388	* follows the structural rules for cpusets.
				389	*
				390	* If we replaced the flag and mask values of the current cpuset
				391	* (cur) with those values in the trial cpuset (trial), would
				392	* our various subset and exclusive rules still be valid? Presumes
				393	* cgroup_mutex held.
				394	*
				395	* 'cur' is the address of an actual, in-use cpuset. Operations
				396	* such as list traversal that depend on the actual address of the
				397	* cpuset in the list must use cur below, not trial.
				398	*
				399	* 'trial' is the address of bulk structure copy of cur, with
				400	* perhaps one or more of the fields cpus_allowed, mems_allowed,
				401	* or flags changed to new, trial values.
				402	*
				403	* Return 0 if valid, -errno if not.
				404	*/
				405
				406	static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
				407	{
				408	struct cgroup *cont;
				409	struct cpuset *c, *par;
				410
				411	/* Each of our child cpusets must be a subset of us */
				412	list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
				413	if (!is_cpuset_subset(cgroup_cs(cont), trial))
				414	return -EBUSY;
				415	}
				416
				417	/* Remaining checks don't apply to root cpuset */
				418	if (cur == &top_cpuset)
				419	return 0;
				420
				421	par = cur->parent;
				422
				423	/* We must be a subset of our parent cpuset */
				424	if (!is_cpuset_subset(trial, par))
				425	return -EACCES;
				426
				427	/*
				428	* If either I or some sibling (!= me) is exclusive, we can't
				429	* overlap
				430	*/
				431	list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
				432	c = cgroup_cs(cont);
				433	if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
				434	c != cur &&
				435	cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
				436	return -EINVAL;
				437	if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
				438	c != cur &&
				439	nodes_intersects(trial->mems_allowed, c->mems_allowed))
				440	return -EINVAL;
				441	}
				442
				443	/* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
				444	if (cgroup_task_count(cur->css.cgroup)) {
				445	if (cpumask_empty(trial->cpus_allowed) ||
				446	nodes_empty(trial->mems_allowed)) {
				447	return -ENOSPC;
				448	}
				449	}
				450
				451	return 0;
				452	}
				453
				454	#ifdef CONFIG_SMP
				455	/*
				456	* Helper routine for generate_sched_domains().
				457	* Do cpusets a, b have overlapping cpus_allowed masks?
				458	*/
				459	static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
				460	{
				461	return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
				462	}
				463
				464	static void
				465	update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
				466	{
				467	if (dattr->relax_domain_level < c->relax_domain_level)
				468	dattr->relax_domain_level = c->relax_domain_level;
				469	return;
				470	}
				471
				472	static void
				473	update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
				474	{
				475	LIST_HEAD(q);
				476
				477	list_add(&c->stack_list, &q);
				478	while (!list_empty(&q)) {
				479	struct cpuset *cp;
				480	struct cgroup *cont;
				481	struct cpuset *child;
				482
				483	cp = list_first_entry(&q, struct cpuset, stack_list);
				484	list_del(q.next);
				485
				486	if (cpumask_empty(cp->cpus_allowed))
				487	continue;
				488
				489	if (is_sched_load_balance(cp))
				490	update_domain_attr(dattr, cp);
				491
				492	list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
				493	child = cgroup_cs(cont);
				494	list_add_tail(&child->stack_list, &q);
				495	}
				496	}
				497	}
				498
				499	/*
				500	* generate_sched_domains()
				501	*
				502	* This function builds a partial partition of the systems CPUs
				503	* A 'partial partition' is a set of non-overlapping subsets whose
				504	* union is a subset of that set.
				505	* The output of this function needs to be passed to kernel/sched.c
				506	* partition_sched_domains() routine, which will rebuild the scheduler's
				507	* load balancing domains (sched domains) as specified by that partial
				508	* partition.
				509	*
				510	* See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt
				511	* for a background explanation of this.
				512	*
				513	* Does not return errors, on the theory that the callers of this
				514	* routine would rather not worry about failures to rebuild sched
				515	* domains when operating in the severe memory shortage situations
				516	* that could cause allocation failures below.
				517	*
				518	* Must be called with cgroup_lock held.
				519	*
				520	* The three key local variables below are:
				521	* q - a linked-list queue of cpuset pointers, used to implement a
				522	* top-down scan of all cpusets. This scan loads a pointer
				523	* to each cpuset marked is_sched_load_balance into the
				524	* array 'csa'. For our purposes, rebuilding the schedulers
				525	* sched domains, we can ignore !is_sched_load_balance cpusets.
				526	* csa - (for CpuSet Array) Array of pointers to all the cpusets
				527	* that need to be load balanced, for convenient iterative
				528	* access by the subsequent code that finds the best partition,
				529	* i.e the set of domains (subsets) of CPUs such that the
				530	* cpus_allowed of every cpuset marked is_sched_load_balance
				531	* is a subset of one of these domains, while there are as
				532	* many such domains as possible, each as small as possible.
				533	* doms - Conversion of 'csa' to an array of cpumasks, for passing to
				534	* the kernel/sched.c routine partition_sched_domains() in a
				535	* convenient format, that can be easily compared to the prior
				536	* value to determine what partition elements (sched domains)
				537	* were changed (added or removed.)
				538	*
				539	* Finding the best partition (set of domains):
				540	* The triple nested loops below over i, j, k scan over the
				541	* load balanced cpusets (using the array of cpuset pointers in
				542	* csa[]) looking for pairs of cpusets that have overlapping
				543	* cpus_allowed, but which don't have the same 'pn' partition
				544	* number, and puts them in the same partition number. It keeps
				545	* looping on the 'restart' label until it can no longer find
				546	* any such pairs.
				547	*
				548	* The union of the cpus_allowed masks from the set of
				549	* all cpusets having the same 'pn' value then form the one
				550	* element of the partition (one sched domain) to be passed to
				551	* partition_sched_domains().
				552	*/
				553	static int generate_sched_domains(cpumask_var_t **domains,
				554	struct sched_domain_attr **attributes)
				555	{
				556	LIST_HEAD(q); /* queue of cpusets to be scanned */
				557	struct cpuset *cp; /* scans q */
				558	struct cpuset **csa; /* array of all cpuset ptrs */
				559	int csn; /* how many cpuset ptrs in csa so far */
				560	int i, j, k; /* indices for partition finding loops */
				561	cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
				562	struct sched_domain_attr *dattr; /* attributes for custom domains */
				563	int ndoms = 0; /* number of sched domains in result */
				564	int nslot; /* next empty doms[] struct cpumask slot */
				565
				566	doms = NULL;
				567	dattr = NULL;
				568	csa = NULL;
				569
				570	/* Special case for the 99% of systems with one, full, sched domain */
				571	if (is_sched_load_balance(&top_cpuset)) {
				572	ndoms = 1;
				573	doms = alloc_sched_domains(ndoms);
				574	if (!doms)
				575	goto done;
				576
				577	dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
				578	if (dattr) {
				579	*dattr = SD_ATTR_INIT;
				580	update_domain_attr_tree(dattr, &top_cpuset);
				581	}
				582	cpumask_copy(doms[0], top_cpuset.cpus_allowed);
				583
				584	goto done;
				585	}
				586
				587	csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
				588	if (!csa)
				589	goto done;
				590	csn = 0;
				591
				592	list_add(&top_cpuset.stack_list, &q);
				593	while (!list_empty(&q)) {
				594	struct cgroup *cont;
				595	struct cpuset *child; /* scans child cpusets of cp */
				596
				597	cp = list_first_entry(&q, struct cpuset, stack_list);
				598	list_del(q.next);
				599
				600	if (cpumask_empty(cp->cpus_allowed))
				601	continue;
				602
				603	/*
				604	* All child cpusets contain a subset of the parent's cpus, so
				605	* just skip them, and then we call update_domain_attr_tree()
				606	* to calc relax_domain_level of the corresponding sched
				607	* domain.
				608	*/
				609	if (is_sched_load_balance(cp)) {
				610	csa[csn++] = cp;
				611	continue;
				612	}
				613
				614	list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
				615	child = cgroup_cs(cont);
				616	list_add_tail(&child->stack_list, &q);
				617	}
				618	}
				619
				620	for (i = 0; i < csn; i++)
				621	csa[i]->pn = i;
				622	ndoms = csn;
				623
				624	restart:
				625	/* Find the best partition (set of sched domains) */
				626	for (i = 0; i < csn; i++) {
				627	struct cpuset *a = csa[i];
				628	int apn = a->pn;
				629
				630	for (j = 0; j < csn; j++) {
				631	struct cpuset *b = csa[j];
				632	int bpn = b->pn;
				633
				634	if (apn != bpn && cpusets_overlap(a, b)) {
				635	for (k = 0; k < csn; k++) {
				636	struct cpuset *c = csa[k];
				637
				638	if (c->pn == bpn)
				639	c->pn = apn;
				640	}
				641	ndoms--; /* one less element */
				642	goto restart;
				643	}
				644	}
				645	}
				646
				647	/*
				648	* Now we know how many domains to create.
				649	* Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
				650	*/
				651	doms = alloc_sched_domains(ndoms);
				652	if (!doms)
				653	goto done;
				654
				655	/*
				656	* The rest of the code, including the scheduler, can deal with
				657	* dattr==NULL case. No need to abort if alloc fails.
				658	*/
				659	dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
				660
				661	for (nslot = 0, i = 0; i < csn; i++) {
				662	struct cpuset *a = csa[i];
				663	struct cpumask *dp;
				664	int apn = a->pn;
				665
				666	if (apn < 0) {
				667	/* Skip completed partitions */
				668	continue;
				669	}
				670
				671	dp = doms[nslot];
				672
				673	if (nslot == ndoms) {
				674	static int warnings = 10;
				675	if (warnings) {
				676	printk(KERN_WARNING
				677	"rebuild_sched_domains confused:"
				678	" nslot %d, ndoms %d, csn %d, i %d,"
				679	" apn %d\n",
				680	nslot, ndoms, csn, i, apn);
				681	warnings--;
				682	}
				683	continue;
				684	}
				685
				686	cpumask_clear(dp);
				687	if (dattr)
				688	*(dattr + nslot) = SD_ATTR_INIT;
				689	for (j = i; j < csn; j++) {
				690	struct cpuset *b = csa[j];
				691
				692	if (apn == b->pn) {
				693	cpumask_or(dp, dp, b->cpus_allowed);
				694	if (dattr)
				695	update_domain_attr_tree(dattr + nslot, b);
				696
				697	/* Done with this partition */
				698	b->pn = -1;
				699	}
				700	}
				701	nslot++;
				702	}
				703	BUG_ON(nslot != ndoms);
				704
				705	done:
				706	kfree(csa);
				707
				708	/*
				709	* Fallback to the default domain if kmalloc() failed.
				710	* See comments in partition_sched_domains().
				711	*/
				712	if (doms == NULL)
				713	ndoms = 1;
				714
				715	*domains = doms;
				716	*attributes = dattr;
				717	return ndoms;
				718	}
				719
				720	/*
				721	* Rebuild scheduler domains.
				722	*
				723	* Call with neither cgroup_mutex held nor within get_online_cpus().
				724	* Takes both cgroup_mutex and get_online_cpus().
				725	*
				726	* Cannot be directly called from cpuset code handling changes
				727	* to the cpuset pseudo-filesystem, because it cannot be called
				728	* from code that already holds cgroup_mutex.
				729	*/
				730	static void do_rebuild_sched_domains(struct work_struct *unused)
				731	{
				732	struct sched_domain_attr *attr;
				733	cpumask_var_t *doms;
				734	int ndoms;
				735
				736	get_online_cpus();
				737
				738	/* Generate domain masks and attrs */
				739	cgroup_lock();
				740	ndoms = generate_sched_domains(&doms, &attr);
				741	cgroup_unlock();
				742
				743	/* Have scheduler rebuild the domains */
				744	partition_sched_domains(ndoms, doms, attr);
				745
				746	put_online_cpus();
				747	}
				748	#else /* !CONFIG_SMP */
				749	static void do_rebuild_sched_domains(struct work_struct *unused)
				750	{
				751	}
				752
				753	static int generate_sched_domains(cpumask_var_t **domains,
				754	struct sched_domain_attr **attributes)
				755	{
				756	*domains = NULL;
				757	return 1;
				758	}
				759	#endif /* CONFIG_SMP */
				760
				761	static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
				762
				763	/*
				764	* Rebuild scheduler domains, asynchronously via workqueue.
				765	*
				766	* If the flag 'sched_load_balance' of any cpuset with non-empty
				767	* 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
				768	* which has that flag enabled, or if any cpuset with a non-empty
				769	* 'cpus' is removed, then call this routine to rebuild the
				770	* scheduler's dynamic sched domains.
				771	*
				772	* The rebuild_sched_domains() and partition_sched_domains()
				773	* routines must nest cgroup_lock() inside get_online_cpus(),
				774	* but such cpuset changes as these must nest that locking the
				775	* other way, holding cgroup_lock() for much of the code.
				776	*
				777	* So in order to avoid an ABBA deadlock, the cpuset code handling
				778	* these user changes delegates the actual sched domain rebuilding
				779	* to a separate workqueue thread, which ends up processing the
				780	* above do_rebuild_sched_domains() function.
				781	*/
				782	static void async_rebuild_sched_domains(void)
				783	{
				784	queue_work(cpuset_wq, &rebuild_sched_domains_work);
				785	}
				786
				787	/*
				788	* Accomplishes the same scheduler domain rebuild as the above
				789	* async_rebuild_sched_domains(), however it directly calls the
				790	* rebuild routine synchronously rather than calling it via an
				791	* asynchronous work thread.
				792	*
				793	* This can only be called from code that is not holding
				794	* cgroup_mutex (not nested in a cgroup_lock() call.)
				795	*/
				796	void rebuild_sched_domains(void)
				797	{
				798	do_rebuild_sched_domains(NULL);
				799	}
				800
				801	/**
				802	* cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
				803	* @tsk: task to test
				804	* @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
				805	*
				806	* Call with cgroup_mutex held. May take callback_mutex during call.
				807	* Called for each task in a cgroup by cgroup_scan_tasks().
				808	* Return nonzero if this task's cpus_allowed mask should be changed (in other
				809	* words, if its mask is not equal to its cpuset's mask).
				810	*/
				811	static int cpuset_test_cpumask(struct task_struct *tsk,
				812	struct cgroup_scanner *scan)
				813	{
				814	return !cpumask_equal(&tsk->cpus_allowed,
				815	(cgroup_cs(scan->cg))->cpus_allowed);
				816	}
				817
				818	/**
				819	* cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
				820	* @tsk: task to test
				821	* @scan: struct cgroup_scanner containing the cgroup of the task
				822	*
				823	* Called by cgroup_scan_tasks() for each task in a cgroup whose
				824	* cpus_allowed mask needs to be changed.
				825	*
				826	* We don't need to re-check for the cgroup/cpuset membership, since we're
				827	* holding cgroup_lock() at this point.
				828	*/
				829	static void cpuset_change_cpumask(struct task_struct *tsk,
				830	struct cgroup_scanner *scan)
				831	{
				832	set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
				833	}
				834
				835	/**
				836	* update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
				837	* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
				838	* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
				839	*
				840	* Called with cgroup_mutex held
				841	*
				842	* The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
				843	* calling callback functions for each.
				844	*
				845	* No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
				846	* if @heap != NULL.
				847	*/
				848	static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
				849	{
				850	struct cgroup_scanner scan;
				851
				852	scan.cg = cs->css.cgroup;
				853	scan.test_task = cpuset_test_cpumask;
				854	scan.process_task = cpuset_change_cpumask;
				855	scan.heap = heap;
				856	cgroup_scan_tasks(&scan);
				857	}
				858
				859	/**
				860	* update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
				861	* @cs: the cpuset to consider
				862	* @buf: buffer of cpu numbers written to this cpuset
				863	*/
				864	static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
				865	const char *buf)
				866	{
				867	struct ptr_heap heap;
				868	int retval;
				869	int is_load_balanced;
				870
				871	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
				872	if (cs == &top_cpuset)
				873	return -EACCES;
				874
				875	/*
				876	* An empty cpus_allowed is ok only if the cpuset has no tasks.
				877	* Since cpulist_parse() fails on an empty mask, we special case
				878	* that parsing. The validate_change() call ensures that cpusets
				879	* with tasks have cpus.
				880	*/
				881	if (!*buf) {
				882	cpumask_clear(trialcs->cpus_allowed);
				883	} else {
				884	retval = cpulist_parse(buf, trialcs->cpus_allowed);
				885	if (retval < 0)
				886	return retval;
				887
				888	if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
				889	return -EINVAL;
				890	}
				891	retval = validate_change(cs, trialcs);
				892	if (retval < 0)
				893	return retval;
				894
				895	/* Nothing to do if the cpus didn't change */
				896	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
				897	return 0;
				898
				899	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
				900	if (retval)
				901	return retval;
				902
				903	is_load_balanced = is_sched_load_balance(trialcs);
				904
				905	mutex_lock(&callback_mutex);
				906	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
				907	mutex_unlock(&callback_mutex);
				908
				909	/*
				910	* Scan tasks in the cpuset, and update the cpumasks of any
				911	* that need an update.
				912	*/
				913	update_tasks_cpumask(cs, &heap);
				914
				915	heap_free(&heap);
				916
				917	if (is_load_balanced)
				918	async_rebuild_sched_domains();
				919	return 0;
				920	}
				921
				922	/*
				923	* cpuset_migrate_mm
				924	*
				925	* Migrate memory region from one set of nodes to another.
				926	*
				927	* Temporarilly set tasks mems_allowed to target nodes of migration,
				928	* so that the migration code can allocate pages on these nodes.
				929	*
				930	* Call holding cgroup_mutex, so current's cpuset won't change
				931	* during this call, as manage_mutex holds off any cpuset_attach()
				932	* calls. Therefore we don't need to take task_lock around the
				933	* call to guarantee_online_mems(), as we know no one is changing
				934	* our task's cpuset.
				935	*
				936	* While the mm_struct we are migrating is typically from some
				937	* other task, the task_struct mems_allowed that we are hacking
				938	* is for our current task, which must allocate new pages for that
				939	* migrating memory region.
				940	*/
				941
				942	static void cpuset_migrate_mm(struct mm_struct mm, const nodemask_t from,
				943	const nodemask_t *to)
				944	{
				945	struct task_struct *tsk = current;
				946
				947	tsk->mems_allowed = *to;
				948
				949	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
				950
				951	guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
				952	}
				953
				954	/*
				955	* cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
				956	* @tsk: the task to change
				957	* @newmems: new nodes that the task will be set
				958	*
				959	* In order to avoid seeing no nodes if the old and new nodes are disjoint,
				960	* we structure updates as setting all new allowed nodes, then clearing newly
				961	* disallowed ones.
				962	*/
				963	static void cpuset_change_task_nodemask(struct task_struct *tsk,
				964	nodemask_t *newmems)
				965	{
				966	bool need_loop;
				967
				968	/*
				969	* Allow tasks that have access to memory reserves because they have
				970	* been OOM killed to get memory anywhere.
				971	*/
				972	if (unlikely(test_thread_flag(TIF_MEMDIE)))
				973	return;
				974	if (current->flags & PF_EXITING) /* Let dying task have memory */
				975	return;
				976
				977	task_lock(tsk);
				978	/*
				979	* Determine if a loop is necessary if another thread is doing
				980	* get_mems_allowed(). If at least one node remains unchanged and
				981	* tsk does not have a mempolicy, then an empty nodemask will not be
				982	* possible when mems_allowed is larger than a word.
				983	*/
				984	need_loop = task_has_mempolicy(tsk) \|\|
				985	!nodes_intersects(*newmems, tsk->mems_allowed);
				986
				987	if (need_loop) {
				988	local_irq_disable();
				989	write_seqcount_begin(&tsk->mems_allowed_seq);
				990	}
				991
				992	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
				993	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
				994
				995	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
				996	tsk->mems_allowed = *newmems;
				997
				998	if (need_loop) {
				999	write_seqcount_end(&tsk->mems_allowed_seq);
				1000	local_irq_enable();
				1001	}
				1002
				1003	task_unlock(tsk);
				1004	}
				1005
				1006	/*
				1007	* Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
				1008	* of it to cpuset's new mems_allowed, and migrate pages to new nodes if
				1009	* memory_migrate flag is set. Called with cgroup_mutex held.
				1010	*/
				1011	static void cpuset_change_nodemask(struct task_struct *p,
				1012	struct cgroup_scanner *scan)
				1013	{
				1014	struct mm_struct *mm;
				1015	struct cpuset *cs;
				1016	int migrate;
				1017	const nodemask_t *oldmem = scan->data;
				1018	static nodemask_t newmems; /* protected by cgroup_mutex */
				1019
				1020	cs = cgroup_cs(scan->cg);
				1021	guarantee_online_mems(cs, &newmems);
				1022
				1023	cpuset_change_task_nodemask(p, &newmems);
				1024
				1025	mm = get_task_mm(p);
				1026	if (!mm)
				1027	return;
				1028
				1029	migrate = is_memory_migrate(cs);
				1030
				1031	mpol_rebind_mm(mm, &cs->mems_allowed);
				1032	if (migrate)
				1033	cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
				1034	mmput(mm);
				1035	}
				1036
				/*
				 * Cpuset whose tasks are currently being rebound by
				 * update_tasks_nodemask(); NULL when no rebind is in progress.
				 * Compared against current's cpuset in current_cpuset_is_being_rebound().
				 */
				1037	static void *cpuset_being_rebound;
				1038
				1039	/**
				1040	* update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
				1041	* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
				1042	* @oldmem: old mems_allowed of cpuset cs
				1043	* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
				1044	*
				1045	* Called with cgroup_mutex held
				1046	* No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
				1047	* if @heap != NULL.
				1048	*/
				1049	static void update_tasks_nodemask(struct cpuset cs, const nodemask_t oldmem,
				1050	struct ptr_heap *heap)
				1051	{
				1052	struct cgroup_scanner scan;
				1053
				1054	cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
				1055
				1056	scan.cg = cs->css.cgroup;
				1057	scan.test_task = NULL;
				1058	scan.process_task = cpuset_change_nodemask;
				1059	scan.heap = heap;
				1060	scan.data = (nodemask_t *)oldmem;
				1061
				1062	/*
				1063	* The mpol_rebind_mm() call takes mmap_sem, which we couldn't
				1064	* take while holding tasklist_lock. Forks can happen - the
				1065	* mpol_dup() cpuset_being_rebound check will catch such forks,
				1066	* and rebind their vma mempolicies too. Because we still hold
				1067	* the global cgroup_mutex, we know that no other rebind effort
				1068	* will be contending for the global variable cpuset_being_rebound.
				1069	* It's ok if we rebind the same mm twice; mpol_rebind_mm()
				1070	* is idempotent. Also migrate pages in each mm to new nodes.
				1071	*/
				1072	cgroup_scan_tasks(&scan);
				1073
				1074	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
				1075	cpuset_being_rebound = NULL;
				1076	}
				1077
				1078	/*
				1079	* Handle user request to change the 'mems' memory placement
				1080	* of a cpuset. Needs to validate the request, update the
				1081	* cpusets mems_allowed, and for each task in the cpuset,
				1082	* update mems_allowed and rebind task's mempolicy and any vma
				1083	* mempolicies and if the cpuset is marked 'memory_migrate',
				1084	* migrate the tasks pages to the new memory.
				1085	*
				1086	* Call with cgroup_mutex held. May take callback_mutex during call.
				1087	* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
				1088	* lock each such tasks mm->mmap_sem, scan its vma's and rebind
				1089	* their mempolicies to the cpusets new mems_allowed.
				1090	*/
				1091	static int update_nodemask(struct cpuset cs, struct cpuset trialcs,
				1092	const char *buf)
				1093	{
				1094	NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
				1095	int retval;
				1096	struct ptr_heap heap;
				1097
				1098	if (!oldmem)
				1099	return -ENOMEM;
				1100
				1101	/*
				1102	* top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
				1103	* it's read-only
				1104	*/
				1105	if (cs == &top_cpuset) {
				1106	retval = -EACCES;
				1107	goto done;
				1108	}
				1109
				1110	/*
				1111	* An empty mems_allowed is ok iff there are no tasks in the cpuset.
				1112	* Since nodelist_parse() fails on an empty mask, we special case
				1113	* that parsing. The validate_change() call ensures that cpusets
				1114	* with tasks have memory.
				1115	*/
				1116	if (!*buf) {
				1117	nodes_clear(trialcs->mems_allowed);
				1118	} else {
				1119	retval = nodelist_parse(buf, trialcs->mems_allowed);
				1120	if (retval < 0)
				1121	goto done;
				1122
				1123	if (!nodes_subset(trialcs->mems_allowed,
				1124	node_states[N_HIGH_MEMORY])) {
				1125	retval = -EINVAL;
				1126	goto done;
				1127	}
				1128	}
				1129	*oldmem = cs->mems_allowed;
				1130	if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
				1131	retval = 0; /* Too easy - nothing to do */
				1132	goto done;
				1133	}
				1134	retval = validate_change(cs, trialcs);
				1135	if (retval < 0)
				1136	goto done;
				1137
				1138	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
				1139	if (retval < 0)
				1140	goto done;
				1141
				1142	mutex_lock(&callback_mutex);
				1143	cs->mems_allowed = trialcs->mems_allowed;
				1144	mutex_unlock(&callback_mutex);
				1145
				1146	update_tasks_nodemask(cs, oldmem, &heap);
				1147
				1148	heap_free(&heap);
				1149	done:
				1150	NODEMASK_FREE(oldmem);
				1151	return retval;
				1152	}
				1153
				1154	int current_cpuset_is_being_rebound(void)
				1155	{
				1156	int ret;
				1157
				1158	rcu_read_lock();
				1159	ret = task_cs(current) == cpuset_being_rebound;
				1160	rcu_read_unlock();
				1161
				1162	return ret;
				1163	}
				1164
				1165	static int update_relax_domain_level(struct cpuset *cs, s64 val)
				1166	{
				1167	#ifdef CONFIG_SMP
				1168	if (val < -1 \|\| val >= sched_domain_level_max)
				1169	return -EINVAL;
				1170	#endif
				1171
				1172	if (val != cs->relax_domain_level) {
				1173	cs->relax_domain_level = val;
				1174	if (!cpumask_empty(cs->cpus_allowed) &&
				1175	is_sched_load_balance(cs))
				1176	async_rebuild_sched_domains();
				1177	}
				1178
				1179	return 0;
				1180	}
				1181
				1182	/*
				1183	* cpuset_change_flag - make a task's spread flags the same as its cpuset's
				1184	* @tsk: task to be updated
				1185	* @scan: struct cgroup_scanner containing the cgroup of the task
				1186	*
				1187	* Called by cgroup_scan_tasks() for each task in a cgroup.
				1188	*
				1189	* We don't need to re-check for the cgroup/cpuset membership, since we're
				1190	* holding cgroup_lock() at this point.
				1191	*/
				1192	static void cpuset_change_flag(struct task_struct *tsk,
				1193	struct cgroup_scanner *scan)
				1194	{
				1195	cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);
				1196	}
				1197
				1198	/*
				1199	* update_tasks_flags - update the spread flags of tasks in the cpuset.
				1200	* @cs: the cpuset in which each task's spread flags needs to be changed
				1201	* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
				1202	*
				1203	* Called with cgroup_mutex held
				1204	*
				1205	* The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
				1206	* calling callback functions for each.
				1207	*
				1208	* No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
				1209	* if @heap != NULL.
				1210	*/
				1211	static void update_tasks_flags(struct cpuset cs, struct ptr_heap heap)
				1212	{
				1213	struct cgroup_scanner scan;
				1214
				1215	scan.cg = cs->css.cgroup;
				1216	scan.test_task = NULL;
				1217	scan.process_task = cpuset_change_flag;
				1218	scan.heap = heap;
				1219	cgroup_scan_tasks(&scan);
				1220	}
				1221
				1222	/*
				1223	* update_flag - read a 0 or a 1 in a file and update associated flag
				1224	* bit: the bit to update (see cpuset_flagbits_t)
				1225	* cs: the cpuset to update
				1226	* turning_on: whether the flag is being set or cleared
				1227	*
				1228	* Call with cgroup_mutex held.
				1229	*/
				1230
				1231	static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
				1232	int turning_on)
				1233	{
				1234	struct cpuset *trialcs;
				1235	int balance_flag_changed;
				1236	int spread_flag_changed;
				1237	struct ptr_heap heap;
				1238	int err;
				1239
				1240	trialcs = alloc_trial_cpuset(cs);
				1241	if (!trialcs)
				1242	return -ENOMEM;
				1243
				1244	if (turning_on)
				1245	set_bit(bit, &trialcs->flags);
				1246	else
				1247	clear_bit(bit, &trialcs->flags);
				1248
				1249	err = validate_change(cs, trialcs);
				1250	if (err < 0)
				1251	goto out;
				1252
				1253	err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
				1254	if (err < 0)
				1255	goto out;
				1256
				1257	balance_flag_changed = (is_sched_load_balance(cs) !=
				1258	is_sched_load_balance(trialcs));
				1259
				1260	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
				1261	\|\| (is_spread_page(cs) != is_spread_page(trialcs)));
				1262
				1263	mutex_lock(&callback_mutex);
				1264	cs->flags = trialcs->flags;
				1265	mutex_unlock(&callback_mutex);
				1266
				1267	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
				1268	async_rebuild_sched_domains();
				1269
				1270	if (spread_flag_changed)
				1271	update_tasks_flags(cs, &heap);
				1272	heap_free(&heap);
				1273	out:
				1274	free_trial_cpuset(trialcs);
				1275	return err;
				1276	}
				1277
				1278	/*
				1279	* Frequency meter - How fast is some event occurring?
				1280	*
				1281	* These routines manage a digitally filtered, constant time based,
				1282	* event frequency meter. There are four routines:
				1283	* fmeter_init() - initialize a frequency meter.
				1284	* fmeter_markevent() - called each time the event happens.
				1285	* fmeter_getrate() - returns the recent rate of such events.
				1286	* fmeter_update() - internal routine used to update fmeter.
				1287	*
				1288	* A common data structure is passed to each of these routines,
				1289	* which is used to keep track of the state required to manage the
				1290	* frequency meter and its digital filter.
				1291	*
				1292	* The filter works on the number of events marked per unit time.
				1293	* The filter is single-pole low-pass recursive (IIR). The time unit
				1294	* is 1 second. Arithmetic is done using 32-bit integers scaled to
				1295	* simulate 3 decimal digits of precision (multiplied by 1000).
				1296	*
				1297	* With an FM_COEF of 933, and a time base of 1 second, the filter
				1298	* has a half-life of 10 seconds, meaning that if the events quit
				1299	* happening, then the rate returned from the fmeter_getrate()
				1300	* will be cut in half each 10 seconds, until it converges to zero.
				1301	*
				1302	* It is not worth doing a real infinitely recursive filter. If more
				1303	* than FM_MAXTICKS ticks have elapsed since the last filter event,
				1304	* just compute FM_MAXTICKS ticks worth, by which point the level
				1305	* will be stable.
				1306	*
				1307	* Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
				1308	* arithmetic overflow in the fmeter_update() routine.
				1309	*
				1310	* Given the simple 32 bit integer arithmetic used, this meter works
				1311	* best for reporting rates between one per millisecond (msec) and
				1312	* one per 32 (approx) seconds. At constant rates faster than one
				1313	* per msec it maxes out at values just under 1,000,000. At constant
				1314	* rates between one per msec, and one per second it will stabilize
				1315	* to a value N*1000, where N is the rate of events per second.
				1316	* At constant rates between one per second and one per 32 seconds,
				1317	* it will be choppy, moving up on the seconds that have an event,
				1318	* and then decaying until the next event. At rates slower than
				1319	* about one in 32 seconds, it decays all the way back to zero between
				1320	* each event.
				1321	*/
				1322
				/*
				 * Frequency meter tuning constants.  FM_COEF and FM_SCALE form the
				 * per-tick decay factor FM_COEF/FM_SCALE applied in fmeter_update();
				 * FM_MAXTICKS and FM_MAXCNT are clamps on elapsed ticks and pending
				 * event count respectively.
				 */
				1323	#define FM_COEF 933 /* coefficient for half-life of 10 secs */
				1324	#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
				1325	#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
				1326	#define FM_SCALE 1000 /* faux fixed point scale */
				1327
				1328	/* Initialize a frequency meter */
				1329	static void fmeter_init(struct fmeter *fmp)
				1330	{
				1331	fmp->cnt = 0;
				1332	fmp->val = 0;
				1333	fmp->time = 0;
				1334	spin_lock_init(&fmp->lock);
				1335	}
				1336
				1337	/* Internal meter update - process cnt events and update value */
				1338	static void fmeter_update(struct fmeter *fmp)
				1339	{
				1340	time_t now = get_seconds();
				1341	time_t ticks = now - fmp->time;
				1342
				1343	if (ticks == 0)
				1344	return;
				1345
				1346	ticks = min(FM_MAXTICKS, ticks);
				1347	while (ticks-- > 0)
				1348	fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
				1349	fmp->time = now;
				1350
				1351	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
				1352	fmp->cnt = 0;
				1353	}
				1354
				1355	/* Process any previous ticks, then bump cnt by one (times scale). */
				1356	static void fmeter_markevent(struct fmeter *fmp)
				1357	{
				1358	spin_lock(&fmp->lock);
				1359	fmeter_update(fmp);
				1360	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
				1361	spin_unlock(&fmp->lock);
				1362	}
				1363
				1364	/* Process any previous ticks, then return current value. */
				1365	static int fmeter_getrate(struct fmeter *fmp)
				1366	{
				1367	int val;
				1368
				1369	spin_lock(&fmp->lock);
				1370	fmeter_update(fmp);
				1371	val = fmp->val;
				1372	spin_unlock(&fmp->lock);
				1373	return val;
				1374	}
				1375
				1376	/*
				1377	* Protected by cgroup_lock. The nodemasks must be stored globally because
				1378	* dynamically allocating them is not allowed in can_attach, and they must
				1379	* persist until attach.
				1380	*/
				/* cpus_attach: cpu mask prepared in cpuset_can_attach(), applied to each task in cpuset_attach(). */
				1381	static cpumask_var_t cpus_attach;
				/* Source cpuset's mems_allowed, captured in cpuset_attach() for page migration. */
				1382	static nodemask_t cpuset_attach_nodemask_from;
				/* Destination nodes: prepared in cpuset_can_attach(), reassigned in cpuset_attach(). */
				1383	static nodemask_t cpuset_attach_nodemask_to;
				1384
				1385	/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
				1386	static int cpuset_can_attach(struct cgroup cgrp, struct cgroup_taskset tset)
				1387	{
				1388	struct cpuset *cs = cgroup_cs(cgrp);
				1389	struct task_struct *task;
				1390	int ret;
				1391
				1392	if (cpumask_empty(cs->cpus_allowed) \|\| nodes_empty(cs->mems_allowed))
				1393	return -ENOSPC;
				1394
				1395	cgroup_taskset_for_each(task, cgrp, tset) {
				1396	/*
				1397	* Kthreads bound to specific cpus cannot be moved to a new
				1398	* cpuset; we cannot change their cpu affinity and
				1399	* isolating such threads by their set of allowed nodes is
				1400	* unnecessary. Thus, cpusets are not applicable for such
				1401	* threads. This prevents checking for success of
				1402	* set_cpus_allowed_ptr() on all attached tasks before
				1403	* cpus_allowed may be changed.
				1404	*/
				1405	if (task->flags & PF_THREAD_BOUND)
				1406	return -EINVAL;
				1407	if ((ret = security_task_setscheduler(task)))
				1408	return ret;
				1409	}
				1410
				1411	/* prepare for attach */
				1412	if (cs == &top_cpuset)
				1413	cpumask_copy(cpus_attach, cpu_possible_mask);
				1414	else
				1415	guarantee_online_cpus(cs, cpus_attach);
				1416
				1417	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
				1418
				1419	return 0;
				1420	}
				1421
				1422	static void cpuset_attach(struct cgroup cgrp, struct cgroup_taskset tset)
				1423	{
				1424	struct mm_struct *mm;
				1425	struct task_struct *task;
				1426	struct task_struct *leader = cgroup_taskset_first(tset);
				1427	struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
				1428	struct cpuset *cs = cgroup_cs(cgrp);
				1429	struct cpuset *oldcs = cgroup_cs(oldcgrp);
				1430
				1431	cgroup_taskset_for_each(task, cgrp, tset) {
				1432	/*
				1433	* can_attach beforehand should guarantee that this doesn't
				1434	* fail. TODO: have a better way to handle failure here
				1435	*/
				1436	WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
				1437
				1438	cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
				1439	cpuset_update_task_spread_flag(cs, task);
				1440	}
				1441
				1442	/*
				1443	* Change mm, possibly for multiple threads in a threadgroup. This is
				1444	* expensive and may sleep.
				1445	*/
				1446	cpuset_attach_nodemask_from = oldcs->mems_allowed;
				1447	cpuset_attach_nodemask_to = cs->mems_allowed;
				1448	mm = get_task_mm(leader);
				1449	if (mm) {
				1450	mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
				1451	if (is_memory_migrate(cs))
				1452	cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
				1453	&cpuset_attach_nodemask_to);
				1454	mmput(mm);
				1455	}
				1456	}
				1457
				1458	/* The various types of files and directories in a cpuset file system */
				1459
				/* Identifies a cpuset control file; stored in cftype.private and switched on by the read/write handlers. */
				1460	typedef enum {
				1461	FILE_MEMORY_MIGRATE,	/* "memory_migrate" */
				1462	FILE_CPULIST,	/* "cpus" */
				1463	FILE_MEMLIST,	/* "mems" */
				1464	FILE_CPU_EXCLUSIVE,	/* "cpu_exclusive" */
				1465	FILE_MEM_EXCLUSIVE,	/* "mem_exclusive" */
				1466	FILE_MEM_HARDWALL,	/* "mem_hardwall" */
				1467	FILE_SCHED_LOAD_BALANCE,	/* "sched_load_balance" */
				1468	FILE_SCHED_RELAX_DOMAIN_LEVEL,	/* "sched_relax_domain_level" */
				1469	FILE_MEMORY_PRESSURE_ENABLED,	/* "memory_pressure_enabled" (root cpuset only) */
				1470	FILE_MEMORY_PRESSURE,	/* "memory_pressure"; writes are rejected with -EACCES */
				1471	FILE_SPREAD_PAGE,	/* "memory_spread_page" */
				1472	FILE_SPREAD_SLAB,	/* "memory_spread_slab" */
				1473	} cpuset_filetype_t;
				1474
				1475	static int cpuset_write_u64(struct cgroup cgrp, struct cftype cft, u64 val)
				1476	{
				1477	int retval = 0;
				1478	struct cpuset *cs = cgroup_cs(cgrp);
				1479	cpuset_filetype_t type = cft->private;
				1480
				1481	if (!cgroup_lock_live_group(cgrp))
				1482	return -ENODEV;
				1483
				1484	switch (type) {
				1485	case FILE_CPU_EXCLUSIVE:
				1486	retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
				1487	break;
				1488	case FILE_MEM_EXCLUSIVE:
				1489	retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
				1490	break;
				1491	case FILE_MEM_HARDWALL:
				1492	retval = update_flag(CS_MEM_HARDWALL, cs, val);
				1493	break;
				1494	case FILE_SCHED_LOAD_BALANCE:
				1495	retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
				1496	break;
				1497	case FILE_MEMORY_MIGRATE:
				1498	retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
				1499	break;
				1500	case FILE_MEMORY_PRESSURE_ENABLED:
				1501	cpuset_memory_pressure_enabled = !!val;
				1502	break;
				1503	case FILE_MEMORY_PRESSURE:
				1504	retval = -EACCES;
				1505	break;
				1506	case FILE_SPREAD_PAGE:
				1507	retval = update_flag(CS_SPREAD_PAGE, cs, val);
				1508	break;
				1509	case FILE_SPREAD_SLAB:
				1510	retval = update_flag(CS_SPREAD_SLAB, cs, val);
				1511	break;
				1512	default:
				1513	retval = -EINVAL;
				1514	break;
				1515	}
				1516	cgroup_unlock();
				1517	return retval;
				1518	}
				1519
				1520	static int cpuset_write_s64(struct cgroup cgrp, struct cftype cft, s64 val)
				1521	{
				1522	int retval = 0;
				1523	struct cpuset *cs = cgroup_cs(cgrp);
				1524	cpuset_filetype_t type = cft->private;
				1525
				1526	if (!cgroup_lock_live_group(cgrp))
				1527	return -ENODEV;
				1528
				1529	switch (type) {
				1530	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
				1531	retval = update_relax_domain_level(cs, val);
				1532	break;
				1533	default:
				1534	retval = -EINVAL;
				1535	break;
				1536	}
				1537	cgroup_unlock();
				1538	return retval;
				1539	}
				1540
				1541	/*
				1542	* Common handling for a write to a "cpus" or "mems" file.
				1543	*/
				1544	static int cpuset_write_resmask(struct cgroup cgrp, struct cftype cft,
				1545	const char *buf)
				1546	{
				1547	int retval = 0;
				1548	struct cpuset *cs = cgroup_cs(cgrp);
				1549	struct cpuset *trialcs;
				1550
				1551	if (!cgroup_lock_live_group(cgrp))
				1552	return -ENODEV;
				1553
				1554	trialcs = alloc_trial_cpuset(cs);
				1555	if (!trialcs) {
				1556	retval = -ENOMEM;
				1557	goto out;
				1558	}
				1559
				1560	switch (cft->private) {
				1561	case FILE_CPULIST:
				1562	retval = update_cpumask(cs, trialcs, buf);
				1563	break;
				1564	case FILE_MEMLIST:
				1565	retval = update_nodemask(cs, trialcs, buf);
				1566	break;
				1567	default:
				1568	retval = -EINVAL;
				1569	break;
				1570	}
				1571
				1572	free_trial_cpuset(trialcs);
				1573	out:
				1574	cgroup_unlock();
				1575	return retval;
				1576	}
				1577
				1578	/*
				1579	* These ascii lists should be read in a single call, by using a user
				1580	* buffer large enough to hold the entire map. If read in smaller
				1581	* chunks, there is no guarantee of atomicity. Since the display format
				1582	* used, list of ranges of sequential numbers, is variable length,
				1583	* and since these maps can change value dynamically, one could read
				1584	* gibberish by doing partial reads while a list was changing.
				1585	* A single large read to a buffer that crosses a page boundary is
				1586	* ok, because the result being copied to user land is not recomputed
				1587	* across a page fault.
				1588	*/
				1589
				1590	static size_t cpuset_sprintf_cpulist(char page, struct cpuset cs)
				1591	{
				1592	size_t count;
				1593
				1594	mutex_lock(&callback_mutex);
				1595	count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
				1596	mutex_unlock(&callback_mutex);
				1597
				1598	return count;
				1599	}
				1600
				1601	static size_t cpuset_sprintf_memlist(char page, struct cpuset cs)
				1602	{
				1603	size_t count;
				1604
				1605	mutex_lock(&callback_mutex);
				1606	count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
				1607	mutex_unlock(&callback_mutex);
				1608
				1609	return count;
				1610	}
				1611
				1612	static ssize_t cpuset_common_file_read(struct cgroup *cont,
				1613	struct cftype *cft,
				1614	struct file *file,
				1615	char __user *buf,
				1616	size_t nbytes, loff_t *ppos)
				1617	{
				1618	struct cpuset *cs = cgroup_cs(cont);
				1619	cpuset_filetype_t type = cft->private;
				1620	char *page;
				1621	ssize_t retval = 0;
				1622	char *s;
				1623
				1624	if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
				1625	return -ENOMEM;
				1626
				1627	s = page;
				1628
				1629	switch (type) {
				1630	case FILE_CPULIST:
				1631	s += cpuset_sprintf_cpulist(s, cs);
				1632	break;
				1633	case FILE_MEMLIST:
				1634	s += cpuset_sprintf_memlist(s, cs);
				1635	break;
				1636	default:
				1637	retval = -EINVAL;
				1638	goto out;
				1639	}
				1640	*s++ = '\n';
				1641
				1642	retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
				1643	out:
				1644	free_page((unsigned long)page);
				1645	return retval;
				1646	}
				1647
				1648	static u64 cpuset_read_u64(struct cgroup cont, struct cftype cft)
				1649	{
				1650	struct cpuset *cs = cgroup_cs(cont);
				1651	cpuset_filetype_t type = cft->private;
				1652	switch (type) {
				1653	case FILE_CPU_EXCLUSIVE:
				1654	return is_cpu_exclusive(cs);
				1655	case FILE_MEM_EXCLUSIVE:
				1656	return is_mem_exclusive(cs);
				1657	case FILE_MEM_HARDWALL:
				1658	return is_mem_hardwall(cs);
				1659	case FILE_SCHED_LOAD_BALANCE:
				1660	return is_sched_load_balance(cs);
				1661	case FILE_MEMORY_MIGRATE:
				1662	return is_memory_migrate(cs);
				1663	case FILE_MEMORY_PRESSURE_ENABLED:
				1664	return cpuset_memory_pressure_enabled;
				1665	case FILE_MEMORY_PRESSURE:
				1666	return fmeter_getrate(&cs->fmeter);
				1667	case FILE_SPREAD_PAGE:
				1668	return is_spread_page(cs);
				1669	case FILE_SPREAD_SLAB:
				1670	return is_spread_slab(cs);
				1671	default:
				1672	BUG();
				1673	}
				1674
				1675	/* Unreachable but makes gcc happy */
				1676	return 0;
				1677	}
				1678
				1679	static s64 cpuset_read_s64(struct cgroup cont, struct cftype cft)
				1680	{
				1681	struct cpuset *cs = cgroup_cs(cont);
				1682	cpuset_filetype_t type = cft->private;
				1683	switch (type) {
				1684	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
				1685	return cs->relax_domain_level;
				1686	default:
				1687	BUG();
				1688	}
				1689
				1690	/* Unrechable but makes gcc happy */
				1691	return 0;
				1692	}
				1693
				1694
				1695	/*
				1696	* for the common functions, 'private' gives the type of file
				1697	*/
				1698
				1699	static struct cftype files[] = {
				1700	{
				1701	.name = "cpus",
				1702	.read = cpuset_common_file_read,
				1703	.write_string = cpuset_write_resmask,
				1704	.max_write_len = (100U + 6 * NR_CPUS),
				1705	.private = FILE_CPULIST,
				1706	},
				1707
				1708	{
				1709	.name = "mems",
				1710	.read = cpuset_common_file_read,
				1711	.write_string = cpuset_write_resmask,
				1712	.max_write_len = (100U + 6 * MAX_NUMNODES),
				1713	.private = FILE_MEMLIST,
				1714	},
				1715
				1716	{
				1717	.name = "cpu_exclusive",
				1718	.read_u64 = cpuset_read_u64,
				1719	.write_u64 = cpuset_write_u64,
				1720	.private = FILE_CPU_EXCLUSIVE,
				1721	},
				1722
				1723	{
				1724	.name = "mem_exclusive",
				1725	.read_u64 = cpuset_read_u64,
				1726	.write_u64 = cpuset_write_u64,
				1727	.private = FILE_MEM_EXCLUSIVE,
				1728	},
				1729
				1730	{
				1731	.name = "mem_hardwall",
				1732	.read_u64 = cpuset_read_u64,
				1733	.write_u64 = cpuset_write_u64,
				1734	.private = FILE_MEM_HARDWALL,
				1735	},
				1736
				1737	{
				1738	.name = "sched_load_balance",
				1739	.read_u64 = cpuset_read_u64,
				1740	.write_u64 = cpuset_write_u64,
				1741	.private = FILE_SCHED_LOAD_BALANCE,
				1742	},
				1743
				1744	{
				1745	.name = "sched_relax_domain_level",
				1746	.read_s64 = cpuset_read_s64,
				1747	.write_s64 = cpuset_write_s64,
				1748	.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
				1749	},
				1750
				1751	{
				1752	.name = "memory_migrate",
				1753	.read_u64 = cpuset_read_u64,
				1754	.write_u64 = cpuset_write_u64,
				1755	.private = FILE_MEMORY_MIGRATE,
				1756	},
				1757
				1758	{
				1759	.name = "memory_pressure",
				1760	.read_u64 = cpuset_read_u64,
				1761	.write_u64 = cpuset_write_u64,
				1762	.private = FILE_MEMORY_PRESSURE,
				1763	.mode = S_IRUGO,
				1764	},
				1765
				1766	{
				1767	.name = "memory_spread_page",
				1768	.read_u64 = cpuset_read_u64,
				1769	.write_u64 = cpuset_write_u64,
				1770	.private = FILE_SPREAD_PAGE,
				1771	},
				1772
				1773	{
				1774	.name = "memory_spread_slab",
				1775	.read_u64 = cpuset_read_u64,
				1776	.write_u64 = cpuset_write_u64,
				1777	.private = FILE_SPREAD_SLAB,
				1778	},
				1779	};
				1780
				1781	static struct cftype cft_memory_pressure_enabled = {
				1782	.name = "memory_pressure_enabled",
				1783	.read_u64 = cpuset_read_u64,
				1784	.write_u64 = cpuset_write_u64,
				1785	.private = FILE_MEMORY_PRESSURE_ENABLED,
				1786	};
				1787
				1788	static int cpuset_populate(struct cgroup_subsys ss, struct cgroup cont)
				1789	{
				1790	int err;
				1791
				1792	err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
				1793	if (err)
				1794	return err;
				1795	/* memory_pressure_enabled is in root cpuset only */
				1796	if (!cont->parent)
				1797	err = cgroup_add_file(cont, ss,
				1798	&cft_memory_pressure_enabled);
				1799	return err;
				1800	}
				1801
				1802	/*
				1803	* post_clone() is called during cgroup_create() when the
				1804	* clone_children mount argument was specified. The cgroup
				1805	* can not yet have any tasks.
				1806	*
				1807	* Currently we refuse to set up the cgroup - thereby
				1808	* refusing the task to be entered, and as a result refusing
				1809	* the sys_unshare() or clone() which initiated it - if any
				1810	* sibling cpusets have exclusive cpus or mem.
				1811	*
				1812	* If this becomes a problem for some users who wish to
				1813	* allow that scenario, then cpuset_post_clone() could be
				1814	* changed to grant parent->cpus_allowed-sibling_cpus_exclusive
				1815	* (and likewise for mems) to the new cgroup. Called with cgroup_mutex
				1816	* held.
				1817	*/
				1818	static void cpuset_post_clone(struct cgroup *cgroup)
				1819	{
				1820	struct cgroup parent, child;
				1821	struct cpuset cs, parent_cs;
				1822
				1823	parent = cgroup->parent;
				1824	list_for_each_entry(child, &parent->children, sibling) {
				1825	cs = cgroup_cs(child);
				1826	if (is_mem_exclusive(cs) \|\| is_cpu_exclusive(cs))
				1827	return;
				1828	}
				1829	cs = cgroup_cs(cgroup);
				1830	parent_cs = cgroup_cs(parent);
				1831
				1832	mutex_lock(&callback_mutex);
				1833	cs->mems_allowed = parent_cs->mems_allowed;
				1834	cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
				1835	mutex_unlock(&callback_mutex);
				1836	return;
				1837	}
				1838
				1839	/*
				1840	* cpuset_create - create a cpuset
				1841	* cont: control group that the new cpuset will be part of
				1842	*/
				1843
				1844	static struct cgroup_subsys_state cpuset_create(struct cgroup cont)
				1845	{
				1846	struct cpuset *cs;
				1847	struct cpuset *parent;
				1848
				1849	if (!cont->parent) {
				1850	return &top_cpuset.css;
				1851	}
				1852	parent = cgroup_cs(cont->parent);
				1853	cs = kmalloc(sizeof(*cs), GFP_KERNEL);
				1854	if (!cs)
				1855	return ERR_PTR(-ENOMEM);
				1856	if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
				1857	kfree(cs);
				1858	return ERR_PTR(-ENOMEM);
				1859	}
				1860
				1861	cs->flags = 0;
				1862	if (is_spread_page(parent))
				1863	set_bit(CS_SPREAD_PAGE, &cs->flags);
				1864	if (is_spread_slab(parent))
				1865	set_bit(CS_SPREAD_SLAB, &cs->flags);
				1866	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
				1867	cpumask_clear(cs->cpus_allowed);
				1868	nodes_clear(cs->mems_allowed);
				1869	fmeter_init(&cs->fmeter);
				1870	cs->relax_domain_level = -1;
				1871
				1872	cs->parent = parent;
				1873	number_of_cpusets++;
				1874	return &cs->css ;
				1875	}
				1876
				1877	/*
				1878	* If the cpuset being removed has its flag 'sched_load_balance'
				1879	* enabled, then simulate turning sched_load_balance off, which
				1880	* will call async_rebuild_sched_domains().
				1881	*/
				1882
				1883	static void cpuset_destroy(struct cgroup *cont)
				1884	{
				1885	struct cpuset *cs = cgroup_cs(cont);
				1886
				1887	if (is_sched_load_balance(cs))
				1888	update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
				1889
				1890	number_of_cpusets--;
				1891	free_cpumask_var(cs->cpus_allowed);
				1892	kfree(cs);
				1893	}
				1894
				1895	struct cgroup_subsys cpuset_subsys = {
				1896	.name = "cpuset",
				1897	.create = cpuset_create,
				1898	.destroy = cpuset_destroy,
				1899	.can_attach = cpuset_can_attach,
				1900	.attach = cpuset_attach,
				1901	.populate = cpuset_populate,
				1902	.post_clone = cpuset_post_clone,
				1903	.subsys_id = cpuset_subsys_id,
				1904	.early_init = 1,
				1905	};
				1906
				1907	/**
				1908	* cpuset_init - initialize cpusets at system boot
				1909	*
				1910	* Description: Initialize top_cpuset and the cpuset internal file system,
				1911	**/
				1912
				1913	int __init cpuset_init(void)
				1914	{
				1915	int err = 0;
				1916
				1917	if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
				1918	BUG();
				1919
				1920	cpumask_setall(top_cpuset.cpus_allowed);
				1921	nodes_setall(top_cpuset.mems_allowed);
				1922
				1923	fmeter_init(&top_cpuset.fmeter);
				1924	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
				1925	top_cpuset.relax_domain_level = -1;
				1926
				1927	err = register_filesystem(&cpuset_fs_type);
				1928	if (err < 0)
				1929	return err;
				1930
				1931	if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
				1932	BUG();
				1933
				1934	number_of_cpusets = 1;
				1935	return 0;
				1936	}
				1937
				1938	/**
				1939	* cpuset_do_move_task - move a given task to another cpuset
				1940	* @tsk: pointer to task_struct the task to move
				1941	* @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
				1942	*
				1943	* Called by cgroup_scan_tasks() for each task in a cgroup.
				1944	* Return nonzero to stop the walk through the tasks.
				1945	*/
				1946	static void cpuset_do_move_task(struct task_struct *tsk,
				1947	struct cgroup_scanner *scan)
				1948	{
				1949	struct cgroup *new_cgroup = scan->data;
				1950
				1951	cgroup_attach_task(new_cgroup, tsk);
				1952	}
				1953
				1954	/**
				1955	* move_member_tasks_to_cpuset - move tasks from one cpuset to another
				1956	* @from: cpuset in which the tasks currently reside
				1957	* @to: cpuset to which the tasks will be moved
				1958	*
				1959	* Called with cgroup_mutex held
				1960	* callback_mutex must not be held, as cpuset_attach() will take it.
				1961	*
				1962	* The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
				1963	* calling callback functions for each.
				1964	*/
				1965	static void move_member_tasks_to_cpuset(struct cpuset from, struct cpuset to)
				1966	{
				1967	struct cgroup_scanner scan;
				1968
				1969	scan.cg = from->css.cgroup;
				1970	scan.test_task = NULL; /* select all tasks in cgroup */
				1971	scan.process_task = cpuset_do_move_task;
				1972	scan.heap = NULL;
				1973	scan.data = to->css.cgroup;
				1974
				1975	if (cgroup_scan_tasks(&scan))
				1976	printk(KERN_ERR "move_member_tasks_to_cpuset: "
				1977	"cgroup_scan_tasks failed\n");
				1978	}
				1979
				1980	/*
				1981	* If CPU and/or memory hotplug handlers, below, unplug any CPUs
				1982	* or memory nodes, we need to walk over the cpuset hierarchy,
				1983	* removing that CPU or node from all cpusets. If this removes the
				1984	* last CPU or node from a cpuset, then move the tasks in the empty
				1985	* cpuset to its next-highest non-empty parent.
				1986	*
				1987	* Called with cgroup_mutex held
				1988	* callback_mutex must not be held, as cpuset_attach() will take it.
				1989	*/
				1990	static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
				1991	{
				1992	struct cpuset *parent;
				1993
				1994	/*
				1995	* The cgroup's css_sets list is in use if there are tasks
				1996	* in the cpuset; the list is empty if there are none;
				1997	* the cs->css.refcnt seems always 0.
				1998	*/
				1999	if (list_empty(&cs->css.cgroup->css_sets))
				2000	return;
				2001
				2002	/*
				2003	* Find its next-highest non-empty parent, (top cpuset
				2004	* has online cpus, so can't be empty).
				2005	*/
				2006	parent = cs->parent;
				2007	while (cpumask_empty(parent->cpus_allowed) \|\|
				2008	nodes_empty(parent->mems_allowed))
				2009	parent = parent->parent;
				2010
				2011	move_member_tasks_to_cpuset(cs, parent);
				2012	}
				2013
				2014	/*
				2015	* Walk the specified cpuset subtree and look for empty cpusets.
				2016	* The tasks of such cpuset must be moved to a parent cpuset.
				2017	*
				2018	* Called with cgroup_mutex held. We take callback_mutex to modify
				2019	* cpus_allowed and mems_allowed.
				2020	*
				2021	* This walk processes the tree from top to bottom, completing one layer
				2022	* before dropping down to the next. It always processes a node before
				2023	* any of its children.
				2024	*
				2025	* For now, since we lack memory hot unplug, we'll never see a cpuset
				2026	* that has tasks along with an empty 'mems'. But if we did see such
				2027	* a cpuset, we'd handle it just like we do if its 'cpus' was empty.
				2028	*/
				2029	static void scan_for_empty_cpusets(struct cpuset *root)
				2030	{
				2031	LIST_HEAD(queue);
				2032	struct cpuset cp; / scans cpusets being updated */
				2033	struct cpuset child; / scans child cpusets of cp */
				2034	struct cgroup *cont;
				2035	static nodemask_t oldmems; /* protected by cgroup_mutex */
				2036
				2037	list_add_tail((struct list_head *)&root->stack_list, &queue);
				2038
				2039	while (!list_empty(&queue)) {
				2040	cp = list_first_entry(&queue, struct cpuset, stack_list);
				2041	list_del(queue.next);
				2042	list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
				2043	child = cgroup_cs(cont);
				2044	list_add_tail(&child->stack_list, &queue);
				2045	}
				2046
				2047	/* Continue past cpusets with all cpus, mems online */
				2048	if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
				2049	nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
				2050	continue;
				2051
				2052	oldmems = cp->mems_allowed;
				2053
				2054	/* Remove offline cpus and mems from this cpuset. */
				2055	mutex_lock(&callback_mutex);
				2056	cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
				2057	cpu_active_mask);
				2058	nodes_and(cp->mems_allowed, cp->mems_allowed,
				2059	node_states[N_HIGH_MEMORY]);
				2060	mutex_unlock(&callback_mutex);
				2061
				2062	/* Move tasks from the empty cpuset to a parent */
				2063	if (cpumask_empty(cp->cpus_allowed) \|\|
				2064	nodes_empty(cp->mems_allowed))
				2065	remove_tasks_in_empty_cpuset(cp);
				2066	else {
				2067	update_tasks_cpumask(cp, NULL);
				2068	update_tasks_nodemask(cp, &oldmems, NULL);
				2069	}
				2070	}
				2071	}
				2072
				2073	/*
				2074	* The top_cpuset tracks what CPUs and Memory Nodes are online,
				2075	* period. This is necessary in order to make cpusets transparent
				2076	* (of no affect) on systems that are actively using CPU hotplug
				2077	* but making no active use of cpusets.
				2078	*
				2079	* The only exception to this is suspend/resume, where we don't
				2080	* modify cpusets at all.
				2081	*
				2082	* This routine ensures that top_cpuset.cpus_allowed tracks
				2083	* cpu_active_mask on each CPU hotplug (cpuhp) event.
				2084	*
				2085	* Called within get_online_cpus(). Needs to call cgroup_lock()
				2086	* before calling generate_sched_domains().
				2087	*/
				2088	void cpuset_update_active_cpus(void)
				2089	{
				2090	struct sched_domain_attr *attr;
				2091	cpumask_var_t *doms;
				2092	int ndoms;
				2093
				2094	cgroup_lock();
				2095	mutex_lock(&callback_mutex);
				2096	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
				2097	mutex_unlock(&callback_mutex);
				2098	scan_for_empty_cpusets(&top_cpuset);
				2099	ndoms = generate_sched_domains(&doms, &attr);
				2100	cgroup_unlock();
				2101
				2102	/* Have scheduler rebuild the domains */
				2103	partition_sched_domains(ndoms, doms, attr);
				2104	}
				2105
				2106	#ifdef CONFIG_MEMORY_HOTPLUG
				2107	/*
				2108	* Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
				2109	* Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
				2110	* See also the previous routine cpuset_track_online_cpus().
				2111	*/
				2112	static int cpuset_track_online_nodes(struct notifier_block *self,
				2113	unsigned long action, void *arg)
				2114	{
				2115	static nodemask_t oldmems; /* protected by cgroup_mutex */
				2116
				2117	cgroup_lock();
				2118	switch (action) {
				2119	case MEM_ONLINE:
				2120	oldmems = top_cpuset.mems_allowed;
				2121	mutex_lock(&callback_mutex);
				2122	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
				2123	mutex_unlock(&callback_mutex);
				2124	update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
				2125	break;
				2126	case MEM_OFFLINE:
				2127	/*
				2128	* needn't update top_cpuset.mems_allowed explicitly because
				2129	* scan_for_empty_cpusets() will update it.
				2130	*/
				2131	scan_for_empty_cpusets(&top_cpuset);
				2132	break;
				2133	default:
				2134	break;
				2135	}
				2136	cgroup_unlock();
				2137
				2138	return NOTIFY_OK;
				2139	}
				2140	#endif
				2141
				2142	/**
				2143	* cpuset_init_smp - initialize cpus_allowed
				2144	*
				2145	* Description: Finish top cpuset after cpu, node maps are initialized
				2146	**/
				2147
				2148	void __init cpuset_init_smp(void)
				2149	{
				2150	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
				2151	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
				2152
				2153	hotplug_memory_notifier(cpuset_track_online_nodes, 10);
				2154
				2155	cpuset_wq = create_singlethread_workqueue("cpuset");
				2156	BUG_ON(!cpuset_wq);
				2157	}
				2158
				2159	/**
				2160	* cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
				2161	* @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
				2162	* @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
				2163	*
				2164	* Description: Returns the cpumask_var_t cpus_allowed of the cpuset
				2165	* attached to the specified @tsk. Guaranteed to return some non-empty
				2166	* subset of cpu_online_mask, even if this means going outside the
				2167	* tasks cpuset.
				2168	**/
				2169
				2170	void cpuset_cpus_allowed(struct task_struct tsk, struct cpumask pmask)
				2171	{
				2172	mutex_lock(&callback_mutex);
				2173	task_lock(tsk);
				2174	guarantee_online_cpus(task_cs(tsk), pmask);
				2175	task_unlock(tsk);
				2176	mutex_unlock(&callback_mutex);
				2177	}
				2178
				2179	void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
				2180	{
				2181	const struct cpuset *cs;
				2182
				2183	rcu_read_lock();
				2184	cs = task_cs(tsk);
				2185	if (cs)
				2186	do_set_cpus_allowed(tsk, cs->cpus_allowed);
				2187	rcu_read_unlock();
				2188
				2189	/*
				2190	* We own tsk->cpus_allowed, nobody can change it under us.
				2191	*
				2192	* But we used cs && cs->cpus_allowed lockless and thus can
				2193	* race with cgroup_attach_task() or update_cpumask() and get
				2194	* the wrong tsk->cpus_allowed. However, both cases imply the
				2195	* subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
				2196	* which takes task_rq_lock().
				2197	*
				2198	* If we are called after it dropped the lock we must see all
				2199	* changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
				2200	* set any mask even if it is not right from task_cs() pov,
				2201	* the pending set_cpus_allowed_ptr() will fix things.
				2202	*
				2203	* select_fallback_rq() will fix things ups and set cpu_possible_mask
				2204	* if required.
				2205	*/
				2206	}
				2207
				2208	void cpuset_init_current_mems_allowed(void)
				2209	{
				2210	nodes_setall(current->mems_allowed);
				2211	}
				2212
				2213	/**
				2214	* cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
				2215	* @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
				2216	*
				2217	* Description: Returns the nodemask_t mems_allowed of the cpuset
				2218	* attached to the specified @tsk. Guaranteed to return some non-empty
				2219	* subset of node_states[N_HIGH_MEMORY], even if this means going outside the
				2220	* tasks cpuset.
				2221	**/
				2222
				2223	nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
				2224	{
				2225	nodemask_t mask;
				2226
				2227	mutex_lock(&callback_mutex);
				2228	task_lock(tsk);
				2229	guarantee_online_mems(task_cs(tsk), &mask);
				2230	task_unlock(tsk);
				2231	mutex_unlock(&callback_mutex);
				2232
				2233	return mask;
				2234	}
				2235
				2236	/**
				2237	* cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed
				2238	* @nodemask: the nodemask to be checked
				2239	*
				2240	* Are any of the nodes in the nodemask allowed in current->mems_allowed?
				2241	*/
				2242	int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
				2243	{
				2244	return nodes_intersects(*nodemask, current->mems_allowed);
				2245	}
				2246
				2247	/*
				2248	* nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
				2249	* mem_hardwall ancestor to the specified cpuset. Call holding
				2250	* callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
				2251	* (an unusual configuration), then returns the root cpuset.
				2252	*/
				2253	static const struct cpuset nearest_hardwall_ancestor(const struct cpuset cs)
				2254	{
				2255	while (!(is_mem_exclusive(cs) \|\| is_mem_hardwall(cs)) && cs->parent)
				2256	cs = cs->parent;
				2257	return cs;
				2258	}
				2259
				2260	/**
				2261	* cpuset_node_allowed_softwall - Can we allocate on a memory node?
				2262	* @node: is this an allowed node?
				2263	* @gfp_mask: memory allocation flags
				2264	*
				2265	* If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
				2266	* set, yes, we can always allocate. If node is in our task's mems_allowed,
				2267	* yes. If it's not a __GFP_HARDWALL request and this node is in the nearest
				2268	* hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been
				2269	* OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
				2270	* flag, yes.
				2271	* Otherwise, no.
				2272	*
				2273	* If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
				2274	* cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall()
				2275	* might sleep, and might allow a node from an enclosing cpuset.
				2276	*
				2277	* cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
				2278	* cpusets, and never sleeps.
				2279	*
				2280	* The __GFP_THISNODE placement logic is really handled elsewhere,
				2281	* by forcibly using a zonelist starting at a specified node, and by
				2282	* (in get_page_from_freelist()) refusing to consider the zones for
				2283	* any node on the zonelist except the first. By the time any such
				2284	* calls get to this routine, we should just shut up and say 'yes'.
				2285	*
				2286	* GFP_USER allocations are marked with the __GFP_HARDWALL bit,
				2287	* and do not allow allocations outside the current tasks cpuset
				2288	* unless the task has been OOM killed and is marked TIF_MEMDIE.
				2289	* GFP_KERNEL allocations are not so marked, so can escape to the
				2290	* nearest enclosing hardwalled ancestor cpuset.
				2291	*
				2292	* Scanning up parent cpusets requires callback_mutex. The
				2293	* __alloc_pages() routine only calls here with __GFP_HARDWALL bit
				2294	* _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
				2295	* current tasks mems_allowed came up empty on the first pass over
				2296	* the zonelist. So only GFP_KERNEL allocations, if all nodes in the
				2297	* cpuset are short of memory, might require taking the callback_mutex
				2298	* mutex.
				2299	*
				2300	* The first call here from mm/page_alloc:get_page_from_freelist()
				2301	* has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
				2302	* so no allocation on a node outside the cpuset is allowed (unless
				2303	* in interrupt, of course).
				2304	*
				2305	* The second pass through get_page_from_freelist() doesn't even call
				2306	* here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
				2307	* variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
				2308	* in alloc_flags. That logic and the checks below have the combined
				2309	* effect that:
				2310	* in_interrupt - any node ok (current task context irrelevant)
				2311	* GFP_ATOMIC - any node ok
				2312	* TIF_MEMDIE - any node ok
				2313	* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
				2314	* GFP_USER - only nodes in current tasks mems allowed ok.
				2315	*
				2316	* Rule:
				2317	* Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
				2318	* pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
				2319	* the code that might scan up ancestor cpusets and sleep.
				2320	*/
				2321	int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
				2322	{
				2323	const struct cpuset cs; / current cpuset ancestors */
				2324	int allowed; /* is allocation in zone z allowed? */
				2325
				2326	if (in_interrupt() \|\| (gfp_mask & __GFP_THISNODE))
				2327	return 1;
				2328	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
				2329	if (node_isset(node, current->mems_allowed))
				2330	return 1;
				2331	/*
				2332	* Allow tasks that have access to memory reserves because they have
				2333	* been OOM killed to get memory anywhere.
				2334	*/
				2335	if (unlikely(test_thread_flag(TIF_MEMDIE)))
				2336	return 1;
				2337	if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
				2338	return 0;
				2339
				2340	if (current->flags & PF_EXITING) /* Let dying task have memory */
				2341	return 1;
				2342
				2343	/* Not hardwall and node outside mems_allowed: scan up cpusets */
				2344	mutex_lock(&callback_mutex);
				2345
				2346	task_lock(current);
				2347	cs = nearest_hardwall_ancestor(task_cs(current));
				2348	allowed = node_isset(node, cs->mems_allowed);
				2349	task_unlock(current);
				2350
				2351	mutex_unlock(&callback_mutex);
				2352	return allowed;
				2353	}
				2354
				2355	/*
				2356	* cpuset_node_allowed_hardwall - Can we allocate on a memory node?
				2357	* @node: is this an allowed node?
				2358	* @gfp_mask: memory allocation flags
				2359	*
				2360	* If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
				2361	* set, yes, we can always allocate. If node is in our task's mems_allowed,
				2362	* yes. If the task has been OOM killed and has access to memory reserves as
				2363	* specified by the TIF_MEMDIE flag, yes.
				2364	* Otherwise, no.
				2365	*
				2366	* The __GFP_THISNODE placement logic is really handled elsewhere,
				2367	* by forcibly using a zonelist starting at a specified node, and by
				2368	* (in get_page_from_freelist()) refusing to consider the zones for
				2369	* any node on the zonelist except the first. By the time any such
				2370	* calls get to this routine, we should just shut up and say 'yes'.
				2371	*
				2372	* Unlike the cpuset_node_allowed_softwall() variant, above,
				2373	* this variant requires that the node be in the current task's
				2374	* mems_allowed or that we're in interrupt. It does not scan up the
				2375	* cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
				2376	* It never sleeps.
				2377	*/
				2378	int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
				2379	{
				2380	if (in_interrupt() \|\| (gfp_mask & __GFP_THISNODE))
				2381	return 1;
				2382	if (node_isset(node, current->mems_allowed))
				2383	return 1;
				2384	/*
				2385	* Allow tasks that have access to memory reserves because they have
				2386	* been OOM killed to get memory anywhere.
				2387	*/
				2388	if (unlikely(test_thread_flag(TIF_MEMDIE)))
				2389	return 1;
				2390	return 0;
				2391	}
				2392
				2393	/**
				2394	* cpuset_unlock - release lock on cpuset changes
				2395	*
				2396	* Undo the lock taken in a previous cpuset_lock() call.
				2397	*/
				2398
				2399	void cpuset_unlock(void)
				2400	{
				2401	mutex_unlock(&callback_mutex);
				2402	}
				2403
				2404	/**
				2405	* cpuset_mem_spread_node() - On which node to begin search for a file page
				2406	* cpuset_slab_spread_node() - On which node to begin search for a slab page
				2407	*
				2408	* If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
				2409	* tasks in a cpuset with is_spread_page or is_spread_slab set),
				2410	* and if the memory allocation used cpuset_mem_spread_node()
				2411	* to determine on which node to start looking, as it will for
				2412	* certain page cache or slab cache pages such as used for file
				2413	* system buffers and inode caches, then instead of starting on the
				2414	* local node to look for a free page, rather spread the starting
				2415	* node around the tasks mems_allowed nodes.
				2416	*
				2417	* We don't have to worry about the returned node being offline
				2418	* because "it can't happen", and even if it did, it would be ok.
				2419	*
				2420	* The routines calling guarantee_online_mems() are careful to
				2421	* only set nodes in task->mems_allowed that are online. So it
				2422	* should not be possible for the following code to return an
				2423	* offline node. But if it did, that would be ok, as this routine
				2424	* is not returning the node where the allocation must be, only
				2425	* the node where the search should start. The zonelist passed to
				2426	* __alloc_pages() will include all nodes. If the slab allocator
				2427	* is passed an offline node, it will fall back to the local node.
				2428	* See kmem_cache_alloc_node().
				2429	*/
				2430
				2431	static int cpuset_spread_node(int *rotor)
				2432	{
				2433	int node;
				2434
				2435	node = next_node(*rotor, current->mems_allowed);
				2436	if (node == MAX_NUMNODES)
				2437	node = first_node(current->mems_allowed);
				2438	*rotor = node;
				2439	return node;
				2440	}
				2441
				2442	int cpuset_mem_spread_node(void)
				2443	{
				2444	if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
				2445	current->cpuset_mem_spread_rotor =
				2446	node_random(&current->mems_allowed);
				2447
				2448	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
				2449	}
				2450
				2451	int cpuset_slab_spread_node(void)
				2452	{
				2453	if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
				2454	current->cpuset_slab_spread_rotor =
				2455	node_random(&current->mems_allowed);
				2456
				2457	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
				2458	}
				2459
				2460	EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
				2461
				2462	/**
				2463	* cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
				2464	* @tsk1: pointer to task_struct of some task.
				2465	* @tsk2: pointer to task_struct of some other task.
				2466	*
				2467	* Description: Return true if @tsk1's mems_allowed intersects the
				2468	* mems_allowed of @tsk2. Used by the OOM killer to determine if
				2469	* one of the task's memory usage might impact the memory available
				2470	* to the other.
				2471	**/
				2472
				2473	int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
				2474	const struct task_struct *tsk2)
				2475	{
				2476	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
				2477	}
				2478
				2479	/**
				2480	* cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
				2481	* @task: pointer to task_struct of some task.
				2482	*
				2483	* Description: Prints @task's name, cpuset name, and cached copy of its
				2484	* mems_allowed to the kernel log. Must hold task_lock(task) to allow
				2485	* dereferencing task_cs(task).
				2486	*/
				2487	void cpuset_print_task_mems_allowed(struct task_struct *tsk)
				2488	{
				2489	struct dentry *dentry;
				2490
				2491	dentry = task_cs(tsk)->css.cgroup->dentry;
				2492	spin_lock(&cpuset_buffer_lock);
				2493
				2494	if (!dentry) {
				2495	strcpy(cpuset_name, "/");
				2496	} else {
				2497	spin_lock(&dentry->d_lock);
				2498	strlcpy(cpuset_name, (const char *)dentry->d_name.name,
				2499	CPUSET_NAME_LEN);
				2500	spin_unlock(&dentry->d_lock);
				2501	}
				2502
				2503	nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
				2504	tsk->mems_allowed);
				2505	printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
				2506	tsk->comm, cpuset_name, cpuset_nodelist);
				2507	spin_unlock(&cpuset_buffer_lock);
				2508	}
				2509
				2510	/*
				2511	* Collection of memory_pressure is suppressed unless
				2512	* this flag is enabled by writing "1" to the special
				2513	* cpuset file 'memory_pressure_enabled' in the root cpuset.
				2514	*/
				2515
				2516	int cpuset_memory_pressure_enabled __read_mostly;
				2517
				2518	/**
				2519	* cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
				2520	*
				2521	* Keep a running average of the rate of synchronous (direct)
				2522	* page reclaim efforts initiated by tasks in each cpuset.
				2523	*
				2524	* This represents the rate at which some task in the cpuset
				2525	* ran low on memory on all nodes it was allowed to use, and
				2526	* had to enter the kernels page reclaim code in an effort to
				2527	* create more free memory by tossing clean pages or swapping
				2528	* or writing dirty pages.
				2529	*
				2530	* Display to user space in the per-cpuset read-only file
				2531	* "memory_pressure". Value displayed is an integer
				2532	* representing the recent rate of entry into the synchronous
				2533	* (direct) page reclaim by any task attached to the cpuset.
				2534	**/
				2535
				2536	void __cpuset_memory_pressure_bump(void)
				2537	{
				2538	task_lock(current);
				2539	fmeter_markevent(&task_cs(current)->fmeter);
				2540	task_unlock(current);
				2541	}
				2542
				2543	#ifdef CONFIG_PROC_PID_CPUSET
				2544	/*
				2545	* proc_cpuset_show()
				2546	* - Print tasks cpuset path into seq_file.
				2547	* - Used for /proc/<pid>/cpuset.
				2548	* - No need to task_lock(tsk) on this tsk->cpuset reference, as it
				2549	* doesn't really matter if tsk->cpuset changes after we read it,
				2550	* and we take cgroup_mutex, keeping cpuset_attach() from changing it
				2551	* anyway.
				2552	*/
				2553	static int proc_cpuset_show(struct seq_file m, void unused_v)
				2554	{
				2555	struct pid *pid;
				2556	struct task_struct *tsk;
				2557	char *buf;
				2558	struct cgroup_subsys_state *css;
				2559	int retval;
				2560
				2561	retval = -ENOMEM;
				2562	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
				2563	if (!buf)
				2564	goto out;
				2565
				2566	retval = -ESRCH;
				2567	pid = m->private;
				2568	tsk = get_pid_task(pid, PIDTYPE_PID);
				2569	if (!tsk)
				2570	goto out_free;
				2571
				2572	retval = -EINVAL;
				2573	cgroup_lock();
				2574	css = task_subsys_state(tsk, cpuset_subsys_id);
				2575	retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
				2576	if (retval < 0)
				2577	goto out_unlock;
				2578	seq_puts(m, buf);
				2579	seq_putc(m, '\n');
				2580	out_unlock:
				2581	cgroup_unlock();
				2582	put_task_struct(tsk);
				2583	out_free:
				2584	kfree(buf);
				2585	out:
				2586	return retval;
				2587	}
				2588
				2589	static int cpuset_open(struct inode inode, struct file file)
				2590	{
				2591	struct pid *pid = PROC_I(inode)->pid;
				2592	return single_open(file, proc_cpuset_show, pid);
				2593	}
				2594
				2595	const struct file_operations proc_cpuset_operations = {
				2596	.open = cpuset_open,
				2597	.read = seq_read,
				2598	.llseek = seq_lseek,
				2599	.release = single_release,
				2600	};
				2601	#endif /* CONFIG_PROC_PID_CPUSET */
				2602
				2603	/* Display task mems_allowed in /proc/<pid>/status file. */
				2604	void cpuset_task_status_allowed(struct seq_file m, struct task_struct task)
				2605	{
				2606	seq_printf(m, "Mems_allowed:\t");
				2607	seq_nodemask(m, &task->mems_allowed);
				2608	seq_printf(m, "\n");
				2609	seq_printf(m, "Mems_allowed_list:\t");
				2610	seq_nodemask_list(m, &task->mems_allowed);
				2611	seq_printf(m, "\n");
				2612	}