Blame - marvell/linux/kernel/cgroup/cpuset.c - T108

blob: fe5f4196aee6bd27062f5dca1a51568d5b7de3cf [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	/*
				2	* kernel/cpuset.c
				3	*
				4	* Processor and Memory placement constraints for sets of tasks.
				5	*
				6	* Copyright (C) 2003 BULL SA.
				7	* Copyright (C) 2004-2007 Silicon Graphics, Inc.
				8	* Copyright (C) 2006 Google, Inc
				9	*
				10	* Portions derived from Patrick Mochel's sysfs code.
				11	* sysfs is Copyright (c) 2001-3 Patrick Mochel
				12	*
				13	* 2003-10-10 Written by Simon Derr.
				14	* 2003-10-22 Updates by Stephen Hemminger.
				15	* 2004 May-July Rework by Paul Jackson.
				16	* 2006 Rework by Paul Menage to use generic cgroups
				17	* 2008 Rework of the scheduler domains and CPU hotplug handling
				18	* by Max Krasnyansky
				19	*
				20	* This file is subject to the terms and conditions of the GNU General Public
				21	* License. See the file COPYING in the main directory of the Linux
				22	* distribution for more details.
				23	*/
				24
				25	#include "cgroup-internal.h"
				26	#include <linux/cpu.h>
				27	#include <linux/cpumask.h>
				28	#include <linux/cpuset.h>
				29	#include <linux/err.h>
				30	#include <linux/errno.h>
				31	#include <linux/file.h>
				32	#include <linux/fs.h>
				33	#include <linux/init.h>
				34	#include <linux/interrupt.h>
				35	#include <linux/kernel.h>
				36	#include <linux/kmod.h>
				37	#include <linux/kthread.h>
				38	#include <linux/list.h>
				39	#include <linux/mempolicy.h>
				40	#include <linux/mm.h>
				41	#include <linux/memory.h>
				42	#include <linux/export.h>
				43	#include <linux/mount.h>
				44	#include <linux/fs_context.h>
				45	#include <linux/namei.h>
				46	#include <linux/pagemap.h>
				47	#include <linux/proc_fs.h>
				48	#include <linux/rcupdate.h>
				49	#include <linux/sched.h>
				50	#include <linux/sched/deadline.h>
				51	#include <linux/sched/mm.h>
				52	#include <linux/sched/task.h>
				53	#include <linux/seq_file.h>
				54	#include <linux/security.h>
				55	#include <linux/slab.h>
				56	#include <linux/spinlock.h>
				57	#include <linux/stat.h>
				58	#include <linux/string.h>
				59	#include <linux/time.h>
				60	#include <linux/time64.h>
				61	#include <linux/backing-dev.h>
				62	#include <linux/sort.h>
				63	#include <linux/oom.h>
				64	#include <linux/sched/isolation.h>
				65	#include <linux/uaccess.h>
				66	#include <linux/atomic.h>
				67	#include <linux/mutex.h>
				68	#include <linux/cgroup.h>
				69	#include <linux/wait.h>
				70
				71	DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
				72	DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
				73
				74	/* See "Frequency meter" comments, below. */
				75
				76	struct fmeter {
				77	int cnt; /* unprocessed events count */
				78	int val; /* most recent output value */
				79	time64_t time; /* clock (secs) when val computed */
				80	spinlock_t lock; /* guards read or write of above */
				81	};
				82
				83	struct cpuset {
				84	struct cgroup_subsys_state css;
				85
				86	unsigned long flags; /* "unsigned long" so bitops work */
				87
				88	/*
				89	* On default hierarchy:
				90	*
				91	* The user-configured masks can only be changed by writing to
				92	* cpuset.cpus and cpuset.mems, and won't be limited by the
				93	* parent masks.
				94	*
				95	* The effective masks is the real masks that apply to the tasks
				96	* in the cpuset. They may be changed if the configured masks are
				97	* changed or hotplug happens.
				98	*
				99	* effective_mask == configured_mask & parent's effective_mask,
				100	* and if it ends up empty, it will inherit the parent's mask.
				101	*
				102	*
				103	* On legacy hierachy:
				104	*
				105	* The user-configured masks are always the same with effective masks.
				106	*/
				107
				108	/* user-configured CPUs and Memory Nodes allow to tasks */
				109	cpumask_var_t cpus_allowed;
				110	cpumask_var_t cpus_requested;
				111	nodemask_t mems_allowed;
				112
				113	/* effective CPUs and Memory Nodes allow to tasks */
				114	cpumask_var_t effective_cpus;
				115	nodemask_t effective_mems;
				116
				117	/*
				118	* CPUs allocated to child sub-partitions (default hierarchy only)
				119	* - CPUs granted by the parent = effective_cpus U subparts_cpus
				120	* - effective_cpus and subparts_cpus are mutually exclusive.
				121	*
				122	* effective_cpus contains only onlined CPUs, but subparts_cpus
				123	* may have offlined ones.
				124	*/
				125	cpumask_var_t subparts_cpus;
				126
				127	/*
				128	* This is old Memory Nodes tasks took on.
				129	*
				130	* - top_cpuset.old_mems_allowed is initialized to mems_allowed.
				131	* - A new cpuset's old_mems_allowed is initialized when some
				132	* task is moved into it.
				133	* - old_mems_allowed is used in cpuset_migrate_mm() when we change
				134	* cpuset.mems_allowed and have tasks' nodemask updated, and
				135	* then old_mems_allowed is updated to mems_allowed.
				136	*/
				137	nodemask_t old_mems_allowed;
				138
				139	struct fmeter fmeter; /* memory_pressure filter */
				140
				141	/*
				142	* Tasks are being attached to this cpuset. Used to prevent
				143	* zeroing cpus/mems_allowed between ->can_attach() and ->attach().
				144	*/
				145	int attach_in_progress;
				146
				147	/* partition number for rebuild_sched_domains() */
				148	int pn;
				149
				150	/* for custom sched domain */
				151	int relax_domain_level;
				152
				153	/* number of CPUs in subparts_cpus */
				154	int nr_subparts_cpus;
				155
				156	/* partition root state */
				157	int partition_root_state;
				158
				159	/*
				160	* Default hierarchy only:
				161	* use_parent_ecpus - set if using parent's effective_cpus
				162	* child_ecpus_count - # of children with use_parent_ecpus set
				163	*/
				164	int use_parent_ecpus;
				165	int child_ecpus_count;
				166	};
				167
				168	/*
				169	* Partition root states:
				170	*
				171	* 0 - not a partition root
				172	*
				173	* 1 - partition root
				174	*
				175	* -1 - invalid partition root
				176	* None of the cpus in cpus_allowed can be put into the parent's
				177	* subparts_cpus. In this case, the cpuset is not a real partition
				178	* root anymore. However, the CPU_EXCLUSIVE bit will still be set
				179	* and the cpuset can be restored back to a partition root if the
				180	* parent cpuset can give more CPUs back to this child cpuset.
				181	*/
				182	#define PRS_DISABLED 0
				183	#define PRS_ENABLED 1
				184	#define PRS_ERROR -1
				185
				186	/*
				187	* Temporary cpumasks for working with partitions that are passed among
				188	* functions to avoid memory allocation in inner functions.
				189	*/
				190	struct tmpmasks {
				191	cpumask_var_t addmask, delmask; /* For partition root */
				192	cpumask_var_t new_cpus; /* For update_cpumasks_hier() */
				193	};
				194
				195	static inline struct cpuset css_cs(struct cgroup_subsys_state css)
				196	{
				197	return css ? container_of(css, struct cpuset, css) : NULL;
				198	}
				199
				200	/* Retrieve the cpuset for a task */
				201	static inline struct cpuset task_cs(struct task_struct task)
				202	{
				203	return css_cs(task_css(task, cpuset_cgrp_id));
				204	}
				205
				206	static inline struct cpuset parent_cs(struct cpuset cs)
				207	{
				208	return css_cs(cs->css.parent);
				209	}
				210
				211	/* bits in struct cpuset flags field */
				212	typedef enum {
				213	CS_ONLINE,
				214	CS_CPU_EXCLUSIVE,
				215	CS_MEM_EXCLUSIVE,
				216	CS_MEM_HARDWALL,
				217	CS_MEMORY_MIGRATE,
				218	CS_SCHED_LOAD_BALANCE,
				219	CS_SPREAD_PAGE,
				220	CS_SPREAD_SLAB,
				221	} cpuset_flagbits_t;
				222
				223	/* convenient tests for these bits */
				224	static inline bool is_cpuset_online(struct cpuset *cs)
				225	{
				226	return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
				227	}
				228
				229	static inline int is_cpu_exclusive(const struct cpuset *cs)
				230	{
				231	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
				232	}
				233
				234	static inline int is_mem_exclusive(const struct cpuset *cs)
				235	{
				236	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
				237	}
				238
				239	static inline int is_mem_hardwall(const struct cpuset *cs)
				240	{
				241	return test_bit(CS_MEM_HARDWALL, &cs->flags);
				242	}
				243
				244	static inline int is_sched_load_balance(const struct cpuset *cs)
				245	{
				246	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
				247	}
				248
				249	static inline int is_memory_migrate(const struct cpuset *cs)
				250	{
				251	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
				252	}
				253
				254	static inline int is_spread_page(const struct cpuset *cs)
				255	{
				256	return test_bit(CS_SPREAD_PAGE, &cs->flags);
				257	}
				258
				259	static inline int is_spread_slab(const struct cpuset *cs)
				260	{
				261	return test_bit(CS_SPREAD_SLAB, &cs->flags);
				262	}
				263
				264	static inline int is_partition_root(const struct cpuset *cs)
				265	{
				266	return cs->partition_root_state > 0;
				267	}
				268
				269	static struct cpuset top_cpuset = {
				270	.flags = ((1 << CS_ONLINE) \| (1 << CS_CPU_EXCLUSIVE) \|
				271	(1 << CS_MEM_EXCLUSIVE)),
				272	.partition_root_state = PRS_ENABLED,
				273	};
				274
				275	/**
				276	* cpuset_for_each_child - traverse online children of a cpuset
				277	* @child_cs: loop cursor pointing to the current child
				278	* @pos_css: used for iteration
				279	* @parent_cs: target cpuset to walk children of
				280	*
				281	* Walk @child_cs through the online children of @parent_cs. Must be used
				282	* with RCU read locked.
				283	*/
				284	#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \
				285	css_for_each_child((pos_css), &(parent_cs)->css) \
				286	if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
				287
				288	/**
				289	* cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
				290	* @des_cs: loop cursor pointing to the current descendant
				291	* @pos_css: used for iteration
				292	* @root_cs: target cpuset to walk ancestor of
				293	*
				294	* Walk @des_cs through the online descendants of @root_cs. Must be used
				295	* with RCU read locked. The caller may modify @pos_css by calling
				296	* css_rightmost_descendant() to skip subtree. @root_cs is included in the
				297	* iteration and the first node to be visited.
				298	*/
				299	#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \
				300	css_for_each_descendant_pre((pos_css), &(root_cs)->css) \
				301	if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
				302
				303	/*
				304	* There are two global locks guarding cpuset structures - cpuset_mutex and
				305	* callback_lock. We also require taking task_lock() when dereferencing a
				306	* task's cpuset pointer. See "The task_lock() exception", at the end of this
				307	* comment.
				308	*
				309	* A task must hold both locks to modify cpusets. If a task holds
				310	* cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
				311	* is the only task able to also acquire callback_lock and be able to
				312	* modify cpusets. It can perform various checks on the cpuset structure
				313	* first, knowing nothing will change. It can also allocate memory while
				314	* just holding cpuset_mutex. While it is performing these checks, various
				315	* callback routines can briefly acquire callback_lock to query cpusets.
				316	* Once it is ready to make the changes, it takes callback_lock, blocking
				317	* everyone else.
				318	*
				319	* Calls to the kernel memory allocator can not be made while holding
				320	* callback_lock, as that would risk double tripping on callback_lock
				321	* from one of the callbacks into the cpuset code from within
				322	* __alloc_pages().
				323	*
				324	* If a task is only holding callback_lock, then it has read-only
				325	* access to cpusets.
				326	*
				327	* Now, the task_struct fields mems_allowed and mempolicy may be changed
				328	* by other task, we use alloc_lock in the task_struct fields to protect
				329	* them.
				330	*
				331	* The cpuset_common_file_read() handlers only hold callback_lock across
				332	* small pieces of code, such as when reading out possibly multi-word
				333	* cpumasks and nodemasks.
				334	*
				335	* Accessing a task's cpuset should be done in accordance with the
				336	* guidelines for accessing subsystem state in kernel/cgroup.c
				337	*/
				338
				339	DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);
				340	static DEFINE_SPINLOCK(callback_lock);
				341
				342	static struct workqueue_struct *cpuset_migrate_mm_wq;
				343
				344	/*
				345	* CPU / memory hotplug is handled asynchronously.
				346	*/
				347	static void cpuset_hotplug_workfn(struct work_struct *work);
				348	static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
				349
				350	static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
				351
				352	/*
				353	* Cgroup v2 behavior is used when on default hierarchy or the
				354	* cgroup_v2_mode flag is set.
				355	*/
				356	static inline bool is_in_v2_mode(void)
				357	{
				358	return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) \|\|
				359	(cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
				360	}
				361
				362	/*
				363	* Return in pmask the portion of a cpusets's cpus_allowed that
				364	* are online. If none are online, walk up the cpuset hierarchy
				365	* until we find one that does have some online cpus.
				366	*
				367	* One way or another, we guarantee to return some non-empty subset
				368	* of cpu_online_mask.
				369	*
				370	* Call with callback_lock or cpuset_mutex held.
				371	*/
				372	static void guarantee_online_cpus(struct cpuset cs, struct cpumask pmask)
				373	{
				374	while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
				375	cs = parent_cs(cs);
				376	if (unlikely(!cs)) {
				377	/*
				378	* The top cpuset doesn't have any online cpu as a
				379	* consequence of a race between cpuset_hotplug_work
				380	* and cpu hotplug notifier. But we know the top
				381	* cpuset's effective_cpus is on its way to to be
				382	* identical to cpu_online_mask.
				383	*/
				384	cpumask_copy(pmask, cpu_online_mask);
				385	return;
				386	}
				387	}
				388	cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
				389	}
				390
				391	/*
				392	* Return in *pmask the portion of a cpusets's mems_allowed that
				393	* are online, with memory. If none are online with memory, walk
				394	* up the cpuset hierarchy until we find one that does have some
				395	* online mems. The top cpuset always has some mems online.
				396	*
				397	* One way or another, we guarantee to return some non-empty subset
				398	* of node_states[N_MEMORY].
				399	*
				400	* Call with callback_lock or cpuset_mutex held.
				401	*/
				402	static void guarantee_online_mems(struct cpuset cs, nodemask_t pmask)
				403	{
				404	while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
				405	cs = parent_cs(cs);
				406	nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
				407	}
				408
				409	/*
				410	* update task's spread flag if cpuset's page/slab spread flag is set
				411	*
				412	* Call with callback_lock or cpuset_mutex held.
				413	*/
				414	static void cpuset_update_task_spread_flag(struct cpuset *cs,
				415	struct task_struct *tsk)
				416	{
				417	if (is_spread_page(cs))
				418	task_set_spread_page(tsk);
				419	else
				420	task_clear_spread_page(tsk);
				421
				422	if (is_spread_slab(cs))
				423	task_set_spread_slab(tsk);
				424	else
				425	task_clear_spread_slab(tsk);
				426	}
				427
				428	/*
				429	* is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
				430	*
				431	* One cpuset is a subset of another if all its allowed CPUs and
				432	* Memory Nodes are a subset of the other, and its exclusive flags
				433	* are only set if the other's are set. Call holding cpuset_mutex.
				434	*/
				435
				436	static int is_cpuset_subset(const struct cpuset p, const struct cpuset q)
				437	{
				438	return cpumask_subset(p->cpus_requested, q->cpus_requested) &&
				439	nodes_subset(p->mems_allowed, q->mems_allowed) &&
				440	is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
				441	is_mem_exclusive(p) <= is_mem_exclusive(q);
				442	}
				443
				444	/**
				445	* alloc_cpumasks - allocate three cpumasks for cpuset
				446	* @cs: the cpuset that have cpumasks to be allocated.
				447	* @tmp: the tmpmasks structure pointer
				448	* Return: 0 if successful, -ENOMEM otherwise.
				449	*
				450	* Only one of the two input arguments should be non-NULL.
				451	*/
				452	static inline int alloc_cpumasks(struct cpuset cs, struct tmpmasks tmp)
				453	{
				454	cpumask_var_t pmask1, pmask2, *pmask3;
				455
				456	if (cs) {
				457	pmask1 = &cs->cpus_allowed;
				458	pmask2 = &cs->effective_cpus;
				459	pmask3 = &cs->subparts_cpus;
				460	} else {
				461	pmask1 = &tmp->new_cpus;
				462	pmask2 = &tmp->addmask;
				463	pmask3 = &tmp->delmask;
				464	}
				465
				466	if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
				467	return -ENOMEM;
				468
				469	if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
				470	goto free_one;
				471
				472	if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
				473	goto free_two;
				474
				475	if (cs && !zalloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL))
				476	goto free_three;
				477
				478	return 0;
				479
				480	free_three:
				481	free_cpumask_var(*pmask3);
				482	free_two:
				483	free_cpumask_var(*pmask2);
				484	free_one:
				485	free_cpumask_var(*pmask1);
				486	return -ENOMEM;
				487	}
				488
				489	/**
				490	* free_cpumasks - free cpumasks in a tmpmasks structure
				491	* @cs: the cpuset that have cpumasks to be free.
				492	* @tmp: the tmpmasks structure pointer
				493	*/
				494	static inline void free_cpumasks(struct cpuset cs, struct tmpmasks tmp)
				495	{
				496	if (cs) {
				497	free_cpumask_var(cs->cpus_allowed);
				498	free_cpumask_var(cs->cpus_requested);
				499	free_cpumask_var(cs->effective_cpus);
				500	free_cpumask_var(cs->subparts_cpus);
				501	}
				502	if (tmp) {
				503	free_cpumask_var(tmp->new_cpus);
				504	free_cpumask_var(tmp->addmask);
				505	free_cpumask_var(tmp->delmask);
				506	}
				507	}
				508
				509	/**
				510	* alloc_trial_cpuset - allocate a trial cpuset
				511	* @cs: the cpuset that the trial cpuset duplicates
				512	*/
				513	static struct cpuset alloc_trial_cpuset(struct cpuset cs)
				514	{
				515	struct cpuset *trial;
				516
				517	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
				518	if (!trial)
				519	return NULL;
				520
				521	if (alloc_cpumasks(trial, NULL)) {
				522	kfree(trial);
				523	return NULL;
				524	}
				525
				526	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
				527	cpumask_copy(trial->cpus_requested, cs->cpus_requested);
				528	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
				529	return trial;
				530	}
				531
				532	/**
				533	* free_cpuset - free the cpuset
				534	* @cs: the cpuset to be freed
				535	*/
				536	static inline void free_cpuset(struct cpuset *cs)
				537	{
				538	free_cpumasks(cs, NULL);
				539	kfree(cs);
				540	}
				541
				542	/*
				543	* validate_change() - Used to validate that any proposed cpuset change
				544	* follows the structural rules for cpusets.
				545	*
				546	* If we replaced the flag and mask values of the current cpuset
				547	* (cur) with those values in the trial cpuset (trial), would
				548	* our various subset and exclusive rules still be valid? Presumes
				549	* cpuset_mutex held.
				550	*
				551	* 'cur' is the address of an actual, in-use cpuset. Operations
				552	* such as list traversal that depend on the actual address of the
				553	* cpuset in the list must use cur below, not trial.
				554	*
				555	* 'trial' is the address of bulk structure copy of cur, with
				556	* perhaps one or more of the fields cpus_allowed, mems_allowed,
				557	* or flags changed to new, trial values.
				558	*
				559	* Return 0 if valid, -errno if not.
				560	*/
				561
				562	static int validate_change(struct cpuset cur, struct cpuset trial)
				563	{
				564	struct cgroup_subsys_state *css;
				565	struct cpuset c, par;
				566	int ret;
				567
				568	rcu_read_lock();
				569
				570	/* Each of our child cpusets must be a subset of us */
				571	ret = -EBUSY;
				572	cpuset_for_each_child(c, css, cur)
				573	if (!is_cpuset_subset(c, trial))
				574	goto out;
				575
				576	/* Remaining checks don't apply to root cpuset */
				577	ret = 0;
				578	if (cur == &top_cpuset)
				579	goto out;
				580
				581	par = parent_cs(cur);
				582
				583	/* On legacy hiearchy, we must be a subset of our parent cpuset. */
				584	ret = -EACCES;
				585	if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
				586	goto out;
				587
				588	/*
				589	* If either I or some sibling (!= me) is exclusive, we can't
				590	* overlap
				591	*/
				592	ret = -EINVAL;
				593	cpuset_for_each_child(c, css, par) {
				594	if ((is_cpu_exclusive(trial) \|\| is_cpu_exclusive(c)) &&
				595	c != cur &&
				596	cpumask_intersects(trial->cpus_requested, c->cpus_requested))
				597	goto out;
				598	if ((is_mem_exclusive(trial) \|\| is_mem_exclusive(c)) &&
				599	c != cur &&
				600	nodes_intersects(trial->mems_allowed, c->mems_allowed))
				601	goto out;
				602	}
				603
				604	/*
				605	* Cpusets with tasks - existing or newly being attached - can't
				606	* be changed to have empty cpus_allowed or mems_allowed.
				607	*/
				608	ret = -ENOSPC;
				609	if ((cgroup_is_populated(cur->css.cgroup) \|\| cur->attach_in_progress)) {
				610	if (!cpumask_empty(cur->cpus_allowed) &&
				611	cpumask_empty(trial->cpus_allowed))
				612	goto out;
				613	if (!nodes_empty(cur->mems_allowed) &&
				614	nodes_empty(trial->mems_allowed))
				615	goto out;
				616	}
				617
				618	/*
				619	* We can't shrink if we won't have enough room for SCHED_DEADLINE
				620	* tasks.
				621	*/
				622	ret = -EBUSY;
				623	if (is_cpu_exclusive(cur) &&
				624	!cpuset_cpumask_can_shrink(cur->cpus_allowed,
				625	trial->cpus_allowed))
				626	goto out;
				627
				628	ret = 0;
				629	out:
				630	rcu_read_unlock();
				631	return ret;
				632	}
				633
				634	#ifdef CONFIG_SMP
				635	/*
				636	* Helper routine for generate_sched_domains().
				637	* Do cpusets a, b have overlapping effective cpus_allowed masks?
				638	*/
				639	static int cpusets_overlap(struct cpuset a, struct cpuset b)
				640	{
				641	return cpumask_intersects(a->effective_cpus, b->effective_cpus);
				642	}
				643
				644	static void
				645	update_domain_attr(struct sched_domain_attr dattr, struct cpuset c)
				646	{
				647	if (dattr->relax_domain_level < c->relax_domain_level)
				648	dattr->relax_domain_level = c->relax_domain_level;
				649	return;
				650	}
				651
				652	static void update_domain_attr_tree(struct sched_domain_attr *dattr,
				653	struct cpuset *root_cs)
				654	{
				655	struct cpuset *cp;
				656	struct cgroup_subsys_state *pos_css;
				657
				658	rcu_read_lock();
				659	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
				660	/* skip the whole subtree if @cp doesn't have any CPU */
				661	if (cpumask_empty(cp->cpus_allowed)) {
				662	pos_css = css_rightmost_descendant(pos_css);
				663	continue;
				664	}
				665
				666	if (is_sched_load_balance(cp))
				667	update_domain_attr(dattr, cp);
				668	}
				669	rcu_read_unlock();
				670	}
				671
				672	/* Must be called with cpuset_mutex held. */
				673	static inline int nr_cpusets(void)
				674	{
				675	/* jump label reference count + the top-level cpuset */
				676	return static_key_count(&cpusets_enabled_key.key) + 1;
				677	}
				678
				679	/*
				680	* generate_sched_domains()
				681	*
				682	* This function builds a partial partition of the systems CPUs
				683	* A 'partial partition' is a set of non-overlapping subsets whose
				684	* union is a subset of that set.
				685	* The output of this function needs to be passed to kernel/sched/core.c
				686	* partition_sched_domains() routine, which will rebuild the scheduler's
				687	* load balancing domains (sched domains) as specified by that partial
				688	* partition.
				689	*
				690	* See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
				691	* for a background explanation of this.
				692	*
				693	* Does not return errors, on the theory that the callers of this
				694	* routine would rather not worry about failures to rebuild sched
				695	* domains when operating in the severe memory shortage situations
				696	* that could cause allocation failures below.
				697	*
				698	* Must be called with cpuset_mutex held.
				699	*
				700	* The three key local variables below are:
				701	* cp - cpuset pointer, used (together with pos_css) to perform a
				702	* top-down scan of all cpusets. For our purposes, rebuilding
				703	* the schedulers sched domains, we can ignore !is_sched_load_
				704	* balance cpusets.
				705	* csa - (for CpuSet Array) Array of pointers to all the cpusets
				706	* that need to be load balanced, for convenient iterative
				707	* access by the subsequent code that finds the best partition,
				708	* i.e the set of domains (subsets) of CPUs such that the
				709	* cpus_allowed of every cpuset marked is_sched_load_balance
				710	* is a subset of one of these domains, while there are as
				711	* many such domains as possible, each as small as possible.
				712	* doms - Conversion of 'csa' to an array of cpumasks, for passing to
				713	* the kernel/sched/core.c routine partition_sched_domains() in a
				714	* convenient format, that can be easily compared to the prior
				715	* value to determine what partition elements (sched domains)
				716	* were changed (added or removed.)
				717	*
				718	* Finding the best partition (set of domains):
				719	* The triple nested loops below over i, j, k scan over the
				720	* load balanced cpusets (using the array of cpuset pointers in
				721	* csa[]) looking for pairs of cpusets that have overlapping
				722	* cpus_allowed, but which don't have the same 'pn' partition
				723	* number and gives them in the same partition number. It keeps
				724	* looping on the 'restart' label until it can no longer find
				725	* any such pairs.
				726	*
				727	* The union of the cpus_allowed masks from the set of
				728	* all cpusets having the same 'pn' value then form the one
				729	* element of the partition (one sched domain) to be passed to
				730	* partition_sched_domains().
				731	*/
				732	static int generate_sched_domains(cpumask_var_t **domains,
				733	struct sched_domain_attr **attributes)
				734	{
				735	struct cpuset cp; / top-down scan of cpusets */
				736	struct cpuset *csa; / array of all cpuset ptrs */
				737	int csn; /* how many cpuset ptrs in csa so far */
				738	int i, j, k; /* indices for partition finding loops */
				739	cpumask_var_t doms; / resulting partition; i.e. sched domains */
				740	struct sched_domain_attr dattr; / attributes for custom domains */
				741	int ndoms = 0; /* number of sched domains in result */
				742	int nslot; /* next empty doms[] struct cpumask slot */
				743	struct cgroup_subsys_state *pos_css;
				744	bool root_load_balance = is_sched_load_balance(&top_cpuset);
				745
				746	doms = NULL;
				747	dattr = NULL;
				748	csa = NULL;
				749
				750	/* Special case for the 99% of systems with one, full, sched domain */
				751	if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
				752	ndoms = 1;
				753	doms = alloc_sched_domains(ndoms);
				754	if (!doms)
				755	goto done;
				756
				757	dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
				758	if (dattr) {
				759	*dattr = SD_ATTR_INIT;
				760	update_domain_attr_tree(dattr, &top_cpuset);
				761	}
				762	cpumask_and(doms[0], top_cpuset.effective_cpus,
				763	housekeeping_cpumask(HK_FLAG_DOMAIN));
				764
				765	goto done;
				766	}
				767
				768	csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
				769	if (!csa)
				770	goto done;
				771	csn = 0;
				772
				773	rcu_read_lock();
				774	if (root_load_balance)
				775	csa[csn++] = &top_cpuset;
				776	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
				777	if (cp == &top_cpuset)
				778	continue;
				779	/*
				780	* Continue traversing beyond @cp iff @cp has some CPUs and
				781	* isn't load balancing. The former is obvious. The
				782	* latter: All child cpusets contain a subset of the
				783	* parent's cpus, so just skip them, and then we call
				784	* update_domain_attr_tree() to calc relax_domain_level of
				785	* the corresponding sched domain.
				786	*
				787	* If root is load-balancing, we can skip @cp if it
				788	* is a subset of the root's effective_cpus.
				789	*/
				790	if (!cpumask_empty(cp->cpus_allowed) &&
				791	!(is_sched_load_balance(cp) &&
				792	cpumask_intersects(cp->cpus_allowed,
				793	housekeeping_cpumask(HK_FLAG_DOMAIN))))
				794	continue;
				795
				796	if (root_load_balance &&
				797	cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
				798	continue;
				799
				800	if (is_sched_load_balance(cp) &&
				801	!cpumask_empty(cp->effective_cpus))
				802	csa[csn++] = cp;
				803
				804	/* skip @cp's subtree if not a partition root */
				805	if (!is_partition_root(cp))
				806	pos_css = css_rightmost_descendant(pos_css);
				807	}
				808	rcu_read_unlock();
				809
				810	for (i = 0; i < csn; i++)
				811	csa[i]->pn = i;
				812	ndoms = csn;
				813
				814	restart:
				815	/* Find the best partition (set of sched domains) */
				816	for (i = 0; i < csn; i++) {
				817	struct cpuset *a = csa[i];
				818	int apn = a->pn;
				819
				820	for (j = 0; j < csn; j++) {
				821	struct cpuset *b = csa[j];
				822	int bpn = b->pn;
				823
				824	if (apn != bpn && cpusets_overlap(a, b)) {
				825	for (k = 0; k < csn; k++) {
				826	struct cpuset *c = csa[k];
				827
				828	if (c->pn == bpn)
				829	c->pn = apn;
				830	}
				831	ndoms--; /* one less element */
				832	goto restart;
				833	}
				834	}
				835	}
				836
				837	/*
				838	* Now we know how many domains to create.
				839	* Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
				840	*/
				841	doms = alloc_sched_domains(ndoms);
				842	if (!doms)
				843	goto done;
				844
				845	/*
				846	* The rest of the code, including the scheduler, can deal with
				847	* dattr==NULL case. No need to abort if alloc fails.
				848	*/
				849	dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
				850	GFP_KERNEL);
				851
				852	for (nslot = 0, i = 0; i < csn; i++) {
				853	struct cpuset *a = csa[i];
				854	struct cpumask *dp;
				855	int apn = a->pn;
				856
				857	if (apn < 0) {
				858	/* Skip completed partitions */
				859	continue;
				860	}
				861
				862	dp = doms[nslot];
				863
				864	if (nslot == ndoms) {
				865	static int warnings = 10;
				866	if (warnings) {
				867	pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
				868	nslot, ndoms, csn, i, apn);
				869	warnings--;
				870	}
				871	continue;
				872	}
				873
				874	cpumask_clear(dp);
				875	if (dattr)
				876	*(dattr + nslot) = SD_ATTR_INIT;
				877	for (j = i; j < csn; j++) {
				878	struct cpuset *b = csa[j];
				879
				880	if (apn == b->pn) {
				881	cpumask_or(dp, dp, b->effective_cpus);
				882	cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
				883	if (dattr)
				884	update_domain_attr_tree(dattr + nslot, b);
				885
				886	/* Done with this partition */
				887	b->pn = -1;
				888	}
				889	}
				890	nslot++;
				891	}
				892	BUG_ON(nslot != ndoms);
				893
				894	done:
				895	kfree(csa);
				896
				897	/*
				898	* Fallback to the default domain if kmalloc() failed.
				899	* See comments in partition_sched_domains().
				900	*/
				901	if (doms == NULL)
				902	ndoms = 1;
				903
				904	*domains = doms;
				905	*attributes = dattr;
				906	return ndoms;
				907	}
				908
				909	static void update_tasks_root_domain(struct cpuset *cs)
				910	{
				911	struct css_task_iter it;
				912	struct task_struct *task;
				913
				914	css_task_iter_start(&cs->css, 0, &it);
				915
				916	while ((task = css_task_iter_next(&it)))
				917	dl_add_task_root_domain(task);
				918
				919	css_task_iter_end(&it);
				920	}
				921
				922	static void rebuild_root_domains(void)
				923	{
				924	struct cpuset *cs = NULL;
				925	struct cgroup_subsys_state *pos_css;
				926
				927	percpu_rwsem_assert_held(&cpuset_rwsem);
				928	lockdep_assert_cpus_held();
				929	lockdep_assert_held(&sched_domains_mutex);
				930
				931	cgroup_enable_task_cg_lists();
				932
				933	rcu_read_lock();
				934
				935	/*
				936	* Clear default root domain DL accounting, it will be computed again
				937	* if a task belongs to it.
				938	*/
				939	dl_clear_root_domain(&def_root_domain);
				940
				941	cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
				942
				943	if (cpumask_empty(cs->effective_cpus)) {
				944	pos_css = css_rightmost_descendant(pos_css);
				945	continue;
				946	}
				947
				948	css_get(&cs->css);
				949
				950	rcu_read_unlock();
				951
				952	update_tasks_root_domain(cs);
				953
				954	rcu_read_lock();
				955	css_put(&cs->css);
				956	}
				957	rcu_read_unlock();
				958	}
				959
				960	static void
				961	partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
				962	struct sched_domain_attr *dattr_new)
				963	{
				964	mutex_lock(&sched_domains_mutex);
				965	partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
				966	rebuild_root_domains();
				967	mutex_unlock(&sched_domains_mutex);
				968	}
				969
				970	/*
				971	* Rebuild scheduler domains.
				972	*
				973	* If the flag 'sched_load_balance' of any cpuset with non-empty
				974	* 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
				975	* which has that flag enabled, or if any cpuset with a non-empty
				976	* 'cpus' is removed, then call this routine to rebuild the
				977	* scheduler's dynamic sched domains.
				978	*
				979	* Call with cpuset_mutex held. Takes get_online_cpus().
				980	*/
				981	static void rebuild_sched_domains_locked(void)
				982	{
				983	struct cgroup_subsys_state *pos_css;
				984	struct sched_domain_attr *attr;
				985	cpumask_var_t *doms;
				986	struct cpuset *cs;
				987	int ndoms;
				988
				989	lockdep_assert_cpus_held();
				990	percpu_rwsem_assert_held(&cpuset_rwsem);
				991
				992	/*
				993	* If we have raced with CPU hotplug, return early to avoid
				994	* passing doms with offlined cpu to partition_sched_domains().
				995	* Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
				996	*
				997	* With no CPUs in any subpartitions, top_cpuset's effective CPUs
				998	* should be the same as the active CPUs, so checking only top_cpuset
				999	* is enough to detect racing CPU offlines.
				1000	*/
				1001	if (!top_cpuset.nr_subparts_cpus &&
				1002	!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
				1003	return;
				1004
				1005	/*
				1006	* With subpartition CPUs, however, the effective CPUs of a partition
				1007	* root should be only a subset of the active CPUs. Since a CPU in any
				1008	* partition root could be offlined, all must be checked.
				1009	*/
				1010	if (top_cpuset.nr_subparts_cpus) {
				1011	rcu_read_lock();
				1012	cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
				1013	if (!is_partition_root(cs)) {
				1014	pos_css = css_rightmost_descendant(pos_css);
				1015	continue;
				1016	}
				1017	if (!cpumask_subset(cs->effective_cpus,
				1018	cpu_active_mask)) {
				1019	rcu_read_unlock();
				1020	return;
				1021	}
				1022	}
				1023	rcu_read_unlock();
				1024	}
				1025
				1026	/* Generate domain masks and attrs */
				1027	ndoms = generate_sched_domains(&doms, &attr);
				1028
				1029	/* Have scheduler rebuild the domains */
				1030	partition_and_rebuild_sched_domains(ndoms, doms, attr);
				1031	}
				1032	#else /* !CONFIG_SMP */
				1033	static void rebuild_sched_domains_locked(void)
				1034	{
				1035	}
				1036	#endif /* CONFIG_SMP */
				1037
				1038	void rebuild_sched_domains(void)
				1039	{
				1040	get_online_cpus();
				1041	percpu_down_write(&cpuset_rwsem);
				1042	rebuild_sched_domains_locked();
				1043	percpu_up_write(&cpuset_rwsem);
				1044	put_online_cpus();
				1045	}
				1046
				1047	/**
				1048	* update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
				1049	* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
				1050	*
				1051	* Iterate through each task of @cs updating its cpus_allowed to the
				1052	* effective cpuset's. As this function is called with cpuset_mutex held,
				1053	* cpuset membership stays stable.
				1054	*/
				1055	static void update_tasks_cpumask(struct cpuset *cs)
				1056	{
				1057	struct css_task_iter it;
				1058	struct task_struct *task;
				1059	bool top_cs = cs == &top_cpuset;
				1060
				1061	css_task_iter_start(&cs->css, 0, &it);
				1062	while ((task = css_task_iter_next(&it))) {
				1063	/*
				1064	* Percpu kthreads in top_cpuset are ignored
				1065	*/
				1066	if (top_cs && (task->flags & PF_KTHREAD) &&
				1067	kthread_is_per_cpu(task))
				1068	continue;
				1069	set_cpus_allowed_ptr(task, cs->effective_cpus);
				1070	}
				1071	css_task_iter_end(&it);
				1072	}
				1073
				1074	/**
				1075	* compute_effective_cpumask - Compute the effective cpumask of the cpuset
				1076	* @new_cpus: the temp variable for the new effective_cpus mask
				1077	* @cs: the cpuset the need to recompute the new effective_cpus mask
				1078	* @parent: the parent cpuset
				1079	*
				1080	* If the parent has subpartition CPUs, include them in the list of
				1081	* allowable CPUs in computing the new effective_cpus mask. Since offlined
				1082	* CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
				1083	* to mask those out.
				1084	*/
				1085	static void compute_effective_cpumask(struct cpumask *new_cpus,
				1086	struct cpuset cs, struct cpuset parent)
				1087	{
				1088	if (parent->nr_subparts_cpus) {
				1089	cpumask_or(new_cpus, parent->effective_cpus,
				1090	parent->subparts_cpus);
				1091	cpumask_and(new_cpus, new_cpus, cs->cpus_requested);
				1092	cpumask_and(new_cpus, new_cpus, cpu_active_mask);
				1093	} else {
				1094	cpumask_and(new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus);
				1095	}
				1096	}
				1097
				1098	/*
				1099	* Commands for update_parent_subparts_cpumask
				1100	*/
				1101	enum subparts_cmd {
				1102	partcmd_enable, /* Enable partition root */
				1103	partcmd_disable, /* Disable partition root */
				1104	partcmd_update, /* Update parent's subparts_cpus */
				1105	};
				1106
				1107	/**
				1108	* update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
				1109	* @cpuset: The cpuset that requests change in partition root state
				1110	* @cmd: Partition root state change command
				1111	* @newmask: Optional new cpumask for partcmd_update
				1112	* @tmp: Temporary addmask and delmask
				1113	* Return: 0, 1 or an error code
				1114	*
				1115	* For partcmd_enable, the cpuset is being transformed from a non-partition
				1116	* root to a partition root. The cpus_allowed mask of the given cpuset will
				1117	* be put into parent's subparts_cpus and taken away from parent's
				1118	* effective_cpus. The function will return 0 if all the CPUs listed in
				1119	* cpus_allowed can be granted or an error code will be returned.
				1120	*
				1121	* For partcmd_disable, the cpuset is being transofrmed from a partition
				1122	* root back to a non-partition root. any CPUs in cpus_allowed that are in
				1123	* parent's subparts_cpus will be taken away from that cpumask and put back
				1124	* into parent's effective_cpus. 0 should always be returned.
				1125	*
				1126	* For partcmd_update, if the optional newmask is specified, the cpu
				1127	* list is to be changed from cpus_allowed to newmask. Otherwise,
				1128	* cpus_allowed is assumed to remain the same. The cpuset should either
				1129	* be a partition root or an invalid partition root. The partition root
				1130	* state may change if newmask is NULL and none of the requested CPUs can
				1131	* be granted by the parent. The function will return 1 if changes to
				1132	* parent's subparts_cpus and effective_cpus happen or 0 otherwise.
				1133	* Error code should only be returned when newmask is non-NULL.
				1134	*
				1135	* The partcmd_enable and partcmd_disable commands are used by
				1136	* update_prstate(). The partcmd_update command is used by
				1137	* update_cpumasks_hier() with newmask NULL and update_cpumask() with
				1138	* newmask set.
				1139	*
				1140	* The checking is more strict when enabling partition root than the
				1141	* other two commands.
				1142	*
				1143	* Because of the implicit cpu exclusive nature of a partition root,
				1144	* cpumask changes that violates the cpu exclusivity rule will not be
				1145	* permitted when checked by validate_change(). The validate_change()
				1146	* function will also prevent any changes to the cpu list if it is not
				1147	* a superset of children's cpu lists.
				1148	*/
				1149	static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
				1150	struct cpumask *newmask,
				1151	struct tmpmasks *tmp)
				1152	{
				1153	struct cpuset *parent = parent_cs(cpuset);
				1154	int adding; /* Moving cpus from effective_cpus to subparts_cpus */
				1155	int deleting; /* Moving cpus from subparts_cpus to effective_cpus */
				1156	bool part_error = false; /* Partition error? */
				1157
				1158	percpu_rwsem_assert_held(&cpuset_rwsem);
				1159
				1160	/*
				1161	* The parent must be a partition root.
				1162	* The new cpumask, if present, or the current cpus_allowed must
				1163	* not be empty.
				1164	*/
				1165	if (!is_partition_root(parent) \|\|
				1166	(newmask && cpumask_empty(newmask)) \|\|
				1167	(!newmask && cpumask_empty(cpuset->cpus_allowed)))
				1168	return -EINVAL;
				1169
				1170	/*
				1171	* Enabling/disabling partition root is not allowed if there are
				1172	* online children.
				1173	*/
				1174	if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
				1175	return -EBUSY;
				1176
				1177	/*
				1178	* Enabling partition root is not allowed if not all the CPUs
				1179	* can be granted from parent's effective_cpus or at least one
				1180	* CPU will be left after that.
				1181	*/
				1182	if ((cmd == partcmd_enable) &&
				1183	(!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) \|\|
				1184	cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
				1185	return -EINVAL;
				1186
				1187	/*
				1188	* A cpumask update cannot make parent's effective_cpus become empty.
				1189	*/
				1190	adding = deleting = false;
				1191	if (cmd == partcmd_enable) {
				1192	cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
				1193	adding = true;
				1194	} else if (cmd == partcmd_disable) {
				1195	deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
				1196	parent->subparts_cpus);
				1197	} else if (newmask) {
				1198	/*
				1199	* partcmd_update with newmask:
				1200	*
				1201	* delmask = cpus_allowed & ~newmask & parent->subparts_cpus
				1202	* addmask = newmask & parent->effective_cpus
				1203	* & ~parent->subparts_cpus
				1204	*/
				1205	cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
				1206	deleting = cpumask_and(tmp->delmask, tmp->delmask,
				1207	parent->subparts_cpus);
				1208
				1209	cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
				1210	adding = cpumask_andnot(tmp->addmask, tmp->addmask,
				1211	parent->subparts_cpus);
				1212	/*
				1213	* Return error if the new effective_cpus could become empty.
				1214	*/
				1215	if (adding &&
				1216	cpumask_equal(parent->effective_cpus, tmp->addmask)) {
				1217	if (!deleting)
				1218	return -EINVAL;
				1219	/*
				1220	* As some of the CPUs in subparts_cpus might have
				1221	* been offlined, we need to compute the real delmask
				1222	* to confirm that.
				1223	*/
				1224	if (!cpumask_and(tmp->addmask, tmp->delmask,
				1225	cpu_active_mask))
				1226	return -EINVAL;
				1227	cpumask_copy(tmp->addmask, parent->effective_cpus);
				1228	}
				1229	} else {
				1230	/*
				1231	* partcmd_update w/o newmask:
				1232	*
				1233	* addmask = cpus_allowed & parent->effectiveb_cpus
				1234	*
				1235	* Note that parent's subparts_cpus may have been
				1236	* pre-shrunk in case there is a change in the cpu list.
				1237	* So no deletion is needed.
				1238	*/
				1239	adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
				1240	parent->effective_cpus);
				1241	part_error = cpumask_equal(tmp->addmask,
				1242	parent->effective_cpus);
				1243	}
				1244
				1245	if (cmd == partcmd_update) {
				1246	int prev_prs = cpuset->partition_root_state;
				1247
				1248	/*
				1249	* Check for possible transition between PRS_ENABLED
				1250	* and PRS_ERROR.
				1251	*/
				1252	switch (cpuset->partition_root_state) {
				1253	case PRS_ENABLED:
				1254	if (part_error)
				1255	cpuset->partition_root_state = PRS_ERROR;
				1256	break;
				1257	case PRS_ERROR:
				1258	if (!part_error)
				1259	cpuset->partition_root_state = PRS_ENABLED;
				1260	break;
				1261	}
				1262	/*
				1263	* Set part_error if previously in invalid state.
				1264	*/
				1265	part_error = (prev_prs == PRS_ERROR);
				1266	}
				1267
				1268	if (!part_error && (cpuset->partition_root_state == PRS_ERROR))
				1269	return 0; /* Nothing need to be done */
				1270
				1271	if (cpuset->partition_root_state == PRS_ERROR) {
				1272	/*
				1273	* Remove all its cpus from parent's subparts_cpus.
				1274	*/
				1275	adding = false;
				1276	deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
				1277	parent->subparts_cpus);
				1278	}
				1279
				1280	if (!adding && !deleting)
				1281	return 0;
				1282
				1283	/*
				1284	* Change the parent's subparts_cpus.
				1285	* Newly added CPUs will be removed from effective_cpus and
				1286	* newly deleted ones will be added back to effective_cpus.
				1287	*/
				1288	spin_lock_irq(&callback_lock);
				1289	if (adding) {
				1290	cpumask_or(parent->subparts_cpus,
				1291	parent->subparts_cpus, tmp->addmask);
				1292	cpumask_andnot(parent->effective_cpus,
				1293	parent->effective_cpus, tmp->addmask);
				1294	}
				1295	if (deleting) {
				1296	cpumask_andnot(parent->subparts_cpus,
				1297	parent->subparts_cpus, tmp->delmask);
				1298	/*
				1299	* Some of the CPUs in subparts_cpus might have been offlined.
				1300	*/
				1301	cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
				1302	cpumask_or(parent->effective_cpus,
				1303	parent->effective_cpus, tmp->delmask);
				1304	}
				1305
				1306	parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
				1307	spin_unlock_irq(&callback_lock);
				1308
				1309	return cmd == partcmd_update;
				1310	}
				1311
				1312	/*
				1313	* update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
				1314	* @cs: the cpuset to consider
				1315	* @tmp: temp variables for calculating effective_cpus & partition setup
				1316	*
				1317	* When congifured cpumask is changed, the effective cpumasks of this cpuset
				1318	* and all its descendants need to be updated.
				1319	*
				1320	* On legacy hierachy, effective_cpus will be the same with cpu_allowed.
				1321	*
				1322	* Called with cpuset_mutex held
				1323	*/
				1324	static void update_cpumasks_hier(struct cpuset cs, struct tmpmasks tmp)
				1325	{
				1326	struct cpuset *cp;
				1327	struct cgroup_subsys_state *pos_css;
				1328	bool need_rebuild_sched_domains = false;
				1329
				1330	rcu_read_lock();
				1331	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
				1332	struct cpuset *parent = parent_cs(cp);
				1333
				1334	compute_effective_cpumask(tmp->new_cpus, cp, parent);
				1335
				1336	/*
				1337	* If it becomes empty, inherit the effective mask of the
				1338	* parent, which is guaranteed to have some CPUs.
				1339	*/
				1340	if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
				1341	cpumask_copy(tmp->new_cpus, parent->effective_cpus);
				1342	if (!cp->use_parent_ecpus) {
				1343	cp->use_parent_ecpus = true;
				1344	parent->child_ecpus_count++;
				1345	}
				1346	} else if (cp->use_parent_ecpus) {
				1347	cp->use_parent_ecpus = false;
				1348	WARN_ON_ONCE(!parent->child_ecpus_count);
				1349	parent->child_ecpus_count--;
				1350	}
				1351
				1352	/*
				1353	* Skip the whole subtree if the cpumask remains the same
				1354	* and has no partition root state.
				1355	*/
				1356	if (!cp->partition_root_state &&
				1357	cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
				1358	pos_css = css_rightmost_descendant(pos_css);
				1359	continue;
				1360	}
				1361
				1362	/*
				1363	* update_parent_subparts_cpumask() should have been called
				1364	* for cs already in update_cpumask(). We should also call
				1365	* update_tasks_cpumask() again for tasks in the parent
				1366	* cpuset if the parent's subparts_cpus changes.
				1367	*/
				1368	if ((cp != cs) && cp->partition_root_state) {
				1369	switch (parent->partition_root_state) {
				1370	case PRS_DISABLED:
				1371	/*
				1372	* If parent is not a partition root or an
				1373	* invalid partition root, clear the state
				1374	* state and the CS_CPU_EXCLUSIVE flag.
				1375	*/
				1376	WARN_ON_ONCE(cp->partition_root_state
				1377	!= PRS_ERROR);
				1378	cp->partition_root_state = 0;
				1379
				1380	/*
				1381	* clear_bit() is an atomic operation and
				1382	* readers aren't interested in the state
				1383	* of CS_CPU_EXCLUSIVE anyway. So we can
				1384	* just update the flag without holding
				1385	* the callback_lock.
				1386	*/
				1387	clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
				1388	break;
				1389
				1390	case PRS_ENABLED:
				1391	if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
				1392	update_tasks_cpumask(parent);
				1393	break;
				1394
				1395	case PRS_ERROR:
				1396	/*
				1397	* When parent is invalid, it has to be too.
				1398	*/
				1399	cp->partition_root_state = PRS_ERROR;
				1400	if (cp->nr_subparts_cpus) {
				1401	cp->nr_subparts_cpus = 0;
				1402	cpumask_clear(cp->subparts_cpus);
				1403	}
				1404	break;
				1405	}
				1406	}
				1407
				1408	if (!css_tryget_online(&cp->css))
				1409	continue;
				1410	rcu_read_unlock();
				1411
				1412	spin_lock_irq(&callback_lock);
				1413
				1414	cpumask_copy(cp->effective_cpus, tmp->new_cpus);
				1415	if (cp->nr_subparts_cpus &&
				1416	(cp->partition_root_state != PRS_ENABLED)) {
				1417	cp->nr_subparts_cpus = 0;
				1418	cpumask_clear(cp->subparts_cpus);
				1419	} else if (cp->nr_subparts_cpus) {
				1420	/*
				1421	* Make sure that effective_cpus & subparts_cpus
				1422	* are mutually exclusive.
				1423	*
				1424	* In the unlikely event that effective_cpus
				1425	* becomes empty. we clear cp->nr_subparts_cpus and
				1426	* let its child partition roots to compete for
				1427	* CPUs again.
				1428	*/
				1429	cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
				1430	cp->subparts_cpus);
				1431	if (cpumask_empty(cp->effective_cpus)) {
				1432	cpumask_copy(cp->effective_cpus, tmp->new_cpus);
				1433	cpumask_clear(cp->subparts_cpus);
				1434	cp->nr_subparts_cpus = 0;
				1435	} else if (!cpumask_subset(cp->subparts_cpus,
				1436	tmp->new_cpus)) {
				1437	cpumask_andnot(cp->subparts_cpus,
				1438	cp->subparts_cpus, tmp->new_cpus);
				1439	cp->nr_subparts_cpus
				1440	= cpumask_weight(cp->subparts_cpus);
				1441	}
				1442	}
				1443	spin_unlock_irq(&callback_lock);
				1444
				1445	WARN_ON(!is_in_v2_mode() &&
				1446	!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
				1447
				1448	update_tasks_cpumask(cp);
				1449
				1450	/*
				1451	* On legacy hierarchy, if the effective cpumask of any non-
				1452	* empty cpuset is changed, we need to rebuild sched domains.
				1453	* On default hierarchy, the cpuset needs to be a partition
				1454	* root as well.
				1455	*/
				1456	if (!cpumask_empty(cp->cpus_allowed) &&
				1457	is_sched_load_balance(cp) &&
				1458	(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) \|\|
				1459	is_partition_root(cp)))
				1460	need_rebuild_sched_domains = true;
				1461
				1462	rcu_read_lock();
				1463	css_put(&cp->css);
				1464	}
				1465	rcu_read_unlock();
				1466
				1467	if (need_rebuild_sched_domains)
				1468	rebuild_sched_domains_locked();
				1469	}
				1470
				1471	/**
				1472	* update_sibling_cpumasks - Update siblings cpumasks
				1473	* @parent: Parent cpuset
				1474	* @cs: Current cpuset
				1475	* @tmp: Temp variables
				1476	*/
				1477	static void update_sibling_cpumasks(struct cpuset parent, struct cpuset cs,
				1478	struct tmpmasks *tmp)
				1479	{
				1480	struct cpuset *sibling;
				1481	struct cgroup_subsys_state *pos_css;
				1482
				1483	percpu_rwsem_assert_held(&cpuset_rwsem);
				1484
				1485	/*
				1486	* Check all its siblings and call update_cpumasks_hier()
				1487	* if their use_parent_ecpus flag is set in order for them
				1488	* to use the right effective_cpus value.
				1489	*
				1490	* The update_cpumasks_hier() function may sleep. So we have to
				1491	* release the RCU read lock before calling it.
				1492	*/
				1493	rcu_read_lock();
				1494	cpuset_for_each_child(sibling, pos_css, parent) {
				1495	if (sibling == cs)
				1496	continue;
				1497	if (!sibling->use_parent_ecpus)
				1498	continue;
				1499	if (!css_tryget_online(&sibling->css))
				1500	continue;
				1501
				1502	rcu_read_unlock();
				1503	update_cpumasks_hier(sibling, tmp);
				1504	rcu_read_lock();
				1505	css_put(&sibling->css);
				1506	}
				1507	rcu_read_unlock();
				1508	}
				1509
				1510	/**
				1511	* update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
				1512	* @cs: the cpuset to consider
				1513	* @trialcs: trial cpuset
				1514	* @buf: buffer of cpu numbers written to this cpuset
				1515	*/
				1516	static int update_cpumask(struct cpuset cs, struct cpuset trialcs,
				1517	const char *buf)
				1518	{
				1519	int retval;
				1520	struct tmpmasks tmp;
				1521
				1522	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
				1523	if (cs == &top_cpuset)
				1524	return -EACCES;
				1525
				1526	/*
				1527	* An empty cpus_requested is ok only if the cpuset has no tasks.
				1528	* Since cpulist_parse() fails on an empty mask, we special case
				1529	* that parsing. The validate_change() call ensures that cpusets
				1530	* with tasks have cpus.
				1531	*/
				1532	if (!*buf) {
				1533	cpumask_clear(trialcs->cpus_requested);
				1534	} else {
				1535	retval = cpulist_parse(buf, trialcs->cpus_requested);
				1536	if (retval < 0)
				1537	return retval;
				1538	}
				1539
				1540	if (!cpumask_subset(trialcs->cpus_requested, cpu_present_mask))
				1541	return -EINVAL;
				1542
				1543	cpumask_and(trialcs->cpus_allowed, trialcs->cpus_requested, cpu_active_mask);
				1544
				1545	/* Nothing to do if the cpus didn't change */
				1546	if (cpumask_equal(cs->cpus_requested, trialcs->cpus_requested))
				1547	return 0;
				1548
				1549	retval = validate_change(cs, trialcs);
				1550	if (retval < 0)
				1551	return retval;
				1552
				1553	#ifdef CONFIG_CPUMASK_OFFSTACK
				1554	/*
				1555	* Use the cpumasks in trialcs for tmpmasks when they are pointers
				1556	* to allocated cpumasks.
				1557	*/
				1558	tmp.addmask = trialcs->subparts_cpus;
				1559	tmp.delmask = trialcs->effective_cpus;
				1560	tmp.new_cpus = trialcs->cpus_allowed;
				1561	#endif
				1562
				1563	if (cs->partition_root_state) {
				1564	/* Cpumask of a partition root cannot be empty */
				1565	if (cpumask_empty(trialcs->cpus_allowed))
				1566	return -EINVAL;
				1567	if (update_parent_subparts_cpumask(cs, partcmd_update,
				1568	trialcs->cpus_allowed, &tmp) < 0)
				1569	return -EINVAL;
				1570	}
				1571
				1572	spin_lock_irq(&callback_lock);
				1573	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
				1574	cpumask_copy(cs->cpus_requested, trialcs->cpus_requested);
				1575
				1576	/*
				1577	* Make sure that subparts_cpus is a subset of cpus_allowed.
				1578	*/
				1579	if (cs->nr_subparts_cpus) {
				1580	cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed);
				1581	cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
				1582	}
				1583	spin_unlock_irq(&callback_lock);
				1584
				1585	update_cpumasks_hier(cs, &tmp);
				1586
				1587	if (cs->partition_root_state) {
				1588	struct cpuset *parent = parent_cs(cs);
				1589
				1590	/*
				1591	* For partition root, update the cpumasks of sibling
				1592	* cpusets if they use parent's effective_cpus.
				1593	*/
				1594	if (parent->child_ecpus_count)
				1595	update_sibling_cpumasks(parent, cs, &tmp);
				1596	}
				1597	return 0;
				1598	}
				1599
				1600	/*
				1601	* Migrate memory region from one set of nodes to another. This is
				1602	* performed asynchronously as it can be called from process migration path
				1603	* holding locks involved in process management. All mm migrations are
				1604	* performed in the queued order and can be waited for by flushing
				1605	* cpuset_migrate_mm_wq.
				1606	*/
				1607
				1608	struct cpuset_migrate_mm_work {
				1609	struct work_struct work;
				1610	struct mm_struct *mm;
				1611	nodemask_t from;
				1612	nodemask_t to;
				1613	};
				1614
				1615	static void cpuset_migrate_mm_workfn(struct work_struct *work)
				1616	{
				1617	struct cpuset_migrate_mm_work *mwork =
				1618	container_of(work, struct cpuset_migrate_mm_work, work);
				1619
				1620	/* on a wq worker, no need to worry about %current's mems_allowed */
				1621	do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
				1622	mmput(mwork->mm);
				1623	kfree(mwork);
				1624	}
				1625
				1626	static void cpuset_migrate_mm(struct mm_struct mm, const nodemask_t from,
				1627	const nodemask_t *to)
				1628	{
				1629	struct cpuset_migrate_mm_work *mwork;
				1630
				1631	mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
				1632	if (mwork) {
				1633	mwork->mm = mm;
				1634	mwork->from = *from;
				1635	mwork->to = *to;
				1636	INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
				1637	queue_work(cpuset_migrate_mm_wq, &mwork->work);
				1638	} else {
				1639	mmput(mm);
				1640	}
				1641	}
				1642
				1643	static void cpuset_post_attach(void)
				1644	{
				1645	flush_workqueue(cpuset_migrate_mm_wq);
				1646	}
				1647
				1648	/*
				1649	* cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
				1650	* @tsk: the task to change
				1651	* @newmems: new nodes that the task will be set
				1652	*
				1653	* We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
				1654	* and rebind an eventual tasks' mempolicy. If the task is allocating in
				1655	* parallel, it might temporarily see an empty intersection, which results in
				1656	* a seqlock check and retry before OOM or allocation failure.
				1657	*/
				1658	static void cpuset_change_task_nodemask(struct task_struct *tsk,
				1659	nodemask_t *newmems)
				1660	{
				1661	task_lock(tsk);
				1662
				1663	local_irq_disable();
				1664	write_seqcount_begin(&tsk->mems_allowed_seq);
				1665
				1666	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
				1667	mpol_rebind_task(tsk, newmems);
				1668	tsk->mems_allowed = *newmems;
				1669
				1670	write_seqcount_end(&tsk->mems_allowed_seq);
				1671	local_irq_enable();
				1672
				1673	task_unlock(tsk);
				1674	}
				1675
				1676	static void *cpuset_being_rebound;
				1677
				1678	/**
				1679	* update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
				1680	* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
				1681	*
				1682	* Iterate through each task of @cs updating its mems_allowed to the
				1683	* effective cpuset's. As this function is called with cpuset_mutex held,
				1684	* cpuset membership stays stable.
				1685	*/
				1686	static void update_tasks_nodemask(struct cpuset *cs)
				1687	{
				1688	static nodemask_t newmems; /* protected by cpuset_mutex */
				1689	struct css_task_iter it;
				1690	struct task_struct *task;
				1691
				1692	cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
				1693
				1694	guarantee_online_mems(cs, &newmems);
				1695
				1696	/*
				1697	* The mpol_rebind_mm() call takes mmap_sem, which we couldn't
				1698	* take while holding tasklist_lock. Forks can happen - the
				1699	* mpol_dup() cpuset_being_rebound check will catch such forks,
				1700	* and rebind their vma mempolicies too. Because we still hold
				1701	* the global cpuset_mutex, we know that no other rebind effort
				1702	* will be contending for the global variable cpuset_being_rebound.
				1703	* It's ok if we rebind the same mm twice; mpol_rebind_mm()
				1704	* is idempotent. Also migrate pages in each mm to new nodes.
				1705	*/
				1706	css_task_iter_start(&cs->css, 0, &it);
				1707	while ((task = css_task_iter_next(&it))) {
				1708	struct mm_struct *mm;
				1709	bool migrate;
				1710
				1711	cpuset_change_task_nodemask(task, &newmems);
				1712
				1713	mm = get_task_mm(task);
				1714	if (!mm)
				1715	continue;
				1716
				1717	migrate = is_memory_migrate(cs);
				1718
				1719	mpol_rebind_mm(mm, &cs->mems_allowed);
				1720	if (migrate)
				1721	cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
				1722	else
				1723	mmput(mm);
				1724	}
				1725	css_task_iter_end(&it);
				1726
				1727	/*
				1728	* All the tasks' nodemasks have been updated, update
				1729	* cs->old_mems_allowed.
				1730	*/
				1731	cs->old_mems_allowed = newmems;
				1732
				1733	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
				1734	cpuset_being_rebound = NULL;
				1735	}
				1736
				1737	/*
				1738	* update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
				1739	* @cs: the cpuset to consider
				1740	* @new_mems: a temp variable for calculating new effective_mems
				1741	*
				1742	* When configured nodemask is changed, the effective nodemasks of this cpuset
				1743	* and all its descendants need to be updated.
				1744	*
				1745	* On legacy hiearchy, effective_mems will be the same with mems_allowed.
				1746	*
				1747	* Called with cpuset_mutex held
				1748	*/
				1749	static void update_nodemasks_hier(struct cpuset cs, nodemask_t new_mems)
				1750	{
				1751	struct cpuset *cp;
				1752	struct cgroup_subsys_state *pos_css;
				1753
				1754	rcu_read_lock();
				1755	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
				1756	struct cpuset *parent = parent_cs(cp);
				1757
				1758	nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
				1759
				1760	/*
				1761	* If it becomes empty, inherit the effective mask of the
				1762	* parent, which is guaranteed to have some MEMs.
				1763	*/
				1764	if (is_in_v2_mode() && nodes_empty(*new_mems))
				1765	*new_mems = parent->effective_mems;
				1766
				1767	/* Skip the whole subtree if the nodemask remains the same. */
				1768	if (nodes_equal(*new_mems, cp->effective_mems)) {
				1769	pos_css = css_rightmost_descendant(pos_css);
				1770	continue;
				1771	}
				1772
				1773	if (!css_tryget_online(&cp->css))
				1774	continue;
				1775	rcu_read_unlock();
				1776
				1777	spin_lock_irq(&callback_lock);
				1778	cp->effective_mems = *new_mems;
				1779	spin_unlock_irq(&callback_lock);
				1780
				1781	WARN_ON(!is_in_v2_mode() &&
				1782	!nodes_equal(cp->mems_allowed, cp->effective_mems));
				1783
				1784	update_tasks_nodemask(cp);
				1785
				1786	rcu_read_lock();
				1787	css_put(&cp->css);
				1788	}
				1789	rcu_read_unlock();
				1790	}
				1791
				1792	/*
				1793	* Handle user request to change the 'mems' memory placement
				1794	* of a cpuset. Needs to validate the request, update the
				1795	* cpusets mems_allowed, and for each task in the cpuset,
				1796	* update mems_allowed and rebind task's mempolicy and any vma
				1797	* mempolicies and if the cpuset is marked 'memory_migrate',
				1798	* migrate the tasks pages to the new memory.
				1799	*
				1800	* Call with cpuset_mutex held. May take callback_lock during call.
				1801	* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
				1802	* lock each such tasks mm->mmap_sem, scan its vma's and rebind
				1803	* their mempolicies to the cpusets new mems_allowed.
				1804	*/
				1805	static int update_nodemask(struct cpuset cs, struct cpuset trialcs,
				1806	const char *buf)
				1807	{
				1808	int retval;
				1809
				1810	/*
				1811	* top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
				1812	* it's read-only
				1813	*/
				1814	if (cs == &top_cpuset) {
				1815	retval = -EACCES;
				1816	goto done;
				1817	}
				1818
				1819	/*
				1820	* An empty mems_allowed is ok iff there are no tasks in the cpuset.
				1821	* Since nodelist_parse() fails on an empty mask, we special case
				1822	* that parsing. The validate_change() call ensures that cpusets
				1823	* with tasks have memory.
				1824	*/
				1825	if (!*buf) {
				1826	nodes_clear(trialcs->mems_allowed);
				1827	} else {
				1828	retval = nodelist_parse(buf, trialcs->mems_allowed);
				1829	if (retval < 0)
				1830	goto done;
				1831
				1832	if (!nodes_subset(trialcs->mems_allowed,
				1833	top_cpuset.mems_allowed)) {
				1834	retval = -EINVAL;
				1835	goto done;
				1836	}
				1837	}
				1838
				1839	if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
				1840	retval = 0; /* Too easy - nothing to do */
				1841	goto done;
				1842	}
				1843	retval = validate_change(cs, trialcs);
				1844	if (retval < 0)
				1845	goto done;
				1846
				1847	spin_lock_irq(&callback_lock);
				1848	cs->mems_allowed = trialcs->mems_allowed;
				1849	spin_unlock_irq(&callback_lock);
				1850
				1851	/* use trialcs->mems_allowed as a temp variable */
				1852	update_nodemasks_hier(cs, &trialcs->mems_allowed);
				1853	done:
				1854	return retval;
				1855	}
				1856
				1857	bool current_cpuset_is_being_rebound(void)
				1858	{
				1859	bool ret;
				1860
				1861	rcu_read_lock();
				1862	ret = task_cs(current) == cpuset_being_rebound;
				1863	rcu_read_unlock();
				1864
				1865	return ret;
				1866	}
				1867
				1868	static int update_relax_domain_level(struct cpuset *cs, s64 val)
				1869	{
				1870	#ifdef CONFIG_SMP
				1871	if (val < -1 \|\| val > sched_domain_level_max + 1)
				1872	return -EINVAL;
				1873	#endif
				1874
				1875	if (val != cs->relax_domain_level) {
				1876	cs->relax_domain_level = val;
				1877	if (!cpumask_empty(cs->cpus_allowed) &&
				1878	is_sched_load_balance(cs))
				1879	rebuild_sched_domains_locked();
				1880	}
				1881
				1882	return 0;
				1883	}
				1884
				1885	/**
				1886	* update_tasks_flags - update the spread flags of tasks in the cpuset.
				1887	* @cs: the cpuset in which each task's spread flags needs to be changed
				1888	*
				1889	* Iterate through each task of @cs updating its spread flags. As this
				1890	* function is called with cpuset_mutex held, cpuset membership stays
				1891	* stable.
				1892	*/
				1893	static void update_tasks_flags(struct cpuset *cs)
				1894	{
				1895	struct css_task_iter it;
				1896	struct task_struct *task;
				1897
				1898	css_task_iter_start(&cs->css, 0, &it);
				1899	while ((task = css_task_iter_next(&it)))
				1900	cpuset_update_task_spread_flag(cs, task);
				1901	css_task_iter_end(&it);
				1902	}
				1903
				1904	/*
				1905	* update_flag - read a 0 or a 1 in a file and update associated flag
				1906	* bit: the bit to update (see cpuset_flagbits_t)
				1907	* cs: the cpuset to update
				1908	* turning_on: whether the flag is being set or cleared
				1909	*
				1910	* Call with cpuset_mutex held.
				1911	*/
				1912
				1913	static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
				1914	int turning_on)
				1915	{
				1916	struct cpuset *trialcs;
				1917	int balance_flag_changed;
				1918	int spread_flag_changed;
				1919	int err;
				1920
				1921	trialcs = alloc_trial_cpuset(cs);
				1922	if (!trialcs)
				1923	return -ENOMEM;
				1924
				1925	if (turning_on)
				1926	set_bit(bit, &trialcs->flags);
				1927	else
				1928	clear_bit(bit, &trialcs->flags);
				1929
				1930	err = validate_change(cs, trialcs);
				1931	if (err < 0)
				1932	goto out;
				1933
				1934	balance_flag_changed = (is_sched_load_balance(cs) !=
				1935	is_sched_load_balance(trialcs));
				1936
				1937	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
				1938	\|\| (is_spread_page(cs) != is_spread_page(trialcs)));
				1939
				1940	spin_lock_irq(&callback_lock);
				1941	cs->flags = trialcs->flags;
				1942	spin_unlock_irq(&callback_lock);
				1943
				1944	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
				1945	rebuild_sched_domains_locked();
				1946
				1947	if (spread_flag_changed)
				1948	update_tasks_flags(cs);
				1949	out:
				1950	free_cpuset(trialcs);
				1951	return err;
				1952	}
				1953
				1954	/*
				1955	* update_prstate - update partititon_root_state
				1956	* cs: the cpuset to update
				1957	* val: 0 - disabled, 1 - enabled
				1958	*
				1959	* Call with cpuset_mutex held.
				1960	*/
				1961	static int update_prstate(struct cpuset *cs, int val)
				1962	{
				1963	int err;
				1964	struct cpuset *parent = parent_cs(cs);
				1965	struct tmpmasks tmp;
				1966
				1967	if ((val != 0) && (val != 1))
				1968	return -EINVAL;
				1969	if (val == cs->partition_root_state)
				1970	return 0;
				1971
				1972	/*
				1973	* Cannot force a partial or invalid partition root to a full
				1974	* partition root.
				1975	*/
				1976	if (val && cs->partition_root_state)
				1977	return -EINVAL;
				1978
				1979	if (alloc_cpumasks(NULL, &tmp))
				1980	return -ENOMEM;
				1981
				1982	err = -EINVAL;
				1983	if (!cs->partition_root_state) {
				1984	/*
				1985	* Turning on partition root requires setting the
				1986	* CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
				1987	* cannot be NULL.
				1988	*/
				1989	if (cpumask_empty(cs->cpus_allowed))
				1990	goto out;
				1991
				1992	err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
				1993	if (err)
				1994	goto out;
				1995
				1996	err = update_parent_subparts_cpumask(cs, partcmd_enable,
				1997	NULL, &tmp);
				1998	if (err) {
				1999	update_flag(CS_CPU_EXCLUSIVE, cs, 0);
				2000	goto out;
				2001	}
				2002	cs->partition_root_state = PRS_ENABLED;
				2003	} else {
				2004	/*
				2005	* Turning off partition root will clear the
				2006	* CS_CPU_EXCLUSIVE bit.
				2007	*/
				2008	if (cs->partition_root_state == PRS_ERROR) {
				2009	cs->partition_root_state = 0;
				2010	update_flag(CS_CPU_EXCLUSIVE, cs, 0);
				2011	err = 0;
				2012	goto out;
				2013	}
				2014
				2015	err = update_parent_subparts_cpumask(cs, partcmd_disable,
				2016	NULL, &tmp);
				2017	if (err)
				2018	goto out;
				2019
				2020	cs->partition_root_state = 0;
				2021
				2022	/* Turning off CS_CPU_EXCLUSIVE will not return error */
				2023	update_flag(CS_CPU_EXCLUSIVE, cs, 0);
				2024	}
				2025
				2026	update_tasks_cpumask(parent);
				2027
				2028	if (parent->child_ecpus_count)
				2029	update_sibling_cpumasks(parent, cs, &tmp);
				2030
				2031	rebuild_sched_domains_locked();
				2032	out:
				2033	free_cpumasks(NULL, &tmp);
				2034	return err;
				2035	}
				2036
				2037	/*
				2038	* Frequency meter - How fast is some event occurring?
				2039	*
				2040	* These routines manage a digitally filtered, constant time based,
				2041	* event frequency meter. There are four routines:
				2042	* fmeter_init() - initialize a frequency meter.
				2043	* fmeter_markevent() - called each time the event happens.
				2044	* fmeter_getrate() - returns the recent rate of such events.
				2045	* fmeter_update() - internal routine used to update fmeter.
				2046	*
				2047	* A common data structure is passed to each of these routines,
				2048	* which is used to keep track of the state required to manage the
				2049	* frequency meter and its digital filter.
				2050	*
				2051	* The filter works on the number of events marked per unit time.
				2052	* The filter is single-pole low-pass recursive (IIR). The time unit
				2053	* is 1 second. Arithmetic is done using 32-bit integers scaled to
				2054	* simulate 3 decimal digits of precision (multiplied by 1000).
				2055	*
				2056	* With an FM_COEF of 933, and a time base of 1 second, the filter
				2057	* has a half-life of 10 seconds, meaning that if the events quit
				2058	* happening, then the rate returned from the fmeter_getrate()
				2059	* will be cut in half each 10 seconds, until it converges to zero.
				2060	*
				2061	* It is not worth doing a real infinitely recursive filter. If more
				2062	* than FM_MAXTICKS ticks have elapsed since the last filter event,
				2063	* just compute FM_MAXTICKS ticks worth, by which point the level
				2064	* will be stable.
				2065	*
				2066	* Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
				2067	* arithmetic overflow in the fmeter_update() routine.
				2068	*
				2069	* Given the simple 32 bit integer arithmetic used, this meter works
				2070	* best for reporting rates between one per millisecond (msec) and
				2071	* one per 32 (approx) seconds. At constant rates faster than one
				2072	* per msec it maxes out at values just under 1,000,000. At constant
				2073	* rates between one per msec, and one per second it will stabilize
				2074	* to a value N*1000, where N is the rate of events per second.
				2075	* At constant rates between one per second and one per 32 seconds,
				2076	* it will be choppy, moving up on the seconds that have an event,
				2077	* and then decaying until the next event. At rates slower than
				2078	* about one in 32 seconds, it decays all the way back to zero between
				2079	* each event.
				2080	*/
				2081
				/* Tunables of the frequency meter's fixed-point filter. */
				#define FM_COEF		933		/* coefficient for half-life of 10 secs */
				#define FM_MAXTICKS	((u32)99)	/* useless computing more ticks than this */
				#define FM_MAXCNT	1000000		/* limit cnt to avoid overflow */
				#define FM_SCALE	1000		/* faux fixed point scale */
				2086
				2087	/* Initialize a frequency meter */
				2088	static void fmeter_init(struct fmeter *fmp)
				2089	{
				2090	fmp->cnt = 0;
				2091	fmp->val = 0;
				2092	fmp->time = 0;
				2093	spin_lock_init(&fmp->lock);
				2094	}
				2095
				2096	/* Internal meter update - process cnt events and update value */
				2097	static void fmeter_update(struct fmeter *fmp)
				2098	{
				2099	time64_t now;
				2100	u32 ticks;
				2101
				2102	now = ktime_get_seconds();
				2103	ticks = now - fmp->time;
				2104
				2105	if (ticks == 0)
				2106	return;
				2107
				2108	ticks = min(FM_MAXTICKS, ticks);
				2109	while (ticks-- > 0)
				2110	fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
				2111	fmp->time = now;
				2112
				2113	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
				2114	fmp->cnt = 0;
				2115	}
				2116
				2117	/* Process any previous ticks, then bump cnt by one (times scale). */
				2118	static void fmeter_markevent(struct fmeter *fmp)
				2119	{
				2120	spin_lock(&fmp->lock);
				2121	fmeter_update(fmp);
				2122	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
				2123	spin_unlock(&fmp->lock);
				2124	}
				2125
				2126	/* Process any previous ticks, then return current value. */
				2127	static int fmeter_getrate(struct fmeter *fmp)
				2128	{
				2129	int val;
				2130
				2131	spin_lock(&fmp->lock);
				2132	fmeter_update(fmp);
				2133	val = fmp->val;
				2134	spin_unlock(&fmp->lock);
				2135	return val;
				2136	}
				2137
				2138	static struct cpuset *cpuset_attach_old_cs;
				2139
				2140	/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
				2141	static int cpuset_can_attach(struct cgroup_taskset *tset)
				2142	{
				2143	struct cgroup_subsys_state *css;
				2144	struct cpuset *cs;
				2145	struct task_struct *task;
				2146	int ret;
				2147
				2148	/* used later by cpuset_attach() */
				2149	cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
				2150	cs = css_cs(css);
				2151
				2152	percpu_down_write(&cpuset_rwsem);
				2153
				2154	/* allow moving tasks into an empty cpuset if on default hierarchy */
				2155	ret = -ENOSPC;
				2156	if (!is_in_v2_mode() &&
				2157	(cpumask_empty(cs->cpus_allowed) \|\| nodes_empty(cs->mems_allowed)))
				2158	goto out_unlock;
				2159
				2160	cgroup_taskset_for_each(task, css, tset) {
				2161	ret = task_can_attach(task, cs->cpus_allowed);
				2162	if (ret)
				2163	goto out_unlock;
				2164	ret = security_task_setscheduler(task);
				2165	if (ret)
				2166	goto out_unlock;
				2167	}
				2168
				2169	/*
				2170	* Mark attach is in progress. This makes validate_change() fail
				2171	* changes which zero cpus/mems_allowed.
				2172	*/
				2173	cs->attach_in_progress++;
				2174	ret = 0;
				2175	out_unlock:
				2176	percpu_up_write(&cpuset_rwsem);
				2177	return ret;
				2178	}
				2179
				2180	static void cpuset_cancel_attach(struct cgroup_taskset *tset)
				2181	{
				2182	struct cgroup_subsys_state *css;
				2183	struct cpuset *cs;
				2184
				2185	cgroup_taskset_first(tset, &css);
				2186	cs = css_cs(css);
				2187
				2188	percpu_down_write(&cpuset_rwsem);
				2189	cs->attach_in_progress--;
				2190	if (!cs->attach_in_progress)
				2191	wake_up(&cpuset_attach_wq);
				2192	percpu_up_write(&cpuset_rwsem);
				2193	}
				2194
				2195	/*
				2196	* Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
				2197	* but we can't allocate it dynamically there. Define it global and
				2198	* allocate from cpuset_init().
				2199	*/
				2200	static cpumask_var_t cpus_attach;
				2201
				2202	static void cpuset_attach(struct cgroup_taskset *tset)
				2203	{
				2204	/* static buf protected by cpuset_mutex */
				2205	static nodemask_t cpuset_attach_nodemask_to;
				2206	struct task_struct *task;
				2207	struct task_struct *leader;
				2208	struct cgroup_subsys_state *css;
				2209	struct cpuset *cs;
				2210	struct cpuset *oldcs = cpuset_attach_old_cs;
				2211
				2212	cgroup_taskset_first(tset, &css);
				2213	cs = css_cs(css);
				2214
				2215	lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */
				2216	percpu_down_write(&cpuset_rwsem);
				2217
				2218	/* prepare for attach */
				2219	if (cs == &top_cpuset)
				2220	cpumask_copy(cpus_attach, cpu_possible_mask);
				2221	else
				2222	guarantee_online_cpus(cs, cpus_attach);
				2223
				2224	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
				2225
				2226	cgroup_taskset_for_each(task, css, tset) {
				2227	/*
				2228	* can_attach beforehand should guarantee that this doesn't
				2229	* fail. TODO: have a better way to handle failure here
				2230	*/
				2231	WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
				2232
				2233	cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
				2234	cpuset_update_task_spread_flag(cs, task);
				2235	}
				2236
				2237	/*
				2238	* Change mm for all threadgroup leaders. This is expensive and may
				2239	* sleep and should be moved outside migration path proper.
				2240	*/
				2241	cpuset_attach_nodemask_to = cs->effective_mems;
				2242	cgroup_taskset_for_each_leader(leader, css, tset) {
				2243	struct mm_struct *mm = get_task_mm(leader);
				2244
				2245	if (mm) {
				2246	mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
				2247
				2248	/*
				2249	* old_mems_allowed is the same with mems_allowed
				2250	* here, except if this task is being moved
				2251	* automatically due to hotplug. In that case
				2252	* @mems_allowed has been updated and is empty, so
				2253	* @old_mems_allowed is the right nodesets that we
				2254	* migrate mm from.
				2255	*/
				2256	if (is_memory_migrate(cs))
				2257	cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
				2258	&cpuset_attach_nodemask_to);
				2259	else
				2260	mmput(mm);
				2261	}
				2262	}
				2263
				2264	cs->old_mems_allowed = cpuset_attach_nodemask_to;
				2265
				2266	cs->attach_in_progress--;
				2267	if (!cs->attach_in_progress)
				2268	wake_up(&cpuset_attach_wq);
				2269
				2270	percpu_up_write(&cpuset_rwsem);
				2271	}
				2272
				/*
				 * The various types of files and directories in a cpuset file system.
				 * The value is stored in cftype.private and selects the behaviour of the
				 * shared read/write handlers.
				 */

				typedef enum {
					FILE_MEMORY_MIGRATE,		/* "memory_migrate" */
					FILE_CPULIST,			/* "cpus" */
					FILE_MEMLIST,			/* "mems" */
					FILE_EFFECTIVE_CPULIST,		/* "effective_cpus" / "cpus.effective" */
					FILE_EFFECTIVE_MEMLIST,		/* "effective_mems" / "mems.effective" */
					FILE_SUBPARTS_CPULIST,		/* "cpus.subpartitions" */
					FILE_CPU_EXCLUSIVE,		/* "cpu_exclusive" */
					FILE_MEM_EXCLUSIVE,		/* "mem_exclusive" */
					FILE_MEM_HARDWALL,		/* "mem_hardwall" */
					FILE_SCHED_LOAD_BALANCE,	/* "sched_load_balance" */
					FILE_PARTITION_ROOT,		/* "cpus.partition" */
					FILE_SCHED_RELAX_DOMAIN_LEVEL,	/* "sched_relax_domain_level" */
					FILE_MEMORY_PRESSURE_ENABLED,	/* "memory_pressure_enabled" */
					FILE_MEMORY_PRESSURE,		/* "memory_pressure" */
					FILE_SPREAD_PAGE,		/* "memory_spread_page" */
					FILE_SPREAD_SLAB,		/* "memory_spread_slab" */
				} cpuset_filetype_t;
				2293
				2294	static int cpuset_write_u64(struct cgroup_subsys_state css, struct cftype cft,
				2295	u64 val)
				2296	{
				2297	struct cpuset *cs = css_cs(css);
				2298	cpuset_filetype_t type = cft->private;
				2299	int retval = 0;
				2300
				2301	get_online_cpus();
				2302	percpu_down_write(&cpuset_rwsem);
				2303	if (!is_cpuset_online(cs)) {
				2304	retval = -ENODEV;
				2305	goto out_unlock;
				2306	}
				2307
				2308	switch (type) {
				2309	case FILE_CPU_EXCLUSIVE:
				2310	retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
				2311	break;
				2312	case FILE_MEM_EXCLUSIVE:
				2313	retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
				2314	break;
				2315	case FILE_MEM_HARDWALL:
				2316	retval = update_flag(CS_MEM_HARDWALL, cs, val);
				2317	break;
				2318	case FILE_SCHED_LOAD_BALANCE:
				2319	retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
				2320	break;
				2321	case FILE_MEMORY_MIGRATE:
				2322	retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
				2323	break;
				2324	case FILE_MEMORY_PRESSURE_ENABLED:
				2325	cpuset_memory_pressure_enabled = !!val;
				2326	break;
				2327	case FILE_SPREAD_PAGE:
				2328	retval = update_flag(CS_SPREAD_PAGE, cs, val);
				2329	break;
				2330	case FILE_SPREAD_SLAB:
				2331	retval = update_flag(CS_SPREAD_SLAB, cs, val);
				2332	break;
				2333	default:
				2334	retval = -EINVAL;
				2335	break;
				2336	}
				2337	out_unlock:
				2338	percpu_up_write(&cpuset_rwsem);
				2339	put_online_cpus();
				2340	return retval;
				2341	}
				2342
				2343	static int cpuset_write_s64(struct cgroup_subsys_state css, struct cftype cft,
				2344	s64 val)
				2345	{
				2346	struct cpuset *cs = css_cs(css);
				2347	cpuset_filetype_t type = cft->private;
				2348	int retval = -ENODEV;
				2349
				2350	get_online_cpus();
				2351	percpu_down_write(&cpuset_rwsem);
				2352	if (!is_cpuset_online(cs))
				2353	goto out_unlock;
				2354
				2355	switch (type) {
				2356	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
				2357	retval = update_relax_domain_level(cs, val);
				2358	break;
				2359	default:
				2360	retval = -EINVAL;
				2361	break;
				2362	}
				2363	out_unlock:
				2364	percpu_up_write(&cpuset_rwsem);
				2365	put_online_cpus();
				2366	return retval;
				2367	}
				2368
				2369	/*
				2370	* Common handling for a write to a "cpus" or "mems" file.
				2371	*/
				2372	static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
				2373	char *buf, size_t nbytes, loff_t off)
				2374	{
				2375	struct cpuset *cs = css_cs(of_css(of));
				2376	struct cpuset *trialcs;
				2377	int retval = -ENODEV;
				2378
				2379	buf = strstrip(buf);
				2380
				2381	/*
				2382	* CPU or memory hotunplug may leave @cs w/o any execution
				2383	* resources, in which case the hotplug code asynchronously updates
				2384	* configuration and transfers all tasks to the nearest ancestor
				2385	* which can execute.
				2386	*
				2387	* As writes to "cpus" or "mems" may restore @cs's execution
				2388	* resources, wait for the previously scheduled operations before
				2389	* proceeding, so that we don't end up keep removing tasks added
				2390	* after execution capability is restored.
				2391	*
				2392	* cpuset_hotplug_work calls back into cgroup core via
				2393	* cgroup_transfer_tasks() and waiting for it from a cgroupfs
				2394	* operation like this one can lead to a deadlock through kernfs
				2395	* active_ref protection. Let's break the protection. Losing the
				2396	* protection is okay as we check whether @cs is online after
				2397	* grabbing cpuset_mutex anyway. This only happens on the legacy
				2398	* hierarchies.
				2399	*/
				2400	css_get(&cs->css);
				2401	kernfs_break_active_protection(of->kn);
				2402	flush_work(&cpuset_hotplug_work);
				2403
				2404	get_online_cpus();
				2405	percpu_down_write(&cpuset_rwsem);
				2406	if (!is_cpuset_online(cs))
				2407	goto out_unlock;
				2408
				2409	trialcs = alloc_trial_cpuset(cs);
				2410	if (!trialcs) {
				2411	retval = -ENOMEM;
				2412	goto out_unlock;
				2413	}
				2414
				2415	switch (of_cft(of)->private) {
				2416	case FILE_CPULIST:
				2417	retval = update_cpumask(cs, trialcs, buf);
				2418	break;
				2419	case FILE_MEMLIST:
				2420	retval = update_nodemask(cs, trialcs, buf);
				2421	break;
				2422	default:
				2423	retval = -EINVAL;
				2424	break;
				2425	}
				2426
				2427	free_cpuset(trialcs);
				2428	out_unlock:
				2429	percpu_up_write(&cpuset_rwsem);
				2430	put_online_cpus();
				2431	kernfs_unbreak_active_protection(of->kn);
				2432	css_put(&cs->css);
				2433	flush_workqueue(cpuset_migrate_mm_wq);
				2434	return retval ?: nbytes;
				2435	}
				2436
				2437	/*
				2438	* These ascii lists should be read in a single call, by using a user
				2439	* buffer large enough to hold the entire map. If read in smaller
				2440	* chunks, there is no guarantee of atomicity. Since the display format
				2441	* used, list of ranges of sequential numbers, is variable length,
				2442	* and since these maps can change value dynamically, one could read
				2443	* gibberish by doing partial reads while a list was changing.
				2444	*/
				2445	static int cpuset_common_seq_show(struct seq_file sf, void v)
				2446	{
				2447	struct cpuset *cs = css_cs(seq_css(sf));
				2448	cpuset_filetype_t type = seq_cft(sf)->private;
				2449	int ret = 0;
				2450
				2451	spin_lock_irq(&callback_lock);
				2452
				2453	switch (type) {
				2454	case FILE_CPULIST:
				2455	seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_requested));
				2456	break;
				2457	case FILE_MEMLIST:
				2458	seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
				2459	break;
				2460	case FILE_EFFECTIVE_CPULIST:
				2461	seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
				2462	break;
				2463	case FILE_EFFECTIVE_MEMLIST:
				2464	seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
				2465	break;
				2466	case FILE_SUBPARTS_CPULIST:
				2467	seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
				2468	break;
				2469	default:
				2470	ret = -EINVAL;
				2471	}
				2472
				2473	spin_unlock_irq(&callback_lock);
				2474	return ret;
				2475	}
				2476
				2477	static u64 cpuset_read_u64(struct cgroup_subsys_state css, struct cftype cft)
				2478	{
				2479	struct cpuset *cs = css_cs(css);
				2480	cpuset_filetype_t type = cft->private;
				2481	switch (type) {
				2482	case FILE_CPU_EXCLUSIVE:
				2483	return is_cpu_exclusive(cs);
				2484	case FILE_MEM_EXCLUSIVE:
				2485	return is_mem_exclusive(cs);
				2486	case FILE_MEM_HARDWALL:
				2487	return is_mem_hardwall(cs);
				2488	case FILE_SCHED_LOAD_BALANCE:
				2489	return is_sched_load_balance(cs);
				2490	case FILE_MEMORY_MIGRATE:
				2491	return is_memory_migrate(cs);
				2492	case FILE_MEMORY_PRESSURE_ENABLED:
				2493	return cpuset_memory_pressure_enabled;
				2494	case FILE_MEMORY_PRESSURE:
				2495	return fmeter_getrate(&cs->fmeter);
				2496	case FILE_SPREAD_PAGE:
				2497	return is_spread_page(cs);
				2498	case FILE_SPREAD_SLAB:
				2499	return is_spread_slab(cs);
				2500	default:
				2501	BUG();
				2502	}
				2503
				2504	/* Unreachable but makes gcc happy */
				2505	return 0;
				2506	}
				2507
				2508	static s64 cpuset_read_s64(struct cgroup_subsys_state css, struct cftype cft)
				2509	{
				2510	struct cpuset *cs = css_cs(css);
				2511	cpuset_filetype_t type = cft->private;
				2512	switch (type) {
				2513	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
				2514	return cs->relax_domain_level;
				2515	default:
				2516	BUG();
				2517	}
				2518
				2519	/* Unrechable but makes gcc happy */
				2520	return 0;
				2521	}
				2522
				2523	static int sched_partition_show(struct seq_file seq, void v)
				2524	{
				2525	struct cpuset *cs = css_cs(seq_css(seq));
				2526
				2527	switch (cs->partition_root_state) {
				2528	case PRS_ENABLED:
				2529	seq_puts(seq, "root\n");
				2530	break;
				2531	case PRS_DISABLED:
				2532	seq_puts(seq, "member\n");
				2533	break;
				2534	case PRS_ERROR:
				2535	seq_puts(seq, "root invalid\n");
				2536	break;
				2537	}
				2538	return 0;
				2539	}
				2540
				2541	static ssize_t sched_partition_write(struct kernfs_open_file of, char buf,
				2542	size_t nbytes, loff_t off)
				2543	{
				2544	struct cpuset *cs = css_cs(of_css(of));
				2545	int val;
				2546	int retval = -ENODEV;
				2547
				2548	buf = strstrip(buf);
				2549
				2550	/*
				2551	* Convert "root" to ENABLED, and convert "member" to DISABLED.
				2552	*/
				2553	if (!strcmp(buf, "root"))
				2554	val = PRS_ENABLED;
				2555	else if (!strcmp(buf, "member"))
				2556	val = PRS_DISABLED;
				2557	else
				2558	return -EINVAL;
				2559
				2560	css_get(&cs->css);
				2561	get_online_cpus();
				2562	percpu_down_write(&cpuset_rwsem);
				2563	if (!is_cpuset_online(cs))
				2564	goto out_unlock;
				2565
				2566	retval = update_prstate(cs, val);
				2567	out_unlock:
				2568	percpu_up_write(&cpuset_rwsem);
				2569	put_online_cpus();
				2570	css_put(&cs->css);
				2571	return retval ?: nbytes;
				2572	}
				2573
				2574	/*
				2575	* for the common functions, 'private' gives the type of file
				2576	*/
				2577
				2578	static struct cftype legacy_files[] = {
				2579	{
				2580	.name = "cpus",
				2581	.seq_show = cpuset_common_seq_show,
				2582	.write = cpuset_write_resmask,
				2583	.max_write_len = (100U + 6 * NR_CPUS),
				2584	.private = FILE_CPULIST,
				2585	},
				2586
				2587	{
				2588	.name = "mems",
				2589	.seq_show = cpuset_common_seq_show,
				2590	.write = cpuset_write_resmask,
				2591	.max_write_len = (100U + 6 * MAX_NUMNODES),
				2592	.private = FILE_MEMLIST,
				2593	},
				2594
				2595	{
				2596	.name = "effective_cpus",
				2597	.seq_show = cpuset_common_seq_show,
				2598	.private = FILE_EFFECTIVE_CPULIST,
				2599	},
				2600
				2601	{
				2602	.name = "effective_mems",
				2603	.seq_show = cpuset_common_seq_show,
				2604	.private = FILE_EFFECTIVE_MEMLIST,
				2605	},
				2606
				2607	{
				2608	.name = "cpu_exclusive",
				2609	.read_u64 = cpuset_read_u64,
				2610	.write_u64 = cpuset_write_u64,
				2611	.private = FILE_CPU_EXCLUSIVE,
				2612	},
				2613
				2614	{
				2615	.name = "mem_exclusive",
				2616	.read_u64 = cpuset_read_u64,
				2617	.write_u64 = cpuset_write_u64,
				2618	.private = FILE_MEM_EXCLUSIVE,
				2619	},
				2620
				2621	{
				2622	.name = "mem_hardwall",
				2623	.read_u64 = cpuset_read_u64,
				2624	.write_u64 = cpuset_write_u64,
				2625	.private = FILE_MEM_HARDWALL,
				2626	},
				2627
				2628	{
				2629	.name = "sched_load_balance",
				2630	.read_u64 = cpuset_read_u64,
				2631	.write_u64 = cpuset_write_u64,
				2632	.private = FILE_SCHED_LOAD_BALANCE,
				2633	},
				2634
				2635	{
				2636	.name = "sched_relax_domain_level",
				2637	.read_s64 = cpuset_read_s64,
				2638	.write_s64 = cpuset_write_s64,
				2639	.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
				2640	},
				2641
				2642	{
				2643	.name = "memory_migrate",
				2644	.read_u64 = cpuset_read_u64,
				2645	.write_u64 = cpuset_write_u64,
				2646	.private = FILE_MEMORY_MIGRATE,
				2647	},
				2648
				2649	{
				2650	.name = "memory_pressure",
				2651	.read_u64 = cpuset_read_u64,
				2652	.private = FILE_MEMORY_PRESSURE,
				2653	},
				2654
				2655	{
				2656	.name = "memory_spread_page",
				2657	.read_u64 = cpuset_read_u64,
				2658	.write_u64 = cpuset_write_u64,
				2659	.private = FILE_SPREAD_PAGE,
				2660	},
				2661
				2662	{
				2663	.name = "memory_spread_slab",
				2664	.read_u64 = cpuset_read_u64,
				2665	.write_u64 = cpuset_write_u64,
				2666	.private = FILE_SPREAD_SLAB,
				2667	},
				2668
				2669	{
				2670	.name = "memory_pressure_enabled",
				2671	.flags = CFTYPE_ONLY_ON_ROOT,
				2672	.read_u64 = cpuset_read_u64,
				2673	.write_u64 = cpuset_write_u64,
				2674	.private = FILE_MEMORY_PRESSURE_ENABLED,
				2675	},
				2676
				2677	{ } /* terminate */
				2678	};
				2679
				2680	/*
				2681	* This is currently a minimal set for the default hierarchy. It can be
				2682	* expanded later on by migrating more features and control files from v1.
				2683	*/
				2684	static struct cftype dfl_files[] = {
				2685	{
				2686	.name = "cpus",
				2687	.seq_show = cpuset_common_seq_show,
				2688	.write = cpuset_write_resmask,
				2689	.max_write_len = (100U + 6 * NR_CPUS),
				2690	.private = FILE_CPULIST,
				2691	.flags = CFTYPE_NOT_ON_ROOT,
				2692	},
				2693
				2694	{
				2695	.name = "mems",
				2696	.seq_show = cpuset_common_seq_show,
				2697	.write = cpuset_write_resmask,
				2698	.max_write_len = (100U + 6 * MAX_NUMNODES),
				2699	.private = FILE_MEMLIST,
				2700	.flags = CFTYPE_NOT_ON_ROOT,
				2701	},
				2702
				2703	{
				2704	.name = "cpus.effective",
				2705	.seq_show = cpuset_common_seq_show,
				2706	.private = FILE_EFFECTIVE_CPULIST,
				2707	},
				2708
				2709	{
				2710	.name = "mems.effective",
				2711	.seq_show = cpuset_common_seq_show,
				2712	.private = FILE_EFFECTIVE_MEMLIST,
				2713	},
				2714
				2715	{
				2716	.name = "cpus.partition",
				2717	.seq_show = sched_partition_show,
				2718	.write = sched_partition_write,
				2719	.private = FILE_PARTITION_ROOT,
				2720	.flags = CFTYPE_NOT_ON_ROOT,
				2721	},
				2722
				2723	{
				2724	.name = "cpus.subpartitions",
				2725	.seq_show = cpuset_common_seq_show,
				2726	.private = FILE_SUBPARTS_CPULIST,
				2727	.flags = CFTYPE_DEBUG,
				2728	},
				2729
				2730	{ } /* terminate */
				2731	};
				2732
				2733
				2734	/*
				2735	* cpuset_css_alloc - allocate a cpuset css
				2736	* cgrp: control group that the new cpuset will be part of
				2737	*/
				2738
				2739	static struct cgroup_subsys_state *
				2740	cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
				2741	{
				2742	struct cpuset *cs;
				2743
				2744	if (!parent_css)
				2745	return &top_cpuset.css;
				2746
				2747	cs = kzalloc(sizeof(*cs), GFP_KERNEL);
				2748	if (!cs)
				2749	return ERR_PTR(-ENOMEM);
				2750
				2751	if (alloc_cpumasks(cs, NULL)) {
				2752	kfree(cs);
				2753	return ERR_PTR(-ENOMEM);
				2754	}
				2755
				2756	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
				2757	nodes_clear(cs->mems_allowed);
				2758	nodes_clear(cs->effective_mems);
				2759	fmeter_init(&cs->fmeter);
				2760	cs->relax_domain_level = -1;
				2761
				2762	return &cs->css;
				2763	}
				2764
				2765	static int cpuset_css_online(struct cgroup_subsys_state *css)
				2766	{
				2767	struct cpuset *cs = css_cs(css);
				2768	struct cpuset *parent = parent_cs(cs);
				2769	struct cpuset *tmp_cs;
				2770	struct cgroup_subsys_state *pos_css;
				2771
				2772	if (!parent)
				2773	return 0;
				2774
				2775	get_online_cpus();
				2776	percpu_down_write(&cpuset_rwsem);
				2777
				2778	set_bit(CS_ONLINE, &cs->flags);
				2779	if (is_spread_page(parent))
				2780	set_bit(CS_SPREAD_PAGE, &cs->flags);
				2781	if (is_spread_slab(parent))
				2782	set_bit(CS_SPREAD_SLAB, &cs->flags);
				2783
				2784	cpuset_inc();
				2785
				2786	spin_lock_irq(&callback_lock);
				2787	if (is_in_v2_mode()) {
				2788	cpumask_copy(cs->effective_cpus, parent->effective_cpus);
				2789	cs->effective_mems = parent->effective_mems;
				2790	cs->use_parent_ecpus = true;
				2791	parent->child_ecpus_count++;
				2792	}
				2793	spin_unlock_irq(&callback_lock);
				2794
				2795	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
				2796	goto out_unlock;
				2797
				2798	/*
				2799	* Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
				2800	* set. This flag handling is implemented in cgroup core for
				2801	* histrical reasons - the flag may be specified during mount.
				2802	*
				2803	* Currently, if any sibling cpusets have exclusive cpus or mem, we
				2804	* refuse to clone the configuration - thereby refusing the task to
				2805	* be entered, and as a result refusing the sys_unshare() or
				2806	* clone() which initiated it. If this becomes a problem for some
				2807	* users who wish to allow that scenario, then this could be
				2808	* changed to grant parent->cpus_allowed-sibling_cpus_exclusive
				2809	* (and likewise for mems) to the new cgroup.
				2810	*/
				2811	rcu_read_lock();
				2812	cpuset_for_each_child(tmp_cs, pos_css, parent) {
				2813	if (is_mem_exclusive(tmp_cs) \|\| is_cpu_exclusive(tmp_cs)) {
				2814	rcu_read_unlock();
				2815	goto out_unlock;
				2816	}
				2817	}
				2818	rcu_read_unlock();
				2819
				2820	spin_lock_irq(&callback_lock);
				2821	cs->mems_allowed = parent->mems_allowed;
				2822	cs->effective_mems = parent->mems_allowed;
				2823	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
				2824	cpumask_copy(cs->cpus_requested, parent->cpus_requested);
				2825	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
				2826	spin_unlock_irq(&callback_lock);
				2827	out_unlock:
				2828	percpu_up_write(&cpuset_rwsem);
				2829	put_online_cpus();
				2830	return 0;
				2831	}
				2832
				2833	/*
				2834	* If the cpuset being removed has its flag 'sched_load_balance'
				2835	* enabled, then simulate turning sched_load_balance off, which
				2836	* will call rebuild_sched_domains_locked(). That is not needed
				2837	* in the default hierarchy where only changes in partition
				2838	* will cause repartitioning.
				2839	*
				2840	* If the cpuset has the 'sched.partition' flag enabled, simulate
				2841	* turning 'sched.partition" off.
				2842	*/
				2843
				2844	static void cpuset_css_offline(struct cgroup_subsys_state *css)
				2845	{
				2846	struct cpuset *cs = css_cs(css);
				2847
				2848	get_online_cpus();
				2849	percpu_down_write(&cpuset_rwsem);
				2850
				2851	if (is_partition_root(cs))
				2852	update_prstate(cs, 0);
				2853
				2854	if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
				2855	is_sched_load_balance(cs))
				2856	update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
				2857
				2858	if (cs->use_parent_ecpus) {
				2859	struct cpuset *parent = parent_cs(cs);
				2860
				2861	cs->use_parent_ecpus = false;
				2862	parent->child_ecpus_count--;
				2863	}
				2864
				2865	cpuset_dec();
				2866	clear_bit(CS_ONLINE, &cs->flags);
				2867
				2868	percpu_up_write(&cpuset_rwsem);
				2869	put_online_cpus();
				2870	}
				2871
				2872	static void cpuset_css_free(struct cgroup_subsys_state *css)
				2873	{
				2874	struct cpuset *cs = css_cs(css);
				2875
				2876	free_cpuset(cs);
				2877	}
				2878
				2879	static void cpuset_bind(struct cgroup_subsys_state *root_css)
				2880	{
				2881	percpu_down_write(&cpuset_rwsem);
				2882	spin_lock_irq(&callback_lock);
				2883
				2884	if (is_in_v2_mode()) {
				2885	cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
				2886	top_cpuset.mems_allowed = node_possible_map;
				2887	} else {
				2888	cpumask_copy(top_cpuset.cpus_allowed,
				2889	top_cpuset.effective_cpus);
				2890	top_cpuset.mems_allowed = top_cpuset.effective_mems;
				2891	}
				2892
				2893	spin_unlock_irq(&callback_lock);
				2894	percpu_up_write(&cpuset_rwsem);
				2895	}
				2896
				2897	/*
				2898	* Make sure the new task conform to the current state of its parent,
				2899	* which could have been changed by cpuset just after it inherits the
				2900	* state from the parent and before it sits on the cgroup's task list.
				2901	*/
				2902	static void cpuset_fork(struct task_struct *task)
				2903	{
				2904	if (task_css_is_root(task, cpuset_cgrp_id))
				2905	return;
				2906
				2907	set_cpus_allowed_ptr(task, current->cpus_ptr);
				2908	task->mems_allowed = current->mems_allowed;
				2909	}
				2910
				2911	struct cgroup_subsys cpuset_cgrp_subsys = {
				2912	.css_alloc = cpuset_css_alloc,
				2913	.css_online = cpuset_css_online,
				2914	.css_offline = cpuset_css_offline,
				2915	.css_free = cpuset_css_free,
				2916	.can_attach = cpuset_can_attach,
				2917	.cancel_attach = cpuset_cancel_attach,
				2918	.attach = cpuset_attach,
				2919	.post_attach = cpuset_post_attach,
				2920	.bind = cpuset_bind,
				2921	.fork = cpuset_fork,
				2922	.legacy_cftypes = legacy_files,
				2923	.dfl_cftypes = dfl_files,
				2924	.early_init = true,
				2925	.threaded = true,
				2926	};
				2927
				2928	/**
				2929	* cpuset_init - initialize cpusets at system boot
				2930	*
				2931	* Description: Initialize top_cpuset
				2932	**/
				2933
				2934	int __init cpuset_init(void)
				2935	{
				2936	BUG_ON(percpu_init_rwsem(&cpuset_rwsem));
				2937
				2938	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
				2939	BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
				2940	BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
				2941	BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL));
				2942
				2943	cpumask_setall(top_cpuset.cpus_allowed);
				2944	cpumask_setall(top_cpuset.cpus_requested);
				2945	nodes_setall(top_cpuset.mems_allowed);
				2946	cpumask_setall(top_cpuset.effective_cpus);
				2947	nodes_setall(top_cpuset.effective_mems);
				2948
				2949	fmeter_init(&top_cpuset.fmeter);
				2950	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
				2951	top_cpuset.relax_domain_level = -1;
				2952
				2953	BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
				2954
				2955	return 0;
				2956	}
				2957
				2958	/*
				2959	* If CPU and/or memory hotplug handlers, below, unplug any CPUs
				2960	* or memory nodes, we need to walk over the cpuset hierarchy,
				2961	* removing that CPU or node from all cpusets. If this removes the
				2962	* last CPU or node from a cpuset, then move the tasks in the empty
				2963	* cpuset to its next-highest non-empty parent.
				2964	*/
				2965	static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
				2966	{
				2967	struct cpuset *parent;
				2968
				2969	/*
				2970	* Find its next-highest non-empty parent, (top cpuset
				2971	* has online cpus, so can't be empty).
				2972	*/
				2973	parent = parent_cs(cs);
				2974	while (cpumask_empty(parent->cpus_allowed) \|\|
				2975	nodes_empty(parent->mems_allowed))
				2976	parent = parent_cs(parent);
				2977
				2978	if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
				2979	pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
				2980	pr_cont_cgroup_name(cs->css.cgroup);
				2981	pr_cont("\n");
				2982	}
				2983	}
				2984
				2985	static void
				2986	hotplug_update_tasks_legacy(struct cpuset *cs,
				2987	struct cpumask new_cpus, nodemask_t new_mems,
				2988	bool cpus_updated, bool mems_updated)
				2989	{
				2990	bool is_empty;
				2991
				2992	spin_lock_irq(&callback_lock);
				2993	cpumask_copy(cs->cpus_allowed, new_cpus);
				2994	cpumask_copy(cs->effective_cpus, new_cpus);
				2995	cs->mems_allowed = *new_mems;
				2996	cs->effective_mems = *new_mems;
				2997	spin_unlock_irq(&callback_lock);
				2998
				2999	/*
				3000	* Don't call update_tasks_cpumask() if the cpuset becomes empty,
				3001	* as the tasks will be migratecd to an ancestor.
				3002	*/
				3003	if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
				3004	update_tasks_cpumask(cs);
				3005	if (mems_updated && !nodes_empty(cs->mems_allowed))
				3006	update_tasks_nodemask(cs);
				3007
				3008	is_empty = cpumask_empty(cs->cpus_allowed) \|\|
				3009	nodes_empty(cs->mems_allowed);
				3010
				3011	percpu_up_write(&cpuset_rwsem);
				3012
				3013	/*
				3014	* Move tasks to the nearest ancestor with execution resources,
				3015	* This is full cgroup operation which will also call back into
				3016	* cpuset. Should be done outside any lock.
				3017	*/
				3018	if (is_empty)
				3019	remove_tasks_in_empty_cpuset(cs);
				3020
				3021	percpu_down_write(&cpuset_rwsem);
				3022	}
				3023
				3024	static void
				3025	hotplug_update_tasks(struct cpuset *cs,
				3026	struct cpumask new_cpus, nodemask_t new_mems,
				3027	bool cpus_updated, bool mems_updated)
				3028	{
				3029	if (cpumask_empty(new_cpus))
				3030	cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
				3031	if (nodes_empty(*new_mems))
				3032	*new_mems = parent_cs(cs)->effective_mems;
				3033
				3034	spin_lock_irq(&callback_lock);
				3035	cpumask_copy(cs->effective_cpus, new_cpus);
				3036	cs->effective_mems = *new_mems;
				3037	spin_unlock_irq(&callback_lock);
				3038
				3039	if (cpus_updated)
				3040	update_tasks_cpumask(cs);
				3041	if (mems_updated)
				3042	update_tasks_nodemask(cs);
				3043	}
				3044
				3045	static bool force_rebuild;
				3046
				3047	void cpuset_force_rebuild(void)
				3048	{
				3049	force_rebuild = true;
				3050	}
				3051
				3052	/**
				3053	* cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
				3054	* @cs: cpuset in interest
				3055	* @tmp: the tmpmasks structure pointer
				3056	*
				3057	* Compare @cs's cpu and mem masks against top_cpuset and if some have gone
				3058	* offline, update @cs accordingly. If @cs ends up with no CPU or memory,
				3059	* all its tasks are moved to the nearest ancestor with both resources.
				3060	*/
				3061	static void cpuset_hotplug_update_tasks(struct cpuset cs, struct tmpmasks tmp)
				3062	{
				3063	static cpumask_t new_cpus;
				3064	static nodemask_t new_mems;
				3065	bool cpus_updated;
				3066	bool mems_updated;
				3067	struct cpuset *parent;
				3068	retry:
				3069	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
				3070
				3071	percpu_down_write(&cpuset_rwsem);
				3072
				3073	/*
				3074	* We have raced with task attaching. We wait until attaching
				3075	* is finished, so we won't attach a task to an empty cpuset.
				3076	*/
				3077	if (cs->attach_in_progress) {
				3078	percpu_up_write(&cpuset_rwsem);
				3079	goto retry;
				3080	}
				3081
				3082	parent = parent_cs(cs);
				3083	compute_effective_cpumask(&new_cpus, cs, parent);
				3084	nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
				3085
				3086	if (cs->nr_subparts_cpus)
				3087	/*
				3088	* Make sure that CPUs allocated to child partitions
				3089	* do not show up in effective_cpus.
				3090	*/
				3091	cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);
				3092
				3093	if (!tmp \|\| !cs->partition_root_state)
				3094	goto update_tasks;
				3095
				3096	/*
				3097	* In the unlikely event that a partition root has empty
				3098	* effective_cpus or its parent becomes erroneous, we have to
				3099	* transition it to the erroneous state.
				3100	*/
				3101	if (is_partition_root(cs) && (cpumask_empty(&new_cpus) \|\|
				3102	(parent->partition_root_state == PRS_ERROR))) {
				3103	if (cs->nr_subparts_cpus) {
				3104	cs->nr_subparts_cpus = 0;
				3105	cpumask_clear(cs->subparts_cpus);
				3106	compute_effective_cpumask(&new_cpus, cs, parent);
				3107	}
				3108
				3109	/*
				3110	* If the effective_cpus is empty because the child
				3111	* partitions take away all the CPUs, we can keep
				3112	* the current partition and let the child partitions
				3113	* fight for available CPUs.
				3114	*/
				3115	if ((parent->partition_root_state == PRS_ERROR) \|\|
				3116	cpumask_empty(&new_cpus)) {
				3117	update_parent_subparts_cpumask(cs, partcmd_disable,
				3118	NULL, tmp);
				3119	cs->partition_root_state = PRS_ERROR;
				3120	}
				3121	cpuset_force_rebuild();
				3122	}
				3123
				3124	/*
				3125	* On the other hand, an erroneous partition root may be transitioned
				3126	* back to a regular one or a partition root with no CPU allocated
				3127	* from the parent may change to erroneous.
				3128	*/
				3129	if (is_partition_root(parent) &&
				3130	((cs->partition_root_state == PRS_ERROR) \|\|
				3131	!cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
				3132	update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
				3133	cpuset_force_rebuild();
				3134
				3135	update_tasks:
				3136	cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
				3137	mems_updated = !nodes_equal(new_mems, cs->effective_mems);
				3138
				3139	if (is_in_v2_mode())
				3140	hotplug_update_tasks(cs, &new_cpus, &new_mems,
				3141	cpus_updated, mems_updated);
				3142	else
				3143	hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
				3144	cpus_updated, mems_updated);
				3145
				3146	percpu_up_write(&cpuset_rwsem);
				3147	}
				3148
				3149	/**
				3150	* cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
				3151	*
				3152	* This function is called after either CPU or memory configuration has
				3153	* changed and updates cpuset accordingly. The top_cpuset is always
				3154	* synchronized to cpu_active_mask and N_MEMORY, which is necessary in
				3155	* order to make cpusets transparent (of no affect) on systems that are
				3156	* actively using CPU hotplug but making no active use of cpusets.
				3157	*
				3158	* Non-root cpusets are only affected by offlining. If any CPUs or memory
				3159	* nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
				3160	* all descendants.
				3161	*
				3162	* Note that CPU offlining during suspend is ignored. We don't modify
				3163	* cpusets across suspend/resume cycles at all.
				3164	*/
				3165	static void cpuset_hotplug_workfn(struct work_struct *work)
				3166	{
				3167	static cpumask_t new_cpus;
				3168	static nodemask_t new_mems;
				3169	bool cpus_updated, mems_updated;
				3170	bool on_dfl = is_in_v2_mode();
				3171	struct tmpmasks tmp, *ptmp = NULL;
				3172
				3173	if (on_dfl && !alloc_cpumasks(NULL, &tmp))
				3174	ptmp = &tmp;
				3175
				3176	percpu_down_write(&cpuset_rwsem);
				3177
				3178	/* fetch the available cpus/mems and find out which changed how */
				3179	cpumask_copy(&new_cpus, cpu_active_mask);
				3180	new_mems = node_states[N_MEMORY];
				3181
				3182	/*
				3183	* If subparts_cpus is populated, it is likely that the check below
				3184	* will produce a false positive on cpus_updated when the cpu list
				3185	* isn't changed. It is extra work, but it is better to be safe.
				3186	*/
				3187	cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
				3188	mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
				3189
				3190	/*
				3191	* In the rare case that hotplug removes all the cpus in subparts_cpus,
				3192	* we assumed that cpus are updated.
				3193	*/
				3194	if (!cpus_updated && top_cpuset.nr_subparts_cpus)
				3195	cpus_updated = true;
				3196
				3197	/* synchronize cpus_allowed to cpu_active_mask */
				3198	if (cpus_updated) {
				3199	spin_lock_irq(&callback_lock);
				3200	if (!on_dfl)
				3201	cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
				3202	/*
				3203	* Make sure that CPUs allocated to child partitions
				3204	* do not show up in effective_cpus. If no CPU is left,
				3205	* we clear the subparts_cpus & let the child partitions
				3206	* fight for the CPUs again.
				3207	*/
				3208	if (top_cpuset.nr_subparts_cpus) {
				3209	if (cpumask_subset(&new_cpus,
				3210	top_cpuset.subparts_cpus)) {
				3211	top_cpuset.nr_subparts_cpus = 0;
				3212	cpumask_clear(top_cpuset.subparts_cpus);
				3213	} else {
				3214	cpumask_andnot(&new_cpus, &new_cpus,
				3215	top_cpuset.subparts_cpus);
				3216	}
				3217	}
				3218	cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
				3219	spin_unlock_irq(&callback_lock);
				3220	/* we don't mess with cpumasks of tasks in top_cpuset */
				3221	}
				3222
				3223	/* synchronize mems_allowed to N_MEMORY */
				3224	if (mems_updated) {
				3225	spin_lock_irq(&callback_lock);
				3226	if (!on_dfl)
				3227	top_cpuset.mems_allowed = new_mems;
				3228	top_cpuset.effective_mems = new_mems;
				3229	spin_unlock_irq(&callback_lock);
				3230	update_tasks_nodemask(&top_cpuset);
				3231	}
				3232
				3233	percpu_up_write(&cpuset_rwsem);
				3234
				3235	/* if cpus or mems changed, we need to propagate to descendants */
				3236	if (cpus_updated \|\| mems_updated) {
				3237	struct cpuset *cs;
				3238	struct cgroup_subsys_state *pos_css;
				3239
				3240	rcu_read_lock();
				3241	cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
				3242	if (cs == &top_cpuset \|\| !css_tryget_online(&cs->css))
				3243	continue;
				3244	rcu_read_unlock();
				3245
				3246	cpuset_hotplug_update_tasks(cs, ptmp);
				3247
				3248	rcu_read_lock();
				3249	css_put(&cs->css);
				3250	}
				3251	rcu_read_unlock();
				3252	}
				3253
				3254	/* rebuild sched domains if cpus_allowed has changed */
				3255	if (cpus_updated \|\| force_rebuild) {
				3256	force_rebuild = false;
				3257	rebuild_sched_domains();
				3258	}
				3259
				3260	free_cpumasks(NULL, ptmp);
				3261	}
				3262
				3263	void cpuset_update_active_cpus(void)
				3264	{
				3265	/*
				3266	* We're inside cpu hotplug critical region which usually nests
				3267	* inside cgroup synchronization. Bounce actual hotplug processing
				3268	* to a work item to avoid reverse locking order.
				3269	*/
				3270	schedule_work(&cpuset_hotplug_work);
				3271	}
				3272
				3273	void cpuset_wait_for_hotplug(void)
				3274	{
				3275	flush_work(&cpuset_hotplug_work);
				3276	}
				3277
				3278	/*
				3279	* Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
				3280	* Call this routine anytime after node_states[N_MEMORY] changes.
				3281	* See cpuset_update_active_cpus() for CPU hotplug handling.
				3282	*/
				3283	static int cpuset_track_online_nodes(struct notifier_block *self,
				3284	unsigned long action, void *arg)
				3285	{
				3286	schedule_work(&cpuset_hotplug_work);
				3287	return NOTIFY_OK;
				3288	}
				3289
				3290	static struct notifier_block cpuset_track_online_nodes_nb = {
				3291	.notifier_call = cpuset_track_online_nodes,
				3292	.priority = 10, /* ??! */
				3293	};
				3294
				3295	/**
				3296	* cpuset_init_smp - initialize cpus_allowed
				3297	*
				3298	* Description: Finish top cpuset after cpu, node maps are initialized
				3299	*/
				3300	void __init cpuset_init_smp(void)
				3301	{
				3302	/*
				3303	* cpus_allowd/mems_allowed set to v2 values in the initial
				3304	* cpuset_bind() call will be reset to v1 values in another
				3305	* cpuset_bind() call when v1 cpuset is mounted.
				3306	*/
				3307	top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
				3308
				3309	cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
				3310	top_cpuset.effective_mems = node_states[N_MEMORY];
				3311
				3312	register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
				3313
				3314	cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
				3315	BUG_ON(!cpuset_migrate_mm_wq);
				3316	}
				3317
				3318	/**
				3319	* cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
				3320	* @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
				3321	* @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
				3322	*
				3323	* Description: Returns the cpumask_var_t cpus_allowed of the cpuset
				3324	* attached to the specified @tsk. Guaranteed to return some non-empty
				3325	* subset of cpu_online_mask, even if this means going outside the
				3326	* tasks cpuset.
				3327	**/
				3328
				3329	void cpuset_cpus_allowed(struct task_struct tsk, struct cpumask pmask)
				3330	{
				3331	unsigned long flags;
				3332
				3333	spin_lock_irqsave(&callback_lock, flags);
				3334	rcu_read_lock();
				3335	guarantee_online_cpus(task_cs(tsk), pmask);
				3336	rcu_read_unlock();
				3337	spin_unlock_irqrestore(&callback_lock, flags);
				3338	}
				3339
				3340	/**
				3341	* cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
				3342	* @tsk: pointer to task_struct with which the scheduler is struggling
				3343	*
				3344	* Description: In the case that the scheduler cannot find an allowed cpu in
				3345	* tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
				3346	* mode however, this value is the same as task_cs(tsk)->effective_cpus,
				3347	* which will not contain a sane cpumask during cases such as cpu hotplugging.
				3348	* This is the absolute last resort for the scheduler and it is only used if
				3349	* _every_ other avenue has been traveled.
				3350	**/
				3351
				3352	void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
				3353	{
				3354	rcu_read_lock();
				3355	do_set_cpus_allowed(tsk, is_in_v2_mode() ?
				3356	task_cs(tsk)->cpus_allowed : cpu_possible_mask);
				3357	rcu_read_unlock();
				3358
				3359	/*
				3360	* We own tsk->cpus_allowed, nobody can change it under us.
				3361	*
				3362	* But we used cs && cs->cpus_allowed lockless and thus can
				3363	* race with cgroup_attach_task() or update_cpumask() and get
				3364	* the wrong tsk->cpus_allowed. However, both cases imply the
				3365	* subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
				3366	* which takes task_rq_lock().
				3367	*
				3368	* If we are called after it dropped the lock we must see all
				3369	* changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
				3370	* set any mask even if it is not right from task_cs() pov,
				3371	* the pending set_cpus_allowed_ptr() will fix things.
				3372	*
				3373	* select_fallback_rq() will fix things ups and set cpu_possible_mask
				3374	* if required.
				3375	*/
				3376	}
				3377
				3378	void __init cpuset_init_current_mems_allowed(void)
				3379	{
				3380	nodes_setall(current->mems_allowed);
				3381	}
				3382
				3383	/**
				3384	* cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
				3385	* @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
				3386	*
				3387	* Description: Returns the nodemask_t mems_allowed of the cpuset
				3388	* attached to the specified @tsk. Guaranteed to return some non-empty
				3389	* subset of node_states[N_MEMORY], even if this means going outside the
				3390	* tasks cpuset.
				3391	**/
				3392
				3393	nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
				3394	{
				3395	nodemask_t mask;
				3396	unsigned long flags;
				3397
				3398	spin_lock_irqsave(&callback_lock, flags);
				3399	rcu_read_lock();
				3400	guarantee_online_mems(task_cs(tsk), &mask);
				3401	rcu_read_unlock();
				3402	spin_unlock_irqrestore(&callback_lock, flags);
				3403
				3404	return mask;
				3405	}
				3406
				3407	/**
				3408	* cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed
				3409	* @nodemask: the nodemask to be checked
				3410	*
				3411	* Are any of the nodes in the nodemask allowed in current->mems_allowed?
				3412	*/
				3413	int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
				3414	{
				3415	return nodes_intersects(*nodemask, current->mems_allowed);
				3416	}
				3417
				3418	/*
				3419	* nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
				3420	* mem_hardwall ancestor to the specified cpuset. Call holding
				3421	* callback_lock. If no ancestor is mem_exclusive or mem_hardwall
				3422	* (an unusual configuration), then returns the root cpuset.
				3423	*/
				3424	static struct cpuset nearest_hardwall_ancestor(struct cpuset cs)
				3425	{
				3426	while (!(is_mem_exclusive(cs) \|\| is_mem_hardwall(cs)) && parent_cs(cs))
				3427	cs = parent_cs(cs);
				3428	return cs;
				3429	}
				3430
				3431	/**
				3432	* cpuset_node_allowed - Can we allocate on a memory node?
				3433	* @node: is this an allowed node?
				3434	* @gfp_mask: memory allocation flags
				3435	*
				3436	* If we're in interrupt, yes, we can always allocate. If @node is set in
				3437	* current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
				3438	* node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
				3439	* yes. If current has access to memory reserves as an oom victim, yes.
				3440	* Otherwise, no.
				3441	*
				3442	* GFP_USER allocations are marked with the __GFP_HARDWALL bit,
				3443	* and do not allow allocations outside the current tasks cpuset
				3444	* unless the task has been OOM killed.
				3445	* GFP_KERNEL allocations are not so marked, so can escape to the
				3446	* nearest enclosing hardwalled ancestor cpuset.
				3447	*
				3448	* Scanning up parent cpusets requires callback_lock. The
				3449	* __alloc_pages() routine only calls here with __GFP_HARDWALL bit
				3450	* _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
				3451	* current tasks mems_allowed came up empty on the first pass over
				3452	* the zonelist. So only GFP_KERNEL allocations, if all nodes in the
				3453	* cpuset are short of memory, might require taking the callback_lock.
				3454	*
				3455	* The first call here from mm/page_alloc:get_page_from_freelist()
				3456	* has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
				3457	* so no allocation on a node outside the cpuset is allowed (unless
				3458	* in interrupt, of course).
				3459	*
				3460	* The second pass through get_page_from_freelist() doesn't even call
				3461	* here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
				3462	* variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
				3463	* in alloc_flags. That logic and the checks below have the combined
				3464	* affect that:
				3465	* in_interrupt - any node ok (current task context irrelevant)
				3466	* GFP_ATOMIC - any node ok
				3467	* tsk_is_oom_victim - any node ok
				3468	* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
				3469	* GFP_USER - only nodes in current tasks mems allowed ok.
				3470	*/
				3471	bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
				3472	{
				3473	struct cpuset cs; / current cpuset ancestors */
				3474	int allowed; /* is allocation in zone z allowed? */
				3475	unsigned long flags;
				3476
				3477	if (in_interrupt())
				3478	return true;
				3479	if (node_isset(node, current->mems_allowed))
				3480	return true;
				3481	/*
				3482	* Allow tasks that have access to memory reserves because they have
				3483	* been OOM killed to get memory anywhere.
				3484	*/
				3485	if (unlikely(tsk_is_oom_victim(current)))
				3486	return true;
				3487	if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
				3488	return false;
				3489
				3490	if (current->flags & PF_EXITING) /* Let dying task have memory */
				3491	return true;
				3492
				3493	/* Not hardwall and node outside mems_allowed: scan up cpusets */
				3494	spin_lock_irqsave(&callback_lock, flags);
				3495
				3496	rcu_read_lock();
				3497	cs = nearest_hardwall_ancestor(task_cs(current));
				3498	allowed = node_isset(node, cs->mems_allowed);
				3499	rcu_read_unlock();
				3500
				3501	spin_unlock_irqrestore(&callback_lock, flags);
				3502	return allowed;
				3503	}
				3504
				3505	/**
				3506	* cpuset_mem_spread_node() - On which node to begin search for a file page
				3507	* cpuset_slab_spread_node() - On which node to begin search for a slab page
				3508	*
				3509	* If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
				3510	* tasks in a cpuset with is_spread_page or is_spread_slab set),
				3511	* and if the memory allocation used cpuset_mem_spread_node()
				3512	* to determine on which node to start looking, as it will for
				3513	* certain page cache or slab cache pages such as used for file
				3514	* system buffers and inode caches, then instead of starting on the
				3515	* local node to look for a free page, rather spread the starting
				3516	* node around the tasks mems_allowed nodes.
				3517	*
				3518	* We don't have to worry about the returned node being offline
				3519	* because "it can't happen", and even if it did, it would be ok.
				3520	*
				3521	* The routines calling guarantee_online_mems() are careful to
				3522	* only set nodes in task->mems_allowed that are online. So it
				3523	* should not be possible for the following code to return an
				3524	* offline node. But if it did, that would be ok, as this routine
				3525	* is not returning the node where the allocation must be, only
				3526	* the node where the search should start. The zonelist passed to
				3527	* __alloc_pages() will include all nodes. If the slab allocator
				3528	* is passed an offline node, it will fall back to the local node.
				3529	* See kmem_cache_alloc_node().
				3530	*/
				3531
				3532	static int cpuset_spread_node(int *rotor)
				3533	{
				3534	return rotor = next_node_in(rotor, current->mems_allowed);
				3535	}
				3536
				3537	int cpuset_mem_spread_node(void)
				3538	{
				3539	if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
				3540	current->cpuset_mem_spread_rotor =
				3541	node_random(&current->mems_allowed);
				3542
				3543	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
				3544	}
				3545
				3546	int cpuset_slab_spread_node(void)
				3547	{
				3548	if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
				3549	current->cpuset_slab_spread_rotor =
				3550	node_random(&current->mems_allowed);
				3551
				3552	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
				3553	}
				3554
				3555	EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
				3556
				3557	/**
				3558	* cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
				3559	* @tsk1: pointer to task_struct of some task.
				3560	* @tsk2: pointer to task_struct of some other task.
				3561	*
				3562	* Description: Return true if @tsk1's mems_allowed intersects the
				3563	* mems_allowed of @tsk2. Used by the OOM killer to determine if
				3564	* one of the task's memory usage might impact the memory available
				3565	* to the other.
				3566	**/
				3567
				3568	int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
				3569	const struct task_struct *tsk2)
				3570	{
				3571	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
				3572	}
				3573
				3574	/**
				3575	* cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
				3576	*
				3577	* Description: Prints current's name, cpuset name, and cached copy of its
				3578	* mems_allowed to the kernel log.
				3579	*/
				3580	void cpuset_print_current_mems_allowed(void)
				3581	{
				3582	struct cgroup *cgrp;
				3583
				3584	rcu_read_lock();
				3585
				3586	cgrp = task_cs(current)->css.cgroup;
				3587	pr_cont(",cpuset=");
				3588	pr_cont_cgroup_name(cgrp);
				3589	pr_cont(",mems_allowed=%*pbl",
				3590	nodemask_pr_args(&current->mems_allowed));
				3591
				3592	rcu_read_unlock();
				3593	}
				3594
				3595	/*
				3596	* Collection of memory_pressure is suppressed unless
				3597	* this flag is enabled by writing "1" to the special
				3598	* cpuset file 'memory_pressure_enabled' in the root cpuset.
				3599	*/
				3600
				3601	int cpuset_memory_pressure_enabled __read_mostly;
				3602
				3603	/**
				3604	* cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
				3605	*
				3606	* Keep a running average of the rate of synchronous (direct)
				3607	* page reclaim efforts initiated by tasks in each cpuset.
				3608	*
				3609	* This represents the rate at which some task in the cpuset
				3610	* ran low on memory on all nodes it was allowed to use, and
				3611	* had to enter the kernels page reclaim code in an effort to
				3612	* create more free memory by tossing clean pages or swapping
				3613	* or writing dirty pages.
				3614	*
				3615	* Display to user space in the per-cpuset read-only file
				3616	* "memory_pressure". Value displayed is an integer
				3617	* representing the recent rate of entry into the synchronous
				3618	* (direct) page reclaim by any task attached to the cpuset.
				3619	**/
				3620
				3621	void __cpuset_memory_pressure_bump(void)
				3622	{
				3623	rcu_read_lock();
				3624	fmeter_markevent(&task_cs(current)->fmeter);
				3625	rcu_read_unlock();
				3626	}
				3627
				3628	#ifdef CONFIG_PROC_PID_CPUSET
				3629	/*
				3630	* proc_cpuset_show()
				3631	* - Print tasks cpuset path into seq_file.
				3632	* - Used for /proc/<pid>/cpuset.
				3633	* - No need to task_lock(tsk) on this tsk->cpuset reference, as it
				3634	* doesn't really matter if tsk->cpuset changes after we read it,
				3635	* and we take cpuset_mutex, keeping cpuset_attach() from changing it
				3636	* anyway.
				3637	*/
				3638	int proc_cpuset_show(struct seq_file m, struct pid_namespace ns,
				3639	struct pid pid, struct task_struct tsk)
				3640	{
				3641	char *buf;
				3642	struct cgroup_subsys_state *css;
				3643	int retval;
				3644
				3645	retval = -ENOMEM;
				3646	buf = kmalloc(PATH_MAX, GFP_KERNEL);
				3647	if (!buf)
				3648	goto out;
				3649
				3650	rcu_read_lock();
				3651	spin_lock_irq(&css_set_lock);
				3652	css = task_css(tsk, cpuset_cgrp_id);
				3653	retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
				3654	current->nsproxy->cgroup_ns);
				3655	spin_unlock_irq(&css_set_lock);
				3656	rcu_read_unlock();
				3657
				3658	if (retval >= PATH_MAX)
				3659	retval = -ENAMETOOLONG;
				3660	if (retval < 0)
				3661	goto out_free;
				3662	seq_puts(m, buf);
				3663	seq_putc(m, '\n');
				3664	retval = 0;
				3665	out_free:
				3666	kfree(buf);
				3667	out:
				3668	return retval;
				3669	}
				3670	#endif /* CONFIG_PROC_PID_CPUSET */
				3671
				3672	/* Display task mems_allowed in /proc/<pid>/status file. */
				3673	void cpuset_task_status_allowed(struct seq_file m, struct task_struct task)
				3674	{
				3675	seq_printf(m, "Mems_allowed:\t%*pb\n",
				3676	nodemask_pr_args(&task->mems_allowed));
				3677	seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
				3678	nodemask_pr_args(&task->mems_allowed));
				3679	}