Blame - marvell/linux/kernel/cgroup/cgroup.c - T108

blob: 9370cf23bc9543b93e6a7885a92bbd1d16d63d10 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame]	1	/*
				2	* Generic process-grouping system.
				3	*
				4	* Based originally on the cpuset system, extracted by Paul Menage
				5	* Copyright (C) 2006 Google, Inc
				6	*
				7	* Notifications support
				8	* Copyright (C) 2009 Nokia Corporation
				9	* Author: Kirill A. Shutemov
				10	*
				11	* Copyright notices from the original cpuset code:
				12	* --------------------------------------------------
				13	* Copyright (C) 2003 BULL SA.
				14	* Copyright (C) 2004-2006 Silicon Graphics, Inc.
				15	*
				16	* Portions derived from Patrick Mochel's sysfs code.
				17	* sysfs is Copyright (c) 2001-3 Patrick Mochel
				18	*
				19	* 2003-10-10 Written by Simon Derr.
				20	* 2003-10-22 Updates by Stephen Hemminger.
				21	* 2004 May-July Rework by Paul Jackson.
				22	* ---------------------------------------------------
				23	*
				24	* This file is subject to the terms and conditions of the GNU General Public
				25	* License. See the file COPYING in the main directory of the Linux
				26	* distribution for more details.
				27	*/
				28
				29	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
				30
				31	#include "cgroup-internal.h"
				32
				33	#include <linux/cpu.h>
				34	#include <linux/cred.h>
				35	#include <linux/errno.h>
				36	#include <linux/init_task.h>
				37	#include <linux/kernel.h>
				38	#include <linux/magic.h>
				39	#include <linux/mutex.h>
				40	#include <linux/mount.h>
				41	#include <linux/pagemap.h>
				42	#include <linux/proc_fs.h>
				43	#include <linux/rcupdate.h>
				44	#include <linux/sched.h>
				45	#include <linux/sched/task.h>
				46	#include <linux/slab.h>
				47	#include <linux/spinlock.h>
				48	#include <linux/percpu-rwsem.h>
				49	#include <linux/string.h>
				50	#include <linux/hashtable.h>
				51	#include <linux/idr.h>
				52	#include <linux/kthread.h>
				53	#include <linux/atomic.h>
				54	#include <linux/cpuset.h>
				55	#include <linux/proc_ns.h>
				56	#include <linux/nsproxy.h>
				57	#include <linux/file.h>
				58	#include <linux/fs_parser.h>
				59	#include <linux/sched/cputime.h>
				60	#include <linux/psi.h>
				61	#include <net/sock.h>
				62
				63	#define CREATE_TRACE_POINTS
				64	#include <trace/events/cgroup.h>
				65
				66	#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
				67	MAX_CFTYPE_NAME + 2)
				68	/* let's not notify more than 100 times per second */
				69	#define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100)
				70
				71	/*
				72	* cgroup_mutex is the master lock. Any modification to cgroup or its
				73	* hierarchy must be performed while holding it.
				74	*
				75	* css_set_lock protects task->cgroups pointer, the list of css_set
				76	* objects, and the chain of tasks off each css_set.
				77	*
				78	* These locks are exported if CONFIG_PROVE_RCU so that accessors in
				79	* cgroup.h can use them for lockdep annotations.
				80	*/
				81	DEFINE_MUTEX(cgroup_mutex);
				82	DEFINE_SPINLOCK(css_set_lock);
				83
				84	#ifdef CONFIG_PROVE_RCU
				85	EXPORT_SYMBOL_GPL(cgroup_mutex);
				86	EXPORT_SYMBOL_GPL(css_set_lock);
				87	#endif
				88
				89	DEFINE_SPINLOCK(trace_cgroup_path_lock);
				90	char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
				91	bool cgroup_debug __read_mostly;
				92
				93	/*
				94	* Protects cgroup_idr and css_idr so that IDs can be released without
				95	* grabbing cgroup_mutex.
				96	*/
				97	static DEFINE_SPINLOCK(cgroup_idr_lock);
				98
				99	/*
				100	* Protects cgroup_file->kn for !self csses. It synchronizes notifications
				101	* against file removal/re-creation across css hiding.
				102	*/
				103	static DEFINE_SPINLOCK(cgroup_file_kn_lock);
				104
				105	DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
				106
					/*
					* Lockdep check that the caller holds either the RCU read lock or
					* cgroup_mutex; otherwise RCU_LOCKDEP_WARN() is handed the message below.
					* NOTE(review): the expansion already ends in ';', so a call site that
					* adds its own ';' produces an extra empty statement.
					*/
				107	#define cgroup_assert_mutex_or_rcu_locked() \
				108	RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
				109	!lockdep_is_held(&cgroup_mutex), \
				110	"cgroup_mutex or RCU read lock required");
				111
				112	/*
				113	* cgroup destruction makes heavy use of work items and there can be a lot
				114	* of concurrent destructions. Use a separate workqueue so that cgroup
				115	* destruction work items don't end up filling up max_active of system_wq
				116	* which may lead to deadlock.
				117	*/
				118	static struct workqueue_struct *cgroup_destroy_wq;
				119
				120	/* generate an array of cgroup subsystem pointers */
				121	#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
				122	struct cgroup_subsys *cgroup_subsys[] = {
				123	#include <linux/cgroup_subsys.h>
				124	};
				125	#undef SUBSYS
				126
				127	/* array of cgroup subsystem names */
				128	#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
				129	static const char *cgroup_subsys_name[] = {
				130	#include <linux/cgroup_subsys.h>
				131	};
				132	#undef SUBSYS
				133
				134	/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
				135	#define SUBSYS(_x) \
				136	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \
				137	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \
				138	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \
				139	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
				140	#include <linux/cgroup_subsys.h>
				141	#undef SUBSYS
				142
				143	#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
				144	static struct static_key_true *cgroup_subsys_enabled_key[] = {
				145	#include <linux/cgroup_subsys.h>
				146	};
				147	#undef SUBSYS
				148
				149	#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
				150	static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
				151	#include <linux/cgroup_subsys.h>
				152	};
				153	#undef SUBSYS
				154
				155	static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);
				156
				157	/*
				158	* The default hierarchy, reserved for the subsystems that are otherwise
				159	* unattached - it never has more than a single cgroup, and all tasks are
				160	* part of that cgroup.
				161	*/
				162	struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
				163	EXPORT_SYMBOL_GPL(cgrp_dfl_root);
				164
				165	/*
				166	* The default hierarchy always exists but is hidden until mounted for the
				167	* first time. This is for backward compatibility.
				168	*/
				169	static bool cgrp_dfl_visible;
				170
				171	/* some controllers are not supported in the default hierarchy */
				172	static u16 cgrp_dfl_inhibit_ss_mask;
				173
				174	/* some controllers are implicitly enabled on the default hierarchy */
				175	static u16 cgrp_dfl_implicit_ss_mask;
				176
				177	/* some controllers can be threaded on the default hierarchy */
				178	static u16 cgrp_dfl_threaded_ss_mask;
				179
				180	/* The list of hierarchy roots */
				181	LIST_HEAD(cgroup_roots);
				182	static int cgroup_root_count;
				183
				184	/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
				185	static DEFINE_IDR(cgroup_hierarchy_idr);
				186
				187	/*
				188	* Assign a monotonically increasing serial number to csses. It guarantees
				189	* cgroups with bigger numbers are newer than those with smaller numbers.
				190	* Also, as csses are always appended to the parent's ->children list, it
				191	* guarantees that sibling csses are always sorted in the ascending serial
				192	* number order on the list. Protected by cgroup_mutex.
				193	*/
				194	static u64 css_serial_nr_next = 1;
				195
				196	/*
				197	* These bitmasks identify subsystems with specific features to avoid
				198	* having to do iterative checks repeatedly.
				199	*/
				200	static u16 have_fork_callback __read_mostly;
				201	static u16 have_exit_callback __read_mostly;
				202	static u16 have_release_callback __read_mostly;
				203	static u16 have_canfork_callback __read_mostly;
				204
				205	/* cgroup namespace for init task */
				206	struct cgroup_namespace init_cgroup_ns = {
				207	.count = REFCOUNT_INIT(2),
				208	.user_ns = &init_user_ns,
				209	.ns.ops = &cgroupns_operations,
				210	.ns.inum = PROC_CGROUP_INIT_INO,
				211	.root_cset = &init_css_set,
				212	};
				213
				214	static struct file_system_type cgroup2_fs_type;
				215	static struct cftype cgroup_base_files[];
				216
				217	/* cgroup optional features */
				218	enum cgroup_opt_features {
				219	#ifdef CONFIG_PSI
				220	OPT_FEATURE_PRESSURE,
				221	#endif
				222	OPT_FEATURE_COUNT
				223	};
				224
				225	static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {
				226	#ifdef CONFIG_PSI
				227	"pressure",
				228	#endif
				229	};
				230
				231	static u16 cgroup_feature_disable_mask __read_mostly;
				232
				233	static int cgroup_apply_control(struct cgroup *cgrp);
				234	static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
				235	static void css_task_iter_skip(struct css_task_iter *it,
				236	struct task_struct *task);
				237	static int cgroup_destroy_locked(struct cgroup *cgrp);
				238	static struct cgroup_subsys_state css_create(struct cgroup cgrp,
				239	struct cgroup_subsys *ss);
				240	static void css_release(struct percpu_ref *ref);
				241	static void kill_css(struct cgroup_subsys_state *css);
				242	static int cgroup_addrm_files(struct cgroup_subsys_state *css,
				243	struct cgroup *cgrp, struct cftype cfts[],
				244	bool is_add);
				245
				246	/**
				247	* cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
				248	* @ssid: subsys ID of interest
				249	*
				250	* cgroup_subsys_enabled() can only be used with literal subsys names which
				251	* is fine for individual subsystems but unsuitable for cgroup core. This
				252	* is a slower, static_key_enabled()-based test indexed by @ssid.
					*
					* @ssid is used directly as an index into cgroup_subsys_enabled_key[] and
					* is not range-checked here.
					*
					* Return: %false when CGROUP_SUBSYS_COUNT is zero; otherwise the result of
					* static_key_enabled() on the enabled key for @ssid.
				253	*/
				254	bool cgroup_ssid_enabled(int ssid)
				255	{
				256	if (CGROUP_SUBSYS_COUNT == 0)
				257	return false;
				258
				259	return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
				260	}
				261
				262	/**
				263	* cgroup_on_dfl - test whether a cgroup is on the default hierarchy
				264	* @cgrp: the cgroup of interest
				265	*
				266	* The default hierarchy is the v2 interface of cgroup and this function
				267	* can be used to test whether a cgroup is on the default hierarchy for
				268	* cases where a subsystem should behave differently depending on the
				269	* interface version.
				270	*
				271	* The set of behaviors which change on the default hierarchy are still
				272	* being determined and the mount option is prefixed with __DEVEL__.
				273	*
				274	* List of changed behaviors:
				275	*
				276	* - Mount options "noprefix", "xattr", "clone_children", "release_agent"
				277	* and "name" are disallowed.
				278	*
				279	* - When mounting an existing superblock, mount options should match.
				280	*
				281	* - Remount is disallowed.
				282	*
				283	* - rename(2) is disallowed.
				284	*
				285	* - "tasks" is removed. Everything should be at process granularity. Use
				286	* "cgroup.procs" instead.
				287	*
				288	* - "cgroup.procs" is not sorted. pids will be unique unless they got
				289	* recycled in between reads.
				290	*
				291	* - "release_agent" and "notify_on_release" are removed. Replacement
				292	* notification mechanism will be implemented.
				293	*
				294	* - "cgroup.clone_children" is removed.
				295	*
				296	* - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup
				297	* and its descendants contain no task; otherwise, 1. The file also
				298	* generates kernfs notification which can be monitored through poll and
				299	* [di]notify when the value of the file changes.
				300	*
				301	* - cpuset: tasks will be kept in empty cpusets when hotplug happens and
				302	* take masks of ancestors with non-empty cpus/mems, instead of being
				303	* moved to an ancestor.
				304	*
				305	* - cpuset: a task can be moved into an empty cpuset, and again it takes
				306	* masks of ancestors.
				307	*
				308	* - memcg: use_hierarchy is on by default and the cgroup file for the flag
				309	* is not created.
				310	*
				311	* - blkcg: blk-throttle becomes properly hierarchical.
				312	*
				313	* - debug: disallowed on the default hierarchy.
				314	*/
				315	bool cgroup_on_dfl(const struct cgroup *cgrp)
				316	{
					/* on the default hierarchy iff @cgrp's root is cgrp_dfl_root */
				317	return cgrp->root == &cgrp_dfl_root;
				318	}
				319
				320	/* IDR wrappers which synchronize using cgroup_idr_lock */
					/*
					* Allocate an ID for @ptr from @idr; @start and @end are passed straight
					* through to idr_alloc(). Preloading is done with the caller's @gfp_mask
					* before cgroup_idr_lock is taken, and the allocation made under the lock
					* has __GFP_DIRECT_RECLAIM masked out. Returns idr_alloc()'s result.
					*/
				321	static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
				322	gfp_t gfp_mask)
				323	{
				324	int ret;
				325
				326	idr_preload(gfp_mask);
				327	spin_lock_bh(&cgroup_idr_lock);
				328	ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
				329	spin_unlock_bh(&cgroup_idr_lock);
				330	idr_preload_end();
				331	return ret;
				332	}
				333
				334	static void cgroup_idr_replace(struct idr idr, void *ptr, int id)
				335	{
				336	void *ret;
				337
				338	spin_lock_bh(&cgroup_idr_lock);
				339	ret = idr_replace(idr, ptr, id);
				340	spin_unlock_bh(&cgroup_idr_lock);
				341	return ret;
				342	}
				343
					/* Remove @id from @idr while holding cgroup_idr_lock. */
				344	static void cgroup_idr_remove(struct idr *idr, int id)
				345	{
				346	spin_lock_bh(&cgroup_idr_lock);
				347	idr_remove(idr, id);
				348	spin_unlock_bh(&cgroup_idr_lock);
				349	}
				350
					/* true when @cgrp->nr_populated_csets is non-zero */
				351	static bool cgroup_has_tasks(struct cgroup *cgrp)
				352	{
				353	return cgrp->nr_populated_csets;
				354	}
				355
					/* true when @cgrp->dom_cgrp points at a cgroup other than @cgrp itself */
				356	bool cgroup_is_threaded(struct cgroup *cgrp)
				357	{
				358	return cgrp->dom_cgrp != cgrp;
				359	}
				360
				361	/*
					* Can @cgrp host both domain and threaded children? Only a cgroup for
					* which cgroup_parent() returns NULL qualifies.
					*/
				362	static bool cgroup_is_mixable(struct cgroup *cgrp)
				363	{
				364	/*
				365	* Root isn't under domain level resource control exempting it from
				366	* the no-internal-process constraint, so it can serve as a thread
				367	* root and a parent of resource domains at the same time.
				368	*/
				369	return !cgroup_parent(cgrp);
				370	}
				371
				372	/*
					* Can @cgrp become a thread root? Should always be true for a thread root.
					*
					* A mixable cgroup passes unconditionally. Otherwise @cgrp must not be
					* threaded, must have no populated domain children, and must not have any
					* controller outside cgrp_dfl_threaded_ss_mask in its subtree_control.
					*/
				373	static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
				374	{
				375	/* mixables don't care */
				376	if (cgroup_is_mixable(cgrp))
				377	return true;
				378
				379	/* domain roots can't be nested under threaded */
				380	if (cgroup_is_threaded(cgrp))
				381	return false;
				382
				383	/* can only have either domain or threaded children */
				384	if (cgrp->nr_populated_domain_children)
				385	return false;
				386
				387	/* and no domain controllers can be enabled */
				388	if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
				389	return false;
				390
				391	return true;
				392	}
				393
				394	/*
					* Is @cgrp the root of a threaded subtree?
					*
					* True for a non-threaded cgroup which either has threaded children, or
					* has tasks together with at least one threaded controller enabled in its
					* subtree_control.
					*/
				395	bool cgroup_is_thread_root(struct cgroup *cgrp)
				396	{
				397	/* thread root should be a domain */
				398	if (cgroup_is_threaded(cgrp))
				399	return false;
				400
				401	/* a domain w/ threaded children is a thread root */
				402	if (cgrp->nr_threaded_children)
				403	return true;
				404
				405	/*
				406	* A domain which has tasks and explicit threaded controllers
				407	* enabled is a thread root.
				408	*/
				409	if (cgroup_has_tasks(cgrp) &&
				410	(cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
				411	return true;
				412
				413	return false;
				414	}
				415
				416	/*
					* A domain which isn't connected to the root w/o breakage can't be used.
					*
					* Returns false if @cgrp itself is threaded, or if any ancestor is either
					* threaded or a thread root that is not mixable; true otherwise.
					*/
				417	static bool cgroup_is_valid_domain(struct cgroup *cgrp)
				418	{
				419	/* the cgroup itself can be a thread root */
				420	if (cgroup_is_threaded(cgrp))
				421	return false;
				422
				423	/* but the ancestors can't be unless mixable */
				424	while ((cgrp = cgroup_parent(cgrp))) {
				425	if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
				426	return false;
				427	if (cgroup_is_threaded(cgrp))
				428	return false;
				429	}
				430
				431	return true;
				432	}
				433
				434	/*
					* Subsystems visibly enabled on a cgroup.
					*
					* For a cgroup with a parent this is the parent's subtree_control,
					* restricted to cgrp_dfl_threaded_ss_mask when @cgrp is threaded. For a
					* cgroup without a parent it is the root's subsys_mask; on the default
					* hierarchy the inhibited and implicitly enabled controllers are masked
					* out of it.
					*/
				435	static u16 cgroup_control(struct cgroup *cgrp)
				436	{
				437	struct cgroup *parent = cgroup_parent(cgrp);
				438	u16 root_ss_mask = cgrp->root->subsys_mask;
				439
				440	if (parent) {
				441	u16 ss_mask = parent->subtree_control;
				442
				443	/* threaded cgroups can only have threaded controllers */
				444	if (cgroup_is_threaded(cgrp))
				445	ss_mask &= cgrp_dfl_threaded_ss_mask;
				446	return ss_mask;
				447	}
				448
				449	if (cgroup_on_dfl(cgrp))
				450	root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
				451	cgrp_dfl_implicit_ss_mask);
				452	return root_ss_mask;
				453	}
				454
				455	/*
					* Subsystems enabled on a cgroup.
					*
					* For a cgroup with a parent this is the parent's subtree_ss_mask,
					* restricted to cgrp_dfl_threaded_ss_mask when @cgrp is threaded. For a
					* cgroup without a parent it is the root's subsys_mask, unfiltered.
					*/
				456	static u16 cgroup_ss_mask(struct cgroup *cgrp)
				457	{
				458	struct cgroup *parent = cgroup_parent(cgrp);
				459
				460	if (parent) {
				461	u16 ss_mask = parent->subtree_ss_mask;
				462
				463	/* threaded cgroups can only have threaded controllers */
				464	if (cgroup_is_threaded(cgrp))
				465	ss_mask &= cgrp_dfl_threaded_ss_mask;
				466	return ss_mask;
				467	}
				468
				469	return cgrp->root->subsys_mask;
				470	}
				471
				472	/**
				473	* cgroup_css - obtain a cgroup's css for the specified subsystem
				474	* @cgrp: the cgroup of interest
				475	* @ss: the subsystem of interest (%NULL returns @cgrp->self)
				476	*
				477	* Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
				478	* function must be called either under cgroup_mutex or rcu_read_lock() and
				479	* the caller is responsible for pinning the returned css if it wants to
				480	* keep accessing it outside the said locks. This function may return
				481	* %NULL if @cgrp doesn't have @ss enabled.
				482	*/
				483	static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
				484	struct cgroup_subsys *ss)
				485	{
				486	if (ss)
				487	return rcu_dereference_check(cgrp->subsys[ss->id],
				488	lockdep_is_held(&cgroup_mutex));
				489	else
				490	return &cgrp->self;
				491	}
				492
				493	/**
				494	* cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
				495	* @cgrp: the cgroup of interest
				496	* @ss: the subsystem of interest
				497	*
				498	* Find and get @cgrp's css associated with @ss. If the css doesn't exist
				499	* or is offline, %NULL is returned. The lookup and the tryget are done
					* inside an RCU read-side critical section taken by this function.
				500	*/
				501	static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
				502	struct cgroup_subsys *ss)
				503	{
				504	struct cgroup_subsys_state *css;
				505
				506	rcu_read_lock();
				507	css = cgroup_css(cgrp, ss);
				508	if (css && !css_tryget_online(css))
				509	css = NULL;
				510	rcu_read_unlock();
				511
				512	return css;
				513	}
				514
				515	/**
				516	* cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
				517	* @cgrp: the cgroup of interest
				518	* @ss: the subsystem of interest (%NULL returns @cgrp->self)
				519	*
				520	* Similar to cgroup_css() but returns the effective css, which is defined
				521	* as the matching css of the nearest ancestor including self which has @ss
				522	* enabled. If @ss is associated with the hierarchy @cgrp is on, this
				523	* function is guaranteed to return non-NULL css.
					*
					* Must be called with cgroup_mutex held. Returns %NULL when the walk
					* towards the root runs out of parents without finding @ss in any
					* cgroup_ss_mask().
				524	*/
				525	static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
				526	struct cgroup_subsys *ss)
				527	{
				528	lockdep_assert_held(&cgroup_mutex);
				529
				530	if (!ss)
				531	return &cgrp->self;
				532
				533	/*
				534	* This function is used while updating css associations and thus
				535	* can't test the csses directly. Test ss_mask.
				536	*/
				537	while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
				538	cgrp = cgroup_parent(cgrp);
				539	if (!cgrp)
				540	return NULL;
				541	}
				542
				543	return cgroup_css(cgrp, ss);
				544	}
				545
				546	/**
				547	* cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
				548	* @cgrp: the cgroup of interest
				549	* @ss: the subsystem of interest
				550	*
				551	* Find the effective css of @cgrp for @ss; no reference is taken. The
					* effective css is
				552	* defined as the matching css of the nearest ancestor including self which
				553	* has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
				554	* the root css is returned, so this function always returns a valid css.
				555	*
				556	* The returned css is not guaranteed to be online, and therefore it is the
				557	* caller's responsibility to tryget a reference for it.
				558	*/
				559	struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
				560	struct cgroup_subsys *ss)
				561	{
				562	struct cgroup_subsys_state *css;
				563
				564	do {
				565	css = cgroup_css(cgrp, ss);
				566
				567	if (css)
				568	return css;
				569	cgrp = cgroup_parent(cgrp);
				570	} while (cgrp);
				571
					/* no css up to the root: fall back to init_css_set's css for @ss */
				572	return init_css_set.subsys[ss->id];
				573	}
				574
				575	/**
				576	* cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
				577	* @cgrp: the cgroup of interest
				578	* @ss: the subsystem of interest
				579	*
				580	* Find and get the effective css of @cgrp for @ss. The effective css is
				581	* defined as the matching css of the nearest ancestor including self which
				582	* has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
				583	* the root css is returned, so this function always returns a valid css.
				584	* The returned css must be put using css_put().
				585	*/
				586	struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
				587	struct cgroup_subsys *ss)
				588	{
				589	struct cgroup_subsys_state *css;
				590
				591	rcu_read_lock();
				592
					/* walk towards the root until a css can be pinned while online */
				593	do {
				594	css = cgroup_css(cgrp, ss);
				595
				596	if (css && css_tryget_online(css))
				597	goto out_unlock;
				598	cgrp = cgroup_parent(cgrp);
				599	} while (cgrp);
				600
					/* nothing pinned: fall back to init_css_set's css and take a plain ref */
				601	css = init_css_set.subsys[ss->id];
				602	css_get(css);
				603	out_unlock:
				604	rcu_read_unlock();
				605	return css;
				606	}
				607
					/*
					* Take a reference on @cgrp's self css. Warns once if @cgrp is already
					* dead; the reference is taken regardless.
					*/
				608	static void cgroup_get_live(struct cgroup *cgrp)
				609	{
				610	WARN_ON_ONCE(cgroup_is_dead(cgrp));
				611	css_get(&cgrp->self);
				612	}
				613
				614	/**
				615	* __cgroup_task_count - count the number of tasks in a cgroup. The caller
				616	* is responsible for taking the css_set_lock.
				617	* @cgrp: the cgroup in question
					*
					* Return: the sum of ->nr_tasks over every css_set linked on
					* @cgrp->cset_links.
				618	*/
				619	int __cgroup_task_count(const struct cgroup *cgrp)
				620	{
				621	int count = 0;
				622	struct cgrp_cset_link *link;
				623
				624	lockdep_assert_held(&css_set_lock);
				625
				626	list_for_each_entry(link, &cgrp->cset_links, cset_link)
				627	count += link->cset->nr_tasks;
				628
				629	return count;
				630	}
				631
				632	/**
				633	* cgroup_task_count - count the number of tasks in a cgroup.
				634	* @cgrp: the cgroup in question
					*
					* Wrapper around __cgroup_task_count() which takes css_set_lock with
					* spin_lock_irq() for the duration of the count.
					*
					* Return: the value computed by __cgroup_task_count().
				635	*/
				636	int cgroup_task_count(const struct cgroup *cgrp)
				637	{
				638	int count;
				639
				640	spin_lock_irq(&css_set_lock);
				641	count = __cgroup_task_count(cgrp);
				642	spin_unlock_irq(&css_set_lock);
				643
				644	return count;
				645	}
				646
				647	struct cgroup_subsys_state of_css(struct kernfs_open_file of)
				648	{
				649	struct cgroup *cgrp = of->kn->parent->priv;
				650	struct cftype *cft = of_cft(of);
				651
				652	/*
				653	* This is open and unprotected implementation of cgroup_css().
				654	* seq_css() is only called from a kernfs file operation which has
				655	* an active reference on the file. Because all the subsystem
				656	* files are drained before a css is disassociated with a cgroup,
				657	* the matching css from the cgroup's subsys table is guaranteed to
				658	* be and stay valid until the enclosing operation is complete.
				659	*/
				660	if (cft->ss)
				661	return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
				662	else
				663	return &cgrp->self;
				664	}
				665	EXPORT_SYMBOL_GPL(of_css);
				666
				667	/**
				668	* for_each_css - iterate all css's of a cgroup
				669	* @css: the iteration cursor
				670	* @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
				671	* @cgrp: the target cgroup to iterate css's of
				672	*
				673	* Should be called under cgroup_mutex (the dereference is lockdep-checked
					* against it). Subsystems for which @cgrp->subsys[] holds NULL are
					* skipped; the statement following the macro runs only for non-NULL @css.
				674	*/
				675	#define for_each_css(css, ssid, cgrp) \
				676	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
				677	if (!((css) = rcu_dereference_check( \
				678	(cgrp)->subsys[(ssid)], \
				679	lockdep_is_held(&cgroup_mutex)))) { } \
				680	else
				681
				682	/**
				683	* for_each_e_css - iterate all effective css's of a cgroup
				684	* @css: the iteration cursor
				685	* @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
				686	* @cgrp: the target cgroup to iterate css's of
				687	*
				688	* Should be called under cgroup_mutex (cgroup_e_css_by_mask() asserts it).
					* Subsystems for which cgroup_e_css_by_mask() returns NULL are skipped.
				689	*/
				690	#define for_each_e_css(css, ssid, cgrp) \
				691	for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
				692	if (!((css) = cgroup_e_css_by_mask(cgrp, \
				693	cgroup_subsys[(ssid)]))) \
				694	; \
				695	else
				696
				697	/**
				698	* do_each_subsys_mask - filter for_each_subsys with a bitmask
				699	* @ss: the iteration cursor
				700	* @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
				701	* @ss_mask: the bitmask
				702	*
				703	* The block will only run for cases where the ssid-th bit (1 << ssid) of
				704	* @ss_mask is set.
					*
					* @ss_mask is evaluated once into a local copy. The macro opens a
					* do { ... } while construct plus two braces which must be closed by a
					* matching while_each_subsys_mask().
				705	*/
				706	#define do_each_subsys_mask(ss, ssid, ss_mask) do { \
				707	unsigned long __ss_mask = (ss_mask); \
				708	if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \
				709	(ssid) = 0; \
				710	break; \
				711	} \
				712	for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \
				713	(ss) = cgroup_subsys[ssid]; \
				714	{
				715
					/* closes the block, loop and do-while opened by do_each_subsys_mask() */
				716	#define while_each_subsys_mask() \
				717	} \
				718	} \
				719	} while (false)
				720
				721	/*
					* Iterate over child cgrps; lock should be held throughout iteration.
					* Each step asserts that cgroup_mutex is held and skips any @child for
					* which cgroup_is_dead() is true.
					*/
				722	#define cgroup_for_each_live_child(child, cgrp) \
				723	list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
				724	if (({ lockdep_assert_held(&cgroup_mutex); \
				725	cgroup_is_dead(child); })) \
				726	; \
				727	else
				728
				729	/*
					* Walk live descendants in preorder. @d_css is the css cursor and @dsct
					* is set to its cgroup on each step; dead cgroups are skipped and
					* cgroup_mutex is asserted to be held.
					*/
				730	#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \
				731	css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \
				732	if (({ lockdep_assert_held(&cgroup_mutex); \
				733	(dsct) = (d_css)->cgroup; \
				734	cgroup_is_dead(dsct); })) \
				735	; \
				736	else
				737
				738	/*
					* Walk live descendants in postorder. @d_css is the css cursor and @dsct
					* is set to its cgroup on each step; dead cgroups are skipped and
					* cgroup_mutex is asserted to be held.
					*/
				739	#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \
				740	css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
				741	if (({ lockdep_assert_held(&cgroup_mutex); \
				742	(dsct) = (d_css)->cgroup; \
				743	cgroup_is_dead(dsct); })) \
				744	; \
				745	else
				746
				747	/*
				748	* The default css_set - used by init and its children prior to any
				749	* hierarchies being mounted. It contains a pointer to the root state
				750	* for each subsystem. Also used to anchor the list of css_sets. Not
				751	* reference-counted, to improve performance when child cgroups
				752	* haven't been created.
				753	*/
				754	struct css_set init_css_set = {
				755	.refcount = REFCOUNT_INIT(1),
					/* dom_cset points back at init_css_set itself */
				756	.dom_cset = &init_css_set,
					/* every list head below starts out empty (self-linked) */
				757	.tasks = LIST_HEAD_INIT(init_css_set.tasks),
				758	.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
				759	.dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks),
				760	.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
				761	.threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
				762	.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
				763	.mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node),
				764	.mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node),
				765	.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
				766
				767	/*
				768	* The following field is re-initialized when this cset gets linked
				769	* in cgroup_init(). However, let's initialize the field
				770	* statically too so that the default cgroup can be accessed safely
				771	* early during boot.
				772	*/
				773	.dfl_cgrp = &cgrp_dfl_root.cgrp,
				774	};
				775
				776	static int css_set_count = 1; /* 1 for init_css_set */
				777
					/* true when @cset->dom_cset points at a css_set other than @cset itself */
				778	static bool css_set_threaded(struct css_set *cset)
				779	{
				780	return cset->dom_cset != cset;
				781	}
				782
				783	/**
				784	* css_set_populated - does a css_set contain any tasks?
				785	* @cset: target css_set
				786	*
				787	* css_set_populated() should be the same as !!cset->nr_tasks at steady
				788	* state. However, css_set_populated() can be called while a task is being
				789	* added to or removed from the linked list before the nr_tasks is
				790	* properly updated. Hence, we can't just look at ->nr_tasks here.
					*
					* Must be called with css_set_lock held.
					*
					* Return: %true if either @cset->tasks or @cset->mg_tasks is non-empty.
				791	*/
				792	static bool css_set_populated(struct css_set *cset)
				793	{
				794	lockdep_assert_held(&css_set_lock);
				795
				796	return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
				797	}
				798
				799	/**
				800	* cgroup_update_populated - update the populated count of a cgroup
				801	* @cgrp: the target cgroup
				802	* @populated: inc or dec populated count
				803	*
				804	* One of the css_sets associated with @cgrp is either getting its first
				805	* task or losing the last. Update @cgrp->nr_populated_* accordingly. The
				806	* count is propagated towards root so that a given cgroup's
				807	* nr_populated_children is zero iff none of its descendants contain any
				808	* tasks.
				809	*
				810	* @cgrp's interface file "cgroup.populated" is zero if both
				811	* @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and
				812	* 1 otherwise. When the sum changes from or to zero, userland is notified
				813	* that the content of the interface file has changed. This can be used to
				814	* detect when @cgrp and its descendants become populated or empty.
					*
					* NOTE(review): the code below keeps separate
					* nr_populated_domain_children and nr_populated_threaded_children
					* counters and notifies through @cgrp->events_file; the counter and file
					* names used in the paragraphs above may be stale - confirm.
					*
					* Must be called with css_set_lock held.
				815	*/
				816	static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
				817	{
				818	struct cgroup *child = NULL;
				819	int adj = populated ? 1 : -1;
				820
				821	lockdep_assert_held(&css_set_lock);
				822
				823	do {
				824	bool was_populated = cgroup_is_populated(cgrp);
				825
					/* first pass adjusts @cgrp itself, later passes adjust ancestors */
				826	if (!child) {
				827	cgrp->nr_populated_csets += adj;
				828	} else {
				829	if (cgroup_is_threaded(child))
				830	cgrp->nr_populated_threaded_children += adj;
				831	else
				832	cgrp->nr_populated_domain_children += adj;
				833	}
				834
					/* stop propagating once this level's populated state did not flip */
				835	if (was_populated == cgroup_is_populated(cgrp))
				836	break;
				837
				838	cgroup1_check_for_release(cgrp);
				839	TRACE_CGROUP_PATH(notify_populated, cgrp,
				840	cgroup_is_populated(cgrp));
				841	cgroup_file_notify(&cgrp->events_file);
				842
				843	child = cgrp;
				844	cgrp = cgroup_parent(cgrp);
				845	} while (cgrp);
				846	}
				847
				848	/**
				849	* css_set_update_populated - update populated state of a css_set
				850	* @cset: target css_set
				851	* @populated: whether @cset is populated or depopulated
				852	*
				853	* @cset is either getting the first task or losing the last. Update the
				854	* populated counters of all associated cgroups accordingly, i.e. call
					* cgroup_update_populated() for every cgroup linked on @cset->cgrp_links.
					*
					* Must be called with css_set_lock held.
				855	*/
				856	static void css_set_update_populated(struct css_set *cset, bool populated)
				857	{
				858	struct cgrp_cset_link *link;
				859
				860	lockdep_assert_held(&css_set_lock);
				861
				862	list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
				863	cgroup_update_populated(link->cgrp, populated);
				864	}
				865
				866	/*
				867	* @task is leaving, advance task iterators which are pointing to it so
				868	* that they can resume at the next position. Advancing an iterator might
				869	* remove it from the list, use safe walk. See css_task_iter_skip() for
				870	* details.
					*
					* Every iterator linked on @cset->task_iters is passed to
					* css_task_iter_skip() together with @task.
				871	*/
				872	static void css_set_skip_task_iters(struct css_set *cset,
				873	struct task_struct *task)
				874	{
				875	struct css_task_iter *it, *pos;
				876
				877	list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
				878	css_task_iter_skip(it, task);
				879	}
				880
				881	/**
				882	* css_set_move_task - move a task from one css_set to another
				883	* @task: task being moved
				884	* @from_cset: css_set @task currently belongs to (may be NULL)
				885	* @to_cset: new css_set @task is being moved to (may be NULL)
				886	* @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
				887	*
				888	* Move @task from @from_cset to @to_cset. If @task didn't belong to any
				889	* css_set, @from_cset can be NULL. If @task is being disassociated
				890	* instead of moved, @to_cset can be NULL.
				891	*
				892	* This function automatically handles populated counter updates and
				893	* css_task_iter adjustments but the caller is responsible for managing
				894	* @from_cset and @to_cset's reference counts.
				895	*/
				896	static void css_set_move_task(struct task_struct *task,
				897	struct css_set from_cset, struct css_set to_cset,
				898	bool use_mg_tasks)
				899	{
				900	lockdep_assert_held(&css_set_lock);
				901
				902	if (to_cset && !css_set_populated(to_cset))
				903	css_set_update_populated(to_cset, true);
				904
				905	if (from_cset) {
				906	WARN_ON_ONCE(list_empty(&task->cg_list));
				907
				908	css_set_skip_task_iters(from_cset, task);
				909	list_del_init(&task->cg_list);
				910	if (!css_set_populated(from_cset))
				911	css_set_update_populated(from_cset, false);
				912	} else {
				913	WARN_ON_ONCE(!list_empty(&task->cg_list));
				914	}
				915
				916	if (to_cset) {
				917	/*
				918	* We are synchronized through cgroup_threadgroup_rwsem
				919	* against PF_EXITING setting such that we can't race
				920	* against cgroup_exit() changing the css_set to
				921	* init_css_set and dropping the old one.
				922	*/
				923	WARN_ON_ONCE(task->flags & PF_EXITING);
				924
				925	cgroup_move_task(task, to_cset);
				926	list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
				927	&to_cset->tasks);
				928	}
				929	}
				930
				931	/*
				932	* hash table for cgroup groups. This improves the performance to find
				933	* an existing css_set. This hash doesn't (currently) take into
				934	* account cgroups in empty hierarchies.
				935	*/
				936	#define CSS_SET_HASH_BITS 7
				937	static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
				938
				939	static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
				940	{
				941	unsigned long key = 0UL;
				942	struct cgroup_subsys *ss;
				943	int i;
				944
				945	for_each_subsys(ss, i)
				946	key += (unsigned long)css[i];
				947	key = (key >> 16) ^ key;
				948
				949	return key;
				950	}
				951
				952	void put_css_set_locked(struct css_set *cset)
				953	{
				954	struct cgrp_cset_link link, tmp_link;
				955	struct cgroup_subsys *ss;
				956	int ssid;
				957
				958	lockdep_assert_held(&css_set_lock);
				959
				960	if (!refcount_dec_and_test(&cset->refcount))
				961	return;
				962
				963	WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
				964
				965	/* This css_set is dead. unlink it and release cgroup and css refs */
				966	for_each_subsys(ss, ssid) {
				967	list_del(&cset->e_cset_node[ssid]);
				968	css_put(cset->subsys[ssid]);
				969	}
				970	hash_del(&cset->hlist);
				971	css_set_count--;
				972
				973	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
				974	list_del(&link->cset_link);
				975	list_del(&link->cgrp_link);
				976	if (cgroup_parent(link->cgrp))
				977	cgroup_put(link->cgrp);
				978	kfree(link);
				979	}
				980
				981	if (css_set_threaded(cset)) {
				982	list_del(&cset->threaded_csets_node);
				983	put_css_set_locked(cset->dom_cset);
				984	}
				985
				986	kfree_rcu(cset, rcu_head);
				987	}
				988
				989	/**
				990	* compare_css_sets - helper function for find_existing_css_set().
				991	* @cset: candidate css_set being tested
				992	* @old_cset: existing css_set for a task
				993	* @new_cgrp: cgroup that's being entered by the task
				994	* @template: desired set of css pointers in css_set (pre-calculated)
				995	*
				996	* Returns true if "cset" matches "old_cset" except for the hierarchy
				997	* which "new_cgrp" belongs to, for which it should match "new_cgrp".
				998	*/
				999	static bool compare_css_sets(struct css_set *cset,
				1000	struct css_set *old_cset,
				1001	struct cgroup *new_cgrp,
				1002	struct cgroup_subsys_state *template[])
				1003	{
				1004	struct cgroup *new_dfl_cgrp;
				1005	struct list_head l1, l2;
				1006
				1007	/*
				1008	* On the default hierarchy, there can be csets which are
				1009	* associated with the same set of cgroups but different csses.
				1010	* Let's first ensure that csses match.
				1011	*/
				1012	if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
				1013	return false;
				1014
				1015
				1016	/* @cset's domain should match the default cgroup's */
				1017	if (cgroup_on_dfl(new_cgrp))
				1018	new_dfl_cgrp = new_cgrp;
				1019	else
				1020	new_dfl_cgrp = old_cset->dfl_cgrp;
				1021
				1022	if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
				1023	return false;
				1024
				1025	/*
				1026	* Compare cgroup pointers in order to distinguish between
				1027	* different cgroups in hierarchies. As different cgroups may
				1028	* share the same effective css, this comparison is always
				1029	* necessary.
				1030	*/
				1031	l1 = &cset->cgrp_links;
				1032	l2 = &old_cset->cgrp_links;
				1033	while (1) {
				1034	struct cgrp_cset_link link1, link2;
				1035	struct cgroup cgrp1, cgrp2;
				1036
				1037	l1 = l1->next;
				1038	l2 = l2->next;
				1039	/* See if we reached the end - both lists are equal length. */
				1040	if (l1 == &cset->cgrp_links) {
				1041	BUG_ON(l2 != &old_cset->cgrp_links);
				1042	break;
				1043	} else {
				1044	BUG_ON(l2 == &old_cset->cgrp_links);
				1045	}
				1046	/* Locate the cgroups associated with these links. */
				1047	link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
				1048	link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
				1049	cgrp1 = link1->cgrp;
				1050	cgrp2 = link2->cgrp;
				1051	/* Hierarchies should be linked in the same order. */
				1052	BUG_ON(cgrp1->root != cgrp2->root);
				1053
				1054	/*
				1055	* If this hierarchy is the hierarchy of the cgroup
				1056	* that's changing, then we need to check that this
				1057	* css_set points to the new cgroup; if it's any other
				1058	* hierarchy, then this css_set should point to the
				1059	* same cgroup as the old css_set.
				1060	*/
				1061	if (cgrp1->root == new_cgrp->root) {
				1062	if (cgrp1 != new_cgrp)
				1063	return false;
				1064	} else {
				1065	if (cgrp1 != cgrp2)
				1066	return false;
				1067	}
				1068	}
				1069	return true;
				1070	}
				1071
				1072	/**
				1073	* find_existing_css_set - init css array and find the matching css_set
				1074	* @old_cset: the css_set that we're using before the cgroup transition
				1075	* @cgrp: the cgroup that we're moving into
				1076	* @template: out param for the new set of csses, should be clear on entry
				1077	*/
				1078	static struct css_set find_existing_css_set(struct css_set old_cset,
				1079	struct cgroup *cgrp,
				1080	struct cgroup_subsys_state *template[])
				1081	{
				1082	struct cgroup_root *root = cgrp->root;
				1083	struct cgroup_subsys *ss;
				1084	struct css_set *cset;
				1085	unsigned long key;
				1086	int i;
				1087
				1088	/*
				1089	* Build the set of subsystem state objects that we want to see in the
				1090	* new css_set. while subsystems can change globally, the entries here
				1091	* won't change, so no need for locking.
				1092	*/
				1093	for_each_subsys(ss, i) {
				1094	if (root->subsys_mask & (1UL << i)) {
				1095	/*
				1096	* @ss is in this hierarchy, so we want the
				1097	* effective css from @cgrp.
				1098	*/
				1099	template[i] = cgroup_e_css_by_mask(cgrp, ss);
				1100	} else {
				1101	/*
				1102	* @ss is not in this hierarchy, so we don't want
				1103	* to change the css.
				1104	*/
				1105	template[i] = old_cset->subsys[i];
				1106	}
				1107	}
				1108
				1109	key = css_set_hash(template);
				1110	hash_for_each_possible(css_set_table, cset, hlist, key) {
				1111	if (!compare_css_sets(cset, old_cset, cgrp, template))
				1112	continue;
				1113
				1114	/* This css_set matches what we need */
				1115	return cset;
				1116	}
				1117
				1118	/* No existing cgroup group matched */
				1119	return NULL;
				1120	}
				1121
				1122	static void free_cgrp_cset_links(struct list_head *links_to_free)
				1123	{
				1124	struct cgrp_cset_link link, tmp_link;
				1125
				1126	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
				1127	list_del(&link->cset_link);
				1128	kfree(link);
				1129	}
				1130	}
				1131
				1132	/**
				1133	* allocate_cgrp_cset_links - allocate cgrp_cset_links
				1134	* @count: the number of links to allocate
				1135	* @tmp_links: list_head the allocated links are put on
				1136	*
				1137	* Allocate @count cgrp_cset_link structures and chain them on @tmp_links
				1138	* through ->cset_link. Returns 0 on success or -errno.
				1139	*/
				1140	static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
				1141	{
				1142	struct cgrp_cset_link *link;
				1143	int i;
				1144
				1145	INIT_LIST_HEAD(tmp_links);
				1146
				1147	for (i = 0; i < count; i++) {
				1148	link = kzalloc(sizeof(*link), GFP_KERNEL);
				1149	if (!link) {
				1150	free_cgrp_cset_links(tmp_links);
				1151	return -ENOMEM;
				1152	}
				1153	list_add(&link->cset_link, tmp_links);
				1154	}
				1155	return 0;
				1156	}
				1157
				1158	/**
				1159	* link_css_set - a helper function to link a css_set to a cgroup
				1160	* @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
				1161	* @cset: the css_set to be linked
				1162	* @cgrp: the destination cgroup
				1163	*/
				1164	static void link_css_set(struct list_head tmp_links, struct css_set cset,
				1165	struct cgroup *cgrp)
				1166	{
				1167	struct cgrp_cset_link *link;
				1168
				1169	BUG_ON(list_empty(tmp_links));
				1170
				1171	if (cgroup_on_dfl(cgrp))
				1172	cset->dfl_cgrp = cgrp;
				1173
				1174	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
				1175	link->cset = cset;
				1176	link->cgrp = cgrp;
				1177
				1178	/*
				1179	* Always add links to the tail of the lists so that the lists are
				1180	* in choronological order.
				1181	*/
				1182	list_move_tail(&link->cset_link, &cgrp->cset_links);
				1183	list_add_tail(&link->cgrp_link, &cset->cgrp_links);
				1184
				1185	if (cgroup_parent(cgrp))
				1186	cgroup_get_live(cgrp);
				1187	}
				1188
				1189	/**
				1190	* find_css_set - return a new css_set with one cgroup updated
				1191	* @old_cset: the baseline css_set
				1192	* @cgrp: the cgroup to be updated
				1193	*
				1194	* Return a new css_set that's equivalent to @old_cset, but with @cgrp
				1195	* substituted into the appropriate hierarchy.
				1196	*/
				1197	static struct css_set find_css_set(struct css_set old_cset,
				1198	struct cgroup *cgrp)
				1199	{
				1200	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
				1201	struct css_set *cset;
				1202	struct list_head tmp_links;
				1203	struct cgrp_cset_link *link;
				1204	struct cgroup_subsys *ss;
				1205	unsigned long key;
				1206	int ssid;
				1207
				1208	lockdep_assert_held(&cgroup_mutex);
				1209
				1210	/* First see if we already have a cgroup group that matches
				1211	* the desired set */
				1212	spin_lock_irq(&css_set_lock);
				1213	cset = find_existing_css_set(old_cset, cgrp, template);
				1214	if (cset)
				1215	get_css_set(cset);
				1216	spin_unlock_irq(&css_set_lock);
				1217
				1218	if (cset)
				1219	return cset;
				1220
				1221	cset = kzalloc(sizeof(*cset), GFP_KERNEL);
				1222	if (!cset)
				1223	return NULL;
				1224
				1225	/* Allocate all the cgrp_cset_link objects that we'll need */
				1226	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
				1227	kfree(cset);
				1228	return NULL;
				1229	}
				1230
				1231	refcount_set(&cset->refcount, 1);
				1232	cset->dom_cset = cset;
				1233	INIT_LIST_HEAD(&cset->tasks);
				1234	INIT_LIST_HEAD(&cset->mg_tasks);
				1235	INIT_LIST_HEAD(&cset->dying_tasks);
				1236	INIT_LIST_HEAD(&cset->task_iters);
				1237	INIT_LIST_HEAD(&cset->threaded_csets);
				1238	INIT_HLIST_NODE(&cset->hlist);
				1239	INIT_LIST_HEAD(&cset->cgrp_links);
				1240	INIT_LIST_HEAD(&cset->mg_src_preload_node);
				1241	INIT_LIST_HEAD(&cset->mg_dst_preload_node);
				1242	INIT_LIST_HEAD(&cset->mg_node);
				1243
				1244	/* Copy the set of subsystem state objects generated in
				1245	* find_existing_css_set() */
				1246	memcpy(cset->subsys, template, sizeof(cset->subsys));
				1247
				1248	spin_lock_irq(&css_set_lock);
				1249	/* Add reference counts and links from the new css_set. */
				1250	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
				1251	struct cgroup *c = link->cgrp;
				1252
				1253	if (c->root == cgrp->root)
				1254	c = cgrp;
				1255	link_css_set(&tmp_links, cset, c);
				1256	}
				1257
				1258	BUG_ON(!list_empty(&tmp_links));
				1259
				1260	css_set_count++;
				1261
				1262	/* Add @cset to the hash table */
				1263	key = css_set_hash(cset->subsys);
				1264	hash_add(css_set_table, &cset->hlist, key);
				1265
				1266	for_each_subsys(ss, ssid) {
				1267	struct cgroup_subsys_state *css = cset->subsys[ssid];
				1268
				1269	list_add_tail(&cset->e_cset_node[ssid],
				1270	&css->cgroup->e_csets[ssid]);
				1271	css_get(css);
				1272	}
				1273
				1274	spin_unlock_irq(&css_set_lock);
				1275
				1276	/*
				1277	* If @cset should be threaded, look up the matching dom_cset and
				1278	* link them up. We first fully initialize @cset then look for the
				1279	* dom_cset. It's simpler this way and safe as @cset is guaranteed
				1280	* to stay empty until we return.
				1281	*/
				1282	if (cgroup_is_threaded(cset->dfl_cgrp)) {
				1283	struct css_set *dcset;
				1284
				1285	dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
				1286	if (!dcset) {
				1287	put_css_set(cset);
				1288	return NULL;
				1289	}
				1290
				1291	spin_lock_irq(&css_set_lock);
				1292	cset->dom_cset = dcset;
				1293	list_add_tail(&cset->threaded_csets_node,
				1294	&dcset->threaded_csets);
				1295	spin_unlock_irq(&css_set_lock);
				1296	}
				1297
				1298	return cset;
				1299	}
				1300
				1301	struct cgroup_root cgroup_root_from_kf(struct kernfs_root kf_root)
				1302	{
				1303	struct cgroup *root_cgrp = kf_root->kn->priv;
				1304
				1305	return root_cgrp->root;
				1306	}
				1307
				1308	static int cgroup_init_root_id(struct cgroup_root *root)
				1309	{
				1310	int id;
				1311
				1312	lockdep_assert_held(&cgroup_mutex);
				1313
				1314	id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
				1315	if (id < 0)
				1316	return id;
				1317
				1318	root->hierarchy_id = id;
				1319	return 0;
				1320	}
				1321
				1322	static void cgroup_exit_root_id(struct cgroup_root *root)
				1323	{
				1324	lockdep_assert_held(&cgroup_mutex);
				1325
				1326	idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
				1327	}
				1328
				1329	void cgroup_free_root(struct cgroup_root *root)
				1330	{
				1331	if (root) {
				1332	idr_destroy(&root->cgroup_idr);
				1333	kfree_rcu(root, rcu);
				1334	}
				1335	}
				1336
				1337	static void cgroup_destroy_root(struct cgroup_root *root)
				1338	{
				1339	struct cgroup *cgrp = &root->cgrp;
				1340	struct cgrp_cset_link link, tmp_link;
				1341
				1342	trace_cgroup_destroy_root(root);
				1343
				1344	cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
				1345
				1346	BUG_ON(atomic_read(&root->nr_cgrps));
				1347	BUG_ON(!list_empty(&cgrp->self.children));
				1348
				1349	/* Rebind all subsystems back to the default hierarchy */
				1350	WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));
				1351
				1352	/*
				1353	* Release all the links from cset_links to this hierarchy's
				1354	* root cgroup
				1355	*/
				1356	spin_lock_irq(&css_set_lock);
				1357
				1358	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
				1359	list_del(&link->cset_link);
				1360	list_del(&link->cgrp_link);
				1361	kfree(link);
				1362	}
				1363
				1364	spin_unlock_irq(&css_set_lock);
				1365
				1366	if (!list_empty(&root->root_list)) {
				1367	list_del_rcu(&root->root_list);
				1368	cgroup_root_count--;
				1369	}
				1370
				1371	cgroup_exit_root_id(root);
				1372
				1373	mutex_unlock(&cgroup_mutex);
				1374
				1375	kernfs_destroy_root(root->kf_root);
				1376	cgroup_free_root(root);
				1377	}
				1378
				1379	/*
				1380	* look up cgroup associated with current task's cgroup namespace on the
				1381	* specified hierarchy
				1382	*/
				1383	static struct cgroup *
				1384	current_cgns_cgroup_from_root(struct cgroup_root *root)
				1385	{
				1386	struct cgroup *res = NULL;
				1387	struct css_set *cset;
				1388
				1389	lockdep_assert_held(&css_set_lock);
				1390
				1391	rcu_read_lock();
				1392
				1393	cset = current->nsproxy->cgroup_ns->root_cset;
				1394	if (cset == &init_css_set) {
				1395	res = &root->cgrp;
				1396	} else {
				1397	struct cgrp_cset_link *link;
				1398
				1399	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
				1400	struct cgroup *c = link->cgrp;
				1401
				1402	if (c->root == root) {
				1403	res = c;
				1404	break;
				1405	}
				1406	}
				1407	}
				1408	rcu_read_unlock();
				1409
				1410	BUG_ON(!res);
				1411	return res;
				1412	}
				1413
				1414	/* look up cgroup associated with given css_set on the specified hierarchy */
				1415	static struct cgroup cset_cgroup_from_root(struct css_set cset,
				1416	struct cgroup_root *root)
				1417	{
				1418	struct cgroup *res = NULL;
				1419
				1420	lockdep_assert_held(&css_set_lock);
				1421
				1422	if (cset == &init_css_set) {
				1423	res = &root->cgrp;
				1424	} else if (root == &cgrp_dfl_root) {
				1425	res = cset->dfl_cgrp;
				1426	} else {
				1427	struct cgrp_cset_link *link;
				1428
				1429	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
				1430	struct cgroup *c = link->cgrp;
				1431
				1432	if (c->root == root) {
				1433	res = c;
				1434	break;
				1435	}
				1436	}
				1437	}
				1438
				1439	/*
				1440	* If cgroup_mutex is not held, the cgrp_cset_link will be freed
				1441	* before we remove the cgroup root from the root_list. Consequently,
				1442	* when accessing a cgroup root, the cset_link may have already been
				1443	* freed, resulting in a NULL res_cgroup. However, by holding the
				1444	* cgroup_mutex, we ensure that res_cgroup can't be NULL.
				1445	* If we don't hold cgroup_mutex in the caller, we must do the NULL
				1446	* check.
				1447	*/
				1448	return res;
				1449	}
				1450
				1451	/*
				1452	* Return the cgroup for "task" from the given hierarchy. Must be
				1453	* called with css_set_lock held to prevent task's groups from being modified.
				1454	* Must be called with either cgroup_mutex or rcu read lock to prevent the
				1455	* cgroup root from being destroyed.
				1456	*/
				1457	struct cgroup task_cgroup_from_root(struct task_struct task,
				1458	struct cgroup_root *root)
				1459	{
				1460	/*
				1461	* No need to lock the task - since we hold cgroup_mutex the
				1462	* task can't change groups, so the only thing that can happen
				1463	* is that it exits and its css is set back to init_css_set.
				1464	*/
				1465	return cset_cgroup_from_root(task_css_set(task), root);
				1466	}
				1467
				1468	/*
				1469	* A task must hold cgroup_mutex to modify cgroups.
				1470	*
				1471	* Any task can increment and decrement the count field without lock.
				1472	* So in general, code holding cgroup_mutex can't rely on the count
				1473	* field not changing. However, if the count goes to zero, then only
				1474	* cgroup_attach_task() can increment it again. Because a count of zero
				1475	* means that no tasks are currently attached, therefore there is no
				1476	* way a task attached to that cgroup can fork (the other way to
				1477	* increment the count). So code holding cgroup_mutex can safely
				1478	* assume that if the count is zero, it will stay zero. Similarly, if
				1479	* a task holds cgroup_mutex on a cgroup with zero count, it
				1480	* knows that the cgroup won't be removed, as cgroup_rmdir()
				1481	* needs that mutex.
				1482	*
				1483	* A cgroup can only be deleted if both its 'count' of using tasks
				1484	* is zero, and its list of 'children' cgroups is empty. Since all
				1485	* tasks in the system use _some_ cgroup, and since there is always at
				1486	* least one task in the system (init, pid == 1), therefore, root cgroup
				1487	* always has either children cgroups and/or using tasks. So we don't
				1488	* need a special hack to ensure that root cgroup cannot be deleted.
				1489	*
				1490	* P.S. One more locking exception. RCU is used to guard the
				1491	* update of a task's cgroup pointer by cgroup_attach_task().
				1492	*/
				1493
				1494	static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
				1495
				1496	static char cgroup_file_name(struct cgroup cgrp, const struct cftype *cft,
				1497	char *buf)
				1498	{
				1499	struct cgroup_subsys *ss = cft->ss;
				1500
				1501	if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
				1502	!(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
				1503	const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";
				1504
				1505	snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
				1506	dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
				1507	cft->name);
				1508	} else {
				1509	strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
				1510	}
				1511	return buf;
				1512	}
				1513
				1514	/**
				1515	* cgroup_file_mode - deduce file mode of a control file
				1516	* @cft: the control file in question
				1517	*
				1518	* S_IRUGO for read, S_IWUSR for write.
				1519	*/
				1520	static umode_t cgroup_file_mode(const struct cftype *cft)
				1521	{
				1522	umode_t mode = 0;
				1523
				1524	if (cft->read_u64 \|\| cft->read_s64 \|\| cft->seq_show)
				1525	mode \|= S_IRUGO;
				1526
				1527	if (cft->write_u64 \|\| cft->write_s64 \|\| cft->write) {
				1528	if (cft->flags & CFTYPE_WORLD_WRITABLE)
				1529	mode \|= S_IWUGO;
				1530	else
				1531	mode \|= S_IWUSR;
				1532	}
				1533
				1534	return mode;
				1535	}
				1536
				1537	/**
				1538	* cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
				1539	* @subtree_control: the new subtree_control mask to consider
				1540	* @this_ss_mask: available subsystems
				1541	*
				1542	* On the default hierarchy, a subsystem may request other subsystems to be
				1543	* enabled together through its ->depends_on mask. In such cases, more
				1544	* subsystems than specified in "cgroup.subtree_control" may be enabled.
				1545	*
				1546	* This function calculates which subsystems need to be enabled if
				1547	* @subtree_control is to be applied while restricted to @this_ss_mask.
				1548	*/
				1549	static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
				1550	{
				1551	u16 cur_ss_mask = subtree_control;
				1552	struct cgroup_subsys *ss;
				1553	int ssid;
				1554
				1555	lockdep_assert_held(&cgroup_mutex);
				1556
				1557	cur_ss_mask \|= cgrp_dfl_implicit_ss_mask;
				1558
				1559	while (true) {
				1560	u16 new_ss_mask = cur_ss_mask;
				1561
				1562	do_each_subsys_mask(ss, ssid, cur_ss_mask) {
				1563	new_ss_mask \|= ss->depends_on;
				1564	} while_each_subsys_mask();
				1565
				1566	/*
				1567	* Mask out subsystems which aren't available. This can
				1568	* happen only if some depended-upon subsystems were bound
				1569	* to non-default hierarchies.
				1570	*/
				1571	new_ss_mask &= this_ss_mask;
				1572
				1573	if (new_ss_mask == cur_ss_mask)
				1574	break;
				1575	cur_ss_mask = new_ss_mask;
				1576	}
				1577
				1578	return cur_ss_mask;
				1579	}
				1580
				1581	/**
				1582	* cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
				1583	* @kn: the kernfs_node being serviced
				1584	*
				1585	* This helper undoes cgroup_kn_lock_live() and should be invoked before
				1586	* the method finishes if locking succeeded. Note that once this function
				1587	* returns the cgroup returned by cgroup_kn_lock_live() may become
				1588	* inaccessible any time. If the caller intends to continue to access the
				1589	* cgroup, it should pin it before invoking this function.
				1590	*/
				1591	void cgroup_kn_unlock(struct kernfs_node *kn)
				1592	{
				1593	struct cgroup *cgrp;
				1594
				1595	if (kernfs_type(kn) == KERNFS_DIR)
				1596	cgrp = kn->priv;
				1597	else
				1598	cgrp = kn->parent->priv;
				1599
				1600	mutex_unlock(&cgroup_mutex);
				1601
				1602	kernfs_unbreak_active_protection(kn);
				1603	cgroup_put(cgrp);
				1604	}
				1605
				1606	/**
				1607	* cgroup_kn_lock_live - locking helper for cgroup kernfs methods
				1608	* @kn: the kernfs_node being serviced
				1609	* @drain_offline: perform offline draining on the cgroup
				1610	*
				1611	* This helper is to be used by a cgroup kernfs method currently servicing
				1612	* @kn. It breaks the active protection, performs cgroup locking and
				1613	* verifies that the associated cgroup is alive. Returns the cgroup if
				1614	* alive; otherwise, %NULL. A successful return should be undone by a
				1615	* matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the
				1616	* cgroup is drained of offlining csses before return.
				1617	*
				1618	* Any cgroup kernfs method implementation which requires locking the
				1619	* associated cgroup should use this helper. It avoids nesting cgroup
				1620	* locking under kernfs active protection and allows all kernfs operations
				1621	* including self-removal.
				1622	*/
				1623	struct cgroup cgroup_kn_lock_live(struct kernfs_node kn, bool drain_offline)
				1624	{
				1625	struct cgroup *cgrp;
				1626
				1627	if (kernfs_type(kn) == KERNFS_DIR)
				1628	cgrp = kn->priv;
				1629	else
				1630	cgrp = kn->parent->priv;
				1631
				1632	/*
				1633	* We're gonna grab cgroup_mutex which nests outside kernfs
				1634	* active_ref. cgroup liveliness check alone provides enough
				1635	* protection against removal. Ensure @cgrp stays accessible and
				1636	* break the active_ref protection.
				1637	*/
				1638	if (!cgroup_tryget(cgrp))
				1639	return NULL;
				1640	kernfs_break_active_protection(kn);
				1641
				1642	if (drain_offline)
				1643	cgroup_lock_and_drain_offline(cgrp);
				1644	else
				1645	mutex_lock(&cgroup_mutex);
				1646
				1647	if (!cgroup_is_dead(cgrp))
				1648	return cgrp;
				1649
				1650	cgroup_kn_unlock(kn);
				1651	return NULL;
				1652	}
				1653
				1654	static void cgroup_rm_file(struct cgroup cgrp, const struct cftype cft)
				1655	{
				1656	char name[CGROUP_FILE_NAME_MAX];
				1657
				1658	lockdep_assert_held(&cgroup_mutex);
				1659
				1660	if (cft->file_offset) {
				1661	struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
				1662	struct cgroup_file cfile = (void )css + cft->file_offset;
				1663
				1664	spin_lock_irq(&cgroup_file_kn_lock);
				1665	cfile->kn = NULL;
				1666	spin_unlock_irq(&cgroup_file_kn_lock);
				1667
				1668	del_timer_sync(&cfile->notify_timer);
				1669	}
				1670
				1671	kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
				1672	}
				1673
				1674	/**
				1675	* css_clear_dir - remove subsys files in a cgroup directory
				1676	* @css: taget css
				1677	*/
				1678	static void css_clear_dir(struct cgroup_subsys_state *css)
				1679	{
				1680	struct cgroup *cgrp = css->cgroup;
				1681	struct cftype *cfts;
				1682
				1683	if (!(css->flags & CSS_VISIBLE))
				1684	return;
				1685
				1686	css->flags &= ~CSS_VISIBLE;
				1687
				1688	if (!css->ss) {
				1689	if (cgroup_on_dfl(cgrp))
				1690	cfts = cgroup_base_files;
				1691	else
				1692	cfts = cgroup1_base_files;
				1693
				1694	cgroup_addrm_files(css, cgrp, cfts, false);
				1695	} else {
				1696	list_for_each_entry(cfts, &css->ss->cfts, node)
				1697	cgroup_addrm_files(css, cgrp, cfts, false);
				1698	}
				1699	}
				1700
				1701	/**
				1702	* css_populate_dir - create subsys files in a cgroup directory
				1703	* @css: target css
				1704	*
				1705	* On failure, no file is added.
				1706	*/
				1707	static int css_populate_dir(struct cgroup_subsys_state *css)
				1708	{
				1709	struct cgroup *cgrp = css->cgroup;
				1710	struct cftype cfts, failed_cfts;
				1711	int ret;
				1712
				1713	if ((css->flags & CSS_VISIBLE) \|\| !cgrp->kn)
				1714	return 0;
				1715
				1716	if (!css->ss) {
				1717	if (cgroup_on_dfl(cgrp))
				1718	cfts = cgroup_base_files;
				1719	else
				1720	cfts = cgroup1_base_files;
				1721
				1722	ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
				1723	if (ret < 0)
				1724	return ret;
				1725	} else {
				1726	list_for_each_entry(cfts, &css->ss->cfts, node) {
				1727	ret = cgroup_addrm_files(css, cgrp, cfts, true);
				1728	if (ret < 0) {
				1729	failed_cfts = cfts;
				1730	goto err;
				1731	}
				1732	}
				1733	}
				1734
				1735	css->flags \|= CSS_VISIBLE;
				1736
				1737	return 0;
				1738	err:
				1739	list_for_each_entry(cfts, &css->ss->cfts, node) {
				1740	if (cfts == failed_cfts)
				1741	break;
				1742	cgroup_addrm_files(css, cgrp, cfts, false);
				1743	}
				1744	return ret;
				1745	}
				1746
				1747	int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
				1748	{
				1749	struct cgroup *dcgrp = &dst_root->cgrp;
				1750	struct cgroup_subsys *ss;
				1751	int ssid, ret;
				1752	u16 dfl_disable_ss_mask = 0;
				1753
				1754	lockdep_assert_held(&cgroup_mutex);
				1755
				1756	do_each_subsys_mask(ss, ssid, ss_mask) {
				1757	/*
				1758	* If @ss has non-root csses attached to it, can't move.
				1759	* If @ss is an implicit controller, it is exempt from this
				1760	* rule and can be stolen.
				1761	*/
				1762	if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
				1763	!ss->implicit_on_dfl)
				1764	return -EBUSY;
				1765
				1766	/* can't move between two non-dummy roots either */
				1767	if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
				1768	return -EBUSY;
				1769
				1770	/*
				1771	* Collect ssid's that need to be disabled from default
				1772	* hierarchy.
				1773	*/
				1774	if (ss->root == &cgrp_dfl_root)
				1775	dfl_disable_ss_mask \|= 1 << ssid;
				1776
				1777	} while_each_subsys_mask();
				1778
				1779	if (dfl_disable_ss_mask) {
				1780	struct cgroup *scgrp = &cgrp_dfl_root.cgrp;
				1781
				1782	/*
				1783	* Controllers from default hierarchy that need to be rebound
				1784	* are all disabled together in one go.
				1785	*/
				1786	cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask;
				1787	WARN_ON(cgroup_apply_control(scgrp));
				1788	cgroup_finalize_control(scgrp, 0);
				1789	}
				1790
				1791	do_each_subsys_mask(ss, ssid, ss_mask) {
				1792	struct cgroup_root *src_root = ss->root;
				1793	struct cgroup *scgrp = &src_root->cgrp;
				1794	struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
				1795	struct css_set cset, cset_pos;
				1796	struct css_task_iter *it;
				1797
				1798	WARN_ON(!css \|\| cgroup_css(dcgrp, ss));
				1799
				1800	if (src_root != &cgrp_dfl_root) {
				1801	/* disable from the source */
				1802	src_root->subsys_mask &= ~(1 << ssid);
				1803	WARN_ON(cgroup_apply_control(scgrp));
				1804	cgroup_finalize_control(scgrp, 0);
				1805	}
				1806
				1807	/* rebind */
				1808	RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
				1809	rcu_assign_pointer(dcgrp->subsys[ssid], css);
				1810	ss->root = dst_root;
				1811
				1812	spin_lock_irq(&css_set_lock);
				1813	css->cgroup = dcgrp;
				1814	WARN_ON(!list_empty(&dcgrp->e_csets[ss->id]));
				1815	list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id],
				1816	e_cset_node[ss->id]) {
				1817	list_move_tail(&cset->e_cset_node[ss->id],
				1818	&dcgrp->e_csets[ss->id]);
				1819	/*
				1820	* all css_sets of scgrp together in same order to dcgrp,
				1821	* patch in-flight iterators to preserve correct iteration.
				1822	* since the iterator is always advanced right away and
				1823	* finished when it->cset_pos meets it->cset_head, so only
				1824	* update it->cset_head is enough here.
				1825	*/
				1826	list_for_each_entry(it, &cset->task_iters, iters_node)
				1827	if (it->cset_head == &scgrp->e_csets[ss->id])
				1828	it->cset_head = &dcgrp->e_csets[ss->id];
				1829	}
				1830	spin_unlock_irq(&css_set_lock);
				1831
				1832	/* default hierarchy doesn't enable controllers by default */
				1833	dst_root->subsys_mask \|= 1 << ssid;
				1834	if (dst_root == &cgrp_dfl_root) {
				1835	static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
				1836	} else {
				1837	dcgrp->subtree_control \|= 1 << ssid;
				1838	static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
				1839	}
				1840
				1841	ret = cgroup_apply_control(dcgrp);
				1842	if (ret)
				1843	pr_warn("partial failure to rebind %s controller (err=%d)\n",
				1844	ss->name, ret);
				1845
				1846	if (ss->bind)
				1847	ss->bind(css);
				1848	} while_each_subsys_mask();
				1849
				1850	kernfs_activate(dcgrp->kn);
				1851	return 0;
				1852	}
				1853
				1854	int cgroup_show_path(struct seq_file sf, struct kernfs_node kf_node,
				1855	struct kernfs_root *kf_root)
				1856	{
				1857	int len = 0;
				1858	char *buf = NULL;
				1859	struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
				1860	struct cgroup *ns_cgroup;
				1861
				1862	buf = kmalloc(PATH_MAX, GFP_KERNEL);
				1863	if (!buf)
				1864	return -ENOMEM;
				1865
				1866	spin_lock_irq(&css_set_lock);
				1867	ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
				1868	len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
				1869	spin_unlock_irq(&css_set_lock);
				1870
				1871	if (len >= PATH_MAX)
				1872	len = -ERANGE;
				1873	else if (len > 0) {
				1874	seq_escape(sf, buf, " \t\n\\");
				1875	len = 0;
				1876	}
				1877	kfree(buf);
				1878	return len;
				1879	}
				1880
				1881	enum cgroup2_param {
				1882	Opt_nsdelegate,
				1883	Opt_memory_localevents,
				1884	nr__cgroup2_params
				1885	};
				1886
				1887	static const struct fs_parameter_spec cgroup2_param_specs[] = {
				1888	fsparam_flag("nsdelegate", Opt_nsdelegate),
				1889	fsparam_flag("memory_localevents", Opt_memory_localevents),
				1890	{}
				1891	};
				1892
				1893	static const struct fs_parameter_description cgroup2_fs_parameters = {
				1894	.name = "cgroup2",
				1895	.specs = cgroup2_param_specs,
				1896	};
				1897
				1898	static int cgroup2_parse_param(struct fs_context fc, struct fs_parameter param)
				1899	{
				1900	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
				1901	struct fs_parse_result result;
				1902	int opt;
				1903
				1904	opt = fs_parse(fc, &cgroup2_fs_parameters, param, &result);
				1905	if (opt < 0)
				1906	return opt;
				1907
				1908	switch (opt) {
				1909	case Opt_nsdelegate:
				1910	ctx->flags \|= CGRP_ROOT_NS_DELEGATE;
				1911	return 0;
				1912	case Opt_memory_localevents:
				1913	ctx->flags \|= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
				1914	return 0;
				1915	}
				1916	return -EINVAL;
				1917	}
				1918
				1919	static void apply_cgroup_root_flags(unsigned int root_flags)
				1920	{
				1921	if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
				1922	if (root_flags & CGRP_ROOT_NS_DELEGATE)
				1923	cgrp_dfl_root.flags \|= CGRP_ROOT_NS_DELEGATE;
				1924	else
				1925	cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
				1926
				1927	if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
				1928	cgrp_dfl_root.flags \|= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
				1929	else
				1930	cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
				1931	}
				1932	}
				1933
				1934	static int cgroup_show_options(struct seq_file seq, struct kernfs_root kf_root)
				1935	{
				1936	if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
				1937	seq_puts(seq, ",nsdelegate");
				1938	if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
				1939	seq_puts(seq, ",memory_localevents");
				1940	return 0;
				1941	}
				1942
				1943	static int cgroup_reconfigure(struct fs_context *fc)
				1944	{
				1945	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
				1946
				1947	apply_cgroup_root_flags(ctx->flags);
				1948	return 0;
				1949	}
				1950
				1951	/*
				1952	* To reduce the fork() overhead for systems that are not actually using
				1953	* their cgroups capability, we don't maintain the lists running through
				1954	* each css_set to its tasks until we see the list actually used - in other
				1955	* words after the first mount.
				1956	*/
				1957	static bool use_task_css_set_links __read_mostly;
				1958
				1959	void cgroup_enable_task_cg_lists(void)
				1960	{
				1961	struct task_struct p, g;
				1962
				1963	/*
				1964	* We need tasklist_lock because RCU is not safe against
				1965	* while_each_thread(). Besides, a forking task that has passed
				1966	* cgroup_post_fork() without seeing use_task_css_set_links = 1
				1967	* is not guaranteed to have its child immediately visible in the
				1968	* tasklist if we walk through it with RCU.
				1969	*/
				1970	read_lock(&tasklist_lock);
				1971	spin_lock_irq(&css_set_lock);
				1972
				1973	if (use_task_css_set_links)
				1974	goto out_unlock;
				1975
				1976	use_task_css_set_links = true;
				1977
				1978	do_each_thread(g, p) {
				1979	WARN_ON_ONCE(!list_empty(&p->cg_list) \|\|
				1980	task_css_set(p) != &init_css_set);
				1981
				1982	/*
				1983	* We should check if the process is exiting, otherwise
				1984	* it will race with cgroup_exit() in that the list
				1985	* entry won't be deleted though the process has exited.
				1986	* Do it while holding siglock so that we don't end up
				1987	* racing against cgroup_exit().
				1988	*
				1989	* Interrupts were already disabled while acquiring
				1990	* the css_set_lock, so we do not need to disable it
				1991	* again when acquiring the sighand->siglock here.
				1992	*/
				1993	spin_lock(&p->sighand->siglock);
				1994	if (!(p->flags & PF_EXITING)) {
				1995	struct css_set *cset = task_css_set(p);
				1996
				1997	if (!css_set_populated(cset))
				1998	css_set_update_populated(cset, true);
				1999	list_add_tail(&p->cg_list, &cset->tasks);
				2000	get_css_set(cset);
				2001	cset->nr_tasks++;
				2002	}
				2003	spin_unlock(&p->sighand->siglock);
				2004	} while_each_thread(g, p);
				2005	out_unlock:
				2006	spin_unlock_irq(&css_set_lock);
				2007	read_unlock(&tasklist_lock);
				2008	}
				2009
				2010	static void init_cgroup_housekeeping(struct cgroup *cgrp)
				2011	{
				2012	struct cgroup_subsys *ss;
				2013	int ssid;
				2014
				2015	INIT_LIST_HEAD(&cgrp->self.sibling);
				2016	INIT_LIST_HEAD(&cgrp->self.children);
				2017	INIT_LIST_HEAD(&cgrp->cset_links);
				2018	INIT_LIST_HEAD(&cgrp->pidlists);
				2019	mutex_init(&cgrp->pidlist_mutex);
				2020	cgrp->self.cgroup = cgrp;
				2021	cgrp->self.flags \|= CSS_ONLINE;
				2022	cgrp->dom_cgrp = cgrp;
				2023	cgrp->max_descendants = INT_MAX;
				2024	cgrp->max_depth = INT_MAX;
				2025	INIT_LIST_HEAD(&cgrp->rstat_css_list);
				2026	prev_cputime_init(&cgrp->prev_cputime);
				2027
				2028	for_each_subsys(ss, ssid)
				2029	INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
				2030
				2031	init_waitqueue_head(&cgrp->offline_waitq);
				2032	INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
				2033	}
				2034
				2035	void init_cgroup_root(struct cgroup_fs_context *ctx)
				2036	{
				2037	struct cgroup_root *root = ctx->root;
				2038	struct cgroup *cgrp = &root->cgrp;
				2039
				2040	INIT_LIST_HEAD_RCU(&root->root_list);
				2041	atomic_set(&root->nr_cgrps, 1);
				2042	cgrp->root = root;
				2043	init_cgroup_housekeeping(cgrp);
				2044	idr_init(&root->cgroup_idr);
				2045
				2046	root->flags = ctx->flags;
				2047	if (ctx->release_agent)
				2048	strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
				2049	if (ctx->name)
				2050	strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);
				2051	if (ctx->cpuset_clone_children)
				2052	set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
				2053	}
				2054
				2055	int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
				2056	{
				2057	LIST_HEAD(tmp_links);
				2058	struct cgroup *root_cgrp = &root->cgrp;
				2059	struct kernfs_syscall_ops *kf_sops;
				2060	struct css_set *cset;
				2061	int i, ret;
				2062
				2063	lockdep_assert_held(&cgroup_mutex);
				2064
				2065	ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
				2066	if (ret < 0)
				2067	goto out;
				2068	root_cgrp->id = ret;
				2069	root_cgrp->ancestor_ids[0] = ret;
				2070
				2071	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
				2072	0, GFP_KERNEL);
				2073	if (ret)
				2074	goto out;
				2075
				2076	/*
				2077	* We're accessing css_set_count without locking css_set_lock here,
				2078	* but that's OK - it can only be increased by someone holding
				2079	* cgroup_lock, and that's us. Later rebinding may disable
				2080	* controllers on the default hierarchy and thus create new csets,
				2081	* which can't be more than the existing ones. Allocate 2x.
				2082	*/
				2083	ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
				2084	if (ret)
				2085	goto cancel_ref;
				2086
				2087	ret = cgroup_init_root_id(root);
				2088	if (ret)
				2089	goto cancel_ref;
				2090
				2091	kf_sops = root == &cgrp_dfl_root ?
				2092	&cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;
				2093
				2094	root->kf_root = kernfs_create_root(kf_sops,
				2095	KERNFS_ROOT_CREATE_DEACTIVATED \|
				2096	KERNFS_ROOT_SUPPORT_EXPORTOP,
				2097	root_cgrp);
				2098	if (IS_ERR(root->kf_root)) {
				2099	ret = PTR_ERR(root->kf_root);
				2100	goto exit_root_id;
				2101	}
				2102	root_cgrp->kn = root->kf_root->kn;
				2103
				2104	ret = css_populate_dir(&root_cgrp->self);
				2105	if (ret)
				2106	goto destroy_root;
				2107
				2108	ret = rebind_subsystems(root, ss_mask);
				2109	if (ret)
				2110	goto destroy_root;
				2111
				2112	ret = cgroup_bpf_inherit(root_cgrp);
				2113	WARN_ON_ONCE(ret);
				2114
				2115	trace_cgroup_setup_root(root);
				2116
				2117	/*
				2118	* There must be no failure case after here, since rebinding takes
				2119	* care of subsystems' refcounts, which are explicitly dropped in
				2120	* the failure exit path.
				2121	*/
				2122	list_add_rcu(&root->root_list, &cgroup_roots);
				2123	cgroup_root_count++;
				2124
				2125	/*
				2126	* Link the root cgroup in this hierarchy into all the css_set
				2127	* objects.
				2128	*/
				2129	spin_lock_irq(&css_set_lock);
				2130	hash_for_each(css_set_table, i, cset, hlist) {
				2131	link_css_set(&tmp_links, cset, root_cgrp);
				2132	if (css_set_populated(cset))
				2133	cgroup_update_populated(root_cgrp, true);
				2134	}
				2135	spin_unlock_irq(&css_set_lock);
				2136
				2137	BUG_ON(!list_empty(&root_cgrp->self.children));
				2138	BUG_ON(atomic_read(&root->nr_cgrps) != 1);
				2139
				2140	kernfs_activate(root_cgrp->kn);
				2141	ret = 0;
				2142	goto out;
				2143
				2144	destroy_root:
				2145	kernfs_destroy_root(root->kf_root);
				2146	root->kf_root = NULL;
				2147	exit_root_id:
				2148	cgroup_exit_root_id(root);
				2149	cancel_ref:
				2150	percpu_ref_exit(&root_cgrp->self.refcnt);
				2151	out:
				2152	free_cgrp_cset_links(&tmp_links);
				2153	return ret;
				2154	}
				2155
				2156	int cgroup_do_get_tree(struct fs_context *fc)
				2157	{
				2158	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
				2159	int ret;
				2160
				2161	ctx->kfc.root = ctx->root->kf_root;
				2162	if (fc->fs_type == &cgroup2_fs_type)
				2163	ctx->kfc.magic = CGROUP2_SUPER_MAGIC;
				2164	else
				2165	ctx->kfc.magic = CGROUP_SUPER_MAGIC;
				2166	ret = kernfs_get_tree(fc);
				2167
				2168	/*
				2169	* In non-init cgroup namespace, instead of root cgroup's dentry,
				2170	* we return the dentry corresponding to the cgroupns->root_cgrp.
				2171	*/
				2172	if (!ret && ctx->ns != &init_cgroup_ns) {
				2173	struct dentry *nsdentry;
				2174	struct super_block *sb = fc->root->d_sb;
				2175	struct cgroup *cgrp;
				2176
				2177	mutex_lock(&cgroup_mutex);
				2178	spin_lock_irq(&css_set_lock);
				2179
				2180	cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);
				2181
				2182	spin_unlock_irq(&css_set_lock);
				2183	mutex_unlock(&cgroup_mutex);
				2184
				2185	nsdentry = kernfs_node_dentry(cgrp->kn, sb);
				2186	dput(fc->root);
				2187	if (IS_ERR(nsdentry)) {
				2188	deactivate_locked_super(sb);
				2189	ret = PTR_ERR(nsdentry);
				2190	nsdentry = NULL;
				2191	}
				2192	fc->root = nsdentry;
				2193	}
				2194
				2195	if (!ctx->kfc.new_sb_created)
				2196	cgroup_put(&ctx->root->cgrp);
				2197
				2198	return ret;
				2199	}
				2200
				2201	/*
				2202	* Destroy a cgroup filesystem context.
				2203	*/
				2204	static void cgroup_fs_context_free(struct fs_context *fc)
				2205	{
				2206	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
				2207
				2208	kfree(ctx->name);
				2209	kfree(ctx->release_agent);
				2210	put_cgroup_ns(ctx->ns);
				2211	kernfs_free_fs_context(fc);
				2212	kfree(ctx);
				2213	}
				2214
				2215	static int cgroup_get_tree(struct fs_context *fc)
				2216	{
				2217	struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
				2218	int ret;
				2219
				2220	cgrp_dfl_visible = true;
				2221	cgroup_get_live(&cgrp_dfl_root.cgrp);
				2222	ctx->root = &cgrp_dfl_root;
				2223
				2224	ret = cgroup_do_get_tree(fc);
				2225	if (!ret)
				2226	apply_cgroup_root_flags(ctx->flags);
				2227	return ret;
				2228	}
				2229
				2230	static const struct fs_context_operations cgroup_fs_context_ops = {
				2231	.free = cgroup_fs_context_free,
				2232	.parse_param = cgroup2_parse_param,
				2233	.get_tree = cgroup_get_tree,
				2234	.reconfigure = cgroup_reconfigure,
				2235	};
				2236
				2237	static const struct fs_context_operations cgroup1_fs_context_ops = {
				2238	.free = cgroup_fs_context_free,
				2239	.parse_param = cgroup1_parse_param,
				2240	.get_tree = cgroup1_get_tree,
				2241	.reconfigure = cgroup1_reconfigure,
				2242	};
				2243
				2244	/*
				2245	* Initialise the cgroup filesystem creation/reconfiguration context. Notably,
				2246	* we select the namespace we're going to use.
				2247	*/
				2248	static int cgroup_init_fs_context(struct fs_context *fc)
				2249	{
				2250	struct cgroup_fs_context *ctx;
				2251
				2252	ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL);
				2253	if (!ctx)
				2254	return -ENOMEM;
				2255
				2256	/*
				2257	* The first time anyone tries to mount a cgroup, enable the list
				2258	* linking each css_set to its tasks and fix up all existing tasks.
				2259	*/
				2260	if (!use_task_css_set_links)
				2261	cgroup_enable_task_cg_lists();
				2262
				2263	ctx->ns = current->nsproxy->cgroup_ns;
				2264	get_cgroup_ns(ctx->ns);
				2265	fc->fs_private = &ctx->kfc;
				2266	if (fc->fs_type == &cgroup2_fs_type)
				2267	fc->ops = &cgroup_fs_context_ops;
				2268	else
				2269	fc->ops = &cgroup1_fs_context_ops;
				2270	put_user_ns(fc->user_ns);
				2271	fc->user_ns = get_user_ns(ctx->ns->user_ns);
				2272	fc->global = true;
				2273	return 0;
				2274	}
				2275
				2276	static void cgroup_kill_sb(struct super_block *sb)
				2277	{
				2278	struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
				2279	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
				2280
				2281	/*
				2282	* If @root doesn't have any children, start killing it.
				2283	* This prevents new mounts by disabling percpu_ref_tryget_live().
				2284	* cgroup_mount() may wait for @root's release.
				2285	*
				2286	* And don't kill the default root.
				2287	*/
				2288	if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
				2289	!percpu_ref_is_dying(&root->cgrp.self.refcnt))
				2290	percpu_ref_kill(&root->cgrp.self.refcnt);
				2291	cgroup_put(&root->cgrp);
				2292	kernfs_kill_sb(sb);
				2293	}
				2294
				2295	struct file_system_type cgroup_fs_type = {
				2296	.name = "cgroup",
				2297	.init_fs_context = cgroup_init_fs_context,
				2298	.parameters = &cgroup1_fs_parameters,
				2299	.kill_sb = cgroup_kill_sb,
				2300	.fs_flags = FS_USERNS_MOUNT,
				2301	};
				2302
				2303	static struct file_system_type cgroup2_fs_type = {
				2304	.name = "cgroup2",
				2305	.init_fs_context = cgroup_init_fs_context,
				2306	.parameters = &cgroup2_fs_parameters,
				2307	.kill_sb = cgroup_kill_sb,
				2308	.fs_flags = FS_USERNS_MOUNT,
				2309	};
				2310
				2311	#ifdef CONFIG_CPUSETS
				2312	static const struct fs_context_operations cpuset_fs_context_ops = {
				2313	.get_tree = cgroup1_get_tree,
				2314	.free = cgroup_fs_context_free,
				2315	};
				2316
				2317	/*
				2318	* This is ugly, but preserves the userspace API for existing cpuset
				2319	* users. If someone tries to mount the "cpuset" filesystem, we
				2320	* silently switch it to mount "cgroup" instead
				2321	*/
				2322	static int cpuset_init_fs_context(struct fs_context *fc)
				2323	{
				2324	char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER);
				2325	struct cgroup_fs_context *ctx;
				2326	int err;
				2327
				2328	err = cgroup_init_fs_context(fc);
				2329	if (err) {
				2330	kfree(agent);
				2331	return err;
				2332	}
				2333
				2334	fc->ops = &cpuset_fs_context_ops;
				2335
				2336	ctx = cgroup_fc2context(fc);
				2337	ctx->subsys_mask = 1 << cpuset_cgrp_id;
				2338	ctx->flags \|= CGRP_ROOT_NOPREFIX;
				2339	ctx->release_agent = agent;
				2340
				2341	get_filesystem(&cgroup_fs_type);
				2342	put_filesystem(fc->fs_type);
				2343	fc->fs_type = &cgroup_fs_type;
				2344
				2345	return 0;
				2346	}
				2347
				2348	static struct file_system_type cpuset_fs_type = {
				2349	.name = "cpuset",
				2350	.init_fs_context = cpuset_init_fs_context,
				2351	.fs_flags = FS_USERNS_MOUNT,
				2352	};
				2353	#endif
				2354
				2355	int cgroup_path_ns_locked(struct cgroup cgrp, char buf, size_t buflen,
				2356	struct cgroup_namespace *ns)
				2357	{
				2358	struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
				2359
				2360	return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
				2361	}
				2362
				2363	int cgroup_path_ns(struct cgroup cgrp, char buf, size_t buflen,
				2364	struct cgroup_namespace *ns)
				2365	{
				2366	int ret;
				2367
				2368	mutex_lock(&cgroup_mutex);
				2369	spin_lock_irq(&css_set_lock);
				2370
				2371	ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
				2372
				2373	spin_unlock_irq(&css_set_lock);
				2374	mutex_unlock(&cgroup_mutex);
				2375
				2376	return ret;
				2377	}
				2378	EXPORT_SYMBOL_GPL(cgroup_path_ns);
				2379
				2380	/**
				2381	* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
				2382	* @task: target task
				2383	* @buf: the buffer to write the path into
				2384	* @buflen: the length of the buffer
				2385	*
				2386	* Determine @task's cgroup on the first (the one with the lowest non-zero
				2387	* hierarchy_id) cgroup hierarchy and copy its path into @buf. This
				2388	* function grabs cgroup_mutex and shouldn't be used inside locks used by
				2389	* cgroup controller callbacks.
				2390	*
				2391	* Return value is the same as kernfs_path().
				2392	*/
				2393	int task_cgroup_path(struct task_struct task, char buf, size_t buflen)
				2394	{
				2395	struct cgroup_root *root;
				2396	struct cgroup *cgrp;
				2397	int hierarchy_id = 1;
				2398	int ret;
				2399
				2400	mutex_lock(&cgroup_mutex);
				2401	spin_lock_irq(&css_set_lock);
				2402
				2403	root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
				2404
				2405	if (root) {
				2406	cgrp = task_cgroup_from_root(task, root);
				2407	ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
				2408	} else {
				2409	/* if no hierarchy exists, everyone is in "/" */
				2410	ret = strlcpy(buf, "/", buflen);
				2411	}
				2412
				2413	spin_unlock_irq(&css_set_lock);
				2414	mutex_unlock(&cgroup_mutex);
				2415	return ret;
				2416	}
				2417	EXPORT_SYMBOL_GPL(task_cgroup_path);
				2418
				2419	/**
				2420	* cgroup_attach_lock - Lock for ->attach()
				2421	* @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
				2422	*
				2423	* cgroup migration sometimes needs to stabilize threadgroups against forks and
				2424	* exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
				2425	* implementations (e.g. cpuset), also need to disable CPU hotplug.
				2426	* Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
				2427	* lead to deadlocks.
				2428	*
				2429	* Bringing up a CPU may involve creating and destroying tasks which requires
				2430	* read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
				2431	* cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
				2432	* write-locking threadgroup_rwsem, the locking order is reversed and we end up
				2433	* waiting for an on-going CPU hotplug operation which in turn is waiting for
				2434	* the threadgroup_rwsem to be released to create new tasks. For more details:
				2435	*
				2436	* http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
				2437	*
				2438	* Resolve the situation by always acquiring cpus_read_lock() before optionally
				2439	* write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
				2440	* CPU hotplug is disabled on entry.
				2441	*/
				2442	static void cgroup_attach_lock(bool lock_threadgroup)
				2443	{
				2444	cpus_read_lock();
				2445	if (lock_threadgroup)
				2446	percpu_down_write(&cgroup_threadgroup_rwsem);
				2447	}
				2448
				2449	/**
				2450	* cgroup_attach_unlock - Undo cgroup_attach_lock()
				2451	* @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
				2452	*/
				2453	static void cgroup_attach_unlock(bool lock_threadgroup)
				2454	{
				2455	if (lock_threadgroup)
				2456	percpu_up_write(&cgroup_threadgroup_rwsem);
				2457	cpus_read_unlock();
				2458	}
				2459
				2460	/**
				2461	* cgroup_migrate_add_task - add a migration target task to a migration context
				2462	* @task: target task
				2463	* @mgctx: target migration context
				2464	*
				2465	* Add @task, which is a migration target, to @mgctx->tset. This function
				2466	* becomes noop if @task doesn't need to be migrated. @task's css_set
				2467	* should have been added as a migration source and @task->cg_list will be
				2468	* moved from the css_set's tasks list to mg_tasks one.
				2469	*/
				2470	static void cgroup_migrate_add_task(struct task_struct *task,
				2471	struct cgroup_mgctx *mgctx)
				2472	{
				2473	struct css_set *cset;
				2474
				2475	lockdep_assert_held(&css_set_lock);
				2476
				2477	/* @task either already exited or can't exit until the end */
				2478	if (task->flags & PF_EXITING)
				2479	return;
				2480
				2481	/* leave @task alone if post_fork() hasn't linked it yet */
				2482	if (list_empty(&task->cg_list))
				2483	return;
				2484
				2485	cset = task_css_set(task);
				2486	if (!cset->mg_src_cgrp)
				2487	return;
				2488
				2489	mgctx->tset.nr_tasks++;
				2490
				2491	list_move_tail(&task->cg_list, &cset->mg_tasks);
				2492	if (list_empty(&cset->mg_node))
				2493	list_add_tail(&cset->mg_node,
				2494	&mgctx->tset.src_csets);
				2495	if (list_empty(&cset->mg_dst_cset->mg_node))
				2496	list_add_tail(&cset->mg_dst_cset->mg_node,
				2497	&mgctx->tset.dst_csets);
				2498	}
				2499
				2500	/**
				2501	* cgroup_taskset_first - reset taskset and return the first task
				2502	* @tset: taskset of interest
				2503	* @dst_cssp: output variable for the destination css
				2504	*
				2505	* @tset iteration is initialized and the first task is returned.
				2506	*/
				2507	struct task_struct cgroup_taskset_first(struct cgroup_taskset tset,
				2508	struct cgroup_subsys_state **dst_cssp)
				2509	{
				2510	tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
				2511	tset->cur_task = NULL;
				2512
				2513	return cgroup_taskset_next(tset, dst_cssp);
				2514	}
				2515
				2516	/**
				2517	* cgroup_taskset_next - iterate to the next task in taskset
				2518	* @tset: taskset of interest
				2519	* @dst_cssp: output variable for the destination css
				2520	*
				2521	* Return the next task in @tset. Iteration must have been initialized
				2522	* with cgroup_taskset_first().
				2523	*/
				2524	struct task_struct cgroup_taskset_next(struct cgroup_taskset tset,
				2525	struct cgroup_subsys_state **dst_cssp)
				2526	{
				2527	struct css_set *cset = tset->cur_cset;
				2528	struct task_struct *task = tset->cur_task;
				2529
				2530	while (&cset->mg_node != tset->csets) {
				2531	if (!task)
				2532	task = list_first_entry(&cset->mg_tasks,
				2533	struct task_struct, cg_list);
				2534	else
				2535	task = list_next_entry(task, cg_list);
				2536
				2537	if (&task->cg_list != &cset->mg_tasks) {
				2538	tset->cur_cset = cset;
				2539	tset->cur_task = task;
				2540
				2541	/*
				2542	* This function may be called both before and
				2543	* after cgroup_taskset_migrate(). The two cases
				2544	* can be distinguished by looking at whether @cset
				2545	* has its ->mg_dst_cset set.
				2546	*/
				2547	if (cset->mg_dst_cset)
				2548	*dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
				2549	else
				2550	*dst_cssp = cset->subsys[tset->ssid];
				2551
				2552	return task;
				2553	}
				2554
				2555	cset = list_next_entry(cset, mg_node);
				2556	task = NULL;
				2557	}
				2558
				2559	return NULL;
				2560	}
				2561
				2562	/**
				2563	* cgroup_taskset_migrate - migrate a taskset
				2564	* @mgctx: migration context
				2565	*
				2566	* Migrate tasks in @mgctx as setup by migration preparation functions.
				2567	* This function fails iff one of the ->can_attach callbacks fails and
				2568	* guarantees that either all or none of the tasks in @mgctx are migrated.
				2569	* @mgctx is consumed regardless of success.
				2570	*/
				2571	static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
				2572	{
				2573	struct cgroup_taskset *tset = &mgctx->tset;
				2574	struct cgroup_subsys *ss;
				2575	struct task_struct task, tmp_task;
				2576	struct css_set cset, tmp_cset;
				2577	int ssid, failed_ssid, ret;
				2578
				2579	/* check that we can legitimately attach to the cgroup */
				2580	if (tset->nr_tasks) {
				2581	do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
				2582	if (ss->can_attach) {
				2583	tset->ssid = ssid;
				2584	ret = ss->can_attach(tset);
				2585	if (ret) {
				2586	failed_ssid = ssid;
				2587	goto out_cancel_attach;
				2588	}
				2589	}
				2590	} while_each_subsys_mask();
				2591	}
				2592
				2593	/*
				2594	* Now that we're guaranteed success, proceed to move all tasks to
				2595	* the new cgroup. There are no failure cases after here, so this
				2596	* is the commit point.
				2597	*/
				2598	spin_lock_irq(&css_set_lock);
				2599	list_for_each_entry(cset, &tset->src_csets, mg_node) {
				2600	list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
				2601	struct css_set *from_cset = task_css_set(task);
				2602	struct css_set *to_cset = cset->mg_dst_cset;
				2603
				2604	get_css_set(to_cset);
				2605	to_cset->nr_tasks++;
				2606	css_set_move_task(task, from_cset, to_cset, true);
				2607	from_cset->nr_tasks--;
				2608	/*
				2609	* If the source or destination cgroup is frozen,
				2610	* the task might require to change its state.
				2611	*/
				2612	cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp,
				2613	to_cset->dfl_cgrp);
				2614	put_css_set_locked(from_cset);
				2615
				2616	}
				2617	}
				2618	spin_unlock_irq(&css_set_lock);
				2619
				2620	/*
				2621	* Migration is committed, all target tasks are now on dst_csets.
				2622	* Nothing is sensitive to fork() after this point. Notify
				2623	* controllers that migration is complete.
				2624	*/
				2625	tset->csets = &tset->dst_csets;
				2626
				2627	if (tset->nr_tasks) {
				2628	do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
				2629	if (ss->attach) {
				2630	tset->ssid = ssid;
				2631	ss->attach(tset);
				2632	}
				2633	} while_each_subsys_mask();
				2634	}
				2635
				2636	ret = 0;
				2637	goto out_release_tset;
				2638
				2639	out_cancel_attach:
				2640	if (tset->nr_tasks) {
				2641	do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
				2642	if (ssid == failed_ssid)
				2643	break;
				2644	if (ss->cancel_attach) {
				2645	tset->ssid = ssid;
				2646	ss->cancel_attach(tset);
				2647	}
				2648	} while_each_subsys_mask();
				2649	}
				2650	out_release_tset:
				2651	spin_lock_irq(&css_set_lock);
				2652	list_splice_init(&tset->dst_csets, &tset->src_csets);
				2653	list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
				2654	list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
				2655	list_del_init(&cset->mg_node);
				2656	}
				2657	spin_unlock_irq(&css_set_lock);
				2658
				2659	/*
				2660	* Re-initialize the cgroup_taskset structure in case it is reused
				2661	* again in another cgroup_migrate_add_task()/cgroup_migrate_execute()
				2662	* iteration.
				2663	*/
				2664	tset->nr_tasks = 0;
				2665	tset->csets = &tset->src_csets;
				2666	return ret;
				2667	}
				2668
				2669	/**
				2670	* cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
				2671	* @dst_cgrp: destination cgroup to test
				2672	*
				2673	* On the default hierarchy, except for the mixable, (possible) thread root
				2674	* and threaded cgroups, subtree_control must be zero for migration
				2675	* destination cgroups with tasks so that child cgroups don't compete
				2676	* against tasks.
				2677	*/
				2678	int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
				2679	{
				2680	/* v1 doesn't have any restriction */
				2681	if (!cgroup_on_dfl(dst_cgrp))
				2682	return 0;
				2683
				2684	/* verify @dst_cgrp can host resources */
				2685	if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
				2686	return -EOPNOTSUPP;
				2687
				2688	/* mixables don't care */
				2689	if (cgroup_is_mixable(dst_cgrp))
				2690	return 0;
				2691
				2692	/*
				2693	* If @dst_cgrp is already or can become a thread root or is
				2694	* threaded, it doesn't matter.
				2695	*/
				2696	if (cgroup_can_be_thread_root(dst_cgrp) \|\| cgroup_is_threaded(dst_cgrp))
				2697	return 0;
				2698
				2699	/* apply no-internal-process constraint */
				2700	if (dst_cgrp->subtree_control)
				2701	return -EBUSY;
				2702
				2703	return 0;
				2704	}
				2705
				2706	/**
				2707	* cgroup_migrate_finish - cleanup after attach
				2708	* @mgctx: migration context
				2709	*
				2710	* Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst(). See
				2711	* those functions for details.
				2712	*/
				2713	void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
				2714	{
				2715	struct css_set cset, tmp_cset;
				2716
				2717	lockdep_assert_held(&cgroup_mutex);
				2718
				2719	spin_lock_irq(&css_set_lock);
				2720
				2721	list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets,
				2722	mg_src_preload_node) {
				2723	cset->mg_src_cgrp = NULL;
				2724	cset->mg_dst_cgrp = NULL;
				2725	cset->mg_dst_cset = NULL;
				2726	list_del_init(&cset->mg_src_preload_node);
				2727	put_css_set_locked(cset);
				2728	}
				2729
				2730	list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets,
				2731	mg_dst_preload_node) {
				2732	cset->mg_src_cgrp = NULL;
				2733	cset->mg_dst_cgrp = NULL;
				2734	cset->mg_dst_cset = NULL;
				2735	list_del_init(&cset->mg_dst_preload_node);
				2736	put_css_set_locked(cset);
				2737	}
				2738
				2739	spin_unlock_irq(&css_set_lock);
				2740	}
				2741
				2742	/**
				2743	* cgroup_migrate_add_src - add a migration source css_set
				2744	* @src_cset: the source css_set to add
				2745	* @dst_cgrp: the destination cgroup
				2746	* @mgctx: migration context
				2747	*
				2748	* Tasks belonging to @src_cset are about to be migrated to @dst_cgrp. Pin
				2749	* @src_cset and add it to @mgctx->src_csets, which should later be cleaned
				2750	* up by cgroup_migrate_finish().
				2751	*
				2752	* This function may be called without holding cgroup_threadgroup_rwsem
				2753	* even if the target is a process. Threads may be created and destroyed
				2754	* but as long as cgroup_mutex is not dropped, no new css_set can be put
				2755	* into play and the preloaded css_sets are guaranteed to cover all
				2756	* migrations.
				2757	*/
				2758	void cgroup_migrate_add_src(struct css_set *src_cset,
				2759	struct cgroup *dst_cgrp,
				2760	struct cgroup_mgctx *mgctx)
				2761	{
				2762	struct cgroup *src_cgrp;
				2763
				2764	lockdep_assert_held(&cgroup_mutex);
				2765	lockdep_assert_held(&css_set_lock);
				2766
				2767	/*
				2768	* If ->dead, @src_set is associated with one or more dead cgroups
				2769	* and doesn't contain any migratable tasks. Ignore it early so
				2770	* that the rest of migration path doesn't get confused by it.
				2771	*/
				2772	if (src_cset->dead)
				2773	return;
				2774
				2775	src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
				2776
				2777	if (!list_empty(&src_cset->mg_src_preload_node))
				2778	return;
				2779
				2780	WARN_ON(src_cset->mg_src_cgrp);
				2781	WARN_ON(src_cset->mg_dst_cgrp);
				2782	WARN_ON(!list_empty(&src_cset->mg_tasks));
				2783	WARN_ON(!list_empty(&src_cset->mg_node));
				2784
				2785	src_cset->mg_src_cgrp = src_cgrp;
				2786	src_cset->mg_dst_cgrp = dst_cgrp;
				2787	get_css_set(src_cset);
				2788	list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets);
				2789	}
				2790
				2791	/**
				2792	* cgroup_migrate_prepare_dst - prepare destination css_sets for migration
				2793	* @mgctx: migration context
				2794	*
				2795	* Tasks are about to be moved and all the source css_sets have been
				2796	* preloaded to @mgctx->preloaded_src_csets. This function looks up and
				2797	* pins all destination css_sets, links each to its source, and append them
				2798	* to @mgctx->preloaded_dst_csets.
				2799	*
				2800	* This function must be called after cgroup_migrate_add_src() has been
				2801	* called on each migration source css_set. After migration is performed
				2802	* using cgroup_migrate(), cgroup_migrate_finish() must be called on
				2803	* @mgctx.
				2804	*/
				2805	int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
				2806	{
				2807	struct css_set src_cset, tmp_cset;
				2808
				2809	lockdep_assert_held(&cgroup_mutex);
				2810
				2811	/* look up the dst cset for each src cset and link it to src */
				2812	list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
				2813	mg_src_preload_node) {
				2814	struct css_set *dst_cset;
				2815	struct cgroup_subsys *ss;
				2816	int ssid;
				2817
				2818	dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
				2819	if (!dst_cset)
				2820	return -ENOMEM;
				2821
				2822	WARN_ON_ONCE(src_cset->mg_dst_cset \|\| dst_cset->mg_dst_cset);
				2823
				2824	/*
				2825	* If src cset equals dst, it's noop. Drop the src.
				2826	* cgroup_migrate() will skip the cset too. Note that we
				2827	* can't handle src == dst as some nodes are used by both.
				2828	*/
				2829	if (src_cset == dst_cset) {
				2830	src_cset->mg_src_cgrp = NULL;
				2831	src_cset->mg_dst_cgrp = NULL;
				2832	list_del_init(&src_cset->mg_src_preload_node);
				2833	put_css_set(src_cset);
				2834	put_css_set(dst_cset);
				2835	continue;
				2836	}
				2837
				2838	src_cset->mg_dst_cset = dst_cset;
				2839
				2840	if (list_empty(&dst_cset->mg_dst_preload_node))
				2841	list_add_tail(&dst_cset->mg_dst_preload_node,
				2842	&mgctx->preloaded_dst_csets);
				2843	else
				2844	put_css_set(dst_cset);
				2845
				2846	for_each_subsys(ss, ssid)
				2847	if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
				2848	mgctx->ss_mask \|= 1 << ssid;
				2849	}
				2850
				2851	return 0;
				2852	}
				2853
				2854	/**
				2855	* cgroup_migrate - migrate a process or task to a cgroup
				2856	* @leader: the leader of the process or the task to migrate
				2857	* @threadgroup: whether @leader points to the whole process or a single task
				2858	* @mgctx: migration context
				2859	*
				2860	* Migrate a process or task denoted by @leader. If migrating a process,
				2861	* the caller must be holding cgroup_threadgroup_rwsem. The caller is also
				2862	* responsible for invoking cgroup_migrate_add_src() and
				2863	* cgroup_migrate_prepare_dst() on the targets before invoking this
				2864	* function and following up with cgroup_migrate_finish().
				2865	*
				2866	* As long as a controller's ->can_attach() doesn't fail, this function is
				2867	* guaranteed to succeed. This means that, excluding ->can_attach()
				2868	* failure, when migrating multiple targets, the success or failure can be
				2869	* decided for all targets by invoking group_migrate_prepare_dst() before
				2870	* actually starting migrating.
				2871	*/
				2872	int cgroup_migrate(struct task_struct *leader, bool threadgroup,
				2873	struct cgroup_mgctx *mgctx)
				2874	{
				2875	struct task_struct *task;
				2876
				2877	/*
				2878	* Prevent freeing of tasks while we take a snapshot. Tasks that are
				2879	* already PF_EXITING could be freed from underneath us unless we
				2880	* take an rcu_read_lock.
				2881	*/
				2882	spin_lock_irq(&css_set_lock);
				2883	rcu_read_lock();
				2884	task = leader;
				2885	do {
				2886	cgroup_migrate_add_task(task, mgctx);
				2887	if (!threadgroup)
				2888	break;
				2889	} while_each_thread(leader, task);
				2890	rcu_read_unlock();
				2891	spin_unlock_irq(&css_set_lock);
				2892
				2893	return cgroup_migrate_execute(mgctx);
				2894	}
				2895
				2896	/**
				2897	* cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
				2898	* @dst_cgrp: the cgroup to attach to
				2899	* @leader: the task or the leader of the threadgroup to be attached
				2900	* @threadgroup: attach the whole threadgroup?
				2901	*
				2902	* Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
				2903	*/
				2904	int cgroup_attach_task(struct cgroup dst_cgrp, struct task_struct leader,
				2905	bool threadgroup)
				2906	{
				2907	DEFINE_CGROUP_MGCTX(mgctx);
				2908	struct task_struct *task;
				2909	int ret;
				2910
				2911	ret = cgroup_migrate_vet_dst(dst_cgrp);
				2912	if (ret)
				2913	return ret;
				2914
				2915	/* look up all src csets */
				2916	spin_lock_irq(&css_set_lock);
				2917	rcu_read_lock();
				2918	task = leader;
				2919	do {
				2920	cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
				2921	if (!threadgroup)
				2922	break;
				2923	} while_each_thread(leader, task);
				2924	rcu_read_unlock();
				2925	spin_unlock_irq(&css_set_lock);
				2926
				2927	/* prepare dst csets and commit */
				2928	ret = cgroup_migrate_prepare_dst(&mgctx);
				2929	if (!ret)
				2930	ret = cgroup_migrate(leader, threadgroup, &mgctx);
				2931
				2932	cgroup_migrate_finish(&mgctx);
				2933
				2934	if (!ret)
				2935	TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup);
				2936
				2937	return ret;
				2938	}
				2939
				2940	struct task_struct cgroup_procs_write_start(char buf, bool threadgroup,
				2941	bool *threadgroup_locked)
				2942	{
				2943	struct task_struct *tsk;
				2944	pid_t pid;
				2945
				2946	if (kstrtoint(strstrip(buf), 0, &pid) \|\| pid < 0)
				2947	return ERR_PTR(-EINVAL);
				2948
				2949	/*
				2950	* If we migrate a single thread, we don't care about threadgroup
				2951	* stability. If the thread is `current`, it won't exit(2) under our
				2952	* hands or change PID through exec(2). We exclude
				2953	* cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
				2954	* callers by cgroup_mutex.
				2955	* Therefore, we can skip the global lock.
				2956	*/
				2957	lockdep_assert_held(&cgroup_mutex);
				2958	*threadgroup_locked = pid \|\| threadgroup;
				2959	cgroup_attach_lock(*threadgroup_locked);
				2960
				2961	rcu_read_lock();
				2962	if (pid) {
				2963	tsk = find_task_by_vpid(pid);
				2964	if (!tsk) {
				2965	tsk = ERR_PTR(-ESRCH);
				2966	goto out_unlock_threadgroup;
				2967	}
				2968	} else {
				2969	tsk = current;
				2970	}
				2971
				2972	if (threadgroup)
				2973	tsk = tsk->group_leader;
				2974
				2975	/*
				2976	* kthreads may acquire PF_NO_SETAFFINITY during initialization.
				2977	* If userland migrates such a kthread to a non-root cgroup, it can
				2978	* become trapped in a cpuset, or RT kthread may be born in a
				2979	* cgroup with no rt_runtime allocated. Just say no.
				2980	*/
				2981	if (tsk->no_cgroup_migration \|\| (tsk->flags & PF_NO_SETAFFINITY)) {
				2982	tsk = ERR_PTR(-EINVAL);
				2983	goto out_unlock_threadgroup;
				2984	}
				2985
				2986	get_task_struct(tsk);
				2987	goto out_unlock_rcu;
				2988
				2989	out_unlock_threadgroup:
				2990	cgroup_attach_unlock(*threadgroup_locked);
				2991	*threadgroup_locked = false;
				2992	out_unlock_rcu:
				2993	rcu_read_unlock();
				2994	return tsk;
				2995	}
				2996
				2997	void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
				2998	{
				2999	struct cgroup_subsys *ss;
				3000	int ssid;
				3001
				3002	/* release reference from cgroup_procs_write_start() */
				3003	put_task_struct(task);
				3004
				3005	cgroup_attach_unlock(threadgroup_locked);
				3006
				3007	for_each_subsys(ss, ssid)
				3008	if (ss->post_attach)
				3009	ss->post_attach();
				3010	}
				3011
				3012	static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
				3013	{
				3014	struct cgroup_subsys *ss;
				3015	bool printed = false;
				3016	int ssid;
				3017
				3018	do_each_subsys_mask(ss, ssid, ss_mask) {
				3019	if (printed)
				3020	seq_putc(seq, ' ');
				3021	seq_puts(seq, ss->name);
				3022	printed = true;
				3023	} while_each_subsys_mask();
				3024	if (printed)
				3025	seq_putc(seq, '\n');
				3026	}
				3027
				3028	/* show controllers which are enabled from the parent */
				3029	static int cgroup_controllers_show(struct seq_file seq, void v)
				3030	{
				3031	struct cgroup *cgrp = seq_css(seq)->cgroup;
				3032
				3033	cgroup_print_ss_mask(seq, cgroup_control(cgrp));
				3034	return 0;
				3035	}
				3036
				3037	/* show controllers which are enabled for a given cgroup's children */
				3038	static int cgroup_subtree_control_show(struct seq_file seq, void v)
				3039	{
				3040	struct cgroup *cgrp = seq_css(seq)->cgroup;
				3041
				3042	cgroup_print_ss_mask(seq, cgrp->subtree_control);
				3043	return 0;
				3044	}
				3045
				3046	/**
				3047	* cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
				3048	* @cgrp: root of the subtree to update csses for
				3049	*
				3050	* @cgrp's control masks have changed and its subtree's css associations
				3051	* need to be updated accordingly. This function looks up all css_sets
				3052	* which are attached to the subtree, creates the matching updated css_sets
				3053	* and migrates the tasks to the new ones.
				3054	*/
				3055	static int cgroup_update_dfl_csses(struct cgroup *cgrp)
				3056	{
				3057	DEFINE_CGROUP_MGCTX(mgctx);
				3058	struct cgroup_subsys_state *d_css;
				3059	struct cgroup *dsct;
				3060	struct css_set *src_cset;
				3061	bool has_tasks;
				3062	int ret;
				3063
				3064	lockdep_assert_held(&cgroup_mutex);
				3065
				3066	/* look up all csses currently attached to @cgrp's subtree */
				3067	spin_lock_irq(&css_set_lock);
				3068	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
				3069	struct cgrp_cset_link *link;
				3070
				3071	list_for_each_entry(link, &dsct->cset_links, cset_link)
				3072	cgroup_migrate_add_src(link->cset, dsct, &mgctx);
				3073	}
				3074	spin_unlock_irq(&css_set_lock);
				3075
				3076	/*
				3077	* We need to write-lock threadgroup_rwsem while migrating tasks.
				3078	* However, if there are no source csets for @cgrp, changing its
				3079	* controllers isn't gonna produce any task migrations and the
				3080	* write-locking can be skipped safely.
				3081	*/
				3082	has_tasks = !list_empty(&mgctx.preloaded_src_csets);
				3083	cgroup_attach_lock(has_tasks);
				3084
				3085	/* NULL dst indicates self on default hierarchy */
				3086	ret = cgroup_migrate_prepare_dst(&mgctx);
				3087	if (ret)
				3088	goto out_finish;
				3089
				3090	spin_lock_irq(&css_set_lock);
				3091	list_for_each_entry(src_cset, &mgctx.preloaded_src_csets,
				3092	mg_src_preload_node) {
				3093	struct task_struct task, ntask;
				3094
				3095	/* all tasks in src_csets need to be migrated */
				3096	list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
				3097	cgroup_migrate_add_task(task, &mgctx);
				3098	}
				3099	spin_unlock_irq(&css_set_lock);
				3100
				3101	ret = cgroup_migrate_execute(&mgctx);
				3102	out_finish:
				3103	cgroup_migrate_finish(&mgctx);
				3104	cgroup_attach_unlock(has_tasks);
				3105	return ret;
				3106	}
				3107
				3108	/**
				3109	* cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses
				3110	* @cgrp: root of the target subtree
				3111	*
				3112	* Because css offlining is asynchronous, userland may try to re-enable a
				3113	* controller while the previous css is still around. This function grabs
				3114	* cgroup_mutex and drains the previous css instances of @cgrp's subtree.
				3115	*/
				3116	void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
				3117	__acquires(&cgroup_mutex)
				3118	{
				3119	struct cgroup *dsct;
				3120	struct cgroup_subsys_state *d_css;
				3121	struct cgroup_subsys *ss;
				3122	int ssid;
				3123
				3124	restart:
				3125	mutex_lock(&cgroup_mutex);
				3126
				3127	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
				3128	for_each_subsys(ss, ssid) {
				3129	struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
				3130	DEFINE_WAIT(wait);
				3131
				3132	if (!css \|\| !percpu_ref_is_dying(&css->refcnt))
				3133	continue;
				3134
				3135	cgroup_get_live(dsct);
				3136	prepare_to_wait(&dsct->offline_waitq, &wait,
				3137	TASK_UNINTERRUPTIBLE);
				3138
				3139	mutex_unlock(&cgroup_mutex);
				3140	schedule();
				3141	finish_wait(&dsct->offline_waitq, &wait);
				3142
				3143	cgroup_put(dsct);
				3144	goto restart;
				3145	}
				3146	}
				3147	}
				3148
				3149	/**
				3150	* cgroup_save_control - save control masks and dom_cgrp of a subtree
				3151	* @cgrp: root of the target subtree
				3152	*
				3153	* Save ->subtree_control, ->subtree_ss_mask and ->dom_cgrp to the
				3154	* respective old_ prefixed fields for @cgrp's subtree including @cgrp
				3155	* itself.
				3156	*/
				3157	static void cgroup_save_control(struct cgroup *cgrp)
				3158	{
				3159	struct cgroup *dsct;
				3160	struct cgroup_subsys_state *d_css;
				3161
				3162	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
				3163	dsct->old_subtree_control = dsct->subtree_control;
				3164	dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
				3165	dsct->old_dom_cgrp = dsct->dom_cgrp;
				3166	}
				3167	}
				3168
				3169	/**
				3170	* cgroup_propagate_control - refresh control masks of a subtree
				3171	* @cgrp: root of the target subtree
				3172	*
				3173	* For @cgrp and its subtree, ensure ->subtree_ss_mask matches
				3174	* ->subtree_control and propagate controller availability through the
				3175	* subtree so that descendants don't have unavailable controllers enabled.
				3176	*/
				3177	static void cgroup_propagate_control(struct cgroup *cgrp)
				3178	{
				3179	struct cgroup *dsct;
				3180	struct cgroup_subsys_state *d_css;
				3181
				3182	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
				3183	dsct->subtree_control &= cgroup_control(dsct);
				3184	dsct->subtree_ss_mask =
				3185	cgroup_calc_subtree_ss_mask(dsct->subtree_control,
				3186	cgroup_ss_mask(dsct));
				3187	}
				3188	}
				3189
				3190	/**
				3191	* cgroup_restore_control - restore control masks and dom_cgrp of a subtree
				3192	* @cgrp: root of the target subtree
				3193	*
				3194	* Restore ->subtree_control, ->subtree_ss_mask and ->dom_cgrp from the
				3195	* respective old_ prefixed fields for @cgrp's subtree including @cgrp
				3196	* itself.
				3197	*/
				3198	static void cgroup_restore_control(struct cgroup *cgrp)
				3199	{
				3200	struct cgroup *dsct;
				3201	struct cgroup_subsys_state *d_css;
				3202
				3203	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
				3204	dsct->subtree_control = dsct->old_subtree_control;
				3205	dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
				3206	dsct->dom_cgrp = dsct->old_dom_cgrp;
				3207	}
				3208	}
				3209
				3210	static bool css_visible(struct cgroup_subsys_state *css)
				3211	{
				3212	struct cgroup_subsys *ss = css->ss;
				3213	struct cgroup *cgrp = css->cgroup;
				3214
				3215	if (cgroup_control(cgrp) & (1 << ss->id))
				3216	return true;
				3217	if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))
				3218	return false;
				3219	return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
				3220	}
				3221
				3222	/**
				3223	* cgroup_apply_control_enable - enable or show csses according to control
				3224	* @cgrp: root of the target subtree
				3225	*
				3226	* Walk @cgrp's subtree and create new csses or make the existing ones
				3227	* visible. A css is created invisible if it's being implicitly enabled
				3228	* through dependency. An invisible css is made visible when the userland
				3229	* explicitly enables it.
				3230	*
				3231	* Returns 0 on success, -errno on failure. On failure, csses which have
				3232	* been processed already aren't cleaned up. The caller is responsible for
				3233	* cleaning up with cgroup_apply_control_disable().
				3234	*/
				3235	static int cgroup_apply_control_enable(struct cgroup *cgrp)
				3236	{
				3237	struct cgroup *dsct;
				3238	struct cgroup_subsys_state *d_css;
				3239	struct cgroup_subsys *ss;
				3240	int ssid, ret;
				3241
				3242	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
				3243	for_each_subsys(ss, ssid) {
				3244	struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
				3245
				3246	if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
				3247	continue;
				3248
				3249	if (!css) {
				3250	css = css_create(dsct, ss);
				3251	if (IS_ERR(css))
				3252	return PTR_ERR(css);
				3253	}
				3254
				3255	WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
				3256
				3257	if (css_visible(css)) {
				3258	ret = css_populate_dir(css);
				3259	if (ret)
				3260	return ret;
				3261	}
				3262	}
				3263	}
				3264
				3265	return 0;
				3266	}
				3267
				3268	/**
				3269	* cgroup_apply_control_disable - kill or hide csses according to control
				3270	* @cgrp: root of the target subtree
				3271	*
				3272	* Walk @cgrp's subtree and kill and hide csses so that they match
				3273	* cgroup_ss_mask() and cgroup_visible_mask().
				3274	*
				3275	* A css is hidden when the userland requests it to be disabled while other
				3276	* subsystems are still depending on it. The css must not actively control
				3277	* resources and be in the vanilla state if it's made visible again later.
				3278	* Controllers which may be depended upon should provide ->css_reset() for
				3279	* this purpose.
				3280	*/
				3281	static void cgroup_apply_control_disable(struct cgroup *cgrp)
				3282	{
				3283	struct cgroup *dsct;
				3284	struct cgroup_subsys_state *d_css;
				3285	struct cgroup_subsys *ss;
				3286	int ssid;
				3287
				3288	cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
				3289	for_each_subsys(ss, ssid) {
				3290	struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
				3291
				3292	if (!css)
				3293	continue;
				3294
				3295	WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
				3296
				3297	if (css->parent &&
				3298	!(cgroup_ss_mask(dsct) & (1 << ss->id))) {
				3299	kill_css(css);
				3300	} else if (!css_visible(css)) {
				3301	css_clear_dir(css);
				3302	if (ss->css_reset)
				3303	ss->css_reset(css);
				3304	}
				3305	}
				3306	}
				3307	}
				3308
				3309	/**
				3310	* cgroup_apply_control - apply control mask updates to the subtree
				3311	* @cgrp: root of the target subtree
				3312	*
				3313	* subsystems can be enabled and disabled in a subtree using the following
				3314	* steps.
				3315	*
				3316	* 1. Call cgroup_save_control() to stash the current state.
				3317	* 2. Update ->subtree_control masks in the subtree as desired.
				3318	* 3. Call cgroup_apply_control() to apply the changes.
				3319	* 4. Optionally perform other related operations.
				3320	* 5. Call cgroup_finalize_control() to finish up.
				3321	*
				3322	* This function implements step 3 and propagates the mask changes
				3323	* throughout @cgrp's subtree, updates csses accordingly and perform
				3324	* process migrations.
				3325	*/
				3326	static int cgroup_apply_control(struct cgroup *cgrp)
				3327	{
				3328	int ret;
				3329
				3330	cgroup_propagate_control(cgrp);
				3331
				3332	ret = cgroup_apply_control_enable(cgrp);
				3333	if (ret)
				3334	return ret;
				3335
				3336	/*
				3337	* At this point, cgroup_e_css_by_mask() results reflect the new csses
				3338	* making the following cgroup_update_dfl_csses() properly update
				3339	* css associations of all tasks in the subtree.
				3340	*/
				3341	ret = cgroup_update_dfl_csses(cgrp);
				3342	if (ret)
				3343	return ret;
				3344
				3345	return 0;
				3346	}
				3347
				3348	/**
				3349	* cgroup_finalize_control - finalize control mask update
				3350	* @cgrp: root of the target subtree
				3351	* @ret: the result of the update
				3352	*
				3353	* Finalize control mask update. See cgroup_apply_control() for more info.
				3354	*/
				3355	static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
				3356	{
				3357	if (ret) {
				3358	cgroup_restore_control(cgrp);
				3359	cgroup_propagate_control(cgrp);
				3360	}
				3361
				3362	cgroup_apply_control_disable(cgrp);
				3363	}
				3364
				3365	static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
				3366	{
				3367	u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
				3368
				3369	/* if nothing is getting enabled, nothing to worry about */
				3370	if (!enable)
				3371	return 0;
				3372
				3373	/* can @cgrp host any resources? */
				3374	if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
				3375	return -EOPNOTSUPP;
				3376
				3377	/* mixables don't care */
				3378	if (cgroup_is_mixable(cgrp))
				3379	return 0;
				3380
				3381	if (domain_enable) {
				3382	/* can't enable domain controllers inside a thread subtree */
				3383	if (cgroup_is_thread_root(cgrp) \|\| cgroup_is_threaded(cgrp))
				3384	return -EOPNOTSUPP;
				3385	} else {
				3386	/*
				3387	* Threaded controllers can handle internal competitions
				3388	* and are always allowed inside a (prospective) thread
				3389	* subtree.
				3390	*/
				3391	if (cgroup_can_be_thread_root(cgrp) \|\| cgroup_is_threaded(cgrp))
				3392	return 0;
				3393	}
				3394
				3395	/*
				3396	* Controllers can't be enabled for a cgroup with tasks to avoid
				3397	* child cgroups competing against tasks.
				3398	*/
				3399	if (cgroup_has_tasks(cgrp))
				3400	return -EBUSY;
				3401
				3402	return 0;
				3403	}
				3404
				3405	/* change the enabled child controllers for a cgroup in the default hierarchy */
				3406	static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
				3407	char *buf, size_t nbytes,
				3408	loff_t off)
				3409	{
				3410	u16 enable = 0, disable = 0;
				3411	struct cgroup cgrp, child;
				3412	struct cgroup_subsys *ss;
				3413	char *tok;
				3414	int ssid, ret;
				3415
				3416	/*
				3417	* Parse input - space separated list of subsystem names prefixed
				3418	* with either + or -.
				3419	*/
				3420	buf = strstrip(buf);
				3421	while ((tok = strsep(&buf, " "))) {
				3422	if (tok[0] == '\0')
				3423	continue;
				3424	do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
				3425	if (!cgroup_ssid_enabled(ssid) \|\|
				3426	strcmp(tok + 1, ss->name))
				3427	continue;
				3428
				3429	if (*tok == '+') {
				3430	enable \|= 1 << ssid;
				3431	disable &= ~(1 << ssid);
				3432	} else if (*tok == '-') {
				3433	disable \|= 1 << ssid;
				3434	enable &= ~(1 << ssid);
				3435	} else {
				3436	return -EINVAL;
				3437	}
				3438	break;
				3439	} while_each_subsys_mask();
				3440	if (ssid == CGROUP_SUBSYS_COUNT)
				3441	return -EINVAL;
				3442	}
				3443
				3444	cgrp = cgroup_kn_lock_live(of->kn, true);
				3445	if (!cgrp)
				3446	return -ENODEV;
				3447
				3448	for_each_subsys(ss, ssid) {
				3449	if (enable & (1 << ssid)) {
				3450	if (cgrp->subtree_control & (1 << ssid)) {
				3451	enable &= ~(1 << ssid);
				3452	continue;
				3453	}
				3454
				3455	if (!(cgroup_control(cgrp) & (1 << ssid))) {
				3456	ret = -ENOENT;
				3457	goto out_unlock;
				3458	}
				3459	} else if (disable & (1 << ssid)) {
				3460	if (!(cgrp->subtree_control & (1 << ssid))) {
				3461	disable &= ~(1 << ssid);
				3462	continue;
				3463	}
				3464
				3465	/* a child has it enabled? */
				3466	cgroup_for_each_live_child(child, cgrp) {
				3467	if (child->subtree_control & (1 << ssid)) {
				3468	ret = -EBUSY;
				3469	goto out_unlock;
				3470	}
				3471	}
				3472	}
				3473	}
				3474
				3475	if (!enable && !disable) {
				3476	ret = 0;
				3477	goto out_unlock;
				3478	}
				3479
				3480	ret = cgroup_vet_subtree_control_enable(cgrp, enable);
				3481	if (ret)
				3482	goto out_unlock;
				3483
				3484	/* save and update control masks and prepare csses */
				3485	cgroup_save_control(cgrp);
				3486
				3487	cgrp->subtree_control \|= enable;
				3488	cgrp->subtree_control &= ~disable;
				3489
				3490	ret = cgroup_apply_control(cgrp);
				3491	cgroup_finalize_control(cgrp, ret);
				3492	if (ret)
				3493	goto out_unlock;
				3494
				3495	kernfs_activate(cgrp->kn);
				3496	out_unlock:
				3497	cgroup_kn_unlock(of->kn);
				3498	return ret ?: nbytes;
				3499	}
				3500
				3501	/**
				3502	* cgroup_enable_threaded - make @cgrp threaded
				3503	* @cgrp: the target cgroup
				3504	*
				3505	* Called when "threaded" is written to the cgroup.type interface file and
				3506	* tries to make @cgrp threaded and join the parent's resource domain.
				3507	* This function is never called on the root cgroup as cgroup.type doesn't
				3508	* exist on it.
				3509	*/
				3510	static int cgroup_enable_threaded(struct cgroup *cgrp)
				3511	{
				3512	struct cgroup *parent = cgroup_parent(cgrp);
				3513	struct cgroup *dom_cgrp = parent->dom_cgrp;
				3514	struct cgroup *dsct;
				3515	struct cgroup_subsys_state *d_css;
				3516	int ret;
				3517
				3518	lockdep_assert_held(&cgroup_mutex);
				3519
				3520	/* noop if already threaded */
				3521	if (cgroup_is_threaded(cgrp))
				3522	return 0;
				3523
				3524	/*
				3525	* If @cgroup is populated or has domain controllers enabled, it
				3526	* can't be switched. While the below cgroup_can_be_thread_root()
				3527	* test can catch the same conditions, that's only when @parent is
				3528	* not mixable, so let's check it explicitly.
				3529	*/
				3530	if (cgroup_is_populated(cgrp) \|\|
				3531	cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
				3532	return -EOPNOTSUPP;
				3533
				3534	/* we're joining the parent's domain, ensure its validity */
				3535	if (!cgroup_is_valid_domain(dom_cgrp) \|\|
				3536	!cgroup_can_be_thread_root(dom_cgrp))
				3537	return -EOPNOTSUPP;
				3538
				3539	/*
				3540	* The following shouldn't cause actual migrations and should
				3541	* always succeed.
				3542	*/
				3543	cgroup_save_control(cgrp);
				3544
				3545	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)
				3546	if (dsct == cgrp \|\| cgroup_is_threaded(dsct))
				3547	dsct->dom_cgrp = dom_cgrp;
				3548
				3549	ret = cgroup_apply_control(cgrp);
				3550	if (!ret)
				3551	parent->nr_threaded_children++;
				3552
				3553	cgroup_finalize_control(cgrp, ret);
				3554	return ret;
				3555	}
				3556
				3557	static int cgroup_type_show(struct seq_file seq, void v)
				3558	{
				3559	struct cgroup *cgrp = seq_css(seq)->cgroup;
				3560
				3561	if (cgroup_is_threaded(cgrp))
				3562	seq_puts(seq, "threaded\n");
				3563	else if (!cgroup_is_valid_domain(cgrp))
				3564	seq_puts(seq, "domain invalid\n");
				3565	else if (cgroup_is_thread_root(cgrp))
				3566	seq_puts(seq, "domain threaded\n");
				3567	else
				3568	seq_puts(seq, "domain\n");
				3569
				3570	return 0;
				3571	}
				3572
				3573	static ssize_t cgroup_type_write(struct kernfs_open_file of, char buf,
				3574	size_t nbytes, loff_t off)
				3575	{
				3576	struct cgroup *cgrp;
				3577	int ret;
				3578
				3579	/* only switching to threaded mode is supported */
				3580	if (strcmp(strstrip(buf), "threaded"))
				3581	return -EINVAL;
				3582
				3583	/* drain dying csses before we re-apply (threaded) subtree control */
				3584	cgrp = cgroup_kn_lock_live(of->kn, true);
				3585	if (!cgrp)
				3586	return -ENOENT;
				3587
				3588	/* threaded can only be enabled */
				3589	ret = cgroup_enable_threaded(cgrp);
				3590
				3591	cgroup_kn_unlock(of->kn);
				3592	return ret ?: nbytes;
				3593	}
				3594
				3595	static int cgroup_max_descendants_show(struct seq_file seq, void v)
				3596	{
				3597	struct cgroup *cgrp = seq_css(seq)->cgroup;
				3598	int descendants = READ_ONCE(cgrp->max_descendants);
				3599
				3600	if (descendants == INT_MAX)
				3601	seq_puts(seq, "max\n");
				3602	else
				3603	seq_printf(seq, "%d\n", descendants);
				3604
				3605	return 0;
				3606	}
				3607
				3608	static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
				3609	char *buf, size_t nbytes, loff_t off)
				3610	{
				3611	struct cgroup *cgrp;
				3612	int descendants;
				3613	ssize_t ret;
				3614
				3615	buf = strstrip(buf);
				3616	if (!strcmp(buf, "max")) {
				3617	descendants = INT_MAX;
				3618	} else {
				3619	ret = kstrtoint(buf, 0, &descendants);
				3620	if (ret)
				3621	return ret;
				3622	}
				3623
				3624	if (descendants < 0)
				3625	return -ERANGE;
				3626
				3627	cgrp = cgroup_kn_lock_live(of->kn, false);
				3628	if (!cgrp)
				3629	return -ENOENT;
				3630
				3631	cgrp->max_descendants = descendants;
				3632
				3633	cgroup_kn_unlock(of->kn);
				3634
				3635	return nbytes;
				3636	}
				3637
				3638	static int cgroup_max_depth_show(struct seq_file seq, void v)
				3639	{
				3640	struct cgroup *cgrp = seq_css(seq)->cgroup;
				3641	int depth = READ_ONCE(cgrp->max_depth);
				3642
				3643	if (depth == INT_MAX)
				3644	seq_puts(seq, "max\n");
				3645	else
				3646	seq_printf(seq, "%d\n", depth);
				3647
				3648	return 0;
				3649	}
				3650
				3651	static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
				3652	char *buf, size_t nbytes, loff_t off)
				3653	{
				3654	struct cgroup *cgrp;
				3655	ssize_t ret;
				3656	int depth;
				3657
				3658	buf = strstrip(buf);
				3659	if (!strcmp(buf, "max")) {
				3660	depth = INT_MAX;
				3661	} else {
				3662	ret = kstrtoint(buf, 0, &depth);
				3663	if (ret)
				3664	return ret;
				3665	}
				3666
				3667	if (depth < 0)
				3668	return -ERANGE;
				3669
				3670	cgrp = cgroup_kn_lock_live(of->kn, false);
				3671	if (!cgrp)
				3672	return -ENOENT;
				3673
				3674	cgrp->max_depth = depth;
				3675
				3676	cgroup_kn_unlock(of->kn);
				3677
				3678	return nbytes;
				3679	}
				3680
				3681	static int cgroup_events_show(struct seq_file seq, void v)
				3682	{
				3683	struct cgroup *cgrp = seq_css(seq)->cgroup;
				3684
				3685	seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp));
				3686	seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags));
				3687
				3688	return 0;
				3689	}
				3690
				3691	static int cgroup_stat_show(struct seq_file seq, void v)
				3692	{
				3693	struct cgroup *cgroup = seq_css(seq)->cgroup;
				3694
				3695	seq_printf(seq, "nr_descendants %d\n",
				3696	cgroup->nr_descendants);
				3697	seq_printf(seq, "nr_dying_descendants %d\n",
				3698	cgroup->nr_dying_descendants);
				3699
				3700	return 0;
				3701	}
				3702
				3703	static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
				3704	struct cgroup *cgrp, int ssid)
				3705	{
				3706	struct cgroup_subsys *ss = cgroup_subsys[ssid];
				3707	struct cgroup_subsys_state *css;
				3708	int ret;
				3709
				3710	if (!ss->css_extra_stat_show)
				3711	return 0;
				3712
				3713	css = cgroup_tryget_css(cgrp, ss);
				3714	if (!css)
				3715	return 0;
				3716
				3717	ret = ss->css_extra_stat_show(seq, css);
				3718	css_put(css);
				3719	return ret;
				3720	}
				3721
				3722	static int cpu_stat_show(struct seq_file seq, void v)
				3723	{
				3724	struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
				3725	int ret = 0;
				3726
				3727	cgroup_base_stat_cputime_show(seq);
				3728	#ifdef CONFIG_CGROUP_SCHED
				3729	ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
				3730	#endif
				3731	return ret;
				3732	}
				3733
				3734	#ifdef CONFIG_PSI
				3735	static int cgroup_io_pressure_show(struct seq_file seq, void v)
				3736	{
				3737	struct cgroup *cgroup = seq_css(seq)->cgroup;
				3738	struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
				3739
				3740	return psi_show(seq, psi, PSI_IO);
				3741	}
				3742	static int cgroup_memory_pressure_show(struct seq_file seq, void v)
				3743	{
				3744	struct cgroup *cgroup = seq_css(seq)->cgroup;
				3745	struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
				3746
				3747	return psi_show(seq, psi, PSI_MEM);
				3748	}
				3749	static int cgroup_cpu_pressure_show(struct seq_file seq, void v)
				3750	{
				3751	struct cgroup *cgroup = seq_css(seq)->cgroup;
				3752	struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi;
				3753
				3754	return psi_show(seq, psi, PSI_CPU);
				3755	}
				3756
				3757	static ssize_t cgroup_pressure_write(struct kernfs_open_file of, char buf,
				3758	size_t nbytes, enum psi_res res)
				3759	{
				3760	struct cgroup_file_ctx *ctx = of->priv;
				3761	struct psi_trigger *new;
				3762	struct cgroup *cgrp;
				3763	struct psi_group *psi;
				3764
				3765	cgrp = cgroup_kn_lock_live(of->kn, false);
				3766	if (!cgrp)
				3767	return -ENODEV;
				3768
				3769	cgroup_get(cgrp);
				3770	cgroup_kn_unlock(of->kn);
				3771
				3772	/* Allow only one trigger per file descriptor */
				3773	if (ctx->psi.trigger) {
				3774	cgroup_put(cgrp);
				3775	return -EBUSY;
				3776	}
				3777
				3778	psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
				3779	new = psi_trigger_create(psi, buf, nbytes, res);
				3780	if (IS_ERR(new)) {
				3781	cgroup_put(cgrp);
				3782	return PTR_ERR(new);
				3783	}
				3784
				3785	smp_store_release(&ctx->psi.trigger, new);
				3786	cgroup_put(cgrp);
				3787
				3788	return nbytes;
				3789	}
				3790
				3791	static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
				3792	char *buf, size_t nbytes,
				3793	loff_t off)
				3794	{
				3795	return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
				3796	}
				3797
				3798	static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
				3799	char *buf, size_t nbytes,
				3800	loff_t off)
				3801	{
				3802	return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
				3803	}
				3804
				3805	static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
				3806	char *buf, size_t nbytes,
				3807	loff_t off)
				3808	{
				3809	return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
				3810	}
				3811
				3812	static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
				3813	poll_table *pt)
				3814	{
				3815	struct cgroup_file_ctx *ctx = of->priv;
				3816	return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
				3817	}
				3818
				3819	static void cgroup_pressure_release(struct kernfs_open_file *of)
				3820	{
				3821	struct cgroup_file_ctx *ctx = of->priv;
				3822
				3823	psi_trigger_destroy(ctx->psi.trigger);
				3824	}
				3825
				3826	bool cgroup_psi_enabled(void)
				3827	{
				3828	return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
				3829	}
				3830
				3831	#else /* CONFIG_PSI */
				3832	bool cgroup_psi_enabled(void)
				3833	{
				3834	return false;
				3835	}
				3836
				3837	#endif /* CONFIG_PSI */
				3838
				3839	static int cgroup_freeze_show(struct seq_file seq, void v)
				3840	{
				3841	struct cgroup *cgrp = seq_css(seq)->cgroup;
				3842
				3843	seq_printf(seq, "%d\n", cgrp->freezer.freeze);
				3844
				3845	return 0;
				3846	}
				3847
				3848	static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
				3849	char *buf, size_t nbytes, loff_t off)
				3850	{
				3851	struct cgroup *cgrp;
				3852	ssize_t ret;
				3853	int freeze;
				3854
				3855	ret = kstrtoint(strstrip(buf), 0, &freeze);
				3856	if (ret)
				3857	return ret;
				3858
				3859	if (freeze < 0 \|\| freeze > 1)
				3860	return -ERANGE;
				3861
				3862	cgrp = cgroup_kn_lock_live(of->kn, false);
				3863	if (!cgrp)
				3864	return -ENOENT;
				3865
				3866	cgroup_freeze(cgrp, freeze);
				3867
				3868	cgroup_kn_unlock(of->kn);
				3869
				3870	return nbytes;
				3871	}
				3872
				3873	static int cgroup_file_open(struct kernfs_open_file *of)
				3874	{
				3875	struct cftype *cft = of->kn->priv;
				3876	struct cgroup_file_ctx *ctx;
				3877	int ret;
				3878
				3879	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
				3880	if (!ctx)
				3881	return -ENOMEM;
				3882
				3883	ctx->ns = current->nsproxy->cgroup_ns;
				3884	get_cgroup_ns(ctx->ns);
				3885	of->priv = ctx;
				3886
				3887	if (!cft->open)
				3888	return 0;
				3889
				3890	ret = cft->open(of);
				3891	if (ret) {
				3892	put_cgroup_ns(ctx->ns);
				3893	kfree(ctx);
				3894	}
				3895	return ret;
				3896	}
				3897
				3898	static void cgroup_file_release(struct kernfs_open_file *of)
				3899	{
				3900	struct cftype *cft = of->kn->priv;
				3901	struct cgroup_file_ctx *ctx = of->priv;
				3902
				3903	if (cft->release)
				3904	cft->release(of);
				3905	put_cgroup_ns(ctx->ns);
				3906	kfree(ctx);
				3907	}
				3908
				3909	static ssize_t cgroup_file_write(struct kernfs_open_file of, char buf,
				3910	size_t nbytes, loff_t off)
				3911	{
				3912	struct cgroup_file_ctx *ctx = of->priv;
				3913	struct cgroup *cgrp = of->kn->parent->priv;
				3914	struct cftype *cft = of->kn->priv;
				3915	struct cgroup_subsys_state *css;
				3916	int ret;
				3917
				3918	/*
				3919	* If namespaces are delegation boundaries, disallow writes to
				3920	* files in an non-init namespace root from inside the namespace
				3921	* except for the files explicitly marked delegatable -
				3922	* cgroup.procs and cgroup.subtree_control.
				3923	*/
				3924	if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
				3925	!(cft->flags & CFTYPE_NS_DELEGATABLE) &&
				3926	ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp)
				3927	return -EPERM;
				3928
				3929	if (cft->write)
				3930	return cft->write(of, buf, nbytes, off);
				3931
				3932	/*
				3933	* kernfs guarantees that a file isn't deleted with operations in
				3934	* flight, which means that the matching css is and stays alive and
				3935	* doesn't need to be pinned. The RCU locking is not necessary
				3936	* either. It's just for the convenience of using cgroup_css().
				3937	*/
				3938	rcu_read_lock();
				3939	css = cgroup_css(cgrp, cft->ss);
				3940	rcu_read_unlock();
				3941
				3942	if (cft->write_u64) {
				3943	unsigned long long v;
				3944	ret = kstrtoull(buf, 0, &v);
				3945	if (!ret)
				3946	ret = cft->write_u64(css, cft, v);
				3947	} else if (cft->write_s64) {
				3948	long long v;
				3949	ret = kstrtoll(buf, 0, &v);
				3950	if (!ret)
				3951	ret = cft->write_s64(css, cft, v);
				3952	} else {
				3953	ret = -EINVAL;
				3954	}
				3955
				3956	return ret ?: nbytes;
				3957	}
				3958
				3959	static __poll_t cgroup_file_poll(struct kernfs_open_file of, poll_table pt)
				3960	{
				3961	struct cftype *cft = of->kn->priv;
				3962
				3963	if (cft->poll)
				3964	return cft->poll(of, pt);
				3965
				3966	return kernfs_generic_poll(of, pt);
				3967	}
				3968
				3969	static void cgroup_seqfile_start(struct seq_file seq, loff_t *ppos)
				3970	{
				3971	return seq_cft(seq)->seq_start(seq, ppos);
				3972	}
				3973
				3974	static void cgroup_seqfile_next(struct seq_file seq, void v, loff_t ppos)
				3975	{
				3976	return seq_cft(seq)->seq_next(seq, v, ppos);
				3977	}
				3978
				3979	static void cgroup_seqfile_stop(struct seq_file seq, void v)
				3980	{
				3981	if (seq_cft(seq)->seq_stop)
				3982	seq_cft(seq)->seq_stop(seq, v);
				3983	}
				3984
				3985	static int cgroup_seqfile_show(struct seq_file m, void arg)
				3986	{
				3987	struct cftype *cft = seq_cft(m);
				3988	struct cgroup_subsys_state *css = seq_css(m);
				3989
				3990	if (cft->seq_show)
				3991	return cft->seq_show(m, arg);
				3992
				3993	if (cft->read_u64)
				3994	seq_printf(m, "%llu\n", cft->read_u64(css, cft));
				3995	else if (cft->read_s64)
				3996	seq_printf(m, "%lld\n", cft->read_s64(css, cft));
				3997	else
				3998	return -EINVAL;
				3999	return 0;
				4000	}
				4001
				4002	static struct kernfs_ops cgroup_kf_single_ops = {
				4003	.atomic_write_len = PAGE_SIZE,
				4004	.open = cgroup_file_open,
				4005	.release = cgroup_file_release,
				4006	.write = cgroup_file_write,
				4007	.poll = cgroup_file_poll,
				4008	.seq_show = cgroup_seqfile_show,
				4009	};
				4010
				4011	static struct kernfs_ops cgroup_kf_ops = {
				4012	.atomic_write_len = PAGE_SIZE,
				4013	.open = cgroup_file_open,
				4014	.release = cgroup_file_release,
				4015	.write = cgroup_file_write,
				4016	.poll = cgroup_file_poll,
				4017	.seq_start = cgroup_seqfile_start,
				4018	.seq_next = cgroup_seqfile_next,
				4019	.seq_stop = cgroup_seqfile_stop,
				4020	.seq_show = cgroup_seqfile_show,
				4021	};
				4022
				4023	/* set uid and gid of cgroup dirs and files to that of the creator */
				4024	static int cgroup_kn_set_ugid(struct kernfs_node *kn)
				4025	{
				4026	struct iattr iattr = { .ia_valid = ATTR_UID \| ATTR_GID,
				4027	.ia_uid = current_fsuid(),
				4028	.ia_gid = current_fsgid(), };
				4029
				4030	if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
				4031	gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
				4032	return 0;
				4033
				4034	return kernfs_setattr(kn, &iattr);
				4035	}
				4036
				4037	static void cgroup_file_notify_timer(struct timer_list *timer)
				4038	{
				4039	cgroup_file_notify(container_of(timer, struct cgroup_file,
				4040	notify_timer));
				4041	}
				4042
				4043	static int cgroup_add_file(struct cgroup_subsys_state css, struct cgroup cgrp,
				4044	struct cftype *cft)
				4045	{
				4046	char name[CGROUP_FILE_NAME_MAX];
				4047	struct kernfs_node *kn;
				4048	struct lock_class_key *key = NULL;
				4049	int ret;
				4050
				4051	#ifdef CONFIG_DEBUG_LOCK_ALLOC
				4052	key = &cft->lockdep_key;
				4053	#endif
				4054	kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
				4055	cgroup_file_mode(cft),
				4056	GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
				4057	0, cft->kf_ops, cft,
				4058	NULL, key);
				4059	if (IS_ERR(kn))
				4060	return PTR_ERR(kn);
				4061
				4062	ret = cgroup_kn_set_ugid(kn);
				4063	if (ret) {
				4064	kernfs_remove(kn);
				4065	return ret;
				4066	}
				4067
				4068	if (cft->file_offset) {
				4069	struct cgroup_file cfile = (void )css + cft->file_offset;
				4070
				4071	timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
				4072
				4073	spin_lock_irq(&cgroup_file_kn_lock);
				4074	cfile->kn = kn;
				4075	spin_unlock_irq(&cgroup_file_kn_lock);
				4076	}
				4077
				4078	return 0;
				4079	}
				4080
				4081	/**
				4082	* cgroup_addrm_files - add or remove files to a cgroup directory
				4083	* @css: the target css
				4084	* @cgrp: the target cgroup (usually css->cgroup)
				4085	* @cfts: array of cftypes to be added
				4086	* @is_add: whether to add or remove
				4087	*
				4088	* Depending on @is_add, add or remove files defined by @cfts on @cgrp.
				4089	* For removals, this function never fails.
				4090	*/
				4091	static int cgroup_addrm_files(struct cgroup_subsys_state *css,
				4092	struct cgroup *cgrp, struct cftype cfts[],
				4093	bool is_add)
				4094	{
				4095	struct cftype cft, cft_end = NULL;
				4096	int ret = 0;
				4097
				4098	lockdep_assert_held(&cgroup_mutex);
				4099
				4100	restart:
				4101	for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
				4102	/* does cft->flags tell us to skip this file on @cgrp? */
				4103	if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
				4104	continue;
				4105	if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
				4106	continue;
				4107	if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
				4108	continue;
				4109	if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
				4110	continue;
				4111	if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
				4112	continue;
				4113	if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
				4114	continue;
				4115	if (is_add) {
				4116	ret = cgroup_add_file(css, cgrp, cft);
				4117	if (ret) {
				4118	pr_warn("%s: failed to add %s, err=%d\n",
				4119	__func__, cft->name, ret);
				4120	cft_end = cft;
				4121	is_add = false;
				4122	goto restart;
				4123	}
				4124	} else {
				4125	cgroup_rm_file(cgrp, cft);
				4126	}
				4127	}
				4128	return ret;
				4129	}
				4130
				4131	static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
				4132	{
				4133	struct cgroup_subsys *ss = cfts[0].ss;
				4134	struct cgroup *root = &ss->root->cgrp;
				4135	struct cgroup_subsys_state *css;
				4136	int ret = 0;
				4137
				4138	lockdep_assert_held(&cgroup_mutex);
				4139
				4140	/* add/rm files for all cgroups created before */
				4141	css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
				4142	struct cgroup *cgrp = css->cgroup;
				4143
				4144	if (!(css->flags & CSS_VISIBLE))
				4145	continue;
				4146
				4147	ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
				4148	if (ret)
				4149	break;
				4150	}
				4151
				4152	if (is_add && !ret)
				4153	kernfs_activate(root->kn);
				4154	return ret;
				4155	}
				4156
				4157	static void cgroup_exit_cftypes(struct cftype *cfts)
				4158	{
				4159	struct cftype *cft;
				4160
				4161	for (cft = cfts; cft->name[0] != '\0'; cft++) {
				4162	/* free copy for custom atomic_write_len, see init_cftypes() */
				4163	if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
				4164	kfree(cft->kf_ops);
				4165	cft->kf_ops = NULL;
				4166	cft->ss = NULL;
				4167
				4168	/* revert flags set by cgroup core while adding @cfts */
				4169	cft->flags &= ~(__CFTYPE_ONLY_ON_DFL \| __CFTYPE_NOT_ON_DFL);
				4170	}
				4171	}
				4172
				4173	static int cgroup_init_cftypes(struct cgroup_subsys ss, struct cftype cfts)
				4174	{
				4175	struct cftype *cft;
				4176
				4177	for (cft = cfts; cft->name[0] != '\0'; cft++) {
				4178	struct kernfs_ops *kf_ops;
				4179
				4180	WARN_ON(cft->ss \|\| cft->kf_ops);
				4181
				4182	if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
				4183	continue;
				4184
				4185	if (cft->seq_start)
				4186	kf_ops = &cgroup_kf_ops;
				4187	else
				4188	kf_ops = &cgroup_kf_single_ops;
				4189
				4190	/*
				4191	* Ugh... if @cft wants a custom max_write_len, we need to
				4192	* make a copy of kf_ops to set its atomic_write_len.
				4193	*/
				4194	if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
				4195	kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
				4196	if (!kf_ops) {
				4197	cgroup_exit_cftypes(cfts);
				4198	return -ENOMEM;
				4199	}
				4200	kf_ops->atomic_write_len = cft->max_write_len;
				4201	}
				4202
				4203	cft->kf_ops = kf_ops;
				4204	cft->ss = ss;
				4205	}
				4206
				4207	return 0;
				4208	}
				4209
				4210	static int cgroup_rm_cftypes_locked(struct cftype *cfts)
				4211	{
				4212	lockdep_assert_held(&cgroup_mutex);
				4213
				4214	if (!cfts \|\| !cfts[0].ss)
				4215	return -ENOENT;
				4216
				4217	list_del(&cfts->node);
				4218	cgroup_apply_cftypes(cfts, false);
				4219	cgroup_exit_cftypes(cfts);
				4220	return 0;
				4221	}
				4222
				4223	/**
				4224	* cgroup_rm_cftypes - remove an array of cftypes from a subsystem
				4225	* @cfts: zero-length name terminated array of cftypes
				4226	*
				4227	* Unregister @cfts. Files described by @cfts are removed from all
				4228	* existing cgroups and all future cgroups won't have them either. This
				4229	* function can be called anytime whether @cfts' subsys is attached or not.
				4230	*
				4231	* Returns 0 on successful unregistration, -ENOENT if @cfts is not
				4232	* registered.
				4233	*/
				4234	int cgroup_rm_cftypes(struct cftype *cfts)
				4235	{
				4236	int ret;
				4237
				4238	mutex_lock(&cgroup_mutex);
				4239	ret = cgroup_rm_cftypes_locked(cfts);
				4240	mutex_unlock(&cgroup_mutex);
				4241	return ret;
				4242	}
				4243
				4244	/**
				4245	* cgroup_add_cftypes - add an array of cftypes to a subsystem
				4246	* @ss: target cgroup subsystem
				4247	* @cfts: zero-length name terminated array of cftypes
				4248	*
				4249	* Register @cfts to @ss. Files described by @cfts are created for all
				4250	* existing cgroups to which @ss is attached and all future cgroups will
				4251	* have them too. This function can be called anytime whether @ss is
				4252	* attached or not.
				4253	*
				4254	* Returns 0 on successful registration, -errno on failure. Note that this
				4255	* function currently returns 0 as long as @cfts registration is successful
				4256	* even if some file creation attempts on existing cgroups fail.
				4257	*/
				4258	static int cgroup_add_cftypes(struct cgroup_subsys ss, struct cftype cfts)
				4259	{
				4260	int ret;
				4261
				4262	if (!cgroup_ssid_enabled(ss->id))
				4263	return 0;
				4264
				4265	if (!cfts \|\| cfts[0].name[0] == '\0')
				4266	return 0;
				4267
				4268	ret = cgroup_init_cftypes(ss, cfts);
				4269	if (ret)
				4270	return ret;
				4271
				4272	mutex_lock(&cgroup_mutex);
				4273
				4274	list_add_tail(&cfts->node, &ss->cfts);
				4275	ret = cgroup_apply_cftypes(cfts, true);
				4276	if (ret)
				4277	cgroup_rm_cftypes_locked(cfts);
				4278
				4279	mutex_unlock(&cgroup_mutex);
				4280	return ret;
				4281	}
				4282
				4283	/**
				4284	* cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
				4285	* @ss: target cgroup subsystem
				4286	* @cfts: zero-length name terminated array of cftypes
				4287	*
				4288	* Similar to cgroup_add_cftypes() but the added files are only used for
				4289	* the default hierarchy.
				4290	*/
				4291	int cgroup_add_dfl_cftypes(struct cgroup_subsys ss, struct cftype cfts)
				4292	{
				4293	struct cftype *cft;
				4294
				4295	for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
				4296	cft->flags \|= __CFTYPE_ONLY_ON_DFL;
				4297	return cgroup_add_cftypes(ss, cfts);
				4298	}
				4299
				4300	/**
				4301	* cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
				4302	* @ss: target cgroup subsystem
				4303	* @cfts: zero-length name terminated array of cftypes
				4304	*
				4305	* Similar to cgroup_add_cftypes() but the added files are only used for
				4306	* the legacy hierarchies.
				4307	*/
				4308	int cgroup_add_legacy_cftypes(struct cgroup_subsys ss, struct cftype cfts)
				4309	{
				4310	struct cftype *cft;
				4311
				4312	for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
				4313	cft->flags \|= __CFTYPE_NOT_ON_DFL;
				4314	return cgroup_add_cftypes(ss, cfts);
				4315	}
				4316
				4317	/**
				4318	* cgroup_file_notify - generate a file modified event for a cgroup_file
				4319	* @cfile: target cgroup_file
				4320	*
				4321	* @cfile must have been obtained by setting cftype->file_offset.
				4322	*/
				4323	void cgroup_file_notify(struct cgroup_file *cfile)
				4324	{
				4325	unsigned long flags;
				4326
				4327	spin_lock_irqsave(&cgroup_file_kn_lock, flags);
				4328	if (cfile->kn) {
				4329	unsigned long last = cfile->notified_at;
				4330	unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
				4331
				4332	if (time_in_range(jiffies, last, next)) {
				4333	timer_reduce(&cfile->notify_timer, next);
				4334	} else {
				4335	kernfs_notify(cfile->kn);
				4336	cfile->notified_at = jiffies;
				4337	}
				4338	}
				4339	spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
				4340	}
				4341
				4342	/**
				4343	* css_next_child - find the next child of a given css
				4344	* @pos: the current position (%NULL to initiate traversal)
				4345	* @parent: css whose children to walk
				4346	*
				4347	* This function returns the next child of @parent and should be called
				4348	* under either cgroup_mutex or RCU read lock. The only requirement is
				4349	* that @parent and @pos are accessible. The next sibling is guaranteed to
				4350	* be returned regardless of their states.
				4351	*
				4352	* If a subsystem synchronizes ->css_online() and the start of iteration, a
				4353	* css which finished ->css_online() is guaranteed to be visible in the
				4354	* future iterations and will stay visible until the last reference is put.
				4355	* A css which hasn't finished ->css_online() or already finished
				4356	* ->css_offline() may show up during traversal. It's each subsystem's
				4357	* responsibility to synchronize against on/offlining.
				4358	*/
				4359	struct cgroup_subsys_state css_next_child(struct cgroup_subsys_state pos,
				4360	struct cgroup_subsys_state *parent)
				4361	{
				4362	struct cgroup_subsys_state *next;
				4363
				4364	cgroup_assert_mutex_or_rcu_locked();
				4365
				4366	/*
				4367	* @pos could already have been unlinked from the sibling list.
				4368	* Once a cgroup is removed, its ->sibling.next is no longer
				4369	* updated when its next sibling changes. CSS_RELEASED is set when
				4370	* @pos is taken off list, at which time its next pointer is valid,
				4371	* and, as releases are serialized, the one pointed to by the next
				4372	* pointer is guaranteed to not have started release yet. This
				4373	* implies that if we observe !CSS_RELEASED on @pos in this RCU
				4374	* critical section, the one pointed to by its next pointer is
				4375	* guaranteed to not have finished its RCU grace period even if we
				4376	* have dropped rcu_read_lock() inbetween iterations.
				4377	*
				4378	* If @pos has CSS_RELEASED set, its next pointer can't be
				4379	* dereferenced; however, as each css is given a monotonically
				4380	* increasing unique serial number and always appended to the
				4381	* sibling list, the next one can be found by walking the parent's
				4382	* children until the first css with higher serial number than
				4383	* @pos's. While this path can be slower, it happens iff iteration
				4384	* races against release and the race window is very small.
				4385	*/
				4386	if (!pos) {
				4387	next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
				4388	} else if (likely(!(pos->flags & CSS_RELEASED))) {
				4389	next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
				4390	} else {
				4391	list_for_each_entry_rcu(next, &parent->children, sibling)
				4392	if (next->serial_nr > pos->serial_nr)
				4393	break;
				4394	}
				4395
				4396	/*
				4397	* @next, if not pointing to the head, can be dereferenced and is
				4398	* the next sibling.
				4399	*/
				4400	if (&next->sibling != &parent->children)
				4401	return next;
				4402	return NULL;
				4403	}
				4404
				4405	/**
				4406	* css_next_descendant_pre - find the next descendant for pre-order walk
				4407	* @pos: the current position (%NULL to initiate traversal)
				4408	* @root: css whose descendants to walk
				4409	*
				4410	* To be used by css_for_each_descendant_pre(). Find the next descendant
				4411	* to visit for pre-order traversal of @root's descendants. @root is
				4412	* included in the iteration and the first node to be visited.
				4413	*
				4414	* While this function requires cgroup_mutex or RCU read locking, it
				4415	* doesn't require the whole traversal to be contained in a single critical
				4416	* section. This function will return the correct next descendant as long
				4417	* as both @pos and @root are accessible and @pos is a descendant of @root.
				4418	*
				4419	* If a subsystem synchronizes ->css_online() and the start of iteration, a
				4420	* css which finished ->css_online() is guaranteed to be visible in the
				4421	* future iterations and will stay visible until the last reference is put.
				4422	* A css which hasn't finished ->css_online() or already finished
				4423	* ->css_offline() may show up during traversal. It's each subsystem's
				4424	* responsibility to synchronize against on/offlining.
				4425	*/
				4426	struct cgroup_subsys_state *
				4427	css_next_descendant_pre(struct cgroup_subsys_state *pos,
				4428	struct cgroup_subsys_state *root)
				4429	{
				4430	struct cgroup_subsys_state *next;
				4431
				4432	cgroup_assert_mutex_or_rcu_locked();
				4433
				4434	/* if first iteration, visit @root */
				4435	if (!pos)
				4436	return root;
				4437
				4438	/* visit the first child if exists */
				4439	next = css_next_child(NULL, pos);
				4440	if (next)
				4441	return next;
				4442
				4443	/* no child, visit my or the closest ancestor's next sibling */
				4444	while (pos != root) {
				4445	next = css_next_child(pos, pos->parent);
				4446	if (next)
				4447	return next;
				4448	pos = pos->parent;
				4449	}
				4450
				4451	return NULL;
				4452	}
				4453	EXPORT_SYMBOL_GPL(css_next_descendant_pre);
				4454
				4455	/**
				4456	* css_rightmost_descendant - return the rightmost descendant of a css
				4457	* @pos: css of interest
				4458	*
				4459	* Return the rightmost descendant of @pos. If there's no descendant, @pos
				4460	* is returned. This can be used during pre-order traversal to skip
				4461	* subtree of @pos.
				4462	*
				4463	* While this function requires cgroup_mutex or RCU read locking, it
				4464	* doesn't require the whole traversal to be contained in a single critical
				4465	* section. This function will return the correct rightmost descendant as
				4466	* long as @pos is accessible.
				4467	*/
				4468	struct cgroup_subsys_state *
				4469	css_rightmost_descendant(struct cgroup_subsys_state *pos)
				4470	{
				4471	struct cgroup_subsys_state last, tmp;
				4472
				4473	cgroup_assert_mutex_or_rcu_locked();
				4474
				4475	do {
				4476	last = pos;
				4477	/* ->prev isn't RCU safe, walk ->next till the end */
				4478	pos = NULL;
				4479	css_for_each_child(tmp, last)
				4480	pos = tmp;
				4481	} while (pos);
				4482
				4483	return last;
				4484	}
				4485
				4486	static struct cgroup_subsys_state *
				4487	css_leftmost_descendant(struct cgroup_subsys_state *pos)
				4488	{
				4489	struct cgroup_subsys_state *last;
				4490
				4491	do {
				4492	last = pos;
				4493	pos = css_next_child(NULL, pos);
				4494	} while (pos);
				4495
				4496	return last;
				4497	}
				4498
				4499	/**
				4500	* css_next_descendant_post - find the next descendant for post-order walk
				4501	* @pos: the current position (%NULL to initiate traversal)
				4502	* @root: css whose descendants to walk
				4503	*
				4504	* To be used by css_for_each_descendant_post(). Find the next descendant
				4505	* to visit for post-order traversal of @root's descendants. @root is
				4506	* included in the iteration and the last node to be visited.
				4507	*
				4508	* While this function requires cgroup_mutex or RCU read locking, it
				4509	* doesn't require the whole traversal to be contained in a single critical
				4510	* section. This function will return the correct next descendant as long
				4511	* as both @pos and @cgroup are accessible and @pos is a descendant of
				4512	* @cgroup.
				4513	*
				4514	* If a subsystem synchronizes ->css_online() and the start of iteration, a
				4515	* css which finished ->css_online() is guaranteed to be visible in the
				4516	* future iterations and will stay visible until the last reference is put.
				4517	* A css which hasn't finished ->css_online() or already finished
				4518	* ->css_offline() may show up during traversal. It's each subsystem's
				4519	* responsibility to synchronize against on/offlining.
				4520	*/
				4521	struct cgroup_subsys_state *
				4522	css_next_descendant_post(struct cgroup_subsys_state *pos,
				4523	struct cgroup_subsys_state *root)
				4524	{
				4525	struct cgroup_subsys_state *next;
				4526
				4527	cgroup_assert_mutex_or_rcu_locked();
				4528
				4529	/* if first iteration, visit leftmost descendant which may be @root */
				4530	if (!pos)
				4531	return css_leftmost_descendant(root);
				4532
				4533	/* if we visited @root, we're done */
				4534	if (pos == root)
				4535	return NULL;
				4536
				4537	/* if there's an unvisited sibling, visit its leftmost descendant */
				4538	next = css_next_child(pos, pos->parent);
				4539	if (next)
				4540	return css_leftmost_descendant(next);
				4541
				4542	/* no sibling left, visit parent */
				4543	return pos->parent;
				4544	}
				4545
				4546	/**
				4547	* css_has_online_children - does a css have online children
				4548	* @css: the target css
				4549	*
				4550	* Returns %true if @css has any online children; otherwise, %false. This
				4551	* function can be called from any context but the caller is responsible
				4552	* for synchronizing against on/offlining as necessary.
				4553	*/
				4554	bool css_has_online_children(struct cgroup_subsys_state *css)
				4555	{
				4556	struct cgroup_subsys_state *child;
				4557	bool ret = false;
				4558
				4559	rcu_read_lock();
				4560	css_for_each_child(child, css) {
				4561	if (child->flags & CSS_ONLINE) {
				4562	ret = true;
				4563	break;
				4564	}
				4565	}
				4566	rcu_read_unlock();
				4567	return ret;
				4568	}
				4569
				4570	static struct css_set css_task_iter_next_css_set(struct css_task_iter it)
				4571	{
				4572	struct list_head *l;
				4573	struct cgrp_cset_link *link;
				4574	struct css_set *cset;
				4575
				4576	lockdep_assert_held(&css_set_lock);
				4577
				4578	/* find the next threaded cset */
				4579	if (it->tcset_pos) {
				4580	l = it->tcset_pos->next;
				4581
				4582	if (l != it->tcset_head) {
				4583	it->tcset_pos = l;
				4584	return container_of(l, struct css_set,
				4585	threaded_csets_node);
				4586	}
				4587
				4588	it->tcset_pos = NULL;
				4589	}
				4590
				4591	/* find the next cset */
				4592	l = it->cset_pos;
				4593	l = l->next;
				4594	if (l == it->cset_head) {
				4595	it->cset_pos = NULL;
				4596	return NULL;
				4597	}
				4598
				4599	if (it->ss) {
				4600	cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
				4601	} else {
				4602	link = list_entry(l, struct cgrp_cset_link, cset_link);
				4603	cset = link->cset;
				4604	}
				4605
				4606	it->cset_pos = l;
				4607
				4608	/* initialize threaded css_set walking */
				4609	if (it->flags & CSS_TASK_ITER_THREADED) {
				4610	if (it->cur_dcset)
				4611	put_css_set_locked(it->cur_dcset);
				4612	it->cur_dcset = cset;
				4613	get_css_set(cset);
				4614
				4615	it->tcset_head = &cset->threaded_csets;
				4616	it->tcset_pos = &cset->threaded_csets;
				4617	}
				4618
				4619	return cset;
				4620	}
				4621
				4622	/**
				4623	* css_task_iter_advance_css_set - advance a task itererator to the next css_set
				4624	* @it: the iterator to advance
				4625	*
				4626	* Advance @it to the next css_set to walk.
				4627	*/
				4628	static void css_task_iter_advance_css_set(struct css_task_iter *it)
				4629	{
				4630	struct css_set *cset;
				4631
				4632	lockdep_assert_held(&css_set_lock);
				4633
				4634	/* Advance to the next non-empty css_set */
				4635	do {
				4636	cset = css_task_iter_next_css_set(it);
				4637	if (!cset) {
				4638	it->task_pos = NULL;
				4639	return;
				4640	}
				4641	} while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
				4642
				4643	if (!list_empty(&cset->tasks)) {
				4644	it->task_pos = cset->tasks.next;
				4645	it->cur_tasks_head = &cset->tasks;
				4646	} else if (!list_empty(&cset->mg_tasks)) {
				4647	it->task_pos = cset->mg_tasks.next;
				4648	it->cur_tasks_head = &cset->mg_tasks;
				4649	} else {
				4650	it->task_pos = cset->dying_tasks.next;
				4651	it->cur_tasks_head = &cset->dying_tasks;
				4652	}
				4653
				4654	it->tasks_head = &cset->tasks;
				4655	it->mg_tasks_head = &cset->mg_tasks;
				4656	it->dying_tasks_head = &cset->dying_tasks;
				4657
				4658	/*
				4659	* We don't keep css_sets locked across iteration steps and thus
				4660	* need to take steps to ensure that iteration can be resumed after
				4661	* the lock is re-acquired. Iteration is performed at two levels -
				4662	* css_sets and tasks in them.
				4663	*
				4664	* Once created, a css_set never leaves its cgroup lists, so a
				4665	* pinned css_set is guaranteed to stay put and we can resume
				4666	* iteration afterwards.
				4667	*
				4668	* Tasks may leave @cset across iteration steps. This is resolved
				4669	* by registering each iterator with the css_set currently being
				4670	* walked and making css_set_move_task() advance iterators whose
				4671	* next task is leaving.
				4672	*/
				4673	if (it->cur_cset) {
				4674	list_del(&it->iters_node);
				4675	put_css_set_locked(it->cur_cset);
				4676	}
				4677	get_css_set(cset);
				4678	it->cur_cset = cset;
				4679	list_add(&it->iters_node, &cset->task_iters);
				4680	}
				4681
				4682	static void css_task_iter_skip(struct css_task_iter *it,
				4683	struct task_struct *task)
				4684	{
				4685	lockdep_assert_held(&css_set_lock);
				4686
				4687	if (it->task_pos == &task->cg_list) {
				4688	it->task_pos = it->task_pos->next;
				4689	it->flags \|= CSS_TASK_ITER_SKIPPED;
				4690	}
				4691	}
				4692
				4693	static void css_task_iter_advance(struct css_task_iter *it)
				4694	{
				4695	struct task_struct *task;
				4696
				4697	lockdep_assert_held(&css_set_lock);
				4698	repeat:
				4699	if (it->task_pos) {
				4700	/*
				4701	* Advance iterator to find next entry. cset->tasks is
				4702	* consumed first and then ->mg_tasks. After ->mg_tasks,
				4703	* we move onto the next cset.
				4704	*/
				4705	if (it->flags & CSS_TASK_ITER_SKIPPED)
				4706	it->flags &= ~CSS_TASK_ITER_SKIPPED;
				4707	else
				4708	it->task_pos = it->task_pos->next;
				4709
				4710	if (it->task_pos == it->tasks_head) {
				4711	it->task_pos = it->mg_tasks_head->next;
				4712	it->cur_tasks_head = it->mg_tasks_head;
				4713	}
				4714	if (it->task_pos == it->mg_tasks_head) {
				4715	it->task_pos = it->dying_tasks_head->next;
				4716	it->cur_tasks_head = it->dying_tasks_head;
				4717	}
				4718	if (it->task_pos == it->dying_tasks_head)
				4719	css_task_iter_advance_css_set(it);
				4720	} else {
				4721	/* called from start, proceed to the first cset */
				4722	css_task_iter_advance_css_set(it);
				4723	}
				4724
				4725	if (!it->task_pos)
				4726	return;
				4727
				4728	task = list_entry(it->task_pos, struct task_struct, cg_list);
				4729
				4730	if (it->flags & CSS_TASK_ITER_PROCS) {
				4731	/* if PROCS, skip over tasks which aren't group leaders */
				4732	if (!thread_group_leader(task))
				4733	goto repeat;
				4734
				4735	/* and dying leaders w/o live member threads */
				4736	if (it->cur_tasks_head == it->dying_tasks_head &&
				4737	!atomic_read(&task->signal->live))
				4738	goto repeat;
				4739	} else {
				4740	/* skip all dying ones */
				4741	if (it->cur_tasks_head == it->dying_tasks_head)
				4742	goto repeat;
				4743	}
				4744	}
				4745
				4746	/**
				4747	* css_task_iter_start - initiate task iteration
				4748	* @css: the css to walk tasks of
				4749	* @flags: CSS_TASK_ITER_* flags
				4750	* @it: the task iterator to use
				4751	*
				4752	* Initiate iteration through the tasks of @css. The caller can call
				4753	* css_task_iter_next() to walk through the tasks until the function
				4754	* returns NULL. On completion of iteration, css_task_iter_end() must be
				4755	* called.
				4756	*/
				4757	void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
				4758	struct css_task_iter *it)
				4759	{
				4760	/* no one should try to iterate before mounting cgroups */
				4761	WARN_ON_ONCE(!use_task_css_set_links);
				4762
				4763	memset(it, 0, sizeof(*it));
				4764
				4765	spin_lock_irq(&css_set_lock);
				4766
				4767	it->ss = css->ss;
				4768	it->flags = flags;
				4769
				4770	if (it->ss)
				4771	it->cset_pos = &css->cgroup->e_csets[css->ss->id];
				4772	else
				4773	it->cset_pos = &css->cgroup->cset_links;
				4774
				4775	it->cset_head = it->cset_pos;
				4776
				4777	css_task_iter_advance(it);
				4778
				4779	spin_unlock_irq(&css_set_lock);
				4780	}
				4781
				4782	/**
				4783	* css_task_iter_next - return the next task for the iterator
				4784	* @it: the task iterator being iterated
				4785	*
				4786	* The "next" function for task iteration. @it should have been
				4787	* initialized via css_task_iter_start(). Returns NULL when the iteration
				4788	* reaches the end.
				4789	*/
				4790	struct task_struct css_task_iter_next(struct css_task_iter it)
				4791	{
				4792	if (it->cur_task) {
				4793	put_task_struct(it->cur_task);
				4794	it->cur_task = NULL;
				4795	}
				4796
				4797	spin_lock_irq(&css_set_lock);
				4798
				4799	/* @it may be half-advanced by skips, finish advancing */
				4800	if (it->flags & CSS_TASK_ITER_SKIPPED)
				4801	css_task_iter_advance(it);
				4802
				4803	if (it->task_pos) {
				4804	it->cur_task = list_entry(it->task_pos, struct task_struct,
				4805	cg_list);
				4806	get_task_struct(it->cur_task);
				4807	css_task_iter_advance(it);
				4808	}
				4809
				4810	spin_unlock_irq(&css_set_lock);
				4811
				4812	return it->cur_task;
				4813	}
				4814
				4815	/**
				4816	* css_task_iter_end - finish task iteration
				4817	* @it: the task iterator to finish
				4818	*
				4819	* Finish task iteration started by css_task_iter_start().
				4820	*/
				4821	void css_task_iter_end(struct css_task_iter *it)
				4822	{
				4823	if (it->cur_cset) {
				4824	spin_lock_irq(&css_set_lock);
				4825	list_del(&it->iters_node);
				4826	put_css_set_locked(it->cur_cset);
				4827	spin_unlock_irq(&css_set_lock);
				4828	}
				4829
				4830	if (it->cur_dcset)
				4831	put_css_set(it->cur_dcset);
				4832
				4833	if (it->cur_task)
				4834	put_task_struct(it->cur_task);
				4835	}
				4836
				4837	static void cgroup_procs_release(struct kernfs_open_file *of)
				4838	{
				4839	struct cgroup_file_ctx *ctx = of->priv;
				4840
				4841	if (ctx->procs.started)
				4842	css_task_iter_end(&ctx->procs.iter);
				4843	}
				4844
				4845	static void cgroup_procs_next(struct seq_file s, void v, loff_t pos)
				4846	{
				4847	struct kernfs_open_file *of = s->private;
				4848	struct cgroup_file_ctx *ctx = of->priv;
				4849
				4850	if (pos)
				4851	(*pos)++;
				4852
				4853	return css_task_iter_next(&ctx->procs.iter);
				4854	}
				4855
				4856	static void __cgroup_procs_start(struct seq_file s, loff_t *pos,
				4857	unsigned int iter_flags)
				4858	{
				4859	struct kernfs_open_file *of = s->private;
				4860	struct cgroup *cgrp = seq_css(s)->cgroup;
				4861	struct cgroup_file_ctx *ctx = of->priv;
				4862	struct css_task_iter *it = &ctx->procs.iter;
				4863
				4864	/*
				4865	* When a seq_file is seeked, it's always traversed sequentially
				4866	* from position 0, so we can simply keep iterating on !0 *pos.
				4867	*/
				4868	if (!ctx->procs.started) {
				4869	if (WARN_ON_ONCE((*pos)))
				4870	return ERR_PTR(-EINVAL);
				4871
				4872	css_task_iter_start(&cgrp->self, iter_flags, it);
				4873	ctx->procs.started = true;
				4874	} else if (!(*pos)) {
				4875	css_task_iter_end(it);
				4876	css_task_iter_start(&cgrp->self, iter_flags, it);
				4877	} else
				4878	return it->cur_task;
				4879
				4880	return cgroup_procs_next(s, NULL, NULL);
				4881	}
				4882
				4883	static void cgroup_procs_start(struct seq_file s, loff_t *pos)
				4884	{
				4885	struct cgroup *cgrp = seq_css(s)->cgroup;
				4886
				4887	/*
				4888	* All processes of a threaded subtree belong to the domain cgroup
				4889	* of the subtree. Only threads can be distributed across the
				4890	* subtree. Reject reads on cgroup.procs in the subtree proper.
				4891	* They're always empty anyway.
				4892	*/
				4893	if (cgroup_is_threaded(cgrp))
				4894	return ERR_PTR(-EOPNOTSUPP);
				4895
				4896	return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS \|
				4897	CSS_TASK_ITER_THREADED);
				4898	}
				4899
				/*
				 * cgroup_procs_show - ->seq_show for the procs/threads files
				 * @s: the seq_file to print into
				 * @v: the element handed over by ->seq_start/->seq_next
				 *
				 * Prints task_pid_vnr() of @v followed by a newline.  Always returns 0.
				 */
				static int cgroup_procs_show(struct seq_file *s, void *v)
				{
					seq_printf(s, "%d\n", task_pid_vnr(v));
					return 0;
				}
				4905
				4906	static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
				4907	struct cgroup *dst_cgrp,
				4908	struct super_block *sb,
				4909	struct cgroup_namespace *ns)
				4910	{
				4911	struct cgroup *com_cgrp = src_cgrp;
				4912	struct inode *inode;
				4913	int ret;
				4914
				4915	lockdep_assert_held(&cgroup_mutex);
				4916
				4917	/* find the common ancestor */
				4918	while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
				4919	com_cgrp = cgroup_parent(com_cgrp);
				4920
				4921	/* %current should be authorized to migrate to the common ancestor */
				4922	inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
				4923	if (!inode)
				4924	return -ENOMEM;
				4925
				4926	ret = inode_permission(inode, MAY_WRITE);
				4927	iput(inode);
				4928	if (ret)
				4929	return ret;
				4930
				4931	/*
				4932	* If namespaces are delegation boundaries, %current must be able
				4933	* to see both source and destination cgroups from its namespace.
				4934	*/
				4935	if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
				4936	(!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) \|\|
				4937	!cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
				4938	return -ENOENT;
				4939
				4940	return 0;
				4941	}
				4942
				4943	static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
				4944	char *buf, size_t nbytes, loff_t off)
				4945	{
				4946	struct cgroup_file_ctx *ctx = of->priv;
				4947	struct cgroup src_cgrp, dst_cgrp;
				4948	struct task_struct *task;
				4949	const struct cred *saved_cred;
				4950	ssize_t ret;
				4951	bool threadgroup_locked;
				4952
				4953	dst_cgrp = cgroup_kn_lock_live(of->kn, false);
				4954	if (!dst_cgrp)
				4955	return -ENODEV;
				4956
				4957	task = cgroup_procs_write_start(buf, true, &threadgroup_locked);
				4958	ret = PTR_ERR_OR_ZERO(task);
				4959	if (ret)
				4960	goto out_unlock;
				4961
				4962	/* find the source cgroup */
				4963	spin_lock_irq(&css_set_lock);
				4964	src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
				4965	spin_unlock_irq(&css_set_lock);
				4966
				4967	/*
				4968	* Process and thread migrations follow same delegation rule. Check
				4969	* permissions using the credentials from file open to protect against
				4970	* inherited fd attacks.
				4971	*/
				4972	saved_cred = override_creds(of->file->f_cred);
				4973	ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
				4974	of->file->f_path.dentry->d_sb,
				4975	ctx->ns);
				4976	revert_creds(saved_cred);
				4977	if (ret)
				4978	goto out_finish;
				4979
				4980	ret = cgroup_attach_task(dst_cgrp, task, true);
				4981
				4982	out_finish:
				4983	cgroup_procs_write_finish(task, threadgroup_locked);
				4984	out_unlock:
				4985	cgroup_kn_unlock(of->kn);
				4986
				4987	return ret ?: nbytes;
				4988	}
				4989
				4990	static void cgroup_threads_start(struct seq_file s, loff_t *pos)
				4991	{
				4992	return __cgroup_procs_start(s, pos, 0);
				4993	}
				4994
				4995	static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
				4996	char *buf, size_t nbytes, loff_t off)
				4997	{
				4998	struct cgroup_file_ctx *ctx = of->priv;
				4999	struct cgroup src_cgrp, dst_cgrp;
				5000	struct task_struct *task;
				5001	const struct cred *saved_cred;
				5002	ssize_t ret;
				5003	bool locked;
				5004
				5005	buf = strstrip(buf);
				5006
				5007	dst_cgrp = cgroup_kn_lock_live(of->kn, false);
				5008	if (!dst_cgrp)
				5009	return -ENODEV;
				5010
				5011	task = cgroup_procs_write_start(buf, false, &locked);
				5012	ret = PTR_ERR_OR_ZERO(task);
				5013	if (ret)
				5014	goto out_unlock;
				5015
				5016	/* find the source cgroup */
				5017	spin_lock_irq(&css_set_lock);
				5018	src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
				5019	spin_unlock_irq(&css_set_lock);
				5020
				5021	/*
				5022	* Process and thread migrations follow same delegation rule. Check
				5023	* permissions using the credentials from file open to protect against
				5024	* inherited fd attacks.
				5025	*/
				5026	saved_cred = override_creds(of->file->f_cred);
				5027	ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
				5028	of->file->f_path.dentry->d_sb,
				5029	ctx->ns);
				5030	revert_creds(saved_cred);
				5031	if (ret)
				5032	goto out_finish;
				5033
				5034	/* and must be contained in the same domain */
				5035	ret = -EOPNOTSUPP;
				5036	if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
				5037	goto out_finish;
				5038
				5039	ret = cgroup_attach_task(dst_cgrp, task, false);
				5040
				5041	out_finish:
				5042	cgroup_procs_write_finish(task, locked);
				5043	out_unlock:
				5044	cgroup_kn_unlock(of->kn);
				5045
				5046	return ret ?: nbytes;
				5047	}
				5048
				5049	/* cgroup core interface files for the default hierarchy */
				5050	static struct cftype cgroup_base_files[] = {
				5051	{
				5052	.name = "cgroup.type",
				5053	.flags = CFTYPE_NOT_ON_ROOT,
				5054	.seq_show = cgroup_type_show,
				5055	.write = cgroup_type_write,
				5056	},
				5057	{
				5058	.name = "cgroup.procs",
				5059	.flags = CFTYPE_NS_DELEGATABLE,
				5060	.file_offset = offsetof(struct cgroup, procs_file),
				5061	.release = cgroup_procs_release,
				5062	.seq_start = cgroup_procs_start,
				5063	.seq_next = cgroup_procs_next,
				5064	.seq_show = cgroup_procs_show,
				5065	.write = cgroup_procs_write,
				5066	},
				5067	{
				5068	.name = "cgroup.threads",
				5069	.flags = CFTYPE_NS_DELEGATABLE,
				5070	.release = cgroup_procs_release,
				5071	.seq_start = cgroup_threads_start,
				5072	.seq_next = cgroup_procs_next,
				5073	.seq_show = cgroup_procs_show,
				5074	.write = cgroup_threads_write,
				5075	},
				5076	{
				5077	.name = "cgroup.controllers",
				5078	.seq_show = cgroup_controllers_show,
				5079	},
				5080	{
				5081	.name = "cgroup.subtree_control",
				5082	.flags = CFTYPE_NS_DELEGATABLE,
				5083	.seq_show = cgroup_subtree_control_show,
				5084	.write = cgroup_subtree_control_write,
				5085	},
				5086	{
				5087	.name = "cgroup.events",
				5088	.flags = CFTYPE_NOT_ON_ROOT,
				5089	.file_offset = offsetof(struct cgroup, events_file),
				5090	.seq_show = cgroup_events_show,
				5091	},
				5092	{
				5093	.name = "cgroup.max.descendants",
				5094	.seq_show = cgroup_max_descendants_show,
				5095	.write = cgroup_max_descendants_write,
				5096	},
				5097	{
				5098	.name = "cgroup.max.depth",
				5099	.seq_show = cgroup_max_depth_show,
				5100	.write = cgroup_max_depth_write,
				5101	},
				5102	{
				5103	.name = "cgroup.stat",
				5104	.seq_show = cgroup_stat_show,
				5105	},
				5106	{
				5107	.name = "cgroup.freeze",
				5108	.flags = CFTYPE_NOT_ON_ROOT,
				5109	.seq_show = cgroup_freeze_show,
				5110	.write = cgroup_freeze_write,
				5111	},
				5112	{
				5113	.name = "cpu.stat",
				5114	.flags = CFTYPE_NOT_ON_ROOT,
				5115	.seq_show = cpu_stat_show,
				5116	},
				5117	#ifdef CONFIG_PSI
				5118	{
				5119	.name = "io.pressure",
				5120	.flags = CFTYPE_PRESSURE,
				5121	.seq_show = cgroup_io_pressure_show,
				5122	.write = cgroup_io_pressure_write,
				5123	.poll = cgroup_pressure_poll,
				5124	.release = cgroup_pressure_release,
				5125	},
				5126	{
				5127	.name = "memory.pressure",
				5128	.flags = CFTYPE_PRESSURE,
				5129	.seq_show = cgroup_memory_pressure_show,
				5130	.write = cgroup_memory_pressure_write,
				5131	.poll = cgroup_pressure_poll,
				5132	.release = cgroup_pressure_release,
				5133	},
				5134	{
				5135	.name = "cpu.pressure",
				5136	.flags = CFTYPE_PRESSURE,
				5137	.seq_show = cgroup_cpu_pressure_show,
				5138	.write = cgroup_cpu_pressure_write,
				5139	.poll = cgroup_pressure_poll,
				5140	.release = cgroup_pressure_release,
				5141	},
				5142	#endif /* CONFIG_PSI */
				5143	{ } /* terminate */
				5144	};
				5145
				5146	/*
				5147	* css destruction is four-stage process.
				5148	*
				5149	* 1. Destruction starts. Killing of the percpu_ref is initiated.
				5150	* Implemented in kill_css().
				5151	*
				5152	* 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
				5153	* and thus css_tryget_online() is guaranteed to fail, the css can be
				5154	* offlined by invoking offline_css(). After offlining, the base ref is
				5155	* put. Implemented in css_killed_work_fn().
				5156	*
				5157	* 3. When the percpu_ref reaches zero, the only possible remaining
				5158	* accessors are inside RCU read sections. css_release() schedules the
				5159	* RCU callback.
				5160	*
				5161	* 4. After the grace period, the css can be freed. Implemented in
				5162	* css_free_work_fn().
				5163	*
				5164	* It is actually hairier because both step 2 and 4 require process context
				5165	* and thus involve punting to css->destroy_work adding two additional
				5166	* steps to the already complex sequence.
				5167	*/
				5168	static void css_free_rwork_fn(struct work_struct *work)
				5169	{
				5170	struct cgroup_subsys_state *css = container_of(to_rcu_work(work),
				5171	struct cgroup_subsys_state, destroy_rwork);
				5172	struct cgroup_subsys *ss = css->ss;
				5173	struct cgroup *cgrp = css->cgroup;
				5174
				5175	percpu_ref_exit(&css->refcnt);
				5176
				5177	if (ss) {
				5178	/* css free path */
				5179	struct cgroup_subsys_state *parent = css->parent;
				5180	int id = css->id;
				5181
				5182	ss->css_free(css);
				5183	cgroup_idr_remove(&ss->css_idr, id);
				5184	cgroup_put(cgrp);
				5185
				5186	if (parent)
				5187	css_put(parent);
				5188	} else {
				5189	/* cgroup free path */
				5190	atomic_dec(&cgrp->root->nr_cgrps);
				5191	cgroup1_pidlist_destroy_all(cgrp);
				5192	cancel_work_sync(&cgrp->release_agent_work);
				5193
				5194	if (cgroup_parent(cgrp)) {
				5195	/*
				5196	* We get a ref to the parent, and put the ref when
				5197	* this cgroup is being freed, so it's guaranteed
				5198	* that the parent won't be destroyed before its
				5199	* children.
				5200	*/
				5201	cgroup_put(cgroup_parent(cgrp));
				5202	kernfs_put(cgrp->kn);
				5203	psi_cgroup_free(cgrp);
				5204	if (cgroup_on_dfl(cgrp))
				5205	cgroup_rstat_exit(cgrp);
				5206	kfree(cgrp);
				5207	} else {
				5208	/*
				5209	* This is root cgroup's refcnt reaching zero,
				5210	* which indicates that the root should be
				5211	* released.
				5212	*/
				5213	cgroup_destroy_root(cgrp->root);
				5214	}
				5215	}
				5216	}
				5217
				5218	static void css_release_work_fn(struct work_struct *work)
				5219	{
				5220	struct cgroup_subsys_state *css =
				5221	container_of(work, struct cgroup_subsys_state, destroy_work);
				5222	struct cgroup_subsys *ss = css->ss;
				5223	struct cgroup *cgrp = css->cgroup;
				5224
				5225	mutex_lock(&cgroup_mutex);
				5226
				5227	css->flags \|= CSS_RELEASED;
				5228	list_del_rcu(&css->sibling);
				5229
				5230	if (ss) {
				5231	/* css release path */
				5232	if (!list_empty(&css->rstat_css_node)) {
				5233	cgroup_rstat_flush(cgrp);
				5234	list_del_rcu(&css->rstat_css_node);
				5235	}
				5236
				5237	cgroup_idr_replace(&ss->css_idr, NULL, css->id);
				5238	if (ss->css_released)
				5239	ss->css_released(css);
				5240	} else {
				5241	struct cgroup *tcgrp;
				5242
				5243	/* cgroup release path */
				5244	TRACE_CGROUP_PATH(release, cgrp);
				5245
				5246	if (cgroup_on_dfl(cgrp))
				5247	cgroup_rstat_flush(cgrp);
				5248
				5249	spin_lock_irq(&css_set_lock);
				5250	for (tcgrp = cgroup_parent(cgrp); tcgrp;
				5251	tcgrp = cgroup_parent(tcgrp))
				5252	tcgrp->nr_dying_descendants--;
				5253	spin_unlock_irq(&css_set_lock);
				5254
				5255	cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
				5256	cgrp->id = -1;
				5257
				5258	/*
				5259	* There are two control paths which try to determine
				5260	* cgroup from dentry without going through kernfs -
				5261	* cgroupstats_build() and css_tryget_online_from_dir().
				5262	* Those are supported by RCU protecting clearing of
				5263	* cgrp->kn->priv backpointer.
				5264	*/
				5265	if (cgrp->kn)
				5266	RCU_INIT_POINTER((void __rcu __force *)&cgrp->kn->priv,
				5267	NULL);
				5268	}
				5269
				5270	mutex_unlock(&cgroup_mutex);
				5271
				5272	INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
				5273	queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
				5274	}
				5275
				5276	static void css_release(struct percpu_ref *ref)
				5277	{
				5278	struct cgroup_subsys_state *css =
				5279	container_of(ref, struct cgroup_subsys_state, refcnt);
				5280
				5281	INIT_WORK(&css->destroy_work, css_release_work_fn);
				5282	queue_work(cgroup_destroy_wq, &css->destroy_work);
				5283	}
				5284
				5285	static void init_and_link_css(struct cgroup_subsys_state *css,
				5286	struct cgroup_subsys ss, struct cgroup cgrp)
				5287	{
				5288	lockdep_assert_held(&cgroup_mutex);
				5289
				5290	cgroup_get_live(cgrp);
				5291
				5292	memset(css, 0, sizeof(*css));
				5293	css->cgroup = cgrp;
				5294	css->ss = ss;
				5295	css->id = -1;
				5296	INIT_LIST_HEAD(&css->sibling);
				5297	INIT_LIST_HEAD(&css->children);
				5298	INIT_LIST_HEAD(&css->rstat_css_node);
				5299	css->serial_nr = css_serial_nr_next++;
				5300	atomic_set(&css->online_cnt, 0);
				5301
				5302	if (cgroup_parent(cgrp)) {
				5303	css->parent = cgroup_css(cgroup_parent(cgrp), ss);
				5304	css_get(css->parent);
				5305	}
				5306
				5307	if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush)
				5308	list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
				5309
				5310	BUG_ON(cgroup_css(cgrp, ss));
				5311	}
				5312
				5313	/* invoke ->css_online() on a new CSS and mark it online if successful */
				5314	static int online_css(struct cgroup_subsys_state *css)
				5315	{
				5316	struct cgroup_subsys *ss = css->ss;
				5317	int ret = 0;
				5318
				5319	lockdep_assert_held(&cgroup_mutex);
				5320
				5321	if (ss->css_online)
				5322	ret = ss->css_online(css);
				5323	if (!ret) {
				5324	css->flags \|= CSS_ONLINE;
				5325	rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
				5326
				5327	atomic_inc(&css->online_cnt);
				5328	if (css->parent)
				5329	atomic_inc(&css->parent->online_cnt);
				5330	}
				5331	return ret;
				5332	}
				5333
				5334	/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
				5335	static void offline_css(struct cgroup_subsys_state *css)
				5336	{
				5337	struct cgroup_subsys *ss = css->ss;
				5338
				5339	lockdep_assert_held(&cgroup_mutex);
				5340
				5341	if (!(css->flags & CSS_ONLINE))
				5342	return;
				5343
				5344	if (ss->css_offline)
				5345	ss->css_offline(css);
				5346
				5347	css->flags &= ~CSS_ONLINE;
				5348	RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
				5349
				5350	wake_up_all(&css->cgroup->offline_waitq);
				5351	}
				5352
				5353	/**
				5354	* css_create - create a cgroup_subsys_state
				5355	* @cgrp: the cgroup new css will be associated with
				5356	* @ss: the subsys of new css
				5357	*
				5358	* Create a new css associated with @cgrp - @ss pair. On success, the new
				5359	* css is online and installed in @cgrp. This function doesn't create the
				5360	* interface files. Returns 0 on success, -errno on failure.
				5361	*/
				5362	static struct cgroup_subsys_state css_create(struct cgroup cgrp,
				5363	struct cgroup_subsys *ss)
				5364	{
				5365	struct cgroup *parent = cgroup_parent(cgrp);
				5366	struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
				5367	struct cgroup_subsys_state *css;
				5368	int err;
				5369
				5370	lockdep_assert_held(&cgroup_mutex);
				5371
				5372	css = ss->css_alloc(parent_css);
				5373	if (!css)
				5374	css = ERR_PTR(-ENOMEM);
				5375	if (IS_ERR(css))
				5376	return css;
				5377
				5378	init_and_link_css(css, ss, cgrp);
				5379
				5380	err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
				5381	if (err)
				5382	goto err_free_css;
				5383
				5384	err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
				5385	if (err < 0)
				5386	goto err_free_css;
				5387	css->id = err;
				5388
				5389	/* @css is ready to be brought online now, make it visible */
				5390	list_add_tail_rcu(&css->sibling, &parent_css->children);
				5391	cgroup_idr_replace(&ss->css_idr, css, css->id);
				5392
				5393	err = online_css(css);
				5394	if (err)
				5395	goto err_list_del;
				5396
				5397	if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
				5398	cgroup_parent(parent)) {
				5399	pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
				5400	current->comm, current->pid, ss->name);
				5401	if (!strcmp(ss->name, "memory"))
				5402	pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
				5403	ss->warned_broken_hierarchy = true;
				5404	}
				5405
				5406	return css;
				5407
				5408	err_list_del:
				5409	list_del_rcu(&css->sibling);
				5410	err_free_css:
				5411	list_del_rcu(&css->rstat_css_node);
				5412	INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
				5413	queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
				5414	return ERR_PTR(err);
				5415	}
				5416
				5417	/*
				5418	* The returned cgroup is fully initialized including its control mask, but
				5419	* it isn't associated with its kernfs_node and doesn't have the control
				5420	* mask applied.
				5421	*/
				5422	static struct cgroup cgroup_create(struct cgroup parent)
				5423	{
				5424	struct cgroup_root *root = parent->root;
				5425	struct cgroup cgrp, tcgrp;
				5426	int level = parent->level + 1;
				5427	int ret;
				5428
				5429	/* allocate the cgroup and its ID, 0 is reserved for the root */
				5430	cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
				5431	GFP_KERNEL);
				5432	if (!cgrp)
				5433	return ERR_PTR(-ENOMEM);
				5434
				5435	ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
				5436	if (ret)
				5437	goto out_free_cgrp;
				5438
				5439	if (cgroup_on_dfl(parent)) {
				5440	ret = cgroup_rstat_init(cgrp);
				5441	if (ret)
				5442	goto out_cancel_ref;
				5443	}
				5444
				5445	/*
				5446	* Temporarily set the pointer to NULL, so idr_find() won't return
				5447	* a half-baked cgroup.
				5448	*/
				5449	cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
				5450	if (cgrp->id < 0) {
				5451	ret = -ENOMEM;
				5452	goto out_stat_exit;
				5453	}
				5454
				5455	init_cgroup_housekeeping(cgrp);
				5456
				5457	cgrp->self.parent = &parent->self;
				5458	cgrp->root = root;
				5459	cgrp->level = level;
				5460
				5461	ret = psi_cgroup_alloc(cgrp);
				5462	if (ret)
				5463	goto out_idr_free;
				5464
				5465	ret = cgroup_bpf_inherit(cgrp);
				5466	if (ret)
				5467	goto out_psi_free;
				5468
				5469	/*
				5470	* New cgroup inherits effective freeze counter, and
				5471	* if the parent has to be frozen, the child has too.
				5472	*/
				5473	cgrp->freezer.e_freeze = parent->freezer.e_freeze;
				5474	if (cgrp->freezer.e_freeze) {
				5475	/*
				5476	* Set the CGRP_FREEZE flag, so when a process will be
				5477	* attached to the child cgroup, it will become frozen.
				5478	* At this point the new cgroup is unpopulated, so we can
				5479	* consider it frozen immediately.
				5480	*/
				5481	set_bit(CGRP_FREEZE, &cgrp->flags);
				5482	set_bit(CGRP_FROZEN, &cgrp->flags);
				5483	}
				5484
				5485	spin_lock_irq(&css_set_lock);
				5486	for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
				5487	cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
				5488
				5489	if (tcgrp != cgrp) {
				5490	tcgrp->nr_descendants++;
				5491
				5492	/*
				5493	* If the new cgroup is frozen, all ancestor cgroups
				5494	* get a new frozen descendant, but their state can't
				5495	* change because of this.
				5496	*/
				5497	if (cgrp->freezer.e_freeze)
				5498	tcgrp->freezer.nr_frozen_descendants++;
				5499	}
				5500	}
				5501	spin_unlock_irq(&css_set_lock);
				5502
				5503	if (notify_on_release(parent))
				5504	set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
				5505
				5506	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
				5507	set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
				5508
				5509	cgrp->self.serial_nr = css_serial_nr_next++;
				5510
				5511	/* allocation complete, commit to creation */
				5512	list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
				5513	atomic_inc(&root->nr_cgrps);
				5514	cgroup_get_live(parent);
				5515
				5516	/*
				5517	* @cgrp is now fully operational. If something fails after this
				5518	* point, it'll be released via the normal destruction path.
				5519	*/
				5520	cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
				5521
				5522	/*
				5523	* On the default hierarchy, a child doesn't automatically inherit
				5524	* subtree_control from the parent. Each is configured manually.
				5525	*/
				5526	if (!cgroup_on_dfl(cgrp))
				5527	cgrp->subtree_control = cgroup_control(cgrp);
				5528
				5529	cgroup_propagate_control(cgrp);
				5530
				5531	return cgrp;
				5532
				5533	out_psi_free:
				5534	psi_cgroup_free(cgrp);
				5535	out_idr_free:
				5536	cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
				5537	out_stat_exit:
				5538	if (cgroup_on_dfl(parent))
				5539	cgroup_rstat_exit(cgrp);
				5540	out_cancel_ref:
				5541	percpu_ref_exit(&cgrp->self.refcnt);
				5542	out_free_cgrp:
				5543	kfree(cgrp);
				5544	return ERR_PTR(ret);
				5545	}
				5546
				5547	static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
				5548	{
				5549	struct cgroup *cgroup;
				5550	int ret = false;
				5551	int level = 0;
				5552
				5553	lockdep_assert_held(&cgroup_mutex);
				5554
				5555	for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
				5556	if (cgroup->nr_descendants >= cgroup->max_descendants)
				5557	goto fail;
				5558
				5559	if (level >= cgroup->max_depth)
				5560	goto fail;
				5561
				5562	level++;
				5563	}
				5564
				5565	ret = true;
				5566	fail:
				5567	return ret;
				5568	}
				5569
				5570	int cgroup_mkdir(struct kernfs_node parent_kn, const char name, umode_t mode)
				5571	{
				5572	struct cgroup parent, cgrp;
				5573	struct kernfs_node *kn;
				5574	int ret;
				5575
				5576	/* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
				5577	if (strchr(name, '\n'))
				5578	return -EINVAL;
				5579
				5580	parent = cgroup_kn_lock_live(parent_kn, false);
				5581	if (!parent)
				5582	return -ENODEV;
				5583
				5584	if (!cgroup_check_hierarchy_limits(parent)) {
				5585	ret = -EAGAIN;
				5586	goto out_unlock;
				5587	}
				5588
				5589	cgrp = cgroup_create(parent);
				5590	if (IS_ERR(cgrp)) {
				5591	ret = PTR_ERR(cgrp);
				5592	goto out_unlock;
				5593	}
				5594
				5595	/* create the directory */
				5596	kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
				5597	if (IS_ERR(kn)) {
				5598	ret = PTR_ERR(kn);
				5599	goto out_destroy;
				5600	}
				5601	cgrp->kn = kn;
				5602
				5603	/*
				5604	* This extra ref will be put in cgroup_free_fn() and guarantees
				5605	* that @cgrp->kn is always accessible.
				5606	*/
				5607	kernfs_get(kn);
				5608
				5609	ret = cgroup_kn_set_ugid(kn);
				5610	if (ret)
				5611	goto out_destroy;
				5612
				5613	ret = css_populate_dir(&cgrp->self);
				5614	if (ret)
				5615	goto out_destroy;
				5616
				5617	ret = cgroup_apply_control_enable(cgrp);
				5618	if (ret)
				5619	goto out_destroy;
				5620
				5621	TRACE_CGROUP_PATH(mkdir, cgrp);
				5622
				5623	/* let's create and online css's */
				5624	kernfs_activate(kn);
				5625
				5626	ret = 0;
				5627	goto out_unlock;
				5628
				5629	out_destroy:
				5630	cgroup_destroy_locked(cgrp);
				5631	out_unlock:
				5632	cgroup_kn_unlock(parent_kn);
				5633	return ret;
				5634	}
				5635
				5636	/*
				5637	* This is called when the refcnt of a css is confirmed to be killed.
				5638	* css_tryget_online() is now guaranteed to fail. Tell the subsystem to
				5639	* initate destruction and put the css ref from kill_css().
				5640	*/
				5641	static void css_killed_work_fn(struct work_struct *work)
				5642	{
				5643	struct cgroup_subsys_state *css =
				5644	container_of(work, struct cgroup_subsys_state, destroy_work);
				5645
				5646	mutex_lock(&cgroup_mutex);
				5647
				5648	do {
				5649	offline_css(css);
				5650	css_put(css);
				5651	/* @css can't go away while we're holding cgroup_mutex */
				5652	css = css->parent;
				5653	} while (css && atomic_dec_and_test(&css->online_cnt));
				5654
				5655	mutex_unlock(&cgroup_mutex);
				5656	}
				5657
				5658	/* css kill confirmation processing requires process context, bounce */
				5659	static void css_killed_ref_fn(struct percpu_ref *ref)
				5660	{
				5661	struct cgroup_subsys_state *css =
				5662	container_of(ref, struct cgroup_subsys_state, refcnt);
				5663
				5664	if (atomic_dec_and_test(&css->online_cnt)) {
				5665	INIT_WORK(&css->destroy_work, css_killed_work_fn);
				5666	queue_work(cgroup_destroy_wq, &css->destroy_work);
				5667	}
				5668	}
				5669
				5670	/**
				5671	* kill_css - destroy a css
				5672	* @css: css to destroy
				5673	*
				5674	* This function initiates destruction of @css by removing cgroup interface
				5675	* files and putting its base reference. ->css_offline() will be invoked
				5676	* asynchronously once css_tryget_online() is guaranteed to fail and when
				5677	* the reference count reaches zero, @css will be released.
				5678	*/
				5679	static void kill_css(struct cgroup_subsys_state *css)
				5680	{
				5681	lockdep_assert_held(&cgroup_mutex);
				5682
				5683	if (css->flags & CSS_DYING)
				5684	return;
				5685
				5686	css->flags \|= CSS_DYING;
				5687
				5688	/*
				5689	* This must happen before css is disassociated with its cgroup.
				5690	* See seq_css() for details.
				5691	*/
				5692	css_clear_dir(css);
				5693
				5694	/*
				5695	* Killing would put the base ref, but we need to keep it alive
				5696	* until after ->css_offline().
				5697	*/
				5698	css_get(css);
				5699
				5700	/*
				5701	* cgroup core guarantees that, by the time ->css_offline() is
				5702	* invoked, no new css reference will be given out via
				5703	* css_tryget_online(). We can't simply call percpu_ref_kill() and
				5704	* proceed to offlining css's because percpu_ref_kill() doesn't
				5705	* guarantee that the ref is seen as killed on all CPUs on return.
				5706	*
				5707	* Use percpu_ref_kill_and_confirm() to get notifications as each
				5708	* css is confirmed to be seen as killed on all CPUs.
				5709	*/
				5710	percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
				5711	}
				5712
				5713	/**
				5714	* cgroup_destroy_locked - the first stage of cgroup destruction
				5715	* @cgrp: cgroup to be destroyed
				5716	*
				5717	* css's make use of percpu refcnts whose killing latency shouldn't be
				5718	* exposed to userland and are RCU protected. Also, cgroup core needs to
				5719	* guarantee that css_tryget_online() won't succeed by the time
				5720	* ->css_offline() is invoked. To satisfy all the requirements,
				5721	* destruction is implemented in the following two steps.
				5722	*
				5723	* s1. Verify @cgrp can be destroyed and mark it dying. Remove all
				5724	* userland visible parts and start killing the percpu refcnts of
				5725	* css's. Set up so that the next stage will be kicked off once all
				5726	* the percpu refcnts are confirmed to be killed.
				5727	*
				5728	* s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
				5729	* rest of destruction. Once all cgroup references are gone, the
				5730	* cgroup is RCU-freed.
				5731	*
				5732	* This function implements s1. After this step, @cgrp is gone as far as
				5733	* the userland is concerned and a new cgroup with the same name may be
				5734	* created. As cgroup doesn't care about the names internally, this
				5735	* doesn't cause any problem.
				5736	*/
				5737	static int cgroup_destroy_locked(struct cgroup *cgrp)
				5738	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
				5739	{
				5740	struct cgroup tcgrp, parent = cgroup_parent(cgrp);
				5741	struct cgroup_subsys_state *css;
				5742	struct cgrp_cset_link *link;
				5743	int ssid;
				5744
				5745	lockdep_assert_held(&cgroup_mutex);
				5746
				5747	/*
				5748	* Only migration can raise populated from zero and we're already
				5749	* holding cgroup_mutex.
				5750	*/
				5751	if (cgroup_is_populated(cgrp))
				5752	return -EBUSY;
				5753
				5754	/*
				5755	* Make sure there's no live children. We can't test emptiness of
				5756	* ->self.children as dead children linger on it while being
				5757	* drained; otherwise, "rmdir parent/child parent" may fail.
				5758	*/
				5759	if (css_has_online_children(&cgrp->self))
				5760	return -EBUSY;
				5761
				5762	/*
				5763	* Mark @cgrp and the associated csets dead. The former prevents
				5764	* further task migration and child creation by disabling
				5765	* cgroup_lock_live_group(). The latter makes the csets ignored by
				5766	* the migration path.
				5767	*/
				5768	cgrp->self.flags &= ~CSS_ONLINE;
				5769
				5770	spin_lock_irq(&css_set_lock);
				5771	list_for_each_entry(link, &cgrp->cset_links, cset_link)
				5772	link->cset->dead = true;
				5773	spin_unlock_irq(&css_set_lock);
				5774
				5775	/* initiate massacre of all css's */
				5776	for_each_css(css, ssid, cgrp)
				5777	kill_css(css);
				5778
				5779	/* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */
				5780	css_clear_dir(&cgrp->self);
				5781	kernfs_remove(cgrp->kn);
				5782
				5783	if (parent && cgroup_is_threaded(cgrp))
				5784	parent->nr_threaded_children--;
				5785
				5786	spin_lock_irq(&css_set_lock);
				5787	for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
				5788	tcgrp->nr_descendants--;
				5789	tcgrp->nr_dying_descendants++;
				5790	/*
				5791	* If the dying cgroup is frozen, decrease frozen descendants
				5792	* counters of ancestor cgroups.
				5793	*/
				5794	if (test_bit(CGRP_FROZEN, &cgrp->flags))
				5795	tcgrp->freezer.nr_frozen_descendants--;
				5796	}
				5797	spin_unlock_irq(&css_set_lock);
				5798
				5799	cgroup1_check_for_release(parent);
				5800
				5801	cgroup_bpf_offline(cgrp);
				5802
				5803	/* put the base reference */
				5804	percpu_ref_kill(&cgrp->self.refcnt);
				5805
				5806	return 0;
				5807	};
				5808
				5809	int cgroup_rmdir(struct kernfs_node *kn)
				5810	{
				5811	struct cgroup *cgrp;
				5812	int ret = 0;
				5813
				5814	cgrp = cgroup_kn_lock_live(kn, false);
				5815	if (!cgrp)
				5816	return 0;
				5817
				5818	ret = cgroup_destroy_locked(cgrp);
				5819	if (!ret)
				5820	TRACE_CGROUP_PATH(rmdir, cgrp);
				5821
				5822	cgroup_kn_unlock(kn);
				5823	return ret;
				5824	}
				5825
				5826	static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
				5827	.show_options = cgroup_show_options,
				5828	.mkdir = cgroup_mkdir,
				5829	.rmdir = cgroup_rmdir,
				5830	.show_path = cgroup_show_path,
				5831	};
				5832
				5833	static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
				5834	{
				5835	struct cgroup_subsys_state *css;
				5836
				5837	pr_debug("Initializing cgroup subsys %s\n", ss->name);
				5838
				5839	mutex_lock(&cgroup_mutex);
				5840
				5841	idr_init(&ss->css_idr);
				5842	INIT_LIST_HEAD(&ss->cfts);
				5843
				5844	/* Create the root cgroup state for this subsystem */
				5845	ss->root = &cgrp_dfl_root;
				5846	css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
				5847	/* We don't handle early failures gracefully */
				5848	BUG_ON(IS_ERR(css));
				5849	init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
				5850
				5851	/*
				5852	* Root csses are never destroyed and we can't initialize
				5853	* percpu_ref during early init. Disable refcnting.
				5854	*/
				5855	css->flags \|= CSS_NO_REF;
				5856
				5857	if (early) {
				5858	/* allocation can't be done safely during early init */
				5859	css->id = 1;
				5860	} else {
				5861	css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
				5862	BUG_ON(css->id < 0);
				5863	}
				5864
				5865	/* Update the init_css_set to contain a subsys
				5866	* pointer to this state - since the subsystem is
				5867	* newly registered, all tasks and hence the
				5868	* init_css_set is in the subsystem's root cgroup. */
				5869	init_css_set.subsys[ss->id] = css;
				5870
				5871	have_fork_callback \|= (bool)ss->fork << ss->id;
				5872	have_exit_callback \|= (bool)ss->exit << ss->id;
				5873	have_release_callback \|= (bool)ss->release << ss->id;
				5874	have_canfork_callback \|= (bool)ss->can_fork << ss->id;
				5875
				5876	/* At system boot, before all subsystems have been
				5877	* registered, no tasks have been forked, so we don't
				5878	* need to invoke fork callbacks here. */
				5879	BUG_ON(!list_empty(&init_task.tasks));
				5880
				5881	BUG_ON(online_css(css));
				5882
				5883	mutex_unlock(&cgroup_mutex);
				5884	}
				5885
				5886	/**
				5887	* cgroup_init_early - cgroup initialization at system boot
				5888	*
				5889	* Initialize cgroups at system boot, and initialize any
				5890	* subsystems that request early init.
				5891	*/
				5892	int __init cgroup_init_early(void)
				5893	{
				5894	static struct cgroup_fs_context __initdata ctx;
				5895	struct cgroup_subsys *ss;
				5896	int i;
				5897
				5898	ctx.root = &cgrp_dfl_root;
				5899	init_cgroup_root(&ctx);
				5900	cgrp_dfl_root.cgrp.self.flags \|= CSS_NO_REF;
				5901
				5902	RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
				5903
				5904	for_each_subsys(ss, i) {
				5905	WARN(!ss->css_alloc \|\| !ss->css_free \|\| ss->name \|\| ss->id,
				5906	"invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
				5907	i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
				5908	ss->id, ss->name);
				5909	WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
				5910	"cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
				5911
				5912	ss->id = i;
				5913	ss->name = cgroup_subsys_name[i];
				5914	if (!ss->legacy_name)
				5915	ss->legacy_name = cgroup_subsys_name[i];
				5916
				5917	if (ss->early_init)
				5918	cgroup_init_subsys(ss, true);
				5919	}
				5920	return 0;
				5921	}
				5922
				5923	/**
				5924	* cgroup_init - cgroup initialization
				5925	*
				5926	* Register cgroup filesystem and /proc file, and initialize
				5927	* any subsystems that didn't request early init.
				5928	*/
				5929	int __init cgroup_init(void)
				5930	{
				5931	struct cgroup_subsys *ss;
				5932	int ssid;
				5933
				5934	BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
				5935	BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
				5936	BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
				5937
				5938	cgroup_rstat_boot();
				5939
				5940	/*
				5941	* The latency of the synchronize_rcu() is too high for cgroups,
				5942	* avoid it at the cost of forcing all readers into the slow path.
				5943	*/
				5944	rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
				5945
				5946	get_user_ns(init_cgroup_ns.user_ns);
				5947
				5948	mutex_lock(&cgroup_mutex);
				5949
				5950	/*
				5951	* Add init_css_set to the hash table so that dfl_root can link to
				5952	* it during init.
				5953	*/
				5954	hash_add(css_set_table, &init_css_set.hlist,
				5955	css_set_hash(init_css_set.subsys));
				5956
				5957	BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
				5958
				5959	mutex_unlock(&cgroup_mutex);
				5960
				5961	for_each_subsys(ss, ssid) {
				5962	if (ss->early_init) {
				5963	struct cgroup_subsys_state *css =
				5964	init_css_set.subsys[ss->id];
				5965
				5966	css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
				5967	GFP_KERNEL);
				5968	BUG_ON(css->id < 0);
				5969	} else {
				5970	cgroup_init_subsys(ss, false);
				5971	}
				5972
				5973	list_add_tail(&init_css_set.e_cset_node[ssid],
				5974	&cgrp_dfl_root.cgrp.e_csets[ssid]);
				5975
				5976	/*
				5977	* Setting dfl_root subsys_mask needs to consider the
				5978	* disabled flag and cftype registration needs kmalloc,
				5979	* both of which aren't available during early_init.
				5980	*/
				5981	if (!cgroup_ssid_enabled(ssid))
				5982	continue;
				5983
				5984	if (cgroup1_ssid_disabled(ssid))
				5985	printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
				5986	ss->name);
				5987
				5988	cgrp_dfl_root.subsys_mask \|= 1 << ss->id;
				5989
				5990	/* implicit controllers must be threaded too */
				5991	WARN_ON(ss->implicit_on_dfl && !ss->threaded);
				5992
				5993	if (ss->implicit_on_dfl)
				5994	cgrp_dfl_implicit_ss_mask \|= 1 << ss->id;
				5995	else if (!ss->dfl_cftypes)
				5996	cgrp_dfl_inhibit_ss_mask \|= 1 << ss->id;
				5997
				5998	if (ss->threaded)
				5999	cgrp_dfl_threaded_ss_mask \|= 1 << ss->id;
				6000
				6001	if (ss->dfl_cftypes == ss->legacy_cftypes) {
				6002	WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
				6003	} else {
				6004	WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
				6005	WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
				6006	}
				6007
				6008	if (ss->bind)
				6009	ss->bind(init_css_set.subsys[ssid]);
				6010
				6011	mutex_lock(&cgroup_mutex);
				6012	css_populate_dir(init_css_set.subsys[ssid]);
				6013	mutex_unlock(&cgroup_mutex);
				6014	}
				6015
				6016	/* init_css_set.subsys[] has been updated, re-hash */
				6017	hash_del(&init_css_set.hlist);
				6018	hash_add(css_set_table, &init_css_set.hlist,
				6019	css_set_hash(init_css_set.subsys));
				6020
				6021	WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
				6022	WARN_ON(register_filesystem(&cgroup_fs_type));
				6023	WARN_ON(register_filesystem(&cgroup2_fs_type));
				6024	WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
				6025	#ifdef CONFIG_CPUSETS
				6026	WARN_ON(register_filesystem(&cpuset_fs_type));
				6027	#endif
				6028
				6029	return 0;
				6030	}
				6031
				6032	static int __init cgroup_wq_init(void)
				6033	{
				6034	/*
				6035	* There isn't much point in executing destruction path in
				6036	* parallel. Good chunk is serialized with cgroup_mutex anyway.
				6037	* Use 1 for @max_active.
				6038	*
				6039	* We would prefer to do this in cgroup_init() above, but that
				6040	* is called before init_workqueues(): so leave this until after.
				6041	*/
				6042	cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
				6043	BUG_ON(!cgroup_destroy_wq);
				6044	return 0;
				6045	}
				6046	core_initcall(cgroup_wq_init);
				6047
				6048	void cgroup_path_from_kernfs_id(const union kernfs_node_id *id,
				6049	char *buf, size_t buflen)
				6050	{
				6051	struct kernfs_node *kn;
				6052
				6053	kn = kernfs_get_node_by_id(cgrp_dfl_root.kf_root, id);
				6054	if (!kn)
				6055	return;
				6056	kernfs_path(kn, buf, buflen);
				6057	kernfs_put(kn);
				6058	}
				6059
				6060	/*
				6061	* proc_cgroup_show()
				6062	* - Print task's cgroup paths into seq_file, one line for each hierarchy
				6063	* - Used for /proc/<pid>/cgroup.
				6064	*/
				6065	int proc_cgroup_show(struct seq_file m, struct pid_namespace ns,
				6066	struct pid pid, struct task_struct tsk)
				6067	{
				6068	char *buf;
				6069	int retval;
				6070	struct cgroup_root *root;
				6071
				6072	retval = -ENOMEM;
				6073	buf = kmalloc(PATH_MAX, GFP_KERNEL);
				6074	if (!buf)
				6075	goto out;
				6076
				6077	mutex_lock(&cgroup_mutex);
				6078	spin_lock_irq(&css_set_lock);
				6079
				6080	for_each_root(root) {
				6081	struct cgroup_subsys *ss;
				6082	struct cgroup *cgrp;
				6083	int ssid, count = 0;
				6084
				6085	if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
				6086	continue;
				6087
				6088	seq_printf(m, "%d:", root->hierarchy_id);
				6089	if (root != &cgrp_dfl_root)
				6090	for_each_subsys(ss, ssid)
				6091	if (root->subsys_mask & (1 << ssid))
				6092	seq_printf(m, "%s%s", count++ ? "," : "",
				6093	ss->legacy_name);
				6094	if (strlen(root->name))
				6095	seq_printf(m, "%sname=%s", count ? "," : "",
				6096	root->name);
				6097	seq_putc(m, ':');
				6098
				6099	cgrp = task_cgroup_from_root(tsk, root);
				6100
				6101	/*
				6102	* On traditional hierarchies, all zombie tasks show up as
				6103	* belonging to the root cgroup. On the default hierarchy,
				6104	* while a zombie doesn't show up in "cgroup.procs" and
				6105	* thus can't be migrated, its /proc/PID/cgroup keeps
				6106	* reporting the cgroup it belonged to before exiting. If
				6107	* the cgroup is removed before the zombie is reaped,
				6108	* " (deleted)" is appended to the cgroup path.
				6109	*/
				6110	if (cgroup_on_dfl(cgrp) \|\| !(tsk->flags & PF_EXITING)) {
				6111	retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
				6112	current->nsproxy->cgroup_ns);
				6113	if (retval >= PATH_MAX)
				6114	retval = -ENAMETOOLONG;
				6115	if (retval < 0)
				6116	goto out_unlock;
				6117
				6118	seq_puts(m, buf);
				6119	} else {
				6120	seq_puts(m, "/");
				6121	}
				6122
				6123	if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
				6124	seq_puts(m, " (deleted)\n");
				6125	else
				6126	seq_putc(m, '\n');
				6127	}
				6128
				6129	retval = 0;
				6130	out_unlock:
				6131	spin_unlock_irq(&css_set_lock);
				6132	mutex_unlock(&cgroup_mutex);
				6133	kfree(buf);
				6134	out:
				6135	return retval;
				6136	}
				6137
				6138	/**
				6139	* cgroup_fork - initialize cgroup related fields during copy_process()
				6140	* @child: pointer to task_struct of forking parent process.
				6141	*
				6142	* A task is associated with the init_css_set until cgroup_post_fork()
				6143	* attaches it to the parent's css_set. Empty cg_list indicates that
				6144	* @child isn't holding reference to its css_set.
				6145	*/
				6146	void cgroup_fork(struct task_struct *child)
				6147	{
				6148	RCU_INIT_POINTER(child->cgroups, &init_css_set);
				6149	INIT_LIST_HEAD(&child->cg_list);
				6150	}
				6151
				6152	/**
				6153	* cgroup_can_fork - called on a new task before the process is exposed
				6154	* @child: the task in question.
				6155	*
				6156	* This calls the subsystem can_fork() callbacks. If the can_fork() callback
				6157	* returns an error, the fork aborts with that error code. This allows for
				6158	* a cgroup subsystem to conditionally allow or deny new forks.
				6159	*/
				6160	int cgroup_can_fork(struct task_struct *child)
				6161	{
				6162	struct cgroup_subsys *ss;
				6163	int i, j, ret;
				6164
				6165	do_each_subsys_mask(ss, i, have_canfork_callback) {
				6166	ret = ss->can_fork(child);
				6167	if (ret)
				6168	goto out_revert;
				6169	} while_each_subsys_mask();
				6170
				6171	return 0;
				6172
				6173	out_revert:
				6174	for_each_subsys(ss, j) {
				6175	if (j >= i)
				6176	break;
				6177	if (ss->cancel_fork)
				6178	ss->cancel_fork(child);
				6179	}
				6180
				6181	return ret;
				6182	}
				6183
				6184	/**
				6185	* cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
				6186	* @child: the task in question
				6187	*
				6188	* This calls the cancel_fork() callbacks if a fork failed after
				6189	* cgroup_can_fork() succeded.
				6190	*/
				6191	void cgroup_cancel_fork(struct task_struct *child)
				6192	{
				6193	struct cgroup_subsys *ss;
				6194	int i;
				6195
				6196	for_each_subsys(ss, i)
				6197	if (ss->cancel_fork)
				6198	ss->cancel_fork(child);
				6199	}
				6200
				6201	/**
				6202	* cgroup_post_fork - called on a new task after adding it to the task list
				6203	* @child: the task in question
				6204	*
				6205	* Adds the task to the list running through its css_set if necessary and
				6206	* call the subsystem fork() callbacks. Has to be after the task is
				6207	* visible on the task list in case we race with the first call to
				6208	* cgroup_task_iter_start() - to guarantee that the new task ends up on its
				6209	* list.
				6210	*/
				6211	void cgroup_post_fork(struct task_struct *child)
				6212	{
				6213	struct cgroup_subsys *ss;
				6214	int i;
				6215
				6216	/*
				6217	* This may race against cgroup_enable_task_cg_lists(). As that
				6218	* function sets use_task_css_set_links before grabbing
				6219	* tasklist_lock and we just went through tasklist_lock to add
				6220	* @child, it's guaranteed that either we see the set
				6221	* use_task_css_set_links or cgroup_enable_task_cg_lists() sees
				6222	* @child during its iteration.
				6223	*
				6224	* If we won the race, @child is associated with %current's
				6225	* css_set. Grabbing css_set_lock guarantees both that the
				6226	* association is stable, and, on completion of the parent's
				6227	* migration, @child is visible in the source of migration or
				6228	* already in the destination cgroup. This guarantee is necessary
				6229	* when implementing operations which need to migrate all tasks of
				6230	* a cgroup to another.
				6231	*
				6232	* Note that if we lose to cgroup_enable_task_cg_lists(), @child
				6233	* will remain in init_css_set. This is safe because all tasks are
				6234	* in the init_css_set before cg_links is enabled and there's no
				6235	* operation which transfers all tasks out of init_css_set.
				6236	*/
				6237	if (use_task_css_set_links) {
				6238	struct css_set *cset;
				6239
				6240	spin_lock_irq(&css_set_lock);
				6241	cset = task_css_set(current);
				6242	if (list_empty(&child->cg_list)) {
				6243	get_css_set(cset);
				6244	cset->nr_tasks++;
				6245	css_set_move_task(child, NULL, cset, false);
				6246	}
				6247
				6248	/*
				6249	* If the cgroup has to be frozen, the new task has too.
				6250	* Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get
				6251	* the task into the frozen state.
				6252	*/
				6253	if (unlikely(cgroup_task_freeze(child))) {
				6254	spin_lock(&child->sighand->siglock);
				6255	WARN_ON_ONCE(child->frozen);
				6256	child->jobctl \|= JOBCTL_TRAP_FREEZE;
				6257	spin_unlock(&child->sighand->siglock);
				6258
				6259	/*
				6260	* Calling cgroup_update_frozen() isn't required here,
				6261	* because it will be called anyway a bit later
				6262	* from do_freezer_trap(). So we avoid cgroup's
				6263	* transient switch from the frozen state and back.
				6264	*/
				6265	}
				6266
				6267	spin_unlock_irq(&css_set_lock);
				6268	}
				6269
				6270	/*
				6271	* Call ss->fork(). This must happen after @child is linked on
				6272	* css_set; otherwise, @child might change state between ->fork()
				6273	* and addition to css_set.
				6274	*/
				6275	do_each_subsys_mask(ss, i, have_fork_callback) {
				6276	ss->fork(child);
				6277	} while_each_subsys_mask();
				6278	}
				6279
				6280	/**
				6281	* cgroup_exit - detach cgroup from exiting task
				6282	* @tsk: pointer to task_struct of exiting process
				6283	*
				6284	* Description: Detach cgroup from @tsk and release it.
				6285	*
				6286	* Note that cgroups marked notify_on_release force every task in
				6287	* them to take the global cgroup_mutex mutex when exiting.
				6288	* This could impact scaling on very large systems. Be reluctant to
				6289	* use notify_on_release cgroups where very high task exit scaling
				6290	* is required on large systems.
				6291	*
				6292	* We set the exiting tasks cgroup to the root cgroup (top_cgroup). We
				6293	* call cgroup_exit() while the task is still competent to handle
				6294	* notify_on_release(), then leave the task attached to the root cgroup in
				6295	* each hierarchy for the remainder of its exit. No need to bother with
				6296	* init_css_set refcnting. init_css_set never goes away and we can't race
				6297	* with migration path - PF_EXITING is visible to migration path.
				6298	*/
				6299	void cgroup_exit(struct task_struct *tsk)
				6300	{
				6301	struct cgroup_subsys *ss;
				6302	struct css_set *cset;
				6303	int i;
				6304
				6305	/*
				6306	* Unlink from @tsk from its css_set. As migration path can't race
				6307	* with us, we can check css_set and cg_list without synchronization.
				6308	*/
				6309	cset = task_css_set(tsk);
				6310
				6311	if (!list_empty(&tsk->cg_list)) {
				6312	spin_lock_irq(&css_set_lock);
				6313	css_set_move_task(tsk, cset, NULL, false);
				6314	list_add_tail(&tsk->cg_list, &cset->dying_tasks);
				6315	cset->nr_tasks--;
				6316
				6317	WARN_ON_ONCE(cgroup_task_frozen(tsk));
				6318	if (unlikely(cgroup_task_freeze(tsk)))
				6319	cgroup_update_frozen(task_dfl_cgroup(tsk));
				6320
				6321	spin_unlock_irq(&css_set_lock);
				6322	} else {
				6323	get_css_set(cset);
				6324	}
				6325
				6326	/* see cgroup_post_fork() for details */
				6327	do_each_subsys_mask(ss, i, have_exit_callback) {
				6328	ss->exit(tsk);
				6329	} while_each_subsys_mask();
				6330	}
				6331
				6332	void cgroup_release(struct task_struct *task)
				6333	{
				6334	struct cgroup_subsys *ss;
				6335	int ssid;
				6336
				6337	do_each_subsys_mask(ss, ssid, have_release_callback) {
				6338	ss->release(task);
				6339	} while_each_subsys_mask();
				6340
				6341	if (use_task_css_set_links) {
				6342	spin_lock_irq(&css_set_lock);
				6343	css_set_skip_task_iters(task_css_set(task), task);
				6344	list_del_init(&task->cg_list);
				6345	spin_unlock_irq(&css_set_lock);
				6346	}
				6347	}
				6348
				/* Drop the css_set reference associated with @task. */
				void cgroup_free(struct task_struct *task)
				{
					put_css_set(task_css_set(task));
				}
				6354
				6355	static int __init cgroup_disable(char *str)
				6356	{
				6357	struct cgroup_subsys *ss;
				6358	char *token;
				6359	int i;
				6360
				6361	while ((token = strsep(&str, ",")) != NULL) {
				6362	if (!*token)
				6363	continue;
				6364
				6365	for_each_subsys(ss, i) {
				6366	if (strcmp(token, ss->name) &&
				6367	strcmp(token, ss->legacy_name))
				6368	continue;
				6369
				6370	static_branch_disable(cgroup_subsys_enabled_key[i]);
				6371	pr_info("Disabling %s control group subsystem\n",
				6372	ss->name);
				6373	}
				6374
				6375	for (i = 0; i < OPT_FEATURE_COUNT; i++) {
				6376	if (strcmp(token, cgroup_opt_feature_names[i]))
				6377	continue;
				6378	cgroup_feature_disable_mask \|= 1 << i;
				6379	pr_info("Disabling %s control group feature\n",
				6380	cgroup_opt_feature_names[i]);
				6381	break;
				6382	}
				6383	}
				6384	return 1;
				6385	}
				6386	__setup("cgroup_disable=", cgroup_disable);
				6387
				6388	void __init __weak enable_debug_cgroup(void) { }
				6389
				6390	static int __init enable_cgroup_debug(char *str)
				6391	{
				6392	cgroup_debug = true;
				6393	enable_debug_cgroup();
				6394	return 1;
				6395	}
				6396	__setup("cgroup_debug", enable_cgroup_debug);
				6397
				6398	/**
				6399	* css_tryget_online_from_dir - get corresponding css from a cgroup dentry
				6400	* @dentry: directory dentry of interest
				6401	* @ss: subsystem of interest
				6402	*
				6403	* If @dentry is a directory for a cgroup which has @ss enabled on it, try
				6404	* to get the corresponding css and return it. If such css doesn't exist
				6405	* or can't be pinned, an ERR_PTR value is returned.
				6406	*/
				6407	struct cgroup_subsys_state css_tryget_online_from_dir(struct dentry dentry,
				6408	struct cgroup_subsys *ss)
				6409	{
				6410	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
				6411	struct file_system_type *s_type = dentry->d_sb->s_type;
				6412	struct cgroup_subsys_state *css = NULL;
				6413	struct cgroup *cgrp;
				6414
				6415	/* is @dentry a cgroup dir? */
				6416	if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) \|\|
				6417	!kn \|\| kernfs_type(kn) != KERNFS_DIR)
				6418	return ERR_PTR(-EBADF);
				6419
				6420	rcu_read_lock();
				6421
				6422	/*
				6423	* This path doesn't originate from kernfs and @kn could already
				6424	* have been or be removed at any point. @kn->priv is RCU
				6425	* protected for this access. See css_release_work_fn() for details.
				6426	*/
				6427	cgrp = rcu_dereference((void __rcu __force *)&kn->priv);
				6428	if (cgrp)
				6429	css = cgroup_css(cgrp, ss);
				6430
				6431	if (!css \|\| !css_tryget_online(css))
				6432	css = ERR_PTR(-ENOENT);
				6433
				6434	rcu_read_unlock();
				6435	return css;
				6436	}
				6437
				6438	/**
				6439	* css_from_id - lookup css by id
				6440	* @id: the cgroup id
				6441	* @ss: cgroup subsys to be looked into
				6442	*
				6443	* Returns the css if there's valid one with @id, otherwise returns NULL.
				6444	* Should be called under rcu_read_lock().
				6445	*/
				6446	struct cgroup_subsys_state css_from_id(int id, struct cgroup_subsys ss)
				6447	{
				6448	WARN_ON_ONCE(!rcu_read_lock_held());
				6449	return idr_find(&ss->css_idr, id);
				6450	}
				6451
				6452	/**
				6453	* cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
				6454	* @path: path on the default hierarchy
				6455	*
				6456	* Find the cgroup at @path on the default hierarchy, increment its
				6457	* reference count and return it. Returns pointer to the found cgroup on
				6458	* success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR)
				6459	* if @path points to a non-directory.
				6460	*/
				6461	struct cgroup cgroup_get_from_path(const char path)
				6462	{
				6463	struct kernfs_node *kn;
				6464	struct cgroup *cgrp;
				6465
				6466	mutex_lock(&cgroup_mutex);
				6467
				6468	kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
				6469	if (kn) {
				6470	if (kernfs_type(kn) == KERNFS_DIR) {
				6471	cgrp = kn->priv;
				6472	cgroup_get_live(cgrp);
				6473	} else {
				6474	cgrp = ERR_PTR(-ENOTDIR);
				6475	}
				6476	kernfs_put(kn);
				6477	} else {
				6478	cgrp = ERR_PTR(-ENOENT);
				6479	}
				6480
				6481	mutex_unlock(&cgroup_mutex);
				6482	return cgrp;
				6483	}
				6484	EXPORT_SYMBOL_GPL(cgroup_get_from_path);
				6485
				6486	/**
				6487	* cgroup_get_from_fd - get a cgroup pointer from a fd
				6488	* @fd: fd obtained by open(cgroup2_dir)
				6489	*
				6490	* Find the cgroup from a fd which should be obtained
				6491	* by opening a cgroup directory. Returns a pointer to the
				6492	* cgroup on success. ERR_PTR is returned if the cgroup
				6493	* cannot be found.
				6494	*/
				6495	struct cgroup *cgroup_get_from_fd(int fd)
				6496	{
				6497	struct cgroup_subsys_state *css;
				6498	struct cgroup *cgrp;
				6499	struct file *f;
				6500
				6501	f = fget_raw(fd);
				6502	if (!f)
				6503	return ERR_PTR(-EBADF);
				6504
				6505	css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
				6506	fput(f);
				6507	if (IS_ERR(css))
				6508	return ERR_CAST(css);
				6509
				6510	cgrp = css->cgroup;
				6511	if (!cgroup_on_dfl(cgrp)) {
				6512	cgroup_put(cgrp);
				6513	return ERR_PTR(-EBADF);
				6514	}
				6515
				6516	return cgrp;
				6517	}
				6518	EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
				6519
				/*
				 * Return 10 raised to @power as a u64.
				 *
				 * A @power of zero or less yields 1; the loop is bounded at zero so a
				 * negative argument cannot spin until the counter overflows.  The result
				 * wraps modulo 2^64 once @power exceeds 19.
				 */
				static u64 power_of_ten(int power)
				{
					u64 v = 1;
					int i;

					for (i = 0; i < power; i++)
						v *= 10;
					return v;
				}
				6527
				6528	/**
				6529	* cgroup_parse_float - parse a floating number
				6530	* @input: input string
				6531	* @dec_shift: number of decimal digits to shift
				6532	* @v: output
				6533	*
				6534	* Parse a decimal floating point number in @input and store the result in
				6535	* @v with decimal point right shifted @dec_shift times. For example, if
				6536	* @input is "12.3456" and @dec_shift is 3, *@v will be set to 12345.
				6537	* Returns 0 on success, -errno otherwise.
				6538	*
				6539	* There's nothing cgroup specific about this function except that it's
				6540	* currently the only user.
				6541	*/
				6542	int cgroup_parse_float(const char input, unsigned dec_shift, s64 v)
				6543	{
				6544	s64 whole, frac = 0;
				6545	int fstart = 0, fend = 0, flen;
				6546
				6547	if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend))
				6548	return -EINVAL;
				6549	if (frac < 0)
				6550	return -EINVAL;
				6551
				6552	flen = fend > fstart ? fend - fstart : 0;
				6553	if (flen < dec_shift)
				6554	frac *= power_of_ten(dec_shift - flen);
				6555	else
				6556	frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift));
				6557
				6558	v = whole power_of_ten(dec_shift) + frac;
				6559	return 0;
				6560	}
				6561
				6562	/*
				6563	* sock->sk_cgrp_data handling. For more info, see sock_cgroup_data
				6564	* definition in cgroup-defs.h.
				6565	*/
				6566	#ifdef CONFIG_SOCK_CGROUP_DATA
				6567
				#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)

				DEFINE_SPINLOCK(cgroup_sk_update_lock);
				static bool cgroup_sk_alloc_disabled __read_mostly;

				/*
				 * Set cgroup_sk_alloc_disabled, logging the switch the first time only.
				 * The flag is never cleared here; once set, cgroup_sk_alloc() marks
				 * sockets with no_refcnt instead of attaching a cgroup.
				 */
				void cgroup_sk_alloc_disable(void)
				{
					if (cgroup_sk_alloc_disabled)
						return;
					pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
					cgroup_sk_alloc_disabled = true;
				}

				#else

				/* Neither net_prio nor net_cls is built in: the flag is constant false. */
				#define cgroup_sk_alloc_disabled	false

				#endif
				6586
				6587	void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
				6588	{
				6589	if (cgroup_sk_alloc_disabled) {
				6590	skcd->no_refcnt = 1;
				6591	return;
				6592	}
				6593
				6594	/* Don't associate the sock with unrelated interrupted task's cgroup. */
				6595	if (in_interrupt())
				6596	return;
				6597
				6598	rcu_read_lock();
				6599
				6600	while (true) {
				6601	struct css_set *cset;
				6602
				6603	cset = task_css_set(current);
				6604	if (likely(cgroup_tryget(cset->dfl_cgrp))) {
				6605	skcd->val = (unsigned long)cset->dfl_cgrp;
				6606	cgroup_bpf_get(cset->dfl_cgrp);
				6607	break;
				6608	}
				6609	cpu_relax();
				6610	}
				6611
				6612	rcu_read_unlock();
				6613	}
				6614
				6615	void cgroup_sk_clone(struct sock_cgroup_data *skcd)
				6616	{
				6617	if (skcd->val) {
				6618	if (skcd->no_refcnt)
				6619	return;
				6620	/*
				6621	* We might be cloning a socket which is left in an empty
				6622	* cgroup and the cgroup might have already been rmdir'd.
				6623	* Don't use cgroup_get_live().
				6624	*/
				6625	cgroup_get(sock_cgroup_ptr(skcd));
				6626	cgroup_bpf_get(sock_cgroup_ptr(skcd));
				6627	}
				6628	}
				6629
				6630	void cgroup_sk_free(struct sock_cgroup_data *skcd)
				6631	{
				6632	struct cgroup *cgrp = sock_cgroup_ptr(skcd);
				6633
				6634	if (skcd->no_refcnt)
				6635	return;
				6636	cgroup_bpf_put(cgrp);
				6637	cgroup_put(cgrp);
				6638	}
				6639
				6640	#endif /* CONFIG_SOCK_CGROUP_DATA */
				6641
				6642	#ifdef CONFIG_CGROUP_BPF
				6643	int cgroup_bpf_attach(struct cgroup cgrp, struct bpf_prog prog,
				6644	enum bpf_attach_type type, u32 flags)
				6645	{
				6646	int ret;
				6647
				6648	mutex_lock(&cgroup_mutex);
				6649	ret = __cgroup_bpf_attach(cgrp, prog, type, flags);
				6650	mutex_unlock(&cgroup_mutex);
				6651	return ret;
				6652	}
				6653	int cgroup_bpf_detach(struct cgroup cgrp, struct bpf_prog prog,
				6654	enum bpf_attach_type type, u32 flags)
				6655	{
				6656	int ret;
				6657
				6658	mutex_lock(&cgroup_mutex);
				6659	ret = __cgroup_bpf_detach(cgrp, prog, type);
				6660	mutex_unlock(&cgroup_mutex);
				6661	return ret;
				6662	}
				6663	int cgroup_bpf_query(struct cgroup cgrp, const union bpf_attr attr,
				6664	union bpf_attr __user *uattr)
				6665	{
				6666	int ret;
				6667
				6668	mutex_lock(&cgroup_mutex);
				6669	ret = __cgroup_bpf_query(cgrp, attr, uattr);
				6670	mutex_unlock(&cgroup_mutex);
				6671	return ret;
				6672	}
				6673	#endif /* CONFIG_CGROUP_BPF */
				6674
				6675	#ifdef CONFIG_SYSFS
				6676	static ssize_t show_delegatable_files(struct cftype files, char buf,
				6677	ssize_t size, const char *prefix)
				6678	{
				6679	struct cftype *cft;
				6680	ssize_t ret = 0;
				6681
				6682	for (cft = files; cft && cft->name[0] != '\0'; cft++) {
				6683	if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
				6684	continue;
				6685
				6686	if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
				6687	continue;
				6688
				6689	if (prefix)
				6690	ret += snprintf(buf + ret, size - ret, "%s.", prefix);
				6691
				6692	ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
				6693
				6694	if (WARN_ON(ret >= size))
				6695	break;
				6696	}
				6697
				6698	return ret;
				6699	}
				6700
				6701	static ssize_t delegate_show(struct kobject kobj, struct kobj_attribute attr,
				6702	char *buf)
				6703	{
				6704	struct cgroup_subsys *ss;
				6705	int ssid;
				6706	ssize_t ret = 0;
				6707
				6708	ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
				6709	NULL);
				6710
				6711	for_each_subsys(ss, ssid)
				6712	ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
				6713	PAGE_SIZE - ret,
				6714	cgroup_subsys_name[ssid]);
				6715
				6716	return ret;
				6717	}
				6718	static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
				6719
				6720	static ssize_t features_show(struct kobject kobj, struct kobj_attribute attr,
				6721	char *buf)
				6722	{
				6723	return snprintf(buf, PAGE_SIZE, "nsdelegate\nmemory_localevents\n");
				6724	}
				6725	static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
				6726
				6727	static struct attribute *cgroup_sysfs_attrs[] = {
				6728	&cgroup_delegate_attr.attr,
				6729	&cgroup_features_attr.attr,
				6730	NULL,
				6731	};
				6732
				6733	static const struct attribute_group cgroup_sysfs_attr_group = {
				6734	.attrs = cgroup_sysfs_attrs,
				6735	.name = "cgroup",
				6736	};
				6737
				6738	static int __init cgroup_sysfs_init(void)
				6739	{
				6740	return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
				6741	}
				6742	subsys_initcall(cgroup_sysfs_init);
				6743
				6744	#endif /* CONFIG_SYSFS */