/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
 * Native page reclaim
 * Charge lifetime sanitation
 * Lockless page tracking & accounting
 * Unified hierarchy configuration model
 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"

#include <linux/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

struct mem_cgroup *root_mem_cgroup __read_mostly;

#define MEM_CGROUP_RECLAIM_RETRIES	5

/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;

/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem;

/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
int do_swap_account __read_mostly;
#else
#define do_swap_account		0
#endif

/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
}

static const char *const mem_cgroup_lru_names[] = {
	"inactive_anon",
	"active_anon",
	"inactive_file",
	"active_file",
	"unevictable",
};

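/*
 * Rate limiting targets, in units of per-cpu page events: the corresponding
 * handling in memcg_check_events() only runs once the per-cpu event counter
 * has advanced by this much since the last check.
 */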
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET	1024

/*
 * Cgroups above their limits are maintained in an RB-tree, independent of
 * their hierarchy representation.
 */

struct mem_cgroup_tree_per_node {
	struct rb_root rb_root;
	struct rb_node *rb_rightmost;
	spinlock_t lock;
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these is stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add a new userspace
	 * waiter for changes related to this event.  Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removal.  This callback must be set
	 * if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below are needed to unregister the event when
	 * userspace closes the eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_entry_t wait;
	struct work_struct remove;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/* Stuff for moving charges at task migration. */
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON	0x1U
#define MOVE_FILE	0x2U
#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t	  lock; /* for from, to */
	struct mm_struct  *mm;
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_ANON,
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
	_TCP,
};

#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
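/*
 * The resource type is packed into the upper 16 bits of cft->private and the
 * attribute (e.g. OOM_CONTROL below) into the lower 16 bits; MEMFILE_TYPE()
 * and MEMFILE_ATTR() unpack them again in the file handlers.
 */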
/* Used for OOM notifier */
#define OOM_CONTROL		(0)

/*
 * Iteration constructs for visiting all cgroups (under a tree).  If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))

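/*
 * Tasks that are already dying (OOM victims, fatally signalled or exiting)
 * should not be blocked by reclaim or the OOM killer; let their charges
 * bypass the limits so they can exit and free memory quickly.
 */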
static inline bool should_force_charge(void)
{
	return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
		(current->flags & PF_EXITING);
}

/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	if (!memcg)
		memcg = root_mem_cgroup;
	return &memcg->vmpressure;
}

struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
{
	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
}

#ifdef CONFIG_MEMCG_KMEM
/*
 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
 * The main reason for not using the cgroup id for this:
 *  this works better in sparse environments, where we have a lot of memcgs,
 *  but only a few are kmem-limited. If we had, for instance, 200 memcgs and
 *  none but the 200th were kmem-limited, we would still need a 200 entry
 *  array for that.
 *
 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 * will double each time we have to increase it.
 */
static DEFINE_IDA(memcg_cache_ida);
int memcg_nr_cache_ids;

/* Protects memcg_nr_cache_ids */
static DECLARE_RWSEM(memcg_cache_ids_sem);

void memcg_get_cache_ids(void)
{
	down_read(&memcg_cache_ids_sem);
}

void memcg_put_cache_ids(void)
{
	up_read(&memcg_cache_ids_sem);
}

/*
 * MIN_SIZE is different from 1, because we would like to avoid going through
 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 * cgroups is a reasonable guess. In the future, it could be a parameter or
 * tunable, but that is strictly not necessary.
 *
 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 * this constant directly from cgroup, but it is understandable that this is
 * better kept as an internal representation in cgroup.c. In any case, the
 * cgrp_id space is not getting any smaller, and we don't have to necessarily
 * increase ours as well if it increases.
 */
#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 * conditional to this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well.
 */
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);

struct workqueue_struct *memcg_kmem_cache_wq;

static int memcg_shrinker_map_size;
static DEFINE_MUTEX(memcg_shrinker_map_mutex);

static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
{
	kvfree(container_of(head, struct memcg_shrinker_map, rcu));
}

static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
					 int size, int old_size)
{
	struct memcg_shrinker_map *new, *old;
	int nid;

	lockdep_assert_held(&memcg_shrinker_map_mutex);

	for_each_node(nid) {
		old = rcu_dereference_protected(
			mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
		/* Not yet online memcg */
		if (!old)
			return 0;

		new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
		if (!new)
			return -ENOMEM;

		/* Set all old bits, clear all new bits */
		memset(new->map, (int)0xff, old_size);
		memset((void *)new->map + old_size, 0, size - old_size);

		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
		call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
	}

	return 0;
}

static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_node *pn;
	struct memcg_shrinker_map *map;
	int nid;

	if (mem_cgroup_is_root(memcg))
		return;

	for_each_node(nid) {
		pn = mem_cgroup_nodeinfo(memcg, nid);
		map = rcu_dereference_protected(pn->shrinker_map, true);
		if (map)
			kvfree(map);
		rcu_assign_pointer(pn->shrinker_map, NULL);
	}
}

static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
{
	struct memcg_shrinker_map *map;
	int nid, size, ret = 0;

	if (mem_cgroup_is_root(memcg))
		return 0;

	mutex_lock(&memcg_shrinker_map_mutex);
	size = memcg_shrinker_map_size;
	for_each_node(nid) {
		map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
		if (!map) {
			memcg_free_shrinker_maps(memcg);
			ret = -ENOMEM;
			break;
		}
		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
	}
	mutex_unlock(&memcg_shrinker_map_mutex);

	return ret;
}

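/*
 * Make sure every memcg's per-node shrinker bitmap can hold bit @new_id.
 * Returns immediately if the current map size is already large enough;
 * otherwise the maps of every memcg are reallocated under
 * memcg_shrinker_map_mutex.
 */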
int memcg_expand_shrinker_maps(int new_id)
{
	int size, old_size, ret = 0;
	struct mem_cgroup *memcg;

	size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
	old_size = memcg_shrinker_map_size;
	if (size <= old_size)
		return 0;

	mutex_lock(&memcg_shrinker_map_mutex);
	if (!root_mem_cgroup)
		goto unlock;

	for_each_mem_cgroup(memcg) {
		if (mem_cgroup_is_root(memcg))
			continue;
		ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
		if (ret)
			goto unlock;
	}
unlock:
	if (!ret)
		memcg_shrinker_map_size = size;
	mutex_unlock(&memcg_shrinker_map_mutex);
	return ret;
}

void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
		struct memcg_shrinker_map *map;

		rcu_read_lock();
		map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
		/* Pairs with smp mb in shrink_slab() */
		smp_mb__before_atomic();
		set_bit(shrinker_id, map->map);
		rcu_read_unlock();
	}
}

#else /* CONFIG_MEMCG_KMEM */
static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
{
	return 0;
}
static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
#endif /* CONFIG_MEMCG_KMEM */

/**
 * mem_cgroup_css_from_page - css of the memcg associated with a page
 * @page: page of interest
 *
 * If memcg is bound to the default hierarchy, css of the memcg associated
 * with @page is returned.  The returned css remains associated with @page
 * until it is released.
 *
 * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 * is returned.
 */
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
{
	struct mem_cgroup *memcg;

	memcg = page->mem_cgroup;

	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		memcg = root_mem_cgroup;

	return &memcg->css;
}

/**
 * page_cgroup_ino - return inode number of the memcg a page is charged to
 * @page: the page
 *
 * Look up the closest online ancestor of the memory cgroup @page is charged to
 * and return its inode number or 0 if @page is not charged to any cgroup. It
 * is safe to call this function without holding a reference to @page.
 *
 * Note, this function is inherently racy, because there is nothing to prevent
 * the cgroup inode from getting torn down and potentially reallocated a moment
 * after page_cgroup_ino() returns, so it only should be used by callers that
 * do not care (such as procfs interfaces).
 */
ino_t page_cgroup_ino(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long ino = 0;

	rcu_read_lock();
	memcg = READ_ONCE(page->mem_cgroup);
	while (memcg && !(memcg->css.flags & CSS_ONLINE))
		memcg = parent_mem_cgroup(memcg);
	if (memcg)
		ino = cgroup_ino(memcg->css.cgroup);
	rcu_read_unlock();
	return ino;
}

static struct mem_cgroup_per_node *
mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
{
	int nid = page_to_nid(page);

	return memcg->nodeinfo[nid];
}

static struct mem_cgroup_tree_per_node *
soft_limit_tree_node(int nid)
{
	return soft_limit_tree.rb_tree_per_node[nid];
}

static struct mem_cgroup_tree_per_node *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);

	return soft_limit_tree.rb_tree_per_node[nid];
}

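/*
 * Link @mz into the per-node soft limit tree, ordered by how far the memcg
 * is over its soft limit.  The rightmost node is the worst offender and is
 * the one soft limit reclaim picks first.
 */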
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz,
					 unsigned long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_node *mz_node;
	bool rightmost = true;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_node,
					tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess) {
			p = &(*p)->rb_left;
			rightmost = false;
		}

		/*
		 * We can't avoid mem cgroups that are over their soft
		 * limit by the same amount
		 */
		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
			p = &(*p)->rb_right;
	}

	if (rightmost)
		mctz->rb_rightmost = &mz->tree_node;

	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz)
{
	if (!mz->on_tree)
		return;

	if (&mz->tree_node == mctz->rb_rightmost)
		mctz->rb_rightmost = rb_prev(&mz->tree_node);

	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
				       struct mem_cgroup_tree_per_node *mctz)
{
	unsigned long flags;

	spin_lock_irqsave(&mctz->lock, flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, flags);
}

static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
	unsigned long nr_pages = page_counter_read(&memcg->memory);
	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
	unsigned long excess = 0;

	if (nr_pages > soft_limit)
		excess = nr_pages - soft_limit;

	return excess;
}

static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
	unsigned long excess;
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup_tree_per_node *mctz;

	mctz = soft_limit_tree_from_page(page);
	if (!mctz)
		return;
	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = mem_cgroup_page_nodeinfo(memcg, page);
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	struct mem_cgroup_tree_per_node *mctz;
	struct mem_cgroup_per_node *mz;
	int nid;

	for_each_node(nid) {
		mz = mem_cgroup_nodeinfo(memcg, nid);
		mctz = soft_limit_tree_node(nid);
		if (mctz)
			mem_cgroup_remove_exceeded(mz, mctz);
	}
}

static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

retry:
	mz = NULL;
	if (!mctz->rb_rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(mctz->rb_rightmost,
		      struct mem_cgroup_per_node, tree_node);
	/*
	 * Remove the node now but someone else can add it back,
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget_online(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
	return mz;
}

static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
				      int event)
{
	return atomic_long_read(&memcg->events[event]);
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 bool compound, int nr_pages)
{
	/*
	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
	 * counted as CACHE even if it's on ANON LRU.
	 */
	if (PageAnon(page))
		__mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
	else {
		__mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
		if (PageSwapBacked(page))
			__mod_memcg_state(memcg, NR_SHMEM, nr_pages);
	}

	if (compound) {
		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
		__mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
	}

	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__count_memcg_events(memcg, PGPGIN, 1);
	else {
		__count_memcg_events(memcg, PGPGOUT, 1);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
}

unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
					   int nid, unsigned int lru_mask)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
	unsigned long nr = 0;
	enum lru_list lru;

	VM_BUG_ON((unsigned)nid >= nr_node_ids);

	for_each_lru(lru) {
		if (!(BIT(lru) & lru_mask))
			continue;
		nr += mem_cgroup_get_lru_size(lruvec, lru);
	}
	return nr;
}

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
			unsigned int lru_mask)
{
	unsigned long nr = 0;
	int nid;

	for_each_node_state(nid, N_MEMORY)
		nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
	return nr;
}

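/*
 * Returns true roughly once per *_EVENTS_TARGET page events on this CPU for
 * the given @target, by comparing the per-cpu page event counter against the
 * per-cpu target and advancing the target once it has been passed.
 */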
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
	next = __this_cpu_read(memcg->stat_cpu->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)(next - val) < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_NUMAINFO:
			next = val + NUMAINFO_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->stat_cpu->targets[target], next);
		return true;
	}
	return false;
}

/*
 * Check events in order.
 */
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;
		bool do_numainfo __maybe_unused;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
#if MAX_NUMNODES > 1
		do_numainfo = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_NUMAINFO);
#endif
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, page);
#if MAX_NUMNODES > 1
		if (unlikely(do_numainfo))
			atomic_inc(&memcg->numainfo_events);
#endif
	}
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
EXPORT_SYMBOL(mem_cgroup_from_task);

/**
 * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
 * @mm: mm from which memcg should be extracted. It can be NULL.
 *
 * Obtain a reference on mm->memcg and return it if successful. Otherwise
 * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is
 * returned.
 */
struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return NULL;

	rcu_read_lock();
	do {
		/*
		 * Page cache insertions can happen without an
		 * actual mm context, e.g. during disk probing
		 * on boot, loopback IO, acct() writes etc.
		 */
		if (unlikely(!mm))
			memcg = root_mem_cgroup;
		else {
			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
			if (unlikely(!memcg))
				memcg = root_mem_cgroup;
		}
	} while (!css_tryget(&memcg->css));
	rcu_read_unlock();
	return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_mm);

/**
 * get_mem_cgroup_from_page: Obtain a reference on given page's memcg.
 * @page: page from which memcg should be extracted.
 *
 * Obtain a reference on page->memcg and return it if successful. Otherwise
 * root_mem_cgroup is returned.
 */
struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
{
	struct mem_cgroup *memcg = page->mem_cgroup;

	if (mem_cgroup_disabled())
		return NULL;

	rcu_read_lock();
	if (!memcg || !css_tryget_online(&memcg->css))
		memcg = root_mem_cgroup;
	rcu_read_unlock();
	return memcg;
}
EXPORT_SYMBOL(get_mem_cgroup_from_page);

/**
 * If current->active_memcg is non-NULL, do not fall back to current->mm->memcg.
 */
static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
{
	if (unlikely(current->active_memcg)) {
		struct mem_cgroup *memcg = root_mem_cgroup;

		rcu_read_lock();
		if (css_tryget_online(&current->active_memcg->css))
			memcg = current->active_memcg;
		rcu_read_unlock();
		return memcg;
	}
	return get_mem_cgroup_from_mm(current->mm);
}

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a node and a priority level in @reclaim to
 * divide up the memcgs in the hierarchy among all concurrent
 * reclaimers operating on the same node and priority.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
	struct cgroup_subsys_state *css = NULL;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *pos = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	if (prev && !reclaim)
		pos = prev;

	if (!root->use_hierarchy && root != root_mem_cgroup) {
		if (prev)
			goto out;
		return root;
	}

	rcu_read_lock();

	if (reclaim) {
		struct mem_cgroup_per_node *mz;

		mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
		iter = &mz->iter[reclaim->priority];

		if (prev && reclaim->generation != iter->generation)
			goto out_unlock;

		while (1) {
			pos = READ_ONCE(iter->position);
			if (!pos || css_tryget(&pos->css))
				break;
			/*
			 * css reference reached zero, so iter->position will
			 * be cleared by ->css_released. However, we should not
			 * rely on this happening soon, because ->css_released
			 * is called from a work queue, and by busy-waiting we
			 * might block it. So we clear iter->position right
			 * away.
			 */
			(void)cmpxchg(&iter->position, pos, NULL);
		}
	}

	if (pos)
		css = &pos->css;

	for (;;) {
		css = css_next_descendant_pre(css, &root->css);
		if (!css) {
			/*
			 * Reclaimers share the hierarchy walk, and a
			 * new one might jump in right at the end of
			 * the hierarchy - make sure they see at least
			 * one group and restart from the beginning.
			 */
			if (!prev)
				continue;
			break;
		}

		/*
		 * Verify the css and acquire a reference.  The root
		 * is provided by the caller, so we know it's alive
		 * and kicking, and don't take an extra reference.
		 */
		memcg = mem_cgroup_from_css(css);

		if (css == &root->css)
			break;

		if (css_tryget(css))
			break;

		memcg = NULL;
	}

	if (reclaim) {
		/*
		 * The position could have already been updated by a competing
		 * thread, so check that the value hasn't changed since we read
		 * it to avoid reclaiming from the same cgroup twice.
		 */
		(void)cmpxchg(&iter->position, pos, memcg);

		if (pos)
			css_put(&pos->css);

		if (!memcg)
			iter->generation++;
		else if (!prev)
			reclaim->generation = iter->generation;
	}

out_unlock:
	rcu_read_unlock();
out:
	if (prev && prev != root)
		css_put(&prev->css);

	return memcg;
}

/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
void mem_cgroup_iter_break(struct mem_cgroup *root,
			   struct mem_cgroup *prev)
{
	if (!root)
		root = root_mem_cgroup;
	if (prev && prev != root)
		css_put(&prev->css);
}

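/*
 * Drop any cached reclaim iterator positions in @from that still point at
 * @dead_memcg, so that a dying memcg cannot be returned by later shared
 * hierarchy walks.
 */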
static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
					struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup_reclaim_iter *iter;
	struct mem_cgroup_per_node *mz;
	int nid;
	int i;

	for_each_node(nid) {
		mz = mem_cgroup_nodeinfo(from, nid);
		for (i = 0; i <= DEF_PRIORITY; i++) {
			iter = &mz->iter[i];
			cmpxchg(&iter->position,
				dead_memcg, NULL);
		}
	}
}

static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
{
	struct mem_cgroup *memcg = dead_memcg;
	struct mem_cgroup *last;

	do {
		__invalidate_reclaim_iterators(memcg, dead_memcg);
		last = memcg;
	} while ((memcg = parent_mem_cgroup(memcg)));

	/*
	 * When cgroup1 non-hierarchy mode is used,
	 * parent_mem_cgroup() does not walk all the way up to the
	 * cgroup root (root_mem_cgroup). So we have to handle
	 * dead_memcg from cgroup root separately.
	 */
	if (last != root_mem_cgroup)
		__invalidate_reclaim_iterators(root_mem_cgroup,
						dead_memcg);
}

/**
 * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 * @memcg: hierarchy root
 * @fn: function to call for each task
 * @arg: argument passed to @fn
 *
 * This function iterates over tasks attached to @memcg or to any of its
 * descendants and calls @fn for each task. If @fn returns a non-zero
 * value, the function breaks the iteration loop and returns the value.
 * Otherwise, it will iterate over all tasks and return 0.
 *
 * This function must not be called for the root memory cgroup.
 */
int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
			  int (*fn)(struct task_struct *, void *), void *arg)
{
	struct mem_cgroup *iter;
	int ret = 0;

	BUG_ON(memcg == root_mem_cgroup);

	for_each_mem_cgroup_tree(iter, memcg) {
		struct css_task_iter it;
		struct task_struct *task;

		css_task_iter_start(&iter->css, 0, &it);
		while (!ret && (task = css_task_iter_next(&it)))
			ret = fn(task, arg);
		css_task_iter_end(&it);
		if (ret) {
			mem_cgroup_iter_break(memcg, iter);
			break;
		}
	}
	return ret;
}

/**
 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
 * @page: the page
 * @pgdat: pgdat of the page
 *
 * This function is only safe when following the LRU page isolation
 * and putback protocol: the LRU lock must be held, and the page must
 * either be PageLRU() or the caller must have isolated/allocated it.
 */
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
{
	struct mem_cgroup_per_node *mz;
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	if (mem_cgroup_disabled()) {
		lruvec = &pgdat->lruvec;
		goto out;
	}

	memcg = page->mem_cgroup;
	/*
	 * Swapcache readahead pages are added to the LRU - and
	 * possibly migrated - before they are charged.
	 */
	if (!memcg)
		memcg = root_mem_cgroup;

	mz = mem_cgroup_page_nodeinfo(memcg, page);
	lruvec = &mz->lruvec;
out:
	/*
	 * Since a node can be onlined after the mem_cgroup was created,
	 * we have to be prepared to initialize lruvec->pgdat here;
	 * and if offlined then reonlined, we need to reinitialize it.
	 */
	if (unlikely(lruvec->pgdat != pgdat))
		lruvec->pgdat = pgdat;
	return lruvec;
}

/**
 * mem_cgroup_update_lru_size - account for adding or removing an lru page
 * @lruvec: mem_cgroup per zone lru vector
 * @lru: index of lru list the page is sitting on
 * @zid: zone id of the accounted pages
 * @nr_pages: positive when adding or negative when removing
 *
 * This function must be called under lru_lock, just before a page is added
 * to or just after a page is removed from an lru list (that ordering being
 * so as to allow it to check that lru_size 0 is consistent with list_empty).
 */
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
				int zid, int nr_pages)
{
	struct mem_cgroup_per_node *mz;
	unsigned long *lru_size;
	long size;

	if (mem_cgroup_disabled())
		return;

	mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	lru_size = &mz->lru_zone_size[zid][lru];

	if (nr_pages < 0)
		*lru_size += nr_pages;

	size = *lru_size;
	if (WARN_ONCE(size < 0,
		"%s(%p, %d, %d): lru_size %ld\n",
		__func__, lruvec, lru, nr_pages, size)) {
		VM_BUG_ON(1);
		*lru_size = 0;
	}

	if (nr_pages > 0)
		*lru_size += nr_pages;
}

bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
{
	struct mem_cgroup *task_memcg;
	struct task_struct *p;
	bool ret;

	p = find_lock_task_mm(task);
	if (p) {
		task_memcg = get_mem_cgroup_from_mm(p->mm);
		task_unlock(p);
	} else {
		/*
		 * All threads may have already detached their mm's, but the oom
		 * killer still needs to detect if they have already been oom
		 * killed to prevent needlessly killing additional tasks.
		 */
		rcu_read_lock();
		task_memcg = mem_cgroup_from_task(task);
		css_get(&task_memcg->css);
		rcu_read_unlock();
	}
	ret = mem_cgroup_is_descendant(task_memcg, memcg);
	css_put(&task_memcg->css);
	return ret;
}

/**
 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
 * @memcg: the memory cgroup
 *
 * Returns the maximum amount of memory @mem can be charged with, in
 * pages.
 */
static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
{
	unsigned long margin = 0;
	unsigned long count;
	unsigned long limit;

	count = page_counter_read(&memcg->memory);
	limit = READ_ONCE(memcg->memory.max);
	if (count < limit)
		margin = limit - count;

	if (do_memsw_account()) {
		count = page_counter_read(&memcg->memsw);
		limit = READ_ONCE(memcg->memsw.max);
		if (count <= limit)
			margin = min(margin, limit - count);
		else
			margin = 0;
	}

	return margin;
}

/*
 * A routine for checking whether "mem" is under move_account() or not.
 *
 * Checks whether a cgroup is mc.from or mc.to or under the hierarchy of a
 * moving cgroup. This is for waiting at high memory pressure
 * caused by "move".
 */
static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
{
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	bool ret = false;
	/*
	 * Unlike task_move routines, we access mc.to, mc.from not under
	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
	 */
	spin_lock(&mc.lock);
	from = mc.from;
	to = mc.to;
	if (!from)
		goto unlock;

	ret = mem_cgroup_is_descendant(from, memcg) ||
		mem_cgroup_is_descendant(to, memcg);
unlock:
	spin_unlock(&mc.lock);
	return ret;
}

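/*
 * If a charge-moving task is active and @memcg is involved in the move,
 * sleep on mc.waitq until the move finishes.  Returns true if we waited.
 */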
 | 1281 | static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) | 
 | 1282 | { | 
 | 1283 | 	if (mc.moving_task && current != mc.moving_task) { | 
 | 1284 | 		if (mem_cgroup_under_move(memcg)) { | 
 | 1285 | 			DEFINE_WAIT(wait); | 
 | 1286 | 			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); | 
 | 1287 | 			/* moving charge context might have finished. */ | 
 | 1288 | 			if (mc.moving_task) | 
 | 1289 | 				schedule(); | 
 | 1290 | 			finish_wait(&mc.waitq, &wait); | 
 | 1291 | 			return true; | 
 | 1292 | 		} | 
 | 1293 | 	} | 
 | 1294 | 	return false; | 
 | 1295 | } | 
 | 1296 |  | 
 | 1297 | static const unsigned int memcg1_stats[] = { | 
 | 1298 | 	MEMCG_CACHE, | 
 | 1299 | 	MEMCG_RSS, | 
 | 1300 | 	MEMCG_RSS_HUGE, | 
 | 1301 | 	NR_SHMEM, | 
 | 1302 | 	NR_FILE_MAPPED, | 
 | 1303 | 	NR_FILE_DIRTY, | 
 | 1304 | 	NR_WRITEBACK, | 
 | 1305 | 	MEMCG_SWAP, | 
 | 1306 | }; | 
 | 1307 |  | 
 | 1308 | static const char *const memcg1_stat_names[] = { | 
 | 1309 | 	"cache", | 
 | 1310 | 	"rss", | 
 | 1311 | 	"rss_huge", | 
 | 1312 | 	"shmem", | 
 | 1313 | 	"mapped_file", | 
 | 1314 | 	"dirty", | 
 | 1315 | 	"writeback", | 
 | 1316 | 	"swap", | 
 | 1317 | }; | 
 | 1318 |  | 
 | 1319 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 
 | 1320 | /** | 
 | 1321 |  * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. | 
 | 1322 |  * @memcg: The memory cgroup that went over limit | 
 | 1323 |  * @p: Task that is going to be killed | 
 | 1324 |  * | 
 | 1325 |  * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is | 
 | 1326 |  * enabled | 
 | 1327 |  */ | 
 | 1328 | void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | 
 | 1329 | { | 
 | 1330 | 	struct mem_cgroup *iter; | 
 | 1331 | 	unsigned int i; | 
 | 1332 |  | 
 | 1333 | 	rcu_read_lock(); | 
 | 1334 |  | 
 | 1335 | 	if (p) { | 
 | 1336 | 		pr_info("Task in "); | 
 | 1337 | 		pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); | 
 | 1338 | 		pr_cont(" killed as a result of limit of "); | 
 | 1339 | 	} else { | 
 | 1340 | 		pr_info("Memory limit reached of cgroup "); | 
 | 1341 | 	} | 
 | 1342 |  | 
 | 1343 | 	pr_cont_cgroup_path(memcg->css.cgroup); | 
 | 1344 | 	pr_cont("\n"); | 
 | 1345 |  | 
 | 1346 | 	rcu_read_unlock(); | 
 | 1347 |  | 
 | 1348 | 	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", | 
 | 1349 | 		K((u64)page_counter_read(&memcg->memory)), | 
 | 1350 | 		K((u64)memcg->memory.max), memcg->memory.failcnt); | 
 | 1351 | 	pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", | 
 | 1352 | 		K((u64)page_counter_read(&memcg->memsw)), | 
 | 1353 | 		K((u64)memcg->memsw.max), memcg->memsw.failcnt); | 
 | 1354 | 	pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", | 
 | 1355 | 		K((u64)page_counter_read(&memcg->kmem)), | 
 | 1356 | 		K((u64)memcg->kmem.max), memcg->kmem.failcnt); | 
 | 1357 |  | 
 | 1358 | 	for_each_mem_cgroup_tree(iter, memcg) { | 
 | 1359 | 		pr_info("Memory cgroup stats for "); | 
 | 1360 | 		pr_cont_cgroup_path(iter->css.cgroup); | 
 | 1361 | 		pr_cont(":"); | 
 | 1362 |  | 
 | 1363 | 		for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { | 
 | 1364 | 			if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account) | 
 | 1365 | 				continue; | 
 | 1366 | 			pr_cont(" %s:%luKB", memcg1_stat_names[i], | 
 | 1367 | 				K(memcg_page_state(iter, memcg1_stats[i]))); | 
 | 1368 | 		} | 
 | 1369 |  | 
 | 1370 | 		for (i = 0; i < NR_LRU_LISTS; i++) | 
 | 1371 | 			pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], | 
 | 1372 | 				K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); | 
 | 1373 |  | 
 | 1374 | 		pr_cont("\n"); | 
 | 1375 | 	} | 
 | 1376 | } | 
 | 1377 |  | 
 | 1378 | /* | 
 | 1379 |  * Return the memory (and swap, if configured) limit for a memcg. | 
 | 1380 |  */ | 
 | 1381 | unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) | 
 | 1382 | { | 
 | 1383 | 	unsigned long max; | 
 | 1384 |  | 
 | 1385 | 	max = memcg->memory.max; | 
 | 1386 | 	if (mem_cgroup_swappiness(memcg)) { | 
 | 1387 | 		unsigned long memsw_max; | 
 | 1388 | 		unsigned long swap_max; | 
 | 1389 |  | 
 | 1390 | 		memsw_max = memcg->memsw.max; | 
 | 1391 | 		swap_max = memcg->swap.max; | 
 | 1392 | 		swap_max = min(swap_max, (unsigned long)total_swap_pages); | 
 | 1393 | 		max = min(max + swap_max, memsw_max); | 
 | 1394 | 	} | 
 | 1395 | 	return max; | 
 | 1396 | } | 
 | 1397 |  | 
 | 1398 | static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | 
 | 1399 | 				     int order) | 
 | 1400 | { | 
 | 1401 | 	struct oom_control oc = { | 
 | 1402 | 		.zonelist = NULL, | 
 | 1403 | 		.nodemask = NULL, | 
 | 1404 | 		.memcg = memcg, | 
 | 1405 | 		.gfp_mask = gfp_mask, | 
 | 1406 | 		.order = order, | 
 | 1407 | 	}; | 
 | 1408 | 	bool ret; | 
 | 1409 |  | 
 | 1410 | 	if (mutex_lock_killable(&oom_lock)) | 
 | 1411 | 		return true; | 
 | 1412 | 	/* | 
 | 1413 | 	 * A few threads which were not waiting at mutex_lock_killable() can | 
 | 1414 | 	 * fail to bail out. Therefore, check again after holding oom_lock. | 
 | 1415 | 	 */ | 
 | 1416 | 	ret = should_force_charge() || out_of_memory(&oc); | 
 | 1417 | 	mutex_unlock(&oom_lock); | 
 | 1418 | 	return ret; | 
 | 1419 | } | 
 | 1420 |  | 
 | 1421 | #if MAX_NUMNODES > 1 | 
 | 1422 |  | 
 | 1423 | /** | 
 | 1424 |  * test_mem_cgroup_node_reclaimable | 
 | 1425 |  * @memcg: the target memcg | 
 | 1426 |  * @nid: the node ID to be checked. | 
 | 1427 |  * @noswap : specify true here if the user wants flle only information. | 
 | 1428 |  * | 
 | 1429 |  * This function returns whether the specified memcg contains any | 
 | 1430 |  * reclaimable pages on a node. Returns true if there are any reclaimable | 
 | 1431 |  * pages in the node. | 
 | 1432 |  */ | 
 | 1433 | static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, | 
 | 1434 | 		int nid, bool noswap) | 
 | 1435 | { | 
 | 1436 | 	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) | 
 | 1437 | 		return true; | 
 | 1438 | 	if (noswap || !total_swap_pages) | 
 | 1439 | 		return false; | 
 | 1440 | 	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) | 
 | 1441 | 		return true; | 
 | 1442 | 	return false; | 
 | 1443 |  | 
 | 1444 | } | 
 | 1445 |  | 
 | 1446 | /* | 
 | 1447 |  * Always updating the nodemask is not very good - even if we have an empty | 
 | 1448 |  * list or the wrong list here, we can start from some node and traverse all | 
 | 1449 |  * nodes based on the zonelist. So update the list loosely once per 10 secs. | 
 | 1450 |  * | 
 | 1451 |  */ | 
 | 1452 | static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) | 
 | 1453 | { | 
 | 1454 | 	int nid; | 
 | 1455 | 	/* | 
 | 1456 | 	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET | 
 | 1457 | 	 * pagein/pageout changes since the last update. | 
 | 1458 | 	 */ | 
 | 1459 | 	if (!atomic_read(&memcg->numainfo_events)) | 
 | 1460 | 		return; | 
 | 1461 | 	if (atomic_inc_return(&memcg->numainfo_updating) > 1) | 
 | 1462 | 		return; | 
 | 1463 |  | 
 | 1464 | 	/* make a nodemask where this memcg uses memory from */ | 
 | 1465 | 	memcg->scan_nodes = node_states[N_MEMORY]; | 
 | 1466 |  | 
 | 1467 | 	for_each_node_mask(nid, node_states[N_MEMORY]) { | 
 | 1468 |  | 
 | 1469 | 		if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) | 
 | 1470 | 			node_clear(nid, memcg->scan_nodes); | 
 | 1471 | 	} | 
 | 1472 |  | 
 | 1473 | 	atomic_set(&memcg->numainfo_events, 0); | 
 | 1474 | 	atomic_set(&memcg->numainfo_updating, 0); | 
 | 1475 | } | 
 | 1476 |  | 
 | 1477 | /* | 
 | 1478 |  * Select a node to start reclaim from. Because all we need is to reduce the | 
 | 1479 |  * usage counter, starting from anywhere is O.K. Reclaiming from the current | 
 | 1480 |  * node has pros and cons: | 
 | 1481 |  * | 
 | 1482 |  * Freeing memory from the current node means freeing memory from a node which | 
 | 1483 |  * we'll use or have used, so it may disturb the LRU there, and if several | 
 | 1484 |  * threads hit their limits they will contend on that node. But freeing from a | 
 | 1485 |  * remote node means more cost for memory reclaim because of memory latency. | 
 | 1486 |  * | 
 | 1487 |  * For now we use round-robin. A better algorithm is welcome. | 
 | 1488 |  */ | 
 | 1489 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) | 
 | 1490 | { | 
 | 1491 | 	int node; | 
 | 1492 |  | 
 | 1493 | 	mem_cgroup_may_update_nodemask(memcg); | 
 | 1494 | 	node = memcg->last_scanned_node; | 
 | 1495 |  | 
 | 1496 | 	node = next_node_in(node, memcg->scan_nodes); | 
 | 1497 | 	/* | 
 | 1498 | 	 * mem_cgroup_may_update_nodemask might have seen no reclaimable pages | 
 | 1499 | 	 * last time it really checked all the LRUs due to rate limiting. | 
 | 1500 | 	 * Fallback to the current node in that case for simplicity. | 
 | 1501 | 	 */ | 
 | 1502 | 	if (unlikely(node == MAX_NUMNODES)) | 
 | 1503 | 		node = numa_node_id(); | 
 | 1504 |  | 
 | 1505 | 	memcg->last_scanned_node = node; | 
 | 1506 | 	return node; | 
 | 1507 | } | 
 | 1508 | #else | 
 | 1509 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) | 
 | 1510 | { | 
 | 1511 | 	return 0; | 
 | 1512 | } | 
 | 1513 | #endif | 
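 |  |  | 
 |  | /* | 
 |  |  * Illustrative behaviour of the round-robin selection above, assuming a | 
 |  |  * 4-node system where scan_nodes ended up as {0, 2} (a worked example of | 
 |  |  * next_node_in() semantics, not part of the build): | 
 |  |  * | 
 |  |  *	last_scanned_node == 0  -> next_node_in(0, {0,2}) == 2 | 
 |  |  *	last_scanned_node == 2  -> next_node_in(2, {0,2}) == 0   (wraps around) | 
 |  |  *	scan_nodes empty        -> next_node_in(n, {})    == MAX_NUMNODES, | 
 |  |  *				   which triggers the numa_node_id() fallback. | 
 |  |  */ | 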
 | 1514 |  | 
 | 1515 | static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, | 
 | 1516 | 				   pg_data_t *pgdat, | 
 | 1517 | 				   gfp_t gfp_mask, | 
 | 1518 | 				   unsigned long *total_scanned) | 
 | 1519 | { | 
 | 1520 | 	struct mem_cgroup *victim = NULL; | 
 | 1521 | 	int total = 0; | 
 | 1522 | 	int loop = 0; | 
 | 1523 | 	unsigned long excess; | 
 | 1524 | 	unsigned long nr_scanned; | 
 | 1525 | 	struct mem_cgroup_reclaim_cookie reclaim = { | 
 | 1526 | 		.pgdat = pgdat, | 
 | 1527 | 		.priority = 0, | 
 | 1528 | 	}; | 
 | 1529 |  | 
 | 1530 | 	excess = soft_limit_excess(root_memcg); | 
 | 1531 |  | 
 | 1532 | 	while (1) { | 
 | 1533 | 		victim = mem_cgroup_iter(root_memcg, victim, &reclaim); | 
 | 1534 | 		if (!victim) { | 
 | 1535 | 			loop++; | 
 | 1536 | 			if (loop >= 2) { | 
 | 1537 | 				/* | 
 | 1538 | 				 * If we have not been able to reclaim | 
 | 1539 | 				 * anything, it might be because there are | 
 | 1540 | 				 * no reclaimable pages under this hierarchy | 
 | 1541 | 				 */ | 
 | 1542 | 				if (!total) | 
 | 1543 | 					break; | 
 | 1544 | 				/* | 
 | 1545 | 				 * We want to do more targeted reclaim. | 
 | 1546 | 				 * excess >> 2 is not too large, so we do not | 
 | 1547 | 				 * reclaim too much, nor too small, so we do not | 
 | 1548 | 				 * keep coming back to reclaim from this cgroup. | 
 | 1549 | 				 */ | 
 | 1550 | 				if (total >= (excess >> 2) || | 
 | 1551 | 					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) | 
 | 1552 | 					break; | 
 | 1553 | 			} | 
 | 1554 | 			continue; | 
 | 1555 | 		} | 
 | 1556 | 		total += mem_cgroup_shrink_node(victim, gfp_mask, false, | 
 | 1557 | 					pgdat, &nr_scanned); | 
 | 1558 | 		*total_scanned += nr_scanned; | 
 | 1559 | 		if (!soft_limit_excess(root_memcg)) | 
 | 1560 | 			break; | 
 | 1561 | 	} | 
 | 1562 | 	mem_cgroup_iter_break(root_memcg, victim); | 
 | 1563 | 	return total; | 
 | 1564 | } | 
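 |  |  | 
 |  | /* | 
 |  |  * Worked example for the termination logic above (illustrative numbers | 
 |  |  * only): with excess == 4096 pages over the soft limit, the loop stops as | 
 |  |  * soon as soft_limit_excess() drops to zero, stops after two full hierarchy | 
 |  |  * walks once at least excess >> 2 == 1024 pages have been reclaimed, gives | 
 |  |  * up after two walks that reclaimed nothing at all, and never exceeds | 
 |  |  * MEM_CGROUP_MAX_RECLAIM_LOOPS walks in total. | 
 |  |  */ | 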
 | 1565 |  | 
 | 1566 | #ifdef CONFIG_LOCKDEP | 
 | 1567 | static struct lockdep_map memcg_oom_lock_dep_map = { | 
 | 1568 | 	.name = "memcg_oom_lock", | 
 | 1569 | }; | 
 | 1570 | #endif | 
 | 1571 |  | 
 | 1572 | static DEFINE_SPINLOCK(memcg_oom_lock); | 
 | 1573 |  | 
 | 1574 | /* | 
 | 1575 |  * Check whether the OOM killer is already running under our hierarchy. | 
 | 1576 |  * If someone else is already running it, return false. | 
 | 1577 |  */ | 
 | 1578 | static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) | 
 | 1579 | { | 
 | 1580 | 	struct mem_cgroup *iter, *failed = NULL; | 
 | 1581 |  | 
 | 1582 | 	spin_lock(&memcg_oom_lock); | 
 | 1583 |  | 
 | 1584 | 	for_each_mem_cgroup_tree(iter, memcg) { | 
 | 1585 | 		if (iter->oom_lock) { | 
 | 1586 | 			/* | 
 | 1587 | 			 * this subtree of our hierarchy is already locked, | 
 | 1588 | 			 * so we cannot take the lock. | 
 | 1589 | 			 */ | 
 | 1590 | 			failed = iter; | 
 | 1591 | 			mem_cgroup_iter_break(memcg, iter); | 
 | 1592 | 			break; | 
 | 1593 | 		} else | 
 | 1594 | 			iter->oom_lock = true; | 
 | 1595 | 	} | 
 | 1596 |  | 
 | 1597 | 	if (failed) { | 
 | 1598 | 		/* | 
 | 1599 | 		 * OK, we failed to lock the whole subtree, so we have to | 
 | 1600 | 		 * clean up what we set up before reaching the failing memcg. | 
 | 1601 | 		 */ | 
 | 1602 | 		for_each_mem_cgroup_tree(iter, memcg) { | 
 | 1603 | 			if (iter == failed) { | 
 | 1604 | 				mem_cgroup_iter_break(memcg, iter); | 
 | 1605 | 				break; | 
 | 1606 | 			} | 
 | 1607 | 			iter->oom_lock = false; | 
 | 1608 | 		} | 
 | 1609 | 	} else | 
 | 1610 | 		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); | 
 | 1611 |  | 
 | 1612 | 	spin_unlock(&memcg_oom_lock); | 
 | 1613 |  | 
 | 1614 | 	return !failed; | 
 | 1615 | } | 
 | 1616 |  | 
 | 1617 | static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) | 
 | 1618 | { | 
 | 1619 | 	struct mem_cgroup *iter; | 
 | 1620 |  | 
 | 1621 | 	spin_lock(&memcg_oom_lock); | 
 | 1622 | 	mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_); | 
 | 1623 | 	for_each_mem_cgroup_tree(iter, memcg) | 
 | 1624 | 		iter->oom_lock = false; | 
 | 1625 | 	spin_unlock(&memcg_oom_lock); | 
 | 1626 | } | 
 | 1627 |  | 
 | 1628 | static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) | 
 | 1629 | { | 
 | 1630 | 	struct mem_cgroup *iter; | 
 | 1631 |  | 
 | 1632 | 	spin_lock(&memcg_oom_lock); | 
 | 1633 | 	for_each_mem_cgroup_tree(iter, memcg) | 
 | 1634 | 		iter->under_oom++; | 
 | 1635 | 	spin_unlock(&memcg_oom_lock); | 
 | 1636 | } | 
 | 1637 |  | 
 | 1638 | static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) | 
 | 1639 | { | 
 | 1640 | 	struct mem_cgroup *iter; | 
 | 1641 |  | 
 | 1642 | 	/* | 
 | 1643 | 	 * When a new child is created while the hierarchy is under oom, | 
 | 1644 | 	 * mem_cgroup_oom_lock() may not be called. Watch for underflow. | 
 | 1645 | 	 */ | 
 | 1646 | 	spin_lock(&memcg_oom_lock); | 
 | 1647 | 	for_each_mem_cgroup_tree(iter, memcg) | 
 | 1648 | 		if (iter->under_oom > 0) | 
 | 1649 | 			iter->under_oom--; | 
 | 1650 | 	spin_unlock(&memcg_oom_lock); | 
 | 1651 | } | 
 | 1652 |  | 
 | 1653 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | 
 | 1654 |  | 
 | 1655 | struct oom_wait_info { | 
 | 1656 | 	struct mem_cgroup *memcg; | 
 | 1657 | 	wait_queue_entry_t	wait; | 
 | 1658 | }; | 
 | 1659 |  | 
 | 1660 | static int memcg_oom_wake_function(wait_queue_entry_t *wait, | 
 | 1661 | 	unsigned mode, int sync, void *arg) | 
 | 1662 | { | 
 | 1663 | 	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; | 
 | 1664 | 	struct mem_cgroup *oom_wait_memcg; | 
 | 1665 | 	struct oom_wait_info *oom_wait_info; | 
 | 1666 |  | 
 | 1667 | 	oom_wait_info = container_of(wait, struct oom_wait_info, wait); | 
 | 1668 | 	oom_wait_memcg = oom_wait_info->memcg; | 
 | 1669 |  | 
 | 1670 | 	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) && | 
 | 1671 | 	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg)) | 
 | 1672 | 		return 0; | 
 | 1673 | 	return autoremove_wake_function(wait, mode, sync, arg); | 
 | 1674 | } | 
 | 1675 |  | 
 | 1676 | static void memcg_oom_recover(struct mem_cgroup *memcg) | 
 | 1677 | { | 
 | 1678 | 	/* | 
 | 1679 | 	 * For the following lockless ->under_oom test, the only required | 
 | 1680 | 	 * guarantee is that it must see the state asserted by an OOM when | 
 | 1681 | 	 * this function is called as a result of userland actions | 
 | 1682 | 	 * triggered by the notification of the OOM.  This is trivially | 
 | 1683 | 	 * achieved by invoking mem_cgroup_mark_under_oom() before | 
 | 1684 | 	 * triggering notification. | 
 | 1685 | 	 */ | 
 | 1686 | 	if (memcg && memcg->under_oom) | 
 | 1687 | 		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); | 
 | 1688 | } | 
 | 1689 |  | 
 | 1690 | enum oom_status { | 
 | 1691 | 	OOM_SUCCESS, | 
 | 1692 | 	OOM_FAILED, | 
 | 1693 | 	OOM_ASYNC, | 
 | 1694 | 	OOM_SKIPPED | 
 | 1695 | }; | 
 | 1696 |  | 
 | 1697 | static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) | 
 | 1698 | { | 
 | 1699 | 	enum oom_status ret; | 
 | 1700 | 	bool locked; | 
 | 1701 |  | 
 | 1702 | 	if (order > PAGE_ALLOC_COSTLY_ORDER) | 
 | 1703 | 		return OOM_SKIPPED; | 
 | 1704 |  | 
 | 1705 | 	/* | 
 | 1706 | 	 * We are in the middle of the charge context here, so we | 
 | 1707 | 	 * don't want to block when potentially sitting on a callstack | 
 | 1708 | 	 * that holds all kinds of filesystem and mm locks. | 
 | 1709 | 	 * | 
 | 1710 | 	 * cgroup1 allows disabling the OOM killer and waiting for outside | 
 | 1711 | 	 * handling until the charge can succeed; remember the context and put | 
 | 1712 | 	 * the task to sleep at the end of the page fault when all locks are | 
 | 1713 | 	 * released. | 
 | 1714 | 	 * | 
 | 1715 | 	 * On the other hand, in-kernel OOM killer allows for an async victim | 
 | 1716 | 	 * memory reclaim (oom_reaper) and that means that we are not solely | 
 | 1717 | 	 * relying on the oom victim to make a forward progress and we can | 
 | 1718 | 	 * invoke the oom killer here. | 
 | 1719 | 	 * | 
 | 1720 | 	 * Please note that mem_cgroup_out_of_memory might fail to find a | 
 | 1721 | 	 * victim and then we have to bail out from the charge path. | 
 | 1722 | 	 */ | 
 | 1723 | 	if (memcg->oom_kill_disable) { | 
 | 1724 | 		if (!current->in_user_fault) | 
 | 1725 | 			return OOM_SKIPPED; | 
 | 1726 | 		css_get(&memcg->css); | 
 | 1727 | 		current->memcg_in_oom = memcg; | 
 | 1728 | 		current->memcg_oom_gfp_mask = mask; | 
 | 1729 | 		current->memcg_oom_order = order; | 
 | 1730 |  | 
 | 1731 | 		return OOM_ASYNC; | 
 | 1732 | 	} | 
 | 1733 |  | 
 | 1734 | 	mem_cgroup_mark_under_oom(memcg); | 
 | 1735 |  | 
 | 1736 | 	locked = mem_cgroup_oom_trylock(memcg); | 
 | 1737 |  | 
 | 1738 | 	if (locked) | 
 | 1739 | 		mem_cgroup_oom_notify(memcg); | 
 | 1740 |  | 
 | 1741 | 	mem_cgroup_unmark_under_oom(memcg); | 
 | 1742 | 	if (mem_cgroup_out_of_memory(memcg, mask, order)) | 
 | 1743 | 		ret = OOM_SUCCESS; | 
 | 1744 | 	else | 
 | 1745 | 		ret = OOM_FAILED; | 
 | 1746 |  | 
 | 1747 | 	if (locked) | 
 | 1748 | 		mem_cgroup_oom_unlock(memcg); | 
 | 1749 |  | 
 | 1750 | 	return ret; | 
 | 1751 | } | 
 | 1752 |  | 
 | 1753 | /** | 
 | 1754 |  * mem_cgroup_oom_synchronize - complete memcg OOM handling | 
 | 1755 |  * @handle: actually kill/wait or just clean up the OOM state | 
 | 1756 |  * | 
 | 1757 |  * This has to be called at the end of a page fault if the memcg OOM | 
 | 1758 |  * handler was enabled. | 
 | 1759 |  * | 
 | 1760 |  * Memcg supports userspace OOM handling where failed allocations must | 
 | 1761 |  * sleep on a waitqueue until the userspace task resolves the | 
 | 1762 |  * situation.  Sleeping directly in the charge context with all kinds | 
 | 1763 |  * of locks held is not a good idea, instead we remember an OOM state | 
 | 1764 |  * in the task and mem_cgroup_oom_synchronize() has to be called at | 
 | 1765 |  * the end of the page fault to complete the OOM handling. | 
 | 1766 |  * | 
 | 1767 |  * Returns %true if an ongoing memcg OOM situation was detected and | 
 | 1768 |  * completed, %false otherwise. | 
 | 1769 |  */ | 
 | 1770 | bool mem_cgroup_oom_synchronize(bool handle) | 
 | 1771 | { | 
 | 1772 | 	struct mem_cgroup *memcg = current->memcg_in_oom; | 
 | 1773 | 	struct oom_wait_info owait; | 
 | 1774 | 	bool locked; | 
 | 1775 |  | 
 | 1776 | 	/* OOM is global, do not handle */ | 
 | 1777 | 	if (!memcg) | 
 | 1778 | 		return false; | 
 | 1779 |  | 
 | 1780 | 	if (!handle) | 
 | 1781 | 		goto cleanup; | 
 | 1782 |  | 
 | 1783 | 	owait.memcg = memcg; | 
 | 1784 | 	owait.wait.flags = 0; | 
 | 1785 | 	owait.wait.func = memcg_oom_wake_function; | 
 | 1786 | 	owait.wait.private = current; | 
 | 1787 | 	INIT_LIST_HEAD(&owait.wait.entry); | 
 | 1788 |  | 
 | 1789 | 	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); | 
 | 1790 | 	mem_cgroup_mark_under_oom(memcg); | 
 | 1791 |  | 
 | 1792 | 	locked = mem_cgroup_oom_trylock(memcg); | 
 | 1793 |  | 
 | 1794 | 	if (locked) | 
 | 1795 | 		mem_cgroup_oom_notify(memcg); | 
 | 1796 |  | 
 | 1797 | 	if (locked && !memcg->oom_kill_disable) { | 
 | 1798 | 		mem_cgroup_unmark_under_oom(memcg); | 
 | 1799 | 		finish_wait(&memcg_oom_waitq, &owait.wait); | 
 | 1800 | 		mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask, | 
 | 1801 | 					 current->memcg_oom_order); | 
 | 1802 | 	} else { | 
 | 1803 | 		schedule(); | 
 | 1804 | 		mem_cgroup_unmark_under_oom(memcg); | 
 | 1805 | 		finish_wait(&memcg_oom_waitq, &owait.wait); | 
 | 1806 | 	} | 
 | 1807 |  | 
 | 1808 | 	if (locked) { | 
 | 1809 | 		mem_cgroup_oom_unlock(memcg); | 
 | 1810 | 		/* | 
 | 1811 | 		 * There is no guarantee that an OOM-lock contender | 
 | 1812 | 		 * sees the wakeups triggered by the OOM kill | 
 | 1813 | 		 * uncharges.  Wake any sleepers explicitly. | 
 | 1814 | 		 */ | 
 | 1815 | 		memcg_oom_recover(memcg); | 
 | 1816 | 	} | 
 | 1817 | cleanup: | 
 | 1818 | 	current->memcg_in_oom = NULL; | 
 | 1819 | 	css_put(&memcg->css); | 
 | 1820 | 	return true; | 
 | 1821 | } | 
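 |  |  | 
 |  | /* | 
 |  |  * Usage sketch: the page fault exit path gives a deferred memcg OOM a | 
 |  |  * chance to complete before escalating to the global OOM killer, roughly | 
 |  |  * like this (simplified sketch of pagefault_out_of_memory(), not a | 
 |  |  * verbatim copy): | 
 |  |  * | 
 |  |  *	void pagefault_out_of_memory(void) | 
 |  |  *	{ | 
 |  |  *		if (mem_cgroup_oom_synchronize(true)) | 
 |  |  *			return;	// a memcg OOM was handled, nothing more to do | 
 |  |  *		...		// otherwise fall back to the global OOM killer | 
 |  |  *	} | 
 |  |  */ | 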
 | 1822 |  | 
 | 1823 | /** | 
 | 1824 |  * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM | 
 | 1825 |  * @victim: task to be killed by the OOM killer | 
 | 1826 |  * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM | 
 | 1827 |  * | 
 | 1828 |  * Returns a pointer to a memory cgroup, which has to be cleaned up | 
 | 1829 |  * by killing all belonging OOM-killable tasks. | 
 | 1830 |  * | 
 | 1831 |  * Caller has to call mem_cgroup_put() on the returned non-NULL memcg. | 
 | 1832 |  */ | 
 | 1833 | struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, | 
 | 1834 | 					    struct mem_cgroup *oom_domain) | 
 | 1835 | { | 
 | 1836 | 	struct mem_cgroup *oom_group = NULL; | 
 | 1837 | 	struct mem_cgroup *memcg; | 
 | 1838 |  | 
 | 1839 | 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) | 
 | 1840 | 		return NULL; | 
 | 1841 |  | 
 | 1842 | 	if (!oom_domain) | 
 | 1843 | 		oom_domain = root_mem_cgroup; | 
 | 1844 |  | 
 | 1845 | 	rcu_read_lock(); | 
 | 1846 |  | 
 | 1847 | 	memcg = mem_cgroup_from_task(victim); | 
 | 1848 | 	if (memcg == root_mem_cgroup) | 
 | 1849 | 		goto out; | 
 | 1850 |  | 
 | 1851 | 	/* | 
 | 1852 | 	 * Traverse the memory cgroup hierarchy from the victim task's | 
 | 1853 | 	 * cgroup up to the OOMing cgroup (or root) to find the | 
 | 1854 | 	 * highest-level memory cgroup with oom.group set. | 
 | 1855 | 	 */ | 
 | 1856 | 	for (; memcg; memcg = parent_mem_cgroup(memcg)) { | 
 | 1857 | 		if (memcg->oom_group) | 
 | 1858 | 			oom_group = memcg; | 
 | 1859 |  | 
 | 1860 | 		if (memcg == oom_domain) | 
 | 1861 | 			break; | 
 | 1862 | 	} | 
 | 1863 |  | 
 | 1864 | 	if (oom_group) | 
 | 1865 | 		css_get(&oom_group->css); | 
 | 1866 | out: | 
 | 1867 | 	rcu_read_unlock(); | 
 | 1868 |  | 
 | 1869 | 	return oom_group; | 
 | 1870 | } | 
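 |  |  | 
 |  | /* | 
 |  |  * Usage sketch for the helper above (illustrative; "oc" stands in for the | 
 |  |  * OOM killer's struct oom_control and the kill step is elided): | 
 |  |  * | 
 |  |  *	struct mem_cgroup *oom_group; | 
 |  |  * | 
 |  |  *	oom_group = mem_cgroup_get_oom_group(victim, oc->memcg); | 
 |  |  *	if (oom_group) { | 
 |  |  *		mem_cgroup_print_oom_group(oom_group); | 
 |  |  *		...	// kill every OOM-killable task in oom_group | 
 |  |  *		mem_cgroup_put(oom_group);	// drop the reference taken above | 
 |  |  *	} | 
 |  |  */ | 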
 | 1871 |  | 
 | 1872 | void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) | 
 | 1873 | { | 
 | 1874 | 	pr_info("Tasks in "); | 
 | 1875 | 	pr_cont_cgroup_path(memcg->css.cgroup); | 
 | 1876 | 	pr_cont(" are going to be killed due to memory.oom.group set\n"); | 
 | 1877 | } | 
 | 1878 |  | 
 | 1879 | /** | 
 | 1880 |  * lock_page_memcg - lock a page->mem_cgroup binding | 
 | 1881 |  * @page: the page | 
 | 1882 |  * | 
 | 1883 |  * This function protects unlocked LRU pages from being moved to | 
 | 1884 |  * another cgroup. | 
 | 1885 |  * | 
 | 1886 |  * It ensures lifetime of the returned memcg. Caller is responsible | 
 | 1887 |  * for the lifetime of the page; __unlock_page_memcg() is available | 
 | 1888 |  * when @page might get freed inside the locked section. | 
 | 1889 |  */ | 
 | 1890 | struct mem_cgroup *lock_page_memcg(struct page *page) | 
 | 1891 | { | 
 | 1892 | 	struct mem_cgroup *memcg; | 
 | 1893 | 	unsigned long flags; | 
 | 1894 |  | 
 | 1895 | 	/* | 
 | 1896 | 	 * The RCU lock is held throughout the transaction.  The fast | 
 | 1897 | 	 * path can get away without acquiring the memcg->move_lock | 
 | 1898 | 	 * because page moving starts with an RCU grace period. | 
 | 1899 | 	 * | 
 | 1900 | 	 * The RCU lock also protects the memcg from being freed when | 
 | 1901 | 	 * the page state that is going to change is the only thing | 
 | 1902 | 	 * preventing the page itself from being freed. E.g. writeback | 
 | 1903 | 	 * doesn't hold a page reference and relies on PG_writeback to | 
 | 1904 | 	 * keep off truncation, migration and so forth. | 
 | 1905 | 	 */ | 
 | 1906 | 	rcu_read_lock(); | 
 | 1907 |  | 
 | 1908 | 	if (mem_cgroup_disabled()) | 
 | 1909 | 		return NULL; | 
 | 1910 | again: | 
 | 1911 | 	memcg = page->mem_cgroup; | 
 | 1912 | 	if (unlikely(!memcg)) | 
 | 1913 | 		return NULL; | 
 | 1914 |  | 
 | 1915 | 	if (atomic_read(&memcg->moving_account) <= 0) | 
 | 1916 | 		return memcg; | 
 | 1917 |  | 
 | 1918 | 	spin_lock_irqsave(&memcg->move_lock, flags); | 
 | 1919 | 	if (memcg != page->mem_cgroup) { | 
 | 1920 | 		spin_unlock_irqrestore(&memcg->move_lock, flags); | 
 | 1921 | 		goto again; | 
 | 1922 | 	} | 
 | 1923 |  | 
 | 1924 | 	/* | 
 | 1925 | 	 * When charge migration first begins, we can have locked and | 
 | 1926 | 	 * unlocked page stat updates happening concurrently.  Track | 
 | 1927 | 	 * the task who has the lock for unlock_page_memcg(). | 
 | 1928 | 	 */ | 
 | 1929 | 	memcg->move_lock_task = current; | 
 | 1930 | 	memcg->move_lock_flags = flags; | 
 | 1931 |  | 
 | 1932 | 	return memcg; | 
 | 1933 | } | 
 | 1934 | EXPORT_SYMBOL(lock_page_memcg); | 
 | 1935 |  | 
 | 1936 | /** | 
 | 1937 |  * __unlock_page_memcg - unlock and unpin a memcg | 
 | 1938 |  * @memcg: the memcg | 
 | 1939 |  * | 
 | 1940 |  * Unlock and unpin a memcg returned by lock_page_memcg(). | 
 | 1941 |  */ | 
 | 1942 | void __unlock_page_memcg(struct mem_cgroup *memcg) | 
 | 1943 | { | 
 | 1944 | 	if (memcg && memcg->move_lock_task == current) { | 
 | 1945 | 		unsigned long flags = memcg->move_lock_flags; | 
 | 1946 |  | 
 | 1947 | 		memcg->move_lock_task = NULL; | 
 | 1948 | 		memcg->move_lock_flags = 0; | 
 | 1949 |  | 
 | 1950 | 		spin_unlock_irqrestore(&memcg->move_lock, flags); | 
 | 1951 | 	} | 
 | 1952 |  | 
 | 1953 | 	rcu_read_unlock(); | 
 | 1954 | } | 
 | 1955 |  | 
 | 1956 | /** | 
 | 1957 |  * unlock_page_memcg - unlock a page->mem_cgroup binding | 
 | 1958 |  * @page: the page | 
 | 1959 |  */ | 
 | 1960 | void unlock_page_memcg(struct page *page) | 
 | 1961 | { | 
 | 1962 | 	__unlock_page_memcg(page->mem_cgroup); | 
 | 1963 | } | 
 | 1964 | EXPORT_SYMBOL(unlock_page_memcg); | 
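 |  |  | 
 |  | /* | 
 |  |  * Typical usage sketch for the pair above (illustrative only): bracket an | 
 |  |  * update of memcg-accounted page state so the page cannot be moved to | 
 |  |  * another cgroup in the middle of the update: | 
 |  |  * | 
 |  |  *	struct mem_cgroup *memcg; | 
 |  |  * | 
 |  |  *	memcg = lock_page_memcg(page); | 
 |  |  *	if (TestClearPageDirty(page)) | 
 |  |  *		...;	// adjust the dirty-page statistics | 
 |  |  *	unlock_page_memcg(page); | 
 |  |  * | 
 |  |  * Use __unlock_page_memcg(memcg) instead when @page may already be freed | 
 |  |  * by the time the critical section ends. | 
 |  |  */ | 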
 | 1965 |  | 
 | 1966 | struct memcg_stock_pcp { | 
 | 1967 | 	struct mem_cgroup *cached; /* this must never be the root cgroup */ | 
 | 1968 | 	unsigned int nr_pages; | 
 | 1969 | 	struct work_struct work; | 
 | 1970 | 	unsigned long flags; | 
 | 1971 | #define FLUSHING_CACHED_CHARGE	0 | 
 | 1972 | }; | 
 | 1973 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); | 
 | 1974 | static DEFINE_MUTEX(percpu_charge_mutex); | 
 | 1975 |  | 
 | 1976 | /** | 
 | 1977 |  * consume_stock: Try to consume stocked charge on this cpu. | 
 | 1978 |  * @memcg: memcg to consume from. | 
 | 1979 |  * @nr_pages: how many pages to charge. | 
 | 1980 |  * | 
 | 1981 |  * The charges will only happen if @memcg matches the current cpu's memcg | 
 | 1982 |  * stock, and at least @nr_pages are available in that stock.  Failure to | 
 | 1983 |  * service an allocation will refill the stock. | 
 | 1984 |  * | 
 | 1985 |  * returns true if successful, false otherwise. | 
 | 1986 |  */ | 
 | 1987 | static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) | 
 | 1988 | { | 
 | 1989 | 	struct memcg_stock_pcp *stock; | 
 | 1990 | 	unsigned long flags; | 
 | 1991 | 	bool ret = false; | 
 | 1992 |  | 
 | 1993 | 	if (nr_pages > MEMCG_CHARGE_BATCH) | 
 | 1994 | 		return ret; | 
 | 1995 |  | 
 | 1996 | 	local_irq_save(flags); | 
 | 1997 |  | 
 | 1998 | 	stock = this_cpu_ptr(&memcg_stock); | 
 | 1999 | 	if (memcg == stock->cached && stock->nr_pages >= nr_pages) { | 
 | 2000 | 		stock->nr_pages -= nr_pages; | 
 | 2001 | 		ret = true; | 
 | 2002 | 	} | 
 | 2003 |  | 
 | 2004 | 	local_irq_restore(flags); | 
 | 2005 |  | 
 | 2006 | 	return ret; | 
 | 2007 | } | 
 | 2008 |  | 
 | 2009 | /* | 
 | 2010 |  * Return the cached charge to the memcg's page counters and reset the cached information. | 
 | 2011 |  */ | 
 | 2012 | static void drain_stock(struct memcg_stock_pcp *stock) | 
 | 2013 | { | 
 | 2014 | 	struct mem_cgroup *old = stock->cached; | 
 | 2015 |  | 
 | 2016 | 	if (stock->nr_pages) { | 
 | 2017 | 		page_counter_uncharge(&old->memory, stock->nr_pages); | 
 | 2018 | 		if (do_memsw_account()) | 
 | 2019 | 			page_counter_uncharge(&old->memsw, stock->nr_pages); | 
 | 2020 | 		css_put_many(&old->css, stock->nr_pages); | 
 | 2021 | 		stock->nr_pages = 0; | 
 | 2022 | 	} | 
 | 2023 | 	stock->cached = NULL; | 
 | 2024 | } | 
 | 2025 |  | 
 | 2026 | static void drain_local_stock(struct work_struct *dummy) | 
 | 2027 | { | 
 | 2028 | 	struct memcg_stock_pcp *stock; | 
 | 2029 | 	unsigned long flags; | 
 | 2030 |  | 
 | 2031 | 	/* | 
 | 2032 | 	 * The only protection from memory hotplug vs. drain_stock races is | 
 | 2033 | 	 * that we always operate on local CPU stock here with IRQ disabled | 
 | 2034 | 	 */ | 
 | 2035 | 	local_irq_save(flags); | 
 | 2036 |  | 
 | 2037 | 	stock = this_cpu_ptr(&memcg_stock); | 
 | 2038 | 	drain_stock(stock); | 
 | 2039 | 	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); | 
 | 2040 |  | 
 | 2041 | 	local_irq_restore(flags); | 
 | 2042 | } | 
 | 2043 |  | 
 | 2044 | /* | 
 | 2045 |  * Cache @nr_pages charges to the local per-cpu area. | 
 | 2046 |  * They will be consumed by consume_stock() later. | 
 | 2047 |  */ | 
 | 2048 | static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) | 
 | 2049 | { | 
 | 2050 | 	struct memcg_stock_pcp *stock; | 
 | 2051 | 	unsigned long flags; | 
 | 2052 |  | 
 | 2053 | 	local_irq_save(flags); | 
 | 2054 |  | 
 | 2055 | 	stock = this_cpu_ptr(&memcg_stock); | 
 | 2056 | 	if (stock->cached != memcg) { /* reset if necessary */ | 
 | 2057 | 		drain_stock(stock); | 
 | 2058 | 		stock->cached = memcg; | 
 | 2059 | 	} | 
 | 2060 | 	stock->nr_pages += nr_pages; | 
 | 2061 |  | 
 | 2062 | 	if (stock->nr_pages > MEMCG_CHARGE_BATCH) | 
 | 2063 | 		drain_stock(stock); | 
 | 2064 |  | 
 | 2065 | 	local_irq_restore(flags); | 
 | 2066 | } | 
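 |  |  | 
 |  | /* | 
 |  |  * Illustrative interplay of the stock helpers above, assuming the usual | 
 |  |  * MEMCG_CHARGE_BATCH of 32 pages: a single-page try_charge() charges the | 
 |  |  * page counters by the full batch and calls refill_stock(memcg, 31); the | 
 |  |  * next 31 single-page charges from the same memcg on this CPU are then | 
 |  |  * served by consume_stock() without touching the shared page counters. | 
 |  |  */ | 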
 | 2067 |  | 
 | 2068 | /* | 
 | 2069 |  * Drains all per-CPU charge caches for the given root_memcg, i.e. for the | 
 | 2070 |  * whole subtree of the hierarchy under it. | 
 | 2071 |  */ | 
 | 2072 | static void drain_all_stock(struct mem_cgroup *root_memcg) | 
 | 2073 | { | 
 | 2074 | 	int cpu, curcpu; | 
 | 2075 |  | 
 | 2076 | 	/* If someone's already draining, avoid adding more workers. */ | 
 | 2077 | 	if (!mutex_trylock(&percpu_charge_mutex)) | 
 | 2078 | 		return; | 
 | 2079 | 	/* | 
 | 2080 | 	 * Notify other cpus that a system-wide "drain" is running. | 
 | 2081 | 	 * We do not care about races with the cpu hotplug because cpu down | 
 | 2082 | 	 * as well as workers from this path always operate on the local | 
 | 2083 | 	 * per-cpu data. CPU up doesn't touch memcg_stock at all. | 
 | 2084 | 	 */ | 
 | 2085 | 	curcpu = get_cpu(); | 
 | 2086 | 	for_each_online_cpu(cpu) { | 
 | 2087 | 		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); | 
 | 2088 | 		struct mem_cgroup *memcg; | 
 | 2089 |  | 
 | 2090 | 		memcg = stock->cached; | 
 | 2091 | 		if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css)) | 
 | 2092 | 			continue; | 
 | 2093 | 		if (!mem_cgroup_is_descendant(memcg, root_memcg)) { | 
 | 2094 | 			css_put(&memcg->css); | 
 | 2095 | 			continue; | 
 | 2096 | 		} | 
 | 2097 | 		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { | 
 | 2098 | 			if (cpu == curcpu) | 
 | 2099 | 				drain_local_stock(&stock->work); | 
 | 2100 | 			else | 
 | 2101 | 				schedule_work_on(cpu, &stock->work); | 
 | 2102 | 		} | 
 | 2103 | 		css_put(&memcg->css); | 
 | 2104 | 	} | 
 | 2105 | 	put_cpu(); | 
 | 2106 | 	mutex_unlock(&percpu_charge_mutex); | 
 | 2107 | } | 
 | 2108 |  | 
 | 2109 | static int memcg_hotplug_cpu_dead(unsigned int cpu) | 
 | 2110 | { | 
 | 2111 | 	struct memcg_stock_pcp *stock; | 
 | 2112 | 	struct mem_cgroup *memcg; | 
 | 2113 |  | 
 | 2114 | 	stock = &per_cpu(memcg_stock, cpu); | 
 | 2115 | 	drain_stock(stock); | 
 | 2116 |  | 
 | 2117 | 	for_each_mem_cgroup(memcg) { | 
 | 2118 | 		int i; | 
 | 2119 |  | 
 | 2120 | 		for (i = 0; i < MEMCG_NR_STAT; i++) { | 
 | 2121 | 			int nid; | 
 | 2122 | 			long x; | 
 | 2123 |  | 
 | 2124 | 			x = this_cpu_xchg(memcg->stat_cpu->count[i], 0); | 
 | 2125 | 			if (x) | 
 | 2126 | 				atomic_long_add(x, &memcg->stat[i]); | 
 | 2127 |  | 
 | 2128 | 			if (i >= NR_VM_NODE_STAT_ITEMS) | 
 | 2129 | 				continue; | 
 | 2130 |  | 
 | 2131 | 			for_each_node(nid) { | 
 | 2132 | 				struct mem_cgroup_per_node *pn; | 
 | 2133 |  | 
 | 2134 | 				pn = mem_cgroup_nodeinfo(memcg, nid); | 
 | 2135 | 				x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0); | 
 | 2136 | 				if (x) | 
 | 2137 | 					atomic_long_add(x, &pn->lruvec_stat[i]); | 
 | 2138 | 			} | 
 | 2139 | 		} | 
 | 2140 |  | 
 | 2141 | 		for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { | 
 | 2142 | 			long x; | 
 | 2143 |  | 
 | 2144 | 			x = this_cpu_xchg(memcg->stat_cpu->events[i], 0); | 
 | 2145 | 			if (x) | 
 | 2146 | 				atomic_long_add(x, &memcg->events[i]); | 
 | 2147 | 		} | 
 | 2148 | 	} | 
 | 2149 |  | 
 | 2150 | 	return 0; | 
 | 2151 | } | 
 | 2152 |  | 
 | 2153 | static void reclaim_high(struct mem_cgroup *memcg, | 
 | 2154 | 			 unsigned int nr_pages, | 
 | 2155 | 			 gfp_t gfp_mask) | 
 | 2156 | { | 
 | 2157 | 	do { | 
 | 2158 | 		if (page_counter_read(&memcg->memory) <= memcg->high) | 
 | 2159 | 			continue; | 
 | 2160 | 		memcg_memory_event(memcg, MEMCG_HIGH); | 
 | 2161 | 		try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); | 
 | 2162 | 	} while ((memcg = parent_mem_cgroup(memcg))); | 
 | 2163 | } | 
 | 2164 |  | 
 | 2165 | static void high_work_func(struct work_struct *work) | 
 | 2166 | { | 
 | 2167 | 	struct mem_cgroup *memcg; | 
 | 2168 |  | 
 | 2169 | 	memcg = container_of(work, struct mem_cgroup, high_work); | 
 | 2170 | 	reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); | 
 | 2171 | } | 
 | 2172 |  | 
 | 2173 | /* | 
 | 2174 |  * Scheduled by try_charge() to be executed from the userland return path | 
 | 2175 |  * and reclaims memory over the high limit. | 
 | 2176 |  */ | 
 | 2177 | void mem_cgroup_handle_over_high(void) | 
 | 2178 | { | 
 | 2179 | 	unsigned int nr_pages = current->memcg_nr_pages_over_high; | 
 | 2180 | 	struct mem_cgroup *memcg; | 
 | 2181 |  | 
 | 2182 | 	if (likely(!nr_pages)) | 
 | 2183 | 		return; | 
 | 2184 |  | 
 | 2185 | 	memcg = get_mem_cgroup_from_mm(current->mm); | 
 | 2186 | 	reclaim_high(memcg, nr_pages, GFP_KERNEL); | 
 | 2187 | 	css_put(&memcg->css); | 
 | 2188 | 	current->memcg_nr_pages_over_high = 0; | 
 | 2189 | } | 
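 |  |  | 
 |  | /* | 
 |  |  * Sketch of the control flow that leads here (illustrative): try_charge() | 
 |  |  * below only records the overage and requests a callback on the way back | 
 |  |  * to userspace, where no mm or filesystem locks are held: | 
 |  |  * | 
 |  |  *	try_charge(): | 
 |  |  *		current->memcg_nr_pages_over_high += batch; | 
 |  |  *		set_notify_resume(current); | 
 |  |  *	return to userspace -> tracehook_notify_resume(): | 
 |  |  *		mem_cgroup_handle_over_high(); | 
 |  |  */ | 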
 | 2190 |  | 
 | 2191 | static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | 
 | 2192 | 		      unsigned int nr_pages) | 
 | 2193 | { | 
 | 2194 | 	unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages); | 
 | 2195 | 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 
 | 2196 | 	struct mem_cgroup *mem_over_limit; | 
 | 2197 | 	struct page_counter *counter; | 
 | 2198 | 	unsigned long nr_reclaimed; | 
 | 2199 | 	bool may_swap = true; | 
 | 2200 | 	bool drained = false; | 
 | 2201 | 	bool oomed = false; | 
 | 2202 | 	enum oom_status oom_status; | 
 | 2203 |  | 
 | 2204 | 	if (mem_cgroup_is_root(memcg)) | 
 | 2205 | 		return 0; | 
 | 2206 | retry: | 
 | 2207 | 	if (consume_stock(memcg, nr_pages)) | 
 | 2208 | 		return 0; | 
 | 2209 |  | 
 | 2210 | 	if (!do_memsw_account() || | 
 | 2211 | 	    page_counter_try_charge(&memcg->memsw, batch, &counter)) { | 
 | 2212 | 		if (page_counter_try_charge(&memcg->memory, batch, &counter)) | 
 | 2213 | 			goto done_restock; | 
 | 2214 | 		if (do_memsw_account()) | 
 | 2215 | 			page_counter_uncharge(&memcg->memsw, batch); | 
 | 2216 | 		mem_over_limit = mem_cgroup_from_counter(counter, memory); | 
 | 2217 | 	} else { | 
 | 2218 | 		mem_over_limit = mem_cgroup_from_counter(counter, memsw); | 
 | 2219 | 		may_swap = false; | 
 | 2220 | 	} | 
 | 2221 |  | 
 | 2222 | 	if (batch > nr_pages) { | 
 | 2223 | 		batch = nr_pages; | 
 | 2224 | 		goto retry; | 
 | 2225 | 	} | 
 | 2226 |  | 
 | 2227 | 	/* | 
 | 2228 | 	 * Memcg doesn't have a dedicated reserve for atomic | 
 | 2229 | 	 * allocations. But like the global atomic pool, we need to | 
 | 2230 | 	 * put the burden of reclaim on regular allocation requests | 
 | 2231 | 	 * and let these go through as privileged allocations. | 
 | 2232 | 	 */ | 
 | 2233 | 	if (gfp_mask & __GFP_ATOMIC) | 
 | 2234 | 		goto force; | 
 | 2235 |  | 
 | 2236 | 	/* | 
 | 2237 | 	 * Unlike in global OOM situations, memcg is not in a physical | 
 | 2238 | 	 * memory shortage.  Allow dying and OOM-killed tasks to | 
 | 2239 | 	 * bypass the last charges so that they can exit quickly and | 
 | 2240 | 	 * free their memory. | 
 | 2241 | 	 */ | 
 | 2242 | 	if (unlikely(should_force_charge())) | 
 | 2243 | 		goto force; | 
 | 2244 |  | 
 | 2245 | 	/* | 
 | 2246 | 	 * Prevent unbounded recursion when reclaim operations need to | 
 | 2247 | 	 * allocate memory. This might exceed the limits temporarily, | 
 | 2248 | 	 * but we prefer facilitating memory reclaim and getting back | 
 | 2249 | 	 * under the limit over triggering OOM kills in these cases. | 
 | 2250 | 	 */ | 
 | 2251 | 	if (unlikely(current->flags & PF_MEMALLOC)) | 
 | 2252 | 		goto force; | 
 | 2253 |  | 
 | 2254 | 	if (unlikely(task_in_memcg_oom(current))) | 
 | 2255 | 		goto nomem; | 
 | 2256 |  | 
 | 2257 | 	if (!gfpflags_allow_blocking(gfp_mask)) | 
 | 2258 | 		goto nomem; | 
 | 2259 |  | 
 | 2260 | 	memcg_memory_event(mem_over_limit, MEMCG_MAX); | 
 | 2261 |  | 
 | 2262 | 	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, | 
 | 2263 | 						    gfp_mask, may_swap); | 
 | 2264 |  | 
 | 2265 | 	if (mem_cgroup_margin(mem_over_limit) >= nr_pages) | 
 | 2266 | 		goto retry; | 
 | 2267 |  | 
 | 2268 | 	if (!drained) { | 
 | 2269 | 		drain_all_stock(mem_over_limit); | 
 | 2270 | 		drained = true; | 
 | 2271 | 		goto retry; | 
 | 2272 | 	} | 
 | 2273 |  | 
 | 2274 | 	if (gfp_mask & __GFP_NORETRY) | 
 | 2275 | 		goto nomem; | 
 | 2276 | 	/* | 
 | 2277 | 	 * Even though the limit is exceeded at this point, reclaim | 
 | 2278 | 	 * may have been able to free some pages.  Retry the charge | 
 | 2279 | 	 * before killing the task. | 
 | 2280 | 	 * | 
 | 2281 | 	 * Only for regular pages, though: huge pages are rather | 
 | 2282 | 	 * unlikely to succeed so close to the limit, and we fall back | 
 | 2283 | 	 * to regular pages anyway in case of failure. | 
 | 2284 | 	 */ | 
 | 2285 | 	if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) | 
 | 2286 | 		goto retry; | 
 | 2287 | 	/* | 
 | 2288 | 	 * During task move, charges can be double-counted. So it's better | 
 | 2289 | 	 * to wait until the end of the task move if one is in progress. | 
 | 2290 | 	 */ | 
 | 2291 | 	if (mem_cgroup_wait_acct_move(mem_over_limit)) | 
 | 2292 | 		goto retry; | 
 | 2293 |  | 
 | 2294 | 	if (nr_retries--) | 
 | 2295 | 		goto retry; | 
 | 2296 |  | 
 | 2297 | 	if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed) | 
 | 2298 | 		goto nomem; | 
 | 2299 |  | 
 | 2300 | 	if (gfp_mask & __GFP_NOFAIL) | 
 | 2301 | 		goto force; | 
 | 2302 |  | 
 | 2303 | 	if (fatal_signal_pending(current)) | 
 | 2304 | 		goto force; | 
 | 2305 |  | 
 | 2306 | 	memcg_memory_event(mem_over_limit, MEMCG_OOM); | 
 | 2307 |  | 
 | 2308 | 	/* | 
 | 2309 | 	 * Keep retrying as long as the memcg OOM killer is able to make | 
 | 2310 | 	 * forward progress, or bypass the charge if the OOM killer | 
 | 2311 | 	 * couldn't make any progress. | 
 | 2312 | 	 */ | 
 | 2313 | 	oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask, | 
 | 2314 | 		       get_order(nr_pages * PAGE_SIZE)); | 
 | 2315 | 	switch (oom_status) { | 
 | 2316 | 	case OOM_SUCCESS: | 
 | 2317 | 		nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 
 | 2318 | 		oomed = true; | 
 | 2319 | 		goto retry; | 
 | 2320 | 	case OOM_FAILED: | 
 | 2321 | 		goto force; | 
 | 2322 | 	default: | 
 | 2323 | 		goto nomem; | 
 | 2324 | 	} | 
 | 2325 | nomem: | 
 | 2326 | 	if (!(gfp_mask & __GFP_NOFAIL)) | 
 | 2327 | 		return -ENOMEM; | 
 | 2328 | force: | 
 | 2329 | 	/* | 
 | 2330 | 	 * The allocation either can't fail or will lead to more memory | 
 | 2331 | 	 * being freed very soon.  Allow memory usage to go over the limit | 
 | 2332 | 	 * temporarily by force charging it. | 
 | 2333 | 	 */ | 
 | 2334 | 	page_counter_charge(&memcg->memory, nr_pages); | 
 | 2335 | 	if (do_memsw_account()) | 
 | 2336 | 		page_counter_charge(&memcg->memsw, nr_pages); | 
 | 2337 | 	css_get_many(&memcg->css, nr_pages); | 
 | 2338 |  | 
 | 2339 | 	return 0; | 
 | 2340 |  | 
 | 2341 | done_restock: | 
 | 2342 | 	css_get_many(&memcg->css, batch); | 
 | 2343 | 	if (batch > nr_pages) | 
 | 2344 | 		refill_stock(memcg, batch - nr_pages); | 
 | 2345 |  | 
 | 2346 | 	/* | 
 | 2347 | 	 * If the hierarchy is above the normal consumption range, schedule | 
 | 2348 | 	 * reclaim on returning to userland.  We can perform reclaim here | 
 | 2349 | 	 * if __GFP_RECLAIM but let's always punt for simplicity and so that | 
 | 2350 | 	 * GFP_KERNEL can consistently be used during reclaim.  @memcg is | 
 | 2351 | 	 * not recorded as it most likely matches current's and won't | 
 | 2352 | 	 * change in the meantime.  As high limit is checked again before | 
 | 2353 | 	 * reclaim, the cost of mismatch is negligible. | 
 | 2354 | 	 */ | 
 | 2355 | 	do { | 
 | 2356 | 		if (page_counter_read(&memcg->memory) > memcg->high) { | 
 | 2357 | 			/* Don't bother a random interrupted task */ | 
 | 2358 | 			if (in_interrupt()) { | 
 | 2359 | 				schedule_work(&memcg->high_work); | 
 | 2360 | 				break; | 
 | 2361 | 			} | 
 | 2362 | 			current->memcg_nr_pages_over_high += batch; | 
 | 2363 | 			set_notify_resume(current); | 
 | 2364 | 			break; | 
 | 2365 | 		} | 
 | 2366 | 	} while ((memcg = parent_mem_cgroup(memcg))); | 
 | 2367 |  | 
 | 2368 | 	return 0; | 
 | 2369 | } | 
 | 2370 |  | 
 | 2371 | static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) | 
 | 2372 | { | 
 | 2373 | 	if (mem_cgroup_is_root(memcg)) | 
 | 2374 | 		return; | 
 | 2375 |  | 
 | 2376 | 	page_counter_uncharge(&memcg->memory, nr_pages); | 
 | 2377 | 	if (do_memsw_account()) | 
 | 2378 | 		page_counter_uncharge(&memcg->memsw, nr_pages); | 
 | 2379 |  | 
 | 2380 | 	css_put_many(&memcg->css, nr_pages); | 
 | 2381 | } | 
 | 2382 |  | 
 | 2383 | static void lock_page_lru(struct page *page, int *isolated) | 
 | 2384 | { | 
 | 2385 | 	struct zone *zone = page_zone(page); | 
 | 2386 |  | 
 | 2387 | 	spin_lock_irq(zone_lru_lock(zone)); | 
 | 2388 | 	if (PageLRU(page)) { | 
 | 2389 | 		struct lruvec *lruvec; | 
 | 2390 |  | 
 | 2391 | 		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); | 
 | 2392 | 		ClearPageLRU(page); | 
 | 2393 | 		del_page_from_lru_list(page, lruvec, page_lru(page)); | 
 | 2394 | 		*isolated = 1; | 
 | 2395 | 	} else | 
 | 2396 | 		*isolated = 0; | 
 | 2397 | } | 
 | 2398 |  | 
 | 2399 | static void unlock_page_lru(struct page *page, int isolated) | 
 | 2400 | { | 
 | 2401 | 	struct zone *zone = page_zone(page); | 
 | 2402 |  | 
 | 2403 | 	if (isolated) { | 
 | 2404 | 		struct lruvec *lruvec; | 
 | 2405 |  | 
 | 2406 | 		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat); | 
 | 2407 | 		VM_BUG_ON_PAGE(PageLRU(page), page); | 
 | 2408 | 		SetPageLRU(page); | 
 | 2409 | 		add_page_to_lru_list(page, lruvec, page_lru(page)); | 
 | 2410 | 	} | 
 | 2411 | 	spin_unlock_irq(zone_lru_lock(zone)); | 
 | 2412 | } | 
 | 2413 |  | 
 | 2414 | static void commit_charge(struct page *page, struct mem_cgroup *memcg, | 
 | 2415 | 			  bool lrucare) | 
 | 2416 | { | 
 | 2417 | 	int isolated; | 
 | 2418 |  | 
 | 2419 | 	VM_BUG_ON_PAGE(page->mem_cgroup, page); | 
 | 2420 |  | 
 | 2421 | 	/* | 
 | 2422 | 	 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page | 
 | 2423 | 	 * may already be on some other mem_cgroup's LRU.  Take care of it. | 
 | 2424 | 	 */ | 
 | 2425 | 	if (lrucare) | 
 | 2426 | 		lock_page_lru(page, &isolated); | 
 | 2427 |  | 
 | 2428 | 	/* | 
 | 2429 | 	 * Nobody should be changing or seriously looking at | 
 | 2430 | 	 * page->mem_cgroup at this point: | 
 | 2431 | 	 * | 
 | 2432 | 	 * - the page is uncharged | 
 | 2433 | 	 * | 
 | 2434 | 	 * - the page is off-LRU | 
 | 2435 | 	 * | 
 | 2436 | 	 * - an anonymous fault has exclusive page access, except for | 
 | 2437 | 	 *   a locked page table | 
 | 2438 | 	 * | 
 | 2439 | 	 * - a page cache insertion, a swapin fault, or a migration | 
 | 2440 | 	 *   have the page locked | 
 | 2441 | 	 */ | 
 | 2442 | 	page->mem_cgroup = memcg; | 
 | 2443 |  | 
 | 2444 | 	if (lrucare) | 
 | 2445 | 		unlock_page_lru(page, isolated); | 
 | 2446 | } | 
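 |  |  | 
 |  | /* | 
 |  |  * Illustrative shape of the charge protocol built from the helpers above | 
 |  |  * (the public mem_cgroup_try_charge()/commit/cancel entry points elsewhere | 
 |  |  * in this file follow the same pattern; setup_fails and out_fail are | 
 |  |  * placeholders): | 
 |  |  * | 
 |  |  *	if (try_charge(memcg, gfp_mask, nr_pages))	// reserve the pages | 
 |  |  *		goto out_fail; | 
 |  |  *	if (setup_fails) { | 
 |  |  *		cancel_charge(memcg, nr_pages);		// return the reservation | 
 |  |  *		goto out_fail; | 
 |  |  *	} | 
 |  |  *	commit_charge(page, memcg, lrucare);		// bind page->mem_cgroup | 
 |  |  */ | 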
 | 2447 |  | 
 | 2448 | #ifdef CONFIG_MEMCG_KMEM | 
 | 2449 | static int memcg_alloc_cache_id(void) | 
 | 2450 | { | 
 | 2451 | 	int id, size; | 
 | 2452 | 	int err; | 
 | 2453 |  | 
 | 2454 | 	id = ida_simple_get(&memcg_cache_ida, | 
 | 2455 | 			    0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); | 
 | 2456 | 	if (id < 0) | 
 | 2457 | 		return id; | 
 | 2458 |  | 
 | 2459 | 	if (id < memcg_nr_cache_ids) | 
 | 2460 | 		return id; | 
 | 2461 |  | 
 | 2462 | 	/* | 
 | 2463 | 	 * There's no space for the new id in memcg_caches arrays, | 
 | 2464 | 	 * so we have to grow them. | 
 | 2465 | 	 */ | 
 | 2466 | 	down_write(&memcg_cache_ids_sem); | 
 | 2467 |  | 
 | 2468 | 	size = 2 * (id + 1); | 
 | 2469 | 	if (size < MEMCG_CACHES_MIN_SIZE) | 
 | 2470 | 		size = MEMCG_CACHES_MIN_SIZE; | 
 | 2471 | 	else if (size > MEMCG_CACHES_MAX_SIZE) | 
 | 2472 | 		size = MEMCG_CACHES_MAX_SIZE; | 
 | 2473 |  | 
 | 2474 | 	err = memcg_update_all_caches(size); | 
 | 2475 | 	if (!err) | 
 | 2476 | 		err = memcg_update_all_list_lrus(size); | 
 | 2477 | 	if (!err) | 
 | 2478 | 		memcg_nr_cache_ids = size; | 
 | 2479 |  | 
 | 2480 | 	up_write(&memcg_cache_ids_sem); | 
 | 2481 |  | 
 | 2482 | 	if (err) { | 
 | 2483 | 		ida_simple_remove(&memcg_cache_ida, id); | 
 | 2484 | 		return err; | 
 | 2485 | 	} | 
 | 2486 | 	return id; | 
 | 2487 | } | 
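 |  |  | 
 |  | /* | 
 |  |  * Worked example for the array growth above (illustrative): if | 
 |  |  * memcg_nr_cache_ids is currently 8 and the ida hands out id 8, the | 
 |  |  * memcg_caches arrays are grown to 2 * (8 + 1) == 18 entries, clamped | 
 |  |  * between MEMCG_CACHES_MIN_SIZE and MEMCG_CACHES_MAX_SIZE; any id below | 
 |  |  * memcg_nr_cache_ids is returned without resizing anything. | 
 |  |  */ | 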
 | 2488 |  | 
 | 2489 | static void memcg_free_cache_id(int id) | 
 | 2490 | { | 
 | 2491 | 	ida_simple_remove(&memcg_cache_ida, id); | 
 | 2492 | } | 
 | 2493 |  | 
 | 2494 | struct memcg_kmem_cache_create_work { | 
 | 2495 | 	struct mem_cgroup *memcg; | 
 | 2496 | 	struct kmem_cache *cachep; | 
 | 2497 | 	struct work_struct work; | 
 | 2498 | }; | 
 | 2499 |  | 
 | 2500 | static void memcg_kmem_cache_create_func(struct work_struct *w) | 
 | 2501 | { | 
 | 2502 | 	struct memcg_kmem_cache_create_work *cw = | 
 | 2503 | 		container_of(w, struct memcg_kmem_cache_create_work, work); | 
 | 2504 | 	struct mem_cgroup *memcg = cw->memcg; | 
 | 2505 | 	struct kmem_cache *cachep = cw->cachep; | 
 | 2506 |  | 
 | 2507 | 	memcg_create_kmem_cache(memcg, cachep); | 
 | 2508 |  | 
 | 2509 | 	css_put(&memcg->css); | 
 | 2510 | 	kfree(cw); | 
 | 2511 | } | 
 | 2512 |  | 
 | 2513 | /* | 
 | 2514 |  * Enqueue the creation of a per-memcg kmem_cache. | 
 | 2515 |  */ | 
 | 2516 | static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, | 
 | 2517 | 					       struct kmem_cache *cachep) | 
 | 2518 | { | 
 | 2519 | 	struct memcg_kmem_cache_create_work *cw; | 
 | 2520 |  | 
 | 2521 | 	cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN); | 
 | 2522 | 	if (!cw) | 
 | 2523 | 		return; | 
 | 2524 |  | 
 | 2525 | 	css_get(&memcg->css); | 
 | 2526 |  | 
 | 2527 | 	cw->memcg = memcg; | 
 | 2528 | 	cw->cachep = cachep; | 
 | 2529 | 	INIT_WORK(&cw->work, memcg_kmem_cache_create_func); | 
 | 2530 |  | 
 | 2531 | 	queue_work(memcg_kmem_cache_wq, &cw->work); | 
 | 2532 | } | 
 | 2533 |  | 
 | 2534 | static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, | 
 | 2535 | 					     struct kmem_cache *cachep) | 
 | 2536 | { | 
 | 2537 | 	/* | 
 | 2538 | 	 * We need to stop accounting when we kmalloc, because if the | 
 | 2539 | 	 * corresponding kmalloc cache is not yet created, the first allocation | 
 | 2540 | 	 * in __memcg_schedule_kmem_cache_create will recurse. | 
 | 2541 | 	 * | 
 | 2542 | 	 * However, it is better to enclose the whole function. Depending on | 
 | 2543 | 	 * the debugging options enabled, INIT_WORK(), for instance, can | 
 | 2544 | 	 * trigger an allocation. This, too, will make us recurse. Because at | 
 | 2545 | 	 * this point we can't allow ourselves back into memcg_kmem_get_cache, | 
 | 2546 | 	 * the safest choice is to do it like this, wrapping the whole function. | 
 | 2547 | 	 */ | 
 | 2548 | 	current->memcg_kmem_skip_account = 1; | 
 | 2549 | 	__memcg_schedule_kmem_cache_create(memcg, cachep); | 
 | 2550 | 	current->memcg_kmem_skip_account = 0; | 
 | 2551 | } | 
 | 2552 |  | 
 | 2553 | static inline bool memcg_kmem_bypass(void) | 
 | 2554 | { | 
 | 2555 | 	if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD)) | 
 | 2556 | 		return true; | 
 | 2557 | 	return false; | 
 | 2558 | } | 
 | 2559 |  | 
 | 2560 | /** | 
 | 2561 |  * memcg_kmem_get_cache: select the correct per-memcg cache for allocation | 
 | 2562 |  * @cachep: the original global kmem cache | 
 | 2563 |  * | 
 | 2564 |  * Return the kmem_cache we're supposed to use for a slab allocation. | 
 | 2565 |  * We try to use the current memcg's version of the cache. | 
 | 2566 |  * | 
 | 2567 |  * If the cache does not exist yet, if we are the first user of it, we | 
 | 2568 |  * create it asynchronously in a workqueue and let the current allocation | 
 | 2569 |  * go through with the original cache. | 
 | 2570 |  * | 
 | 2571 |  * This function takes a reference to the cache it returns to assure it | 
 | 2572 |  * won't get destroyed while we are working with it. Once the caller is | 
 | 2573 |  * done with it, memcg_kmem_put_cache() must be called to release the | 
 | 2574 |  * reference. | 
 | 2575 |  */ | 
 | 2576 | struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep) | 
 | 2577 | { | 
 | 2578 | 	struct mem_cgroup *memcg; | 
 | 2579 | 	struct kmem_cache *memcg_cachep; | 
 | 2580 | 	int kmemcg_id; | 
 | 2581 |  | 
 | 2582 | 	VM_BUG_ON(!is_root_cache(cachep)); | 
 | 2583 |  | 
 | 2584 | 	if (memcg_kmem_bypass()) | 
 | 2585 | 		return cachep; | 
 | 2586 |  | 
 | 2587 | 	if (current->memcg_kmem_skip_account) | 
 | 2588 | 		return cachep; | 
 | 2589 |  | 
 | 2590 | 	memcg = get_mem_cgroup_from_current(); | 
 | 2591 | 	kmemcg_id = READ_ONCE(memcg->kmemcg_id); | 
 | 2592 | 	if (kmemcg_id < 0) | 
 | 2593 | 		goto out; | 
 | 2594 |  | 
 | 2595 | 	memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id); | 
 | 2596 | 	if (likely(memcg_cachep)) | 
 | 2597 | 		return memcg_cachep; | 
 | 2598 |  | 
 | 2599 | 	/* | 
 | 2600 | 	 * If we are in a safe context (can wait, and not in interrupt | 
 | 2601 | 	 * context), we could be predictable and return right away. | 
 | 2602 | 	 * This would guarantee that the allocation being performed | 
 | 2603 | 	 * already belongs in the new cache. | 
 | 2604 | 	 * | 
 | 2605 | 	 * However, there are some clashes that can arise from locking. | 
 | 2606 | 	 * For instance, because we acquire the slab_mutex while doing | 
 | 2607 | 	 * memcg_create_kmem_cache, this means no further allocation | 
 | 2608 | 	 * could happen with the slab_mutex held. So it's better to | 
 | 2609 | 	 * defer everything. | 
 | 2610 | 	 */ | 
 | 2611 | 	memcg_schedule_kmem_cache_create(memcg, cachep); | 
 | 2612 | out: | 
 | 2613 | 	css_put(&memcg->css); | 
 | 2614 | 	return cachep; | 
 | 2615 | } | 
 | 2616 |  | 
 | 2617 | /** | 
 | 2618 |  * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache | 
 | 2619 |  * @cachep: the cache returned by memcg_kmem_get_cache | 
 | 2620 |  */ | 
 | 2621 | void memcg_kmem_put_cache(struct kmem_cache *cachep) | 
 | 2622 | { | 
 | 2623 | 	if (!is_root_cache(cachep)) | 
 | 2624 | 		css_put(&cachep->memcg_params.memcg->css); | 
 | 2625 | } | 
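 |  |  | 
 |  | /* | 
 |  |  * Usage sketch for the pair above (illustrative; in practice the slab | 
 |  |  * allocator hooks call these around accounted allocations): | 
 |  |  * | 
 |  |  *	struct kmem_cache *s; | 
 |  |  *	void *objp; | 
 |  |  * | 
 |  |  *	s = memcg_kmem_get_cache(cachep);	// may return cachep unchanged | 
 |  |  *	objp = kmem_cache_alloc(s, gfp); | 
 |  |  *	memcg_kmem_put_cache(s);		// drop the reference, if any was taken | 
 |  |  */ | 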
 | 2626 |  | 
 | 2627 | /** | 
 | 2628 |  * memcg_kmem_charge_memcg: charge a kmem page | 
 | 2629 |  * @page: page to charge | 
 | 2630 |  * @gfp: reclaim mode | 
 | 2631 |  * @order: allocation order | 
 | 2632 |  * @memcg: memory cgroup to charge | 
 | 2633 |  * | 
 | 2634 |  * Returns 0 on success, an error code on failure. | 
 | 2635 |  */ | 
 | 2636 | int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, | 
 | 2637 | 			    struct mem_cgroup *memcg) | 
 | 2638 | { | 
 | 2639 | 	unsigned int nr_pages = 1 << order; | 
 | 2640 | 	struct page_counter *counter; | 
 | 2641 | 	int ret; | 
 | 2642 |  | 
 | 2643 | 	ret = try_charge(memcg, gfp, nr_pages); | 
 | 2644 | 	if (ret) | 
 | 2645 | 		return ret; | 
 | 2646 |  | 
 | 2647 | 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && | 
 | 2648 | 	    !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { | 
 | 2649 |  | 
 | 2650 | 		/* | 
 | 2651 | 		 * Enforce __GFP_NOFAIL allocation because callers are not | 
 | 2652 | 		 * prepared to see failures and likely do not have any failure | 
 | 2653 | 		 * handling code. | 
 | 2654 | 		 */ | 
 | 2655 | 		if (gfp & __GFP_NOFAIL) { | 
 | 2656 | 			page_counter_charge(&memcg->kmem, nr_pages); | 
 | 2657 | 			return 0; | 
 | 2658 | 		} | 
 | 2659 | 		cancel_charge(memcg, nr_pages); | 
 | 2660 | 		return -ENOMEM; | 
 | 2661 | 	} | 
 | 2662 |  | 
 | 2663 | 	page->mem_cgroup = memcg; | 
 | 2664 |  | 
 | 2665 | 	return 0; | 
 | 2666 | } | 
 | 2667 |  | 
 | 2668 | /** | 
 | 2669 |  * memcg_kmem_charge: charge a kmem page to the current memory cgroup | 
 | 2670 |  * @page: page to charge | 
 | 2671 |  * @gfp: reclaim mode | 
 | 2672 |  * @order: allocation order | 
 | 2673 |  * | 
 | 2674 |  * Returns 0 on success, an error code on failure. | 
 | 2675 |  */ | 
 | 2676 | int memcg_kmem_charge(struct page *page, gfp_t gfp, int order) | 
 | 2677 | { | 
 | 2678 | 	struct mem_cgroup *memcg; | 
 | 2679 | 	int ret = 0; | 
 | 2680 |  | 
 | 2681 | 	if (mem_cgroup_disabled() || memcg_kmem_bypass()) | 
 | 2682 | 		return 0; | 
 | 2683 |  | 
 | 2684 | 	memcg = get_mem_cgroup_from_current(); | 
 | 2685 | 	if (!mem_cgroup_is_root(memcg)) { | 
 | 2686 | 		ret = memcg_kmem_charge_memcg(page, gfp, order, memcg); | 
 | 2687 | 		if (!ret) | 
 | 2688 | 			__SetPageKmemcg(page); | 
 | 2689 | 	} | 
 | 2690 | 	css_put(&memcg->css); | 
 | 2691 | 	return ret; | 
 | 2692 | } | 
 | 2693 | /** | 
 | 2694 |  * memcg_kmem_uncharge: uncharge a kmem page | 
 | 2695 |  * @page: page to uncharge | 
 | 2696 |  * @order: allocation order | 
 | 2697 |  */ | 
 | 2698 | void memcg_kmem_uncharge(struct page *page, int order) | 
 | 2699 | { | 
 | 2700 | 	struct mem_cgroup *memcg = page->mem_cgroup; | 
 | 2701 | 	unsigned int nr_pages = 1 << order; | 
 | 2702 |  | 
 | 2703 | 	if (!memcg) | 
 | 2704 | 		return; | 
 | 2705 |  | 
 | 2706 | 	VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); | 
 | 2707 |  | 
 | 2708 | 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) | 
 | 2709 | 		page_counter_uncharge(&memcg->kmem, nr_pages); | 
 | 2710 |  | 
 | 2711 | 	page_counter_uncharge(&memcg->memory, nr_pages); | 
 | 2712 | 	if (do_memsw_account()) | 
 | 2713 | 		page_counter_uncharge(&memcg->memsw, nr_pages); | 
 | 2714 |  | 
 | 2715 | 	page->mem_cgroup = NULL; | 
 | 2716 |  | 
 | 2717 | 	/* slab pages do not have PageKmemcg flag set */ | 
 | 2718 | 	if (PageKmemcg(page)) | 
 | 2719 | 		__ClearPageKmemcg(page); | 
 | 2720 |  | 
 | 2721 | 	css_put_many(&memcg->css, nr_pages); | 
 | 2722 | } | 
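 |  |  | 
 |  | /* | 
 |  |  * Illustrative pairing of the kmem charge helpers above for a __GFP_ACCOUNT | 
 |  |  * page allocation (a sketch of what the page allocator does, not a verbatim | 
 |  |  * copy): | 
 |  |  * | 
 |  |  *	page = ...;					// core page allocation | 
 |  |  *	if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT) && page && | 
 |  |  *	    memcg_kmem_charge(page, gfp, order)) { | 
 |  |  *		__free_pages(page, order); | 
 |  |  *		page = NULL; | 
 |  |  *	} | 
 |  |  *	... | 
 |  |  *	memcg_kmem_uncharge(page, order);		// on the matching free path | 
 |  |  */ | 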
 | 2723 | #endif /* CONFIG_MEMCG_KMEM */ | 
 | 2724 |  | 
 | 2725 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 
 | 2726 |  | 
 | 2727 | /* | 
 | 2728 |  * Because tail pages are not charged individually, copy the memcg binding | 
 | 2729 |  * from the head page to each tail page. We're under zone_lru_lock and | 
 |  |  * migration entries are set up in all page mappings. | 
 | 2730 |  */ | 
 | 2731 | void mem_cgroup_split_huge_fixup(struct page *head) | 
 | 2732 | { | 
 | 2733 | 	int i; | 
 | 2734 |  | 
 | 2735 | 	if (mem_cgroup_disabled()) | 
 | 2736 | 		return; | 
 | 2737 |  | 
 | 2738 | 	for (i = 1; i < HPAGE_PMD_NR; i++) | 
 | 2739 | 		head[i].mem_cgroup = head->mem_cgroup; | 
 | 2740 |  | 
 | 2741 | 	__mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR); | 
 | 2742 | } | 
 | 2743 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 
 | 2744 |  | 
 | 2745 | #ifdef CONFIG_MEMCG_SWAP | 
 | 2746 | /** | 
 | 2747 |  * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. | 
 | 2748 |  * @entry: swap entry to be moved | 
 | 2749 |  * @from:  mem_cgroup which the entry is moved from | 
 | 2750 |  * @to:  mem_cgroup which the entry is moved to | 
 | 2751 |  * | 
 | 2752 |  * It succeeds only when the swap_cgroup's record for this entry is the same | 
 | 2753 |  * as the mem_cgroup's id of @from. | 
 | 2754 |  * | 
 | 2755 |  * Returns 0 on success, -EINVAL on failure. | 
 | 2756 |  * | 
 | 2757 |  * The caller must have charged to @to, IOW, called page_counter_charge() on | 
 | 2758 |  * both memory and memsw, and called css_get(). | 
 | 2759 |  */ | 
 | 2760 | static int mem_cgroup_move_swap_account(swp_entry_t entry, | 
 | 2761 | 				struct mem_cgroup *from, struct mem_cgroup *to) | 
 | 2762 | { | 
 | 2763 | 	unsigned short old_id, new_id; | 
 | 2764 |  | 
 | 2765 | 	old_id = mem_cgroup_id(from); | 
 | 2766 | 	new_id = mem_cgroup_id(to); | 
 | 2767 |  | 
 | 2768 | 	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { | 
 | 2769 | 		mod_memcg_state(from, MEMCG_SWAP, -1); | 
 | 2770 | 		mod_memcg_state(to, MEMCG_SWAP, 1); | 
 | 2771 | 		return 0; | 
 | 2772 | 	} | 
 | 2773 | 	return -EINVAL; | 
 | 2774 | } | 
 | 2775 | #else | 
 | 2776 | static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | 
 | 2777 | 				struct mem_cgroup *from, struct mem_cgroup *to) | 
 | 2778 | { | 
 | 2779 | 	return -EINVAL; | 
 | 2780 | } | 
 | 2781 | #endif | 
 | 2782 |  | 
 | 2783 | static DEFINE_MUTEX(memcg_max_mutex); | 
 | 2784 |  | 
 | 2785 | static int mem_cgroup_resize_max(struct mem_cgroup *memcg, | 
 | 2786 | 				 unsigned long max, bool memsw) | 
 | 2787 | { | 
 | 2788 | 	bool enlarge = false; | 
 | 2789 | 	bool drained = false; | 
 | 2790 | 	int ret; | 
 | 2791 | 	bool limits_invariant; | 
 | 2792 | 	struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; | 
 | 2793 |  | 
 | 2794 | 	do { | 
 | 2795 | 		if (signal_pending(current)) { | 
 | 2796 | 			ret = -EINTR; | 
 | 2797 | 			break; | 
 | 2798 | 		} | 
 | 2799 |  | 
 | 2800 | 		mutex_lock(&memcg_max_mutex); | 
 | 2801 | 		/* | 
 | 2802 | 		 * Make sure that the new limit (memsw or memory limit) doesn't | 
 | 2803 | 		 * break our basic invariant rule memory.max <= memsw.max. | 
 | 2804 | 		 */ | 
 | 2805 | 		limits_invariant = memsw ? max >= memcg->memory.max : | 
 | 2806 | 					   max <= memcg->memsw.max; | 
 | 2807 | 		if (!limits_invariant) { | 
 | 2808 | 			mutex_unlock(&memcg_max_mutex); | 
 | 2809 | 			ret = -EINVAL; | 
 | 2810 | 			break; | 
 | 2811 | 		} | 
 | 2812 | 		if (max > counter->max) | 
 | 2813 | 			enlarge = true; | 
 | 2814 | 		ret = page_counter_set_max(counter, max); | 
 | 2815 | 		mutex_unlock(&memcg_max_mutex); | 
 | 2816 |  | 
 | 2817 | 		if (!ret) | 
 | 2818 | 			break; | 
 | 2819 |  | 
 | 2820 | 		if (!drained) { | 
 | 2821 | 			drain_all_stock(memcg); | 
 | 2822 | 			drained = true; | 
 | 2823 | 			continue; | 
 | 2824 | 		} | 
 | 2825 |  | 
 | 2826 | 		if (!try_to_free_mem_cgroup_pages(memcg, 1, | 
 | 2827 | 					GFP_KERNEL, !memsw)) { | 
 | 2828 | 			ret = -EBUSY; | 
 | 2829 | 			break; | 
 | 2830 | 		} | 
 | 2831 | 	} while (true); | 
 | 2832 |  | 
 | 2833 | 	if (!ret && enlarge) | 
 | 2834 | 		memcg_oom_recover(memcg); | 
 | 2835 |  | 
 | 2836 | 	return ret; | 
 | 2837 | } | 
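 |  |  | 
 |  | /* | 
 |  |  * Worked example for the invariant enforced above (illustrative values): | 
 |  |  * with memory.max at 1G and memsw.max at 2G, lowering memsw.max below 1G | 
 |  |  * or raising memory.max above 2G fails with -EINVAL, because | 
 |  |  * memory.max <= memsw.max must hold at all times. | 
 |  |  */ | 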
 | 2838 |  | 
 | 2839 | unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, | 
 | 2840 | 					    gfp_t gfp_mask, | 
 | 2841 | 					    unsigned long *total_scanned) | 
 | 2842 | { | 
 | 2843 | 	unsigned long nr_reclaimed = 0; | 
 | 2844 | 	struct mem_cgroup_per_node *mz, *next_mz = NULL; | 
 | 2845 | 	unsigned long reclaimed; | 
 | 2846 | 	int loop = 0; | 
 | 2847 | 	struct mem_cgroup_tree_per_node *mctz; | 
 | 2848 | 	unsigned long excess; | 
 | 2849 | 	unsigned long nr_scanned; | 
 | 2850 |  | 
 | 2851 | 	if (order > 0) | 
 | 2852 | 		return 0; | 
 | 2853 |  | 
 | 2854 | 	mctz = soft_limit_tree_node(pgdat->node_id); | 
 | 2855 |  | 
 | 2856 | 	/* | 
 | 2857 | 	 * Do not even bother to check the largest node if the root | 
 | 2858 | 	 * is empty. Do it lockless to prevent lock bouncing. Races | 
 | 2859 | 	 * are acceptable as soft limit is best effort anyway. | 
 | 2860 | 	 */ | 
 | 2861 | 	if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root)) | 
 | 2862 | 		return 0; | 
 | 2863 |  | 
 | 2864 | 	/* | 
 | 2865 | 	 * This loop can run for a while, especially if memory cgroups | 
 | 2866 | 	 * continuously keep exceeding their soft limit and putting the | 
 | 2867 | 	 * system under pressure. | 
 | 2868 | 	 */ | 
 | 2869 | 	do { | 
 | 2870 | 		if (next_mz) | 
 | 2871 | 			mz = next_mz; | 
 | 2872 | 		else | 
 | 2873 | 			mz = mem_cgroup_largest_soft_limit_node(mctz); | 
 | 2874 | 		if (!mz) | 
 | 2875 | 			break; | 
 | 2876 |  | 
 | 2877 | 		nr_scanned = 0; | 
 | 2878 | 		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat, | 
 | 2879 | 						    gfp_mask, &nr_scanned); | 
 | 2880 | 		nr_reclaimed += reclaimed; | 
 | 2881 | 		*total_scanned += nr_scanned; | 
 | 2882 | 		spin_lock_irq(&mctz->lock); | 
 | 2883 | 		__mem_cgroup_remove_exceeded(mz, mctz); | 
 | 2884 |  | 
 | 2885 | 		/* | 
 | 2886 | 		 * If we failed to reclaim anything from this memory cgroup | 
 | 2887 | 		 * it is time to move on to the next cgroup | 
 | 2888 | 		 */ | 
 | 2889 | 		next_mz = NULL; | 
 | 2890 | 		if (!reclaimed) | 
 | 2891 | 			next_mz = __mem_cgroup_largest_soft_limit_node(mctz); | 
 | 2892 |  | 
 | 2893 | 		excess = soft_limit_excess(mz->memcg); | 
 | 2894 | 		/* | 
 | 2895 | 		 * One school of thought says that we should not add | 
 | 2896 | 		 * back the node to the tree if reclaim returns 0. | 
 | 2897 | 		 * But our reclaim could return 0, simply because due | 
 | 2898 | 		 * to priority we are exposing a smaller subset of | 
 | 2899 | 		 * memory to reclaim from. Consider this as a longer | 
 | 2900 | 		 * term TODO. | 
 | 2901 | 		 */ | 
 | 2902 | 		/* If excess == 0, no tree ops */ | 
 | 2903 | 		__mem_cgroup_insert_exceeded(mz, mctz, excess); | 
 | 2904 | 		spin_unlock_irq(&mctz->lock); | 
 | 2905 | 		css_put(&mz->memcg->css); | 
 | 2906 | 		loop++; | 
 | 2907 | 		/* | 
 | 2908 | 		 * Could not reclaim anything and there are no more | 
 | 2909 | 		 * mem cgroups to try or we seem to be looping without | 
 | 2910 | 		 * reclaiming anything. | 
 | 2911 | 		 */ | 
 | 2912 | 		if (!nr_reclaimed && | 
 | 2913 | 			(next_mz == NULL || | 
 | 2914 | 			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) | 
 | 2915 | 			break; | 
 | 2916 | 	} while (!nr_reclaimed); | 
 | 2917 | 	if (next_mz) | 
 | 2918 | 		css_put(&next_mz->memcg->css); | 
 | 2919 | 	return nr_reclaimed; | 
 | 2920 | } | 
 | 2921 |  | 
 | 2922 | /* | 
 | 2923 |  * Test whether @memcg has children, dead or alive.  Note that this | 
 | 2924 |  * function doesn't care whether @memcg has use_hierarchy enabled and | 
 | 2925 |  * returns %true if there are child csses according to the cgroup | 
 | 2926 |  * hierarchy.  Testing use_hierarchy is the caller's responsibility. | 
 | 2927 |  */ | 
 | 2928 | static inline bool memcg_has_children(struct mem_cgroup *memcg) | 
 | 2929 | { | 
 | 2930 | 	bool ret; | 
 | 2931 |  | 
 | 2932 | 	rcu_read_lock(); | 
 | 2933 | 	ret = css_next_child(NULL, &memcg->css); | 
 | 2934 | 	rcu_read_unlock(); | 
 | 2935 | 	return ret; | 
 | 2936 | } | 
 | 2937 |  | 
 | 2938 | /* | 
 | 2939 |  * Reclaims as many pages from the given memcg as possible. | 
 | 2940 |  * | 
 | 2941 |  * Caller is responsible for holding css reference for memcg. | 
 | 2942 |  */ | 
 | 2943 | static int mem_cgroup_force_empty(struct mem_cgroup *memcg) | 
 | 2944 | { | 
 | 2945 | 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 
 | 2946 |  | 
 | 2947 | 	/* we call try-to-free pages to make this cgroup empty */ | 
 | 2948 | 	lru_add_drain_all(); | 
 | 2949 |  | 
 | 2950 | 	drain_all_stock(memcg); | 
 | 2951 |  | 
 | 2952 | 	/* try to free all pages in this cgroup */ | 
 | 2953 | 	while (nr_retries && page_counter_read(&memcg->memory)) { | 
 | 2954 | 		int progress; | 
 | 2955 |  | 
 | 2956 | 		if (signal_pending(current)) | 
 | 2957 | 			return -EINTR; | 
 | 2958 |  | 
 | 2959 | 		progress = try_to_free_mem_cgroup_pages(memcg, 1, | 
 | 2960 | 							GFP_KERNEL, true); | 
 | 2961 | 		if (!progress) { | 
 | 2962 | 			nr_retries--; | 
 | 2963 | 			/* maybe some writeback is necessary */ | 
 | 2964 | 			congestion_wait(BLK_RW_ASYNC, HZ/10); | 
 | 2965 | 		} | 
 | 2966 |  | 
 | 2967 | 	} | 
 | 2968 |  | 
 | 2969 | 	return 0; | 
 | 2970 | } | 
 | 2971 |  | 
 | 2972 | static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, | 
 | 2973 | 					    char *buf, size_t nbytes, | 
 | 2974 | 					    loff_t off) | 
 | 2975 | { | 
 | 2976 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | 
 | 2977 |  | 
 | 2978 | 	if (mem_cgroup_is_root(memcg)) | 
 | 2979 | 		return -EINVAL; | 
 | 2980 | 	return mem_cgroup_force_empty(memcg) ?: nbytes; | 
 | 2981 | } | 
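 |      |  | 
 |      | /* | 
 |      |  * Illustrative usage (a hedged sketch, not from the original source): | 
 |      |  * writing any value to memory.force_empty triggers the reclaim loop in | 
 |      |  * mem_cgroup_force_empty() above, e.g. | 
 |      |  * | 
 |      |  *	echo 0 > /sys/fs/cgroup/memory/<group>/memory.force_empty | 
 |      |  * | 
 |      |  * The write fails with -EINVAL on the root cgroup and with -EINTR if the | 
 |      |  * writer catches a signal before the group's usage reaches zero. | 
 |      |  */ | 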
 | 2982 |  | 
 | 2983 | static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, | 
 | 2984 | 				     struct cftype *cft) | 
 | 2985 | { | 
 | 2986 | 	return mem_cgroup_from_css(css)->use_hierarchy; | 
 | 2987 | } | 
 | 2988 |  | 
 | 2989 | static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, | 
 | 2990 | 				      struct cftype *cft, u64 val) | 
 | 2991 | { | 
 | 2992 | 	int retval = 0; | 
 | 2993 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 
 | 2994 | 	struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); | 
 | 2995 |  | 
 | 2996 | 	if (memcg->use_hierarchy == val) | 
 | 2997 | 		return 0; | 
 | 2998 |  | 
 | 2999 | 	/* | 
 | 3000 | 	 * If parent's use_hierarchy is set, we can't make any modifications | 
 | 3001 | 	 * in the child subtrees. If it is unset, then the change can | 
 | 3002 | 	 * occur, provided the current cgroup has no children. | 
 | 3003 | 	 * | 
 | 3004 | 	 * For the root cgroup, parent_memcg is NULL, so we allow the | 
 | 3005 | 	 * value to be set if there are no children. | 
 | 3006 | 	 */ | 
 | 3007 | 	if ((!parent_memcg || !parent_memcg->use_hierarchy) && | 
 | 3008 | 				(val == 1 || val == 0)) { | 
 | 3009 | 		if (!memcg_has_children(memcg)) | 
 | 3010 | 			memcg->use_hierarchy = val; | 
 | 3011 | 		else | 
 | 3012 | 			retval = -EBUSY; | 
 | 3013 | 	} else | 
 | 3014 | 		retval = -EINVAL; | 
 | 3015 |  | 
 | 3016 | 	return retval; | 
 | 3017 | } | 
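 |      |  | 
 |      | /* | 
 |      |  * Illustrative example (not part of the original file): given the checks | 
 |      |  * above, | 
 |      |  * | 
 |      |  *	echo 1 > memory.use_hierarchy | 
 |      |  * | 
 |      |  * succeeds only while the cgroup has no children and its parent does not | 
 |      |  * have use_hierarchy set; otherwise the write fails with -EBUSY (children | 
 |      |  * exist) or -EINVAL (the parent already uses hierarchy, or the value is | 
 |      |  * not 0 or 1). | 
 |      |  */ | 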
 | 3018 |  | 
 | 3019 | struct accumulated_stats { | 
 | 3020 | 	unsigned long stat[MEMCG_NR_STAT]; | 
 | 3021 | 	unsigned long events[NR_VM_EVENT_ITEMS]; | 
 | 3022 | 	unsigned long lru_pages[NR_LRU_LISTS]; | 
 | 3023 | 	const unsigned int *stats_array; | 
 | 3024 | 	const unsigned int *events_array; | 
 | 3025 | 	int stats_size; | 
 | 3026 | 	int events_size; | 
 | 3027 | }; | 
 | 3028 |  | 
 | 3029 | static void accumulate_memcg_tree(struct mem_cgroup *memcg, | 
 | 3030 | 				  struct accumulated_stats *acc) | 
 | 3031 | { | 
 | 3032 | 	struct mem_cgroup *mi; | 
 | 3033 | 	int i; | 
 | 3034 |  | 
 | 3035 | 	for_each_mem_cgroup_tree(mi, memcg) { | 
 | 3036 | 		for (i = 0; i < acc->stats_size; i++) | 
 | 3037 | 			acc->stat[i] += memcg_page_state(mi, | 
 | 3038 | 				acc->stats_array ? acc->stats_array[i] : i); | 
 | 3039 |  | 
 | 3040 | 		for (i = 0; i < acc->events_size; i++) | 
 | 3041 | 			acc->events[i] += memcg_sum_events(mi, | 
 | 3042 | 				acc->events_array ? acc->events_array[i] : i); | 
 | 3043 |  | 
 | 3044 | 		for (i = 0; i < NR_LRU_LISTS; i++) | 
 | 3045 | 			acc->lru_pages[i] += | 
 | 3046 | 				mem_cgroup_nr_lru_pages(mi, BIT(i)); | 
 | 3047 | 	} | 
 | 3048 | } | 
 | 3049 |  | 
 | 3050 | static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) | 
 | 3051 | { | 
 | 3052 | 	unsigned long val = 0; | 
 | 3053 |  | 
 | 3054 | 	if (mem_cgroup_is_root(memcg)) { | 
 | 3055 | 		struct mem_cgroup *iter; | 
 | 3056 |  | 
 | 3057 | 		for_each_mem_cgroup_tree(iter, memcg) { | 
 | 3058 | 			val += memcg_page_state(iter, MEMCG_CACHE); | 
 | 3059 | 			val += memcg_page_state(iter, MEMCG_RSS); | 
 | 3060 | 			if (swap) | 
 | 3061 | 				val += memcg_page_state(iter, MEMCG_SWAP); | 
 | 3062 | 		} | 
 | 3063 | 	} else { | 
 | 3064 | 		if (!swap) | 
 | 3065 | 			val = page_counter_read(&memcg->memory); | 
 | 3066 | 		else | 
 | 3067 | 			val = page_counter_read(&memcg->memsw); | 
 | 3068 | 	} | 
 | 3069 | 	return val; | 
 | 3070 | } | 
 | 3071 |  | 
 | 3072 | enum { | 
 | 3073 | 	RES_USAGE, | 
 | 3074 | 	RES_LIMIT, | 
 | 3075 | 	RES_MAX_USAGE, | 
 | 3076 | 	RES_FAILCNT, | 
 | 3077 | 	RES_SOFT_LIMIT, | 
 | 3078 | }; | 
 | 3079 |  | 
 | 3080 | static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, | 
 | 3081 | 			       struct cftype *cft) | 
 | 3082 | { | 
 | 3083 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 
 | 3084 | 	struct page_counter *counter; | 
 | 3085 |  | 
 | 3086 | 	switch (MEMFILE_TYPE(cft->private)) { | 
 | 3087 | 	case _MEM: | 
 | 3088 | 		counter = &memcg->memory; | 
 | 3089 | 		break; | 
 | 3090 | 	case _MEMSWAP: | 
 | 3091 | 		counter = &memcg->memsw; | 
 | 3092 | 		break; | 
 | 3093 | 	case _KMEM: | 
 | 3094 | 		counter = &memcg->kmem; | 
 | 3095 | 		break; | 
 | 3096 | 	case _TCP: | 
 | 3097 | 		counter = &memcg->tcpmem; | 
 | 3098 | 		break; | 
 | 3099 | 	default: | 
 | 3100 | 		BUG(); | 
 | 3101 | 	} | 
 | 3102 |  | 
 | 3103 | 	switch (MEMFILE_ATTR(cft->private)) { | 
 | 3104 | 	case RES_USAGE: | 
 | 3105 | 		if (counter == &memcg->memory) | 
 | 3106 | 			return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE; | 
 | 3107 | 		if (counter == &memcg->memsw) | 
 | 3108 | 			return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; | 
 | 3109 | 		return (u64)page_counter_read(counter) * PAGE_SIZE; | 
 | 3110 | 	case RES_LIMIT: | 
 | 3111 | 		return (u64)counter->max * PAGE_SIZE; | 
 | 3112 | 	case RES_MAX_USAGE: | 
 | 3113 | 		return (u64)counter->watermark * PAGE_SIZE; | 
 | 3114 | 	case RES_FAILCNT: | 
 | 3115 | 		return counter->failcnt; | 
 | 3116 | 	case RES_SOFT_LIMIT: | 
 | 3117 | 		return (u64)memcg->soft_limit * PAGE_SIZE; | 
 | 3118 | 	default: | 
 | 3119 | 		BUG(); | 
 | 3120 | 	} | 
 | 3121 | } | 
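 |      |  | 
 |      | /* | 
 |      |  * Note on the ->private decoding above (descriptive only): the legacy | 
 |      |  * cftype table later in this file packs a counter type and an attribute | 
 |      |  * into one value with MEMFILE_PRIVATE(), e.g. MEMFILE_PRIVATE(_MEM, | 
 |      |  * RES_LIMIT) for memory.limit_in_bytes, and MEMFILE_TYPE()/MEMFILE_ATTR() | 
 |      |  * split that value back apart here and in mem_cgroup_write() and | 
 |      |  * mem_cgroup_reset() below. | 
 |      |  */ | 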
 | 3122 |  | 
 | 3123 | #ifdef CONFIG_MEMCG_KMEM | 
 | 3124 | static int memcg_online_kmem(struct mem_cgroup *memcg) | 
 | 3125 | { | 
 | 3126 | 	int memcg_id; | 
 | 3127 |  | 
 | 3128 | 	if (cgroup_memory_nokmem) | 
 | 3129 | 		return 0; | 
 | 3130 |  | 
 | 3131 | 	BUG_ON(memcg->kmemcg_id >= 0); | 
 | 3132 | 	BUG_ON(memcg->kmem_state); | 
 | 3133 |  | 
 | 3134 | 	memcg_id = memcg_alloc_cache_id(); | 
 | 3135 | 	if (memcg_id < 0) | 
 | 3136 | 		return memcg_id; | 
 | 3137 |  | 
 | 3138 | 	static_branch_inc(&memcg_kmem_enabled_key); | 
 | 3139 | 	/* | 
 | 3140 | 	 * A memory cgroup is considered kmem-online as soon as it gets a | 
 | 3141 | 	 * kmemcg_id. Setting the id after enabling static branching will | 
 | 3142 | 	 * guarantee no one starts accounting before all call sites are | 
 | 3143 | 	 * patched. | 
 | 3144 | 	 */ | 
 | 3145 | 	memcg->kmemcg_id = memcg_id; | 
 | 3146 | 	memcg->kmem_state = KMEM_ONLINE; | 
 | 3147 | 	INIT_LIST_HEAD(&memcg->kmem_caches); | 
 | 3148 |  | 
 | 3149 | 	return 0; | 
 | 3150 | } | 
 | 3151 |  | 
 | 3152 | static void memcg_offline_kmem(struct mem_cgroup *memcg) | 
 | 3153 | { | 
 | 3154 | 	struct cgroup_subsys_state *css; | 
 | 3155 | 	struct mem_cgroup *parent, *child; | 
 | 3156 | 	int kmemcg_id; | 
 | 3157 |  | 
 | 3158 | 	if (memcg->kmem_state != KMEM_ONLINE) | 
 | 3159 | 		return; | 
 | 3160 | 	/* | 
 | 3161 | 	 * Clear the online state before clearing memcg_caches array | 
 | 3162 | 	 * entries. The slab_mutex in memcg_deactivate_kmem_caches() | 
 | 3163 | 	 * guarantees that no cache will be created for this cgroup | 
 | 3164 | 	 * after we are done (see memcg_create_kmem_cache()). | 
 | 3165 | 	 */ | 
 | 3166 | 	memcg->kmem_state = KMEM_ALLOCATED; | 
 | 3167 |  | 
 | 3168 | 	memcg_deactivate_kmem_caches(memcg); | 
 | 3169 |  | 
 | 3170 | 	kmemcg_id = memcg->kmemcg_id; | 
 | 3171 | 	BUG_ON(kmemcg_id < 0); | 
 | 3172 |  | 
 | 3173 | 	parent = parent_mem_cgroup(memcg); | 
 | 3174 | 	if (!parent) | 
 | 3175 | 		parent = root_mem_cgroup; | 
 | 3176 |  | 
 | 3177 | 	/* | 
 | 3178 | 	 * Change kmemcg_id of this cgroup and all its descendants to the | 
 | 3179 | 	 * parent's id, and then move all entries from this cgroup's list_lrus | 
 | 3180 | 	 * to those of the parent. After we have finished, all list_lrus | 
 | 3181 | 	 * corresponding to this cgroup are guaranteed to remain empty. The | 
 | 3182 | 	 * ordering is imposed by list_lru_node->lock taken by | 
 | 3183 | 	 * memcg_drain_all_list_lrus(). | 
 | 3184 | 	 */ | 
 | 3185 | 	rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */ | 
 | 3186 | 	css_for_each_descendant_pre(css, &memcg->css) { | 
 | 3187 | 		child = mem_cgroup_from_css(css); | 
 | 3188 | 		BUG_ON(child->kmemcg_id != kmemcg_id); | 
 | 3189 | 		child->kmemcg_id = parent->kmemcg_id; | 
 | 3190 | 		if (!memcg->use_hierarchy) | 
 | 3191 | 			break; | 
 | 3192 | 	} | 
 | 3193 | 	rcu_read_unlock(); | 
 | 3194 |  | 
 | 3195 | 	memcg_drain_all_list_lrus(kmemcg_id, parent); | 
 | 3196 |  | 
 | 3197 | 	memcg_free_cache_id(kmemcg_id); | 
 | 3198 | } | 
 | 3199 |  | 
 | 3200 | static void memcg_free_kmem(struct mem_cgroup *memcg) | 
 | 3201 | { | 
 | 3202 | 	/* css_alloc() failed, offlining didn't happen */ | 
 | 3203 | 	if (unlikely(memcg->kmem_state == KMEM_ONLINE)) | 
 | 3204 | 		memcg_offline_kmem(memcg); | 
 | 3205 |  | 
 | 3206 | 	if (memcg->kmem_state == KMEM_ALLOCATED) { | 
 | 3207 | 		memcg_destroy_kmem_caches(memcg); | 
 | 3208 | 		static_branch_dec(&memcg_kmem_enabled_key); | 
 | 3209 | 		WARN_ON(page_counter_read(&memcg->kmem)); | 
 | 3210 | 	} | 
 | 3211 | } | 
 | 3212 | #else | 
 | 3213 | static int memcg_online_kmem(struct mem_cgroup *memcg) | 
 | 3214 | { | 
 | 3215 | 	return 0; | 
 | 3216 | } | 
 | 3217 | static void memcg_offline_kmem(struct mem_cgroup *memcg) | 
 | 3218 | { | 
 | 3219 | } | 
 | 3220 | static void memcg_free_kmem(struct mem_cgroup *memcg) | 
 | 3221 | { | 
 | 3222 | } | 
 | 3223 | #endif /* CONFIG_MEMCG_KMEM */ | 
 | 3224 |  | 
 | 3225 | static int memcg_update_kmem_max(struct mem_cgroup *memcg, | 
 | 3226 | 				 unsigned long max) | 
 | 3227 | { | 
 | 3228 | 	int ret; | 
 | 3229 |  | 
 | 3230 | 	mutex_lock(&memcg_max_mutex); | 
 | 3231 | 	ret = page_counter_set_max(&memcg->kmem, max); | 
 | 3232 | 	mutex_unlock(&memcg_max_mutex); | 
 | 3233 | 	return ret; | 
 | 3234 | } | 
 | 3235 |  | 
 | 3236 | static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) | 
 | 3237 | { | 
 | 3238 | 	int ret; | 
 | 3239 |  | 
 | 3240 | 	mutex_lock(&memcg_max_mutex); | 
 | 3241 |  | 
 | 3242 | 	ret = page_counter_set_max(&memcg->tcpmem, max); | 
 | 3243 | 	if (ret) | 
 | 3244 | 		goto out; | 
 | 3245 |  | 
 | 3246 | 	if (!memcg->tcpmem_active) { | 
 | 3247 | 		/* | 
 | 3248 | 		 * The active flag needs to be written after the static_key | 
 | 3249 | 		 * update. This is what guarantees that the socket activation | 
 | 3250 | 		 * function is the last one to run. See mem_cgroup_sk_alloc() | 
 | 3251 | 		 * for details, and note that we don't mark any socket as | 
 | 3252 | 		 * belonging to this memcg until that flag is up. | 
 | 3253 | 		 * | 
 | 3254 | 		 * We need to do this, because static_keys will span multiple | 
 | 3255 | 		 * sites, but we can't control their order. If we mark a socket | 
 | 3256 | 		 * as accounted, but the accounting functions are not patched in | 
 | 3257 | 		 * yet, we'll lose accounting. | 
 | 3258 | 		 * | 
 | 3259 | 		 * We never race with the readers in mem_cgroup_sk_alloc(), | 
 | 3260 | 		 * because when this value changes, the code to process it is not | 
 | 3261 | 		 * patched in yet. | 
 | 3262 | 		 */ | 
 | 3263 | 		static_branch_inc(&memcg_sockets_enabled_key); | 
 | 3264 | 		memcg->tcpmem_active = true; | 
 | 3265 | 	} | 
 | 3266 | out: | 
 | 3267 | 	mutex_unlock(&memcg_max_mutex); | 
 | 3268 | 	return ret; | 
 | 3269 | } | 
 | 3270 |  | 
 | 3271 | /* | 
 | 3272 |  * This is the write handler for the RES_LIMIT and | 
 | 3273 |  * RES_SOFT_LIMIT control files. | 
 | 3274 |  */ | 
 | 3275 | static ssize_t mem_cgroup_write(struct kernfs_open_file *of, | 
 | 3276 | 				char *buf, size_t nbytes, loff_t off) | 
 | 3277 | { | 
 | 3278 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | 
 | 3279 | 	unsigned long nr_pages; | 
 | 3280 | 	int ret; | 
 | 3281 |  | 
 | 3282 | 	buf = strstrip(buf); | 
 | 3283 | 	ret = page_counter_memparse(buf, "-1", &nr_pages); | 
 | 3284 | 	if (ret) | 
 | 3285 | 		return ret; | 
 | 3286 |  | 
 | 3287 | 	switch (MEMFILE_ATTR(of_cft(of)->private)) { | 
 | 3288 | 	case RES_LIMIT: | 
 | 3289 | 		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ | 
 | 3290 | 			ret = -EINVAL; | 
 | 3291 | 			break; | 
 | 3292 | 		} | 
 | 3293 | 		switch (MEMFILE_TYPE(of_cft(of)->private)) { | 
 | 3294 | 		case _MEM: | 
 | 3295 | 			ret = mem_cgroup_resize_max(memcg, nr_pages, false); | 
 | 3296 | 			break; | 
 | 3297 | 		case _MEMSWAP: | 
 | 3298 | 			ret = mem_cgroup_resize_max(memcg, nr_pages, true); | 
 | 3299 | 			break; | 
 | 3300 | 		case _KMEM: | 
 | 3301 | 			ret = memcg_update_kmem_max(memcg, nr_pages); | 
 | 3302 | 			break; | 
 | 3303 | 		case _TCP: | 
 | 3304 | 			ret = memcg_update_tcp_max(memcg, nr_pages); | 
 | 3305 | 			break; | 
 | 3306 | 		} | 
 | 3307 | 		break; | 
 | 3308 | 	case RES_SOFT_LIMIT: | 
 | 3309 | 		memcg->soft_limit = nr_pages; | 
 | 3310 | 		ret = 0; | 
 | 3311 | 		break; | 
 | 3312 | 	} | 
 | 3313 | 	return ret ?: nbytes; | 
 | 3314 | } | 
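 |      |  | 
 |      | /* | 
 |      |  * Illustrative usage (hedged, cgroup v1): limits are written as | 
 |      |  * human-readable sizes; page_counter_memparse() accepts K/M/G suffixes | 
 |      |  * and maps "-1" to "no limit", so e.g. | 
 |      |  * | 
 |      |  *	echo 512M > memory.limit_in_bytes | 
 |      |  *	echo -1 > memory.memsw.limit_in_bytes | 
 |      |  * | 
 |      |  * set a 512M memory limit and remove the memory+swap limit. | 
 |      |  */ | 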
 | 3315 |  | 
 | 3316 | static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, | 
 | 3317 | 				size_t nbytes, loff_t off) | 
 | 3318 | { | 
 | 3319 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | 
 | 3320 | 	struct page_counter *counter; | 
 | 3321 |  | 
 | 3322 | 	switch (MEMFILE_TYPE(of_cft(of)->private)) { | 
 | 3323 | 	case _MEM: | 
 | 3324 | 		counter = &memcg->memory; | 
 | 3325 | 		break; | 
 | 3326 | 	case _MEMSWAP: | 
 | 3327 | 		counter = &memcg->memsw; | 
 | 3328 | 		break; | 
 | 3329 | 	case _KMEM: | 
 | 3330 | 		counter = &memcg->kmem; | 
 | 3331 | 		break; | 
 | 3332 | 	case _TCP: | 
 | 3333 | 		counter = &memcg->tcpmem; | 
 | 3334 | 		break; | 
 | 3335 | 	default: | 
 | 3336 | 		BUG(); | 
 | 3337 | 	} | 
 | 3338 |  | 
 | 3339 | 	switch (MEMFILE_ATTR(of_cft(of)->private)) { | 
 | 3340 | 	case RES_MAX_USAGE: | 
 | 3341 | 		page_counter_reset_watermark(counter); | 
 | 3342 | 		break; | 
 | 3343 | 	case RES_FAILCNT: | 
 | 3344 | 		counter->failcnt = 0; | 
 | 3345 | 		break; | 
 | 3346 | 	default: | 
 | 3347 | 		BUG(); | 
 | 3348 | 	} | 
 | 3349 |  | 
 | 3350 | 	return nbytes; | 
 | 3351 | } | 
 | 3352 |  | 
 | 3353 | static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, | 
 | 3354 | 					struct cftype *cft) | 
 | 3355 | { | 
 | 3356 | 	return mem_cgroup_from_css(css)->move_charge_at_immigrate; | 
 | 3357 | } | 
 | 3358 |  | 
 | 3359 | #ifdef CONFIG_MMU | 
 | 3360 | static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, | 
 | 3361 | 					struct cftype *cft, u64 val) | 
 | 3362 | { | 
 | 3363 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 
 | 3364 |  | 
 | 3365 | 	if (val & ~MOVE_MASK) | 
 | 3366 | 		return -EINVAL; | 
 | 3367 |  | 
 | 3368 | 	/* | 
 | 3369 | 	 * No kind of locking is needed in here, because ->can_attach() will | 
 | 3370 | 	 * check this value once at the beginning of the process, and then carry | 
 | 3371 | 	 * on with stale data. This means that changes to this value will only | 
 | 3372 | 	 * affect task migrations starting after the change. | 
 | 3373 | 	 */ | 
 | 3374 | 	memcg->move_charge_at_immigrate = val; | 
 | 3375 | 	return 0; | 
 | 3376 | } | 
 | 3377 | #else | 
 | 3378 | static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, | 
 | 3379 | 					struct cftype *cft, u64 val) | 
 | 3380 | { | 
 | 3381 | 	return -ENOSYS; | 
 | 3382 | } | 
 | 3383 | #endif | 
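 |      |  | 
 |      | /* | 
 |      |  * Illustrative usage (hedged; the bit meanings follow the cgroup v1 | 
 |      |  * documentation rather than this file): move_charge_at_immigrate is a | 
 |      |  * bitmask where bit 0 selects anonymous pages and bit 1 selects file | 
 |      |  * pages, so | 
 |      |  * | 
 |      |  *	echo 3 > memory.move_charge_at_immigrate | 
 |      |  * | 
 |      |  * moves both kinds of charges along with migrating tasks.  Values outside | 
 |      |  * MOVE_MASK are rejected with -EINVAL by the handler above (when | 
 |      |  * CONFIG_MMU is enabled). | 
 |      |  */ | 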
 | 3384 |  | 
 | 3385 | #ifdef CONFIG_NUMA | 
 | 3386 | static int memcg_numa_stat_show(struct seq_file *m, void *v) | 
 | 3387 | { | 
 | 3388 | 	struct numa_stat { | 
 | 3389 | 		const char *name; | 
 | 3390 | 		unsigned int lru_mask; | 
 | 3391 | 	}; | 
 | 3392 |  | 
 | 3393 | 	static const struct numa_stat stats[] = { | 
 | 3394 | 		{ "total", LRU_ALL }, | 
 | 3395 | 		{ "file", LRU_ALL_FILE }, | 
 | 3396 | 		{ "anon", LRU_ALL_ANON }, | 
 | 3397 | 		{ "unevictable", BIT(LRU_UNEVICTABLE) }, | 
 | 3398 | 	}; | 
 | 3399 | 	const struct numa_stat *stat; | 
 | 3400 | 	int nid; | 
 | 3401 | 	unsigned long nr; | 
 | 3402 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 
 | 3403 |  | 
 | 3404 | 	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { | 
 | 3405 | 		nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); | 
 | 3406 | 		seq_printf(m, "%s=%lu", stat->name, nr); | 
 | 3407 | 		for_each_node_state(nid, N_MEMORY) { | 
 | 3408 | 			nr = mem_cgroup_node_nr_lru_pages(memcg, nid, | 
 | 3409 | 							  stat->lru_mask); | 
 | 3410 | 			seq_printf(m, " N%d=%lu", nid, nr); | 
 | 3411 | 		} | 
 | 3412 | 		seq_putc(m, '\n'); | 
 | 3413 | 	} | 
 | 3414 |  | 
 | 3415 | 	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { | 
 | 3416 | 		struct mem_cgroup *iter; | 
 | 3417 |  | 
 | 3418 | 		nr = 0; | 
 | 3419 | 		for_each_mem_cgroup_tree(iter, memcg) | 
 | 3420 | 			nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); | 
 | 3421 | 		seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); | 
 | 3422 | 		for_each_node_state(nid, N_MEMORY) { | 
 | 3423 | 			nr = 0; | 
 | 3424 | 			for_each_mem_cgroup_tree(iter, memcg) | 
 | 3425 | 				nr += mem_cgroup_node_nr_lru_pages( | 
 | 3426 | 					iter, nid, stat->lru_mask); | 
 | 3427 | 			seq_printf(m, " N%d=%lu", nid, nr); | 
 | 3428 | 		} | 
 | 3429 | 		seq_putc(m, '\n'); | 
 | 3430 | 	} | 
 | 3431 |  | 
 | 3432 | 	return 0; | 
 | 3433 | } | 
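 |      |  | 
 |      | /* | 
 |      |  * Example memory.numa_stat output as produced by the seq_printf() calls | 
 |      |  * above (page counts; numbers are made up for illustration): | 
 |      |  * | 
 |      |  *	total=1234 N0=1000 N1=234 | 
 |      |  *	file=1000 N0=800 N1=200 | 
 |      |  *	anon=230 N0=198 N1=32 | 
 |      |  *	unevictable=4 N0=2 N1=2 | 
 |      |  *	hierarchical_total=2345 N0=1900 N1=445 | 
 |      |  * | 
 |      |  * and so on for the remaining hierarchical_* lines. | 
 |      |  */ | 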
 | 3434 | #endif /* CONFIG_NUMA */ | 
 | 3435 |  | 
 | 3436 | /* Universal VM events cgroup1 shows, original sort order */ | 
 | 3437 | static const unsigned int memcg1_events[] = { | 
 | 3438 | 	PGPGIN, | 
 | 3439 | 	PGPGOUT, | 
 | 3440 | 	PGFAULT, | 
 | 3441 | 	PGMAJFAULT, | 
 | 3442 | }; | 
 | 3443 |  | 
 | 3444 | static const char *const memcg1_event_names[] = { | 
 | 3445 | 	"pgpgin", | 
 | 3446 | 	"pgpgout", | 
 | 3447 | 	"pgfault", | 
 | 3448 | 	"pgmajfault", | 
 | 3449 | }; | 
 | 3450 |  | 
 | 3451 | static int memcg_stat_show(struct seq_file *m, void *v) | 
 | 3452 | { | 
 | 3453 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 
 | 3454 | 	unsigned long memory, memsw; | 
 | 3455 | 	struct mem_cgroup *mi; | 
 | 3456 | 	unsigned int i; | 
 | 3457 | 	struct accumulated_stats acc; | 
 | 3458 |  | 
 | 3459 | 	BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); | 
 | 3460 | 	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); | 
 | 3461 |  | 
 | 3462 | 	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { | 
 | 3463 | 		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) | 
 | 3464 | 			continue; | 
 | 3465 | 		seq_printf(m, "%s %lu\n", memcg1_stat_names[i], | 
 | 3466 | 			   memcg_page_state(memcg, memcg1_stats[i]) * | 
 | 3467 | 			   PAGE_SIZE); | 
 | 3468 | 	} | 
 | 3469 |  | 
 | 3470 | 	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) | 
 | 3471 | 		seq_printf(m, "%s %lu\n", memcg1_event_names[i], | 
 | 3472 | 			   memcg_sum_events(memcg, memcg1_events[i])); | 
 | 3473 |  | 
 | 3474 | 	for (i = 0; i < NR_LRU_LISTS; i++) | 
 | 3475 | 		seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], | 
 | 3476 | 			   mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); | 
 | 3477 |  | 
 | 3478 | 	/* Hierarchical information */ | 
 | 3479 | 	memory = memsw = PAGE_COUNTER_MAX; | 
 | 3480 | 	for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { | 
 | 3481 | 		memory = min(memory, mi->memory.max); | 
 | 3482 | 		memsw = min(memsw, mi->memsw.max); | 
 | 3483 | 	} | 
 | 3484 | 	seq_printf(m, "hierarchical_memory_limit %llu\n", | 
 | 3485 | 		   (u64)memory * PAGE_SIZE); | 
 | 3486 | 	if (do_memsw_account()) | 
 | 3487 | 		seq_printf(m, "hierarchical_memsw_limit %llu\n", | 
 | 3488 | 			   (u64)memsw * PAGE_SIZE); | 
 | 3489 |  | 
 | 3490 | 	memset(&acc, 0, sizeof(acc)); | 
 | 3491 | 	acc.stats_size = ARRAY_SIZE(memcg1_stats); | 
 | 3492 | 	acc.stats_array = memcg1_stats; | 
 | 3493 | 	acc.events_size = ARRAY_SIZE(memcg1_events); | 
 | 3494 | 	acc.events_array = memcg1_events; | 
 | 3495 | 	accumulate_memcg_tree(memcg, &acc); | 
 | 3496 |  | 
 | 3497 | 	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { | 
 | 3498 | 		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) | 
 | 3499 | 			continue; | 
 | 3500 | 		seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], | 
 | 3501 | 			   (u64)acc.stat[i] * PAGE_SIZE); | 
 | 3502 | 	} | 
 | 3503 |  | 
 | 3504 | 	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) | 
 | 3505 | 		seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], | 
 | 3506 | 			   (u64)acc.events[i]); | 
 | 3507 |  | 
 | 3508 | 	for (i = 0; i < NR_LRU_LISTS; i++) | 
 | 3509 | 		seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], | 
 | 3510 | 			   (u64)acc.lru_pages[i] * PAGE_SIZE); | 
 | 3511 |  | 
 | 3512 | #ifdef CONFIG_DEBUG_VM | 
 | 3513 | 	{ | 
 | 3514 | 		pg_data_t *pgdat; | 
 | 3515 | 		struct mem_cgroup_per_node *mz; | 
 | 3516 | 		struct zone_reclaim_stat *rstat; | 
 | 3517 | 		unsigned long recent_rotated[2] = {0, 0}; | 
 | 3518 | 		unsigned long recent_scanned[2] = {0, 0}; | 
 | 3519 |  | 
 | 3520 | 		for_each_online_pgdat(pgdat) { | 
 | 3521 | 			mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); | 
 | 3522 | 			rstat = &mz->lruvec.reclaim_stat; | 
 | 3523 |  | 
 | 3524 | 			recent_rotated[0] += rstat->recent_rotated[0]; | 
 | 3525 | 			recent_rotated[1] += rstat->recent_rotated[1]; | 
 | 3526 | 			recent_scanned[0] += rstat->recent_scanned[0]; | 
 | 3527 | 			recent_scanned[1] += rstat->recent_scanned[1]; | 
 | 3528 | 		} | 
 | 3529 | 		seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); | 
 | 3530 | 		seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); | 
 | 3531 | 		seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); | 
 | 3532 | 		seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); | 
 | 3533 | 	} | 
 | 3534 | #endif | 
 | 3535 |  | 
 | 3536 | 	return 0; | 
 | 3537 | } | 
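 |      |  | 
 |      | /* | 
 |      |  * Illustrative excerpt of memory.stat as printed above (numbers made up; | 
 |      |  * the field names come from memcg1_stat_names and memcg1_event_names, | 
 |      |  * the latter defined just above, the former earlier in this file): | 
 |      |  * | 
 |      |  *	cache 8192000 | 
 |      |  *	rss 4096000 | 
 |      |  *	pgpgin 12345 | 
 |      |  *	hierarchical_memory_limit 536870912 | 
 |      |  *	total_cache 16384000 | 
 |      |  * | 
 |      |  * Local counters come first, then events, per-LRU sizes, the | 
 |      |  * hierarchical limits, and the subtree-wide "total_*" counterparts. | 
 |      |  */ | 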
 | 3538 |  | 
 | 3539 | static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, | 
 | 3540 | 				      struct cftype *cft) | 
 | 3541 | { | 
 | 3542 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 
 | 3543 |  | 
 | 3544 | 	return mem_cgroup_swappiness(memcg); | 
 | 3545 | } | 
 | 3546 |  | 
 | 3547 | static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, | 
 | 3548 | 				       struct cftype *cft, u64 val) | 
 | 3549 | { | 
 | 3550 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 
 | 3551 |  | 
 | 3552 | 	if (val > 100) | 
 | 3553 | 		return -EINVAL; | 
 | 3554 |  | 
 | 3555 | 	if (css->parent) | 
 | 3556 | 		memcg->swappiness = val; | 
 | 3557 | 	else | 
 | 3558 | 		vm_swappiness = val; | 
 | 3559 |  | 
 | 3560 | 	return 0; | 
 | 3561 | } | 
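 |      |  | 
 |      | /* | 
 |      |  * Illustrative usage (hedged): per-memcg swappiness takes the same 0-100 | 
 |      |  * range as /proc/sys/vm/swappiness, e.g. | 
 |      |  * | 
 |      |  *	echo 10 > memory.swappiness | 
 |      |  * | 
 |      |  * Writes on the root cgroup fall through to the global vm_swappiness, as | 
 |      |  * the handler above shows; values above 100 are rejected with -EINVAL. | 
 |      |  */ | 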
 | 3562 |  | 
 | 3563 | static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | 
 | 3564 | { | 
 | 3565 | 	struct mem_cgroup_threshold_ary *t; | 
 | 3566 | 	unsigned long usage; | 
 | 3567 | 	int i; | 
 | 3568 |  | 
 | 3569 | 	rcu_read_lock(); | 
 | 3570 | 	if (!swap) | 
 | 3571 | 		t = rcu_dereference(memcg->thresholds.primary); | 
 | 3572 | 	else | 
 | 3573 | 		t = rcu_dereference(memcg->memsw_thresholds.primary); | 
 | 3574 |  | 
 | 3575 | 	if (!t) | 
 | 3576 | 		goto unlock; | 
 | 3577 |  | 
 | 3578 | 	usage = mem_cgroup_usage(memcg, swap); | 
 | 3579 |  | 
 | 3580 | 	/* | 
 | 3581 | 	 * current_threshold points to the threshold just below or equal | 
 | 3582 | 	 * to usage. If that's not true, a threshold was crossed after the | 
 | 3583 | 	 * last call of __mem_cgroup_threshold(). | 
 | 3584 | 	 */ | 
 | 3585 | 	i = t->current_threshold; | 
 | 3586 |  | 
 | 3587 | 	/* | 
 | 3588 | 	 * Iterate backward over array of thresholds starting from | 
 | 3589 | 	 * current_threshold and check if a threshold is crossed. | 
 | 3590 | 	 * If none of the thresholds below usage is crossed, we read | 
 | 3591 | 	 * only one element of the array here. | 
 | 3592 | 	 */ | 
 | 3593 | 	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) | 
 | 3594 | 		eventfd_signal(t->entries[i].eventfd, 1); | 
 | 3595 |  | 
 | 3596 | 	/* i = current_threshold + 1 */ | 
 | 3597 | 	i++; | 
 | 3598 |  | 
 | 3599 | 	/* | 
 | 3600 | 	 * Iterate forward over array of thresholds starting from | 
 | 3601 | 	 * current_threshold+1 and check if a threshold is crossed. | 
 | 3602 | 	 * If none of the thresholds above usage is crossed, we read | 
 | 3603 | 	 * only one element of the array here. | 
 | 3604 | 	 */ | 
 | 3605 | 	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) | 
 | 3606 | 		eventfd_signal(t->entries[i].eventfd, 1); | 
 | 3607 |  | 
 | 3608 | 	/* Update current_threshold */ | 
 | 3609 | 	t->current_threshold = i - 1; | 
 | 3610 | unlock: | 
 | 3611 | 	rcu_read_unlock(); | 
 | 3612 | } | 
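 |      |  | 
 |      | /* | 
 |      |  * Worked example for the scan above (illustrative only): with registered | 
 |      |  * thresholds {4M, 8M, 16M} and current_threshold pointing at 8M, a usage | 
 |      |  * of 20M makes the forward loop signal the 16M eventfd and leaves | 
 |      |  * current_threshold at 16M.  If usage then drops to 6M, the backward loop | 
 |      |  * signals the 16M and 8M eventfds and current_threshold ends up at 4M. | 
 |      |  */ | 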
 | 3613 |  | 
 | 3614 | static void mem_cgroup_threshold(struct mem_cgroup *memcg) | 
 | 3615 | { | 
 | 3616 | 	while (memcg) { | 
 | 3617 | 		__mem_cgroup_threshold(memcg, false); | 
 | 3618 | 		if (do_memsw_account()) | 
 | 3619 | 			__mem_cgroup_threshold(memcg, true); | 
 | 3620 |  | 
 | 3621 | 		memcg = parent_mem_cgroup(memcg); | 
 | 3622 | 	} | 
 | 3623 | } | 
 | 3624 |  | 
 | 3625 | static int compare_thresholds(const void *a, const void *b) | 
 | 3626 | { | 
 | 3627 | 	const struct mem_cgroup_threshold *_a = a; | 
 | 3628 | 	const struct mem_cgroup_threshold *_b = b; | 
 | 3629 |  | 
 | 3630 | 	if (_a->threshold > _b->threshold) | 
 | 3631 | 		return 1; | 
 | 3632 |  | 
 | 3633 | 	if (_a->threshold < _b->threshold) | 
 | 3634 | 		return -1; | 
 | 3635 |  | 
 | 3636 | 	return 0; | 
 | 3637 | } | 
 | 3638 |  | 
 | 3639 | static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) | 
 | 3640 | { | 
 | 3641 | 	struct mem_cgroup_eventfd_list *ev; | 
 | 3642 |  | 
 | 3643 | 	spin_lock(&memcg_oom_lock); | 
 | 3644 |  | 
 | 3645 | 	list_for_each_entry(ev, &memcg->oom_notify, list) | 
 | 3646 | 		eventfd_signal(ev->eventfd, 1); | 
 | 3647 |  | 
 | 3648 | 	spin_unlock(&memcg_oom_lock); | 
 | 3649 | 	return 0; | 
 | 3650 | } | 
 | 3651 |  | 
 | 3652 | static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) | 
 | 3653 | { | 
 | 3654 | 	struct mem_cgroup *iter; | 
 | 3655 |  | 
 | 3656 | 	for_each_mem_cgroup_tree(iter, memcg) | 
 | 3657 | 		mem_cgroup_oom_notify_cb(iter); | 
 | 3658 | } | 
 | 3659 |  | 
 | 3660 | static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, | 
 | 3661 | 	struct eventfd_ctx *eventfd, const char *args, enum res_type type) | 
 | 3662 | { | 
 | 3663 | 	struct mem_cgroup_thresholds *thresholds; | 
 | 3664 | 	struct mem_cgroup_threshold_ary *new; | 
 | 3665 | 	unsigned long threshold; | 
 | 3666 | 	unsigned long usage; | 
 | 3667 | 	int i, size, ret; | 
 | 3668 |  | 
 | 3669 | 	ret = page_counter_memparse(args, "-1", &threshold); | 
 | 3670 | 	if (ret) | 
 | 3671 | 		return ret; | 
 | 3672 |  | 
 | 3673 | 	mutex_lock(&memcg->thresholds_lock); | 
 | 3674 |  | 
 | 3675 | 	if (type == _MEM) { | 
 | 3676 | 		thresholds = &memcg->thresholds; | 
 | 3677 | 		usage = mem_cgroup_usage(memcg, false); | 
 | 3678 | 	} else if (type == _MEMSWAP) { | 
 | 3679 | 		thresholds = &memcg->memsw_thresholds; | 
 | 3680 | 		usage = mem_cgroup_usage(memcg, true); | 
 | 3681 | 	} else | 
 | 3682 | 		BUG(); | 
 | 3683 |  | 
 | 3684 | 	/* Check if a threshold was crossed before adding a new one */ | 
 | 3685 | 	if (thresholds->primary) | 
 | 3686 | 		__mem_cgroup_threshold(memcg, type == _MEMSWAP); | 
 | 3687 |  | 
 | 3688 | 	size = thresholds->primary ? thresholds->primary->size + 1 : 1; | 
 | 3689 |  | 
 | 3690 | 	/* Allocate memory for new array of thresholds */ | 
 | 3691 | 	new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), | 
 | 3692 | 			GFP_KERNEL); | 
 | 3693 | 	if (!new) { | 
 | 3694 | 		ret = -ENOMEM; | 
 | 3695 | 		goto unlock; | 
 | 3696 | 	} | 
 | 3697 | 	new->size = size; | 
 | 3698 |  | 
 | 3699 | 	/* Copy thresholds (if any) to new array */ | 
 | 3700 | 	if (thresholds->primary) { | 
 | 3701 | 		memcpy(new->entries, thresholds->primary->entries, (size - 1) * | 
 | 3702 | 				sizeof(struct mem_cgroup_threshold)); | 
 | 3703 | 	} | 
 | 3704 |  | 
 | 3705 | 	/* Add new threshold */ | 
 | 3706 | 	new->entries[size - 1].eventfd = eventfd; | 
 | 3707 | 	new->entries[size - 1].threshold = threshold; | 
 | 3708 |  | 
 | 3709 | 	/* Sort thresholds. Registering a new threshold isn't time-critical */ | 
 | 3710 | 	sort(new->entries, size, sizeof(struct mem_cgroup_threshold), | 
 | 3711 | 			compare_thresholds, NULL); | 
 | 3712 |  | 
 | 3713 | 	/* Find current threshold */ | 
 | 3714 | 	new->current_threshold = -1; | 
 | 3715 | 	for (i = 0; i < size; i++) { | 
 | 3716 | 		if (new->entries[i].threshold <= usage) { | 
 | 3717 | 			/* | 
 | 3718 | 			 * new->current_threshold will not be used until | 
 | 3719 | 			 * rcu_assign_pointer(), so it's safe to increment | 
 | 3720 | 			 * it here. | 
 | 3721 | 			 */ | 
 | 3722 | 			++new->current_threshold; | 
 | 3723 | 		} else | 
 | 3724 | 			break; | 
 | 3725 | 	} | 
 | 3726 |  | 
 | 3727 | 	/* Free old spare buffer and save old primary buffer as spare */ | 
 | 3728 | 	kfree(thresholds->spare); | 
 | 3729 | 	thresholds->spare = thresholds->primary; | 
 | 3730 |  | 
 | 3731 | 	rcu_assign_pointer(thresholds->primary, new); | 
 | 3732 |  | 
 | 3733 | 	/* To be sure that nobody still uses the old thresholds array */ | 
 | 3734 | 	synchronize_rcu(); | 
 | 3735 |  | 
 | 3736 | unlock: | 
 | 3737 | 	mutex_unlock(&memcg->thresholds_lock); | 
 | 3738 |  | 
 | 3739 | 	return ret; | 
 | 3740 | } | 
 | 3741 |  | 
 | 3742 | static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, | 
 | 3743 | 	struct eventfd_ctx *eventfd, const char *args) | 
 | 3744 | { | 
 | 3745 | 	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); | 
 | 3746 | } | 
 | 3747 |  | 
 | 3748 | static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, | 
 | 3749 | 	struct eventfd_ctx *eventfd, const char *args) | 
 | 3750 | { | 
 | 3751 | 	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); | 
 | 3752 | } | 
 | 3753 |  | 
 | 3754 | static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, | 
 | 3755 | 	struct eventfd_ctx *eventfd, enum res_type type) | 
 | 3756 | { | 
 | 3757 | 	struct mem_cgroup_thresholds *thresholds; | 
 | 3758 | 	struct mem_cgroup_threshold_ary *new; | 
 | 3759 | 	unsigned long usage; | 
 | 3760 | 	int i, j, size; | 
 | 3761 |  | 
 | 3762 | 	mutex_lock(&memcg->thresholds_lock); | 
 | 3763 |  | 
 | 3764 | 	if (type == _MEM) { | 
 | 3765 | 		thresholds = &memcg->thresholds; | 
 | 3766 | 		usage = mem_cgroup_usage(memcg, false); | 
 | 3767 | 	} else if (type == _MEMSWAP) { | 
 | 3768 | 		thresholds = &memcg->memsw_thresholds; | 
 | 3769 | 		usage = mem_cgroup_usage(memcg, true); | 
 | 3770 | 	} else | 
 | 3771 | 		BUG(); | 
 | 3772 |  | 
 | 3773 | 	if (!thresholds->primary) | 
 | 3774 | 		goto unlock; | 
 | 3775 |  | 
 | 3776 | 	/* Check if a threshold was crossed before removing */ | 
 | 3777 | 	__mem_cgroup_threshold(memcg, type == _MEMSWAP); | 
 | 3778 |  | 
 | 3779 | 	/* Calculate the new number of thresholds */ | 
 | 3780 | 	size = 0; | 
 | 3781 | 	for (i = 0; i < thresholds->primary->size; i++) { | 
 | 3782 | 		if (thresholds->primary->entries[i].eventfd != eventfd) | 
 | 3783 | 			size++; | 
 | 3784 | 	} | 
 | 3785 |  | 
 | 3786 | 	new = thresholds->spare; | 
 | 3787 |  | 
 | 3788 | 	/* Set thresholds array to NULL if we don't have thresholds */ | 
 | 3789 | 	if (!size) { | 
 | 3790 | 		kfree(new); | 
 | 3791 | 		new = NULL; | 
 | 3792 | 		goto swap_buffers; | 
 | 3793 | 	} | 
 | 3794 |  | 
 | 3795 | 	new->size = size; | 
 | 3796 |  | 
 | 3797 | 	/* Copy thresholds and find current threshold */ | 
 | 3798 | 	new->current_threshold = -1; | 
 | 3799 | 	for (i = 0, j = 0; i < thresholds->primary->size; i++) { | 
 | 3800 | 		if (thresholds->primary->entries[i].eventfd == eventfd) | 
 | 3801 | 			continue; | 
 | 3802 |  | 
 | 3803 | 		new->entries[j] = thresholds->primary->entries[i]; | 
 | 3804 | 		if (new->entries[j].threshold <= usage) { | 
 | 3805 | 			/* | 
 | 3806 | 			 * new->current_threshold will not be used | 
 | 3807 | 			 * until rcu_assign_pointer(), so it's safe to increment | 
 | 3808 | 			 * it here. | 
 | 3809 | 			 */ | 
 | 3810 | 			++new->current_threshold; | 
 | 3811 | 		} | 
 | 3812 | 		j++; | 
 | 3813 | 	} | 
 | 3814 |  | 
 | 3815 | swap_buffers: | 
 | 3816 | 	/* Swap primary and spare array */ | 
 | 3817 | 	thresholds->spare = thresholds->primary; | 
 | 3818 |  | 
 | 3819 | 	rcu_assign_pointer(thresholds->primary, new); | 
 | 3820 |  | 
 | 3821 | 	/* To be sure that nobody still uses the old thresholds array */ | 
 | 3822 | 	synchronize_rcu(); | 
 | 3823 |  | 
 | 3824 | 	/* If all events are unregistered, free the spare array */ | 
 | 3825 | 	if (!new) { | 
 | 3826 | 		kfree(thresholds->spare); | 
 | 3827 | 		thresholds->spare = NULL; | 
 | 3828 | 	} | 
 | 3829 | unlock: | 
 | 3830 | 	mutex_unlock(&memcg->thresholds_lock); | 
 | 3831 | } | 
 | 3832 |  | 
 | 3833 | static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, | 
 | 3834 | 	struct eventfd_ctx *eventfd) | 
 | 3835 | { | 
 | 3836 | 	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); | 
 | 3837 | } | 
 | 3838 |  | 
 | 3839 | static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, | 
 | 3840 | 	struct eventfd_ctx *eventfd) | 
 | 3841 | { | 
 | 3842 | 	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); | 
 | 3843 | } | 
 | 3844 |  | 
 | 3845 | static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, | 
 | 3846 | 	struct eventfd_ctx *eventfd, const char *args) | 
 | 3847 | { | 
 | 3848 | 	struct mem_cgroup_eventfd_list *event; | 
 | 3849 |  | 
 | 3850 | 	event = kmalloc(sizeof(*event),	GFP_KERNEL); | 
 | 3851 | 	if (!event) | 
 | 3852 | 		return -ENOMEM; | 
 | 3853 |  | 
 | 3854 | 	spin_lock(&memcg_oom_lock); | 
 | 3855 |  | 
 | 3856 | 	event->eventfd = eventfd; | 
 | 3857 | 	list_add(&event->list, &memcg->oom_notify); | 
 | 3858 |  | 
 | 3859 | 	/* already in OOM? */ | 
 | 3860 | 	if (memcg->under_oom) | 
 | 3861 | 		eventfd_signal(eventfd, 1); | 
 | 3862 | 	spin_unlock(&memcg_oom_lock); | 
 | 3863 |  | 
 | 3864 | 	return 0; | 
 | 3865 | } | 
 | 3866 |  | 
 | 3867 | static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, | 
 | 3868 | 	struct eventfd_ctx *eventfd) | 
 | 3869 | { | 
 | 3870 | 	struct mem_cgroup_eventfd_list *ev, *tmp; | 
 | 3871 |  | 
 | 3872 | 	spin_lock(&memcg_oom_lock); | 
 | 3873 |  | 
 | 3874 | 	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { | 
 | 3875 | 		if (ev->eventfd == eventfd) { | 
 | 3876 | 			list_del(&ev->list); | 
 | 3877 | 			kfree(ev); | 
 | 3878 | 		} | 
 | 3879 | 	} | 
 | 3880 |  | 
 | 3881 | 	spin_unlock(&memcg_oom_lock); | 
 | 3882 | } | 
 | 3883 |  | 
 | 3884 | static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) | 
 | 3885 | { | 
 | 3886 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); | 
 | 3887 |  | 
 | 3888 | 	seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); | 
 | 3889 | 	seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom); | 
 | 3890 | 	seq_printf(sf, "oom_kill %lu\n", | 
 | 3891 | 		   atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); | 
 | 3892 | 	return 0; | 
 | 3893 | } | 
 | 3894 |  | 
 | 3895 | static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, | 
 | 3896 | 	struct cftype *cft, u64 val) | 
 | 3897 | { | 
 | 3898 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 
 | 3899 |  | 
 | 3900 | 	/* cannot be set on the root cgroup, and only 0 and 1 are allowed */ | 
 | 3901 | 	if (!css->parent || !((val == 0) || (val == 1))) | 
 | 3902 | 		return -EINVAL; | 
 | 3903 |  | 
 | 3904 | 	memcg->oom_kill_disable = val; | 
 | 3905 | 	if (!val) | 
 | 3906 | 		memcg_oom_recover(memcg); | 
 | 3907 |  | 
 | 3908 | 	return 0; | 
 | 3909 | } | 
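 |      |  | 
 |      | /* | 
 |      |  * Illustrative usage (hedged, cgroup v1): only 0 and 1 are accepted and | 
 |      |  * the root cgroup is rejected, per the checks above. | 
 |      |  * | 
 |      |  *	echo 1 > memory.oom_control	# disable the OOM killer for this group | 
 |      |  *	cat memory.oom_control		# prints oom_kill_disable, under_oom | 
 |      |  *					# and oom_kill as shown above | 
 |      |  */ | 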
 | 3910 |  | 
 | 3911 | #ifdef CONFIG_CGROUP_WRITEBACK | 
 | 3912 |  | 
 | 3913 | static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) | 
 | 3914 | { | 
 | 3915 | 	return wb_domain_init(&memcg->cgwb_domain, gfp); | 
 | 3916 | } | 
 | 3917 |  | 
 | 3918 | static void memcg_wb_domain_exit(struct mem_cgroup *memcg) | 
 | 3919 | { | 
 | 3920 | 	wb_domain_exit(&memcg->cgwb_domain); | 
 | 3921 | } | 
 | 3922 |  | 
 | 3923 | static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) | 
 | 3924 | { | 
 | 3925 | 	wb_domain_size_changed(&memcg->cgwb_domain); | 
 | 3926 | } | 
 | 3927 |  | 
 | 3928 | struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) | 
 | 3929 | { | 
 | 3930 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); | 
 | 3931 |  | 
 | 3932 | 	if (!memcg->css.parent) | 
 | 3933 | 		return NULL; | 
 | 3934 |  | 
 | 3935 | 	return &memcg->cgwb_domain; | 
 | 3936 | } | 
 | 3937 |  | 
 | 3938 | /* | 
 | 3939 |  * idx can be of type enum memcg_stat_item or node_stat_item. | 
 | 3940 |  * Keep in sync with memcg_exact_page(). | 
 | 3941 |  */ | 
 | 3942 | static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx) | 
 | 3943 | { | 
 | 3944 | 	long x = atomic_long_read(&memcg->stat[idx]); | 
 | 3945 | 	int cpu; | 
 | 3946 |  | 
 | 3947 | 	for_each_online_cpu(cpu) | 
 | 3948 | 		x += per_cpu_ptr(memcg->stat_cpu, cpu)->count[idx]; | 
 | 3949 | 	if (x < 0) | 
 | 3950 | 		x = 0; | 
 | 3951 | 	return x; | 
 | 3952 | } | 
 | 3953 |  | 
 | 3954 | /** | 
 | 3955 |  * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg | 
 | 3956 |  * @wb: bdi_writeback in question | 
 | 3957 |  * @pfilepages: out parameter for number of file pages | 
 | 3958 |  * @pheadroom: out parameter for number of allocatable pages according to memcg | 
 | 3959 |  * @pdirty: out parameter for number of dirty pages | 
 | 3960 |  * @pwriteback: out parameter for number of pages under writeback | 
 | 3961 |  * | 
 | 3962 |  * Determine the numbers of file, headroom, dirty, and writeback pages in | 
 | 3963 |  * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom | 
 | 3964 |  * is a bit more involved. | 
 | 3965 |  * | 
 | 3966 |  * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the | 
 | 3967 |  * headroom is calculated as the lowest headroom of itself and the | 
 | 3968 |  * ancestors.  Note that this doesn't consider the actual amount of | 
 | 3969 |  * available memory in the system.  The caller should further cap | 
 | 3970 |  * *@pheadroom accordingly. | 
 | 3971 |  */ | 
 | 3972 | void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, | 
 | 3973 | 			 unsigned long *pheadroom, unsigned long *pdirty, | 
 | 3974 | 			 unsigned long *pwriteback) | 
 | 3975 | { | 
 | 3976 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); | 
 | 3977 | 	struct mem_cgroup *parent; | 
 | 3978 |  | 
 | 3979 | 	*pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY); | 
 | 3980 |  | 
 | 3981 | 	/* this should eventually include NR_UNSTABLE_NFS */ | 
 | 3982 | 	*pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK); | 
 | 3983 | 	*pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | | 
 | 3984 | 						     (1 << LRU_ACTIVE_FILE)); | 
 | 3985 | 	*pheadroom = PAGE_COUNTER_MAX; | 
 | 3986 |  | 
 | 3987 | 	while ((parent = parent_mem_cgroup(memcg))) { | 
 | 3988 | 		unsigned long ceiling = min(memcg->memory.max, memcg->high); | 
 | 3989 | 		unsigned long used = page_counter_read(&memcg->memory); | 
 | 3990 |  | 
 | 3991 | 		*pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); | 
 | 3992 | 		memcg = parent; | 
 | 3993 | 	} | 
 | 3994 | } | 
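 |      |  | 
 |      | /* | 
 |      |  * Worked example for the headroom loop above (numbers illustrative): a | 
 |      |  * memcg with max = 1G, high = 512M and 300M used contributes a ceiling | 
 |      |  * of 512M and a headroom of 212M; *pheadroom becomes the minimum of that | 
 |      |  * and every ancestor's headroom, clamped to zero once usage meets or | 
 |      |  * exceeds the ceiling. | 
 |      |  */ | 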
 | 3995 |  | 
 | 3996 | #else	/* CONFIG_CGROUP_WRITEBACK */ | 
 | 3997 |  | 
 | 3998 | static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) | 
 | 3999 | { | 
 | 4000 | 	return 0; | 
 | 4001 | } | 
 | 4002 |  | 
 | 4003 | static void memcg_wb_domain_exit(struct mem_cgroup *memcg) | 
 | 4004 | { | 
 | 4005 | } | 
 | 4006 |  | 
 | 4007 | static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg) | 
 | 4008 | { | 
 | 4009 | } | 
 | 4010 |  | 
 | 4011 | #endif	/* CONFIG_CGROUP_WRITEBACK */ | 
 | 4012 |  | 
 | 4013 | /* | 
 | 4014 |  * DO NOT USE IN NEW FILES. | 
 | 4015 |  * | 
 | 4016 |  * "cgroup.event_control" implementation. | 
 | 4017 |  * | 
 | 4018 |  * This is way over-engineered.  It tries to support fully configurable | 
 | 4019 |  * events for each user.  Such level of flexibility is completely | 
 | 4020 |  * unnecessary especially in the light of the planned unified hierarchy. | 
 | 4021 |  * | 
 | 4022 |  * Please deprecate this and replace with something simpler if at all | 
 | 4023 |  * possible. | 
 | 4024 |  */ | 
 | 4025 |  | 
 | 4026 | /* | 
 | 4027 |  * Unregister event and free resources. | 
 | 4028 |  * | 
 | 4029 |  * Gets called from workqueue. | 
 | 4030 |  */ | 
 | 4031 | static void memcg_event_remove(struct work_struct *work) | 
 | 4032 | { | 
 | 4033 | 	struct mem_cgroup_event *event = | 
 | 4034 | 		container_of(work, struct mem_cgroup_event, remove); | 
 | 4035 | 	struct mem_cgroup *memcg = event->memcg; | 
 | 4036 |  | 
 | 4037 | 	remove_wait_queue(event->wqh, &event->wait); | 
 | 4038 |  | 
 | 4039 | 	event->unregister_event(memcg, event->eventfd); | 
 | 4040 |  | 
 | 4041 | 	/* Notify userspace the event is going away. */ | 
 | 4042 | 	eventfd_signal(event->eventfd, 1); | 
 | 4043 |  | 
 | 4044 | 	eventfd_ctx_put(event->eventfd); | 
 | 4045 | 	kfree(event); | 
 | 4046 | 	css_put(&memcg->css); | 
 | 4047 | } | 
 | 4048 |  | 
 | 4049 | /* | 
 | 4050 |  * Gets called on EPOLLHUP on eventfd when user closes it. | 
 | 4051 |  * | 
 | 4052 |  * Called with wqh->lock held and interrupts disabled. | 
 | 4053 |  */ | 
 | 4054 | static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode, | 
 | 4055 | 			    int sync, void *key) | 
 | 4056 | { | 
 | 4057 | 	struct mem_cgroup_event *event = | 
 | 4058 | 		container_of(wait, struct mem_cgroup_event, wait); | 
 | 4059 | 	struct mem_cgroup *memcg = event->memcg; | 
 | 4060 | 	__poll_t flags = key_to_poll(key); | 
 | 4061 |  | 
 | 4062 | 	if (flags & EPOLLHUP) { | 
 | 4063 | 		/* | 
 | 4064 | 		 * If the event has been detached at cgroup removal, we | 
 | 4065 | 		 * can simply return knowing the other side will clean up | 
 | 4066 | 		 * for us. | 
 | 4067 | 		 * | 
 | 4068 | 		 * We can't race against event freeing since the other | 
 | 4069 | 		 * side will require wqh->lock via remove_wait_queue(), | 
 | 4070 | 		 * which we hold. | 
 | 4071 | 		 */ | 
 | 4072 | 		spin_lock(&memcg->event_list_lock); | 
 | 4073 | 		if (!list_empty(&event->list)) { | 
 | 4074 | 			list_del_init(&event->list); | 
 | 4075 | 			/* | 
 | 4076 | 			 * We are in atomic context, but memcg_event_remove() | 
 | 4077 | 			 * may sleep, so we have to call it from a workqueue. | 
 | 4078 | 			 */ | 
 | 4079 | 			schedule_work(&event->remove); | 
 | 4080 | 		} | 
 | 4081 | 		spin_unlock(&memcg->event_list_lock); | 
 | 4082 | 	} | 
 | 4083 |  | 
 | 4084 | 	return 0; | 
 | 4085 | } | 
 | 4086 |  | 
 | 4087 | static void memcg_event_ptable_queue_proc(struct file *file, | 
 | 4088 | 		wait_queue_head_t *wqh, poll_table *pt) | 
 | 4089 | { | 
 | 4090 | 	struct mem_cgroup_event *event = | 
 | 4091 | 		container_of(pt, struct mem_cgroup_event, pt); | 
 | 4092 |  | 
 | 4093 | 	event->wqh = wqh; | 
 | 4094 | 	add_wait_queue(wqh, &event->wait); | 
 | 4095 | } | 
 | 4096 |  | 
 | 4097 | /* | 
 | 4098 |  * DO NOT USE IN NEW FILES. | 
 | 4099 |  * | 
 | 4100 |  * Parse input and register new cgroup event handler. | 
 | 4101 |  * | 
 | 4102 |  * Input must be in format '<event_fd> <control_fd> <args>'. | 
 | 4103 |  * Interpretation of args is defined by control file implementation. | 
 | 4104 |  */ | 
 | 4105 | static ssize_t memcg_write_event_control(struct kernfs_open_file *of, | 
 | 4106 | 					 char *buf, size_t nbytes, loff_t off) | 
 | 4107 | { | 
 | 4108 | 	struct cgroup_subsys_state *css = of_css(of); | 
 | 4109 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 
 | 4110 | 	struct mem_cgroup_event *event; | 
 | 4111 | 	struct cgroup_subsys_state *cfile_css; | 
 | 4112 | 	unsigned int efd, cfd; | 
 | 4113 | 	struct fd efile; | 
 | 4114 | 	struct fd cfile; | 
 | 4115 | 	const char *name; | 
 | 4116 | 	char *endp; | 
 | 4117 | 	int ret; | 
 | 4118 |  | 
 | 4119 | 	buf = strstrip(buf); | 
 | 4120 |  | 
 | 4121 | 	efd = simple_strtoul(buf, &endp, 10); | 
 | 4122 | 	if (*endp != ' ') | 
 | 4123 | 		return -EINVAL; | 
 | 4124 | 	buf = endp + 1; | 
 | 4125 |  | 
 | 4126 | 	cfd = simple_strtoul(buf, &endp, 10); | 
 | 4127 | 	if ((*endp != ' ') && (*endp != '\0')) | 
 | 4128 | 		return -EINVAL; | 
 | 4129 | 	buf = endp + 1; | 
 | 4130 |  | 
 | 4131 | 	event = kzalloc(sizeof(*event), GFP_KERNEL); | 
 | 4132 | 	if (!event) | 
 | 4133 | 		return -ENOMEM; | 
 | 4134 |  | 
 | 4135 | 	event->memcg = memcg; | 
 | 4136 | 	INIT_LIST_HEAD(&event->list); | 
 | 4137 | 	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); | 
 | 4138 | 	init_waitqueue_func_entry(&event->wait, memcg_event_wake); | 
 | 4139 | 	INIT_WORK(&event->remove, memcg_event_remove); | 
 | 4140 |  | 
 | 4141 | 	efile = fdget(efd); | 
 | 4142 | 	if (!efile.file) { | 
 | 4143 | 		ret = -EBADF; | 
 | 4144 | 		goto out_kfree; | 
 | 4145 | 	} | 
 | 4146 |  | 
 | 4147 | 	event->eventfd = eventfd_ctx_fileget(efile.file); | 
 | 4148 | 	if (IS_ERR(event->eventfd)) { | 
 | 4149 | 		ret = PTR_ERR(event->eventfd); | 
 | 4150 | 		goto out_put_efile; | 
 | 4151 | 	} | 
 | 4152 |  | 
 | 4153 | 	cfile = fdget(cfd); | 
 | 4154 | 	if (!cfile.file) { | 
 | 4155 | 		ret = -EBADF; | 
 | 4156 | 		goto out_put_eventfd; | 
 | 4157 | 	} | 
 | 4158 |  | 
 | 4159 | 	/* the process needs read permission on the control file */ | 
 | 4160 | 	/* AV: shouldn't we check that it's been opened for read instead? */ | 
 | 4161 | 	ret = inode_permission(file_inode(cfile.file), MAY_READ); | 
 | 4162 | 	if (ret < 0) | 
 | 4163 | 		goto out_put_cfile; | 
 | 4164 |  | 
 | 4165 | 	/* | 
 | 4166 | 	 * Determine the event callbacks and set them in @event.  This used | 
 | 4167 | 	 * to be done via struct cftype but cgroup core no longer knows | 
 | 4168 | 	 * about these events.  The following is crude but the whole thing | 
 | 4169 | 	 * is for compatibility anyway. | 
 | 4170 | 	 * | 
 | 4171 | 	 * DO NOT ADD NEW FILES. | 
 | 4172 | 	 */ | 
 | 4173 | 	name = cfile.file->f_path.dentry->d_name.name; | 
 | 4174 |  | 
 | 4175 | 	if (!strcmp(name, "memory.usage_in_bytes")) { | 
 | 4176 | 		event->register_event = mem_cgroup_usage_register_event; | 
 | 4177 | 		event->unregister_event = mem_cgroup_usage_unregister_event; | 
 | 4178 | 	} else if (!strcmp(name, "memory.oom_control")) { | 
 | 4179 | 		event->register_event = mem_cgroup_oom_register_event; | 
 | 4180 | 		event->unregister_event = mem_cgroup_oom_unregister_event; | 
 | 4181 | 	} else if (!strcmp(name, "memory.pressure_level")) { | 
 | 4182 | 		event->register_event = vmpressure_register_event; | 
 | 4183 | 		event->unregister_event = vmpressure_unregister_event; | 
 | 4184 | 	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { | 
 | 4185 | 		event->register_event = memsw_cgroup_usage_register_event; | 
 | 4186 | 		event->unregister_event = memsw_cgroup_usage_unregister_event; | 
 | 4187 | 	} else { | 
 | 4188 | 		ret = -EINVAL; | 
 | 4189 | 		goto out_put_cfile; | 
 | 4190 | 	} | 
 | 4191 |  | 
 | 4192 | 	/* | 
 | 4193 | 	 * Verify that @cfile belongs to @css.  Also, remaining events are | 
 | 4194 | 	 * automatically removed on cgroup destruction but the removal is | 
 | 4195 | 	 * asynchronous, so take an extra ref on @css. | 
 | 4196 | 	 */ | 
 | 4197 | 	cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent, | 
 | 4198 | 					       &memory_cgrp_subsys); | 
 | 4199 | 	ret = -EINVAL; | 
 | 4200 | 	if (IS_ERR(cfile_css)) | 
 | 4201 | 		goto out_put_cfile; | 
 | 4202 | 	if (cfile_css != css) { | 
 | 4203 | 		css_put(cfile_css); | 
 | 4204 | 		goto out_put_cfile; | 
 | 4205 | 	} | 
 | 4206 |  | 
 | 4207 | 	ret = event->register_event(memcg, event->eventfd, buf); | 
 | 4208 | 	if (ret) | 
 | 4209 | 		goto out_put_css; | 
 | 4210 |  | 
 | 4211 | 	vfs_poll(efile.file, &event->pt); | 
 | 4212 |  | 
 | 4213 | 	spin_lock(&memcg->event_list_lock); | 
 | 4214 | 	list_add(&event->list, &memcg->event_list); | 
 | 4215 | 	spin_unlock(&memcg->event_list_lock); | 
 | 4216 |  | 
 | 4217 | 	fdput(cfile); | 
 | 4218 | 	fdput(efile); | 
 | 4219 |  | 
 | 4220 | 	return nbytes; | 
 | 4221 |  | 
 | 4222 | out_put_css: | 
 | 4223 | 	css_put(css); | 
 | 4224 | out_put_cfile: | 
 | 4225 | 	fdput(cfile); | 
 | 4226 | out_put_eventfd: | 
 | 4227 | 	eventfd_ctx_put(event->eventfd); | 
 | 4228 | out_put_efile: | 
 | 4229 | 	fdput(efile); | 
 | 4230 | out_kfree: | 
 | 4231 | 	kfree(event); | 
 | 4232 |  | 
 | 4233 | 	return ret; | 
 | 4234 | } | 
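 |      |  | 
 |      | /* | 
 |      |  * Illustrative usage (a hedged sketch of the userspace side): a monitor | 
 |      |  * creates an eventfd, opens the control file it wants to watch, and | 
 |      |  * writes the three fields parsed above to cgroup.event_control, e.g. for | 
 |      |  * a 100M usage threshold: | 
 |      |  * | 
 |      |  *	efd = eventfd(0, 0); | 
 |      |  *	cfd = open("memory.usage_in_bytes", O_RDONLY); | 
 |      |  *	then write the line "<efd> <cfd> 100M" to cgroup.event_control | 
 |      |  * | 
 |      |  * after which a read() on efd completes each time the threshold is | 
 |      |  * crossed.  The meaning of the trailing <args> depends on the control | 
 |      |  * file, as noted in the comment before this function. | 
 |      |  */ | 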
 | 4235 |  | 
 | 4236 | static struct cftype mem_cgroup_legacy_files[] = { | 
 | 4237 | 	{ | 
 | 4238 | 		.name = "usage_in_bytes", | 
 | 4239 | 		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | 
 | 4240 | 		.read_u64 = mem_cgroup_read_u64, | 
 | 4241 | 	}, | 
 | 4242 | 	{ | 
 | 4243 | 		.name = "max_usage_in_bytes", | 
 | 4244 | 		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), | 
 | 4245 | 		.write = mem_cgroup_reset, | 
 | 4246 | 		.read_u64 = mem_cgroup_read_u64, | 
 | 4247 | 	}, | 
 | 4248 | 	{ | 
 | 4249 | 		.name = "limit_in_bytes", | 
 | 4250 | 		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), | 
 | 4251 | 		.write = mem_cgroup_write, | 
 | 4252 | 		.read_u64 = mem_cgroup_read_u64, | 
 | 4253 | 	}, | 
 | 4254 | 	{ | 
 | 4255 | 		.name = "soft_limit_in_bytes", | 
 | 4256 | 		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), | 
 | 4257 | 		.write = mem_cgroup_write, | 
 | 4258 | 		.read_u64 = mem_cgroup_read_u64, | 
 | 4259 | 	}, | 
 | 4260 | 	{ | 
 | 4261 | 		.name = "failcnt", | 
 | 4262 | 		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), | 
 | 4263 | 		.write = mem_cgroup_reset, | 
 | 4264 | 		.read_u64 = mem_cgroup_read_u64, | 
 | 4265 | 	}, | 
 | 4266 | 	{ | 
 | 4267 | 		.name = "stat", | 
 | 4268 | 		.seq_show = memcg_stat_show, | 
 | 4269 | 	}, | 
 | 4270 | 	{ | 
 | 4271 | 		.name = "force_empty", | 
 | 4272 | 		.write = mem_cgroup_force_empty_write, | 
 | 4273 | 	}, | 
 | 4274 | 	{ | 
 | 4275 | 		.name = "use_hierarchy", | 
 | 4276 | 		.write_u64 = mem_cgroup_hierarchy_write, | 
 | 4277 | 		.read_u64 = mem_cgroup_hierarchy_read, | 
 | 4278 | 	}, | 
 | 4279 | 	{ | 
 | 4280 | 		.name = "cgroup.event_control",		/* XXX: for compat */ | 
 | 4281 | 		.write = memcg_write_event_control, | 
 | 4282 | 		.flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE, | 
 | 4283 | 	}, | 
 | 4284 | 	{ | 
 | 4285 | 		.name = "swappiness", | 
 | 4286 | 		.read_u64 = mem_cgroup_swappiness_read, | 
 | 4287 | 		.write_u64 = mem_cgroup_swappiness_write, | 
 | 4288 | 	}, | 
 | 4289 | 	{ | 
 | 4290 | 		.name = "move_charge_at_immigrate", | 
 | 4291 | 		.read_u64 = mem_cgroup_move_charge_read, | 
 | 4292 | 		.write_u64 = mem_cgroup_move_charge_write, | 
 | 4293 | 	}, | 
 | 4294 | 	{ | 
 | 4295 | 		.name = "oom_control", | 
 | 4296 | 		.seq_show = mem_cgroup_oom_control_read, | 
 | 4297 | 		.write_u64 = mem_cgroup_oom_control_write, | 
 | 4298 | 		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), | 
 | 4299 | 	}, | 
 | 4300 | 	{ | 
 | 4301 | 		.name = "pressure_level", | 
 | 4302 | 	}, | 
 | 4303 | #ifdef CONFIG_NUMA | 
 | 4304 | 	{ | 
 | 4305 | 		.name = "numa_stat", | 
 | 4306 | 		.seq_show = memcg_numa_stat_show, | 
 | 4307 | 	}, | 
 | 4308 | #endif | 
 | 4309 | 	{ | 
 | 4310 | 		.name = "kmem.limit_in_bytes", | 
 | 4311 | 		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), | 
 | 4312 | 		.write = mem_cgroup_write, | 
 | 4313 | 		.read_u64 = mem_cgroup_read_u64, | 
 | 4314 | 	}, | 
 | 4315 | 	{ | 
 | 4316 | 		.name = "kmem.usage_in_bytes", | 
 | 4317 | 		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), | 
 | 4318 | 		.read_u64 = mem_cgroup_read_u64, | 
 | 4319 | 	}, | 
 | 4320 | 	{ | 
 | 4321 | 		.name = "kmem.failcnt", | 
 | 4322 | 		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), | 
 | 4323 | 		.write = mem_cgroup_reset, | 
 | 4324 | 		.read_u64 = mem_cgroup_read_u64, | 
 | 4325 | 	}, | 
 | 4326 | 	{ | 
 | 4327 | 		.name = "kmem.max_usage_in_bytes", | 
 | 4328 | 		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), | 
 | 4329 | 		.write = mem_cgroup_reset, | 
 | 4330 | 		.read_u64 = mem_cgroup_read_u64, | 
 | 4331 | 	}, | 
 | 4332 | #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) | 
 | 4333 | 	{ | 
 | 4334 | 		.name = "kmem.slabinfo", | 
 | 4335 | 		.seq_start = memcg_slab_start, | 
 | 4336 | 		.seq_next = memcg_slab_next, | 
 | 4337 | 		.seq_stop = memcg_slab_stop, | 
 | 4338 | 		.seq_show = memcg_slab_show, | 
 | 4339 | 	}, | 
 | 4340 | #endif | 
 | 4341 | 	{ | 
 | 4342 | 		.name = "kmem.tcp.limit_in_bytes", | 
 | 4343 | 		.private = MEMFILE_PRIVATE(_TCP, RES_LIMIT), | 
 | 4344 | 		.write = mem_cgroup_write, | 
 | 4345 | 		.read_u64 = mem_cgroup_read_u64, | 
 | 4346 | 	}, | 
 | 4347 | 	{ | 
 | 4348 | 		.name = "kmem.tcp.usage_in_bytes", | 
 | 4349 | 		.private = MEMFILE_PRIVATE(_TCP, RES_USAGE), | 
 | 4350 | 		.read_u64 = mem_cgroup_read_u64, | 
 | 4351 | 	}, | 
 | 4352 | 	{ | 
 | 4353 | 		.name = "kmem.tcp.failcnt", | 
 | 4354 | 		.private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT), | 
 | 4355 | 		.write = mem_cgroup_reset, | 
 | 4356 | 		.read_u64 = mem_cgroup_read_u64, | 
 | 4357 | 	}, | 
 | 4358 | 	{ | 
 | 4359 | 		.name = "kmem.tcp.max_usage_in_bytes", | 
 | 4360 | 		.private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE), | 
 | 4361 | 		.write = mem_cgroup_reset, | 
 | 4362 | 		.read_u64 = mem_cgroup_read_u64, | 
 | 4363 | 	}, | 
 | 4364 | 	{ },	/* terminate */ | 
 | 4365 | }; | 
 | 4366 |  | 
 | 4367 | /* | 
 | 4368 |  * Private memory cgroup IDR | 
 | 4369 |  * | 
 | 4370 |  * Swap-out records and page cache shadow entries need to store memcg | 
 | 4371 |  * references in constrained space, so we maintain an ID space that is | 
 | 4372 |  * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of | 
 | 4373 |  * memory-controlled cgroups to 64k. | 
 | 4374 |  * | 
 | 4375 |  * However, there usually are many references to the offline CSS after | 
 | 4376 |  * the cgroup has been destroyed, such as page cache or reclaimable | 
 | 4377 |  * slab objects, that don't need to hang on to the ID. We want to keep | 
 | 4378 |  * those dead CSS from occupying IDs, or we might quickly exhaust the | 
 | 4379 |  * relatively small ID space and prevent the creation of new cgroups | 
 | 4380 |  * even when there are far fewer than 64k cgroups - possibly none. | 
 | 4381 |  * | 
 | 4382 |  * Maintain a private 16-bit ID space for memcg, and allow the ID to | 
 | 4383 |  * be freed and recycled when it's no longer needed, which is usually | 
 | 4384 |  * when the CSS is offlined. | 
 | 4385 |  * | 
 | 4386 |  * The only exceptions to that are records of swapped out tmpfs/shmem | 
 | 4387 |  * pages that need to be attributed to live ancestors on swapin. But | 
 | 4388 |  * those references are manageable from userspace. | 
 | 4389 |  */ | 
 | 4390 |  | 
 | 4391 | static DEFINE_IDR(mem_cgroup_idr); | 
 | 4392 |  | 
 | 4393 | static void mem_cgroup_id_remove(struct mem_cgroup *memcg) | 
 | 4394 | { | 
 | 4395 | 	if (memcg->id.id > 0) { | 
 | 4396 | 		idr_remove(&mem_cgroup_idr, memcg->id.id); | 
 | 4397 | 		memcg->id.id = 0; | 
 | 4398 | 	} | 
 | 4399 | } | 
 | 4400 |  | 
 | 4401 | static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) | 
 | 4402 | { | 
 | 4403 | 	VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0); | 
 | 4404 | 	atomic_add(n, &memcg->id.ref); | 
 | 4405 | } | 
 | 4406 |  | 
 | 4407 | static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) | 
 | 4408 | { | 
 | 4409 | 	VM_BUG_ON(atomic_read(&memcg->id.ref) < n); | 
 | 4410 | 	if (atomic_sub_and_test(n, &memcg->id.ref)) { | 
 | 4411 | 		mem_cgroup_id_remove(memcg); | 
 | 4412 |  | 
 | 4413 | 		/* Memcg ID pins CSS */ | 
 | 4414 | 		css_put(&memcg->css); | 
 | 4415 | 	} | 
 | 4416 | } | 
 | 4417 |  | 
 | 4418 | static inline void mem_cgroup_id_get(struct mem_cgroup *memcg) | 
 | 4419 | { | 
 | 4420 | 	mem_cgroup_id_get_many(memcg, 1); | 
 | 4421 | } | 
 | 4422 |  | 
 | 4423 | static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) | 
 | 4424 | { | 
 | 4425 | 	mem_cgroup_id_put_many(memcg, 1); | 
 | 4426 | } | 
 | 4427 |  | 
 | 4428 | /** | 
 | 4429 |  * mem_cgroup_from_id - look up a memcg from a memcg id | 
 | 4430 |  * @id: the memcg id to look up | 
 | 4431 |  * | 
 | 4432 |  * Caller must hold rcu_read_lock(). | 
 | 4433 |  */ | 
 | 4434 | struct mem_cgroup *mem_cgroup_from_id(unsigned short id) | 
 | 4435 | { | 
 | 4436 | 	WARN_ON_ONCE(!rcu_read_lock_held()); | 
 | 4437 | 	return idr_find(&mem_cgroup_idr, id); | 
 | 4438 | } | 
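 |      |  | 
 |      | /* | 
 |      |  * Illustrative sketch, not taken from this file: callers that resolve an id | 
 |      |  * from a swap record typically do the lookup under RCU and, if they need to | 
 |      |  * keep the memcg beyond the read-side section, try to take a CSS reference: | 
 |      |  * | 
 |      |  *	rcu_read_lock(); | 
 |      |  *	memcg = mem_cgroup_from_id(id); | 
 |      |  *	if (memcg && !css_tryget_online(&memcg->css)) | 
 |      |  *		memcg = NULL; | 
 |      |  *	rcu_read_unlock(); | 
 |      |  * | 
 |      |  * The error handling is the caller's business; this only illustrates the | 
 |      |  * locking rule documented above. | 
 |      |  */ | 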
 | 4439 |  | 
 | 4440 | static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) | 
 | 4441 | { | 
 | 4442 | 	struct mem_cgroup_per_node *pn; | 
 | 4443 | 	int tmp = node; | 
 | 4444 | 	/* | 
 | 4445 | 	 * This routine is called for each possible node. | 
 | 4446 | 	 * But it's a BUG to call kmalloc() against an offline node. | 
 | 4447 | 	 * | 
 | 4448 | 	 * TODO: this routine can waste much memory for nodes which will | 
 | 4449 | 	 *       never be onlined. It's better to use memory hotplug callback | 
 | 4450 | 	 *       function. | 
 | 4451 | 	 */ | 
 | 4452 | 	if (!node_state(node, N_NORMAL_MEMORY)) | 
 | 4453 | 		tmp = -1; | 
 | 4454 | 	pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); | 
 | 4455 | 	if (!pn) | 
 | 4456 | 		return 1; | 
 | 4457 |  | 
 | 4458 | 	pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat); | 
 | 4459 | 	if (!pn->lruvec_stat_cpu) { | 
 | 4460 | 		kfree(pn); | 
 | 4461 | 		return 1; | 
 | 4462 | 	} | 
 | 4463 |  | 
 | 4464 | 	lruvec_init(&pn->lruvec); | 
 | 4465 | 	pn->usage_in_excess = 0; | 
 | 4466 | 	pn->on_tree = false; | 
 | 4467 | 	pn->memcg = memcg; | 
 | 4468 |  | 
 | 4469 | 	memcg->nodeinfo[node] = pn; | 
 | 4470 | 	return 0; | 
 | 4471 | } | 
 | 4472 |  | 
 | 4473 | static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) | 
 | 4474 | { | 
 | 4475 | 	struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; | 
 | 4476 |  | 
 | 4477 | 	if (!pn) | 
 | 4478 | 		return; | 
 | 4479 |  | 
 | 4480 | 	free_percpu(pn->lruvec_stat_cpu); | 
 | 4481 | 	kfree(pn); | 
 | 4482 | } | 
 | 4483 |  | 
 | 4484 | static void __mem_cgroup_free(struct mem_cgroup *memcg) | 
 | 4485 | { | 
 | 4486 | 	int node; | 
 | 4487 |  | 
 | 4488 | 	for_each_node(node) | 
 | 4489 | 		free_mem_cgroup_per_node_info(memcg, node); | 
 | 4490 | 	free_percpu(memcg->stat_cpu); | 
 | 4491 | 	kfree(memcg); | 
 | 4492 | } | 
 | 4493 |  | 
 | 4494 | static void mem_cgroup_free(struct mem_cgroup *memcg) | 
 | 4495 | { | 
 | 4496 | 	memcg_wb_domain_exit(memcg); | 
 | 4497 | 	__mem_cgroup_free(memcg); | 
 | 4498 | } | 
 | 4499 |  | 
 | 4500 | static struct mem_cgroup *mem_cgroup_alloc(void) | 
 | 4501 | { | 
 | 4502 | 	struct mem_cgroup *memcg; | 
 | 4503 | 	size_t size; | 
 | 4504 | 	int node; | 
 | 4505 |  | 
 | 4506 | 	size = sizeof(struct mem_cgroup); | 
 | 4507 | 	size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); | 
 | 4508 |  | 
 | 4509 | 	memcg = kzalloc(size, GFP_KERNEL); | 
 | 4510 | 	if (!memcg) | 
 | 4511 | 		return NULL; | 
 | 4512 |  | 
 | 4513 | 	memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, | 
 | 4514 | 				 1, MEM_CGROUP_ID_MAX, | 
 | 4515 | 				 GFP_KERNEL); | 
 | 4516 | 	if (memcg->id.id < 0) | 
 | 4517 | 		goto fail; | 
 | 4518 |  | 
 | 4519 | 	memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu); | 
 | 4520 | 	if (!memcg->stat_cpu) | 
 | 4521 | 		goto fail; | 
 | 4522 |  | 
 | 4523 | 	for_each_node(node) | 
 | 4524 | 		if (alloc_mem_cgroup_per_node_info(memcg, node)) | 
 | 4525 | 			goto fail; | 
 | 4526 |  | 
 | 4527 | 	if (memcg_wb_domain_init(memcg, GFP_KERNEL)) | 
 | 4528 | 		goto fail; | 
 | 4529 |  | 
 | 4530 | 	INIT_WORK(&memcg->high_work, high_work_func); | 
 | 4531 | 	memcg->last_scanned_node = MAX_NUMNODES; | 
 | 4532 | 	INIT_LIST_HEAD(&memcg->oom_notify); | 
 | 4533 | 	mutex_init(&memcg->thresholds_lock); | 
 | 4534 | 	spin_lock_init(&memcg->move_lock); | 
 | 4535 | 	vmpressure_init(&memcg->vmpressure); | 
 | 4536 | 	INIT_LIST_HEAD(&memcg->event_list); | 
 | 4537 | 	spin_lock_init(&memcg->event_list_lock); | 
 | 4538 | 	memcg->socket_pressure = jiffies; | 
 | 4539 | #ifdef CONFIG_MEMCG_KMEM | 
 | 4540 | 	memcg->kmemcg_id = -1; | 
 | 4541 | #endif | 
 | 4542 | #ifdef CONFIG_CGROUP_WRITEBACK | 
 | 4543 | 	INIT_LIST_HEAD(&memcg->cgwb_list); | 
 | 4544 | #endif | 
 | 4545 | 	idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); | 
 | 4546 | 	return memcg; | 
 | 4547 | fail: | 
 | 4548 | 	mem_cgroup_id_remove(memcg); | 
 | 4549 | 	__mem_cgroup_free(memcg); | 
 | 4550 | 	return NULL; | 
 | 4551 | } | 
 | 4552 |  | 
 | 4553 | static struct cgroup_subsys_state * __ref | 
 | 4554 | mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | 
 | 4555 | { | 
 | 4556 | 	struct mem_cgroup *parent = mem_cgroup_from_css(parent_css); | 
 | 4557 | 	struct mem_cgroup *memcg; | 
 | 4558 | 	long error = -ENOMEM; | 
 | 4559 |  | 
 | 4560 | 	memcg = mem_cgroup_alloc(); | 
 | 4561 | 	if (!memcg) | 
 | 4562 | 		return ERR_PTR(error); | 
 | 4563 |  | 
 | 4564 | 	memcg->high = PAGE_COUNTER_MAX; | 
 | 4565 | 	memcg->soft_limit = PAGE_COUNTER_MAX; | 
 | 4566 | 	if (parent) { | 
 | 4567 | 		memcg->swappiness = mem_cgroup_swappiness(parent); | 
 | 4568 | 		memcg->oom_kill_disable = parent->oom_kill_disable; | 
 | 4569 | 	} | 
 | 4570 | 	if (parent && parent->use_hierarchy) { | 
 | 4571 | 		memcg->use_hierarchy = true; | 
 | 4572 | 		page_counter_init(&memcg->memory, &parent->memory); | 
 | 4573 | 		page_counter_init(&memcg->swap, &parent->swap); | 
 | 4574 | 		page_counter_init(&memcg->memsw, &parent->memsw); | 
 | 4575 | 		page_counter_init(&memcg->kmem, &parent->kmem); | 
 | 4576 | 		page_counter_init(&memcg->tcpmem, &parent->tcpmem); | 
 | 4577 | 	} else { | 
 | 4578 | 		page_counter_init(&memcg->memory, NULL); | 
 | 4579 | 		page_counter_init(&memcg->swap, NULL); | 
 | 4580 | 		page_counter_init(&memcg->memsw, NULL); | 
 | 4581 | 		page_counter_init(&memcg->kmem, NULL); | 
 | 4582 | 		page_counter_init(&memcg->tcpmem, NULL); | 
 | 4583 | 		/* | 
 | 4584 | 		 * A deeper hierarchy with use_hierarchy == false doesn't make | 
 | 4585 | 		 * much sense, so let the cgroup subsystem know about this | 
 | 4586 | 		 * unfortunate state in our controller. | 
 | 4587 | 		 */ | 
 | 4588 | 		if (parent != root_mem_cgroup) | 
 | 4589 | 			memory_cgrp_subsys.broken_hierarchy = true; | 
 | 4590 | 	} | 
 | 4591 |  | 
 | 4592 | 	/* The following stuff does not apply to the root */ | 
 | 4593 | 	if (!parent) { | 
 | 4594 | 		root_mem_cgroup = memcg; | 
 | 4595 | 		return &memcg->css; | 
 | 4596 | 	} | 
 | 4597 |  | 
 | 4598 | 	error = memcg_online_kmem(memcg); | 
 | 4599 | 	if (error) | 
 | 4600 | 		goto fail; | 
 | 4601 |  | 
 | 4602 | 	if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) | 
 | 4603 | 		static_branch_inc(&memcg_sockets_enabled_key); | 
 | 4604 |  | 
 | 4605 | 	return &memcg->css; | 
 | 4606 | fail: | 
 | 4607 | 	mem_cgroup_id_remove(memcg); | 
 | 4608 | 	mem_cgroup_free(memcg); | 
 | 4609 | 	return ERR_PTR(-ENOMEM); | 
 | 4610 | } | 
 | 4611 |  | 
 | 4612 | static int mem_cgroup_css_online(struct cgroup_subsys_state *css) | 
 | 4613 | { | 
 | 4614 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 
 | 4615 |  | 
 | 4616 | 	/* | 
 | 4617 | 	 * A memcg must be visible to memcg_expand_shrinker_maps() | 
 | 4618 | 	 * by the time the maps are allocated, so allocate the maps | 
 | 4619 | 	 * here, where for_each_mem_cgroup() can no longer skip it. | 
 | 4620 | 	 */ | 
 | 4621 | 	if (memcg_alloc_shrinker_maps(memcg)) { | 
 | 4622 | 		mem_cgroup_id_remove(memcg); | 
 | 4623 | 		return -ENOMEM; | 
 | 4624 | 	} | 
 | 4625 |  | 
 | 4626 | 	/* Online state pins memcg ID, memcg ID pins CSS */ | 
 | 4627 | 	atomic_set(&memcg->id.ref, 1); | 
 | 4628 | 	css_get(css); | 
 | 4629 | 	return 0; | 
 | 4630 | } | 
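 |      |  | 
 |      | /* | 
 |      |  * For orientation, the resulting reference chain is (informally): | 
 |      |  * | 
 |      |  *	online state  --pins-->  memcg ID  --pins-->  CSS | 
 |      |  * | 
 |      |  * Long-lived users such as swap-out records take extra ID references via | 
 |      |  * mem_cgroup_id_get_many(), which can keep the ID (and therefore the CSS) | 
 |      |  * alive after the cgroup has been offlined. | 
 |      |  */ | 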
 | 4631 |  | 
 | 4632 | static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | 
 | 4633 | { | 
 | 4634 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 
 | 4635 | 	struct mem_cgroup_event *event, *tmp; | 
 | 4636 |  | 
 | 4637 | 	/* | 
 | 4638 | 	 * Unregister events and notify userspace. | 
 | 4639 | 	 * Notify userspace about the cgroup's removal only after rmdir of the | 
 | 4640 | 	 * cgroup directory, to avoid a race between userspace and kernelspace. | 
 | 4641 | 	 */ | 
 | 4642 | 	spin_lock(&memcg->event_list_lock); | 
 | 4643 | 	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { | 
 | 4644 | 		list_del_init(&event->list); | 
 | 4645 | 		schedule_work(&event->remove); | 
 | 4646 | 	} | 
 | 4647 | 	spin_unlock(&memcg->event_list_lock); | 
 | 4648 |  | 
 | 4649 | 	page_counter_set_min(&memcg->memory, 0); | 
 | 4650 | 	page_counter_set_low(&memcg->memory, 0); | 
 | 4651 |  | 
 | 4652 | 	memcg_offline_kmem(memcg); | 
 | 4653 | 	wb_memcg_offline(memcg); | 
 | 4654 |  | 
 | 4655 | 	mem_cgroup_id_put(memcg); | 
 | 4656 | } | 
 | 4657 |  | 
 | 4658 | static void mem_cgroup_css_released(struct cgroup_subsys_state *css) | 
 | 4659 | { | 
 | 4660 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 
 | 4661 |  | 
 | 4662 | 	invalidate_reclaim_iterators(memcg); | 
 | 4663 | } | 
 | 4664 |  | 
 | 4665 | static void mem_cgroup_css_free(struct cgroup_subsys_state *css) | 
 | 4666 | { | 
 | 4667 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 
 | 4668 |  | 
 | 4669 | 	if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) | 
 | 4670 | 		static_branch_dec(&memcg_sockets_enabled_key); | 
 | 4671 |  | 
 | 4672 | 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active) | 
 | 4673 | 		static_branch_dec(&memcg_sockets_enabled_key); | 
 | 4674 |  | 
 | 4675 | 	vmpressure_cleanup(&memcg->vmpressure); | 
 | 4676 | 	cancel_work_sync(&memcg->high_work); | 
 | 4677 | 	mem_cgroup_remove_from_trees(memcg); | 
 | 4678 | 	memcg_free_shrinker_maps(memcg); | 
 | 4679 | 	memcg_free_kmem(memcg); | 
 | 4680 | 	mem_cgroup_free(memcg); | 
 | 4681 | } | 
 | 4682 |  | 
 | 4683 | /** | 
 | 4684 |  * mem_cgroup_css_reset - reset the states of a mem_cgroup | 
 | 4685 |  * @css: the target css | 
 | 4686 |  * | 
 | 4687 |  * Reset the states of the mem_cgroup associated with @css.  This is | 
 | 4688 |  * invoked when the userland requests disabling on the default hierarchy | 
 | 4689 |  * but the memcg is pinned through dependency.  The memcg should stop | 
 | 4690 |  * applying policies and should revert to the vanilla state as it may be | 
 | 4691 |  * made visible again. | 
 | 4692 |  * | 
 | 4693 |  * The current implementation only resets the essential configurations. | 
 | 4694 |  * This needs to be expanded to cover all the visible parts. | 
 | 4695 |  */ | 
 | 4696 | static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) | 
 | 4697 | { | 
 | 4698 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 
 | 4699 |  | 
 | 4700 | 	page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); | 
 | 4701 | 	page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); | 
 | 4702 | 	page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX); | 
 | 4703 | 	page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); | 
 | 4704 | 	page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); | 
 | 4705 | 	page_counter_set_min(&memcg->memory, 0); | 
 | 4706 | 	page_counter_set_low(&memcg->memory, 0); | 
 | 4707 | 	memcg->high = PAGE_COUNTER_MAX; | 
 | 4708 | 	memcg->soft_limit = PAGE_COUNTER_MAX; | 
 | 4709 | 	memcg_wb_domain_size_changed(memcg); | 
 | 4710 | } | 
 | 4711 |  | 
 | 4712 | #ifdef CONFIG_MMU | 
 | 4713 | /* Handlers for move charge at task migration. */ | 
 | 4714 | static int mem_cgroup_do_precharge(unsigned long count) | 
 | 4715 | { | 
 | 4716 | 	int ret; | 
 | 4717 |  | 
 | 4718 | 	/* Try a single bulk charge without reclaim first, kswapd may wake */ | 
 | 4719 | 	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count); | 
 | 4720 | 	if (!ret) { | 
 | 4721 | 		mc.precharge += count; | 
 | 4722 | 		return ret; | 
 | 4723 | 	} | 
 | 4724 |  | 
 | 4725 | 	/* Try charges one by one with reclaim, but do not retry */ | 
 | 4726 | 	while (count--) { | 
 | 4727 | 		ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1); | 
 | 4728 | 		if (ret) | 
 | 4729 | 			return ret; | 
 | 4730 | 		mc.precharge++; | 
 | 4731 | 		cond_resched(); | 
 | 4732 | 	} | 
 | 4733 | 	return 0; | 
 | 4734 | } | 
 | 4735 |  | 
 | 4736 | union mc_target { | 
 | 4737 | 	struct page	*page; | 
 | 4738 | 	swp_entry_t	ent; | 
 | 4739 | }; | 
 | 4740 |  | 
 | 4741 | enum mc_target_type { | 
 | 4742 | 	MC_TARGET_NONE = 0, | 
 | 4743 | 	MC_TARGET_PAGE, | 
 | 4744 | 	MC_TARGET_SWAP, | 
 | 4745 | 	MC_TARGET_DEVICE, | 
 | 4746 | }; | 
 | 4747 |  | 
 | 4748 | static struct page *mc_handle_present_pte(struct vm_area_struct *vma, | 
 | 4749 | 						unsigned long addr, pte_t ptent) | 
 | 4750 | { | 
 | 4751 | 	struct page *page = _vm_normal_page(vma, addr, ptent, true); | 
 | 4752 |  | 
 | 4753 | 	if (!page || !page_mapped(page)) | 
 | 4754 | 		return NULL; | 
 | 4755 | 	if (PageAnon(page)) { | 
 | 4756 | 		if (!(mc.flags & MOVE_ANON)) | 
 | 4757 | 			return NULL; | 
 | 4758 | 	} else { | 
 | 4759 | 		if (!(mc.flags & MOVE_FILE)) | 
 | 4760 | 			return NULL; | 
 | 4761 | 	} | 
 | 4762 | 	if (!get_page_unless_zero(page)) | 
 | 4763 | 		return NULL; | 
 | 4764 |  | 
 | 4765 | 	return page; | 
 | 4766 | } | 
 | 4767 |  | 
 | 4768 | #if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE) | 
 | 4769 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | 
 | 4770 | 			pte_t ptent, swp_entry_t *entry) | 
 | 4771 | { | 
 | 4772 | 	struct page *page = NULL; | 
 | 4773 | 	swp_entry_t ent = pte_to_swp_entry(ptent); | 
 | 4774 |  | 
 | 4775 | 	if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent)) | 
 | 4776 | 		return NULL; | 
 | 4777 |  | 
 | 4778 | 	/* | 
 | 4779 | 	 * Handle MEMORY_DEVICE_PRIVATE, i.e. ZONE_DEVICE pages belonging to | 
 | 4780 | 	 * a device; because they are not accessible by the CPU, they are stored | 
 | 4781 | 	 * as special swap entries in the CPU page table. | 
 | 4782 | 	 */ | 
 | 4783 | 	if (is_device_private_entry(ent)) { | 
 | 4784 | 		page = device_private_entry_to_page(ent); | 
 | 4785 | 		/* | 
 | 4786 | 		 * MEMORY_DEVICE_PRIVATE means a ZONE_DEVICE page, which has | 
 | 4787 | 		 * a refcount of 1 when free (unlike a normal page). | 
 | 4788 | 		 */ | 
 | 4789 | 		if (!page_ref_add_unless(page, 1, 1)) | 
 | 4790 | 			return NULL; | 
 | 4791 | 		return page; | 
 | 4792 | 	} | 
 | 4793 |  | 
 | 4794 | 	/* | 
 | 4795 | 	 * Because lookup_swap_cache() updates some statistics counters, | 
 | 4796 | 	 * we call find_get_page() on the swap address space directly. | 
 | 4797 | 	 */ | 
 | 4798 | 	page = find_get_page(swap_address_space(ent), swp_offset(ent)); | 
 | 4799 | 	if (do_memsw_account()) | 
 | 4800 | 		entry->val = ent.val; | 
 | 4801 |  | 
 | 4802 | 	return page; | 
 | 4803 | } | 
 | 4804 | #else | 
 | 4805 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | 
 | 4806 | 			pte_t ptent, swp_entry_t *entry) | 
 | 4807 | { | 
 | 4808 | 	return NULL; | 
 | 4809 | } | 
 | 4810 | #endif | 
 | 4811 |  | 
 | 4812 | static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | 
 | 4813 | 			unsigned long addr, pte_t ptent, swp_entry_t *entry) | 
 | 4814 | { | 
 | 4815 | 	struct page *page = NULL; | 
 | 4816 | 	struct address_space *mapping; | 
 | 4817 | 	pgoff_t pgoff; | 
 | 4818 |  | 
 | 4819 | 	if (!vma->vm_file) /* anonymous vma */ | 
 | 4820 | 		return NULL; | 
 | 4821 | 	if (!(mc.flags & MOVE_FILE)) | 
 | 4822 | 		return NULL; | 
 | 4823 |  | 
 | 4824 | 	mapping = vma->vm_file->f_mapping; | 
 | 4825 | 	pgoff = linear_page_index(vma, addr); | 
 | 4826 |  | 
 | 4827 | 	/* The page is moved even if it's not part of this task's RSS (page-faulted). */ | 
 | 4828 | #ifdef CONFIG_SWAP | 
 | 4829 | 	/* shmem/tmpfs may report page out on swap: account for that too. */ | 
 | 4830 | 	if (shmem_mapping(mapping)) { | 
 | 4831 | 		page = find_get_entry(mapping, pgoff); | 
 | 4832 | 		if (radix_tree_exceptional_entry(page)) { | 
 | 4833 | 			swp_entry_t swp = radix_to_swp_entry(page); | 
 | 4834 | 			if (do_memsw_account()) | 
 | 4835 | 				*entry = swp; | 
 | 4836 | 			page = find_get_page(swap_address_space(swp), | 
 | 4837 | 					     swp_offset(swp)); | 
 | 4838 | 		} | 
 | 4839 | 	} else | 
 | 4840 | 		page = find_get_page(mapping, pgoff); | 
 | 4841 | #else | 
 | 4842 | 	page = find_get_page(mapping, pgoff); | 
 | 4843 | #endif | 
 | 4844 | 	return page; | 
 | 4845 | } | 
 | 4846 |  | 
 | 4847 | /** | 
 | 4848 |  * mem_cgroup_move_account - move account of the page | 
 | 4849 |  * @page: the page | 
 | 4850 |  * @compound: charge the page as compound or small page | 
 | 4851 |  * @from: mem_cgroup which the page is moved from. | 
 | 4852 |  * @to:	mem_cgroup which the page is moved to. @from != @to. | 
 | 4853 |  * | 
 | 4854 |  * The caller must make sure the page is not on the LRU (isolate_lru_page() is useful). | 
 | 4855 |  * | 
 | 4856 |  * This function doesn't do a "charge" to the new cgroup and doesn't do an | 
 | 4857 |  * "uncharge" from the old cgroup. | 
 | 4858 |  */ | 
 | 4859 | static int mem_cgroup_move_account(struct page *page, | 
 | 4860 | 				   bool compound, | 
 | 4861 | 				   struct mem_cgroup *from, | 
 | 4862 | 				   struct mem_cgroup *to) | 
 | 4863 | { | 
 | 4864 | 	unsigned long flags; | 
 | 4865 | 	unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; | 
 | 4866 | 	int ret; | 
 | 4867 | 	bool anon; | 
 | 4868 |  | 
 | 4869 | 	VM_BUG_ON(from == to); | 
 | 4870 | 	VM_BUG_ON_PAGE(PageLRU(page), page); | 
 | 4871 | 	VM_BUG_ON(compound && !PageTransHuge(page)); | 
 | 4872 |  | 
 | 4873 | 	/* | 
 | 4874 | 	 * Prevent mem_cgroup_migrate() from looking at | 
 | 4875 | 	 * page->mem_cgroup of its source page while we change it. | 
 | 4876 | 	 */ | 
 | 4877 | 	ret = -EBUSY; | 
 | 4878 | 	if (!trylock_page(page)) | 
 | 4879 | 		goto out; | 
 | 4880 |  | 
 | 4881 | 	ret = -EINVAL; | 
 | 4882 | 	if (page->mem_cgroup != from) | 
 | 4883 | 		goto out_unlock; | 
 | 4884 |  | 
 | 4885 | 	anon = PageAnon(page); | 
 | 4886 |  | 
 | 4887 | 	spin_lock_irqsave(&from->move_lock, flags); | 
 | 4888 |  | 
 | 4889 | 	if (!anon && page_mapped(page)) { | 
 | 4890 | 		__mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages); | 
 | 4891 | 		__mod_memcg_state(to, NR_FILE_MAPPED, nr_pages); | 
 | 4892 | 	} | 
 | 4893 |  | 
 | 4894 | 	/* | 
 | 4895 | 	 * move_lock was grabbed above and the caller set from->moving_account, | 
 | 4896 | 	 * so mod_memcg_page_state() will serialize updates to PageDirty. | 
 | 4897 | 	 * The mapping should therefore be stable for dirty pages. | 
 | 4898 | 	 */ | 
 | 4899 | 	if (!anon && PageDirty(page)) { | 
 | 4900 | 		struct address_space *mapping = page_mapping(page); | 
 | 4901 |  | 
 | 4902 | 		if (mapping_cap_account_dirty(mapping)) { | 
 | 4903 | 			__mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages); | 
 | 4904 | 			__mod_memcg_state(to, NR_FILE_DIRTY, nr_pages); | 
 | 4905 | 		} | 
 | 4906 | 	} | 
 | 4907 |  | 
 | 4908 | 	if (PageWriteback(page)) { | 
 | 4909 | 		__mod_memcg_state(from, NR_WRITEBACK, -nr_pages); | 
 | 4910 | 		__mod_memcg_state(to, NR_WRITEBACK, nr_pages); | 
 | 4911 | 	} | 
 | 4912 |  | 
 | 4913 | 	/* | 
 | 4914 | 	 * It is safe to change page->mem_cgroup here because the page | 
 | 4915 | 	 * is referenced, charged, and isolated - we can't race with | 
 | 4916 | 	 * uncharging, charging, migration, or LRU putback. | 
 | 4917 | 	 */ | 
 | 4918 |  | 
 | 4919 | 	/* caller should have done css_get */ | 
 | 4920 | 	page->mem_cgroup = to; | 
 | 4921 | 	spin_unlock_irqrestore(&from->move_lock, flags); | 
 | 4922 |  | 
 | 4923 | 	ret = 0; | 
 | 4924 |  | 
 | 4925 | 	local_irq_disable(); | 
 | 4926 | 	mem_cgroup_charge_statistics(to, page, compound, nr_pages); | 
 | 4927 | 	memcg_check_events(to, page); | 
 | 4928 | 	mem_cgroup_charge_statistics(from, page, compound, -nr_pages); | 
 | 4929 | 	memcg_check_events(from, page); | 
 | 4930 | 	local_irq_enable(); | 
 | 4931 | out_unlock: | 
 | 4932 | 	unlock_page(page); | 
 | 4933 | out: | 
 | 4934 | 	return ret; | 
 | 4935 | } | 
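 |      |  | 
 |      | /* | 
 |      |  * A sketched caller sequence that follows the rules above; the real callers | 
 |      |  * live in mem_cgroup_move_charge_pte_range() below.  The final put_page() | 
 |      |  * drops the reference taken by get_mctgt_type(): | 
 |      |  * | 
 |      |  *	if (!isolate_lru_page(page)) { | 
 |      |  *		if (!mem_cgroup_move_account(page, false, mc.from, mc.to)) | 
 |      |  *			mc.moved_charge++; | 
 |      |  *		putback_lru_page(page); | 
 |      |  *	} | 
 |      |  *	put_page(page); | 
 |      |  */ | 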
 | 4936 |  | 
 | 4937 | /** | 
 | 4938 |  * get_mctgt_type - get target type of moving charge | 
 | 4939 |  * @vma: the vma to which the pte to be checked belongs | 
 | 4940 |  * @addr: the address corresponding to the pte to be checked | 
 | 4941 |  * @ptent: the pte to be checked | 
 | 4942 |  * @target: the pointer where the target page or swap entry will be stored (can be NULL) | 
 | 4943 |  * | 
 | 4944 |  * Returns | 
 | 4945 |  *   0(MC_TARGET_NONE): if the pte is not a target for move charge. | 
 | 4946 |  *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for | 
 | 4947 |  *     move charge. If @target is not NULL, the page is stored in target->page | 
 | 4948 |  *     with an extra refcount taken (callers should handle it). | 
 | 4949 |  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a | 
 | 4950 |  *     target for charge migration. If @target is not NULL, the entry is stored | 
 | 4951 |  *     in target->ent. | 
 | 4952 |  *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is MEMORY_DEVICE_PUBLIC | 
 | 4953 |  *     or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru). | 
 | 4954 |  *     For now such a page is charged like a regular page would be, as for all | 
 | 4955 |  *     intents and purposes it is just special memory taking the place of a | 
 | 4956 |  *     regular page. | 
 | 4957 |  * | 
 | 4958 |  *     See Documentation/vm/hmm.txt and include/linux/hmm.h | 
 | 4959 |  * | 
 | 4960 |  * Called with pte lock held. | 
 | 4961 |  */ | 
 | 4962 |  | 
 | 4963 | static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, | 
 | 4964 | 		unsigned long addr, pte_t ptent, union mc_target *target) | 
 | 4965 | { | 
 | 4966 | 	struct page *page = NULL; | 
 | 4967 | 	enum mc_target_type ret = MC_TARGET_NONE; | 
 | 4968 | 	swp_entry_t ent = { .val = 0 }; | 
 | 4969 |  | 
 | 4970 | 	if (pte_present(ptent)) | 
 | 4971 | 		page = mc_handle_present_pte(vma, addr, ptent); | 
 | 4972 | 	else if (is_swap_pte(ptent)) | 
 | 4973 | 		page = mc_handle_swap_pte(vma, ptent, &ent); | 
 | 4974 | 	else if (pte_none(ptent)) | 
 | 4975 | 		page = mc_handle_file_pte(vma, addr, ptent, &ent); | 
 | 4976 |  | 
 | 4977 | 	if (!page && !ent.val) | 
 | 4978 | 		return ret; | 
 | 4979 | 	if (page) { | 
 | 4980 | 		/* | 
 | 4981 | 		 * Do only a loose check without serialization. | 
 | 4982 | 		 * mem_cgroup_move_account() checks whether the page is | 
 | 4983 | 		 * valid under LRU exclusion. | 
 | 4984 | 		 */ | 
 | 4985 | 		if (page->mem_cgroup == mc.from) { | 
 | 4986 | 			ret = MC_TARGET_PAGE; | 
 | 4987 | 			if (is_device_private_page(page) || | 
 | 4988 | 			    is_device_public_page(page)) | 
 | 4989 | 				ret = MC_TARGET_DEVICE; | 
 | 4990 | 			if (target) | 
 | 4991 | 				target->page = page; | 
 | 4992 | 		} | 
 | 4993 | 		if (!ret || !target) | 
 | 4994 | 			put_page(page); | 
 | 4995 | 	} | 
 | 4996 | 	/* | 
 | 4997 | 	 * There is a swap entry and the page doesn't exist or isn't charged. | 
 | 4998 | 	 * But we cannot move a tail page of a THP. | 
 | 4999 | 	 */ | 
 | 5000 | 	if (ent.val && !ret && (!page || !PageTransCompound(page)) && | 
 | 5001 | 	    mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { | 
 | 5002 | 		ret = MC_TARGET_SWAP; | 
 | 5003 | 		if (target) | 
 | 5004 | 			target->ent = ent; | 
 | 5005 | 	} | 
 | 5006 | 	return ret; | 
 | 5007 | } | 
 | 5008 |  | 
 | 5009 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 
 | 5010 | /* | 
 | 5011 |  * We don't consider PMD-mapped swapping or file-mapped pages because THP does | 
 | 5012 |  * not support them for now. | 
 | 5013 |  * Caller should make sure that pmd_trans_huge(pmd) is true. | 
 | 5014 |  */ | 
 | 5015 | static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, | 
 | 5016 | 		unsigned long addr, pmd_t pmd, union mc_target *target) | 
 | 5017 | { | 
 | 5018 | 	struct page *page = NULL; | 
 | 5019 | 	enum mc_target_type ret = MC_TARGET_NONE; | 
 | 5020 |  | 
 | 5021 | 	if (unlikely(is_swap_pmd(pmd))) { | 
 | 5022 | 		VM_BUG_ON(thp_migration_supported() && | 
 | 5023 | 				  !is_pmd_migration_entry(pmd)); | 
 | 5024 | 		return ret; | 
 | 5025 | 	} | 
 | 5026 | 	page = pmd_page(pmd); | 
 | 5027 | 	VM_BUG_ON_PAGE(!page || !PageHead(page), page); | 
 | 5028 | 	if (!(mc.flags & MOVE_ANON)) | 
 | 5029 | 		return ret; | 
 | 5030 | 	if (page->mem_cgroup == mc.from) { | 
 | 5031 | 		ret = MC_TARGET_PAGE; | 
 | 5032 | 		if (target) { | 
 | 5033 | 			get_page(page); | 
 | 5034 | 			target->page = page; | 
 | 5035 | 		} | 
 | 5036 | 	} | 
 | 5037 | 	return ret; | 
 | 5038 | } | 
 | 5039 | #else | 
 | 5040 | static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, | 
 | 5041 | 		unsigned long addr, pmd_t pmd, union mc_target *target) | 
 | 5042 | { | 
 | 5043 | 	return MC_TARGET_NONE; | 
 | 5044 | } | 
 | 5045 | #endif | 
 | 5046 |  | 
 | 5047 | static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | 
 | 5048 | 					unsigned long addr, unsigned long end, | 
 | 5049 | 					struct mm_walk *walk) | 
 | 5050 | { | 
 | 5051 | 	struct vm_area_struct *vma = walk->vma; | 
 | 5052 | 	pte_t *pte; | 
 | 5053 | 	spinlock_t *ptl; | 
 | 5054 |  | 
 | 5055 | 	ptl = pmd_trans_huge_lock(pmd, vma); | 
 | 5056 | 	if (ptl) { | 
 | 5057 | 		/* | 
 | 5058 | 		 * Note there cannot be MC_TARGET_DEVICE for now, as we do not | 
 | 5059 | 		 * support transparent huge pages with MEMORY_DEVICE_PUBLIC or | 
 | 5060 | 		 * MEMORY_DEVICE_PRIVATE, but this might change. | 
 | 5061 | 		 */ | 
 | 5062 | 		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) | 
 | 5063 | 			mc.precharge += HPAGE_PMD_NR; | 
 | 5064 | 		spin_unlock(ptl); | 
 | 5065 | 		return 0; | 
 | 5066 | 	} | 
 | 5067 |  | 
 | 5068 | 	if (pmd_trans_unstable(pmd)) | 
 | 5069 | 		return 0; | 
 | 5070 | 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 
 | 5071 | 	for (; addr != end; pte++, addr += PAGE_SIZE) | 
 | 5072 | 		if (get_mctgt_type(vma, addr, *pte, NULL)) | 
 | 5073 | 			mc.precharge++;	/* increment precharge temporarily */ | 
 | 5074 | 	pte_unmap_unlock(pte - 1, ptl); | 
 | 5075 | 	cond_resched(); | 
 | 5076 |  | 
 | 5077 | 	return 0; | 
 | 5078 | } | 
 | 5079 |  | 
 | 5080 | static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | 
 | 5081 | { | 
 | 5082 | 	unsigned long precharge; | 
 | 5083 |  | 
 | 5084 | 	struct mm_walk mem_cgroup_count_precharge_walk = { | 
 | 5085 | 		.pmd_entry = mem_cgroup_count_precharge_pte_range, | 
 | 5086 | 		.mm = mm, | 
 | 5087 | 	}; | 
 | 5088 | 	down_read(&mm->mmap_sem); | 
 | 5089 | 	walk_page_range(0, mm->highest_vm_end, | 
 | 5090 | 			&mem_cgroup_count_precharge_walk); | 
 | 5091 | 	up_read(&mm->mmap_sem); | 
 | 5092 |  | 
 | 5093 | 	precharge = mc.precharge; | 
 | 5094 | 	mc.precharge = 0; | 
 | 5095 |  | 
 | 5096 | 	return precharge; | 
 | 5097 | } | 
 | 5098 |  | 
 | 5099 | static int mem_cgroup_precharge_mc(struct mm_struct *mm) | 
 | 5100 | { | 
 | 5101 | 	unsigned long precharge = mem_cgroup_count_precharge(mm); | 
 | 5102 |  | 
 | 5103 | 	VM_BUG_ON(mc.moving_task); | 
 | 5104 | 	mc.moving_task = current; | 
 | 5105 | 	return mem_cgroup_do_precharge(precharge); | 
 | 5106 | } | 
 | 5107 |  | 
 | 5108 | /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ | 
 | 5109 | static void __mem_cgroup_clear_mc(void) | 
 | 5110 | { | 
 | 5111 | 	struct mem_cgroup *from = mc.from; | 
 | 5112 | 	struct mem_cgroup *to = mc.to; | 
 | 5113 |  | 
 | 5114 | 	/* we must uncharge all the leftover precharges from mc.to */ | 
 | 5115 | 	if (mc.precharge) { | 
 | 5116 | 		cancel_charge(mc.to, mc.precharge); | 
 | 5117 | 		mc.precharge = 0; | 
 | 5118 | 	} | 
 | 5119 | 	/* | 
 | 5120 | 	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so | 
 | 5121 | 	 * we must uncharge here. | 
 | 5122 | 	 */ | 
 | 5123 | 	if (mc.moved_charge) { | 
 | 5124 | 		cancel_charge(mc.from, mc.moved_charge); | 
 | 5125 | 		mc.moved_charge = 0; | 
 | 5126 | 	} | 
 | 5127 | 	/* we must fixup refcnts and charges */ | 
 | 5128 | 	if (mc.moved_swap) { | 
 | 5129 | 		/* uncharge swap account from the old cgroup */ | 
 | 5130 | 		if (!mem_cgroup_is_root(mc.from)) | 
 | 5131 | 			page_counter_uncharge(&mc.from->memsw, mc.moved_swap); | 
 | 5132 |  | 
 | 5133 | 		mem_cgroup_id_put_many(mc.from, mc.moved_swap); | 
 | 5134 |  | 
 | 5135 | 		/* | 
 | 5136 | 		 * we charged both to->memory and to->memsw, so we | 
 | 5137 | 		 * should uncharge to->memory. | 
 | 5138 | 		 */ | 
 | 5139 | 		if (!mem_cgroup_is_root(mc.to)) | 
 | 5140 | 			page_counter_uncharge(&mc.to->memory, mc.moved_swap); | 
 | 5141 |  | 
 | 5142 | 		mem_cgroup_id_get_many(mc.to, mc.moved_swap); | 
 | 5143 | 		css_put_many(&mc.to->css, mc.moved_swap); | 
 | 5144 |  | 
 | 5145 | 		mc.moved_swap = 0; | 
 | 5146 | 	} | 
 | 5147 | 	memcg_oom_recover(from); | 
 | 5148 | 	memcg_oom_recover(to); | 
 | 5149 | 	wake_up_all(&mc.waitq); | 
 | 5150 | } | 
 | 5151 |  | 
 | 5152 | static void mem_cgroup_clear_mc(void) | 
 | 5153 | { | 
 | 5154 | 	struct mm_struct *mm = mc.mm; | 
 | 5155 |  | 
 | 5156 | 	/* | 
 | 5157 | 	 * we must clear moving_task before waking up waiters at the end of | 
 | 5158 | 	 * task migration. | 
 | 5159 | 	 */ | 
 | 5160 | 	mc.moving_task = NULL; | 
 | 5161 | 	__mem_cgroup_clear_mc(); | 
 | 5162 | 	spin_lock(&mc.lock); | 
 | 5163 | 	mc.from = NULL; | 
 | 5164 | 	mc.to = NULL; | 
 | 5165 | 	mc.mm = NULL; | 
 | 5166 | 	spin_unlock(&mc.lock); | 
 | 5167 |  | 
 | 5168 | 	mmput(mm); | 
 | 5169 | } | 
 | 5170 |  | 
 | 5171 | static int mem_cgroup_can_attach(struct cgroup_taskset *tset) | 
 | 5172 | { | 
 | 5173 | 	struct cgroup_subsys_state *css; | 
 | 5174 | 	struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */ | 
 | 5175 | 	struct mem_cgroup *from; | 
 | 5176 | 	struct task_struct *leader, *p; | 
 | 5177 | 	struct mm_struct *mm; | 
 | 5178 | 	unsigned long move_flags; | 
 | 5179 | 	int ret = 0; | 
 | 5180 |  | 
 | 5181 | 	/* charge immigration isn't supported on the default hierarchy */ | 
 | 5182 | 	if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) | 
 | 5183 | 		return 0; | 
 | 5184 |  | 
 | 5185 | 	/* | 
 | 5186 | 	 * Multi-process migrations only happen on the default hierarchy | 
 | 5187 | 	 * where charge immigration is not used.  Perform charge | 
 | 5188 | 	 * immigration if @tset contains a leader and whine if there are | 
 | 5189 | 	 * multiple. | 
 | 5190 | 	 */ | 
 | 5191 | 	p = NULL; | 
 | 5192 | 	cgroup_taskset_for_each_leader(leader, css, tset) { | 
 | 5193 | 		WARN_ON_ONCE(p); | 
 | 5194 | 		p = leader; | 
 | 5195 | 		memcg = mem_cgroup_from_css(css); | 
 | 5196 | 	} | 
 | 5197 | 	if (!p) | 
 | 5198 | 		return 0; | 
 | 5199 |  | 
 | 5200 | 	/* | 
 | 5201 | 	 * We are now committed to this value, whatever it is. Changes to this | 
 | 5202 | 	 * tunable will only affect upcoming migrations, not the current one, | 
 | 5203 | 	 * so we need to save it and keep using it. | 
 | 5204 | 	 */ | 
 | 5205 | 	move_flags = READ_ONCE(memcg->move_charge_at_immigrate); | 
 | 5206 | 	if (!move_flags) | 
 | 5207 | 		return 0; | 
 | 5208 |  | 
 | 5209 | 	from = mem_cgroup_from_task(p); | 
 | 5210 |  | 
 | 5211 | 	VM_BUG_ON(from == memcg); | 
 | 5212 |  | 
 | 5213 | 	mm = get_task_mm(p); | 
 | 5214 | 	if (!mm) | 
 | 5215 | 		return 0; | 
 | 5216 | 	/* We move charges only when we move the owner of the mm */ | 
 | 5217 | 	if (mm->owner == p) { | 
 | 5218 | 		VM_BUG_ON(mc.from); | 
 | 5219 | 		VM_BUG_ON(mc.to); | 
 | 5220 | 		VM_BUG_ON(mc.precharge); | 
 | 5221 | 		VM_BUG_ON(mc.moved_charge); | 
 | 5222 | 		VM_BUG_ON(mc.moved_swap); | 
 | 5223 |  | 
 | 5224 | 		spin_lock(&mc.lock); | 
 | 5225 | 		mc.mm = mm; | 
 | 5226 | 		mc.from = from; | 
 | 5227 | 		mc.to = memcg; | 
 | 5228 | 		mc.flags = move_flags; | 
 | 5229 | 		spin_unlock(&mc.lock); | 
 | 5230 | 		/* We set mc.moving_task later */ | 
 | 5231 |  | 
 | 5232 | 		ret = mem_cgroup_precharge_mc(mm); | 
 | 5233 | 		if (ret) | 
 | 5234 | 			mem_cgroup_clear_mc(); | 
 | 5235 | 	} else { | 
 | 5236 | 		mmput(mm); | 
 | 5237 | 	} | 
 | 5238 | 	return ret; | 
 | 5239 | } | 
 | 5240 |  | 
 | 5241 | static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) | 
 | 5242 | { | 
 | 5243 | 	if (mc.to) | 
 | 5244 | 		mem_cgroup_clear_mc(); | 
 | 5245 | } | 
 | 5246 |  | 
 | 5247 | static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | 
 | 5248 | 				unsigned long addr, unsigned long end, | 
 | 5249 | 				struct mm_walk *walk) | 
 | 5250 | { | 
 | 5251 | 	int ret = 0; | 
 | 5252 | 	struct vm_area_struct *vma = walk->vma; | 
 | 5253 | 	pte_t *pte; | 
 | 5254 | 	spinlock_t *ptl; | 
 | 5255 | 	enum mc_target_type target_type; | 
 | 5256 | 	union mc_target target; | 
 | 5257 | 	struct page *page; | 
 | 5258 |  | 
 | 5259 | 	ptl = pmd_trans_huge_lock(pmd, vma); | 
 | 5260 | 	if (ptl) { | 
 | 5261 | 		if (mc.precharge < HPAGE_PMD_NR) { | 
 | 5262 | 			spin_unlock(ptl); | 
 | 5263 | 			return 0; | 
 | 5264 | 		} | 
 | 5265 | 		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); | 
 | 5266 | 		if (target_type == MC_TARGET_PAGE) { | 
 | 5267 | 			page = target.page; | 
 | 5268 | 			if (!isolate_lru_page(page)) { | 
 | 5269 | 				if (!mem_cgroup_move_account(page, true, | 
 | 5270 | 							     mc.from, mc.to)) { | 
 | 5271 | 					mc.precharge -= HPAGE_PMD_NR; | 
 | 5272 | 					mc.moved_charge += HPAGE_PMD_NR; | 
 | 5273 | 				} | 
 | 5274 | 				putback_lru_page(page); | 
 | 5275 | 			} | 
 | 5276 | 			put_page(page); | 
 | 5277 | 		} else if (target_type == MC_TARGET_DEVICE) { | 
 | 5278 | 			page = target.page; | 
 | 5279 | 			if (!mem_cgroup_move_account(page, true, | 
 | 5280 | 						     mc.from, mc.to)) { | 
 | 5281 | 				mc.precharge -= HPAGE_PMD_NR; | 
 | 5282 | 				mc.moved_charge += HPAGE_PMD_NR; | 
 | 5283 | 			} | 
 | 5284 | 			put_page(page); | 
 | 5285 | 		} | 
 | 5286 | 		spin_unlock(ptl); | 
 | 5287 | 		return 0; | 
 | 5288 | 	} | 
 | 5289 |  | 
 | 5290 | 	if (pmd_trans_unstable(pmd)) | 
 | 5291 | 		return 0; | 
 | 5292 | retry: | 
 | 5293 | 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 
 | 5294 | 	for (; addr != end; addr += PAGE_SIZE) { | 
 | 5295 | 		pte_t ptent = *(pte++); | 
 | 5296 | 		bool device = false; | 
 | 5297 | 		swp_entry_t ent; | 
 | 5298 |  | 
 | 5299 | 		if (!mc.precharge) | 
 | 5300 | 			break; | 
 | 5301 |  | 
 | 5302 | 		switch (get_mctgt_type(vma, addr, ptent, &target)) { | 
 | 5303 | 		case MC_TARGET_DEVICE: | 
 | 5304 | 			device = true; | 
 | 5305 | 			/* fall through */ | 
 | 5306 | 		case MC_TARGET_PAGE: | 
 | 5307 | 			page = target.page; | 
 | 5308 | 			/* | 
 | 5309 | 			 * We can have a part of a split pmd here. Moving it | 
 | 5310 | 			 * could be done but would be too convoluted, so simply | 
 | 5311 | 			 * ignore such a partial THP and keep it in the original | 
 | 5312 | 			 * memcg. There should be somebody mapping the head. | 
 | 5313 | 			 */ | 
 | 5314 | 			if (PageTransCompound(page)) | 
 | 5315 | 				goto put; | 
 | 5316 | 			if (!device && isolate_lru_page(page)) | 
 | 5317 | 				goto put; | 
 | 5318 | 			if (!mem_cgroup_move_account(page, false, | 
 | 5319 | 						mc.from, mc.to)) { | 
 | 5320 | 				mc.precharge--; | 
 | 5321 | 				/* we uncharge from mc.from later. */ | 
 | 5322 | 				mc.moved_charge++; | 
 | 5323 | 			} | 
 | 5324 | 			if (!device) | 
 | 5325 | 				putback_lru_page(page); | 
 | 5326 | put:			/* get_mctgt_type() gets the page */ | 
 | 5327 | 			put_page(page); | 
 | 5328 | 			break; | 
 | 5329 | 		case MC_TARGET_SWAP: | 
 | 5330 | 			ent = target.ent; | 
 | 5331 | 			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { | 
 | 5332 | 				mc.precharge--; | 
 | 5333 | 				/* we fixup refcnts and charges later. */ | 
 | 5334 | 				mc.moved_swap++; | 
 | 5335 | 			} | 
 | 5336 | 			break; | 
 | 5337 | 		default: | 
 | 5338 | 			break; | 
 | 5339 | 		} | 
 | 5340 | 	} | 
 | 5341 | 	pte_unmap_unlock(pte - 1, ptl); | 
 | 5342 | 	cond_resched(); | 
 | 5343 |  | 
 | 5344 | 	if (addr != end) { | 
 | 5345 | 		/* | 
 | 5346 | 		 * We have consumed all the precharges we got in can_attach(). | 
 | 5347 | 		 * We try charging one by one, but don't do any additional | 
 | 5348 | 		 * charges to mc.to if we have already failed to charge once in | 
 | 5349 | 		 * the attach() phase. | 
 | 5350 | 		 */ | 
 | 5351 | 		ret = mem_cgroup_do_precharge(1); | 
 | 5352 | 		if (!ret) | 
 | 5353 | 			goto retry; | 
 | 5354 | 	} | 
 | 5355 |  | 
 | 5356 | 	return ret; | 
 | 5357 | } | 
 | 5358 |  | 
 | 5359 | static void mem_cgroup_move_charge(void) | 
 | 5360 | { | 
 | 5361 | 	struct mm_walk mem_cgroup_move_charge_walk = { | 
 | 5362 | 		.pmd_entry = mem_cgroup_move_charge_pte_range, | 
 | 5363 | 		.mm = mc.mm, | 
 | 5364 | 	}; | 
 | 5365 |  | 
 | 5366 | 	lru_add_drain_all(); | 
 | 5367 | 	/* | 
 | 5368 | 	 * Signal lock_page_memcg() to take the memcg's move_lock | 
 | 5369 | 	 * while we're moving its pages to another memcg. Then wait | 
 | 5370 | 	 * for already started RCU-only updates to finish. | 
 | 5371 | 	 */ | 
 | 5372 | 	atomic_inc(&mc.from->moving_account); | 
 | 5373 | 	synchronize_rcu(); | 
 | 5374 | retry: | 
 | 5375 | 	if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) { | 
 | 5376 | 		/* | 
 | 5377 | 		 * Someone who is holding the mmap_sem might be waiting on | 
 | 5378 | 		 * the waitq. So we cancel all extra charges, wake up all waiters, | 
 | 5379 | 		 * and retry. Because we cancel the precharges, we might not be able | 
 | 5380 | 		 * to move enough charges, but moving charge is a best-effort | 
 | 5381 | 		 * feature anyway, so it isn't a big problem. | 
 | 5382 | 		 */ | 
 | 5383 | 		__mem_cgroup_clear_mc(); | 
 | 5384 | 		cond_resched(); | 
 | 5385 | 		goto retry; | 
 | 5386 | 	} | 
 | 5387 | 	/* | 
 | 5388 | 	 * When we have consumed all precharges and failed to do an | 
 | 5389 | 	 * additional charge, the page walk simply aborts. | 
 | 5390 | 	 */ | 
 | 5391 | 	walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk); | 
 | 5392 |  | 
 | 5393 | 	up_read(&mc.mm->mmap_sem); | 
 | 5394 | 	atomic_dec(&mc.from->moving_account); | 
 | 5395 | } | 
 | 5396 |  | 
 | 5397 | static void mem_cgroup_move_task(void) | 
 | 5398 | { | 
 | 5399 | 	if (mc.to) { | 
 | 5400 | 		mem_cgroup_move_charge(); | 
 | 5401 | 		mem_cgroup_clear_mc(); | 
 | 5402 | 	} | 
 | 5403 | } | 
 | 5404 | #else	/* !CONFIG_MMU */ | 
 | 5405 | static int mem_cgroup_can_attach(struct cgroup_taskset *tset) | 
 | 5406 | { | 
 | 5407 | 	return 0; | 
 | 5408 | } | 
 | 5409 | static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset) | 
 | 5410 | { | 
 | 5411 | } | 
 | 5412 | static void mem_cgroup_move_task(void) | 
 | 5413 | { | 
 | 5414 | } | 
 | 5415 | #endif | 
 | 5416 |  | 
 | 5417 | /* | 
 | 5418 |  * The cgroup core retains root cgroups across [un]mount cycles, making it necessary | 
 | 5419 |  * to verify whether we're attached to the default hierarchy on each mount | 
 | 5420 |  * attempt. | 
 | 5421 |  */ | 
 | 5422 | static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) | 
 | 5423 | { | 
 | 5424 | 	/* | 
 | 5425 | 	 * use_hierarchy is forced on the default hierarchy.  cgroup core | 
 | 5426 | 	 * guarantees that @root doesn't have any children, so turning it | 
 | 5427 | 	 * on for the root memcg is enough. | 
 | 5428 | 	 */ | 
 | 5429 | 	if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) | 
 | 5430 | 		root_mem_cgroup->use_hierarchy = true; | 
 | 5431 | 	else | 
 | 5432 | 		root_mem_cgroup->use_hierarchy = false; | 
 | 5433 | } | 
 | 5434 |  | 
 | 5435 | static u64 memory_current_read(struct cgroup_subsys_state *css, | 
 | 5436 | 			       struct cftype *cft) | 
 | 5437 | { | 
 | 5438 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 
 | 5439 |  | 
 | 5440 | 	return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; | 
 | 5441 | } | 
 | 5442 |  | 
 | 5443 | static int memory_min_show(struct seq_file *m, void *v) | 
 | 5444 | { | 
 | 5445 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 
 | 5446 | 	unsigned long min = READ_ONCE(memcg->memory.min); | 
 | 5447 |  | 
 | 5448 | 	if (min == PAGE_COUNTER_MAX) | 
 | 5449 | 		seq_puts(m, "max\n"); | 
 | 5450 | 	else | 
 | 5451 | 		seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE); | 
 | 5452 |  | 
 | 5453 | 	return 0; | 
 | 5454 | } | 
 | 5455 |  | 
 | 5456 | static ssize_t memory_min_write(struct kernfs_open_file *of, | 
 | 5457 | 				char *buf, size_t nbytes, loff_t off) | 
 | 5458 | { | 
 | 5459 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | 
 | 5460 | 	unsigned long min; | 
 | 5461 | 	int err; | 
 | 5462 |  | 
 | 5463 | 	buf = strstrip(buf); | 
 | 5464 | 	err = page_counter_memparse(buf, "max", &min); | 
 | 5465 | 	if (err) | 
 | 5466 | 		return err; | 
 | 5467 |  | 
 | 5468 | 	page_counter_set_min(&memcg->memory, min); | 
 | 5469 |  | 
 | 5470 | 	return nbytes; | 
 | 5471 | } | 
 | 5472 |  | 
 | 5473 | static int memory_low_show(struct seq_file *m, void *v) | 
 | 5474 | { | 
 | 5475 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 
 | 5476 | 	unsigned long low = READ_ONCE(memcg->memory.low); | 
 | 5477 |  | 
 | 5478 | 	if (low == PAGE_COUNTER_MAX) | 
 | 5479 | 		seq_puts(m, "max\n"); | 
 | 5480 | 	else | 
 | 5481 | 		seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE); | 
 | 5482 |  | 
 | 5483 | 	return 0; | 
 | 5484 | } | 
 | 5485 |  | 
 | 5486 | static ssize_t memory_low_write(struct kernfs_open_file *of, | 
 | 5487 | 				char *buf, size_t nbytes, loff_t off) | 
 | 5488 | { | 
 | 5489 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | 
 | 5490 | 	unsigned long low; | 
 | 5491 | 	int err; | 
 | 5492 |  | 
 | 5493 | 	buf = strstrip(buf); | 
 | 5494 | 	err = page_counter_memparse(buf, "max", &low); | 
 | 5495 | 	if (err) | 
 | 5496 | 		return err; | 
 | 5497 |  | 
 | 5498 | 	page_counter_set_low(&memcg->memory, low); | 
 | 5499 |  | 
 | 5500 | 	return nbytes; | 
 | 5501 | } | 
 | 5502 |  | 
 | 5503 | static int memory_high_show(struct seq_file *m, void *v) | 
 | 5504 | { | 
 | 5505 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 
 | 5506 | 	unsigned long high = READ_ONCE(memcg->high); | 
 | 5507 |  | 
 | 5508 | 	if (high == PAGE_COUNTER_MAX) | 
 | 5509 | 		seq_puts(m, "max\n"); | 
 | 5510 | 	else | 
 | 5511 | 		seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE); | 
 | 5512 |  | 
 | 5513 | 	return 0; | 
 | 5514 | } | 
 | 5515 |  | 
 | 5516 | static ssize_t memory_high_write(struct kernfs_open_file *of, | 
 | 5517 | 				 char *buf, size_t nbytes, loff_t off) | 
 | 5518 | { | 
 | 5519 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | 
 | 5520 | 	unsigned long nr_pages; | 
 | 5521 | 	unsigned long high; | 
 | 5522 | 	int err; | 
 | 5523 |  | 
 | 5524 | 	buf = strstrip(buf); | 
 | 5525 | 	err = page_counter_memparse(buf, "max", &high); | 
 | 5526 | 	if (err) | 
 | 5527 | 		return err; | 
 | 5528 |  | 
 | 5529 | 	memcg->high = high; | 
 | 5530 |  | 
 | 5531 | 	nr_pages = page_counter_read(&memcg->memory); | 
 | 5532 | 	if (nr_pages > high) | 
 | 5533 | 		try_to_free_mem_cgroup_pages(memcg, nr_pages - high, | 
 | 5534 | 					     GFP_KERNEL, true); | 
 | 5535 |  | 
 | 5536 | 	memcg_wb_domain_size_changed(memcg); | 
 | 5537 | 	return nbytes; | 
 | 5538 | } | 
 | 5539 |  | 
 | 5540 | static int memory_max_show(struct seq_file *m, void *v) | 
 | 5541 | { | 
 | 5542 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 
 | 5543 | 	unsigned long max = READ_ONCE(memcg->memory.max); | 
 | 5544 |  | 
 | 5545 | 	if (max == PAGE_COUNTER_MAX) | 
 | 5546 | 		seq_puts(m, "max\n"); | 
 | 5547 | 	else | 
 | 5548 | 		seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); | 
 | 5549 |  | 
 | 5550 | 	return 0; | 
 | 5551 | } | 
 | 5552 |  | 
 | 5553 | static ssize_t memory_max_write(struct kernfs_open_file *of, | 
 | 5554 | 				char *buf, size_t nbytes, loff_t off) | 
 | 5555 | { | 
 | 5556 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | 
 | 5557 | 	unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES; | 
 | 5558 | 	bool drained = false; | 
 | 5559 | 	unsigned long max; | 
 | 5560 | 	int err; | 
 | 5561 |  | 
 | 5562 | 	buf = strstrip(buf); | 
 | 5563 | 	err = page_counter_memparse(buf, "max", &max); | 
 | 5564 | 	if (err) | 
 | 5565 | 		return err; | 
 | 5566 |  | 
 | 5567 | 	xchg(&memcg->memory.max, max); | 
 | 5568 |  | 
 | 5569 | 	for (;;) { | 
 | 5570 | 		unsigned long nr_pages = page_counter_read(&memcg->memory); | 
 | 5571 |  | 
 | 5572 | 		if (nr_pages <= max) | 
 | 5573 | 			break; | 
 | 5574 |  | 
 | 5575 | 		if (signal_pending(current)) { | 
 | 5576 | 			err = -EINTR; | 
 | 5577 | 			break; | 
 | 5578 | 		} | 
 | 5579 |  | 
 | 5580 | 		if (!drained) { | 
 | 5581 | 			drain_all_stock(memcg); | 
 | 5582 | 			drained = true; | 
 | 5583 | 			continue; | 
 | 5584 | 		} | 
 | 5585 |  | 
 | 5586 | 		if (nr_reclaims) { | 
 | 5587 | 			if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, | 
 | 5588 | 							  GFP_KERNEL, true)) | 
 | 5589 | 				nr_reclaims--; | 
 | 5590 | 			continue; | 
 | 5591 | 		} | 
 | 5592 |  | 
 | 5593 | 		memcg_memory_event(memcg, MEMCG_OOM); | 
 | 5594 | 		if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) | 
 | 5595 | 			break; | 
 | 5596 | 	} | 
 | 5597 |  | 
 | 5598 | 	memcg_wb_domain_size_changed(memcg); | 
 | 5599 | 	return nbytes; | 
 | 5600 | } | 
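 |      |  | 
 |      | /* | 
 |      |  * Userspace view, as a rough sketch (the cgroup path below is hypothetical): | 
 |      |  * | 
 |      |  *	int fd = open("/sys/fs/cgroup/foo/memory.max", O_WRONLY); | 
 |      |  *	write(fd, "512M", 4); | 
 |      |  *	close(fd); | 
 |      |  * | 
 |      |  * The string is parsed by page_counter_memparse() and the write lands in | 
 |      |  * memory_max_write() above, which lowers the limit first and then reclaims, | 
 |      |  * falling back to the OOM killer, until the cgroup fits under the new limit | 
 |      |  * or the writer is interrupted by a signal. | 
 |      |  */ | 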
 | 5601 |  | 
 | 5602 | static int memory_events_show(struct seq_file *m, void *v) | 
 | 5603 | { | 
 | 5604 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 
 | 5605 |  | 
 | 5606 | 	seq_printf(m, "low %lu\n", | 
 | 5607 | 		   atomic_long_read(&memcg->memory_events[MEMCG_LOW])); | 
 | 5608 | 	seq_printf(m, "high %lu\n", | 
 | 5609 | 		   atomic_long_read(&memcg->memory_events[MEMCG_HIGH])); | 
 | 5610 | 	seq_printf(m, "max %lu\n", | 
 | 5611 | 		   atomic_long_read(&memcg->memory_events[MEMCG_MAX])); | 
 | 5612 | 	seq_printf(m, "oom %lu\n", | 
 | 5613 | 		   atomic_long_read(&memcg->memory_events[MEMCG_OOM])); | 
 | 5614 | 	seq_printf(m, "oom_kill %lu\n", | 
 | 5615 | 		   atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL])); | 
 | 5616 |  | 
 | 5617 | 	return 0; | 
 | 5618 | } | 
 | 5619 |  | 
 | 5620 | static int memory_stat_show(struct seq_file *m, void *v) | 
 | 5621 | { | 
 | 5622 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 
 | 5623 | 	struct accumulated_stats acc; | 
 | 5624 | 	int i; | 
 | 5625 |  | 
 | 5626 | 	/* | 
 | 5627 | 	 * Provide statistics on the state of the memory subsystem as | 
 | 5628 | 	 * well as cumulative event counters that show past behavior. | 
 | 5629 | 	 * | 
 | 5630 | 	 * This list is ordered following a combination of these gradients: | 
 | 5631 | 	 * 1) generic big picture -> specifics and details | 
 | 5632 | 	 * 2) reflecting userspace activity -> reflecting kernel heuristics | 
 | 5633 | 	 * | 
 | 5634 | 	 * Current memory state: | 
 | 5635 | 	 */ | 
 | 5636 |  | 
 | 5637 | 	memset(&acc, 0, sizeof(acc)); | 
 | 5638 | 	acc.stats_size = MEMCG_NR_STAT; | 
 | 5639 | 	acc.events_size = NR_VM_EVENT_ITEMS; | 
 | 5640 | 	accumulate_memcg_tree(memcg, &acc); | 
 | 5641 |  | 
 | 5642 | 	seq_printf(m, "anon %llu\n", | 
 | 5643 | 		   (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE); | 
 | 5644 | 	seq_printf(m, "file %llu\n", | 
 | 5645 | 		   (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE); | 
 | 5646 | 	seq_printf(m, "kernel_stack %llu\n", | 
 | 5647 | 		   (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024); | 
 | 5648 | 	seq_printf(m, "slab %llu\n", | 
 | 5649 | 		   (u64)(acc.stat[NR_SLAB_RECLAIMABLE] + | 
 | 5650 | 			 acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); | 
 | 5651 | 	seq_printf(m, "sock %llu\n", | 
 | 5652 | 		   (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE); | 
 | 5653 |  | 
 | 5654 | 	seq_printf(m, "shmem %llu\n", | 
 | 5655 | 		   (u64)acc.stat[NR_SHMEM] * PAGE_SIZE); | 
 | 5656 | 	seq_printf(m, "file_mapped %llu\n", | 
 | 5657 | 		   (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE); | 
 | 5658 | 	seq_printf(m, "file_dirty %llu\n", | 
 | 5659 | 		   (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE); | 
 | 5660 | 	seq_printf(m, "file_writeback %llu\n", | 
 | 5661 | 		   (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE); | 
 | 5662 |  | 
 | 5663 | 	for (i = 0; i < NR_LRU_LISTS; i++) | 
 | 5664 | 		seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i], | 
 | 5665 | 			   (u64)acc.lru_pages[i] * PAGE_SIZE); | 
 | 5666 |  | 
 | 5667 | 	seq_printf(m, "slab_reclaimable %llu\n", | 
 | 5668 | 		   (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE); | 
 | 5669 | 	seq_printf(m, "slab_unreclaimable %llu\n", | 
 | 5670 | 		   (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE); | 
 | 5671 |  | 
 | 5672 | 	/* Accumulated memory events */ | 
 | 5673 |  | 
 | 5674 | 	seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]); | 
 | 5675 | 	seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]); | 
 | 5676 |  | 
 | 5677 | 	seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]); | 
 | 5678 | 	seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] + | 
 | 5679 | 		   acc.events[PGSCAN_DIRECT]); | 
 | 5680 | 	seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] + | 
 | 5681 | 		   acc.events[PGSTEAL_DIRECT]); | 
 | 5682 | 	seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]); | 
 | 5683 | 	seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]); | 
 | 5684 | 	seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]); | 
 | 5685 | 	seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]); | 
 | 5686 |  | 
 | 5687 | 	seq_printf(m, "workingset_refault %lu\n", | 
 | 5688 | 		   acc.stat[WORKINGSET_REFAULT]); | 
 | 5689 | 	seq_printf(m, "workingset_activate %lu\n", | 
 | 5690 | 		   acc.stat[WORKINGSET_ACTIVATE]); | 
 | 5691 | 	seq_printf(m, "workingset_nodereclaim %lu\n", | 
 | 5692 | 		   acc.stat[WORKINGSET_NODERECLAIM]); | 
 | 5693 |  | 
 | 5694 | 	return 0; | 
 | 5695 | } | 
 | 5696 |  | 
 | 5697 | static int memory_oom_group_show(struct seq_file *m, void *v) | 
 | 5698 | { | 
 | 5699 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 
 | 5700 |  | 
 | 5701 | 	seq_printf(m, "%d\n", memcg->oom_group); | 
 | 5702 |  | 
 | 5703 | 	return 0; | 
 | 5704 | } | 
 | 5705 |  | 
 | 5706 | static ssize_t memory_oom_group_write(struct kernfs_open_file *of, | 
 | 5707 | 				      char *buf, size_t nbytes, loff_t off) | 
 | 5708 | { | 
 | 5709 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | 
 | 5710 | 	int ret, oom_group; | 
 | 5711 |  | 
 | 5712 | 	buf = strstrip(buf); | 
 | 5713 | 	if (!buf) | 
 | 5714 | 		return -EINVAL; | 
 | 5715 |  | 
 | 5716 | 	ret = kstrtoint(buf, 0, &oom_group); | 
 | 5717 | 	if (ret) | 
 | 5718 | 		return ret; | 
 | 5719 |  | 
 | 5720 | 	if (oom_group != 0 && oom_group != 1) | 
 | 5721 | 		return -EINVAL; | 
 | 5722 |  | 
 | 5723 | 	memcg->oom_group = oom_group; | 
 | 5724 |  | 
 | 5725 | 	return nbytes; | 
 | 5726 | } | 
 | 5727 |  | 
 | 5728 | static struct cftype memory_files[] = { | 
 | 5729 | 	{ | 
 | 5730 | 		.name = "current", | 
 | 5731 | 		.flags = CFTYPE_NOT_ON_ROOT, | 
 | 5732 | 		.read_u64 = memory_current_read, | 
 | 5733 | 	}, | 
 | 5734 | 	{ | 
 | 5735 | 		.name = "min", | 
 | 5736 | 		.flags = CFTYPE_NOT_ON_ROOT, | 
 | 5737 | 		.seq_show = memory_min_show, | 
 | 5738 | 		.write = memory_min_write, | 
 | 5739 | 	}, | 
 | 5740 | 	{ | 
 | 5741 | 		.name = "low", | 
 | 5742 | 		.flags = CFTYPE_NOT_ON_ROOT, | 
 | 5743 | 		.seq_show = memory_low_show, | 
 | 5744 | 		.write = memory_low_write, | 
 | 5745 | 	}, | 
 | 5746 | 	{ | 
 | 5747 | 		.name = "high", | 
 | 5748 | 		.flags = CFTYPE_NOT_ON_ROOT, | 
 | 5749 | 		.seq_show = memory_high_show, | 
 | 5750 | 		.write = memory_high_write, | 
 | 5751 | 	}, | 
 | 5752 | 	{ | 
 | 5753 | 		.name = "max", | 
 | 5754 | 		.flags = CFTYPE_NOT_ON_ROOT, | 
 | 5755 | 		.seq_show = memory_max_show, | 
 | 5756 | 		.write = memory_max_write, | 
 | 5757 | 	}, | 
 | 5758 | 	{ | 
 | 5759 | 		.name = "events", | 
 | 5760 | 		.flags = CFTYPE_NOT_ON_ROOT, | 
 | 5761 | 		.file_offset = offsetof(struct mem_cgroup, events_file), | 
 | 5762 | 		.seq_show = memory_events_show, | 
 | 5763 | 	}, | 
 | 5764 | 	{ | 
 | 5765 | 		.name = "stat", | 
 | 5766 | 		.flags = CFTYPE_NOT_ON_ROOT, | 
 | 5767 | 		.seq_show = memory_stat_show, | 
 | 5768 | 	}, | 
 | 5769 | 	{ | 
 | 5770 | 		.name = "oom.group", | 
 | 5771 | 		.flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE, | 
 | 5772 | 		.seq_show = memory_oom_group_show, | 
 | 5773 | 		.write = memory_oom_group_write, | 
 | 5774 | 	}, | 
 | 5775 | 	{ }	/* terminate */ | 
 | 5776 | }; | 
 | 5777 |  | 
 | 5778 | struct cgroup_subsys memory_cgrp_subsys = { | 
 | 5779 | 	.css_alloc = mem_cgroup_css_alloc, | 
 | 5780 | 	.css_online = mem_cgroup_css_online, | 
 | 5781 | 	.css_offline = mem_cgroup_css_offline, | 
 | 5782 | 	.css_released = mem_cgroup_css_released, | 
 | 5783 | 	.css_free = mem_cgroup_css_free, | 
 | 5784 | 	.css_reset = mem_cgroup_css_reset, | 
 | 5785 | 	.can_attach = mem_cgroup_can_attach, | 
 | 5786 | 	.cancel_attach = mem_cgroup_cancel_attach, | 
 | 5787 | 	.post_attach = mem_cgroup_move_task, | 
 | 5788 | 	.bind = mem_cgroup_bind, | 
 | 5789 | 	.dfl_cftypes = memory_files, | 
 | 5790 | 	.legacy_cftypes = mem_cgroup_legacy_files, | 
 | 5791 | 	.early_init = 0, | 
 | 5792 | }; | 
 | 5793 |  | 
 | 5794 | /** | 
 | 5795 |  * mem_cgroup_protected - check if memory consumption is in the normal range | 
 | 5796 |  * @root: the top ancestor of the sub-tree being checked | 
 | 5797 |  * @memcg: the memory cgroup to check | 
 | 5798 |  * | 
 | 5799 |  * WARNING: This function is not stateless! It can only be used as part | 
 | 5800 |  *          of a top-down tree iteration, not for isolated queries. | 
 | 5801 |  * | 
 | 5802 |  * Returns one of the following: | 
 | 5803 |  *   MEMCG_PROT_NONE: cgroup memory is not protected | 
 | 5804 |  *   MEMCG_PROT_LOW: cgroup memory is protected as long as there is | 
 | 5805 |  *     an unprotected supply of reclaimable memory from other cgroups. | 
 | 5806 |  *   MEMCG_PROT_MIN: cgroup memory is protected | 
 | 5807 |  * | 
 | 5808 |  * @root is exclusive; it is never protected when looked at directly | 
 | 5809 |  * | 
 | 5810 |  * To provide proper hierarchical behavior, effective memory.min/low values | 
 | 5811 |  * are used. Below is a description of how effective memory.low is calculated. | 
 | 5812 |  * The effective memory.min value is calculated in the same way. | 
 | 5813 |  * | 
 | 5814 |  * Effective memory.low is always equal to or less than the original memory.low. | 
 | 5815 |  * If there is no memory.low overcommitment (which is always true for | 
 | 5816 |  * top-level memory cgroups), these two values are equal. | 
 | 5817 |  * Otherwise, it's a share of the parent's effective memory.low, proportional | 
 | 5818 |  * to the cgroup's memory.low usage divided by the sum of its siblings' | 
 | 5819 |  * memory.low usages, where memory.low usage is the size of actually | 
 | 5820 |  * protected memory. | 
 | 5821 |  * | 
 | 5822 |  *                                             low_usage | 
 | 5823 |  * elow = min( memory.low, parent->elow * ------------------ ), | 
 | 5824 |  *                                        siblings_low_usage | 
 | 5825 |  * | 
 | 5826 |  *             | memory.current, if memory.current < memory.low | 
 | 5827 |  * low_usage = | | 
 | 5828 |  *             | 0, otherwise. | 
 | 5829 |  * | 
 | 5830 |  * | 
 | 5831 |  * This definition of the effective memory.low provides the expected | 
 | 5832 |  * hierarchical behavior: the parent's memory.low value limits its | 
 | 5833 |  * children, unprotected memory is reclaimed first, and cgroups | 
 | 5834 |  * that are not using their guarantee do not affect the actual memory | 
 | 5835 |  * distribution. | 
 | 5836 |  * | 
 | 5837 |  * For example, if there are memcgs A, A/B, A/C, A/D and A/E: | 
 | 5838 |  * | 
 | 5839 |  *     A      A/memory.low = 2G, A/memory.current = 6G | 
 | 5840 |  *    //\\ | 
 | 5841 |  *   BC  DE   B/memory.low = 3G  B/memory.current = 2G | 
 | 5842 |  *            C/memory.low = 1G  C/memory.current = 2G | 
 | 5843 |  *            D/memory.low = 0   D/memory.current = 2G | 
 | 5844 |  *            E/memory.low = 10G E/memory.current = 0 | 
 | 5845 |  * | 
 | 5846 |  * and the memory pressure is applied, the following memory distribution | 
 | 5847 |  * is expected (approximately): | 
 | 5848 |  * | 
 | 5849 |  *     A/memory.current = 2G | 
 | 5850 |  * | 
 | 5851 |  *     B/memory.current = 1.3G | 
 | 5852 |  *     C/memory.current = 0.6G | 
 | 5853 |  *     D/memory.current = 0 | 
 | 5854 |  *     E/memory.current = 0 | 
 | 5855 |  * | 
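 |      |  * As a sketch of the arithmetic behind those numbers: low_usage is | 
 |      |  * min(memory.current, memory.low), so B's low_usage is 2G, C's is 1G, | 
 |      |  * D's and E's are 0, and siblings_low_usage = 3G.  With A's elow = 2G: | 
 |      |  * | 
 |      |  *     B: elow = min(3G, 2G * 2G / 3G) ~= 1.3G | 
 |      |  *     C: elow = min(1G, 2G * 1G / 3G) ~= 0.6G | 
 |      |  * | 
 |      |  * D has no protection configured and E has no usage to protect. | 
 |      |  * | 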
 | 5856 |  * These calculations require constant tracking of the actual low usages | 
 | 5857 |  * (see propagate_protected_usage()), as well as recursive calculation of | 
 | 5858 |  * effective memory.low values. But as mem_cgroup_protected() is called | 
 | 5859 |  * for each memory cgroup top-down from the reclaim path, | 
 | 5860 |  * it's possible to optimize this part and save the calculated elow | 
 | 5861 |  * for the next use. This part is intentionally racy, but it's OK, | 
 | 5862 |  * as memory.low is a best-effort mechanism. | 
 | 5863 |  */ | 
 | 5864 | enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, | 
 | 5865 | 						struct mem_cgroup *memcg) | 
 | 5866 | { | 
 | 5867 | 	struct mem_cgroup *parent; | 
 | 5868 | 	unsigned long emin, parent_emin; | 
 | 5869 | 	unsigned long elow, parent_elow; | 
 | 5870 | 	unsigned long usage; | 
 | 5871 |  | 
 | 5872 | 	if (mem_cgroup_disabled()) | 
 | 5873 | 		return MEMCG_PROT_NONE; | 
 | 5874 |  | 
 | 5875 | 	if (!root) | 
 | 5876 | 		root = root_mem_cgroup; | 
 | 5877 | 	if (memcg == root) | 
 | 5878 | 		return MEMCG_PROT_NONE; | 
 | 5879 |  | 
 | 5880 | 	usage = page_counter_read(&memcg->memory); | 
 | 5881 | 	if (!usage) | 
 | 5882 | 		return MEMCG_PROT_NONE; | 
 | 5883 |  | 
 | 5884 | 	emin = memcg->memory.min; | 
 | 5885 | 	elow = memcg->memory.low; | 
 | 5886 |  | 
 | 5887 | 	parent = parent_mem_cgroup(memcg); | 
 | 5888 | 	/* No parent means a non-hierarchical mode on v1 memcg */ | 
 | 5889 | 	if (!parent) | 
 | 5890 | 		return MEMCG_PROT_NONE; | 
 | 5891 |  | 
 | 5892 | 	if (parent == root) | 
 | 5893 | 		goto exit; | 
 | 5894 |  | 
 | 5895 | 	parent_emin = READ_ONCE(parent->memory.emin); | 
 | 5896 | 	emin = min(emin, parent_emin); | 
 | 5897 | 	if (emin && parent_emin) { | 
 | 5898 | 		unsigned long min_usage, siblings_min_usage; | 
 | 5899 |  | 
 | 5900 | 		min_usage = min(usage, memcg->memory.min); | 
 | 5901 | 		siblings_min_usage = atomic_long_read( | 
 | 5902 | 			&parent->memory.children_min_usage); | 
 | 5903 |  | 
 | 5904 | 		if (min_usage && siblings_min_usage) | 
 | 5905 | 			emin = min(emin, parent_emin * min_usage / | 
 | 5906 | 				   siblings_min_usage); | 
 | 5907 | 	} | 
 | 5908 |  | 
 | 5909 | 	parent_elow = READ_ONCE(parent->memory.elow); | 
 | 5910 | 	elow = min(elow, parent_elow); | 
 | 5911 | 	if (elow && parent_elow) { | 
 | 5912 | 		unsigned long low_usage, siblings_low_usage; | 
 | 5913 |  | 
 | 5914 | 		low_usage = min(usage, memcg->memory.low); | 
 | 5915 | 		siblings_low_usage = atomic_long_read( | 
 | 5916 | 			&parent->memory.children_low_usage); | 
 | 5917 |  | 
 | 5918 | 		if (low_usage && siblings_low_usage) | 
 | 5919 | 			elow = min(elow, parent_elow * low_usage / | 
 | 5920 | 				   siblings_low_usage); | 
 | 5921 | 	} | 
 | 5922 |  | 
 | 5923 | exit: | 
 | 5924 | 	memcg->memory.emin = emin; | 
 | 5925 | 	memcg->memory.elow = elow; | 
 | 5926 |  | 
 | 5927 | 	if (usage <= emin) | 
 | 5928 | 		return MEMCG_PROT_MIN; | 
 | 5929 | 	else if (usage <= elow) | 
 | 5930 | 		return MEMCG_PROT_LOW; | 
 | 5931 | 	else | 
 | 5932 | 		return MEMCG_PROT_NONE; | 
 | 5933 | } | 
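/*
 * Worked example (editorial sketch, not part of memcontrol.c): the
 * arithmetic behind the B/C figures quoted in the comment above
 * mem_cgroup_protected(), written as a standalone userspace C program.
 * effective_low() and the GB macro are illustrative names; the clamping
 * mirrors the elow computation above, with values kept in bytes for
 * readability (the kernel works in pages).
 */
#include <stdio.h>

#define GB	(1ULL << 30)

static unsigned long long effective_low(unsigned long long low,
					unsigned long long usage,
					unsigned long long parent_elow,
					unsigned long long siblings_low_usage)
{
	/* elow = min(memory.low, parent->elow) */
	unsigned long long elow = low < parent_elow ? low : parent_elow;
	/* low_usage = min(usage, memory.low), as in the kernel code */
	unsigned long long low_usage = usage < low ? usage : low;

	if (elow && parent_elow && low_usage && siblings_low_usage) {
		unsigned long long scaled =
			parent_elow * low_usage / siblings_low_usage;

		if (scaled < elow)
			elow = scaled;
	}
	return elow;
}

int main(void)
{
	/* A is top level, so A's elow equals its memory.low of 2G */
	unsigned long long a_elow = 2 * GB;
	/* children_low_usage of A: B contributes 2G, C 1G, D and E 0 */
	unsigned long long siblings = 2 * GB + 1 * GB;

	printf("B/elow = %.2f GB\n",
	       (double)effective_low(3 * GB, 2 * GB, a_elow, siblings) / GB);
	printf("C/elow = %.2f GB\n",
	       (double)effective_low(1 * GB, 2 * GB, a_elow, siblings) / GB);
	return 0;	/* prints ~1.33 GB and ~0.67 GB, matching the example */
}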
 | 5934 |  | 
 | 5935 | /** | 
 | 5936 |  * mem_cgroup_try_charge - try charging a page | 
 | 5937 |  * @page: page to charge | 
 | 5938 |  * @mm: mm context of the victim | 
 | 5939 |  * @gfp_mask: reclaim mode | 
 | 5940 |  * @memcgp: charged memcg return | 
 | 5941 |  * @compound: charge the page as compound or small page | 
 | 5942 |  * | 
 | 5943 |  * Try to charge @page to the memcg that @mm belongs to, reclaiming | 
 | 5944 |  * pages according to @gfp_mask if necessary. | 
 | 5945 |  * | 
 | 5946 |  * Returns 0 on success, with *@memcgp pointing to the charged memcg. | 
 | 5947 |  * Otherwise, an error code is returned. | 
 | 5948 |  * | 
 | 5949 |  * After page->mapping has been set up, the caller must finalize the | 
 | 5950 |  * charge with mem_cgroup_commit_charge().  Or abort the transaction | 
 | 5951 |  * with mem_cgroup_cancel_charge() in case page instantiation fails. | 
 | 5952 |  */ | 
 | 5953 | int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, | 
 | 5954 | 			  gfp_t gfp_mask, struct mem_cgroup **memcgp, | 
 | 5955 | 			  bool compound) | 
 | 5956 | { | 
 | 5957 | 	struct mem_cgroup *memcg = NULL; | 
 | 5958 | 	unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; | 
 | 5959 | 	int ret = 0; | 
 | 5960 |  | 
 | 5961 | 	if (mem_cgroup_disabled()) | 
 | 5962 | 		goto out; | 
 | 5963 |  | 
 | 5964 | 	if (PageSwapCache(page)) { | 
 | 5965 | 		/* | 
 | 5966 | 		 * Every swap fault against a single page tries to charge the | 
 | 5967 | 		 * page, bail as early as possible.  shmem_unuse() encounters | 
 | 5968 | 		 * already charged pages, too.  The USED bit is protected by | 
 | 5969 | 		 * the page lock, which serializes swap cache removal, which | 
 | 5970 | 		 * in turn serializes uncharging. | 
 | 5971 | 		 */ | 
 | 5972 | 		VM_BUG_ON_PAGE(!PageLocked(page), page); | 
 | 5973 | 		if (compound_head(page)->mem_cgroup) | 
 | 5974 | 			goto out; | 
 | 5975 |  | 
 | 5976 | 		if (do_swap_account) { | 
 | 5977 | 			swp_entry_t ent = { .val = page_private(page), }; | 
 | 5978 | 			unsigned short id = lookup_swap_cgroup_id(ent); | 
 | 5979 |  | 
 | 5980 | 			rcu_read_lock(); | 
 | 5981 | 			memcg = mem_cgroup_from_id(id); | 
 | 5982 | 			if (memcg && !css_tryget_online(&memcg->css)) | 
 | 5983 | 				memcg = NULL; | 
 | 5984 | 			rcu_read_unlock(); | 
 | 5985 | 		} | 
 | 5986 | 	} | 
 | 5987 |  | 
 | 5988 | 	if (!memcg) | 
 | 5989 | 		memcg = get_mem_cgroup_from_mm(mm); | 
 | 5990 |  | 
 | 5991 | 	ret = try_charge(memcg, gfp_mask, nr_pages); | 
 | 5992 |  | 
 | 5993 | 	css_put(&memcg->css); | 
 | 5994 | out: | 
 | 5995 | 	*memcgp = memcg; | 
 | 5996 | 	return ret; | 
 | 5997 | } | 
 | 5998 |  | 
 | 5999 | int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm, | 
 | 6000 | 			  gfp_t gfp_mask, struct mem_cgroup **memcgp, | 
 | 6001 | 			  bool compound) | 
 | 6002 | { | 
 | 6003 | 	struct mem_cgroup *memcg; | 
 | 6004 | 	int ret; | 
 | 6005 |  | 
 | 6006 | 	ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound); | 
 | 6007 | 	memcg = *memcgp; | 
 | 6008 | 	mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask); | 
 | 6009 | 	return ret; | 
 | 6010 | } | 
 | 6011 |  | 
 | 6012 | /** | 
 | 6013 |  * mem_cgroup_commit_charge - commit a page charge | 
 | 6014 |  * @page: page to charge | 
 | 6015 |  * @memcg: memcg to charge the page to | 
 | 6016 |  * @lrucare: page might be on LRU already | 
 | 6017 |  * @compound: charge the page as compound or small page | 
 | 6018 |  * | 
 | 6019 |  * Finalize a charge transaction started by mem_cgroup_try_charge(), | 
 | 6020 |  * after page->mapping has been set up.  This must happen atomically | 
 | 6021 |  * as part of the page instantiation, i.e. under the page table lock | 
 | 6022 |  * for anonymous pages, under the page lock for page and swap cache. | 
 | 6023 |  * | 
 | 6024 |  * In addition, the page must not be on the LRU during the commit, to | 
 | 6025 |  * prevent racing with task migration.  If it might be, use @lrucare. | 
 | 6026 |  * | 
 | 6027 |  * Use mem_cgroup_cancel_charge() to cancel the transaction instead. | 
 | 6028 |  */ | 
 | 6029 | void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, | 
 | 6030 | 			      bool lrucare, bool compound) | 
 | 6031 | { | 
 | 6032 | 	unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; | 
 | 6033 |  | 
 | 6034 | 	VM_BUG_ON_PAGE(!page->mapping, page); | 
 | 6035 | 	VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); | 
 | 6036 |  | 
 | 6037 | 	if (mem_cgroup_disabled()) | 
 | 6038 | 		return; | 
 | 6039 | 	/* | 
 | 6040 | 	 * Swap faults will attempt to charge the same page multiple | 
 | 6041 | 	 * times.  But reuse_swap_page() might have removed the page | 
 | 6042 | 	 * from swapcache already, so we can't check PageSwapCache(). | 
 | 6043 | 	 */ | 
 | 6044 | 	if (!memcg) | 
 | 6045 | 		return; | 
 | 6046 |  | 
 | 6047 | 	commit_charge(page, memcg, lrucare); | 
 | 6048 |  | 
 | 6049 | 	local_irq_disable(); | 
 | 6050 | 	mem_cgroup_charge_statistics(memcg, page, compound, nr_pages); | 
 | 6051 | 	memcg_check_events(memcg, page); | 
 | 6052 | 	local_irq_enable(); | 
 | 6053 |  | 
 | 6054 | 	if (do_memsw_account() && PageSwapCache(page)) { | 
 | 6055 | 		swp_entry_t entry = { .val = page_private(page) }; | 
 | 6056 | 		/* | 
 | 6057 | 		 * The swap entry might not get freed for a long time, | 
 | 6058 | 		 * let's not wait for it.  The page already received a | 
 | 6059 | 		 * memory+swap charge, drop the swap entry duplicate. | 
 | 6060 | 		 */ | 
 | 6061 | 		mem_cgroup_uncharge_swap(entry, nr_pages); | 
 | 6062 | 	} | 
 | 6063 | } | 
 | 6064 |  | 
 | 6065 | /** | 
 | 6066 |  * mem_cgroup_cancel_charge - cancel a page charge | 
 | 6067 |  * @page: page to charge | 
 | 6068 |  * @memcg: memcg to charge the page to | 
 | 6069 |  * @compound: charge the page as compound or small page | 
 | 6070 |  * | 
 | 6071 |  * Cancel a charge transaction started by mem_cgroup_try_charge(). | 
 | 6072 |  */ | 
 | 6073 | void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, | 
 | 6074 | 		bool compound) | 
 | 6075 | { | 
 | 6076 | 	unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; | 
 | 6077 |  | 
 | 6078 | 	if (mem_cgroup_disabled()) | 
 | 6079 | 		return; | 
 | 6080 | 	/* | 
 | 6081 | 	 * Swap faults will attempt to charge the same page multiple | 
 | 6082 | 	 * times.  But reuse_swap_page() might have removed the page | 
 | 6083 | 	 * from swapcache already, so we can't check PageSwapCache(). | 
 | 6084 | 	 */ | 
 | 6085 | 	if (!memcg) | 
 | 6086 | 		return; | 
 | 6087 |  | 
 | 6088 | 	cancel_charge(memcg, nr_pages); | 
 | 6089 | } | 
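/*
 * Editorial sketch of the try/commit/cancel protocol documented above;
 * this is not a function from memcontrol.c.  install_page() is a
 * hypothetical helper standing in for whatever page-table or page-cache
 * insertion a real caller performs; the flow is modeled on existing
 * charge sites such as the anonymous fault path.
 */
static int charge_protocol_sketch(struct page *page, struct mm_struct *mm,
				  gfp_t gfp_mask)
{
	struct mem_cgroup *memcg;
	int ret;

	/* Reserve the charge; may reclaim according to gfp_mask. */
	ret = mem_cgroup_try_charge(page, mm, gfp_mask, &memcg, false);
	if (ret)
		return ret;

	ret = install_page(page);	/* hypothetical instantiation step */
	if (ret) {
		/* Instantiation failed: back out the pending charge. */
		mem_cgroup_cancel_charge(page, memcg, false);
		return ret;
	}

	/* page->mapping is set up by now; finalize the charge. */
	mem_cgroup_commit_charge(page, memcg, false, false);
	return 0;
}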
 | 6090 |  | 
 | 6091 | struct uncharge_gather { | 
 | 6092 | 	struct mem_cgroup *memcg; | 
 | 6093 | 	unsigned long pgpgout; | 
 | 6094 | 	unsigned long nr_anon; | 
 | 6095 | 	unsigned long nr_file; | 
 | 6096 | 	unsigned long nr_kmem; | 
 | 6097 | 	unsigned long nr_huge; | 
 | 6098 | 	unsigned long nr_shmem; | 
 | 6099 | 	struct page *dummy_page; | 
 | 6100 | }; | 
 | 6101 |  | 
 | 6102 | static inline void uncharge_gather_clear(struct uncharge_gather *ug) | 
 | 6103 | { | 
 | 6104 | 	memset(ug, 0, sizeof(*ug)); | 
 | 6105 | } | 
 | 6106 |  | 
 | 6107 | static void uncharge_batch(const struct uncharge_gather *ug) | 
 | 6108 | { | 
 | 6109 | 	unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem; | 
 | 6110 | 	unsigned long flags; | 
 | 6111 |  | 
 | 6112 | 	if (!mem_cgroup_is_root(ug->memcg)) { | 
 | 6113 | 		page_counter_uncharge(&ug->memcg->memory, nr_pages); | 
 | 6114 | 		if (do_memsw_account()) | 
 | 6115 | 			page_counter_uncharge(&ug->memcg->memsw, nr_pages); | 
 | 6116 | 		if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem) | 
 | 6117 | 			page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem); | 
 | 6118 | 		memcg_oom_recover(ug->memcg); | 
 | 6119 | 	} | 
 | 6120 |  | 
 | 6121 | 	local_irq_save(flags); | 
 | 6122 | 	__mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon); | 
 | 6123 | 	__mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file); | 
 | 6124 | 	__mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge); | 
 | 6125 | 	__mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem); | 
 | 6126 | 	__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); | 
 | 6127 | 	__this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages); | 
 | 6128 | 	memcg_check_events(ug->memcg, ug->dummy_page); | 
 | 6129 | 	local_irq_restore(flags); | 
 | 6130 |  | 
 | 6131 | 	if (!mem_cgroup_is_root(ug->memcg)) | 
 | 6132 | 		css_put_many(&ug->memcg->css, nr_pages); | 
 | 6133 | } | 
 | 6134 |  | 
 | 6135 | static void uncharge_page(struct page *page, struct uncharge_gather *ug) | 
 | 6136 | { | 
 | 6137 | 	VM_BUG_ON_PAGE(PageLRU(page), page); | 
 | 6138 | 	VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) && | 
 | 6139 | 			!PageHWPoison(page), page); | 
 | 6140 |  | 
 | 6141 | 	if (!page->mem_cgroup) | 
 | 6142 | 		return; | 
 | 6143 |  | 
 | 6144 | 	/* | 
 | 6145 | 	 * Nobody should be changing or seriously looking at | 
 | 6146 | 	 * page->mem_cgroup at this point; we have fully | 
 | 6147 | 	 * exclusive access to the page. | 
 | 6148 | 	 */ | 
 | 6149 |  | 
 | 6150 | 	if (ug->memcg != page->mem_cgroup) { | 
 | 6151 | 		if (ug->memcg) { | 
 | 6152 | 			uncharge_batch(ug); | 
 | 6153 | 			uncharge_gather_clear(ug); | 
 | 6154 | 		} | 
 | 6155 | 		ug->memcg = page->mem_cgroup; | 
 | 6156 | 	} | 
 | 6157 |  | 
 | 6158 | 	if (!PageKmemcg(page)) { | 
 | 6159 | 		unsigned int nr_pages = 1; | 
 | 6160 |  | 
 | 6161 | 		if (PageTransHuge(page)) { | 
 | 6162 | 			nr_pages <<= compound_order(page); | 
 | 6163 | 			ug->nr_huge += nr_pages; | 
 | 6164 | 		} | 
 | 6165 | 		if (PageAnon(page)) | 
 | 6166 | 			ug->nr_anon += nr_pages; | 
 | 6167 | 		else { | 
 | 6168 | 			ug->nr_file += nr_pages; | 
 | 6169 | 			if (PageSwapBacked(page)) | 
 | 6170 | 				ug->nr_shmem += nr_pages; | 
 | 6171 | 		} | 
 | 6172 | 		ug->pgpgout++; | 
 | 6173 | 	} else { | 
 | 6174 | 		ug->nr_kmem += 1 << compound_order(page); | 
 | 6175 | 		__ClearPageKmemcg(page); | 
 | 6176 | 	} | 
 | 6177 |  | 
 | 6178 | 	ug->dummy_page = page; | 
 | 6179 | 	page->mem_cgroup = NULL; | 
 | 6180 | } | 
 | 6181 |  | 
 | 6182 | static void uncharge_list(struct list_head *page_list) | 
 | 6183 | { | 
 | 6184 | 	struct uncharge_gather ug; | 
 | 6185 | 	struct list_head *next; | 
 | 6186 |  | 
 | 6187 | 	uncharge_gather_clear(&ug); | 
 | 6188 |  | 
 | 6189 | 	/* | 
 | 6190 | 	 * Note that the list can be a single page->lru; hence the | 
 | 6191 | 	 * do-while loop instead of a simple list_for_each_entry(). | 
 | 6192 | 	 */ | 
 | 6193 | 	next = page_list->next; | 
 | 6194 | 	do { | 
 | 6195 | 		struct page *page; | 
 | 6196 |  | 
 | 6197 | 		page = list_entry(next, struct page, lru); | 
 | 6198 | 		next = page->lru.next; | 
 | 6199 |  | 
 | 6200 | 		uncharge_page(page, &ug); | 
 | 6201 | 	} while (next != page_list); | 
 | 6202 |  | 
 | 6203 | 	if (ug.memcg) | 
 | 6204 | 		uncharge_batch(&ug); | 
 | 6205 | } | 
 | 6206 |  | 
 | 6207 | /** | 
 | 6208 |  * mem_cgroup_uncharge - uncharge a page | 
 | 6209 |  * @page: page to uncharge | 
 | 6210 |  * | 
 | 6211 |  * Uncharge a page previously charged with mem_cgroup_try_charge() and | 
 | 6212 |  * mem_cgroup_commit_charge(). | 
 | 6213 |  */ | 
 | 6214 | void mem_cgroup_uncharge(struct page *page) | 
 | 6215 | { | 
 | 6216 | 	struct uncharge_gather ug; | 
 | 6217 |  | 
 | 6218 | 	if (mem_cgroup_disabled()) | 
 | 6219 | 		return; | 
 | 6220 |  | 
 | 6221 | 	/* Don't touch page->lru of any random page, pre-check: */ | 
 | 6222 | 	if (!page->mem_cgroup) | 
 | 6223 | 		return; | 
 | 6224 |  | 
 | 6225 | 	uncharge_gather_clear(&ug); | 
 | 6226 | 	uncharge_page(page, &ug); | 
 | 6227 | 	uncharge_batch(&ug); | 
 | 6228 | } | 
 | 6229 |  | 
 | 6230 | /** | 
 | 6231 |  * mem_cgroup_uncharge_list - uncharge a list of pages | 
 | 6232 |  * @page_list: list of pages to uncharge | 
 | 6233 |  * | 
 | 6234 |  * Uncharge a list of pages previously charged with | 
 | 6235 |  * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). | 
 | 6236 |  */ | 
 | 6237 | void mem_cgroup_uncharge_list(struct list_head *page_list) | 
 | 6238 | { | 
 | 6239 | 	if (mem_cgroup_disabled()) | 
 | 6240 | 		return; | 
 | 6241 |  | 
 | 6242 | 	if (!list_empty(page_list)) | 
 | 6243 | 		uncharge_list(page_list); | 
 | 6244 | } | 
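/*
 * Editorial sketch, not part of memcontrol.c: batching uncharges of
 * freed pages through a private list, in the way callers such as
 * release_pages() do.  The pages are assumed to have dropped their last
 * reference already; uncharge_page() trips a VM_BUG_ON otherwise.
 */
static void uncharge_batch_sketch(struct page **pages, unsigned int nr)
{
	LIST_HEAD(batch);
	unsigned int i;

	for (i = 0; i < nr; i++)
		list_add(&pages[i]->lru, &batch);

	/*
	 * One pass over the list: consecutive pages belonging to the
	 * same memcg are folded into a single uncharge_batch() call.
	 */
	mem_cgroup_uncharge_list(&batch);
}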
 | 6245 |  | 
 | 6246 | /** | 
 | 6247 |  * mem_cgroup_migrate - charge a page's replacement | 
 | 6248 |  * @oldpage: currently circulating page | 
 | 6249 |  * @newpage: replacement page | 
 | 6250 |  * | 
 | 6251 |  * Charge @newpage as a replacement page for @oldpage. @oldpage will | 
 | 6252 |  * be uncharged upon free. | 
 | 6253 |  * | 
 | 6254 |  * Both pages must be locked, @newpage->mapping must be set up. | 
 | 6255 |  */ | 
 | 6256 | void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) | 
 | 6257 | { | 
 | 6258 | 	struct mem_cgroup *memcg; | 
 | 6259 | 	unsigned int nr_pages; | 
 | 6260 | 	bool compound; | 
 | 6261 | 	unsigned long flags; | 
 | 6262 |  | 
 | 6263 | 	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); | 
 | 6264 | 	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); | 
 | 6265 | 	VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); | 
 | 6266 | 	VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), | 
 | 6267 | 		       newpage); | 
 | 6268 |  | 
 | 6269 | 	if (mem_cgroup_disabled()) | 
 | 6270 | 		return; | 
 | 6271 |  | 
 | 6272 | 	/* Page cache replacement: new page already charged? */ | 
 | 6273 | 	if (newpage->mem_cgroup) | 
 | 6274 | 		return; | 
 | 6275 |  | 
 | 6276 | 	/* Swapcache readahead pages can get replaced before being charged */ | 
 | 6277 | 	memcg = oldpage->mem_cgroup; | 
 | 6278 | 	if (!memcg) | 
 | 6279 | 		return; | 
 | 6280 |  | 
 | 6281 | 	/* Force-charge the new page. The old one will be freed soon */ | 
 | 6282 | 	compound = PageTransHuge(newpage); | 
 | 6283 | 	nr_pages = compound ? hpage_nr_pages(newpage) : 1; | 
 | 6284 |  | 
 | 6285 | 	page_counter_charge(&memcg->memory, nr_pages); | 
 | 6286 | 	if (do_memsw_account()) | 
 | 6287 | 		page_counter_charge(&memcg->memsw, nr_pages); | 
 | 6288 | 	css_get_many(&memcg->css, nr_pages); | 
 | 6289 |  | 
 | 6290 | 	commit_charge(newpage, memcg, false); | 
 | 6291 |  | 
 | 6292 | 	local_irq_save(flags); | 
 | 6293 | 	mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages); | 
 | 6294 | 	memcg_check_events(memcg, newpage); | 
 | 6295 | 	local_irq_restore(flags); | 
 | 6296 | } | 
 | 6297 |  | 
 | 6298 | DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); | 
 | 6299 | EXPORT_SYMBOL(memcg_sockets_enabled_key); | 
 | 6300 |  | 
 | 6301 | void mem_cgroup_sk_alloc(struct sock *sk) | 
 | 6302 | { | 
 | 6303 | 	struct mem_cgroup *memcg; | 
 | 6304 |  | 
 | 6305 | 	if (!mem_cgroup_sockets_enabled) | 
 | 6306 | 		return; | 
 | 6307 |  | 
 | 6308 | 	/* | 
 | 6309 | 	 * Socket cloning can throw us here with sk_memcg already | 
 | 6310 | 	 * filled. It won't, however, necessarily happen from | 
 | 6311 | 	 * process context. So the test for root memcg given | 
 | 6312 | 	 * the current task's memcg won't help us in this case. | 
 | 6313 | 	 * | 
 | 6314 | 	 * Respecting the original socket's memcg is a better | 
 | 6315 | 	 * decision in this case. | 
 | 6316 | 	 */ | 
 | 6317 | 	if (sk->sk_memcg) { | 
 | 6318 | 		css_get(&sk->sk_memcg->css); | 
 | 6319 | 		return; | 
 | 6320 | 	} | 
 | 6321 |  | 
 | 6322 | 	rcu_read_lock(); | 
 | 6323 | 	memcg = mem_cgroup_from_task(current); | 
 | 6324 | 	if (memcg == root_mem_cgroup) | 
 | 6325 | 		goto out; | 
 | 6326 | 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active) | 
 | 6327 | 		goto out; | 
 | 6328 | 	if (css_tryget_online(&memcg->css)) | 
 | 6329 | 		sk->sk_memcg = memcg; | 
 | 6330 | out: | 
 | 6331 | 	rcu_read_unlock(); | 
 | 6332 | } | 
 | 6333 |  | 
 | 6334 | void mem_cgroup_sk_free(struct sock *sk) | 
 | 6335 | { | 
 | 6336 | 	if (sk->sk_memcg) | 
 | 6337 | 		css_put(&sk->sk_memcg->css); | 
 | 6338 | } | 
 | 6339 |  | 
 | 6340 | /** | 
 | 6341 |  * mem_cgroup_charge_skmem - charge socket memory | 
 | 6342 |  * @memcg: memcg to charge | 
 | 6343 |  * @nr_pages: number of pages to charge | 
 | 6344 |  * | 
 | 6345 |  * Charges @nr_pages to @memcg. Returns %true if the charge fit within | 
 | 6346 |  * @memcg's configured limit, %false if the charge had to be forced. | 
 | 6347 |  */ | 
 | 6348 | bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) | 
 | 6349 | { | 
 | 6350 | 	gfp_t gfp_mask = GFP_KERNEL; | 
 | 6351 |  | 
 | 6352 | 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { | 
 | 6353 | 		struct page_counter *fail; | 
 | 6354 |  | 
 | 6355 | 		if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { | 
 | 6356 | 			memcg->tcpmem_pressure = 0; | 
 | 6357 | 			return true; | 
 | 6358 | 		} | 
 | 6359 | 		page_counter_charge(&memcg->tcpmem, nr_pages); | 
 | 6360 | 		memcg->tcpmem_pressure = 1; | 
 | 6361 | 		return false; | 
 | 6362 | 	} | 
 | 6363 |  | 
 | 6364 | 	/* Don't block in the packet receive path */ | 
 | 6365 | 	if (in_softirq()) | 
 | 6366 | 		gfp_mask = GFP_NOWAIT; | 
 | 6367 |  | 
 | 6368 | 	mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); | 
 | 6369 |  | 
 | 6370 | 	if (try_charge(memcg, gfp_mask, nr_pages) == 0) | 
 | 6371 | 		return true; | 
 | 6372 |  | 
 | 6373 | 	try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages); | 
 | 6374 | 	return false; | 
 | 6375 | } | 
 | 6376 |  | 
 | 6377 | /** | 
 | 6378 |  * mem_cgroup_uncharge_skmem - uncharge socket memory | 
 | 6379 |  * @memcg: memcg to uncharge | 
 | 6380 |  * @nr_pages: number of pages to uncharge | 
 | 6381 |  */ | 
 | 6382 | void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) | 
 | 6383 | { | 
 | 6384 | 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) { | 
 | 6385 | 		page_counter_uncharge(&memcg->tcpmem, nr_pages); | 
 | 6386 | 		return; | 
 | 6387 | 	} | 
 | 6388 |  | 
 | 6389 | 	mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages); | 
 | 6390 |  | 
 | 6391 | 	refill_stock(memcg, nr_pages); | 
 | 6392 | } | 
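/*
 * Editorial sketch, not part of memcontrol.c: how a networking path
 * would charge and release socket buffer memory against the socket's
 * memcg.  sketch_sk_charge()/sketch_sk_uncharge() are hypothetical
 * wrappers; the mem_cgroup_sockets_enabled + sk->sk_memcg guard mirrors
 * the checks used around the real sk_mem_* accounting.
 */
static bool sketch_sk_charge(struct sock *sk, unsigned int nr_pages)
{
	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg)
		return true;	/* no memcg accounting for this socket */

	/* false means the charge had to be forced past the limit */
	return mem_cgroup_charge_skmem(sk->sk_memcg, nr_pages);
}

static void sketch_sk_uncharge(struct sock *sk, unsigned int nr_pages)
{
	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, nr_pages);
}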
 | 6393 |  | 
 | 6394 | static int __init cgroup_memory(char *s) | 
 | 6395 | { | 
 | 6396 | 	char *token; | 
 | 6397 |  | 
 | 6398 | 	while ((token = strsep(&s, ",")) != NULL) { | 
 | 6399 | 		if (!*token) | 
 | 6400 | 			continue; | 
 | 6401 | 		if (!strcmp(token, "nosocket")) | 
 | 6402 | 			cgroup_memory_nosocket = true; | 
 | 6403 | 		if (!strcmp(token, "nokmem")) | 
 | 6404 | 			cgroup_memory_nokmem = true; | 
 | 6405 | 	} | 
 | 6406 | 	return 0; | 
 | 6407 | } | 
 | 6408 | __setup("cgroup.memory=", cgroup_memory); | 
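/*
 * Example: booting with "cgroup.memory=nosocket,nokmem" on the kernel
 * command line sets both flags above, disabling socket memory and
 * kernel memory accounting respectively.
 */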
 | 6409 |  | 
 | 6410 | /* | 
 | 6411 |  * subsys_initcall() for memory controller. | 
 | 6412 |  * | 
 | 6413 |  * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this | 
 | 6414 |  * context because of lock dependencies (cgroup_lock -> cpu hotplug) but | 
 | 6415 |  * basically everything that doesn't depend on a specific mem_cgroup structure | 
 | 6416 |  * should be initialized from here. | 
 | 6417 |  */ | 
 | 6418 | static int __init mem_cgroup_init(void) | 
 | 6419 | { | 
 | 6420 | 	int cpu, node; | 
 | 6421 |  | 
 | 6422 | #ifdef CONFIG_MEMCG_KMEM | 
 | 6423 | 	/* | 
 | 6424 | 	 * Kmem cache creation is mostly done with the slab_mutex held, | 
 | 6425 | 	 * so use a workqueue with limited concurrency to avoid stalling | 
 | 6426 | 	 * all worker threads in case lots of cgroups are created and | 
 | 6427 | 	 * destroyed simultaneously. | 
 | 6428 | 	 */ | 
 | 6429 | 	memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1); | 
 | 6430 | 	BUG_ON(!memcg_kmem_cache_wq); | 
 | 6431 | #endif | 
 | 6432 |  | 
 | 6433 | 	cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, | 
 | 6434 | 				  memcg_hotplug_cpu_dead); | 
 | 6435 |  | 
 | 6436 | 	for_each_possible_cpu(cpu) | 
 | 6437 | 		INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, | 
 | 6438 | 			  drain_local_stock); | 
 | 6439 |  | 
 | 6440 | 	for_each_node(node) { | 
 | 6441 | 		struct mem_cgroup_tree_per_node *rtpn; | 
 | 6442 |  | 
 | 6443 | 		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, | 
 | 6444 | 				    node_online(node) ? node : NUMA_NO_NODE); | 
 | 6445 |  | 
 | 6446 | 		rtpn->rb_root = RB_ROOT; | 
 | 6447 | 		rtpn->rb_rightmost = NULL; | 
 | 6448 | 		spin_lock_init(&rtpn->lock); | 
 | 6449 | 		soft_limit_tree.rb_tree_per_node[node] = rtpn; | 
 | 6450 | 	} | 
 | 6451 |  | 
 | 6452 | 	return 0; | 
 | 6453 | } | 
 | 6454 | subsys_initcall(mem_cgroup_init); | 
 | 6455 |  | 
 | 6456 | #ifdef CONFIG_MEMCG_SWAP | 
 | 6457 | static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) | 
 | 6458 | { | 
 | 6459 | 	while (!atomic_inc_not_zero(&memcg->id.ref)) { | 
 | 6460 | 		/* | 
 | 6461 | 		 * The root cgroup cannot be destroyed, so its refcount must | 
 | 6462 | 		 * always be >= 1. | 
 | 6463 | 		 */ | 
 | 6464 | 		if (WARN_ON_ONCE(memcg == root_mem_cgroup)) { | 
 | 6465 | 			VM_BUG_ON(1); | 
 | 6466 | 			break; | 
 | 6467 | 		} | 
 | 6468 | 		memcg = parent_mem_cgroup(memcg); | 
 | 6469 | 		if (!memcg) | 
 | 6470 | 			memcg = root_mem_cgroup; | 
 | 6471 | 	} | 
 | 6472 | 	return memcg; | 
 | 6473 | } | 
 | 6474 |  | 
 | 6475 | /** | 
 | 6476 |  * mem_cgroup_swapout - transfer a memsw charge to swap | 
 | 6477 |  * @page: page whose memsw charge to transfer | 
 | 6478 |  * @entry: swap entry to move the charge to | 
 | 6479 |  * | 
 | 6480 |  * Transfer the memsw charge of @page to @entry. | 
 | 6481 |  */ | 
 | 6482 | void mem_cgroup_swapout(struct page *page, swp_entry_t entry) | 
 | 6483 | { | 
 | 6484 | 	struct mem_cgroup *memcg, *swap_memcg; | 
 | 6485 | 	unsigned int nr_entries; | 
 | 6486 | 	unsigned short oldid; | 
 | 6487 |  | 
 | 6488 | 	VM_BUG_ON_PAGE(PageLRU(page), page); | 
 | 6489 | 	VM_BUG_ON_PAGE(page_count(page), page); | 
 | 6490 |  | 
 | 6491 | 	if (!do_memsw_account()) | 
 | 6492 | 		return; | 
 | 6493 |  | 
 | 6494 | 	memcg = page->mem_cgroup; | 
 | 6495 |  | 
 | 6496 | 	/* Readahead page, never charged */ | 
 | 6497 | 	if (!memcg) | 
 | 6498 | 		return; | 
 | 6499 |  | 
 | 6500 | 	/* | 
 | 6501 | 	 * In case the memcg owning these pages has been offlined and doesn't | 
 | 6502 | 	 * have an ID allocated to it anymore, charge the closest online | 
 | 6503 | 	 * ancestor for the swap instead and transfer the memory+swap charge. | 
 | 6504 | 	 */ | 
 | 6505 | 	swap_memcg = mem_cgroup_id_get_online(memcg); | 
 | 6506 | 	nr_entries = hpage_nr_pages(page); | 
 | 6507 | 	/* Get references for the tail pages, too */ | 
 | 6508 | 	if (nr_entries > 1) | 
 | 6509 | 		mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); | 
 | 6510 | 	oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), | 
 | 6511 | 				   nr_entries); | 
 | 6512 | 	VM_BUG_ON_PAGE(oldid, page); | 
 | 6513 | 	mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); | 
 | 6514 |  | 
 | 6515 | 	page->mem_cgroup = NULL; | 
 | 6516 |  | 
 | 6517 | 	if (!mem_cgroup_is_root(memcg)) | 
 | 6518 | 		page_counter_uncharge(&memcg->memory, nr_entries); | 
 | 6519 |  | 
 | 6520 | 	if (memcg != swap_memcg) { | 
 | 6521 | 		if (!mem_cgroup_is_root(swap_memcg)) | 
 | 6522 | 			page_counter_charge(&swap_memcg->memsw, nr_entries); | 
 | 6523 | 		page_counter_uncharge(&memcg->memsw, nr_entries); | 
 | 6524 | 	} | 
 | 6525 |  | 
 | 6526 | 	/* | 
 | 6527 | 	 * Interrupts should be disabled here because the caller holds the | 
 | 6528 | 	 * i_pages lock which is taken with interrupts-off. It is | 
 | 6529 | 	 * important here to have the interrupts disabled because it is the | 
 | 6530 | 	 * only synchronisation we have for updating the per-CPU variables. | 
 | 6531 | 	 */ | 
 | 6532 | 	VM_BUG_ON(!irqs_disabled()); | 
 | 6533 | 	mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page), | 
 | 6534 | 				     -nr_entries); | 
 | 6535 | 	memcg_check_events(memcg, page); | 
 | 6536 |  | 
 | 6537 | 	if (!mem_cgroup_is_root(memcg)) | 
 | 6538 | 		css_put_many(&memcg->css, nr_entries); | 
 | 6539 | } | 
 | 6540 |  | 
 | 6541 | /** | 
 | 6542 |  * mem_cgroup_try_charge_swap - try charging swap space for a page | 
 | 6543 |  * @page: page being added to swap | 
 | 6544 |  * @entry: swap entry to charge | 
 | 6545 |  * | 
 | 6546 |  * Try to charge @page's memcg for the swap space at @entry. | 
 | 6547 |  * | 
 | 6548 |  * Returns 0 on success, -ENOMEM on failure. | 
 | 6549 |  */ | 
 | 6550 | int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) | 
 | 6551 | { | 
 | 6552 | 	unsigned int nr_pages = hpage_nr_pages(page); | 
 | 6553 | 	struct page_counter *counter; | 
 | 6554 | 	struct mem_cgroup *memcg; | 
 | 6555 | 	unsigned short oldid; | 
 | 6556 |  | 
 | 6557 | 	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account) | 
 | 6558 | 		return 0; | 
 | 6559 |  | 
 | 6560 | 	memcg = page->mem_cgroup; | 
 | 6561 |  | 
 | 6562 | 	/* Readahead page, never charged */ | 
 | 6563 | 	if (!memcg) | 
 | 6564 | 		return 0; | 
 | 6565 |  | 
 | 6566 | 	if (!entry.val) { | 
 | 6567 | 		memcg_memory_event(memcg, MEMCG_SWAP_FAIL); | 
 | 6568 | 		return 0; | 
 | 6569 | 	} | 
 | 6570 |  | 
 | 6571 | 	memcg = mem_cgroup_id_get_online(memcg); | 
 | 6572 |  | 
 | 6573 | 	if (!mem_cgroup_is_root(memcg) && | 
 | 6574 | 	    !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { | 
 | 6575 | 		memcg_memory_event(memcg, MEMCG_SWAP_MAX); | 
 | 6576 | 		memcg_memory_event(memcg, MEMCG_SWAP_FAIL); | 
 | 6577 | 		mem_cgroup_id_put(memcg); | 
 | 6578 | 		return -ENOMEM; | 
 | 6579 | 	} | 
 | 6580 |  | 
 | 6581 | 	/* Get references for the tail pages, too */ | 
 | 6582 | 	if (nr_pages > 1) | 
 | 6583 | 		mem_cgroup_id_get_many(memcg, nr_pages - 1); | 
 | 6584 | 	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); | 
 | 6585 | 	VM_BUG_ON_PAGE(oldid, page); | 
 | 6586 | 	mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); | 
 | 6587 |  | 
 | 6588 | 	return 0; | 
 | 6589 | } | 
 | 6590 |  | 
 | 6591 | /** | 
 | 6592 |  * mem_cgroup_uncharge_swap - uncharge swap space | 
 | 6593 |  * @entry: swap entry to uncharge | 
 | 6594 |  * @nr_pages: the amount of swap space to uncharge | 
 | 6595 |  */ | 
 | 6596 | void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) | 
 | 6597 | { | 
 | 6598 | 	struct mem_cgroup *memcg; | 
 | 6599 | 	unsigned short id; | 
 | 6600 |  | 
 | 6601 | 	if (!do_swap_account) | 
 | 6602 | 		return; | 
 | 6603 |  | 
 | 6604 | 	id = swap_cgroup_record(entry, 0, nr_pages); | 
 | 6605 | 	rcu_read_lock(); | 
 | 6606 | 	memcg = mem_cgroup_from_id(id); | 
 | 6607 | 	if (memcg) { | 
 | 6608 | 		if (!mem_cgroup_is_root(memcg)) { | 
 | 6609 | 			if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) | 
 | 6610 | 				page_counter_uncharge(&memcg->swap, nr_pages); | 
 | 6611 | 			else | 
 | 6612 | 				page_counter_uncharge(&memcg->memsw, nr_pages); | 
 | 6613 | 		} | 
 | 6614 | 		mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); | 
 | 6615 | 		mem_cgroup_id_put_many(memcg, nr_pages); | 
 | 6616 | 	} | 
 | 6617 | 	rcu_read_unlock(); | 
 | 6618 | } | 
 | 6619 |  | 
 | 6620 | long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) | 
 | 6621 | { | 
 | 6622 | 	long nr_swap_pages = get_nr_swap_pages(); | 
 | 6623 |  | 
 | 6624 | 	if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) | 
 | 6625 | 		return nr_swap_pages; | 
 | 6626 | 	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) | 
 | 6627 | 		nr_swap_pages = min_t(long, nr_swap_pages, | 
 | 6628 | 				      READ_ONCE(memcg->swap.max) - | 
 | 6629 | 				      page_counter_read(&memcg->swap)); | 
 | 6630 | 	return nr_swap_pages; | 
 | 6631 | } | 
 | 6632 |  | 
 | 6633 | bool mem_cgroup_swap_full(struct page *page) | 
 | 6634 | { | 
 | 6635 | 	struct mem_cgroup *memcg; | 
 | 6636 |  | 
 | 6637 | 	VM_BUG_ON_PAGE(!PageLocked(page), page); | 
 | 6638 |  | 
 | 6639 | 	if (vm_swap_full()) | 
 | 6640 | 		return true; | 
 | 6641 | 	if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) | 
 | 6642 | 		return false; | 
 | 6643 |  | 
 | 6644 | 	memcg = page->mem_cgroup; | 
 | 6645 | 	if (!memcg) | 
 | 6646 | 		return false; | 
 | 6647 |  | 
 | 6648 | 	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) | 
 | 6649 | 		if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.max) | 
 | 6650 | 			return true; | 
 | 6651 |  | 
 | 6652 | 	return false; | 
 | 6653 | } | 
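/*
 * Worked example for the check above: with memory.swap.max set to 1G
 * (262144 pages) and 150000 pages of swap charged to the cgroup,
 * 150000 * 2 >= 262144 holds, so mem_cgroup_swap_full() reports the
 * cgroup's swap as (more than half) full, just as vm_swap_full() does
 * for the system-wide swap space.
 */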
 | 6654 |  | 
 | 6655 | /* for remembering the boot option */ | 
 | 6656 | #ifdef CONFIG_MEMCG_SWAP_ENABLED | 
 | 6657 | static int really_do_swap_account __initdata = 1; | 
 | 6658 | #else | 
 | 6659 | static int really_do_swap_account __initdata; | 
 | 6660 | #endif | 
 | 6661 |  | 
 | 6662 | static int __init enable_swap_account(char *s) | 
 | 6663 | { | 
 | 6664 | 	if (!strcmp(s, "1")) | 
 | 6665 | 		really_do_swap_account = 1; | 
 | 6666 | 	else if (!strcmp(s, "0")) | 
 | 6667 | 		really_do_swap_account = 0; | 
 | 6668 | 	return 1; | 
 | 6669 | } | 
 | 6670 | __setup("swapaccount=", enable_swap_account); | 
 | 6671 |  | 
 | 6672 | static u64 swap_current_read(struct cgroup_subsys_state *css, | 
 | 6673 | 			     struct cftype *cft) | 
 | 6674 | { | 
 | 6675 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 
 | 6676 |  | 
 | 6677 | 	return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; | 
 | 6678 | } | 
 | 6679 |  | 
 | 6680 | static int swap_max_show(struct seq_file *m, void *v) | 
 | 6681 | { | 
 | 6682 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 
 | 6683 | 	unsigned long max = READ_ONCE(memcg->swap.max); | 
 | 6684 |  | 
 | 6685 | 	if (max == PAGE_COUNTER_MAX) | 
 | 6686 | 		seq_puts(m, "max\n"); | 
 | 6687 | 	else | 
 | 6688 | 		seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); | 
 | 6689 |  | 
 | 6690 | 	return 0; | 
 | 6691 | } | 
 | 6692 |  | 
 | 6693 | static ssize_t swap_max_write(struct kernfs_open_file *of, | 
 | 6694 | 			      char *buf, size_t nbytes, loff_t off) | 
 | 6695 | { | 
 | 6696 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | 
 | 6697 | 	unsigned long max; | 
 | 6698 | 	int err; | 
 | 6699 |  | 
 | 6700 | 	buf = strstrip(buf); | 
 | 6701 | 	err = page_counter_memparse(buf, "max", &max); | 
 | 6702 | 	if (err) | 
 | 6703 | 		return err; | 
 | 6704 |  | 
 | 6705 | 	xchg(&memcg->swap.max, max); | 
 | 6706 |  | 
 | 6707 | 	return nbytes; | 
 | 6708 | } | 
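/*
 * Usage example (cgroup v2, from userspace): writes to memory.swap.max
 * land in swap_max_write() above and accept either "max" or a byte
 * value with an optional suffix, e.g. with the usual cgroup2 mount
 * point (the path is illustrative):
 *
 *	echo 1G > /sys/fs/cgroup/<group>/memory.swap.max
 *	echo max > /sys/fs/cgroup/<group>/memory.swap.max
 */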
 | 6709 |  | 
 | 6710 | static int swap_events_show(struct seq_file *m, void *v) | 
 | 6711 | { | 
 | 6712 | 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 
 | 6713 |  | 
 | 6714 | 	seq_printf(m, "max %lu\n", | 
 | 6715 | 		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); | 
 | 6716 | 	seq_printf(m, "fail %lu\n", | 
 | 6717 | 		   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL])); | 
 | 6718 |  | 
 | 6719 | 	return 0; | 
 | 6720 | } | 
 | 6721 |  | 
 | 6722 | static struct cftype swap_files[] = { | 
 | 6723 | 	{ | 
 | 6724 | 		.name = "swap.current", | 
 | 6725 | 		.flags = CFTYPE_NOT_ON_ROOT, | 
 | 6726 | 		.read_u64 = swap_current_read, | 
 | 6727 | 	}, | 
 | 6728 | 	{ | 
 | 6729 | 		.name = "swap.max", | 
 | 6730 | 		.flags = CFTYPE_NOT_ON_ROOT, | 
 | 6731 | 		.seq_show = swap_max_show, | 
 | 6732 | 		.write = swap_max_write, | 
 | 6733 | 	}, | 
 | 6734 | 	{ | 
 | 6735 | 		.name = "swap.events", | 
 | 6736 | 		.flags = CFTYPE_NOT_ON_ROOT, | 
 | 6737 | 		.file_offset = offsetof(struct mem_cgroup, swap_events_file), | 
 | 6738 | 		.seq_show = swap_events_show, | 
 | 6739 | 	}, | 
 | 6740 | 	{ }	/* terminate */ | 
 | 6741 | }; | 
 | 6742 |  | 
 | 6743 | static struct cftype memsw_cgroup_files[] = { | 
 | 6744 | 	{ | 
 | 6745 | 		.name = "memsw.usage_in_bytes", | 
 | 6746 | 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | 
 | 6747 | 		.read_u64 = mem_cgroup_read_u64, | 
 | 6748 | 	}, | 
 | 6749 | 	{ | 
 | 6750 | 		.name = "memsw.max_usage_in_bytes", | 
 | 6751 | 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), | 
 | 6752 | 		.write = mem_cgroup_reset, | 
 | 6753 | 		.read_u64 = mem_cgroup_read_u64, | 
 | 6754 | 	}, | 
 | 6755 | 	{ | 
 | 6756 | 		.name = "memsw.limit_in_bytes", | 
 | 6757 | 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), | 
 | 6758 | 		.write = mem_cgroup_write, | 
 | 6759 | 		.read_u64 = mem_cgroup_read_u64, | 
 | 6760 | 	}, | 
 | 6761 | 	{ | 
 | 6762 | 		.name = "memsw.failcnt", | 
 | 6763 | 		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), | 
 | 6764 | 		.write = mem_cgroup_reset, | 
 | 6765 | 		.read_u64 = mem_cgroup_read_u64, | 
 | 6766 | 	}, | 
 | 6767 | 	{ },	/* terminate */ | 
 | 6768 | }; | 
 | 6769 |  | 
 | 6770 | static int __init mem_cgroup_swap_init(void) | 
 | 6771 | { | 
 | 6772 | 	if (!mem_cgroup_disabled() && really_do_swap_account) { | 
 | 6773 | 		do_swap_account = 1; | 
 | 6774 | 		WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, | 
 | 6775 | 					       swap_files)); | 
 | 6776 | 		WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, | 
 | 6777 | 						  memsw_cgroup_files)); | 
 | 6778 | 	} | 
 | 6779 | 	return 0; | 
 | 6780 | } | 
 | 6781 | subsys_initcall(mem_cgroup_swap_init); | 
 | 6782 |  | 
 | 6783 | #endif /* CONFIG_MEMCG_SWAP */ |