/*
 *  linux/mm/oom_kill.c
 *
 *  Copyright (C)  1998,2000  Rik van Riel
 *	Thanks go out to Claus Fischer for some serious inspiration and
 *	for goading me into coding this file...
 *  Copyright (C)  2010  Google, Inc.
 *	Rewritten by David Rientjes
 *
 *  The routines in this file are used to kill a process when
 *  we're seriously out of memory. This gets called from __alloc_pages()
 *  in mm/page_alloc.c when we really run out of memory.
 *
 *  Since we won't call these routines often (on a well-configured
 *  machine) this file will double as a 'coding guide' and a signpost
 *  for newbie kernel hackers. It features several pointers to major
 *  kernel subsystems and hints as to where to find out what things do.
 */

#include <linux/oom.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
#include <linux/swap.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/cpuset.h>
#include <linux/export.h>
#include <linux/notifier.h>
#include <linux/memcontrol.h>
#include <linux/mempolicy.h>
#include <linux/security.h>
#include <linux/ptrace.h>
#include <linux/freezer.h>
#include <linux/ftrace.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/init.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>
#include "internal.h"
#include "slab.h"

#define CREATE_TRACE_POINTS
#include <trace/events/oom.h>

int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks = 1;
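
/*
 * These knobs are exposed to userspace under /proc/sys/vm as
 * vm.panic_on_oom, vm.oom_kill_allocating_task and vm.oom_dump_tasks
 * (the sysctl table wiring lives in kernel/sysctl.c, not here).
 * Illustrative usage only - e.g. to keep per-task dumps out of the log
 * on a box with a huge number of processes:
 *
 *	echo 0 > /proc/sys/vm/oom_dump_tasks
 */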

/*
 * Serializes oom killer invocations (out_of_memory()) from all contexts to
 * prevent over-eager oom killing (e.g. when the oom killer is invoked
 * from different domains).
 *
 * oom_killer_disable() relies on this lock to stabilize oom_killer_disabled
 * and mark_oom_victim().
 */
DEFINE_MUTEX(oom_lock);

#ifdef CONFIG_NUMA
/**
 * has_intersects_mems_allowed() - check task eligibility for kill
 * @start: task struct of which task to consider
 * @mask: nodemask passed to page allocator for mempolicy ooms
 *
 * Task eligibility is determined by whether or not a candidate task, @start,
 * shares the same mempolicy nodes as current if it is bound by such a policy
 * and whether or not it has the same set of allowed cpuset nodes.
 */
static bool has_intersects_mems_allowed(struct task_struct *start,
					const nodemask_t *mask)
{
	struct task_struct *tsk;
	bool ret = false;

	rcu_read_lock();
	for_each_thread(start, tsk) {
		if (mask) {
			/*
			 * If this is a mempolicy constrained oom, tsk's
			 * cpuset is irrelevant.  Only return true if its
			 * mempolicy intersects current, otherwise it may be
			 * needlessly killed.
			 */
			ret = mempolicy_nodemask_intersects(tsk, mask);
		} else {
			/*
			 * This is not a mempolicy constrained oom, so only
			 * check the mems of tsk's cpuset.
			 */
			ret = cpuset_mems_allowed_intersects(current, tsk);
		}
		if (ret)
			break;
	}
	rcu_read_unlock();

	return ret;
}
#else
static bool has_intersects_mems_allowed(struct task_struct *tsk,
					const nodemask_t *mask)
{
	return true;
}
#endif /* CONFIG_NUMA */

/*
 * The process p may have detached its own ->mm while exiting or through
 * use_mm(), but one or more of its subthreads may still have a valid
 * pointer.  Return p, or any of its subthreads with a valid ->mm, with
 * task_lock() held.
 */
struct task_struct *find_lock_task_mm(struct task_struct *p)
{
	struct task_struct *t;

	rcu_read_lock();

	for_each_thread(p, t) {
		task_lock(t);
		if (likely(t->mm))
			goto found;
		task_unlock(t);
	}
	t = NULL;
found:
	rcu_read_unlock();

	return t;
}
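
/*
 * Typical (illustrative) calling pattern: the returned thread is returned
 * task_lock()ed, so ->mm is stable until the caller drops the lock:
 *
 *	struct task_struct *t = find_lock_task_mm(p);
 *
 *	if (t) {
 *		unsigned long rss = get_mm_rss(t->mm);	// ->mm stable under task_lock
 *		task_unlock(t);
 *	}
 */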

/*
 * order == -1 means the oom kill is required by sysrq, otherwise only
 * for display purposes.
 */
static inline bool is_sysrq_oom(struct oom_control *oc)
{
	return oc->order == -1;
}

static inline bool is_memcg_oom(struct oom_control *oc)
{
	return oc->memcg != NULL;
}

/* return true if the task is not adequate as candidate victim task. */
static bool oom_unkillable_task(struct task_struct *p,
		struct mem_cgroup *memcg, const nodemask_t *nodemask)
{
	if (is_global_init(p))
		return true;
	if (p->flags & PF_KTHREAD)
		return true;

	/* When mem_cgroup_out_of_memory() and p is not member of the group */
	if (memcg && !task_in_mem_cgroup(p, memcg))
		return true;

	/* p may not have freeable memory in nodemask */
	if (!has_intersects_mems_allowed(p, nodemask))
		return true;

	return false;
}
/*
 * Print out unreclaimable slab info when the amount of unreclaimable slab
 * memory is greater than all user memory (LRU pages).
 */
static bool is_dump_unreclaim_slabs(void)
{
	unsigned long nr_lru;

	nr_lru = global_node_page_state(NR_ACTIVE_ANON) +
		 global_node_page_state(NR_INACTIVE_ANON) +
		 global_node_page_state(NR_ACTIVE_FILE) +
		 global_node_page_state(NR_INACTIVE_FILE) +
		 global_node_page_state(NR_ISOLATED_ANON) +
		 global_node_page_state(NR_ISOLATED_FILE) +
		 global_node_page_state(NR_UNEVICTABLE);

	return (global_node_page_state(NR_SLAB_UNRECLAIMABLE) > nr_lru);
}

/**
 * oom_badness - heuristic function to determine which candidate task to kill
 * @p: task struct of which task we should calculate
 * @totalpages: total present RAM allowed for page allocation
 * @memcg: task's memory controller, if constrained
 * @nodemask: nodemask passed to page allocator for mempolicy ooms
 *
 * The heuristic for determining which task to kill is made to be as simple and
 * predictable as possible.  The goal is to return the highest value for the
 * task consuming the most memory to avoid subsequent oom failures.
 */
unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
			  const nodemask_t *nodemask, unsigned long totalpages)
{
	long points;
	long adj;

	if (oom_unkillable_task(p, memcg, nodemask))
		return 0;

	p = find_lock_task_mm(p);
	if (!p)
		return 0;
	/*
	 * Do not even consider tasks which are explicitly marked oom
	 * unkillable or have already been oom reaped or are in
	 * the middle of vfork.
	 */
	adj = (long)p->signal->oom_score_adj;
	if (adj == OOM_SCORE_ADJ_MIN ||
			test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
			in_vfork(p)) {
		task_unlock(p);
		return 0;
	}

	/*
	 * The baseline for the badness score is the proportion of RAM that each
	 * task's rss, pagetable and swap space use.
	 */
	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
		mm_pgtables_bytes(p->mm) / PAGE_SIZE;
	task_unlock(p);

	/* Normalize to oom_score_adj units */
	adj *= totalpages / 1000;
	points += adj;

	/*
	 * Never return 0 for an eligible task regardless of the root bonus and
	 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
	 */
	return points > 0 ? points : 1;
}
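
/*
 * Worked example of the heuristic above, with illustrative numbers: on a
 * machine with totalpages = 2,000,000 (roughly 8GB of RAM+swap with 4kB
 * pages), a task whose rss + swapents + pagetable pages total 100,000 and
 * whose oom_score_adj is 500 scores
 *
 *	points = 100,000 + 500 * (2,000,000 / 1000) = 1,100,000
 *
 * while the same task with oom_score_adj = -500 scores 100,000 - 1,000,000,
 * which is clamped to 1: still eligible, but effectively protected.
 */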

enum oom_constraint {
	CONSTRAINT_NONE,
	CONSTRAINT_CPUSET,
	CONSTRAINT_MEMORY_POLICY,
	CONSTRAINT_MEMCG,
};

/*
 * Determine the type of allocation constraint.
 */
static enum oom_constraint constrained_alloc(struct oom_control *oc)
{
	struct zone *zone;
	struct zoneref *z;
	enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
	bool cpuset_limited = false;
	int nid;

	if (is_memcg_oom(oc)) {
		oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1;
		return CONSTRAINT_MEMCG;
	}

	/* Default to all available memory */
	oc->totalpages = totalram_pages + total_swap_pages;

	if (!IS_ENABLED(CONFIG_NUMA))
		return CONSTRAINT_NONE;

	if (!oc->zonelist)
		return CONSTRAINT_NONE;
	/*
	 * If this is a __GFP_THISNODE allocation, we only reach here when
	 * __GFP_NOFAIL is also used.  We should avoid killing current, so a
	 * random task has to be killed in this case.  Ideally this would be
	 * CONSTRAINT_THISNODE, but there is no way to handle that for now.
	 */
	if (oc->gfp_mask & __GFP_THISNODE)
		return CONSTRAINT_NONE;

	/*
	 * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
	 * the page allocator means a mempolicy is in effect.  Cpuset policy
	 * is enforced in get_page_from_freelist().
	 */
	if (oc->nodemask &&
	    !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
		oc->totalpages = total_swap_pages;
		for_each_node_mask(nid, *oc->nodemask)
			oc->totalpages += node_spanned_pages(nid);
		return CONSTRAINT_MEMORY_POLICY;
	}

	/* Check this allocation failure is caused by cpuset's wall function */
	for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
			high_zoneidx, oc->nodemask)
		if (!cpuset_zone_allowed(zone, oc->gfp_mask))
			cpuset_limited = true;

	if (cpuset_limited) {
		oc->totalpages = total_swap_pages;
		for_each_node_mask(nid, cpuset_current_mems_allowed)
			oc->totalpages += node_spanned_pages(nid);
		return CONSTRAINT_CPUSET;
	}
	return CONSTRAINT_NONE;
}
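
/*
 * Note on the constrained totalpages above: the baseline is swap plus the
 * pages of the nodes the allocation is limited to.  node_spanned_pages()
 * counts the full physical range spanned by a node, holes included, so on
 * sparse layouts it can overestimate slightly compared to
 * node_present_pages(); since totalpages is only used to normalize
 * oom_score_adj and the final score, that imprecision is generally harmless.
 */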

static int oom_evaluate_task(struct task_struct *task, void *arg)
{
	struct oom_control *oc = arg;
	unsigned long points;

	if (oom_unkillable_task(task, NULL, oc->nodemask))
		goto next;

	/*
	 * This task already has access to memory reserves and is being killed.
	 * Don't allow any other task to have access to the reserves unless
	 * the task has MMF_OOM_SKIP because chances that it would release
	 * any memory are quite low.
	 */
	if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
		if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
			goto next;
		goto abort;
	}

	/*
	 * If task is allocating a lot of memory and has been marked to be
	 * killed first if it triggers an oom, then select it.
	 */
	if (oom_task_origin(task)) {
		points = ULONG_MAX;
		goto select;
	}

	points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
	if (!points || points < oc->chosen_points)
		goto next;

	/* Prefer thread group leaders for display purposes */
	if (points == oc->chosen_points && thread_group_leader(oc->chosen))
		goto next;
select:
	if (oc->chosen)
		put_task_struct(oc->chosen);
	get_task_struct(task);
	oc->chosen = task;
	oc->chosen_points = points;
next:
	return 0;
abort:
	if (oc->chosen)
		put_task_struct(oc->chosen);
	oc->chosen = (void *)-1UL;
	return 1;
}

/*
 * Simple selection loop. We choose the process with the highest number of
 * 'points'. In case scan was aborted, oc->chosen is set to -1.
 */
static void select_bad_process(struct oom_control *oc)
{
	if (is_memcg_oom(oc))
		mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
	else {
		struct task_struct *p;

		rcu_read_lock();
		for_each_process(p)
			if (oom_evaluate_task(p, oc))
				break;
		rcu_read_unlock();
	}

	oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
}
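
/*
 * The final division above rescales the raw badness (in pages) into roughly
 * the 0..1000 range, the same scale /proc/<pid>/oom_score uses, so the
 * "score %u" printed by oom_kill_process() below is comparable in scale to
 * what userspace tools report (the memcg/NUMA-constrained totalpages can
 * make the two differ for constrained ooms).
 */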

/**
 * dump_tasks - dump current memory state of all system tasks
 * @memcg: current's memory controller, if constrained
 * @nodemask: nodemask passed to page allocator for mempolicy ooms
 *
 * Dumps the current memory state of all eligible tasks.  Tasks not in the same
 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
 * are not shown.
 * State information includes task's pid, uid, tgid, vm size, rss,
 * pgtables_bytes, swapents, oom_score_adj value, and name.
 */
static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
{
	struct task_struct *p;
	struct task_struct *task;

	pr_info("Tasks state (memory values in pages):\n");
	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
	rcu_read_lock();
	for_each_process(p) {
		if (oom_unkillable_task(p, memcg, nodemask))
			continue;

		task = find_lock_task_mm(p);
		if (!task) {
			/*
			 * This is a kthread or all of p's threads have already
			 * detached their mm's.  There's no need to report
			 * them; they can't be oom killed anyway.
			 */
			continue;
		}

		pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
			task->pid, from_kuid(&init_user_ns, task_uid(task)),
			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
			mm_pgtables_bytes(task->mm),
			get_mm_counter(task->mm, MM_SWAPENTS),
			task->signal->oom_score_adj, task->comm);
		task_unlock(task);
	}
	rcu_read_unlock();
}

static void dump_header(struct oom_control *oc, struct task_struct *p)
{
	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n",
		current->comm, oc->gfp_mask, &oc->gfp_mask,
		nodemask_pr_args(oc->nodemask), oc->order,
		current->signal->oom_score_adj);
	if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
		pr_warn("COMPACTION is disabled!!!\n");

	cpuset_print_current_mems_allowed();
	dump_stack();
	if (is_memcg_oom(oc))
		mem_cgroup_print_oom_info(oc->memcg, p);
	else {
		show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
		if (is_dump_unreclaim_slabs())
			dump_unreclaimable_slab();
	}
	if (sysctl_oom_dump_tasks)
		dump_tasks(oc->memcg, oc->nodemask);
}

/*
 * Number of OOM victims in flight
 */
static atomic_t oom_victims = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);

static bool oom_killer_disabled __read_mostly;

#define K(x) ((x) << (PAGE_SHIFT-10))
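
/*
 * K() converts a page count into kilobytes for the messages below: with the
 * common PAGE_SHIFT of 12 (4kB pages) it is x << 2, i.e. pages * 4, so an
 * "anon-rss:2048kB" in an oom report corresponds to 512 anonymous pages.
 */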

/*
 * task->mm can be NULL if the task is the exited group leader.  So to
 * determine whether the task is using a particular mm, we examine all the
 * task's threads: if one of those is using this mm then this task was also
 * using it.
 */
bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
{
	struct task_struct *t;

	for_each_thread(p, t) {
		struct mm_struct *t_mm = READ_ONCE(t->mm);
		if (t_mm)
			return t_mm == mm;
	}
	return false;
}

#ifdef CONFIG_MMU
/*
 * OOM Reaper kernel thread which tries to reap the memory used by the OOM
 * victim (if that is possible) to help the OOM killer to move on.
 */
static struct task_struct *oom_reaper_th;
static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
static struct task_struct *oom_reaper_list;
static DEFINE_SPINLOCK(oom_reaper_lock);

bool __oom_reap_task_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	bool ret = true;

	/*
	 * Tell all users of get_user/copy_from_user etc... that the content
	 * is no longer stable. No barriers really needed because unmapping
	 * should imply barriers already and the reader would hit a page fault
	 * if it stumbled over a reaped memory.
	 */
	set_bit(MMF_UNSTABLE, &mm->flags);

	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
		if (!can_madv_dontneed_vma(vma))
			continue;

		/*
		 * Only anonymous pages have a good chance to be dropped
		 * without additional steps which we cannot afford as we
		 * are OOM already.
		 *
		 * We do not even care about fs backed pages because all
		 * which are reclaimable have already been reclaimed and
		 * we do not want to block exit_mmap by keeping mm ref
		 * count elevated without a good reason.
		 */
		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
			const unsigned long start = vma->vm_start;
			const unsigned long end = vma->vm_end;
			struct mmu_gather tlb;

			tlb_gather_mmu(&tlb, mm, start, end);
			if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) {
				tlb_finish_mmu(&tlb, start, end);
				ret = false;
				continue;
			}
			unmap_page_range(&tlb, vma, start, end, NULL);
			mmu_notifier_invalidate_range_end(mm, start, end);
			tlb_finish_mmu(&tlb, start, end);
		}
	}

	return ret;
}
/*
 * Reaps the address space of the given task.
 *
 * Returns true on success, or false if none or only part of the address space
 * could be reclaimed and the caller should retry later.
 */
static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
{
	bool ret = true;

	if (!down_read_trylock(&mm->mmap_sem)) {
		trace_skip_task_reaping(tsk->pid);
		return false;
	}

	/*
	 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
	 * work on the mm anymore. The check for MMF_OOM_SKIP must run
	 * under mmap_sem for reading because it serializes against the
	 * down_write();up_write() cycle in exit_mmap().
	 */
	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
		trace_skip_task_reaping(tsk->pid);
		goto out_unlock;
	}

	trace_start_task_reaping(tsk->pid);

	/* If we failed to reap part of the address space, try again later. */
	ret = __oom_reap_task_mm(mm);
	if (!ret)
		goto out_finish;

	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
			task_pid_nr(tsk), tsk->comm,
			K(get_mm_counter(mm, MM_ANONPAGES)),
			K(get_mm_counter(mm, MM_FILEPAGES)),
			K(get_mm_counter(mm, MM_SHMEMPAGES)));
out_finish:
	trace_finish_task_reaping(tsk->pid);
out_unlock:
	up_read(&mm->mmap_sem);

	return ret;
}

#define MAX_OOM_REAP_RETRIES 10
static void oom_reap_task(struct task_struct *tsk)
{
	int attempts = 0;
	struct mm_struct *mm = tsk->signal->oom_mm;

	/* Retry the down_read_trylock(mmap_sem) a few times */
	while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
		schedule_timeout_idle(HZ/10);

	if (attempts <= MAX_OOM_REAP_RETRIES ||
	    test_bit(MMF_OOM_SKIP, &mm->flags))
		goto done;

	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
		task_pid_nr(tsk), tsk->comm);
	debug_show_all_locks();

done:
	tsk->oom_reaper_list = NULL;

	/*
	 * Hide this mm from OOM killer because it has been either reaped or
	 * somebody can't call up_write(mmap_sem).
	 */
	set_bit(MMF_OOM_SKIP, &mm->flags);

	/* Drop a reference taken by wake_oom_reaper */
	put_task_struct(tsk);
}
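
/*
 * With MAX_OOM_REAP_RETRIES == 10 and a delay of HZ/10 (100ms) per attempt,
 * the reaper gives up on a victim whose mmap_sem stays unavailable after
 * roughly one second and falls back to setting MMF_OOM_SKIP above so that
 * the OOM killer can pick another task.
 */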

static int oom_reaper(void *unused)
{
	while (true) {
		struct task_struct *tsk = NULL;

		wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
		spin_lock(&oom_reaper_lock);
		if (oom_reaper_list != NULL) {
			tsk = oom_reaper_list;
			oom_reaper_list = tsk->oom_reaper_list;
		}
		spin_unlock(&oom_reaper_lock);

		if (tsk)
			oom_reap_task(tsk);
	}

	return 0;
}

static void wake_oom_reaper(struct task_struct *tsk)
{
	/* mm is already queued? */
	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
		return;

	get_task_struct(tsk);

	spin_lock(&oom_reaper_lock);
	tsk->oom_reaper_list = oom_reaper_list;
	oom_reaper_list = tsk;
	spin_unlock(&oom_reaper_lock);
	trace_wake_reaper(tsk->pid);
	wake_up(&oom_reaper_wait);
}

static int __init oom_init(void)
{
	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
	return 0;
}
subsys_initcall(oom_init)
#else
static inline void wake_oom_reaper(struct task_struct *tsk)
{
}
#endif /* CONFIG_MMU */

/**
 * mark_oom_victim - mark the given task as OOM victim
 * @tsk: task to mark
 *
 * Has to be called with oom_lock held and never after
 * oom has been disabled already.
 *
 * tsk->mm has to be non NULL and caller has to guarantee it is stable (either
 * under task_lock or operate on the current).
 */
static void mark_oom_victim(struct task_struct *tsk)
{
	struct mm_struct *mm = tsk->mm;

	WARN_ON(oom_killer_disabled);
	/* OOM killer might race with memcg OOM */
	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
		return;

	/* oom_mm is bound to the signal struct life time. */
	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
		mmgrab(tsk->signal->oom_mm);
		set_bit(MMF_OOM_VICTIM, &mm->flags);
	}

	/*
	 * Make sure that the task is woken up from uninterruptible sleep
	 * if it is frozen because OOM killer wouldn't be able to free
	 * any memory and livelock. freezing_slow_path will tell the freezer
	 * that TIF_MEMDIE tasks should be ignored.
	 */
	__thaw_task(tsk);
	atomic_inc(&oom_victims);
	trace_mark_victim(tsk->pid);
}

/**
 * exit_oom_victim - note the exit of an OOM victim
 */
void exit_oom_victim(void)
{
	clear_thread_flag(TIF_MEMDIE);

	if (!atomic_dec_return(&oom_victims))
		wake_up_all(&oom_victims_wait);
}

/**
 * oom_killer_enable - enable OOM killer
 */
void oom_killer_enable(void)
{
	oom_killer_disabled = false;
	pr_info("OOM killer enabled.\n");
}

/**
 * oom_killer_disable - disable OOM killer
 * @timeout: maximum timeout to wait for oom victims in jiffies
 *
 * Forces all page allocations to fail rather than trigger OOM killer.
 * Will block and wait until all OOM victims are killed or the given
 * timeout expires.
 *
 * The function cannot be called when there are runnable user tasks because
 * the userspace would see unexpected allocation failures as a result. Any
 * new usage of this function should be consulted with MM people.
 *
 * Returns true if successful and false if the OOM killer cannot be
 * disabled.
 */
bool oom_killer_disable(signed long timeout)
{
	signed long ret;

	/*
	 * Make sure to not race with an ongoing OOM killer. Check that the
	 * current is not killed (possibly due to sharing the victim's memory).
	 */
	if (mutex_lock_killable(&oom_lock))
		return false;
	oom_killer_disabled = true;
	mutex_unlock(&oom_lock);

	ret = wait_event_interruptible_timeout(oom_victims_wait,
			!atomic_read(&oom_victims), timeout);
	if (ret <= 0) {
		oom_killer_enable();
		return false;
	}
	pr_info("OOM killer disabled.\n");

	return true;
}
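
/*
 * The main user of the oom_killer_disable()/oom_killer_enable() pair is the
 * freezer during suspend/hibernation (see freeze_processes()): user space is
 * already frozen there, so rather than killing tasks that cannot react,
 * allocations simply fail until the killer is re-enabled on thaw.
 */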

static inline bool __task_will_free_mem(struct task_struct *task)
{
	struct signal_struct *sig = task->signal;

	/*
	 * A coredumping process may sleep for an extended period in exit_mm(),
	 * so the oom killer cannot assume that the process will promptly exit
	 * and release memory.
	 */
	if (sig->flags & SIGNAL_GROUP_COREDUMP)
		return false;

	if (sig->flags & SIGNAL_GROUP_EXIT)
		return true;

	if (thread_group_empty(task) && (task->flags & PF_EXITING))
		return true;

	return false;
}

/*
 * Checks whether the given task is dying or exiting and likely to
 * release its address space. This means that all threads and processes
 * sharing the same mm have to be killed or exiting.
 * Caller has to make sure that task->mm is stable (hold task_lock or
 * it operates on the current).
 */
static bool task_will_free_mem(struct task_struct *task)
{
	struct mm_struct *mm = task->mm;
	struct task_struct *p;
	bool ret = true;

	/*
	 * Skip tasks without mm because it might have passed its exit_mm and
	 * exit_oom_victim. oom_reaper could have rescued that but do not rely
	 * on that for now. We can consider find_lock_task_mm in future.
	 */
	if (!mm)
		return false;

	if (!__task_will_free_mem(task))
		return false;

	/*
	 * This task has already been drained by the oom reaper so there are
	 * only small chances it will free some more
	 */
	if (test_bit(MMF_OOM_SKIP, &mm->flags))
		return false;

	if (atomic_read(&mm->mm_users) <= 1)
		return true;

	/*
	 * Make sure that all tasks which share the mm with the given task
	 * are dying as well to make sure that a) nobody pins its mm and
	 * b) the task is also reapable by the oom reaper.
	 */
	rcu_read_lock();
	for_each_process(p) {
		if (!process_shares_mm(p, mm))
			continue;
		if (same_thread_group(task, p))
			continue;
		ret = __task_will_free_mem(p);
		if (!ret)
			break;
	}
	rcu_read_unlock();

	return ret;
}

static void __oom_kill_process(struct task_struct *victim)
{
	struct task_struct *p;
	struct mm_struct *mm;
	bool can_oom_reap = true;

	p = find_lock_task_mm(victim);
	if (!p) {
		put_task_struct(victim);
		return;
	} else if (victim != p) {
		get_task_struct(p);
		put_task_struct(victim);
		victim = p;
	}

	/* Get a reference to safely compare mm after task_unlock(victim) */
	mm = victim->mm;
	mmgrab(mm);

	/* Raise event before sending signal: task reaper must see this */
	count_vm_event(OOM_KILL);
	memcg_memory_event_mm(mm, MEMCG_OOM_KILL);

	/*
	 * We should send SIGKILL before granting access to memory reserves
	 * in order to prevent the OOM victim from depleting the memory
	 * reserves from the user space under its control.
	 */
	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, PIDTYPE_TGID);
	mark_oom_victim(victim);
	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
		task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
		K(get_mm_counter(victim->mm, MM_ANONPAGES)),
		K(get_mm_counter(victim->mm, MM_FILEPAGES)),
		K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
	task_unlock(victim);

	/*
	 * Kill all user processes sharing victim->mm in other thread groups, if
	 * any.  They don't get access to memory reserves, though, to avoid
	 * depletion of all memory.  This prevents mm->mmap_sem livelock when an
	 * oom killed thread cannot exit because it requires the semaphore and
	 * it's contended by another thread trying to allocate memory itself.
	 * That thread will now get access to memory reserves since it has a
	 * pending fatal signal.
	 */
	rcu_read_lock();
	for_each_process(p) {
		if (!process_shares_mm(p, mm))
			continue;
		if (same_thread_group(p, victim))
			continue;
		if (is_global_init(p)) {
			can_oom_reap = false;
			set_bit(MMF_OOM_SKIP, &mm->flags);
			pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
					task_pid_nr(victim), victim->comm,
					task_pid_nr(p), p->comm);
			continue;
		}
		/*
		 * No use_mm() user needs to read from the userspace so we are
		 * ok to reap it.
		 */
		if (unlikely(p->flags & PF_KTHREAD))
			continue;
		do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, PIDTYPE_TGID);
	}
	rcu_read_unlock();

	if (can_oom_reap)
		wake_oom_reaper(victim);

	mmdrop(mm);
	put_task_struct(victim);
}
#undef K

/*
 * Kill provided task unless it's secured by setting
 * oom_score_adj to OOM_SCORE_ADJ_MIN.
 */
static int oom_kill_memcg_member(struct task_struct *task, void *unused)
{
	if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
	    !is_global_init(task)) {
		get_task_struct(task);
		__oom_kill_process(task);
	}
	return 0;
}

static void oom_kill_process(struct oom_control *oc, const char *message)
{
	struct task_struct *p = oc->chosen;
	unsigned int points = oc->chosen_points;
	struct task_struct *victim = p;
	struct task_struct *child;
	struct task_struct *t;
	struct mem_cgroup *oom_group;
	unsigned int victim_points = 0;
	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
					      DEFAULT_RATELIMIT_BURST);

	/*
	 * If the task is already exiting, don't alarm the sysadmin or kill
	 * its children or threads, just give it access to memory reserves
	 * so it can die quickly
	 */
	task_lock(p);
	if (task_will_free_mem(p)) {
		mark_oom_victim(p);
		wake_oom_reaper(p);
		task_unlock(p);
		put_task_struct(p);
		return;
	}
	task_unlock(p);

	if (__ratelimit(&oom_rs))
		dump_header(oc, p);

	pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
		message, task_pid_nr(p), p->comm, points);

	/*
	 * If any of p's children has a different mm and is eligible for kill,
	 * the one with the highest oom_badness() score is sacrificed for its
	 * parent.  This attempts to lose the minimal amount of work done while
	 * still freeing memory.
	 */
	read_lock(&tasklist_lock);

	/*
	 * The task 'p' might have already exited before reaching here. The
	 * put_task_struct() will free task_struct 'p' while the loop still
	 * tries to access the fields of 'p', so get an extra reference.
	 */
	get_task_struct(p);
	for_each_thread(p, t) {
		list_for_each_entry(child, &t->children, sibling) {
			unsigned int child_points;

			if (process_shares_mm(child, p->mm))
				continue;
			/*
			 * oom_badness() returns 0 if the thread is unkillable
			 */
			child_points = oom_badness(child,
				oc->memcg, oc->nodemask, oc->totalpages);
			if (child_points > victim_points) {
				put_task_struct(victim);
				victim = child;
				victim_points = child_points;
				get_task_struct(victim);
			}
		}
	}
	put_task_struct(p);
	read_unlock(&tasklist_lock);

	/*
	 * Do we need to kill the entire memory cgroup?
	 * Or even one of the ancestor memory cgroups?
	 * Check this out before killing the victim task.
	 */
	oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);

	__oom_kill_process(victim);

	/*
	 * If necessary, kill all tasks in the selected memory cgroup.
	 */
	if (oom_group) {
		mem_cgroup_print_oom_group(oom_group);
		mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL);
		mem_cgroup_put(oom_group);
	}
}

/*
 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
 */
static void check_panic_on_oom(struct oom_control *oc,
			       enum oom_constraint constraint)
{
	if (likely(!sysctl_panic_on_oom))
		return;
	if (sysctl_panic_on_oom != 2) {
		/*
		 * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
		 * does not panic for cpuset, mempolicy, or memcg allocation
		 * failures.
		 */
		if (constraint != CONSTRAINT_NONE)
			return;
	}
	/* Do not panic for oom kills triggered by sysrq */
	if (is_sysrq_oom(oc))
		return;
	dump_header(oc, NULL);
	panic("Out of memory: %s panic_on_oom is enabled\n",
		sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
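
/*
 * To summarize the vm.panic_on_oom values handled above:
 *
 *	0 - never panic, kill a task instead (default)
 *	1 - panic on a system-wide OOM, but not for cpuset, mempolicy or
 *	    memcg constrained OOMs
 *	2 - always panic, even for constrained OOMs
 *
 * e.g. "sysctl vm.panic_on_oom=1", typically combined with kernel.panic=N
 * for an automatic reboot after N seconds, on machines that prefer a clean
 * restart over running degraded after an OOM.
 */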

static BLOCKING_NOTIFIER_HEAD(oom_notify_list);

int register_oom_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&oom_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_oom_notifier);

int unregister_oom_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&oom_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_oom_notifier);
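
/*
 * The notifier chain gives drivers a last chance to release memory before a
 * kill: each callback receives a pointer to the running "freed" page count
 * (see out_of_memory() below) and adds whatever it managed to drop.  A
 * minimal, hypothetical user would look roughly like this:
 *
 *	static int my_oom_notify(struct notifier_block *nb,
 *				 unsigned long unused, void *parm)
 *	{
 *		unsigned long *freed = parm;
 *
 *		*freed += my_driver_shrink_caches();	// pages given back
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_oom_nb = {
 *		.notifier_call = my_oom_notify,
 *	};
 *
 *	register_oom_notifier(&my_oom_nb);
 *
 * (my_driver_shrink_caches() is a placeholder, not an existing helper.)
 */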

/**
 * out_of_memory - kill the "best" process when we run out of memory
 * @oc: pointer to struct oom_control
 *
 * If we run out of memory, we have the choice between either
 * killing a random task (bad), letting the system crash (worse)
 * OR try to be smart about which process to kill. Note that we
 * don't have to be perfect here, we just have to be good.
 */
bool out_of_memory(struct oom_control *oc)
{
	unsigned long freed = 0;
	enum oom_constraint constraint = CONSTRAINT_NONE;

	if (oom_killer_disabled)
		return false;

	if (!is_memcg_oom(oc)) {
		blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
		if (freed > 0)
			/* Got some memory back in the last second. */
			return true;
	}

	/*
	 * If current has a pending SIGKILL or is exiting, then automatically
	 * select it.  The goal is to allow it to allocate so that it may
	 * quickly exit and free its memory.
	 */
	if (task_will_free_mem(current)) {
		mark_oom_victim(current);
		wake_oom_reaper(current);
		return true;
	}

	/*
	 * The OOM killer does not compensate for IO-less reclaim.
	 * pagefault_out_of_memory lost its gfp context so we have to
	 * make sure to exclude the 0 mask - all other users should have at
	 * least ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has
	 * to invoke the OOM killer even if it is a GFP_NOFS allocation.
	 */
	if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
		return true;

	/*
	 * Check if there were limitations on the allocation (only relevant for
	 * NUMA and memcg) that may require different handling.
	 */
	constraint = constrained_alloc(oc);
	if (constraint != CONSTRAINT_MEMORY_POLICY)
		oc->nodemask = NULL;
	check_panic_on_oom(oc, constraint);

	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
		get_task_struct(current);
		oc->chosen = current;
		oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
		return true;
	}

	select_bad_process(oc);
	/* Found nothing?!?! */
	if (!oc->chosen) {
		dump_header(oc, NULL);
		pr_warn("Out of memory and no killable processes...\n");
		/*
		 * If we got here due to an actual allocation at the
		 * system level, we cannot survive this and will enter
		 * an endless loop in the allocator. Bail out now.
		 */
		if (!is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
#ifdef CONFIG_PAGE_OWNER
			print_max_page_owner();
#endif
			panic("System is deadlocked on memory\n");
		}
	}
	if (oc->chosen && oc->chosen != (void *)-1UL)
		oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
				 "Memory cgroup out of memory");
	return !!oc->chosen;
}

/*
 * The pagefault handler calls here because it is out of memory, so kill a
 * memory-hogging task. If oom_lock is held by somebody else, a parallel oom
 * killing is already in progress so do nothing.
 */
void pagefault_out_of_memory(void)
{
	struct oom_control oc = {
		.zonelist = NULL,
		.nodemask = NULL,
		.memcg = NULL,
		.gfp_mask = 0,
		.order = 0,
	};

	if (mem_cgroup_oom_synchronize(true))
		return;

	if (!mutex_trylock(&oom_lock))
		return;
	out_of_memory(&oc);
	mutex_unlock(&oom_lock);
}