Blame - marvell/linux/mm/oom_kill.c - T108

blob: cb60fe1ed30b4d17a228a2ad58d23d32ed14a86e [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0-only
				2	/*
				3	* linux/mm/oom_kill.c
				4	*
				5	* Copyright (C) 1998,2000 Rik van Riel
				6	* Thanks go out to Claus Fischer for some serious inspiration and
				7	* for goading me into coding this file...
				8	* Copyright (C) 2010 Google, Inc.
				9	* Rewritten by David Rientjes
				10	*
				11	* The routines in this file are used to kill a process when
				12	* we're seriously out of memory. This gets called from __alloc_pages()
				13	* in mm/page_alloc.c when we really run out of memory.
				14	*
				15	* Since we won't call these routines often (on a well-configured
				16	* machine) this file will double as a 'coding guide' and a signpost
				17	* for newbie kernel hackers. It features several pointers to major
				18	* kernel subsystems and hints as to where to find out what things do.
				19	*/
				20
				21	#include <linux/oom.h>
				22	#include <linux/mm.h>
				23	#include <linux/err.h>
				24	#include <linux/gfp.h>
				25	#include <linux/sched.h>
				26	#include <linux/sched/mm.h>
				27	#include <linux/sched/coredump.h>
				28	#include <linux/sched/task.h>
				29	#include <linux/swap.h>
				30	#include <linux/timex.h>
				31	#include <linux/jiffies.h>
				32	#include <linux/cpuset.h>
				33	#include <linux/export.h>
				34	#include <linux/notifier.h>
				35	#include <linux/memcontrol.h>
				36	#include <linux/mempolicy.h>
				37	#include <linux/security.h>
				38	#include <linux/ptrace.h>
				39	#include <linux/freezer.h>
				40	#include <linux/ftrace.h>
				41	#include <linux/ratelimit.h>
				42	#include <linux/kthread.h>
				43	#include <linux/init.h>
				44	#include <linux/mmu_notifier.h>
				45
				46	#include <asm/tlb.h>
				47	#include "internal.h"
				48	#include "slab.h"
				49
				50	#define CREATE_TRACE_POINTS
				51	#include <trace/events/oom.h>
				52
				/*
				 * OOM tunables. NOTE(review): the sysctl_ prefix suggests these are wired
				 * to sysctl entries elsewhere; the users of the first two are not visible
				 * in this part of the file - confirm before relying on their semantics.
				 */
				53	int sysctl_panic_on_oom;
				54	int sysctl_oom_kill_allocating_task;
				/* Non-zero (the default) makes dump_header() call dump_tasks(). */
				55	int sysctl_oom_dump_tasks = 1;
				56
				/*
				 * Serializes oom killer invocations (out_of_memory()) from all contexts to
				 * prevent overly eager oom killing (e.g. when the oom killer is invoked
				 * from different domains).
				 *
				 * oom_killer_disable() takes this lock while setting oom_killer_disabled,
				 * and mark_oom_victim() is documented as requiring it to be held.
				 */
				65	DEFINE_MUTEX(oom_lock);
				/* Serializes oom_score_adj and oom_score_adj_min updates */
				67	DEFINE_MUTEX(oom_adj_mutex);
				68
				69	static inline bool is_memcg_oom(struct oom_control *oc)
				70	{
				71	return oc->memcg != NULL;
				72	}
				73
				74	#ifdef CONFIG_NUMA
				75	/**
				76	* oom_cpuset_eligible() - check task eligiblity for kill
				77	* @start: task struct of which task to consider
				78	* @oc: pointer to struct oom_control
				79	*
				80	* Task eligibility is determined by whether or not a candidate task, @tsk,
				81	* shares the same mempolicy nodes as current if it is bound by such a policy
				82	* and whether or not it has the same set of allowed cpuset nodes.
				83	*
				84	* This function is assuming oom-killer context and 'current' has triggered
				85	* the oom-killer.
				86	*/
				87	static bool oom_cpuset_eligible(struct task_struct *start,
				88	struct oom_control *oc)
				89	{
				90	struct task_struct *tsk;
				91	bool ret = false;
				92	const nodemask_t *mask = oc->nodemask;
				93
				94	if (is_memcg_oom(oc))
				95	return true;
				96
				97	rcu_read_lock();
				98	for_each_thread(start, tsk) {
				99	if (mask) {
				100	/*
				101	* If this is a mempolicy constrained oom, tsk's
				102	* cpuset is irrelevant. Only return true if its
				103	* mempolicy intersects current, otherwise it may be
				104	* needlessly killed.
				105	*/
				106	ret = mempolicy_nodemask_intersects(tsk, mask);
				107	} else {
				108	/*
				109	* This is not a mempolicy constrained oom, so only
				110	* check the mems of tsk's cpuset.
				111	*/
				112	ret = cpuset_mems_allowed_intersects(current, tsk);
				113	}
				114	if (ret)
				115	break;
				116	}
				117	rcu_read_unlock();
				118
				119	return ret;
				120	}
				121	#else
				122	static bool oom_cpuset_eligible(struct task_struct tsk, struct oom_control oc)
				123	{
				124	return true;
				125	}
				126	#endif /* CONFIG_NUMA */
				127
				128	/*
				129	* The process p may have detached its own ->mm while exiting or through
				130	* use_mm(), but one or more of its subthreads may still have a valid
				131	* pointer. Return p, or any of its subthreads with a valid ->mm, with
				132	* task_lock() held.
				133	*/
				134	struct task_struct find_lock_task_mm(struct task_struct p)
				135	{
				136	struct task_struct *t;
				137
				138	rcu_read_lock();
				139
				140	for_each_thread(p, t) {
				141	task_lock(t);
				142	if (likely(t->mm))
				143	goto found;
				144	task_unlock(t);
				145	}
				146	t = NULL;
				147	found:
				148	rcu_read_unlock();
				149
				150	return t;
				151	}
				152
				153	/*
				154	* order == -1 means the oom kill is required by sysrq, otherwise only
				155	* for display purposes.
				156	*/
				157	static inline bool is_sysrq_oom(struct oom_control *oc)
				158	{
				159	return oc->order == -1;
				160	}
				161
				162	/* return true if the task is not adequate as candidate victim task. */
				163	static bool oom_unkillable_task(struct task_struct *p)
				164	{
				165	if (is_global_init(p))
				166	return true;
				167	if (p->flags & PF_KTHREAD)
				168	return true;
				169	return false;
				170	}
				171
				172	/*
				173	* Print out unreclaimble slabs info when unreclaimable slabs amount is greater
				174	* than all user memory (LRU pages)
				175	*/
				176	static bool is_dump_unreclaim_slabs(void)
				177	{
				178	unsigned long nr_lru;
				179
				180	nr_lru = global_node_page_state(NR_ACTIVE_ANON) +
				181	global_node_page_state(NR_INACTIVE_ANON) +
				182	global_node_page_state(NR_ACTIVE_FILE) +
				183	global_node_page_state(NR_INACTIVE_FILE) +
				184	global_node_page_state(NR_ISOLATED_ANON) +
				185	global_node_page_state(NR_ISOLATED_FILE) +
				186	global_node_page_state(NR_UNEVICTABLE);
				187
				188	return (global_node_page_state(NR_SLAB_UNRECLAIMABLE) > nr_lru);
				189	}
				190
				191	/**
				192	* oom_badness - heuristic function to determine which candidate task to kill
				193	* @p: task struct of which task we should calculate
				194	* @totalpages: total present RAM allowed for page allocation
				195	*
				196	* The heuristic for determining which task to kill is made to be as simple and
				197	* predictable as possible. The goal is to return the highest value for the
				198	* task consuming the most memory to avoid subsequent oom failures.
				199	*/
				200	long oom_badness(struct task_struct *p, unsigned long totalpages)
				201	{
				202	long points;
				203	long adj;
				204
				205	if (oom_unkillable_task(p))
				206	return LONG_MIN;
				207
				208	p = find_lock_task_mm(p);
				209	if (!p)
				210	return LONG_MIN;
				211
				212	/*
				213	* Do not even consider tasks which are explicitly marked oom
				214	* unkillable or have been already oom reaped or the are in
				215	* the middle of vfork
				216	*/
				217	adj = (long)p->signal->oom_score_adj;
				218	if (adj == OOM_SCORE_ADJ_MIN \|\|
				219	test_bit(MMF_OOM_SKIP, &p->mm->flags) \|\|
				220	in_vfork(p)) {
				221	task_unlock(p);
				222	return LONG_MIN;
				223	}
				224
				225	/*
				226	* The baseline for the badness score is the proportion of RAM that each
				227	* task's rss, pagetable and swap space use.
				228	*/
				229	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
				230	mm_pgtables_bytes(p->mm) / PAGE_SIZE;
				231	task_unlock(p);
				232
				233	/* Normalize to oom_score_adj units */
				234	adj *= totalpages / 1000;
				235	points += adj;
				236
				237	return points;
				238	}
				239
				/*
				 * Printable name for each CONSTRAINT_* value, indexed by the constraint
				 * itself; dump_oom_summary() prints oc->constraint through this table.
				 */
				240	static const char * const oom_constraint_text[] = {
				241	[CONSTRAINT_NONE] = "CONSTRAINT_NONE",
				242	[CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET",
				243	[CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY",
				244	[CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
				245	};
				246
				247	/*
				248	* Determine the type of allocation constraint.
				249	*/
				250	static enum oom_constraint constrained_alloc(struct oom_control *oc)
				251	{
				252	struct zone *zone;
				253	struct zoneref *z;
				254	enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
				255	bool cpuset_limited = false;
				256	int nid;
				257
				258	if (is_memcg_oom(oc)) {
				259	oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1;
				260	return CONSTRAINT_MEMCG;
				261	}
				262
				263	/* Default to all available memory */
				264	oc->totalpages = totalram_pages() + total_swap_pages;
				265
				266	if (!IS_ENABLED(CONFIG_NUMA))
				267	return CONSTRAINT_NONE;
				268
				269	if (!oc->zonelist)
				270	return CONSTRAINT_NONE;
				271	/*
				272	* Reach here only when __GFP_NOFAIL is used. So, we should avoid
				273	* to kill current.We have to random task kill in this case.
				274	* Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
				275	*/
				276	if (oc->gfp_mask & __GFP_THISNODE)
				277	return CONSTRAINT_NONE;
				278
				279	/*
				280	* This is not a __GFP_THISNODE allocation, so a truncated nodemask in
				281	* the page allocator means a mempolicy is in effect. Cpuset policy
				282	* is enforced in get_page_from_freelist().
				283	*/
				284	if (oc->nodemask &&
				285	!nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
				286	oc->totalpages = total_swap_pages;
				287	for_each_node_mask(nid, *oc->nodemask)
				288	oc->totalpages += node_present_pages(nid);
				289	return CONSTRAINT_MEMORY_POLICY;
				290	}
				291
				292	/* Check this allocation failure is caused by cpuset's wall function */
				293	for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
				294	high_zoneidx, oc->nodemask)
				295	if (!cpuset_zone_allowed(zone, oc->gfp_mask))
				296	cpuset_limited = true;
				297
				298	if (cpuset_limited) {
				299	oc->totalpages = total_swap_pages;
				300	for_each_node_mask(nid, cpuset_current_mems_allowed)
				301	oc->totalpages += node_present_pages(nid);
				302	return CONSTRAINT_CPUSET;
				303	}
				304	return CONSTRAINT_NONE;
				305	}
				306
				307	static int oom_evaluate_task(struct task_struct task, void arg)
				308	{
				309	struct oom_control *oc = arg;
				310	long points;
				311
				312	if (oom_unkillable_task(task))
				313	goto next;
				314
				315	/* p may not have freeable memory in nodemask */
				316	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc))
				317	goto next;
				318
				319	/*
				320	* This task already has access to memory reserves and is being killed.
				321	* Don't allow any other task to have access to the reserves unless
				322	* the task has MMF_OOM_SKIP because chances that it would release
				323	* any memory is quite low.
				324	*/
				325	if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
				326	if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
				327	goto next;
				328	goto abort;
				329	}
				330
				331	/*
				332	* If task is allocating a lot of memory and has been marked to be
				333	* killed first if it triggers an oom, then select it.
				334	*/
				335	if (oom_task_origin(task)) {
				336	points = LONG_MAX;
				337	goto select;
				338	}
				339
				340	points = oom_badness(task, oc->totalpages);
				341	if (points == LONG_MIN \|\| points < oc->chosen_points)
				342	goto next;
				343
				344	select:
				345	if (oc->chosen)
				346	put_task_struct(oc->chosen);
				347	get_task_struct(task);
				348	oc->chosen = task;
				349	oc->chosen_points = points;
				350	next:
				351	return 0;
				352	abort:
				353	if (oc->chosen)
				354	put_task_struct(oc->chosen);
				355	oc->chosen = (void *)-1UL;
				356	return 1;
				357	}
				358
				359	/*
				360	* Simple selection loop. We choose the process with the highest number of
				361	* 'points'. In case scan was aborted, oc->chosen is set to -1.
				362	*/
				363	static void select_bad_process(struct oom_control *oc)
				364	{
				365	oc->chosen_points = LONG_MIN;
				366
				367	if (is_memcg_oom(oc))
				368	mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
				369	else {
				370	struct task_struct *p;
				371
				372	rcu_read_lock();
				373	for_each_process(p)
				374	if (oom_evaluate_task(p, oc))
				375	break;
				376	rcu_read_unlock();
				377	}
				378	}
				379
				380	static int dump_task(struct task_struct p, void arg)
				381	{
				382	struct oom_control *oc = arg;
				383	struct task_struct *task;
				384
				385	if (oom_unkillable_task(p))
				386	return 0;
				387
				388	/* p may not have freeable memory in nodemask */
				389	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
				390	return 0;
				391
				392	task = find_lock_task_mm(p);
				393	if (!task) {
				394	/*
				395	* This is a kthread or all of p's threads have already
				396	* detached their mm's. There's no need to report
				397	* them; they can't be oom killed anyway.
				398	*/
				399	return 0;
				400	}
				401
				402	pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
				403	task->pid, from_kuid(&init_user_ns, task_uid(task)),
				404	task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
				405	mm_pgtables_bytes(task->mm),
				406	get_mm_counter(task->mm, MM_SWAPENTS),
				407	task->signal->oom_score_adj, task->comm);
				408	task_unlock(task);
				409
				410	return 0;
				411	}
				412
				413	/**
				414	* dump_tasks - dump current memory state of all system tasks
				415	* @oc: pointer to struct oom_control
				416	*
				417	* Dumps the current memory state of all eligible tasks. Tasks not in the same
				418	* memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
				419	* are not shown.
				420	* State information includes task's pid, uid, tgid, vm size, rss,
				421	* pgtables_bytes, swapents, oom_score_adj value, and name.
				422	*/
				423	static void dump_tasks(struct oom_control *oc)
				424	{
				425	pr_info("Tasks state (memory values in pages):\n");
				426	pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
				427
				428	if (is_memcg_oom(oc))
				429	mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
				430	else {
				431	struct task_struct *p;
				432
				433	rcu_read_lock();
				434	for_each_process(p)
				435	dump_task(p, oc);
				436	rcu_read_unlock();
				437	}
				438	}
				439
				440	static void dump_oom_summary(struct oom_control oc, struct task_struct victim)
				441	{
				442	/* one line summary of the oom killer context. */
				443	pr_info("oom-kill:constraint=%s,nodemask=%*pbl",
				444	oom_constraint_text[oc->constraint],
				445	nodemask_pr_args(oc->nodemask));
				446	cpuset_print_current_mems_allowed();
				447	mem_cgroup_print_oom_context(oc->memcg, victim);
				448	pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid,
				449	from_kuid(&init_user_ns, task_uid(victim)));
				450	}
				451
				452	static void dump_header(struct oom_control oc, struct task_struct p)
				453	{
				454	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
				455	current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
				456	current->signal->oom_score_adj);
				457	if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
				458	pr_warn("COMPACTION is disabled!!!\n");
				459
				460	dump_stack();
				461	if (is_memcg_oom(oc))
				462	mem_cgroup_print_oom_meminfo(oc->memcg);
				463	else {
				464	show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
				465	if (is_dump_unreclaim_slabs())
				466	dump_unreclaimable_slab();
				467	}
				468	if (sysctl_oom_dump_tasks)
				469	dump_tasks(oc);
				470	if (p)
				471	dump_oom_summary(oc, p);
				472	}
				473
				/*
				 * Number of OOM victims in flight: incremented by mark_oom_victim(),
				 * decremented by exit_oom_victim().
				 */
				477	static atomic_t oom_victims = ATOMIC_INIT(0);
				/* Woken by exit_oom_victim() when oom_victims reaches zero; oom_killer_disable() waits on it. */
				478	static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
				479
				/* Set by oom_killer_disable(), cleared by oom_killer_enable(). */
				480	static bool oom_killer_disabled __read_mostly;
				481
				/* Shift a page count by (PAGE_SHIFT - 10); used for the values printed as "kB" in the log lines of this file. */
				482	#define K(x) ((x) << (PAGE_SHIFT-10))
				483
				484	/*
				485	* task->mm can be NULL if the task is the exited group leader. So to
				486	* determine whether the task is using a particular mm, we examine all the
				487	* task's threads: if one of those is using this mm then this task was also
				488	* using it.
				489	*/
				490	bool process_shares_mm(struct task_struct p, struct mm_struct mm)
				491	{
				492	struct task_struct *t;
				493
				494	for_each_thread(p, t) {
				495	struct mm_struct *t_mm = READ_ONCE(t->mm);
				496	if (t_mm)
				497	return t_mm == mm;
				498	}
				499	return false;
				500	}
				501
				502	#ifdef CONFIG_MMU
				/*
				 * OOM Reaper kernel thread which tries to reap the memory used by the OOM
				 * victim (if that is possible) to help the OOM killer to move on.
				 * The thread is created by oom_init() and runs oom_reaper().
				 */
				507	static struct task_struct *oom_reaper_th;
				/* oom_reaper() sleeps here until oom_reaper_list is non-NULL; wake_oom_reaper() wakes it. */
				508	static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
				/* Head of the queued victims, chained through task_struct::oom_reaper_list; pushed and popped at the head. */
				509	static struct task_struct *oom_reaper_list;
				/* Taken around every update of oom_reaper_list. */
				510	static DEFINE_SPINLOCK(oom_reaper_lock);
				511
				512	bool __oom_reap_task_mm(struct mm_struct *mm)
				513	{
				514	struct vm_area_struct *vma;
				515	bool ret = true;
				516
				517	/*
				518	* Tell all users of get_user/copy_from_user etc... that the content
				519	* is no longer stable. No barriers really needed because unmapping
				520	* should imply barriers already and the reader would hit a page fault
				521	* if it stumbled over a reaped memory.
				522	*/
				523	set_bit(MMF_UNSTABLE, &mm->flags);
				524
				525	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
				526	if (!can_madv_lru_vma(vma))
				527	continue;
				528
				529	/*
				530	* Only anonymous pages have a good chance to be dropped
				531	* without additional steps which we cannot afford as we
				532	* are OOM already.
				533	*
				534	* We do not even care about fs backed pages because all
				535	* which are reclaimable have already been reclaimed and
				536	* we do not want to block exit_mmap by keeping mm ref
				537	* count elevated without a good reason.
				538	*/
				539	if (vma_is_anonymous(vma) \|\| !(vma->vm_flags & VM_SHARED)) {
				540	struct mmu_notifier_range range;
				541	struct mmu_gather tlb;
				542
				543	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
				544	vma, mm, vma->vm_start,
				545	vma->vm_end);
				546	tlb_gather_mmu(&tlb, mm, range.start, range.end);
				547	if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
				548	tlb_finish_mmu(&tlb, range.start, range.end);
				549	ret = false;
				550	continue;
				551	}
				552	unmap_page_range(&tlb, vma, range.start, range.end, NULL);
				553	mmu_notifier_invalidate_range_end(&range);
				554	tlb_finish_mmu(&tlb, range.start, range.end);
				555	}
				556	}
				557
				558	return ret;
				559	}
				560
				561	/*
				562	* Reaps the address space of the give task.
				563	*
				564	* Returns true on success and false if none or part of the address space
				565	* has been reclaimed and the caller should retry later.
				566	*/
				567	static bool oom_reap_task_mm(struct task_struct tsk, struct mm_struct mm)
				568	{
				569	bool ret = true;
				570
				571	if (!down_read_trylock(&mm->mmap_sem)) {
				572	trace_skip_task_reaping(tsk->pid);
				573	return false;
				574	}
				575
				576	/*
				577	* MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
				578	* work on the mm anymore. The check for MMF_OOM_SKIP must run
				579	* under mmap_sem for reading because it serializes against the
				580	* down_write();up_write() cycle in exit_mmap().
				581	*/
				582	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
				583	trace_skip_task_reaping(tsk->pid);
				584	goto out_unlock;
				585	}
				586
				587	trace_start_task_reaping(tsk->pid);
				588
				589	/* failed to reap part of the address space. Try again later */
				590	ret = __oom_reap_task_mm(mm);
				591	if (!ret)
				592	goto out_finish;
				593
				594	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
				595	task_pid_nr(tsk), tsk->comm,
				596	K(get_mm_counter(mm, MM_ANONPAGES)),
				597	K(get_mm_counter(mm, MM_FILEPAGES)),
				598	K(get_mm_counter(mm, MM_SHMEMPAGES)));
				599	out_finish:
				600	trace_finish_task_reaping(tsk->pid);
				601	out_unlock:
				602	up_read(&mm->mmap_sem);
				603
				604	return ret;
				605	}
				606
				607	#define MAX_OOM_REAP_RETRIES 10
				608	static void oom_reap_task(struct task_struct *tsk)
				609	{
				610	int attempts = 0;
				611	struct mm_struct *mm = tsk->signal->oom_mm;
				612
				613	/* Retry the down_read_trylock(mmap_sem) a few times */
				614	while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
				615	schedule_timeout_idle(HZ/10);
				616
				617	if (attempts <= MAX_OOM_REAP_RETRIES \|\|
				618	test_bit(MMF_OOM_SKIP, &mm->flags))
				619	goto done;
				620
				621	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
				622	task_pid_nr(tsk), tsk->comm);
				623	debug_show_all_locks();
				624
				625	done:
				626	tsk->oom_reaper_list = NULL;
				627
				628	/*
				629	* Hide this mm from OOM killer because it has been either reaped or
				630	* somebody can't call up_write(mmap_sem).
				631	*/
				632	set_bit(MMF_OOM_SKIP, &mm->flags);
				633
				634	/* Drop a reference taken by queue_oom_reaper */
				635	put_task_struct(tsk);
				636	}
				637
				638	static int oom_reaper(void *unused)
				639	{
				640	while (true) {
				641	struct task_struct *tsk = NULL;
				642
				643	wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
				644	spin_lock_irq(&oom_reaper_lock);
				645	if (oom_reaper_list != NULL) {
				646	tsk = oom_reaper_list;
				647	oom_reaper_list = tsk->oom_reaper_list;
				648	}
				649	spin_unlock_irq(&oom_reaper_lock);
				650
				651	if (tsk)
				652	oom_reap_task(tsk);
				653	}
				654
				655	return 0;
				656	}
				657
				658	static void wake_oom_reaper(struct timer_list *timer)
				659	{
				660	struct task_struct *tsk = container_of(timer, struct task_struct,
				661	oom_reaper_timer);
				662	struct mm_struct *mm = tsk->signal->oom_mm;
				663	unsigned long flags;
				664
				665	/* The victim managed to terminate on its own - see exit_mmap */
				666	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
				667	put_task_struct(tsk);
				668	return;
				669	}
				670
				671	spin_lock_irqsave(&oom_reaper_lock, flags);
				672	tsk->oom_reaper_list = oom_reaper_list;
				673	oom_reaper_list = tsk;
				674	spin_unlock_irqrestore(&oom_reaper_lock, flags);
				675	trace_wake_reaper(tsk->pid);
				676	wake_up(&oom_reaper_wait);
				677	}
				678
				679	/*
				680	* Give the OOM victim time to exit naturally before invoking the oom_reaping.
				681	* The timers timeout is arbitrary... the longer it is, the longer the worst
				682	* case scenario for the OOM can take. If it is too small, the oom_reaper can
				683	* get in the way and release resources needed by the process exit path.
				684	* e.g. The futex robust list can sit in Anon\|Private memory that gets reaped
				685	* before the exit path is able to wake the futex waiters.
				686	*/
				687	#define OOM_REAPER_DELAY (2*HZ)
				688	static void queue_oom_reaper(struct task_struct *tsk)
				689	{
				690	/* mm is already queued? */
				691	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
				692	return;
				693
				694	get_task_struct(tsk);
				695	timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0);
				696	tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
				697	add_timer(&tsk->oom_reaper_timer);
				698	}
				699
				/*
				 * Registered with subsys_initcall(): starts the "oom_reaper" kernel thread
				 * running oom_reaper(). The kthread_run() result is stored in
				 * oom_reaper_th without being checked, and 0 is always returned.
				 */
				700	static int __init oom_init(void)
				701	{
				702	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
				703	return 0;
				704	}
				705	subsys_initcall(oom_init)
				706	#else
				/* !CONFIG_MMU variant: the reaper code above is not built, so queueing a victim is a no-op. */
				707	static inline void queue_oom_reaper(struct task_struct *tsk)
				708	{
				709	}
				710	#endif /* CONFIG_MMU */
				711
				712	/**
				713	* mark_oom_victim - mark the given task as OOM victim
				714	* @tsk: task to mark
				715	*
				716	* Has to be called with oom_lock held and never after
				717	* oom has been disabled already.
				718	*
				719	* tsk->mm has to be non NULL and caller has to guarantee it is stable (either
				720	* under task_lock or operate on the current).
				721	*/
				722	static void mark_oom_victim(struct task_struct *tsk)
				723	{
				724	struct mm_struct *mm = tsk->mm;
				725
				726	WARN_ON(oom_killer_disabled);
				727	/* OOM killer might race with memcg OOM */
				728	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
				729	return;
				730
				731	/* oom_mm is bound to the signal struct life time. */
				732	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
				733	mmgrab(tsk->signal->oom_mm);
				734	set_bit(MMF_OOM_VICTIM, &mm->flags);
				735	}
				736
				737	/*
				738	* Make sure that the task is woken up from uninterruptible sleep
				739	* if it is frozen because OOM killer wouldn't be able to free
				740	* any memory and livelock. freezing_slow_path will tell the freezer
				741	* that TIF_MEMDIE tasks should be ignored.
				742	*/
				743	__thaw_task(tsk);
				744	atomic_inc(&oom_victims);
				745	trace_mark_victim(tsk->pid);
				746	}
				747
				748	/**
				749	* exit_oom_victim - note the exit of an OOM victim
				750	*/
				751	void exit_oom_victim(void)
				752	{
				753	clear_thread_flag(TIF_MEMDIE);
				754
				755	if (!atomic_dec_return(&oom_victims))
				756	wake_up_all(&oom_victims_wait);
				757	}
				758
				/**
				 * oom_killer_enable - enable OOM killer
				 *
				 * Clears oom_killer_disabled and emits a pr_pm_debug() message. No lock is
				 * taken in this function.
				 */
				762	void oom_killer_enable(void)
				763	{
				764	oom_killer_disabled = false;
				765	pr_pm_debug("OOM killer enabled.\n");
				766	}
				767
				768	/**
				769	* oom_killer_disable - disable OOM killer
				770	* @timeout: maximum timeout to wait for oom victims in jiffies
				771	*
				772	* Forces all page allocations to fail rather than trigger OOM killer.
				773	* Will block and wait until all OOM victims are killed or the given
				774	* timeout expires.
				775	*
				776	* The function cannot be called when there are runnable user tasks because
				777	* the userspace would see unexpected allocation failures as a result. Any
				778	* new usage of this function should be consulted with MM people.
				779	*
				780	* Returns true if successful and false if the OOM killer cannot be
				781	* disabled.
				782	*/
				783	bool oom_killer_disable(signed long timeout)
				784	{
				785	signed long ret;
				786
				787	/*
				788	* Make sure to not race with an ongoing OOM killer. Check that the
				789	* current is not killed (possibly due to sharing the victim's memory).
				790	*/
				791	if (mutex_lock_killable(&oom_lock))
				792	return false;
				793	oom_killer_disabled = true;
				794	mutex_unlock(&oom_lock);
				795
				796	ret = wait_event_interruptible_timeout(oom_victims_wait,
				797	!atomic_read(&oom_victims), timeout);
				798	if (ret <= 0) {
				799	oom_killer_enable();
				800	return false;
				801	}
				802	pr_pm_debug("OOM killer disabled.\n");
				803
				804	return true;
				805	}
				806
				807	static inline bool __task_will_free_mem(struct task_struct *task)
				808	{
				809	struct signal_struct *sig = task->signal;
				810
				811	/*
				812	* A coredumping process may sleep for an extended period in exit_mm(),
				813	* so the oom killer cannot assume that the process will promptly exit
				814	* and release memory.
				815	*/
				816	if (sig->flags & SIGNAL_GROUP_COREDUMP)
				817	return false;
				818
				819	if (sig->flags & SIGNAL_GROUP_EXIT)
				820	return true;
				821
				822	if (thread_group_empty(task) && (task->flags & PF_EXITING))
				823	return true;
				824
				825	return false;
				826	}
				827
				828	/*
				829	* Checks whether the given task is dying or exiting and likely to
				830	* release its address space. This means that all threads and processes
				831	* sharing the same mm have to be killed or exiting.
				832	* Caller has to make sure that task->mm is stable (hold task_lock or
				833	* it operates on the current).
				834	*/
				835	static bool task_will_free_mem(struct task_struct *task)
				836	{
				837	struct mm_struct *mm = task->mm;
				838	struct task_struct *p;
				839	bool ret = true;
				840
				841	/*
				842	* Skip tasks without mm because it might have passed its exit_mm and
				843	* exit_oom_victim. oom_reaper could have rescued that but do not rely
				844	* on that for now. We can consider find_lock_task_mm in future.
				845	*/
				846	if (!mm)
				847	return false;
				848
				849	if (!__task_will_free_mem(task))
				850	return false;
				851
				852	/*
				853	* This task has already been drained by the oom reaper so there are
				854	* only small chances it will free some more
				855	*/
				856	if (test_bit(MMF_OOM_SKIP, &mm->flags))
				857	return false;
				858
				859	if (atomic_read(&mm->mm_users) <= 1)
				860	return true;
				861
				862	/*
				863	* Make sure that all tasks which share the mm with the given tasks
				864	* are dying as well to make sure that a) nobody pins its mm and
				865	* b) the task is also reapable by the oom reaper.
				866	*/
				867	rcu_read_lock();
				868	for_each_process(p) {
				869	if (!process_shares_mm(p, mm))
				870	continue;
				871	if (same_thread_group(task, p))
				872	continue;
				873	ret = __task_will_free_mem(p);
				874	if (!ret)
				875	break;
				876	}
				877	rcu_read_unlock();
				878
				879	return ret;
				880	}
				881
				882	static void __oom_kill_process(struct task_struct victim, const char message)
				883	{
				884	struct task_struct *p;
				885	struct mm_struct *mm;
				886	bool can_oom_reap = true;
				887
				888	p = find_lock_task_mm(victim);
				889	if (!p) {
				890	put_task_struct(victim);
				891	return;
				892	} else if (victim != p) {
				893	get_task_struct(p);
				894	put_task_struct(victim);
				895	victim = p;
				896	}
				897
				898	/* Get a reference to safely compare mm after task_unlock(victim) */
				899	mm = victim->mm;
				900	mmgrab(mm);
				901
				902	/* Raise event before sending signal: task reaper must see this */
				903	count_vm_event(OOM_KILL);
				904	memcg_memory_event_mm(mm, MEMCG_OOM_KILL);
				905
				906	/*
				907	* We should send SIGKILL before granting access to memory reserves
				908	* in order to prevent the OOM victim from depleting the memory
				909	* reserves from the user space under its control.
				910	*/
				911	do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
				912	mark_oom_victim(victim);
				913	pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
				914	message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
				915	K(get_mm_counter(mm, MM_ANONPAGES)),
				916	K(get_mm_counter(mm, MM_FILEPAGES)),
				917	K(get_mm_counter(mm, MM_SHMEMPAGES)),
				918	from_kuid(&init_user_ns, task_uid(victim)),
				919	mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj);
				920	task_unlock(victim);
				921
				922	/*
				923	* Kill all user processes sharing victim->mm in other thread groups, if
				924	* any. They don't get access to memory reserves, though, to avoid
				925	* depletion of all memory. This prevents mm->mmap_sem livelock when an
				926	* oom killed thread cannot exit because it requires the semaphore and
				927	* its contended by another thread trying to allocate memory itself.
				928	* That thread will now get access to memory reserves since it has a
				929	* pending fatal signal.
				930	*/
				931	rcu_read_lock();
				932	for_each_process(p) {
				933	if (!process_shares_mm(p, mm))
				934	continue;
				935	if (same_thread_group(p, victim))
				936	continue;
				937	if (is_global_init(p)) {
				938	can_oom_reap = false;
				939	set_bit(MMF_OOM_SKIP, &mm->flags);
				940	pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
				941	task_pid_nr(victim), victim->comm,
				942	task_pid_nr(p), p->comm);
				943	continue;
				944	}
				945	/*
				946	* No use_mm() user needs to read from the userspace so we are
				947	* ok to reap it.
				948	*/
				949	if (unlikely(p->flags & PF_KTHREAD))
				950	continue;
				951	do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
				952	}
				953	rcu_read_unlock();
				954
				955	if (can_oom_reap)
				956	queue_oom_reaper(victim);
				957
				958	mmdrop(mm);
				959	put_task_struct(victim);
				960	}
				961	#undef K
				962
				963	/*
				964	* Kill provided task unless it's secured by setting
				965	* oom_score_adj to OOM_SCORE_ADJ_MIN.
				966	*/
				967	static int oom_kill_memcg_member(struct task_struct task, void message)
				968	{
				969	if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
				970	!is_global_init(task)) {
				971	get_task_struct(task);
				972	__oom_kill_process(task, message);
				973	}
				974	return 0;
				975	}
				976
				977	static void oom_kill_process(struct oom_control oc, const char message)
				978	{
				979	struct task_struct *victim = oc->chosen;
				980	struct mem_cgroup *oom_group;
				981	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
				982	DEFAULT_RATELIMIT_BURST);
				983
				984	/*
				985	* If the task is already exiting, don't alarm the sysadmin or kill
				986	* its children or threads, just give it access to memory reserves
				987	* so it can die quickly
				988	*/
				989	task_lock(victim);
				990	if (task_will_free_mem(victim)) {
				991	mark_oom_victim(victim);
				992	queue_oom_reaper(victim);
				993	task_unlock(victim);
				994	put_task_struct(victim);
				995	return;
				996	}
				997	task_unlock(victim);
				998
				999	if (__ratelimit(&oom_rs))
				1000	dump_header(oc, victim);
				1001
				1002	/*
				1003	* Do we need to kill the entire memory cgroup?
				1004	* Or even one of the ancestor memory cgroups?
				1005	* Check this out before killing the victim task.
				1006	*/
				1007	oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
				1008
				1009	__oom_kill_process(victim, message);
				1010
				1011	/*
				1012	* If necessary, kill all tasks in the selected memory cgroup.
				1013	*/
				1014	if (oom_group) {
				1015	mem_cgroup_print_oom_group(oom_group);
				1016	mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
				1017	(void*)message);
				1018	mem_cgroup_put(oom_group);
				1019	}
				1020	}
				1021
				1022	/*
				1023	* Determines whether the kernel must panic because of the panic_on_oom sysctl.
				1024	*/
				1025	static void check_panic_on_oom(struct oom_control *oc)
				1026	{
				1027	if (likely(!sysctl_panic_on_oom))
				1028	return;
				1029	if (sysctl_panic_on_oom != 2) {
				1030	/*
				1031	* panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
				1032	* does not panic for cpuset, mempolicy, or memcg allocation
				1033	* failures.
				1034	*/
				1035	if (oc->constraint != CONSTRAINT_NONE)
				1036	return;
				1037	}
				1038	/* Do not panic for oom kills triggered by sysrq */
				1039	if (is_sysrq_oom(oc))
				1040	return;
				1041	dump_header(oc, NULL);
				1042	panic("Out of memory: %s panic_on_oom is enabled\n",
				1043	sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
				1044	}
				1045
				1046	static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
				1047
				1048	int register_oom_notifier(struct notifier_block *nb)
				1049	{
				1050	return blocking_notifier_chain_register(&oom_notify_list, nb);
				1051	}
				1052	EXPORT_SYMBOL_GPL(register_oom_notifier);
				1053
				1054	int unregister_oom_notifier(struct notifier_block *nb)
				1055	{
				1056	return blocking_notifier_chain_unregister(&oom_notify_list, nb);
				1057	}
				1058	EXPORT_SYMBOL_GPL(unregister_oom_notifier);
				1059
				1060	/**
				1061	* out_of_memory - kill the "best" process when we run out of memory
				1062	* @oc: pointer to struct oom_control
				1063	*
				1064	* If we run out of memory, we have the choice between either
				1065	* killing a random task (bad), letting the system crash (worse)
				1066	* OR try to be smart about which process to kill. Note that we
				1067	* don't have to be perfect here, we just have to be good.
				1068	*/
				1069	bool out_of_memory(struct oom_control *oc)
				1070	{
				1071	unsigned long freed = 0;
				1072
				1073	if (oom_killer_disabled)
				1074	return false;
				1075
				1076	if (!is_memcg_oom(oc)) {
				1077	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
				1078	if (freed > 0)
				1079	/* Got some memory back in the last second. */
				1080	return true;
				1081	}
				1082
				1083	/*
				1084	* If current has a pending SIGKILL or is exiting, then automatically
				1085	* select it. The goal is to allow it to allocate so that it may
				1086	* quickly exit and free its memory.
				1087	*/
				1088	if (task_will_free_mem(current)) {
				1089	mark_oom_victim(current);
				1090	queue_oom_reaper(current);
				1091	return true;
				1092	}
				1093
				1094	/*
				1095	* The OOM killer does not compensate for IO-less reclaim.
				1096	* pagefault_out_of_memory lost its gfp context so we have to
				1097	* make sure exclude 0 mask - all other users should have at least
				1098	* ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
				1099	* invoke the OOM killer even if it is a GFP_NOFS allocation.
				1100	*/
				1101	if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
				1102	return true;
				1103
				1104	/*
				1105	* Check if there were limitations on the allocation (only relevant for
				1106	* NUMA and memcg) that may require different handling.
				1107	*/
				1108	oc->constraint = constrained_alloc(oc);
				1109	if (oc->constraint != CONSTRAINT_MEMORY_POLICY)
				1110	oc->nodemask = NULL;
				1111	check_panic_on_oom(oc);
				1112
				1113	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
				1114	current->mm && !oom_unkillable_task(current) &&
				1115	oom_cpuset_eligible(current, oc) &&
				1116	current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
				1117	get_task_struct(current);
				1118	oc->chosen = current;
				1119	oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
				1120	return true;
				1121	}
				1122
				1123	select_bad_process(oc);
				1124	/* Found nothing?!?! */
				1125	if (!oc->chosen) {
				1126	dump_header(oc, NULL);
				1127	pr_warn("Out of memory and no killable processes...\n");
				1128	/*
				1129	* If we got here due to an actual allocation at the
				1130	* system level, we cannot survive this and will enter
				1131	* an endless loop in the allocator. Bail out now.
				1132	*/
				1133	if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
				1134	panic("System is deadlocked on memory\n");
				1135	}
				1136	if (oc->chosen && oc->chosen != (void *)-1UL)
				1137	oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
				1138	"Memory cgroup out of memory");
				1139	return !!oc->chosen;
				1140	}
				1141
				1142	/*
				1143	* The pagefault handler calls here because some allocation has failed. We have
				1144	* to take care of the memcg OOM here because this is the only safe context without
				1145	* any locks held but let the oom killer triggered from the allocation context care
				1146	* about the global OOM.
				1147	*/
				1148	void pagefault_out_of_memory(void)
				1149	{
				1150	static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL,
				1151	DEFAULT_RATELIMIT_BURST);
				1152
				1153	if (mem_cgroup_oom_synchronize(true))
				1154	return;
				1155
				1156	if (fatal_signal_pending(current))
				1157	return;
				1158
				1159	if (__ratelimit(&pfoom_rs))
				1160	pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n");
				1161	}