/*
 *  linux/mm/oom_kill.c
 *
 *  Copyright (C)  1998,2000  Rik van Riel
 *	Thanks go out to Claus Fischer for some serious inspiration and
 *	for goading me into coding this file...
 *  Copyright (C)  2010  Google, Inc.
 *	Rewritten by David Rientjes
 *
 *  The routines in this file are used to kill a process when
 *  we're seriously out of memory. This gets called from __alloc_pages()
 *  in mm/page_alloc.c when we really run out of memory.
 *
 *  Since we won't call these routines often (on a well-configured
 *  machine) this file will double as a 'coding guide' and a signpost
 *  for newbie kernel hackers. It features several pointers to major
 *  kernel subsystems and hints as to where to find out what things do.
 */

#include <linux/oom.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
#include <linux/swap.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/cpuset.h>
#include <linux/export.h>
#include <linux/notifier.h>
#include <linux/memcontrol.h>
#include <linux/mempolicy.h>
#include <linux/security.h>
#include <linux/ptrace.h>
#include <linux/freezer.h>
#include <linux/ftrace.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/init.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>
#include "internal.h"
#include "slab.h"

#define CREATE_TRACE_POINTS
#include <trace/events/oom.h>

int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks = 1;
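
/*
 * These knobs are exposed to userspace under /proc/sys/vm as
 * vm.panic_on_oom, vm.oom_kill_allocating_task and vm.oom_dump_tasks
 * (the sysctl table wiring lives in kernel/sysctl.c, not here).
 * Illustrative usage only - e.g. to keep per-task dumps out of the log
 * on a box with a huge number of processes:
 *
 *	echo 0 > /proc/sys/vm/oom_dump_tasks
 */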

/*
 * Serializes oom killer invocations (out_of_memory()) from all contexts to
 * prevent over-eager oom killing (e.g. when the oom killer is invoked
 * from different domains).
 *
 * oom_killer_disable() relies on this lock to stabilize oom_killer_disabled
 * and mark_oom_victim().
 */
DEFINE_MUTEX(oom_lock);

#ifdef CONFIG_NUMA
/**
 * has_intersects_mems_allowed() - check task eligibility for kill
 * @start: task struct of which task to consider
 * @mask: nodemask passed to page allocator for mempolicy ooms
 *
 * Task eligibility is determined by whether or not a candidate task, @start,
 * shares the same mempolicy nodes as current if it is bound by such a policy
 * and whether or not it has the same set of allowed cpuset nodes.
 */
static bool has_intersects_mems_allowed(struct task_struct *start,
					const nodemask_t *mask)
{
	struct task_struct *tsk;
	bool ret = false;

	rcu_read_lock();
	for_each_thread(start, tsk) {
		if (mask) {
			/*
			 * If this is a mempolicy constrained oom, tsk's
			 * cpuset is irrelevant.  Only return true if its
			 * mempolicy intersects current, otherwise it may be
			 * needlessly killed.
			 */
			ret = mempolicy_nodemask_intersects(tsk, mask);
		} else {
			/*
			 * This is not a mempolicy constrained oom, so only
			 * check the mems of tsk's cpuset.
			 */
			ret = cpuset_mems_allowed_intersects(current, tsk);
		}
		if (ret)
			break;
	}
	rcu_read_unlock();

	return ret;
}
#else
static bool has_intersects_mems_allowed(struct task_struct *tsk,
					const nodemask_t *mask)
{
	return true;
}
#endif /* CONFIG_NUMA */

/*
 * The process p may have detached its own ->mm while exiting or through
 * use_mm(), but one or more of its subthreads may still have a valid
 * pointer.  Return p, or any of its subthreads with a valid ->mm, with
 * task_lock() held.
 */
struct task_struct *find_lock_task_mm(struct task_struct *p)
{
	struct task_struct *t;

	rcu_read_lock();

	for_each_thread(p, t) {
		task_lock(t);
		if (likely(t->mm))
			goto found;
		task_unlock(t);
	}
	t = NULL;
found:
	rcu_read_unlock();

	return t;
}
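
/*
 * Typical (illustrative) calling pattern: the returned thread is returned
 * task_lock()ed, so ->mm is stable until the caller drops the lock:
 *
 *	struct task_struct *t = find_lock_task_mm(p);
 *
 *	if (t) {
 *		unsigned long rss = get_mm_rss(t->mm);	// ->mm stable under task_lock
 *		task_unlock(t);
 *	}
 */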

/*
 * order == -1 means the oom kill is required by sysrq, otherwise only
 * for display purposes.
 */
static inline bool is_sysrq_oom(struct oom_control *oc)
{
	return oc->order == -1;
}

static inline bool is_memcg_oom(struct oom_control *oc)
{
	return oc->memcg != NULL;
}

/* return true if the task is not adequate as candidate victim task. */
static bool oom_unkillable_task(struct task_struct *p,
		struct mem_cgroup *memcg, const nodemask_t *nodemask)
{
	if (is_global_init(p))
		return true;
	if (p->flags & PF_KTHREAD)
		return true;

	/* When mem_cgroup_out_of_memory() and p is not member of the group */
	if (memcg && !task_in_mem_cgroup(p, memcg))
		return true;

	/* p may not have freeable memory in nodemask */
	if (!has_intersects_mems_allowed(p, nodemask))
		return true;

	return false;
}
/*
 * Print out unreclaimable slab info when the amount of unreclaimable slab
 * memory is greater than all user memory (LRU pages).
 */
static bool is_dump_unreclaim_slabs(void)
{
	unsigned long nr_lru;

	nr_lru = global_node_page_state(NR_ACTIVE_ANON) +
		 global_node_page_state(NR_INACTIVE_ANON) +
		 global_node_page_state(NR_ACTIVE_FILE) +
		 global_node_page_state(NR_INACTIVE_FILE) +
		 global_node_page_state(NR_ISOLATED_ANON) +
		 global_node_page_state(NR_ISOLATED_FILE) +
		 global_node_page_state(NR_UNEVICTABLE);

	return (global_node_page_state(NR_SLAB_UNRECLAIMABLE) > nr_lru);
}

/**
 * oom_badness - heuristic function to determine which candidate task to kill
 * @p: task struct of which task we should calculate
 * @totalpages: total present RAM allowed for page allocation
 * @memcg: task's memory controller, if constrained
 * @nodemask: nodemask passed to page allocator for mempolicy ooms
 *
 * The heuristic for determining which task to kill is made to be as simple and
 * predictable as possible.  The goal is to return the highest value for the
 * task consuming the most memory to avoid subsequent oom failures.
 */
unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
			  const nodemask_t *nodemask, unsigned long totalpages)
{
	long points;
	long adj;

	if (oom_unkillable_task(p, memcg, nodemask))
		return 0;

	p = find_lock_task_mm(p);
	if (!p)
		return 0;
	/*
	 * Do not even consider tasks which are explicitly marked oom
	 * unkillable or have already been oom reaped or are in
	 * the middle of vfork.
	 */
	adj = (long)p->signal->oom_score_adj;
	if (adj == OOM_SCORE_ADJ_MIN ||
			test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
			in_vfork(p)) {
		task_unlock(p);
		return 0;
	}

	/*
	 * The baseline for the badness score is the proportion of RAM that each
	 * task's rss, pagetable and swap space use.
	 */
	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
		mm_pgtables_bytes(p->mm) / PAGE_SIZE;
	task_unlock(p);

	/* Normalize to oom_score_adj units */
	adj *= totalpages / 1000;
	points += adj;

	/*
	 * Never return 0 for an eligible task regardless of the root bonus and
	 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
	 */
	return points > 0 ? points : 1;
}
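
/*
 * Worked example of the heuristic above, with illustrative numbers: on a
 * machine with totalpages = 2,000,000 (roughly 8GB of RAM+swap with 4kB
 * pages), a task whose rss + swapents + pagetable pages total 100,000 and
 * whose oom_score_adj is 500 scores
 *
 *	points = 100,000 + 500 * (2,000,000 / 1000) = 1,100,000
 *
 * while the same task with oom_score_adj = -500 scores 100,000 - 1,000,000,
 * which is clamped to 1: still eligible, but effectively protected.
 */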

enum oom_constraint {
	CONSTRAINT_NONE,
	CONSTRAINT_CPUSET,
	CONSTRAINT_MEMORY_POLICY,
	CONSTRAINT_MEMCG,
};

/*
 * Determine the type of allocation constraint.
 */
static enum oom_constraint constrained_alloc(struct oom_control *oc)
{
	struct zone *zone;
	struct zoneref *z;
	enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
	bool cpuset_limited = false;
	int nid;

	if (is_memcg_oom(oc)) {
		oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1;
		return CONSTRAINT_MEMCG;
	}

	/* Default to all available memory */
	oc->totalpages = totalram_pages + total_swap_pages;

	if (!IS_ENABLED(CONFIG_NUMA))
		return CONSTRAINT_NONE;

	if (!oc->zonelist)
		return CONSTRAINT_NONE;
	/*
	 * If this is a __GFP_THISNODE allocation, we only reach here when
	 * __GFP_NOFAIL is also used.  We should avoid killing current, so a
	 * random task has to be killed in this case.  Ideally this would be
	 * CONSTRAINT_THISNODE, but there is no way to handle that for now.
	 */
	if (oc->gfp_mask & __GFP_THISNODE)
		return CONSTRAINT_NONE;

	/*
	 * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
	 * the page allocator means a mempolicy is in effect.  Cpuset policy
	 * is enforced in get_page_from_freelist().
	 */
	if (oc->nodemask &&
	    !nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
		oc->totalpages = total_swap_pages;
		for_each_node_mask(nid, *oc->nodemask)
			oc->totalpages += node_spanned_pages(nid);
		return CONSTRAINT_MEMORY_POLICY;
	}

	/* Check this allocation failure is caused by cpuset's wall function */
	for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
			high_zoneidx, oc->nodemask)
		if (!cpuset_zone_allowed(zone, oc->gfp_mask))
			cpuset_limited = true;

	if (cpuset_limited) {
		oc->totalpages = total_swap_pages;
		for_each_node_mask(nid, cpuset_current_mems_allowed)
			oc->totalpages += node_spanned_pages(nid);
		return CONSTRAINT_CPUSET;
	}
	return CONSTRAINT_NONE;
}
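
/*
 * Note on the constrained totalpages above: the baseline is swap plus the
 * pages of the nodes the allocation is limited to.  node_spanned_pages()
 * counts the full physical range spanned by a node, holes included, so on
 * sparse layouts it can overestimate slightly compared to
 * node_present_pages(); since totalpages is only used to normalize
 * oom_score_adj and the final score, that imprecision is generally harmless.
 */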

static int oom_evaluate_task(struct task_struct *task, void *arg)
{
	struct oom_control *oc = arg;
	unsigned long points;

	if (oom_unkillable_task(task, NULL, oc->nodemask))
		goto next;

	/*
	 * This task already has access to memory reserves and is being killed.
	 * Don't allow any other task to have access to the reserves unless
	 * the task has MMF_OOM_SKIP because chances that it would release
	 * any memory are quite low.
	 */
	if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
		if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
			goto next;
		goto abort;
	}

	/*
	 * If task is allocating a lot of memory and has been marked to be
	 * killed first if it triggers an oom, then select it.
	 */
	if (oom_task_origin(task)) {
		points = ULONG_MAX;
		goto select;
	}

	points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
	if (!points || points < oc->chosen_points)
		goto next;

	/* Prefer thread group leaders for display purposes */
	if (points == oc->chosen_points && thread_group_leader(oc->chosen))
		goto next;
select:
	if (oc->chosen)
		put_task_struct(oc->chosen);
	get_task_struct(task);
	oc->chosen = task;
	oc->chosen_points = points;
next:
	return 0;
abort:
	if (oc->chosen)
		put_task_struct(oc->chosen);
	oc->chosen = (void *)-1UL;
	return 1;
}

/*
 * Simple selection loop. We choose the process with the highest number of
 * 'points'. In case scan was aborted, oc->chosen is set to -1.
 */
static void select_bad_process(struct oom_control *oc)
{
	if (is_memcg_oom(oc))
		mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
	else {
		struct task_struct *p;

		rcu_read_lock();
		for_each_process(p)
			if (oom_evaluate_task(p, oc))
				break;
		rcu_read_unlock();
	}

	oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
}
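
/*
 * The final division above rescales the raw badness (in pages) into roughly
 * the 0..1000 range, the same scale /proc/<pid>/oom_score uses, so the
 * "score %u" printed by oom_kill_process() below is comparable in scale to
 * what userspace tools report (the memcg/NUMA-constrained totalpages can
 * make the two differ for constrained ooms).
 */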

/**
 * dump_tasks - dump current memory state of all system tasks
 * @memcg: current's memory controller, if constrained
 * @nodemask: nodemask passed to page allocator for mempolicy ooms
 *
 * Dumps the current memory state of all eligible tasks.  Tasks not in the same
 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
 * are not shown.
 * State information includes task's pid, uid, tgid, vm size, rss,
 * pgtables_bytes, swapents, oom_score_adj value, and name.
 */
static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
{
	struct task_struct *p;
	struct task_struct *task;

	pr_info("Tasks state (memory values in pages):\n");
	pr_info("[  pid  ]   uid  tgid total_vm      rss pgtables_bytes swapents oom_score_adj name\n");
	rcu_read_lock();
	for_each_process(p) {
		if (oom_unkillable_task(p, memcg, nodemask))
			continue;

		task = find_lock_task_mm(p);
		if (!task) {
			/*
			 * This is a kthread or all of p's threads have already
			 * detached their mm's.  There's no need to report
			 * them; they can't be oom killed anyway.
			 */
			continue;
		}

		pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu         %5hd %s\n",
			task->pid, from_kuid(&init_user_ns, task_uid(task)),
			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
			mm_pgtables_bytes(task->mm),
			get_mm_counter(task->mm, MM_SWAPENTS),
			task->signal->oom_score_adj, task->comm);
		task_unlock(task);
	}
	rcu_read_unlock();
}

static void dump_header(struct oom_control *oc, struct task_struct *p)
{
	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n",
		current->comm, oc->gfp_mask, &oc->gfp_mask,
		nodemask_pr_args(oc->nodemask), oc->order,
		current->signal->oom_score_adj);
	if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
		pr_warn("COMPACTION is disabled!!!\n");

	cpuset_print_current_mems_allowed();
	dump_stack();
	if (is_memcg_oom(oc))
		mem_cgroup_print_oom_info(oc->memcg, p);
	else {
		show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
		if (is_dump_unreclaim_slabs())
			dump_unreclaimable_slab();
	}
	if (sysctl_oom_dump_tasks)
		dump_tasks(oc->memcg, oc->nodemask);
}

/*
 * Number of OOM victims in flight
 */
static atomic_t oom_victims = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);

static bool oom_killer_disabled __read_mostly;

#define K(x) ((x) << (PAGE_SHIFT-10))
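
/*
 * K() converts a page count into kilobytes for the messages below: with the
 * common PAGE_SHIFT of 12 (4kB pages) it is x << 2, i.e. pages * 4, so an
 * "anon-rss:2048kB" in an oom report corresponds to 512 anonymous pages.
 */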

/*
 * task->mm can be NULL if the task is the exited group leader.  So to
 * determine whether the task is using a particular mm, we examine all the
 * task's threads: if one of those is using this mm then this task was also
 * using it.
 */
bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
{
	struct task_struct *t;

	for_each_thread(p, t) {
		struct mm_struct *t_mm = READ_ONCE(t->mm);
		if (t_mm)
			return t_mm == mm;
	}
	return false;
}

#ifdef CONFIG_MMU
/*
 * OOM Reaper kernel thread which tries to reap the memory used by the OOM
 * victim (if that is possible) to help the OOM killer to move on.
 */
static struct task_struct *oom_reaper_th;
static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
static struct task_struct *oom_reaper_list;
static DEFINE_SPINLOCK(oom_reaper_lock);

bool __oom_reap_task_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma;
	bool ret = true;

	/*
	 * Tell all users of get_user/copy_from_user etc... that the content
	 * is no longer stable. No barriers really needed because unmapping
	 * should imply barriers already and the reader would hit a page fault
	 * if it stumbled over a reaped memory.
	 */
	set_bit(MMF_UNSTABLE, &mm->flags);

	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
		if (!can_madv_dontneed_vma(vma))
			continue;

		/*
		 * Only anonymous pages have a good chance to be dropped
		 * without additional steps which we cannot afford as we
		 * are OOM already.
		 *
		 * We do not even care about fs backed pages because all
		 * which are reclaimable have already been reclaimed and
		 * we do not want to block exit_mmap by keeping mm ref
		 * count elevated without a good reason.
		 */
		if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
			const unsigned long start = vma->vm_start;
			const unsigned long end = vma->vm_end;
			struct mmu_gather tlb;

			tlb_gather_mmu(&tlb, mm, start, end);
			if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) {
				tlb_finish_mmu(&tlb, start, end);
				ret = false;
				continue;
			}
			unmap_page_range(&tlb, vma, start, end, NULL);
			mmu_notifier_invalidate_range_end(mm, start, end);
			tlb_finish_mmu(&tlb, start, end);
		}
	}

	return ret;
}
/*
 * Reaps the address space of the given task.
 *
 * Returns true on success, or false if none or only part of the address space
 * could be reclaimed and the caller should retry later.
 */
static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
{
	bool ret = true;

	if (!down_read_trylock(&mm->mmap_sem)) {
		trace_skip_task_reaping(tsk->pid);
		return false;
	}

	/*
	 * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
	 * work on the mm anymore. The check for MMF_OOM_SKIP must run
	 * under mmap_sem for reading because it serializes against the
	 * down_write();up_write() cycle in exit_mmap().
	 */
	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
		trace_skip_task_reaping(tsk->pid);
		goto out_unlock;
	}

	trace_start_task_reaping(tsk->pid);

	/* If we failed to reap part of the address space, try again later. */
	ret = __oom_reap_task_mm(mm);
	if (!ret)
		goto out_finish;

	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
			task_pid_nr(tsk), tsk->comm,
			K(get_mm_counter(mm, MM_ANONPAGES)),
			K(get_mm_counter(mm, MM_FILEPAGES)),
			K(get_mm_counter(mm, MM_SHMEMPAGES)));
out_finish:
	trace_finish_task_reaping(tsk->pid);
out_unlock:
	up_read(&mm->mmap_sem);

	return ret;
}

#define MAX_OOM_REAP_RETRIES 10
static void oom_reap_task(struct task_struct *tsk)
{
	int attempts = 0;
	struct mm_struct *mm = tsk->signal->oom_mm;

	/* Retry the down_read_trylock(mmap_sem) a few times */
	while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
		schedule_timeout_idle(HZ/10);

	if (attempts <= MAX_OOM_REAP_RETRIES ||
	    test_bit(MMF_OOM_SKIP, &mm->flags))
		goto done;

	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
		task_pid_nr(tsk), tsk->comm);
	debug_show_all_locks();

done:
	tsk->oom_reaper_list = NULL;

	/*
	 * Hide this mm from OOM killer because it has been either reaped or
	 * somebody can't call up_write(mmap_sem).
	 */
	set_bit(MMF_OOM_SKIP, &mm->flags);

	/* Drop a reference taken by wake_oom_reaper */
	put_task_struct(tsk);
}
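
/*
 * With MAX_OOM_REAP_RETRIES == 10 and a delay of HZ/10 (100ms) per attempt,
 * the reaper gives up on a victim whose mmap_sem stays unavailable after
 * roughly one second and falls back to setting MMF_OOM_SKIP above so that
 * the OOM killer can pick another task.
 */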

static int oom_reaper(void *unused)
{
	while (true) {
		struct task_struct *tsk = NULL;

		wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
		spin_lock(&oom_reaper_lock);
		if (oom_reaper_list != NULL) {
			tsk = oom_reaper_list;
			oom_reaper_list = tsk->oom_reaper_list;
		}
		spin_unlock(&oom_reaper_lock);

		if (tsk)
			oom_reap_task(tsk);
	}

	return 0;
}

static void wake_oom_reaper(struct task_struct *tsk)
{
	/* mm is already queued? */
	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
		return;

	get_task_struct(tsk);

	spin_lock(&oom_reaper_lock);
	tsk->oom_reaper_list = oom_reaper_list;
	oom_reaper_list = tsk;
	spin_unlock(&oom_reaper_lock);
	trace_wake_reaper(tsk->pid);
	wake_up(&oom_reaper_wait);
}

static int __init oom_init(void)
{
	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
	return 0;
}
subsys_initcall(oom_init)
#else
static inline void wake_oom_reaper(struct task_struct *tsk)
{
}
#endif /* CONFIG_MMU */

/**
 * mark_oom_victim - mark the given task as OOM victim
 * @tsk: task to mark
 *
 * Has to be called with oom_lock held and never after
 * oom has been disabled already.
 *
 * tsk->mm has to be non NULL and caller has to guarantee it is stable (either
 * under task_lock or operate on the current).
 */
static void mark_oom_victim(struct task_struct *tsk)
{
	struct mm_struct *mm = tsk->mm;

	WARN_ON(oom_killer_disabled);
	/* OOM killer might race with memcg OOM */
	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
		return;

	/* oom_mm is bound to the signal struct life time. */
	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
		mmgrab(tsk->signal->oom_mm);
		set_bit(MMF_OOM_VICTIM, &mm->flags);
	}

	/*
	 * Make sure that the task is woken up from uninterruptible sleep
	 * if it is frozen because OOM killer wouldn't be able to free
	 * any memory and livelock. freezing_slow_path will tell the freezer
	 * that TIF_MEMDIE tasks should be ignored.
	 */
	__thaw_task(tsk);
	atomic_inc(&oom_victims);
	trace_mark_victim(tsk->pid);
}

/**
 * exit_oom_victim - note the exit of an OOM victim
 */
void exit_oom_victim(void)
{
	clear_thread_flag(TIF_MEMDIE);

	if (!atomic_dec_return(&oom_victims))
		wake_up_all(&oom_victims_wait);
}

/**
 * oom_killer_enable - enable OOM killer
 */
void oom_killer_enable(void)
{
	oom_killer_disabled = false;
	pr_info("OOM killer enabled.\n");
}

/**
 * oom_killer_disable - disable OOM killer
 * @timeout: maximum timeout to wait for oom victims in jiffies
 *
 * Forces all page allocations to fail rather than trigger OOM killer.
 * Will block and wait until all OOM victims are killed or the given
 * timeout expires.
 *
 * The function cannot be called when there are runnable user tasks because
 * the userspace would see unexpected allocation failures as a result. Any
 * new usage of this function should be consulted with MM people.
 *
 * Returns true if successful and false if the OOM killer cannot be
 * disabled.
 */
bool oom_killer_disable(signed long timeout)
{
	signed long ret;

	/*
	 * Make sure to not race with an ongoing OOM killer. Check that the
	 * current is not killed (possibly due to sharing the victim's memory).
	 */
	if (mutex_lock_killable(&oom_lock))
		return false;
	oom_killer_disabled = true;
	mutex_unlock(&oom_lock);

	ret = wait_event_interruptible_timeout(oom_victims_wait,
			!atomic_read(&oom_victims), timeout);
	if (ret <= 0) {
		oom_killer_enable();
		return false;
	}
	pr_info("OOM killer disabled.\n");

	return true;
}
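
/*
 * The main user of the oom_killer_disable()/oom_killer_enable() pair is the
 * freezer during suspend/hibernation (see freeze_processes()): user space is
 * already frozen there, so rather than killing tasks that cannot react,
 * allocations simply fail until the killer is re-enabled on thaw.
 */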

static inline bool __task_will_free_mem(struct task_struct *task)
{
	struct signal_struct *sig = task->signal;

	/*
	 * A coredumping process may sleep for an extended period in exit_mm(),
	 * so the oom killer cannot assume that the process will promptly exit
	 * and release memory.
	 */
	if (sig->flags & SIGNAL_GROUP_COREDUMP)
		return false;

	if (sig->flags & SIGNAL_GROUP_EXIT)
		return true;

	if (thread_group_empty(task) && (task->flags & PF_EXITING))
		return true;

	return false;
}

/*
 * Checks whether the given task is dying or exiting and likely to
 * release its address space. This means that all threads and processes
 * sharing the same mm have to be killed or exiting.
 * Caller has to make sure that task->mm is stable (hold task_lock or
 * it operates on the current).
 */
static bool task_will_free_mem(struct task_struct *task)
{
	struct mm_struct *mm = task->mm;
	struct task_struct *p;
	bool ret = true;

	/*
	 * Skip tasks without mm because it might have passed its exit_mm and
	 * exit_oom_victim. oom_reaper could have rescued that but do not rely
	 * on that for now. We can consider find_lock_task_mm in future.
	 */
	if (!mm)
		return false;

	if (!__task_will_free_mem(task))
		return false;

	/*
	 * This task has already been drained by the oom reaper so there are
	 * only small chances it will free some more
	 */
	if (test_bit(MMF_OOM_SKIP, &mm->flags))
		return false;

	if (atomic_read(&mm->mm_users) <= 1)
		return true;

	/*
	 * Make sure that all tasks which share the mm with the given task
	 * are dying as well to make sure that a) nobody pins its mm and
	 * b) the task is also reapable by the oom reaper.
	 */
	rcu_read_lock();
	for_each_process(p) {
		if (!process_shares_mm(p, mm))
			continue;
		if (same_thread_group(task, p))
			continue;
		ret = __task_will_free_mem(p);
		if (!ret)
			break;
	}
	rcu_read_unlock();

	return ret;
}

static void __oom_kill_process(struct task_struct *victim)
{
	struct task_struct *p;
	struct mm_struct *mm;
	bool can_oom_reap = true;

	p = find_lock_task_mm(victim);
	if (!p) {
		put_task_struct(victim);
		return;
	} else if (victim != p) {
		get_task_struct(p);
		put_task_struct(victim);
		victim = p;
	}

	/* Get a reference to safely compare mm after task_unlock(victim) */
	mm = victim->mm;
	mmgrab(mm);

	/* Raise event before sending signal: task reaper must see this */
	count_vm_event(OOM_KILL);
	memcg_memory_event_mm(mm, MEMCG_OOM_KILL);

	/*
	 * We should send SIGKILL before granting access to memory reserves
	 * in order to prevent the OOM victim from depleting the memory
	 * reserves from the user space under its control.
	 */
	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, PIDTYPE_TGID);
	mark_oom_victim(victim);
	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
		task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
		K(get_mm_counter(victim->mm, MM_ANONPAGES)),
		K(get_mm_counter(victim->mm, MM_FILEPAGES)),
		K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
	task_unlock(victim);

	/*
	 * Kill all user processes sharing victim->mm in other thread groups, if
	 * any.  They don't get access to memory reserves, though, to avoid
	 * depletion of all memory.  This prevents mm->mmap_sem livelock when an
	 * oom killed thread cannot exit because it requires the semaphore and
	 * it's contended by another thread trying to allocate memory itself.
	 * That thread will now get access to memory reserves since it has a
	 * pending fatal signal.
	 */
	rcu_read_lock();
	for_each_process(p) {
		if (!process_shares_mm(p, mm))
			continue;
		if (same_thread_group(p, victim))
			continue;
		if (is_global_init(p)) {
			can_oom_reap = false;
			set_bit(MMF_OOM_SKIP, &mm->flags);
			pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
					task_pid_nr(victim), victim->comm,
					task_pid_nr(p), p->comm);
			continue;
		}
		/*
		 * No use_mm() user needs to read from the userspace so we are
		 * ok to reap it.
		 */
		if (unlikely(p->flags & PF_KTHREAD))
			continue;
		do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, PIDTYPE_TGID);
	}
	rcu_read_unlock();

	if (can_oom_reap)
		wake_oom_reaper(victim);

	mmdrop(mm);
	put_task_struct(victim);
}
#undef K

/*
 * Kill provided task unless it's secured by setting
 * oom_score_adj to OOM_SCORE_ADJ_MIN.
 */
static int oom_kill_memcg_member(struct task_struct *task, void *unused)
{
	if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
	    !is_global_init(task)) {
		get_task_struct(task);
		__oom_kill_process(task);
	}
	return 0;
}

static void oom_kill_process(struct oom_control *oc, const char *message)
{
	struct task_struct *p = oc->chosen;
	unsigned int points = oc->chosen_points;
	struct task_struct *victim = p;
	struct task_struct *child;
	struct task_struct *t;
	struct mem_cgroup *oom_group;
	unsigned int victim_points = 0;
	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
					      DEFAULT_RATELIMIT_BURST);

	/*
	 * If the task is already exiting, don't alarm the sysadmin or kill
	 * its children or threads, just give it access to memory reserves
	 * so it can die quickly
	 */
	task_lock(p);
	if (task_will_free_mem(p)) {
		mark_oom_victim(p);
		wake_oom_reaper(p);
		task_unlock(p);
		put_task_struct(p);
		return;
	}
	task_unlock(p);

	if (__ratelimit(&oom_rs))
		dump_header(oc, p);

	pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
		message, task_pid_nr(p), p->comm, points);

	/*
	 * If any of p's children has a different mm and is eligible for kill,
	 * the one with the highest oom_badness() score is sacrificed for its
	 * parent.  This attempts to lose the minimal amount of work done while
	 * still freeing memory.
	 */
	read_lock(&tasklist_lock);

	/*
	 * The task 'p' might have already exited before reaching here. The
	 * put_task_struct() will free task_struct 'p' while the loop still
	 * tries to access the fields of 'p', so get an extra reference.
	 */
	get_task_struct(p);
	for_each_thread(p, t) {
		list_for_each_entry(child, &t->children, sibling) {
			unsigned int child_points;

			if (process_shares_mm(child, p->mm))
				continue;
			/*
			 * oom_badness() returns 0 if the thread is unkillable
			 */
			child_points = oom_badness(child,
				oc->memcg, oc->nodemask, oc->totalpages);
			if (child_points > victim_points) {
				put_task_struct(victim);
				victim = child;
				victim_points = child_points;
				get_task_struct(victim);
			}
		}
	}
	put_task_struct(p);
	read_unlock(&tasklist_lock);

	/*
	 * Do we need to kill the entire memory cgroup?
	 * Or even one of the ancestor memory cgroups?
	 * Check this out before killing the victim task.
	 */
	oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);

	__oom_kill_process(victim);

	/*
	 * If necessary, kill all tasks in the selected memory cgroup.
	 */
	if (oom_group) {
		mem_cgroup_print_oom_group(oom_group);
		mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL);
		mem_cgroup_put(oom_group);
	}
}

/*
 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
 */
static void check_panic_on_oom(struct oom_control *oc,
			       enum oom_constraint constraint)
{
	if (likely(!sysctl_panic_on_oom))
		return;
	if (sysctl_panic_on_oom != 2) {
		/*
		 * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
		 * does not panic for cpuset, mempolicy, or memcg allocation
		 * failures.
		 */
		if (constraint != CONSTRAINT_NONE)
			return;
	}
	/* Do not panic for oom kills triggered by sysrq */
	if (is_sysrq_oom(oc))
		return;
	dump_header(oc, NULL);
	panic("Out of memory: %s panic_on_oom is enabled\n",
		sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
}
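
/*
 * To summarize the vm.panic_on_oom values handled above:
 *
 *	0 - never panic, kill a task instead (default)
 *	1 - panic on a system-wide OOM, but not for cpuset, mempolicy or
 *	    memcg constrained OOMs
 *	2 - always panic, even for constrained OOMs
 *
 * e.g. "sysctl vm.panic_on_oom=1", typically combined with kernel.panic=N
 * for an automatic reboot after N seconds, on machines that prefer a clean
 * restart over running degraded after an OOM.
 */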

static BLOCKING_NOTIFIER_HEAD(oom_notify_list);

int register_oom_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&oom_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_oom_notifier);

int unregister_oom_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&oom_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_oom_notifier);
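
/*
 * The notifier chain gives drivers a last chance to release memory before a
 * kill: each callback receives a pointer to the running "freed" page count
 * (see out_of_memory() below) and adds whatever it managed to drop.  A
 * minimal, hypothetical user would look roughly like this:
 *
 *	static int my_oom_notify(struct notifier_block *nb,
 *				 unsigned long unused, void *parm)
 *	{
 *		unsigned long *freed = parm;
 *
 *		*freed += my_driver_shrink_caches();	// pages given back
 *		return NOTIFY_OK;
 *	}
 *
 *	static struct notifier_block my_oom_nb = {
 *		.notifier_call = my_oom_notify,
 *	};
 *
 *	register_oom_notifier(&my_oom_nb);
 *
 * (my_driver_shrink_caches() is a placeholder, not an existing helper.)
 */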

/**
 * out_of_memory - kill the "best" process when we run out of memory
 * @oc: pointer to struct oom_control
 *
 * If we run out of memory, we have the choice between either
 * killing a random task (bad), letting the system crash (worse)
 * OR try to be smart about which process to kill. Note that we
 * don't have to be perfect here, we just have to be good.
 */
bool out_of_memory(struct oom_control *oc)
{
	unsigned long freed = 0;
	enum oom_constraint constraint = CONSTRAINT_NONE;

	if (oom_killer_disabled)
		return false;

	if (!is_memcg_oom(oc)) {
		blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
		if (freed > 0)
			/* Got some memory back in the last second. */
			return true;
	}

	/*
	 * If current has a pending SIGKILL or is exiting, then automatically
	 * select it.  The goal is to allow it to allocate so that it may
	 * quickly exit and free its memory.
	 */
	if (task_will_free_mem(current)) {
		mark_oom_victim(current);
		wake_oom_reaper(current);
		return true;
	}

	/*
	 * The OOM killer does not compensate for IO-less reclaim.
	 * pagefault_out_of_memory lost its gfp context so we have to
	 * make sure to exclude the 0 mask - all other users should have at
	 * least ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has
	 * to invoke the OOM killer even if it is a GFP_NOFS allocation.
	 */
	if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
		return true;

	/*
	 * Check if there were limitations on the allocation (only relevant for
	 * NUMA and memcg) that may require different handling.
	 */
	constraint = constrained_alloc(oc);
	if (constraint != CONSTRAINT_MEMORY_POLICY)
		oc->nodemask = NULL;
	check_panic_on_oom(oc, constraint);

	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
	    current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
		get_task_struct(current);
		oc->chosen = current;
		oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
		return true;
	}

	select_bad_process(oc);
	/* Found nothing?!?! */
	if (!oc->chosen) {
		dump_header(oc, NULL);
		pr_warn("Out of memory and no killable processes...\n");
		/*
		 * If we got here due to an actual allocation at the
		 * system level, we cannot survive this and will enter
		 * an endless loop in the allocator. Bail out now.
		 */
		if (!is_sysrq_oom(oc) && !is_memcg_oom(oc)) {
#ifdef CONFIG_PAGE_OWNER
			print_max_page_owner();
#endif
			panic("System is deadlocked on memory\n");
		}
	}
	if (oc->chosen && oc->chosen != (void *)-1UL)
		oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
				 "Memory cgroup out of memory");
	return !!oc->chosen;
}

/*
 * The pagefault handler calls here because it is out of memory, so kill a
 * memory-hogging task. If oom_lock is held by somebody else, a parallel oom
 * killing is already in progress so do nothing.
 */
void pagefault_out_of_memory(void)
{
	struct oom_control oc = {
		.zonelist = NULL,
		.nodemask = NULL,
		.memcg = NULL,
		.gfp_mask = 0,
		.order = 0,
	};

	if (mem_cgroup_oom_synchronize(true))
		return;

	if (!mutex_trylock(&oom_lock))
		return;
	out_of_memory(&oc);
	mutex_unlock(&oom_lock);
}