Blame - marvell/linux/kernel/exit.c - T108

blob: fdc6c76be7235abacef4ebfba5f82dc921b375e5 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0-only
				2	/*
				3	* linux/kernel/exit.c
				4	*
				5	* Copyright (C) 1991, 1992 Linus Torvalds
				6	*/
				7
				8	#include <linux/mm.h>
				9	#include <linux/slab.h>
				10	#include <linux/sched/autogroup.h>
				11	#include <linux/sched/mm.h>
				12	#include <linux/sched/stat.h>
				13	#include <linux/sched/task.h>
				14	#include <linux/sched/task_stack.h>
				15	#include <linux/sched/cputime.h>
				16	#include <linux/interrupt.h>
				17	#include <linux/module.h>
				18	#include <linux/capability.h>
				19	#include <linux/completion.h>
				20	#include <linux/personality.h>
				21	#include <linux/tty.h>
				22	#include <linux/iocontext.h>
				23	#include <linux/key.h>
				24	#include <linux/cpu.h>
				25	#include <linux/acct.h>
				26	#include <linux/tsacct_kern.h>
				27	#include <linux/file.h>
				28	#include <linux/fdtable.h>
				29	#include <linux/freezer.h>
				30	#include <linux/binfmts.h>
				31	#include <linux/nsproxy.h>
				32	#include <linux/pid_namespace.h>
				33	#include <linux/ptrace.h>
				34	#include <linux/profile.h>
				35	#include <linux/mount.h>
				36	#include <linux/proc_fs.h>
				37	#include <linux/kthread.h>
				38	#include <linux/mempolicy.h>
				39	#include <linux/taskstats_kern.h>
				40	#include <linux/delayacct.h>
				41	#include <linux/cgroup.h>
				42	#include <linux/syscalls.h>
				43	#include <linux/signal.h>
				44	#include <linux/posix-timers.h>
				45	#include <linux/cn_proc.h>
				46	#include <linux/mutex.h>
				47	#include <linux/futex.h>
				48	#include <linux/pipe_fs_i.h>
				49	#include <linux/audit.h> /* for audit_free() */
				50	#include <linux/resource.h>
				51	#include <linux/blkdev.h>
				52	#include <linux/task_io_accounting_ops.h>
				53	#include <linux/tracehook.h>
				54	#include <linux/fs_struct.h>
				55	#include <linux/init_task.h>
				56	#include <linux/perf_event.h>
				57	#include <trace/events/sched.h>
				58	#include <linux/hw_breakpoint.h>
				59	#include <linux/oom.h>
				60	#include <linux/writeback.h>
				61	#include <linux/shm.h>
				62	#include <linux/kcov.h>
				63	#include <linux/random.h>
				64	#include <linux/rcuwait.h>
				65	#include <linux/compat.h>
				66	#include <linux/sysfs.h>
				67
				68	#include <linux/uaccess.h>
				69	#include <asm/unistd.h>
				70	#include <asm/pgtable.h>
				71	#include <asm/mmu_context.h>
				72
				/*
				 * Maximum number of oopses tolerated before make_task_dead() panics;
				 * a value of zero turns the check off.
				 *
				 * The default is meant to be large enough that a machine which only
				 * oopses now and then keeps running, yet small enough that leaked
				 * references cannot overflow 32-bit refcounts or the ldsem writer count.
				 */
				static unsigned int oops_limit = 10000;
				79
				#ifdef CONFIG_SYSCTL
				/* Sysctl table exposing oops_limit as a read/write unsigned integer. */
				static struct ctl_table kern_exit_table[] = {
					{
						.procname	= "oops_limit",
						.mode		= 0644,
						.data		= &oops_limit,
						.maxlen		= sizeof(oops_limit),
						.proc_handler	= proc_douintvec,
					},
					{ }	/* empty last entry */
				};

				/* Registers kern_exit_table under "kernel" as a late initcall. */
				static __init int kernel_exit_sysctls_init(void)
				{
					register_sysctl_init("kernel", kern_exit_table);

					return 0;
				}
				late_initcall(kernel_exit_sysctls_init);
				#endif /* CONFIG_SYSCTL */
				99
				100	static atomic_t oops_count = ATOMIC_INIT(0);
				101
				102	#ifdef CONFIG_SYSFS
				103	static ssize_t oops_count_show(struct kobject kobj, struct kobj_attribute attr,
				104	char *page)
				105	{
				106	return sysfs_emit(page, "%d\n", atomic_read(&oops_count));
				107	}
				108
				109	static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count);
				110
				111	static __init int kernel_exit_sysfs_init(void)
				112	{
				113	sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL);
				114	return 0;
				115	}
				116	late_initcall(kernel_exit_sysfs_init);
				117	#endif
				118
				119	static void __unhash_process(struct task_struct *p, bool group_dead)
				120	{
				121	nr_threads--;
				122	detach_pid(p, PIDTYPE_PID);
				123	if (group_dead) {
				124	detach_pid(p, PIDTYPE_TGID);
				125	detach_pid(p, PIDTYPE_PGID);
				126	detach_pid(p, PIDTYPE_SID);
				127
				128	list_del_rcu(&p->tasks);
				129	list_del_init(&p->sibling);
				130	__this_cpu_dec(process_counts);
				131	}
				132	list_del_rcu(&p->thread_group);
				133	list_del_rcu(&p->thread_node);
				134	}
				135
				136	/*
				137	* This function expects the tasklist_lock write-locked.
				138	*/
				139	static void __exit_signal(struct task_struct *tsk)
				140	{
				141	struct signal_struct *sig = tsk->signal;
				142	bool group_dead = thread_group_leader(tsk);
				143	struct sighand_struct *sighand;
				144	struct tty_struct *tty;
				145	u64 utime, stime;
				146
				147	sighand = rcu_dereference_check(tsk->sighand,
				148	lockdep_tasklist_lock_is_held());
				149	spin_lock(&sighand->siglock);
				150
				151	#ifdef CONFIG_POSIX_TIMERS
				152	posix_cpu_timers_exit(tsk);
				153	if (group_dead) {
				154	posix_cpu_timers_exit_group(tsk);
				155	} else {
				156	/*
				157	* This can only happen if the caller is de_thread().
				158	* FIXME: this is the temporary hack, we should teach
				159	* posix-cpu-timers to handle this case correctly.
				160	*/
				161	if (unlikely(has_group_leader_pid(tsk)))
				162	posix_cpu_timers_exit_group(tsk);
				163	}
				164	#endif
				165
				166	if (group_dead) {
				167	tty = sig->tty;
				168	sig->tty = NULL;
				169	} else {
				170	/*
				171	* If there is any task waiting for the group exit
				172	* then notify it:
				173	*/
				174	if (sig->notify_count > 0 && !--sig->notify_count)
				175	wake_up_process(sig->group_exit_task);
				176
				177	if (tsk == sig->curr_target)
				178	sig->curr_target = next_thread(tsk);
				179	}
				180
				181	add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
				182	sizeof(unsigned long long));
				183
				184	/*
				185	* Accumulate here the counters for all threads as they die. We could
				186	* skip the group leader because it is the last user of signal_struct,
				187	* but we want to avoid the race with thread_group_cputime() which can
				188	* see the empty ->thread_head list.
				189	*/
				190	task_cputime(tsk, &utime, &stime);
				191	write_seqlock(&sig->stats_lock);
				192	sig->utime += utime;
				193	sig->stime += stime;
				194	sig->gtime += task_gtime(tsk);
				195	sig->min_flt += tsk->min_flt;
				196	sig->maj_flt += tsk->maj_flt;
				197	sig->nvcsw += tsk->nvcsw;
				198	sig->nivcsw += tsk->nivcsw;
				199	sig->inblock += task_io_get_inblock(tsk);
				200	sig->oublock += task_io_get_oublock(tsk);
				201	task_io_accounting_add(&sig->ioac, &tsk->ioac);
				202	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
				203	sig->nr_threads--;
				204	__unhash_process(tsk, group_dead);
				205	write_sequnlock(&sig->stats_lock);
				206
				207	/*
				208	* Do this under ->siglock, we can race with another thread
				209	* doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
				210	*/
				211	flush_sigqueue(&tsk->pending);
				212	tsk->sighand = NULL;
				213	spin_unlock(&sighand->siglock);
				214
				215	__cleanup_sighand(sighand);
				216	clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
				217	if (group_dead) {
				218	flush_sigqueue(&sig->shared_pending);
				219	tty_kref_put(tty);
				220	}
				221	}
				222
				223	static void delayed_put_task_struct(struct rcu_head *rhp)
				224	{
				225	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
				226
				227	perf_event_delayed_put(tsk);
				228	trace_sched_process_free(tsk);
				229	put_task_struct(tsk);
				230	}
				231
				232	void put_task_struct_rcu_user(struct task_struct *task)
				233	{
				234	if (refcount_dec_and_test(&task->rcu_users))
				235	call_rcu(&task->rcu, delayed_put_task_struct);
				236	}
				237
				238	void release_task(struct task_struct *p)
				239	{
				240	struct task_struct *leader;
				241	int zap_leader;
				242	repeat:
				243	/* don't need to get the RCU readlock here - the process is dead and
				244	* can't be modifying its own credentials. But shut RCU-lockdep up */
				245	rcu_read_lock();
				246	atomic_dec(&__task_cred(p)->user->processes);
				247	rcu_read_unlock();
				248
				249	proc_flush_task(p);
				250	cgroup_release(p);
				251
				252	write_lock_irq(&tasklist_lock);
				253	ptrace_release_task(p);
				254	__exit_signal(p);
				255
				256	/*
				257	* If we are the last non-leader member of the thread
				258	* group, and the leader is zombie, then notify the
				259	* group leader's parent process. (if it wants notification.)
				260	*/
				261	zap_leader = 0;
				262	leader = p->group_leader;
				263	if (leader != p && thread_group_empty(leader)
				264	&& leader->exit_state == EXIT_ZOMBIE) {
				265	/*
				266	* If we were the last child thread and the leader has
				267	* exited already, and the leader's parent ignores SIGCHLD,
				268	* then we are the one who should release the leader.
				269	*/
				270	zap_leader = do_notify_parent(leader, leader->exit_signal);
				271	if (zap_leader)
				272	leader->exit_state = EXIT_DEAD;
				273	}
				274
				275	write_unlock_irq(&tasklist_lock);
				276	seccomp_filter_release(p);
				277	release_thread(p);
				278	put_task_struct_rcu_user(p);
				279
				280	p = leader;
				281	if (unlikely(zap_leader))
				282	goto repeat;
				283	}
				284
				285	void rcuwait_wake_up(struct rcuwait *w)
				286	{
				287	struct task_struct *task;
				288
				289	rcu_read_lock();
				290
				291	/*
				292	* Order condition vs @task, such that everything prior to the load
				293	* of @task is visible. This is the condition as to why the user called
				294	* rcuwait_trywake() in the first place. Pairs with set_current_state()
				295	* barrier (A) in rcuwait_wait_event().
				296	*
				297	* WAIT WAKE
				298	* [S] tsk = current [S] cond = true
				299	* MB (A) MB (B)
				300	* [L] cond [L] tsk
				301	*/
				302	smp_mb(); /* (B) */
				303
				304	task = rcu_dereference(w->task);
				305	if (task)
				306	wake_up_process(task);
				307	rcu_read_unlock();
				308	}
				309
				310	/*
				311	* Determine if a process group is "orphaned", according to the POSIX
				312	* definition in 2.2.2.52. Orphaned process groups are not to be affected
				313	* by terminal-generated stop signals. Newly orphaned process groups are
				314	* to receive a SIGHUP and a SIGCONT.
				315	*
				316	* "I ask you, have you ever known what it is to be an orphan?"
				317	*/
				318	static int will_become_orphaned_pgrp(struct pid *pgrp,
				319	struct task_struct *ignored_task)
				320	{
				321	struct task_struct *p;
				322
				323	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
				324	if ((p == ignored_task) \|\|
				325	(p->exit_state && thread_group_empty(p)) \|\|
				326	is_global_init(p->real_parent))
				327	continue;
				328
				329	if (task_pgrp(p->real_parent) != pgrp &&
				330	task_session(p->real_parent) == task_session(p))
				331	return 0;
				332	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
				333
				334	return 1;
				335	}
				336
				337	int is_current_pgrp_orphaned(void)
				338	{
				339	int retval;
				340
				341	read_lock(&tasklist_lock);
				342	retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
				343	read_unlock(&tasklist_lock);
				344
				345	return retval;
				346	}
				347
				348	static bool has_stopped_jobs(struct pid *pgrp)
				349	{
				350	struct task_struct *p;
				351
				352	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
				353	if (p->signal->flags & SIGNAL_STOP_STOPPED)
				354	return true;
				355	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
				356
				357	return false;
				358	}
				359
				360	/*
				361	* Check to see if any process groups have become orphaned as
				362	* a result of our exiting, and if they have any stopped jobs,
				363	* send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
				364	*/
				365	static void
				366	kill_orphaned_pgrp(struct task_struct tsk, struct task_struct parent)
				367	{
				368	struct pid *pgrp = task_pgrp(tsk);
				369	struct task_struct *ignored_task = tsk;
				370
				371	if (!parent)
				372	/* exit: our father is in a different pgrp than
				373	* we are and we were the only connection outside.
				374	*/
				375	parent = tsk->real_parent;
				376	else
				377	/* reparent: our child is in a different pgrp than
				378	* we are, and it was the only connection outside.
				379	*/
				380	ignored_task = NULL;
				381
				382	if (task_pgrp(parent) != pgrp &&
				383	task_session(parent) == task_session(tsk) &&
				384	will_become_orphaned_pgrp(pgrp, ignored_task) &&
				385	has_stopped_jobs(pgrp)) {
				386	__kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
				387	__kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
				388	}
				389	}
				390
				#ifdef CONFIG_MEMCG
				/*
				 * A task is exiting.  If it owned this mm, find a new owner for the mm.
				 *
				 * Candidates are searched in this order: the current task's children, its
				 * siblings, then every thread of every non-kernel process.  If nobody
				 * else uses @mm, or no candidate is found, mm->owner is set to NULL.
				 */
				void mm_update_next_owner(struct mm_struct *mm)
				{
					struct task_struct *c, *g, *p = current;

				retry:
					/*
					 * If the exiting or execing task is not the owner, it's
					 * someone else's problem.
					 */
					if (mm->owner != p)
						return;
					/*
					 * The current owner is exiting/execing and there are no other
					 * candidates.  Do not leave the mm pointing to a possibly
					 * freed task structure.
					 */
					if (atomic_read(&mm->mm_users) <= 1) {
						WRITE_ONCE(mm->owner, NULL);
						return;
					}

					read_lock(&tasklist_lock);
					/*
					 * Search in the children
					 */
					list_for_each_entry(c, &p->children, sibling) {
						if (c->mm == mm)
							goto assign_new_owner;
					}

					/*
					 * Search in the siblings
					 */
					list_for_each_entry(c, &p->real_parent->children, sibling) {
						if (c->mm == mm)
							goto assign_new_owner;
					}

					/*
					 * Search through everything else, we should not get here often.
					 */
					for_each_process(g) {
						/* Stop scanning once we are the only remaining user. */
						if (atomic_read(&mm->mm_users) <= 1)
							break;
						if (g->flags & PF_KTHREAD)
							continue;
						for_each_thread(g, c) {
							if (c->mm == mm)
								goto assign_new_owner;
							/* A thread with a different mm ends this group's scan. */
							if (c->mm)
								break;
						}
					}
					read_unlock(&tasklist_lock);
					/*
					 * We found no owner yet mm_users > 1: this implies that we are
					 * most likely racing with swapoff (try_to_unuse()) or /proc or
					 * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
					 */
					WRITE_ONCE(mm->owner, NULL);
					return;

				assign_new_owner:
					BUG_ON(c == p);
					get_task_struct(c);
					/*
					 * The task_lock protects c->mm from changing.
					 * We always want mm->owner->mm == mm
					 */
					task_lock(c);
					/*
					 * Delay read_unlock() till we have the task_lock()
					 * to ensure that c does not slip away underneath us
					 */
					read_unlock(&tasklist_lock);
					if (c->mm != mm) {
						/* The candidate changed its mm meanwhile; start over. */
						task_unlock(c);
						put_task_struct(c);
						goto retry;
					}
					WRITE_ONCE(mm->owner, c);
					task_unlock(c);
					put_task_struct(c);
				}
				#endif /* CONFIG_MEMCG */
				480
				481	/*
				482	* Turn us into a lazy TLB process if we
				483	* aren't already..
				484	*/
				485	static void exit_mm(void)
				486	{
				487	struct mm_struct *mm = current->mm;
				488	struct core_state *core_state;
				489
				490	exit_mm_release(current, mm);
				491	if (!mm)
				492	return;
				493	sync_mm_rss(mm);
				494	/*
				495	* Serialize with any possible pending coredump.
				496	* We must hold mmap_sem around checking core_state
				497	* and clearing tsk->mm. The core-inducing thread
				498	* will increment ->nr_threads for each thread in the
				499	* group with ->mm != NULL.
				500	*/
				501	down_read(&mm->mmap_sem);
				502	core_state = mm->core_state;
				503	if (core_state) {
				504	struct core_thread self;
				505
				506	up_read(&mm->mmap_sem);
				507
				508	self.task = current;
				509	if (self.task->flags & PF_SIGNALED)
				510	self.next = xchg(&core_state->dumper.next, &self);
				511	else
				512	self.task = NULL;
				513	/*
				514	* Implies mb(), the result of xchg() must be visible
				515	* to core_state->dumper.
				516	*/
				517	if (atomic_dec_and_test(&core_state->nr_threads))
				518	complete(&core_state->startup);
				519
				520	for (;;) {
				521	set_current_state(TASK_UNINTERRUPTIBLE);
				522	if (!self.task) /* see coredump_finish() */
				523	break;
				524	freezable_schedule();
				525	}
				526	__set_current_state(TASK_RUNNING);
				527	down_read(&mm->mmap_sem);
				528	}
				529	mmgrab(mm);
				530	BUG_ON(mm != current->active_mm);
				531	/* more a memory barrier than a real lock */
				532	task_lock(current);
				533	current->mm = NULL;
				534	up_read(&mm->mmap_sem);
				535	enter_lazy_tlb(mm, current);
				536	task_unlock(current);
				537	mm_update_next_owner(mm);
				538	mmput(mm);
				539	if (test_thread_flag(TIF_MEMDIE))
				540	exit_oom_victim();
				541	}
				542
				543	static struct task_struct find_alive_thread(struct task_struct p)
				544	{
				545	struct task_struct *t;
				546
				547	for_each_thread(p, t) {
				548	if (!(t->flags & PF_EXITING))
				549	return t;
				550	}
				551	return NULL;
				552	}
				553
				554	static struct task_struct find_child_reaper(struct task_struct father,
				555	struct list_head *dead)
				556	__releases(&tasklist_lock)
				557	__acquires(&tasklist_lock)
				558	{
				559	struct pid_namespace *pid_ns = task_active_pid_ns(father);
				560	struct task_struct *reaper = pid_ns->child_reaper;
				561	struct task_struct p, n;
				562
				563	if (likely(reaper != father))
				564	return reaper;
				565
				566	reaper = find_alive_thread(father);
				567	if (reaper) {
				568	pid_ns->child_reaper = reaper;
				569	return reaper;
				570	}
				571
				572	write_unlock_irq(&tasklist_lock);
				573
				574	list_for_each_entry_safe(p, n, dead, ptrace_entry) {
				575	list_del_init(&p->ptrace_entry);
				576	release_task(p);
				577	}
				578
				579	zap_pid_ns_processes(pid_ns);
				580	write_lock_irq(&tasklist_lock);
				581
				582	return father;
				583	}
				584
				585	/*
				586	* When we die, we re-parent all our children, and try to:
				587	* 1. give them to another thread in our thread group, if such a member exists
				588	* 2. give it to the first ancestor process which prctl'd itself as a
				589	* child_subreaper for its children (like a service manager)
				590	* 3. give it to the init process (PID 1) in our pid namespace
				591	*/
				592	static struct task_struct find_new_reaper(struct task_struct father,
				593	struct task_struct *child_reaper)
				594	{
				595	struct task_struct thread, reaper;
				596
				597	thread = find_alive_thread(father);
				598	if (thread)
				599	return thread;
				600
				601	if (father->signal->has_child_subreaper) {
				602	unsigned int ns_level = task_pid(father)->level;
				603	/*
				604	* Find the first ->is_child_subreaper ancestor in our pid_ns.
				605	* We can't check reaper != child_reaper to ensure we do not
				606	* cross the namespaces, the exiting parent could be injected
				607	* by setns() + fork().
				608	* We check pid->level, this is slightly more efficient than
				609	* task_active_pid_ns(reaper) != task_active_pid_ns(father).
				610	*/
				611	for (reaper = father->real_parent;
				612	task_pid(reaper)->level == ns_level;
				613	reaper = reaper->real_parent) {
				614	if (reaper == &init_task)
				615	break;
				616	if (!reaper->signal->is_child_subreaper)
				617	continue;
				618	thread = find_alive_thread(reaper);
				619	if (thread)
				620	return thread;
				621	}
				622	}
				623
				624	return child_reaper;
				625	}
				626
				627	/*
				628	* Any that need to be release_task'd are put on the @dead list.
				629	*/
				630	static void reparent_leader(struct task_struct father, struct task_struct p,
				631	struct list_head *dead)
				632	{
				633	if (unlikely(p->exit_state == EXIT_DEAD))
				634	return;
				635
				636	/* We don't want people slaying init. */
				637	p->exit_signal = SIGCHLD;
				638
				639	/* If it has exited notify the new parent about this child's death. */
				640	if (!p->ptrace &&
				641	p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
				642	if (do_notify_parent(p, p->exit_signal)) {
				643	p->exit_state = EXIT_DEAD;
				644	list_add(&p->ptrace_entry, dead);
				645	}
				646	}
				647
				648	kill_orphaned_pgrp(p, father);
				649	}
				650
				651	/*
				652	* This does two things:
				653	*
				654	* A. Make init inherit all the child processes
				655	* B. Check to see if any process groups have become orphaned
				656	* as a result of our exiting, and if they have any stopped
				657	* jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
				658	*/
				659	static void forget_original_parent(struct task_struct *father,
				660	struct list_head *dead)
				661	{
				662	struct task_struct p, t, *reaper;
				663
				664	if (unlikely(!list_empty(&father->ptraced)))
				665	exit_ptrace(father, dead);
				666
				667	/* Can drop and reacquire tasklist_lock */
				668	reaper = find_child_reaper(father, dead);
				669	if (list_empty(&father->children))
				670	return;
				671
				672	reaper = find_new_reaper(father, reaper);
				673	list_for_each_entry(p, &father->children, sibling) {
				674	for_each_thread(p, t) {
				675	t->real_parent = reaper;
				676	BUG_ON((!t->ptrace) != (t->parent == father));
				677	if (likely(!t->ptrace))
				678	t->parent = t->real_parent;
				679	if (t->pdeath_signal)
				680	group_send_sig_info(t->pdeath_signal,
				681	SEND_SIG_NOINFO, t,
				682	PIDTYPE_TGID);
				683	}
				684	/*
				685	* If this is a threaded reparent there is no need to
				686	* notify anyone anything has happened.
				687	*/
				688	if (!same_thread_group(reaper, father))
				689	reparent_leader(father, p, dead);
				690	}
				691	list_splice_tail_init(&father->children, &reaper->children);
				692	}
				693
				694	/*
				695	* Send signals to all our closest relatives so that they know
				696	* to properly mourn us..
				697	*/
				698	static void exit_notify(struct task_struct *tsk, int group_dead)
				699	{
				700	bool autoreap;
				701	struct task_struct p, n;
				702	LIST_HEAD(dead);
				703
				704	write_lock_irq(&tasklist_lock);
				705	forget_original_parent(tsk, &dead);
				706
				707	if (group_dead)
				708	kill_orphaned_pgrp(tsk->group_leader, NULL);
				709
				710	tsk->exit_state = EXIT_ZOMBIE;
				711	if (unlikely(tsk->ptrace)) {
				712	int sig = thread_group_leader(tsk) &&
				713	thread_group_empty(tsk) &&
				714	!ptrace_reparented(tsk) ?
				715	tsk->exit_signal : SIGCHLD;
				716	autoreap = do_notify_parent(tsk, sig);
				717	} else if (thread_group_leader(tsk)) {
				718	autoreap = thread_group_empty(tsk) &&
				719	do_notify_parent(tsk, tsk->exit_signal);
				720	} else {
				721	autoreap = true;
				722	}
				723
				724	if (autoreap) {
				725	tsk->exit_state = EXIT_DEAD;
				726	list_add(&tsk->ptrace_entry, &dead);
				727	}
				728
				729	/* mt-exec, de_thread() is waiting for group leader */
				730	if (unlikely(tsk->signal->notify_count < 0))
				731	wake_up_process(tsk->signal->group_exit_task);
				732	write_unlock_irq(&tasklist_lock);
				733
				734	list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
				735	list_del_init(&p->ptrace_entry);
				736	release_task(p);
				737	}
				738	}
				739
				#ifdef CONFIG_DEBUG_STACK_USAGE
				/*
				 * Log a message whenever the exiting task leaves less unused stack than
				 * any task seen before, and remember that new minimum.
				 */
				static void check_stack_usage(void)
				{
					static DEFINE_SPINLOCK(record_lock);
					static int fewest_free = THREAD_SIZE;
					unsigned long unused;

					unused = stack_not_used(current);

					/* Unlocked fast path: nothing to do unless this is a new minimum. */
					if (unused >= fewest_free)
						return;

					spin_lock(&record_lock);
					/* Re-check under the lock before updating the record. */
					if (unused < fewest_free) {
						pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
							current->comm, task_pid_nr(current), unused);
						fewest_free = unused;
					}
					spin_unlock(&record_lock);
				}
				#else
				static inline void check_stack_usage(void) {}
				#endif
				763
				764	void __noreturn do_exit(long code)
				765	{
				766	struct task_struct *tsk = current;
				767	int group_dead;
				768
				769	/*
				770	* We can get here from a kernel oops, sometimes with preemption off.
				771	* Start by checking for critical errors.
				772	* Then fix up important state like USER_DS and preemption.
				773	* Then do everything else.
				774	*/
				775
				776	WARN_ON(blk_needs_flush_plug(tsk));
				777
				778	if (unlikely(in_interrupt()))
				779	panic("Aiee, killing interrupt handler!");
				780	if (unlikely(!tsk->pid))
				781	panic("Attempted to kill the idle task!");
				782
				783	/*
				784	* If do_exit is called because this processes oopsed, it's possible
				785	* that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
				786	* continuing. Amongst other possible reasons, this is to prevent
				787	* mm_release()->clear_child_tid() from writing to a user-controlled
				788	* kernel address.
				789	*/
				790	set_fs(USER_DS);
				791
				792	if (unlikely(in_atomic())) {
				793	pr_info("note: %s[%d] exited with preempt_count %d\n",
				794	current->comm, task_pid_nr(current),
				795	preempt_count());
				796	preempt_count_set(PREEMPT_ENABLED);
				797	}
				798
				799	profile_task_exit(tsk);
				800	kcov_task_exit(tsk);
				801
				802	ptrace_event(PTRACE_EVENT_EXIT, code);
				803
				804	validate_creds_for_do_exit(tsk);
				805
				806	/*
				807	* We're taking recursive faults here in do_exit. Safest is to just
				808	* leave this task alone and wait for reboot.
				809	*/
				810	if (unlikely(tsk->flags & PF_EXITING)) {
				811	pr_alert("Fixing recursive fault but reboot is needed!\n");
				812	futex_exit_recursive(tsk);
				813	set_current_state(TASK_UNINTERRUPTIBLE);
				814	schedule();
				815	}
				816
				817	exit_signals(tsk); /* sets PF_EXITING */
				818
				819	/* sync mm's RSS info before statistics gathering */
				820	if (tsk->mm)
				821	sync_mm_rss(tsk->mm);
				822	acct_update_integrals(tsk);
				823	group_dead = atomic_dec_and_test(&tsk->signal->live);
				824	if (group_dead) {
				825	/*
				826	* If the last thread of global init has exited, panic
				827	* immediately to get a useable coredump.
				828	*/
				829	if (unlikely(is_global_init(tsk))) {
				830	pr_err("Attempted to kill init! exitcode=0x%08x\n",
				831	tsk->signal->group_exit_code ?: (int)code);
				832	BUG();
				833	panic("Attempted to kill init! exitcode=0x%08x\n",
				834	tsk->signal->group_exit_code ?: (int)code);
				835	}
				836
				837	#ifdef CONFIG_POSIX_TIMERS
				838	hrtimer_cancel(&tsk->signal->real_timer);
				839	exit_itimers(tsk->signal);
				840	#endif
				841	if (tsk->mm)
				842	setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
				843	}
				844	acct_collect(code, group_dead);
				845	if (group_dead)
				846	tty_audit_exit();
				847	audit_free(tsk);
				848
				849	tsk->exit_code = code;
				850	taskstats_exit(tsk, group_dead);
				851
				852	exit_mm();
				853
				854	if (group_dead)
				855	acct_process();
				856	trace_sched_process_exit(tsk);
				857
				858	exit_sem(tsk);
				859	exit_shm(tsk);
				860	exit_files(tsk);
				861	exit_fs(tsk);
				862	if (group_dead)
				863	disassociate_ctty(1);
				864	exit_task_namespaces(tsk);
				865	exit_task_work(tsk);
				866	exit_thread(tsk);
				867	exit_umh(tsk);
				868
				869	/*
				870	* Flush inherited counters to the parent - before the parent
				871	* gets woken up by child-exit notifications.
				872	*
				873	* because of cgroup mode, must be called before cgroup_exit()
				874	*/
				875	perf_event_exit_task(tsk);
				876
				877	sched_autogroup_exit_task(tsk);
				878	cgroup_exit(tsk);
				879
				880	/*
				881	* FIXME: do that only when needed, using sched_exit tracepoint
				882	*/
				883	flush_ptrace_hw_breakpoint(tsk);
				884
				885	exit_tasks_rcu_start();
				886	exit_notify(tsk, group_dead);
				887	proc_exit_connector(tsk);
				888	mpol_put_task_policy(tsk);
				889	#ifdef CONFIG_FUTEX
				890	if (unlikely(current->pi_state_cache))
				891	kfree(current->pi_state_cache);
				892	#endif
				893	/*
				894	* Make sure we are holding no locks:
				895	*/
				896	debug_check_no_locks_held();
				897
				898	if (tsk->io_context)
				899	exit_io_context(tsk);
				900
				901	if (tsk->splice_pipe)
				902	free_pipe_info(tsk->splice_pipe);
				903
				904	if (tsk->task_frag.page)
				905	put_page(tsk->task_frag.page);
				906
				907	validate_creds_for_do_exit(tsk);
				908
				909	check_stack_usage();
				910	preempt_disable();
				911	if (tsk->nr_dirtied)
				912	__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
				913	exit_rcu();
				914	exit_tasks_rcu_finish();
				915
				916	lockdep_free_task(tsk);
				917	do_task_dead();
				918	}
				919	EXPORT_SYMBOL_GPL(do_exit);
				920
				921	void __noreturn make_task_dead(int signr)
				922	{
				923	/*
				924	* Take the task off the cpu after something catastrophic has
				925	* happened.
				926	*/
				927	unsigned int limit;
				928
				929	/*
				930	* Every time the system oopses, if the oops happens while a reference
				931	* to an object was held, the reference leaks.
				932	* If the oops doesn't also leak memory, repeated oopsing can cause
				933	* reference counters to wrap around (if they're not using refcount_t).
				934	* This means that repeated oopsing can make unexploitable-looking bugs
				935	* exploitable through repeated oopsing.
				936	* To make sure this can't happen, place an upper bound on how often the
				937	* kernel may oops without panic().
				938	*/
				939	limit = READ_ONCE(oops_limit);
				940	if (atomic_inc_return(&oops_count) >= limit && limit)
				941	panic("Oopsed too often (kernel.oops_limit is %d)", limit);
				942
				943	do_exit(signr);
				944	}
				945
				946	void complete_and_exit(struct completion *comp, long code)
				947	{
				948	if (comp)
				949	complete(comp);
				950
				951	do_exit(code);
				952	}
				953	EXPORT_SYMBOL(complete_and_exit);
				954
				955	SYSCALL_DEFINE1(exit, int, error_code)
				956	{
				957	do_exit((error_code&0xff)<<8);
				958	}
				959
				960	/*
				961	* Take down every thread in the group. This is called by fatal signals
				962	* as well as by sys_exit_group (below).
				963	*/
				964	void
				965	do_group_exit(int exit_code)
				966	{
				967	struct signal_struct *sig = current->signal;
				968
				969	BUG_ON(exit_code & 0x80); /* core dumps don't get here */
				970
				971	if (signal_group_exit(sig))
				972	exit_code = sig->group_exit_code;
				973	else if (!thread_group_empty(current)) {
				974	struct sighand_struct *const sighand = current->sighand;
				975
				976	spin_lock_irq(&sighand->siglock);
				977	if (signal_group_exit(sig))
				978	/* Another thread got here before we took the lock. */
				979	exit_code = sig->group_exit_code;
				980	else {
				981	sig->group_exit_code = exit_code;
				982	sig->flags = SIGNAL_GROUP_EXIT;
				983	zap_other_threads(current);
				984	}
				985	spin_unlock_irq(&sighand->siglock);
				986	}
				987
				988	do_exit(exit_code);
				989	/* NOTREACHED */
				990	}
				991
				992	/*
				993	* this kills every thread in the thread group. Note that any externally
				994	* wait4()-ing process will get the correct exit code - even if this
				995	* thread is not the thread group leader.
				996	*/
				997	SYSCALL_DEFINE1(exit_group, int, error_code)
				998	{
				999	do_group_exit((error_code & 0xff) << 8);
				1000	/* NOTREACHED */
				1001	return 0;
				1002	}
				1003
				1004	struct waitid_info {
				1005	pid_t pid;
				1006	uid_t uid;
				1007	int status;
				1008	int cause;
				1009	};
				1010
				1011	struct wait_opts {
				1012	enum pid_type wo_type;
				1013	int wo_flags;
				1014	struct pid *wo_pid;
				1015
				1016	struct waitid_info *wo_info;
				1017	int wo_stat;
				1018	struct rusage *wo_rusage;
				1019
				1020	wait_queue_entry_t child_wait;
				1021	int notask_error;
				1022	};
				1023
				1024	static int eligible_pid(struct wait_opts wo, struct task_struct p)
				1025	{
				1026	return wo->wo_type == PIDTYPE_MAX \|\|
				1027	task_pid_type(p, wo->wo_type) == wo->wo_pid;
				1028	}
				1029
				1030	static int
				1031	eligible_child(struct wait_opts wo, bool ptrace, struct task_struct p)
				1032	{
				1033	if (!eligible_pid(wo, p))
				1034	return 0;
				1035
				1036	/*
				1037	* Wait for all children (clone and not) if __WALL is set or
				1038	* if it is traced by us.
				1039	*/
				1040	if (ptrace \|\| (wo->wo_flags & __WALL))
				1041	return 1;
				1042
				1043	/*
				1044	* Otherwise, wait for clone children only if __WCLONE is set;
				1045	* otherwise, wait for non-clone children only.
				1046	*
				1047	* Note: a "clone" child here is one that reports to its parent
				1048	* using a signal other than SIGCHLD, or a non-leader thread which
				1049	* we can only see if it is traced by us.
				1050	*/
				1051	if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
				1052	return 0;
				1053
				1054	return 1;
				1055	}
				1056
				1057	/*
				1058	* Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
				1059	* read_lock(&tasklist_lock) on entry. If we return zero, we still hold
				1060	* the lock and this task is uninteresting. If we return nonzero, we have
				1061	* released the lock and the system call should return.
				1062	*/
				1063	static int wait_task_zombie(struct wait_opts wo, struct task_struct p)
				1064	{
				1065	int state, status;
				1066	pid_t pid = task_pid_vnr(p);
				1067	uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
				1068	struct waitid_info *infop;
				1069
				1070	if (!likely(wo->wo_flags & WEXITED))
				1071	return 0;
				1072
				1073	if (unlikely(wo->wo_flags & WNOWAIT)) {
				1074	status = p->exit_code;
				1075	get_task_struct(p);
				1076	read_unlock(&tasklist_lock);
				1077	sched_annotate_sleep();
				1078	if (wo->wo_rusage)
				1079	getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
				1080	put_task_struct(p);
				1081	goto out_info;
				1082	}
				1083	/*
				1084	* Move the task's state to DEAD/TRACE, only one thread can do this.
				1085	*/
				1086	state = (ptrace_reparented(p) && thread_group_leader(p)) ?
				1087	EXIT_TRACE : EXIT_DEAD;
				1088	if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
				1089	return 0;
				1090	/*
				1091	* We own this thread, nobody else can reap it.
				1092	*/
				1093	read_unlock(&tasklist_lock);
				1094	sched_annotate_sleep();
				1095
				1096	/*
				1097	* Check thread_group_leader() to exclude the traced sub-threads.
				1098	*/
				1099	if (state == EXIT_DEAD && thread_group_leader(p)) {
				1100	struct signal_struct *sig = p->signal;
				1101	struct signal_struct *psig = current->signal;
				1102	unsigned long maxrss;
				1103	u64 tgutime, tgstime;
				1104
				1105	/*
				1106	* The resource counters for the group leader are in its
				1107	* own task_struct. Those for dead threads in the group
				1108	* are in its signal_struct, as are those for the child
				1109	* processes it has previously reaped. All these
				1110	* accumulate in the parent's signal_struct c* fields.
				1111	*
				1112	* We don't bother to take a lock here to protect these
				1113	* p->signal fields because the whole thread group is dead
				1114	* and nobody can change them.
				1115	*
				1116	* psig->stats_lock also protects us from our sub-theads
				1117	* which can reap other children at the same time. Until
				1118	* we change k_getrusage()-like users to rely on this lock
				1119	* we have to take ->siglock as well.
				1120	*
				1121	* We use thread_group_cputime_adjusted() to get times for
				1122	* the thread group, which consolidates times for all threads
				1123	* in the group including the group leader.
				1124	*/
				1125	thread_group_cputime_adjusted(p, &tgutime, &tgstime);
				1126	spin_lock_irq(&current->sighand->siglock);
				1127	write_seqlock(&psig->stats_lock);
				1128	psig->cutime += tgutime + sig->cutime;
				1129	psig->cstime += tgstime + sig->cstime;
				1130	psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
				1131	psig->cmin_flt +=
				1132	p->min_flt + sig->min_flt + sig->cmin_flt;
				1133	psig->cmaj_flt +=
				1134	p->maj_flt + sig->maj_flt + sig->cmaj_flt;
				1135	psig->cnvcsw +=
				1136	p->nvcsw + sig->nvcsw + sig->cnvcsw;
				1137	psig->cnivcsw +=
				1138	p->nivcsw + sig->nivcsw + sig->cnivcsw;
				1139	psig->cinblock +=
				1140	task_io_get_inblock(p) +
				1141	sig->inblock + sig->cinblock;
				1142	psig->coublock +=
				1143	task_io_get_oublock(p) +
				1144	sig->oublock + sig->coublock;
				1145	maxrss = max(sig->maxrss, sig->cmaxrss);
				1146	if (psig->cmaxrss < maxrss)
				1147	psig->cmaxrss = maxrss;
				1148	task_io_accounting_add(&psig->ioac, &p->ioac);
				1149	task_io_accounting_add(&psig->ioac, &sig->ioac);
				1150	write_sequnlock(&psig->stats_lock);
				1151	spin_unlock_irq(&current->sighand->siglock);
				1152	}
				1153
				1154	if (wo->wo_rusage)
				1155	getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
				1156	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
				1157	? p->signal->group_exit_code : p->exit_code;
				1158	wo->wo_stat = status;
				1159
				1160	if (state == EXIT_TRACE) {
				1161	write_lock_irq(&tasklist_lock);
				1162	/* We dropped tasklist, ptracer could die and untrace */
				1163	ptrace_unlink(p);
				1164
				1165	/* If parent wants a zombie, don't release it now */
				1166	state = EXIT_ZOMBIE;
				1167	if (do_notify_parent(p, p->exit_signal))
				1168	state = EXIT_DEAD;
				1169	p->exit_state = state;
				1170	write_unlock_irq(&tasklist_lock);
				1171	}
				1172	if (state == EXIT_DEAD)
				1173	release_task(p);
				1174
				1175	out_info:
				1176	infop = wo->wo_info;
				1177	if (infop) {
				1178	if ((status & 0x7f) == 0) {
				1179	infop->cause = CLD_EXITED;
				1180	infop->status = status >> 8;
				1181	} else {
				1182	infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
				1183	infop->status = status & 0x7f;
				1184	}
				1185	infop->pid = pid;
				1186	infop->uid = uid;
				1187	}
				1188
				1189	return pid;
				1190	}
				1191
				1192	static int task_stopped_code(struct task_struct p, bool ptrace)
				1193	{
				1194	if (ptrace) {
				1195	if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING))
				1196	return &p->exit_code;
				1197	} else {
				1198	if (p->signal->flags & SIGNAL_STOP_STOPPED)
				1199	return &p->signal->group_exit_code;
				1200	}
				1201	return NULL;
				1202	}
				1203
				1204	/**
				1205	* wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
				1206	* @wo: wait options
				1207	* @ptrace: is the wait for ptrace
				1208	* @p: task to wait for
				1209	*
				1210	* Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
				1211	*
				1212	* CONTEXT:
				1213	* read_lock(&tasklist_lock), which is released if return value is
				1214	* non-zero. Also, grabs and releases @p->sighand->siglock.
				1215	*
				1216	* RETURNS:
				1217	* 0 if wait condition didn't exist and search for other wait conditions
				1218	* should continue. Non-zero return, -errno on failure and @p's pid on
				1219	* success, implies that tasklist_lock is released and wait condition
				1220	* search should terminate.
				1221	*/
				1222	static int wait_task_stopped(struct wait_opts *wo,
				1223	int ptrace, struct task_struct *p)
				1224	{
				1225	struct waitid_info *infop;
				1226	int exit_code, *p_code, why;
				1227	uid_t uid = 0; /* unneeded, required by compiler */
				1228	pid_t pid;
				1229
				1230	/*
				1231	* Traditionally we see ptrace'd stopped tasks regardless of options.
				1232	*/
				1233	if (!ptrace && !(wo->wo_flags & WUNTRACED))
				1234	return 0;
				1235
				1236	if (!task_stopped_code(p, ptrace))
				1237	return 0;
				1238
				1239	exit_code = 0;
				1240	spin_lock_irq(&p->sighand->siglock);
				1241
				1242	p_code = task_stopped_code(p, ptrace);
				1243	if (unlikely(!p_code))
				1244	goto unlock_sig;
				1245
				1246	exit_code = *p_code;
				1247	if (!exit_code)
				1248	goto unlock_sig;
				1249
				1250	if (!unlikely(wo->wo_flags & WNOWAIT))
				1251	*p_code = 0;
				1252
				1253	uid = from_kuid_munged(current_user_ns(), task_uid(p));
				1254	unlock_sig:
				1255	spin_unlock_irq(&p->sighand->siglock);
				1256	if (!exit_code)
				1257	return 0;
				1258
				1259	/*
				1260	* Now we are pretty sure this task is interesting.
				1261	* Make sure it doesn't get reaped out from under us while we
				1262	* give up the lock and then examine it below. We don't want to
				1263	* keep holding onto the tasklist_lock while we call getrusage and
				1264	* possibly take page faults for user memory.
				1265	*/
				1266	get_task_struct(p);
				1267	pid = task_pid_vnr(p);
				1268	why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
				1269	read_unlock(&tasklist_lock);
				1270	sched_annotate_sleep();
				1271	if (wo->wo_rusage)
				1272	getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
				1273	put_task_struct(p);
				1274
				1275	if (likely(!(wo->wo_flags & WNOWAIT)))
				1276	wo->wo_stat = (exit_code << 8) \| 0x7f;
				1277
				1278	infop = wo->wo_info;
				1279	if (infop) {
				1280	infop->cause = why;
				1281	infop->status = exit_code;
				1282	infop->pid = pid;
				1283	infop->uid = uid;
				1284	}
				1285	return pid;
				1286	}
				1287
				1288	/*
				1289	* Handle do_wait work for one task in a live, non-stopped state.
				1290	* read_lock(&tasklist_lock) on entry. If we return zero, we still hold
				1291	* the lock and this task is uninteresting. If we return nonzero, we have
				1292	* released the lock and the system call should return.
				1293	*/
				1294	static int wait_task_continued(struct wait_opts wo, struct task_struct p)
				1295	{
				1296	struct waitid_info *infop;
				1297	pid_t pid;
				1298	uid_t uid;
				1299
				1300	if (!unlikely(wo->wo_flags & WCONTINUED))
				1301	return 0;
				1302
				1303	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
				1304	return 0;
				1305
				1306	spin_lock_irq(&p->sighand->siglock);
				1307	/* Re-check with the lock held. */
				1308	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
				1309	spin_unlock_irq(&p->sighand->siglock);
				1310	return 0;
				1311	}
				1312	if (!unlikely(wo->wo_flags & WNOWAIT))
				1313	p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
				1314	uid = from_kuid_munged(current_user_ns(), task_uid(p));
				1315	spin_unlock_irq(&p->sighand->siglock);
				1316
				1317	pid = task_pid_vnr(p);
				1318	get_task_struct(p);
				1319	read_unlock(&tasklist_lock);
				1320	sched_annotate_sleep();
				1321	if (wo->wo_rusage)
				1322	getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
				1323	put_task_struct(p);
				1324
				1325	infop = wo->wo_info;
				1326	if (!infop) {
				1327	wo->wo_stat = 0xffff;
				1328	} else {
				1329	infop->cause = CLD_CONTINUED;
				1330	infop->pid = pid;
				1331	infop->uid = uid;
				1332	infop->status = SIGCONT;
				1333	}
				1334	return pid;
				1335	}
				1336
				1337	/*
				1338	* Consider @p for a wait by @parent.
				1339	*
				1340	* -ECHILD should be in ->notask_error before the first call.
				1341	* Returns nonzero for a final return, when we have unlocked tasklist_lock.
				1342	* Returns zero if the search for a child should continue;
				1343	* then ->notask_error is 0 if @p is an eligible child,
				1344	* or still -ECHILD.
				1345	*/
				1346	static int wait_consider_task(struct wait_opts *wo, int ptrace,
				1347	struct task_struct *p)
				1348	{
				1349	/*
				1350	* We can race with wait_task_zombie() from another thread.
				1351	* Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
				1352	* can't confuse the checks below.
				1353	*/
				1354	int exit_state = READ_ONCE(p->exit_state);
				1355	int ret;
				1356
				1357	if (unlikely(exit_state == EXIT_DEAD))
				1358	return 0;
				1359
				1360	ret = eligible_child(wo, ptrace, p);
				1361	if (!ret)
				1362	return ret;
				1363
				1364	if (unlikely(exit_state == EXIT_TRACE)) {
				1365	/*
				1366	* ptrace == 0 means we are the natural parent. In this case
				1367	* we should clear notask_error, debugger will notify us.
				1368	*/
				1369	if (likely(!ptrace))
				1370	wo->notask_error = 0;
				1371	return 0;
				1372	}
				1373
				1374	if (likely(!ptrace) && unlikely(p->ptrace)) {
				1375	/*
				1376	* If it is traced by its real parent's group, just pretend
				1377	* the caller is ptrace_do_wait() and reap this child if it
				1378	* is zombie.
				1379	*
				1380	* This also hides group stop state from real parent; otherwise
				1381	* a single stop can be reported twice as group and ptrace stop.
				1382	* If a ptracer wants to distinguish these two events for its
				1383	* own children it should create a separate process which takes
				1384	* the role of real parent.
				1385	*/
				1386	if (!ptrace_reparented(p))
				1387	ptrace = 1;
				1388	}
				1389
				1390	/* slay zombie? */
				1391	if (exit_state == EXIT_ZOMBIE) {
				1392	/* we don't reap group leaders with subthreads */
				1393	if (!delay_group_leader(p)) {
				1394	/*
				1395	* A zombie ptracee is only visible to its ptracer.
				1396	* Notification and reaping will be cascaded to the
				1397	* real parent when the ptracer detaches.
				1398	*/
				1399	if (unlikely(ptrace) \|\| likely(!p->ptrace))
				1400	return wait_task_zombie(wo, p);
				1401	}
				1402
				1403	/*
				1404	* Allow access to stopped/continued state via zombie by
				1405	* falling through. Clearing of notask_error is complex.
				1406	*
				1407	* When !@ptrace:
				1408	*
				1409	* If WEXITED is set, notask_error should naturally be
				1410	* cleared. If not, subset of WSTOPPED\|WCONTINUED is set,
				1411	* so, if there are live subthreads, there are events to
				1412	* wait for. If all subthreads are dead, it's still safe
				1413	* to clear - this function will be called again in finite
				1414	* amount time once all the subthreads are released and
				1415	* will then return without clearing.
				1416	*
				1417	* When @ptrace:
				1418	*
				1419	* Stopped state is per-task and thus can't change once the
				1420	* target task dies. Only continued and exited can happen.
				1421	* Clear notask_error if WCONTINUED \| WEXITED.
				1422	*/
				1423	if (likely(!ptrace) \|\| (wo->wo_flags & (WCONTINUED \| WEXITED)))
				1424	wo->notask_error = 0;
				1425	} else {
				1426	/*
				1427	* @p is alive and it's gonna stop, continue or exit, so
				1428	* there always is something to wait for.
				1429	*/
				1430	wo->notask_error = 0;
				1431	}
				1432
				1433	/*
				1434	* Wait for stopped. Depending on @ptrace, different stopped state
				1435	* is used and the two don't interact with each other.
				1436	*/
				1437	ret = wait_task_stopped(wo, ptrace, p);
				1438	if (ret)
				1439	return ret;
				1440
				1441	/*
				1442	* Wait for continued. There's only one continued state and the
				1443	* ptracer can consume it which can confuse the real parent. Don't
				1444	* use WCONTINUED from ptracer. You don't need or want it.
				1445	*/
				1446	return wait_task_continued(wo, p);
				1447	}
				1448
				1449	/*
				1450	* Do the work of do_wait() for one thread in the group, @tsk.
				1451	*
				1452	* -ECHILD should be in ->notask_error before the first call.
				1453	* Returns nonzero for a final return, when we have unlocked tasklist_lock.
				1454	* Returns zero if the search for a child should continue; then
				1455	* ->notask_error is 0 if there were any eligible children,
				1456	* or still -ECHILD.
				1457	*/
				1458	static int do_wait_thread(struct wait_opts wo, struct task_struct tsk)
				1459	{
				1460	struct task_struct *p;
				1461
				1462	list_for_each_entry(p, &tsk->children, sibling) {
				1463	int ret = wait_consider_task(wo, 0, p);
				1464
				1465	if (ret)
				1466	return ret;
				1467	}
				1468
				1469	return 0;
				1470	}
				1471
				1472	static int ptrace_do_wait(struct wait_opts wo, struct task_struct tsk)
				1473	{
				1474	struct task_struct *p;
				1475
				1476	list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
				1477	int ret = wait_consider_task(wo, 1, p);
				1478
				1479	if (ret)
				1480	return ret;
				1481	}
				1482
				1483	return 0;
				1484	}
				1485
				1486	static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
				1487	int sync, void *key)
				1488	{
				1489	struct wait_opts *wo = container_of(wait, struct wait_opts,
				1490	child_wait);
				1491	struct task_struct *p = key;
				1492
				1493	if (!eligible_pid(wo, p))
				1494	return 0;
				1495
				1496	if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
				1497	return 0;
				1498
				1499	return default_wake_function(wait, mode, sync, key);
				1500	}
				1501
				1502	void __wake_up_parent(struct task_struct p, struct task_struct parent)
				1503	{
				1504	__wake_up_sync_key(&parent->signal->wait_chldexit,
				1505	TASK_INTERRUPTIBLE, 1, p);
				1506	}
				1507
				1508	static long do_wait(struct wait_opts *wo)
				1509	{
				1510	struct task_struct *tsk;
				1511	int retval;
				1512
				1513	trace_sched_process_wait(wo->wo_pid);
				1514
				1515	init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
				1516	wo->child_wait.private = current;
				1517	add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
				1518	repeat:
				1519	/*
				1520	* If there is nothing that can match our criteria, just get out.
				1521	* We will clear ->notask_error to zero if we see any child that
				1522	* might later match our criteria, even if we are not able to reap
				1523	* it yet.
				1524	*/
				1525	wo->notask_error = -ECHILD;
				1526	if ((wo->wo_type < PIDTYPE_MAX) &&
				1527	(!wo->wo_pid \|\| hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
				1528	goto notask;
				1529
				1530	set_current_state(TASK_INTERRUPTIBLE);
				1531	read_lock(&tasklist_lock);
				1532	tsk = current;
				1533	do {
				1534	retval = do_wait_thread(wo, tsk);
				1535	if (retval)
				1536	goto end;
				1537
				1538	retval = ptrace_do_wait(wo, tsk);
				1539	if (retval)
				1540	goto end;
				1541
				1542	if (wo->wo_flags & __WNOTHREAD)
				1543	break;
				1544	} while_each_thread(current, tsk);
				1545	read_unlock(&tasklist_lock);
				1546
				1547	notask:
				1548	retval = wo->notask_error;
				1549	if (!retval && !(wo->wo_flags & WNOHANG)) {
				1550	retval = -ERESTARTSYS;
				1551	if (!signal_pending(current)) {
				1552	schedule();
				1553	goto repeat;
				1554	}
				1555	}
				1556	end:
				1557	__set_current_state(TASK_RUNNING);
				1558	remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
				1559	return retval;
				1560	}
				1561
				1562	static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
				1563	int options, struct rusage *ru)
				1564	{
				1565	struct wait_opts wo;
				1566	struct pid *pid = NULL;
				1567	enum pid_type type;
				1568	long ret;
				1569	unsigned int f_flags;
				1570
				1571	if (options & ~(WNOHANG\|WNOWAIT\|WEXITED\|WSTOPPED\|WCONTINUED\|
				1572	__WNOTHREAD\|__WCLONE\|__WALL))
				1573	return -EINVAL;
				1574	if (!(options & (WEXITED\|WSTOPPED\|WCONTINUED)))
				1575	return -EINVAL;
				1576
				1577	switch (which) {
				1578	case P_ALL:
				1579	type = PIDTYPE_MAX;
				1580	break;
				1581	case P_PID:
				1582	type = PIDTYPE_PID;
				1583	if (upid <= 0)
				1584	return -EINVAL;
				1585
				1586	pid = find_get_pid(upid);
				1587	break;
				1588	case P_PGID:
				1589	type = PIDTYPE_PGID;
				1590	if (upid < 0)
				1591	return -EINVAL;
				1592
				1593	if (upid)
				1594	pid = find_get_pid(upid);
				1595	else
				1596	pid = get_task_pid(current, PIDTYPE_PGID);
				1597	break;
				1598	case P_PIDFD:
				1599	type = PIDTYPE_PID;
				1600	if (upid < 0)
				1601	return -EINVAL;
				1602
				1603	pid = pidfd_get_pid(upid, &f_flags);
				1604	if (IS_ERR(pid))
				1605	return PTR_ERR(pid);
				1606	break;
				1607	default:
				1608	return -EINVAL;
				1609	}
				1610
				1611	wo.wo_type = type;
				1612	wo.wo_pid = pid;
				1613	wo.wo_flags = options;
				1614	wo.wo_info = infop;
				1615	wo.wo_rusage = ru;
				1616	ret = do_wait(&wo);
				1617
				1618	put_pid(pid);
				1619	return ret;
				1620	}
				1621
				1622	SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
				1623	infop, int, options, struct rusage __user *, ru)
				1624	{
				1625	struct rusage r;
				1626	struct waitid_info info = {.status = 0};
				1627	long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
				1628	int signo = 0;
				1629
				1630	if (err > 0) {
				1631	signo = SIGCHLD;
				1632	err = 0;
				1633	if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
				1634	return -EFAULT;
				1635	}
				1636	if (!infop)
				1637	return err;
				1638
				1639	if (!user_access_begin(infop, sizeof(*infop)))
				1640	return -EFAULT;
				1641
				1642	unsafe_put_user(signo, &infop->si_signo, Efault);
				1643	unsafe_put_user(0, &infop->si_errno, Efault);
				1644	unsafe_put_user(info.cause, &infop->si_code, Efault);
				1645	unsafe_put_user(info.pid, &infop->si_pid, Efault);
				1646	unsafe_put_user(info.uid, &infop->si_uid, Efault);
				1647	unsafe_put_user(info.status, &infop->si_status, Efault);
				1648	user_access_end();
				1649	return err;
				1650	Efault:
				1651	user_access_end();
				1652	return -EFAULT;
				1653	}
				1654
				1655	long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
				1656	struct rusage *ru)
				1657	{
				1658	struct wait_opts wo;
				1659	struct pid *pid = NULL;
				1660	enum pid_type type;
				1661	long ret;
				1662
				1663	if (options & ~(WNOHANG\|WUNTRACED\|WCONTINUED\|
				1664	__WNOTHREAD\|__WCLONE\|__WALL))
				1665	return -EINVAL;
				1666
				1667	/* -INT_MIN is not defined */
				1668	if (upid == INT_MIN)
				1669	return -ESRCH;
				1670
				1671	if (upid == -1)
				1672	type = PIDTYPE_MAX;
				1673	else if (upid < 0) {
				1674	type = PIDTYPE_PGID;
				1675	pid = find_get_pid(-upid);
				1676	} else if (upid == 0) {
				1677	type = PIDTYPE_PGID;
				1678	pid = get_task_pid(current, PIDTYPE_PGID);
				1679	} else /* upid > 0 */ {
				1680	type = PIDTYPE_PID;
				1681	pid = find_get_pid(upid);
				1682	}
				1683
				1684	wo.wo_type = type;
				1685	wo.wo_pid = pid;
				1686	wo.wo_flags = options \| WEXITED;
				1687	wo.wo_info = NULL;
				1688	wo.wo_stat = 0;
				1689	wo.wo_rusage = ru;
				1690	ret = do_wait(&wo);
				1691	put_pid(pid);
				1692	if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr))
				1693	ret = -EFAULT;
				1694
				1695	return ret;
				1696	}
				1697
				1698	SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
				1699	int, options, struct rusage __user *, ru)
				1700	{
				1701	struct rusage r;
				1702	long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);
				1703
				1704	if (err > 0) {
				1705	if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
				1706	return -EFAULT;
				1707	}
				1708	return err;
				1709	}
				1710
				1711	#ifdef __ARCH_WANT_SYS_WAITPID
				1712
				1713	/*
				1714	* sys_waitpid() remains for compatibility. waitpid() should be
				1715	* implemented by calling sys_wait4() from libc.a.
				1716	*/
				1717	SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
				1718	{
				1719	return kernel_wait4(pid, stat_addr, options, NULL);
				1720	}
				1721
				1722	#endif
				1723
				1724	#ifdef CONFIG_COMPAT
				1725	COMPAT_SYSCALL_DEFINE4(wait4,
				1726	compat_pid_t, pid,
				1727	compat_uint_t __user *, stat_addr,
				1728	int, options,
				1729	struct compat_rusage __user *, ru)
				1730	{
				1731	struct rusage r;
				1732	long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);
				1733	if (err > 0) {
				1734	if (ru && put_compat_rusage(&r, ru))
				1735	return -EFAULT;
				1736	}
				1737	return err;
				1738	}
				1739
				1740	COMPAT_SYSCALL_DEFINE5(waitid,
				1741	int, which, compat_pid_t, pid,
				1742	struct compat_siginfo __user *, infop, int, options,
				1743	struct compat_rusage __user *, uru)
				1744	{
				1745	struct rusage ru;
				1746	struct waitid_info info = {.status = 0};
				1747	long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
				1748	int signo = 0;
				1749	if (err > 0) {
				1750	signo = SIGCHLD;
				1751	err = 0;
				1752	if (uru) {
				1753	/* kernel_waitid() overwrites everything in ru */
				1754	if (COMPAT_USE_64BIT_TIME)
				1755	err = copy_to_user(uru, &ru, sizeof(ru));
				1756	else
				1757	err = put_compat_rusage(&ru, uru);
				1758	if (err)
				1759	return -EFAULT;
				1760	}
				1761	}
				1762
				1763	if (!infop)
				1764	return err;
				1765
				1766	if (!user_access_begin(infop, sizeof(*infop)))
				1767	return -EFAULT;
				1768
				1769	unsafe_put_user(signo, &infop->si_signo, Efault);
				1770	unsafe_put_user(0, &infop->si_errno, Efault);
				1771	unsafe_put_user(info.cause, &infop->si_code, Efault);
				1772	unsafe_put_user(info.pid, &infop->si_pid, Efault);
				1773	unsafe_put_user(info.uid, &infop->si_uid, Efault);
				1774	unsafe_put_user(info.status, &infop->si_status, Efault);
				1775	user_access_end();
				1776	return err;
				1777	Efault:
				1778	user_access_end();
				1779	return -EFAULT;
				1780	}
				1781	#endif
				1782
				1783	__weak void abort(void)
				1784	{
				1785	BUG();
				1786
				1787	/* if that doesn't kill us, halt */
				1788	panic("Oops failed to kill thread");
				1789	}
				1790	EXPORT_SYMBOL(abort);