Blame - ap/os/linux/linux-3.4.x/kernel/exit.c - R306

blob: 320ab3535b5eaa2f643d92023665344cec941a76 [file] [log] [blame]

yuezonghe	824eb0c	2024-06-27 02:32:26 -0700	[diff] [blame]	1	/*
				2	* linux/kernel/exit.c
				3	*
				4	* Copyright (C) 1991, 1992 Linus Torvalds
				5	*/
				6
				7	#include <linux/mm.h>
				8	#include <linux/slab.h>
				9	#include <linux/interrupt.h>
				10	#include <linux/module.h>
				11	#include <linux/capability.h>
				12	#include <linux/completion.h>
				13	#include <linux/personality.h>
				14	#include <linux/tty.h>
				15	#include <linux/iocontext.h>
				16	#include <linux/key.h>
				17	#include <linux/security.h>
				18	#include <linux/cpu.h>
				19	#include <linux/acct.h>
				20	#include <linux/tsacct_kern.h>
				21	#include <linux/file.h>
				22	#include <linux/fdtable.h>
				23	#include <linux/binfmts.h>
				24	#include <linux/nsproxy.h>
				25	#include <linux/pid_namespace.h>
				26	#include <linux/ptrace.h>
				27	#include <linux/profile.h>
				28	#include <linux/mount.h>
				29	#include <linux/proc_fs.h>
				30	#include <linux/kthread.h>
				31	#include <linux/mempolicy.h>
				32	#include <linux/taskstats_kern.h>
				33	#include <linux/delayacct.h>
				34	#include <linux/freezer.h>
				35	#include <linux/cgroup.h>
				36	#include <linux/syscalls.h>
				37	#include <linux/signal.h>
				38	#include <linux/posix-timers.h>
				39	#include <linux/cn_proc.h>
				40	#include <linux/mutex.h>
				41	#include <linux/futex.h>
				42	#include <linux/pipe_fs_i.h>
				43	#include <linux/audit.h> /* for audit_free() */
				44	#include <linux/resource.h>
				45	#include <linux/blkdev.h>
				46	#include <linux/task_io_accounting_ops.h>
				47	#include <linux/tracehook.h>
				48	#include <linux/fs_struct.h>
				49	#include <linux/init_task.h>
				50	#include <linux/perf_event.h>
				51	#include <trace/events/sched.h>
				52	#include <linux/hw_breakpoint.h>
				53	#include <linux/oom.h>
				54	#include <linux/writeback.h>
				55	#include <linux/shm.h>
				56
				57	#include <asm/uaccess.h>
				58	#include <asm/unistd.h>
				59	#include <asm/pgtable.h>
				60	#include <asm/mmu_context.h>
				61
				62	static void exit_mm(struct task_struct * tsk);
				63
				64	static void __unhash_process(struct task_struct *p, bool group_dead)
				65	{
				66	nr_threads--;
				67	detach_pid(p, PIDTYPE_PID);
				68	if (group_dead) {
				69	detach_pid(p, PIDTYPE_PGID);
				70	detach_pid(p, PIDTYPE_SID);
				71
				72	list_del_rcu(&p->tasks);
				73	list_del_init(&p->sibling);
				74	__this_cpu_dec(process_counts);
				75	}
				76	list_del_rcu(&p->thread_group);
				77	list_del_rcu(&p->thread_node);
				78	}
				79
				80	/*
				81	* This function expects the tasklist_lock write-locked.
				82	*/
				83	static void __exit_signal(struct task_struct *tsk)
				84	{
				85	struct signal_struct *sig = tsk->signal;
				86	bool group_dead = thread_group_leader(tsk);
				87	struct sighand_struct *sighand;
				88	struct tty_struct *uninitialized_var(tty);
				89
				90	sighand = rcu_dereference_check(tsk->sighand,
				91	lockdep_tasklist_lock_is_held());
				92	spin_lock(&sighand->siglock);
				93
				94	posix_cpu_timers_exit(tsk);
				95	if (group_dead) {
				96	posix_cpu_timers_exit_group(tsk);
				97	tty = sig->tty;
				98	sig->tty = NULL;
				99	} else {
				100	/*
				101	* This can only happen if the caller is de_thread().
				102	* FIXME: this is the temporary hack, we should teach
				103	* posix-cpu-timers to handle this case correctly.
				104	*/
				105	if (unlikely(has_group_leader_pid(tsk)))
				106	posix_cpu_timers_exit_group(tsk);
				107
				108	/*
				109	* If there is any task waiting for the group exit
				110	* then notify it:
				111	*/
				112	if (sig->notify_count > 0 && !--sig->notify_count)
				113	wake_up_process(sig->group_exit_task);
				114
				115	if (tsk == sig->curr_target)
				116	sig->curr_target = next_thread(tsk);
				117	/*
				118	* Accumulate here the counters for all threads but the
				119	* group leader as they die, so they can be added into
				120	* the process-wide totals when those are taken.
				121	* The group leader stays around as a zombie as long
				122	* as there are other threads. When it gets reaped,
				123	* the exit.c code will add its counts into these totals.
				124	* We won't ever get here for the group leader, since it
				125	* will have been the last reference on the signal_struct.
				126	*/
				127	sig->utime += tsk->utime;
				128	sig->stime += tsk->stime;
				129	sig->gtime += tsk->gtime;
				130	sig->min_flt += tsk->min_flt;
				131	sig->maj_flt += tsk->maj_flt;
				132	sig->nvcsw += tsk->nvcsw;
				133	sig->nivcsw += tsk->nivcsw;
				134	sig->inblock += task_io_get_inblock(tsk);
				135	sig->oublock += task_io_get_oublock(tsk);
				136	task_io_accounting_add(&sig->ioac, &tsk->ioac);
				137	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
				138	}
				139
				140	sig->nr_threads--;
				141	__unhash_process(tsk, group_dead);
				142
				143	/*
				144	* Do this under ->siglock, we can race with another thread
				145	* doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
				146	*/
				147	flush_task_sigqueue(tsk);
				148	tsk->sighand = NULL;
				149	spin_unlock(&sighand->siglock);
				150
				151	__cleanup_sighand(sighand);
				152	clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
				153	if (group_dead) {
				154	flush_sigqueue(&sig->shared_pending);
				155	tty_kref_put(tty);
				156	}
				157	}
				158
				159	static void delayed_put_task_struct(struct rcu_head *rhp)
				160	{
				161	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
				162
				163	perf_event_delayed_put(tsk);
				164	trace_sched_process_free(tsk);
				165	put_task_struct(tsk);
				166	}
				167
				168
				169	void release_task(struct task_struct * p)
				170	{
				171	struct task_struct *leader;
				172	int zap_leader;
				173	repeat:
				174	/* don't need to get the RCU readlock here - the process is dead and
				175	* can't be modifying its own credentials. But shut RCU-lockdep up */
				176	rcu_read_lock();
				177	atomic_dec(&__task_cred(p)->user->processes);
				178	rcu_read_unlock();
				179
				180	proc_flush_task(p);
				181
				182	write_lock_irq(&tasklist_lock);
				183	ptrace_release_task(p);
				184	__exit_signal(p);
				185
				186	/*
				187	* If we are the last non-leader member of the thread
				188	* group, and the leader is zombie, then notify the
				189	* group leader's parent process. (if it wants notification.)
				190	*/
				191	zap_leader = 0;
				192	leader = p->group_leader;
				193	if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
				194	/*
				195	* If we were the last child thread and the leader has
				196	* exited already, and the leader's parent ignores SIGCHLD,
				197	* then we are the one who should release the leader.
				198	*/
				199	zap_leader = do_notify_parent(leader, leader->exit_signal);
				200	if (zap_leader)
				201	leader->exit_state = EXIT_DEAD;
				202	}
				203
				204	write_unlock_irq(&tasklist_lock);
				205	release_thread(p);
				206	call_rcu(&p->rcu, delayed_put_task_struct);
				207
				208	p = leader;
				209	if (unlikely(zap_leader))
				210	goto repeat;
				211	}
				212
				213	/*
				214	* This checks not only the pgrp, but falls back on the pid if no
				215	* satisfactory pgrp is found. I dunno - gdb doesn't work correctly
				216	* without this...
				217	*
				218	* The caller must hold rcu lock or the tasklist lock.
				219	*/
				220	struct pid session_of_pgrp(struct pid pgrp)
				221	{
				222	struct task_struct *p;
				223	struct pid *sid = NULL;
				224
				225	p = pid_task(pgrp, PIDTYPE_PGID);
				226	if (p == NULL)
				227	p = pid_task(pgrp, PIDTYPE_PID);
				228	if (p != NULL)
				229	sid = task_session(p);
				230
				231	return sid;
				232	}
				233
				234	/*
				235	* Determine if a process group is "orphaned", according to the POSIX
				236	* definition in 2.2.2.52. Orphaned process groups are not to be affected
				237	* by terminal-generated stop signals. Newly orphaned process groups are
				238	* to receive a SIGHUP and a SIGCONT.
				239	*
				240	* "I ask you, have you ever known what it is to be an orphan?"
				241	*/
				242	static int will_become_orphaned_pgrp(struct pid pgrp, struct task_struct ignored_task)
				243	{
				244	struct task_struct *p;
				245
				246	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
				247	if ((p == ignored_task) \|\|
				248	(p->exit_state && thread_group_empty(p)) \|\|
				249	is_global_init(p->real_parent))
				250	continue;
				251
				252	if (task_pgrp(p->real_parent) != pgrp &&
				253	task_session(p->real_parent) == task_session(p))
				254	return 0;
				255	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
				256
				257	return 1;
				258	}
				259
				260	int is_current_pgrp_orphaned(void)
				261	{
				262	int retval;
				263
				264	read_lock(&tasklist_lock);
				265	retval = will_become_orphaned_pgrp(task_pgrp(current), NULL);
				266	read_unlock(&tasklist_lock);
				267
				268	return retval;
				269	}
				270
				271	static bool has_stopped_jobs(struct pid *pgrp)
				272	{
				273	struct task_struct *p;
				274
				275	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
				276	if (p->signal->flags & SIGNAL_STOP_STOPPED)
				277	return true;
				278	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
				279
				280	return false;
				281	}
				282
				283	/*
				284	* Check to see if any process groups have become orphaned as
				285	* a result of our exiting, and if they have any stopped jobs,
				286	* send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
				287	*/
				288	static void
				289	kill_orphaned_pgrp(struct task_struct tsk, struct task_struct parent)
				290	{
				291	struct pid *pgrp = task_pgrp(tsk);
				292	struct task_struct *ignored_task = tsk;
				293
				294	if (!parent)
				295	/* exit: our father is in a different pgrp than
				296	* we are and we were the only connection outside.
				297	*/
				298	parent = tsk->real_parent;
				299	else
				300	/* reparent: our child is in a different pgrp than
				301	* we are, and it was the only connection outside.
				302	*/
				303	ignored_task = NULL;
				304
				305	if (task_pgrp(parent) != pgrp &&
				306	task_session(parent) == task_session(tsk) &&
				307	will_become_orphaned_pgrp(pgrp, ignored_task) &&
				308	has_stopped_jobs(pgrp)) {
				309	__kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
				310	__kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
				311	}
				312	}
				313
				314	/**
				315	* reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd
				316	*
				317	* If a kernel thread is launched as a result of a system call, or if
				318	* it ever exits, it should generally reparent itself to kthreadd so it
				319	* isn't in the way of other processes and is correctly cleaned up on exit.
				320	*
				321	* The various task state such as scheduling policy and priority may have
				322	* been inherited from a user process, so we reset them to sane values here.
				323	*
				324	* NOTE that reparent_to_kthreadd() gives the caller full capabilities.
				325	*/
				326	static void reparent_to_kthreadd(void)
				327	{
				328	write_lock_irq(&tasklist_lock);
				329
				330	ptrace_unlink(current);
				331	/* Reparent to init */
				332	current->real_parent = current->parent = kthreadd_task;
				333	list_move_tail(&current->sibling, &current->real_parent->children);
				334
				335	/* Set the exit signal to SIGCHLD so we signal init on exit */
				336	current->exit_signal = SIGCHLD;
				337
				338	if (task_nice(current) < 0)
				339	set_user_nice(current, 0);
				340	/* cpus_allowed? */
				341	/* rt_priority? */
				342	/* signals? */
				343	memcpy(current->signal->rlim, init_task.signal->rlim,
				344	sizeof(current->signal->rlim));
				345
				346	atomic_inc(&init_cred.usage);
				347	commit_creds(&init_cred);
				348	write_unlock_irq(&tasklist_lock);
				349	}
				350
				351	void __set_special_pids(struct pid *pid)
				352	{
				353	struct task_struct *curr = current->group_leader;
				354
				355	if (task_session(curr) != pid)
				356	change_pid(curr, PIDTYPE_SID, pid);
				357
				358	if (task_pgrp(curr) != pid)
				359	change_pid(curr, PIDTYPE_PGID, pid);
				360	}
				361
				362	static void set_special_pids(struct pid *pid)
				363	{
				364	write_lock_irq(&tasklist_lock);
				365	__set_special_pids(pid);
				366	write_unlock_irq(&tasklist_lock);
				367	}
				368
				369	/*
				370	* Let kernel threads use this to say that they allow a certain signal.
				371	* Must not be used if kthread was cloned with CLONE_SIGHAND.
				372	*/
				373	int allow_signal(int sig)
				374	{
				375	if (!valid_signal(sig) \|\| sig < 1)
				376	return -EINVAL;
				377
				378	spin_lock_irq(&current->sighand->siglock);
				379	/* This is only needed for daemonize()'ed kthreads */
				380	sigdelset(&current->blocked, sig);
				381	/*
				382	* Kernel threads handle their own signals. Let the signal code
				383	* know it'll be handled, so that they don't get converted to
				384	* SIGKILL or just silently dropped.
				385	*/
				386	current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2;
				387	recalc_sigpending();
				388	spin_unlock_irq(&current->sighand->siglock);
				389	return 0;
				390	}
				391
				392	EXPORT_SYMBOL(allow_signal);
				393
				394	int disallow_signal(int sig)
				395	{
				396	if (!valid_signal(sig) \|\| sig < 1)
				397	return -EINVAL;
				398
				399	spin_lock_irq(&current->sighand->siglock);
				400	current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN;
				401	recalc_sigpending();
				402	spin_unlock_irq(&current->sighand->siglock);
				403	return 0;
				404	}
				405
				406	EXPORT_SYMBOL(disallow_signal);
				407
				408	/*
				409	* Put all the gunge required to become a kernel thread without
				410	* attached user resources in one place where it belongs.
				411	*/
				412
				413	void daemonize(const char *name, ...)
				414	{
				415	va_list args;
				416	sigset_t blocked;
				417
				418	va_start(args, name);
				419	vsnprintf(current->comm, sizeof(current->comm), name, args);
				420	va_end(args);
				421
				422	/*
				423	* If we were started as result of loading a module, close all of the
				424	* user space pages. We don't need them, and if we didn't close them
				425	* they would be locked into memory.
				426	*/
				427	exit_mm(current);
				428	/*
				429	* We don't want to get frozen, in case system-wide hibernation
				430	* or suspend transition begins right now.
				431	*/
				432	current->flags \|= (PF_NOFREEZE \| PF_KTHREAD);
				433
				434	if (current->nsproxy != &init_nsproxy) {
				435	get_nsproxy(&init_nsproxy);
				436	switch_task_namespaces(current, &init_nsproxy);
				437	}
				438	set_special_pids(&init_struct_pid);
				439	proc_clear_tty(current);
				440
				441	/* Block and flush all signals */
				442	sigfillset(&blocked);
				443	sigprocmask(SIG_BLOCK, &blocked, NULL);
				444	flush_signals(current);
				445
				446	/* Become as one with the init task */
				447
				448	daemonize_fs_struct();
				449	exit_files(current);
				450	current->files = init_task.files;
				451	atomic_inc(&current->files->count);
				452
				453	reparent_to_kthreadd();
				454	}
				455
				456	EXPORT_SYMBOL(daemonize);
				457
				458	static void close_files(struct files_struct * files)
				459	{
				460	int i, j;
				461	struct fdtable *fdt;
				462
				463	j = 0;
				464
				465	/*
				466	* It is safe to dereference the fd table without RCU or
				467	* ->file_lock because this is the last reference to the
				468	* files structure. But use RCU to shut RCU-lockdep up.
				469	*/
				470	rcu_read_lock();
				471	fdt = files_fdtable(files);
				472	rcu_read_unlock();
				473	for (;;) {
				474	unsigned long set;
				475	i = j * BITS_PER_LONG;
				476	if (i >= fdt->max_fds)
				477	break;
				478	set = fdt->open_fds[j++];
				479	while (set) {
				480	if (set & 1) {
				481	struct file * file = xchg(&fdt->fd[i], NULL);
				482	if (file) {
				483	filp_close(file, files);
				484	cond_resched();
				485	}
				486	}
				487	i++;
				488	set >>= 1;
				489	}
				490	}
				491	}
				492
				493	struct files_struct get_files_struct(struct task_struct task)
				494	{
				495	struct files_struct *files;
				496
				497	task_lock(task);
				498	files = task->files;
				499	if (files)
				500	atomic_inc(&files->count);
				501	task_unlock(task);
				502
				503	return files;
				504	}
				505
				506	void put_files_struct(struct files_struct *files)
				507	{
				508	struct fdtable *fdt;
				509
				510	if (atomic_dec_and_test(&files->count)) {
				511	close_files(files);
				512	/*
				513	* Free the fd and fdset arrays if we expanded them.
				514	* If the fdtable was embedded, pass files for freeing
				515	* at the end of the RCU grace period. Otherwise,
				516	* you can free files immediately.
				517	*/
				518	rcu_read_lock();
				519	fdt = files_fdtable(files);
				520	if (fdt != &files->fdtab)
				521	kmem_cache_free(files_cachep, files);
				522	free_fdtable(fdt);
				523	rcu_read_unlock();
				524	}
				525	}
				526
				527	void reset_files_struct(struct files_struct *files)
				528	{
				529	struct task_struct *tsk = current;
				530	struct files_struct *old;
				531
				532	old = tsk->files;
				533	task_lock(tsk);
				534	tsk->files = files;
				535	task_unlock(tsk);
				536	put_files_struct(old);
				537	}
				538
				539	void exit_files(struct task_struct *tsk)
				540	{
				541	struct files_struct * files = tsk->files;
				542
				543	if (files) {
				544	task_lock(tsk);
				545	tsk->files = NULL;
				546	task_unlock(tsk);
				547	put_files_struct(files);
				548	}
				549	}
				550
				551	#ifdef CONFIG_MM_OWNER
				552	/*
				553	* A task is exiting. If it owned this mm, find a new owner for the mm.
				554	*/
				555	void mm_update_next_owner(struct mm_struct *mm)
				556	{
				557	struct task_struct c, g, *p = current;
				558
				559	retry:
				560	/*
				561	* If the exiting or execing task is not the owner, it's
				562	* someone else's problem.
				563	*/
				564	if (mm->owner != p)
				565	return;
				566	/*
				567	* The current owner is exiting/execing and there are no other
				568	* candidates. Do not leave the mm pointing to a possibly
				569	* freed task structure.
				570	*/
				571	if (atomic_read(&mm->mm_users) <= 1) {
				572	mm->owner = NULL;
				573	return;
				574	}
				575
				576	read_lock(&tasklist_lock);
				577	/*
				578	* Search in the children
				579	*/
				580	list_for_each_entry(c, &p->children, sibling) {
				581	if (c->mm == mm)
				582	goto assign_new_owner;
				583	}
				584
				585	/*
				586	* Search in the siblings
				587	*/
				588	list_for_each_entry(c, &p->real_parent->children, sibling) {
				589	if (c->mm == mm)
				590	goto assign_new_owner;
				591	}
				592
				593	/*
				594	* Search through everything else. We should not get
				595	* here often
				596	*/
				597	do_each_thread(g, c) {
				598	if (c->mm == mm)
				599	goto assign_new_owner;
				600	} while_each_thread(g, c);
				601
				602	read_unlock(&tasklist_lock);
				603	/*
				604	* We found no owner yet mm_users > 1: this implies that we are
				605	* most likely racing with swapoff (try_to_unuse()) or /proc or
				606	* ptrace or page migration (get_task_mm()). Mark owner as NULL.
				607	*/
				608	mm->owner = NULL;
				609	return;
				610
				611	assign_new_owner:
				612	BUG_ON(c == p);
				613	get_task_struct(c);
				614	/*
				615	* The task_lock protects c->mm from changing.
				616	* We always want mm->owner->mm == mm
				617	*/
				618	task_lock(c);
				619	/*
				620	* Delay read_unlock() till we have the task_lock()
				621	* to ensure that c does not slip away underneath us
				622	*/
				623	read_unlock(&tasklist_lock);
				624	if (c->mm != mm) {
				625	task_unlock(c);
				626	put_task_struct(c);
				627	goto retry;
				628	}
				629	mm->owner = c;
				630	task_unlock(c);
				631	put_task_struct(c);
				632	}
				633	#endif /* CONFIG_MM_OWNER */
				634
				635	/*
				636	* Turn us into a lazy TLB process if we
				637	* aren't already..
				638	*/
				639	static void exit_mm(struct task_struct * tsk)
				640	{
				641	struct mm_struct *mm = tsk->mm;
				642	struct core_state *core_state;
				643
				644	mm_release(tsk, mm);
				645	if (!mm)
				646	return;
				647	sync_mm_rss(mm);
				648	/*
				649	* Serialize with any possible pending coredump.
				650	* We must hold mmap_sem around checking core_state
				651	* and clearing tsk->mm. The core-inducing thread
				652	* will increment ->nr_threads for each thread in the
				653	* group with ->mm != NULL.
				654	*/
				655	down_read(&mm->mmap_sem);
				656	core_state = mm->core_state;
				657	if (core_state) {
				658	struct core_thread self;
				659	up_read(&mm->mmap_sem);
				660
				661	self.task = tsk;
				662	self.next = xchg(&core_state->dumper.next, &self);
				663	/*
				664	* Implies mb(), the result of xchg() must be visible
				665	* to core_state->dumper.
				666	*/
				667	if (atomic_dec_and_test(&core_state->nr_threads))
				668	complete(&core_state->startup);
				669
				670	for (;;) {
				671	set_task_state(tsk, TASK_UNINTERRUPTIBLE);
				672	if (!self.task) /* see coredump_finish() */
				673	break;
				674	schedule();
				675	}
				676	__set_task_state(tsk, TASK_RUNNING);
				677	down_read(&mm->mmap_sem);
				678	}
				679	atomic_inc(&mm->mm_count);
				680	BUG_ON(mm != tsk->active_mm);
				681	/* more a memory barrier than a real lock */
				682	task_lock(tsk);
				683	tsk->mm = NULL;
				684	up_read(&mm->mmap_sem);
				685	enter_lazy_tlb(mm, current);
				686	task_unlock(tsk);
				687	mm_update_next_owner(mm);
				688	mmput(mm);
				689	}
				690
				691	/*
				692	* When we die, we re-parent all our children, and try to:
				693	* 1. give them to another thread in our thread group, if such a member exists
				694	* 2. give it to the first ancestor process which prctl'd itself as a
				695	* child_subreaper for its children (like a service manager)
				696	* 3. give it to the init process (PID 1) in our pid namespace
				697	*/
				698	static struct task_struct find_new_reaper(struct task_struct father)
				699	__releases(&tasklist_lock)
				700	__acquires(&tasklist_lock)
				701	{
				702	struct pid_namespace *pid_ns = task_active_pid_ns(father);
				703	struct task_struct *thread;
				704
				705	thread = father;
				706	while_each_thread(father, thread) {
				707	if (thread->flags & PF_EXITING)
				708	continue;
				709	if (unlikely(pid_ns->child_reaper == father))
				710	pid_ns->child_reaper = thread;
				711	return thread;
				712	}
				713
				714	if (unlikely(pid_ns->child_reaper == father)) {
				715	write_unlock_irq(&tasklist_lock);
				716	if (unlikely(pid_ns == &init_pid_ns)) {
				717	panic("Attempted to kill init! exitcode=0x%08x\n",
				718	father->signal->group_exit_code ?:
				719	father->exit_code);
				720	}
				721
				722	zap_pid_ns_processes(pid_ns);
				723	write_lock_irq(&tasklist_lock);
				724	/*
				725	* We can not clear ->child_reaper or leave it alone.
				726	* There may by stealth EXIT_DEAD tasks on ->children,
				727	* forget_original_parent() must move them somewhere.
				728	*/
				729	pid_ns->child_reaper = init_pid_ns.child_reaper;
				730	} else if (father->signal->has_child_subreaper) {
				731	struct task_struct *reaper;
				732
				733	/*
				734	* Find the first ancestor marked as child_subreaper.
				735	* Note that the code below checks same_thread_group(reaper,
				736	* pid_ns->child_reaper). This is what we need to DTRT in a
				737	* PID namespace. However we still need the check above, see
				738	* http://marc.info/?l=linux-kernel&m=131385460420380
				739	*/
				740	for (reaper = father->real_parent;
				741	reaper != &init_task;
				742	reaper = reaper->real_parent) {
				743	if (same_thread_group(reaper, pid_ns->child_reaper))
				744	break;
				745	if (!reaper->signal->is_child_subreaper)
				746	continue;
				747	thread = reaper;
				748	do {
				749	if (!(thread->flags & PF_EXITING))
				750	return reaper;
				751	} while_each_thread(reaper, thread);
				752	}
				753	}
				754
				755	return pid_ns->child_reaper;
				756	}
				757
				758	/*
				759	* Any that need to be release_task'd are put on the @dead list.
				760	*/
				761	static void reparent_leader(struct task_struct father, struct task_struct p,
				762	struct list_head *dead)
				763	{
				764	list_move_tail(&p->sibling, &p->real_parent->children);
				765	/*
				766	* If this is a threaded reparent there is no need to
				767	* notify anyone anything has happened.
				768	*/
				769	if (same_thread_group(p->real_parent, father))
				770	return;
				771
				772	/*
				773	* We don't want people slaying init.
				774	*
				775	* Note: we do this even if it is EXIT_DEAD, wait_task_zombie()
				776	* can change ->exit_state to EXIT_ZOMBIE. If this is the final
				777	* state, do_notify_parent() was already called and ->exit_signal
				778	* doesn't matter.
				779	*/
				780	p->exit_signal = SIGCHLD;
				781
				782	if (p->exit_state == EXIT_DEAD)
				783	return;
				784
				785	/* If it has exited notify the new parent about this child's death. */
				786	if (!p->ptrace &&
				787	p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
				788	if (do_notify_parent(p, p->exit_signal)) {
				789	p->exit_state = EXIT_DEAD;
				790	list_move_tail(&p->sibling, dead);
				791	}
				792	}
				793
				794	kill_orphaned_pgrp(p, father);
				795	}
				796
				797	static void forget_original_parent(struct task_struct *father)
				798	{
				799	struct task_struct p, n, *reaper;
				800	LIST_HEAD(dead_children);
				801
				802	write_lock_irq(&tasklist_lock);
				803	/*
				804	* Note that exit_ptrace() and find_new_reaper() might
				805	* drop tasklist_lock and reacquire it.
				806	*/
				807	exit_ptrace(father);
				808	reaper = find_new_reaper(father);
				809
				810	list_for_each_entry_safe(p, n, &father->children, sibling) {
				811	struct task_struct *t = p;
				812	do {
				813	t->real_parent = reaper;
				814	if (t->parent == father) {
				815	BUG_ON(t->ptrace);
				816	t->parent = t->real_parent;
				817	}
				818	if (t->pdeath_signal)
				819	group_send_sig_info(t->pdeath_signal,
				820	SEND_SIG_NOINFO, t);
				821	} while_each_thread(p, t);
				822	reparent_leader(father, p, &dead_children);
				823	}
				824	write_unlock_irq(&tasklist_lock);
				825
				826	BUG_ON(!list_empty(&father->children));
				827
				828	list_for_each_entry_safe(p, n, &dead_children, sibling) {
				829	list_del_init(&p->sibling);
				830	release_task(p);
				831	}
				832	}
				833
				834	/*
				835	* Send signals to all our closest relatives so that they know
				836	* to properly mourn us..
				837	*/
				838	static void exit_notify(struct task_struct *tsk, int group_dead)
				839	{
				840	bool autoreap;
				841
				842	/*
				843	* This does two things:
				844	*
				845	* A. Make init inherit all the child processes
				846	* B. Check to see if any process groups have become orphaned
				847	* as a result of our exiting, and if they have any stopped
				848	* jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
				849	*/
				850	forget_original_parent(tsk);
				851	exit_task_namespaces(tsk);
				852
				853	write_lock_irq(&tasklist_lock);
				854	if (group_dead)
				855	kill_orphaned_pgrp(tsk->group_leader, NULL);
				856
				857	if (unlikely(tsk->ptrace)) {
				858	int sig = thread_group_leader(tsk) &&
				859	thread_group_empty(tsk) &&
				860	!ptrace_reparented(tsk) ?
				861	tsk->exit_signal : SIGCHLD;
				862	autoreap = do_notify_parent(tsk, sig);
				863	} else if (thread_group_leader(tsk)) {
				864	autoreap = thread_group_empty(tsk) &&
				865	do_notify_parent(tsk, tsk->exit_signal);
				866	} else {
				867	autoreap = true;
				868	}
				869
				870	tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
				871
				872	/* mt-exec, de_thread() is waiting for group leader */
				873	if (unlikely(tsk->signal->notify_count < 0))
				874	wake_up_process(tsk->signal->group_exit_task);
				875	write_unlock_irq(&tasklist_lock);
				876
				877	/* If the process is dead, release it - nobody will wait for it */
				878	if (autoreap)
				879	release_task(tsk);
				880	}
				881
				882	#ifdef CONFIG_DEBUG_STACK_USAGE
				883	static void check_stack_usage(void)
				884	{
				885	static DEFINE_SPINLOCK(low_water_lock);
				886	static int lowest_to_date = THREAD_SIZE;
				887	unsigned long free;
				888
				889	free = stack_not_used(current);
				890
				891	if (free >= lowest_to_date)
				892	return;
				893
				894	spin_lock(&low_water_lock);
				895	if (free < lowest_to_date) {
				896	printk(KERN_WARNING "%s used greatest stack depth: %lu bytes "
				897	"left\n",
				898	current->comm, free);
				899	lowest_to_date = free;
				900	}
				901	spin_unlock(&low_water_lock);
				902	}
				903	#else
				904	static inline void check_stack_usage(void) {}
				905	#endif
				906
				907	void do_exit(long code)
				908	{
				909	struct task_struct *tsk = current;
				910	int group_dead;
				911
				912	profile_task_exit(tsk);
				913
				914	WARN_ON(blk_needs_flush_plug(tsk));
				915
				916	if (unlikely(in_interrupt()))
				917	panic("Aiee, killing interrupt handler!");
				918	if (unlikely(!tsk->pid))
				919	panic("Attempted to kill the idle task!");
				920
				921	/*
				922	* If do_exit is called because this processes oopsed, it's possible
				923	* that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
				924	* continuing. Amongst other possible reasons, this is to prevent
				925	* mm_release()->clear_child_tid() from writing to a user-controlled
				926	* kernel address.
				927	*/
				928	set_fs(USER_DS);
				929
				930	ptrace_event(PTRACE_EVENT_EXIT, code);
				931
				932	validate_creds_for_do_exit(tsk);
				933
				934	/*
				935	* We're taking recursive faults here in do_exit. Safest is to just
				936	* leave this task alone and wait for reboot.
				937	*/
				938	if (unlikely(tsk->flags & PF_EXITING)) {
				939	printk(KERN_ALERT
				940	"Fixing recursive fault but reboot is needed!\n");
				941	/*
				942	* We can do this unlocked here. The futex code uses
				943	* this flag just to verify whether the pi state
				944	* cleanup has been done or not. In the worst case it
				945	* loops once more. We pretend that the cleanup was
				946	* done as there is no way to return. Either the
				947	* OWNER_DIED bit is set by now or we push the blocked
				948	* task into the wait for ever nirwana as well.
				949	*/
				950	tsk->flags \|= PF_EXITPIDONE;
				951	set_current_state(TASK_UNINTERRUPTIBLE);
				952	schedule();
				953	}
				954
				955	exit_signals(tsk); /* sets PF_EXITING */
				956	/*
				957	* tsk->flags are checked in the futex code to protect against
				958	* an exiting task cleaning up the robust pi futexes.
				959	*/
				960	smp_mb();
				961	raw_spin_unlock_wait(&tsk->pi_lock);
				962
				963	exit_irq_thread();
				964
				965	if (unlikely(in_atomic()))
				966	printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
				967	current->comm, task_pid_nr(current),
				968	preempt_count());
				969
				970	acct_update_integrals(tsk);
				971	/* sync mm's RSS info before statistics gathering */
				972	if (tsk->mm)
				973	sync_mm_rss(tsk->mm);
				974	group_dead = atomic_dec_and_test(&tsk->signal->live);
				975	if (group_dead) {
				976	hrtimer_cancel(&tsk->signal->real_timer);
				977	exit_itimers(tsk->signal);
				978	if (tsk->mm)
				979	setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
				980	}
				981	acct_collect(code, group_dead);
				982	if (group_dead)
				983	tty_audit_exit();
				984	audit_free(tsk);
				985
				986	tsk->exit_code = code;
				987	taskstats_exit(tsk, group_dead);
				988
				989	exit_mm(tsk);
				990
				991	if (group_dead)
				992	acct_process();
				993	trace_sched_process_exit(tsk);
				994
				995	exit_sem(tsk);
				996	exit_shm(tsk);
				997	exit_files(tsk);
				998	exit_fs(tsk);
				999	check_stack_usage();
				1000	exit_thread();
				1001
				1002	/*
				1003	* Flush inherited counters to the parent - before the parent
				1004	* gets woken up by child-exit notifications.
				1005	*
				1006	* because of cgroup mode, must be called before cgroup_exit()
				1007	*/
				1008	perf_event_exit_task(tsk);
				1009
				1010	cgroup_exit(tsk, 1);
				1011
				1012	if (group_dead)
				1013	disassociate_ctty(1);
				1014
				1015	module_put(task_thread_info(tsk)->exec_domain->module);
				1016
				1017	proc_exit_connector(tsk);
				1018
				1019	/*
				1020	* FIXME: do that only when needed, using sched_exit tracepoint
				1021	*/
				1022	ptrace_put_breakpoints(tsk);
				1023
				1024	exit_notify(tsk, group_dead);
				1025	#ifdef CONFIG_NUMA
				1026	task_lock(tsk);
				1027	mpol_put(tsk->mempolicy);
				1028	tsk->mempolicy = NULL;
				1029	task_unlock(tsk);
				1030	#endif
				1031	#ifdef CONFIG_FUTEX
				1032	if (unlikely(current->pi_state_cache))
				1033	kfree(current->pi_state_cache);
				1034	#endif
				1035	/*
				1036	* Make sure we are holding no locks:
				1037	*/
				1038	debug_check_no_locks_held(tsk);
				1039	/*
				1040	* We can do this unlocked here. The futex code uses this flag
				1041	* just to verify whether the pi state cleanup has been done
				1042	* or not. In the worst case it loops once more.
				1043	*/
				1044	tsk->flags \|= PF_EXITPIDONE;
				1045
				1046	if (tsk->io_context)
				1047	exit_io_context(tsk);
				1048
				1049	if (tsk->splice_pipe)
				1050	__free_pipe_info(tsk->splice_pipe);
				1051
				1052	validate_creds_for_do_exit(tsk);
				1053
				1054	preempt_disable();
				1055	if (tsk->nr_dirtied)
				1056	__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
				1057	exit_rcu();
				1058
				1059	/*
				1060	* The setting of TASK_RUNNING by try_to_wake_up() may be delayed
				1061	* when the following two conditions become true.
				1062	* - There is race condition of mmap_sem (It is acquired by
				1063	* exit_mm()), and
				1064	* - SMI occurs before setting TASK_RUNINNG.
				1065	* (or hypervisor of virtual machine switches to other guest)
				1066	* As a result, we may become TASK_RUNNING after becoming TASK_DEAD
				1067	*
				1068	* To avoid it, we have to wait for releasing tsk->pi_lock which
				1069	* is held by try_to_wake_up()
				1070	*/
				1071	smp_mb();
				1072	raw_spin_unlock_wait(&tsk->pi_lock);
				1073
				1074	/* causes final put_task_struct in finish_task_switch(). */
				1075	tsk->state = TASK_DEAD;
				1076	tsk->flags \|= PF_NOFREEZE; /* tell freezer to ignore us */
				1077	schedule();
				1078	BUG();
				1079	/* Avoid "noreturn function does return". */
				1080	for (;;)
				1081	cpu_relax(); /* For when BUG is null */
				1082	}
				1083
				1084	EXPORT_SYMBOL(do_exit);
				1085
				1086	void complete_and_exit(struct completion *comp, long code)
				1087	{
				1088	if (comp)
				1089	complete(comp);
				1090
				1091	do_exit(code);
				1092	}
				1093
				1094	EXPORT_SYMBOL(complete_and_exit);
				1095
				1096	SYSCALL_DEFINE1(exit, int, error_code)
				1097	{
				1098	do_exit((error_code&0xff)<<8);
				1099	}
				1100
				1101	/*
				1102	* Take down every thread in the group. This is called by fatal signals
				1103	* as well as by sys_exit_group (below).
				1104	*/
				1105	void
				1106	do_group_exit(int exit_code)
				1107	{
				1108	struct signal_struct *sig = current->signal;
				1109
				1110	BUG_ON(exit_code & 0x80); /* core dumps don't get here */
				1111
				1112	if (signal_group_exit(sig))
				1113	exit_code = sig->group_exit_code;
				1114	else if (!thread_group_empty(current)) {
				1115	struct sighand_struct *const sighand = current->sighand;
				1116	spin_lock_irq(&sighand->siglock);
				1117	if (signal_group_exit(sig))
				1118	/* Another thread got here before we took the lock. */
				1119	exit_code = sig->group_exit_code;
				1120	else {
				1121	sig->group_exit_code = exit_code;
				1122	sig->flags = SIGNAL_GROUP_EXIT;
				1123	zap_other_threads(current);
				1124	}
				1125	spin_unlock_irq(&sighand->siglock);
				1126	}
				1127
				1128	do_exit(exit_code);
				1129	/* NOTREACHED */
				1130	}
				1131
				1132	/*
				1133	* this kills every thread in the thread group. Note that any externally
				1134	* wait4()-ing process will get the correct exit code - even if this
				1135	* thread is not the thread group leader.
				1136	*/
				1137	SYSCALL_DEFINE1(exit_group, int, error_code)
				1138	{
				1139	do_group_exit((error_code & 0xff) << 8);
				1140	/* NOTREACHED */
				1141	return 0;
				1142	}
				1143
				1144	struct wait_opts {
				1145	enum pid_type wo_type;
				1146	int wo_flags;
				1147	struct pid *wo_pid;
				1148
				1149	struct siginfo __user *wo_info;
				1150	int __user *wo_stat;
				1151	struct rusage __user *wo_rusage;
				1152
				1153	wait_queue_t child_wait;
				1154	int notask_error;
				1155	};
				1156
				1157	static inline
				1158	struct pid task_pid_type(struct task_struct task, enum pid_type type)
				1159	{
				1160	if (type != PIDTYPE_PID)
				1161	task = task->group_leader;
				1162	return task->pids[type].pid;
				1163	}
				1164
				1165	static int eligible_pid(struct wait_opts wo, struct task_struct p)
				1166	{
				1167	return wo->wo_type == PIDTYPE_MAX \|\|
				1168	task_pid_type(p, wo->wo_type) == wo->wo_pid;
				1169	}
				1170
				1171	static int eligible_child(struct wait_opts wo, struct task_struct p)
				1172	{
				1173	if (!eligible_pid(wo, p))
				1174	return 0;
				1175	/* Wait for all children (clone and not) if __WALL is set;
				1176	* otherwise, wait for clone children only if __WCLONE is
				1177	* set; otherwise, wait for non-clone children only. (Note:
				1178	* A "clone" child here is one that reports to its parent
				1179	* using a signal other than SIGCHLD.) */
				1180	if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
				1181	&& !(wo->wo_flags & __WALL))
				1182	return 0;
				1183
				1184	return 1;
				1185	}
				1186
				1187	static int wait_noreap_copyout(struct wait_opts wo, struct task_struct p,
				1188	pid_t pid, uid_t uid, int why, int status)
				1189	{
				1190	struct siginfo __user *infop;
				1191	int retval = wo->wo_rusage
				1192	? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
				1193
				1194	put_task_struct(p);
				1195	infop = wo->wo_info;
				1196	if (infop) {
				1197	if (!retval)
				1198	retval = put_user(SIGCHLD, &infop->si_signo);
				1199	if (!retval)
				1200	retval = put_user(0, &infop->si_errno);
				1201	if (!retval)
				1202	retval = put_user((short)why, &infop->si_code);
				1203	if (!retval)
				1204	retval = put_user(pid, &infop->si_pid);
				1205	if (!retval)
				1206	retval = put_user(uid, &infop->si_uid);
				1207	if (!retval)
				1208	retval = put_user(status, &infop->si_status);
				1209	}
				1210	if (!retval)
				1211	retval = pid;
				1212	return retval;
				1213	}
				1214
				1215	/*
				1216	* Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
				1217	* read_lock(&tasklist_lock) on entry. If we return zero, we still hold
				1218	* the lock and this task is uninteresting. If we return nonzero, we have
				1219	* released the lock and the system call should return.
				1220	*/
				1221	static int wait_task_zombie(struct wait_opts wo, struct task_struct p)
				1222	{
				1223	unsigned long state;
				1224	int retval, status, traced;
				1225	pid_t pid = task_pid_vnr(p);
				1226	uid_t uid = __task_cred(p)->uid;
				1227	struct siginfo __user *infop;
				1228
				1229	if (!likely(wo->wo_flags & WEXITED))
				1230	return 0;
				1231
				1232	if (unlikely(wo->wo_flags & WNOWAIT)) {
				1233	int exit_code = p->exit_code;
				1234	int why;
				1235
				1236	get_task_struct(p);
				1237	read_unlock(&tasklist_lock);
				1238	if ((exit_code & 0x7f) == 0) {
				1239	why = CLD_EXITED;
				1240	status = exit_code >> 8;
				1241	} else {
				1242	why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
				1243	status = exit_code & 0x7f;
				1244	}
				1245	return wait_noreap_copyout(wo, p, pid, uid, why, status);
				1246	}
				1247
				1248	/*
				1249	* Try to move the task's state to DEAD
				1250	* only one thread is allowed to do this:
				1251	*/
				1252	state = xchg(&p->exit_state, EXIT_DEAD);
				1253	if (state != EXIT_ZOMBIE) {
				1254	BUG_ON(state != EXIT_DEAD);
				1255	return 0;
				1256	}
				1257
				1258	traced = ptrace_reparented(p);
				1259	/*
				1260	* It can be ptraced but not reparented, check
				1261	* thread_group_leader() to filter out sub-threads.
				1262	*/
				1263	if (likely(!traced) && thread_group_leader(p)) {
				1264	struct signal_struct *psig;
				1265	struct signal_struct *sig;
				1266	unsigned long maxrss;
				1267	cputime_t tgutime, tgstime;
				1268
				1269	/*
				1270	* The resource counters for the group leader are in its
				1271	* own task_struct. Those for dead threads in the group
				1272	* are in its signal_struct, as are those for the child
				1273	* processes it has previously reaped. All these
				1274	* accumulate in the parent's signal_struct c* fields.
				1275	*
				1276	* We don't bother to take a lock here to protect these
				1277	* p->signal fields, because they are only touched by
				1278	* __exit_signal, which runs with tasklist_lock
				1279	* write-locked anyway, and so is excluded here. We do
				1280	* need to protect the access to parent->signal fields,
				1281	* as other threads in the parent group can be right
				1282	* here reaping other children at the same time.
				1283	*
				1284	* We use thread_group_times() to get times for the thread
				1285	* group, which consolidates times for all threads in the
				1286	* group including the group leader.
				1287	*/
				1288	thread_group_times(p, &tgutime, &tgstime);
				1289	spin_lock_irq(&p->real_parent->sighand->siglock);
				1290	psig = p->real_parent->signal;
				1291	sig = p->signal;
				1292	psig->cutime += tgutime + sig->cutime;
				1293	psig->cstime += tgstime + sig->cstime;
				1294	psig->cgtime += p->gtime + sig->gtime + sig->cgtime;
				1295	psig->cmin_flt +=
				1296	p->min_flt + sig->min_flt + sig->cmin_flt;
				1297	psig->cmaj_flt +=
				1298	p->maj_flt + sig->maj_flt + sig->cmaj_flt;
				1299	psig->cnvcsw +=
				1300	p->nvcsw + sig->nvcsw + sig->cnvcsw;
				1301	psig->cnivcsw +=
				1302	p->nivcsw + sig->nivcsw + sig->cnivcsw;
				1303	psig->cinblock +=
				1304	task_io_get_inblock(p) +
				1305	sig->inblock + sig->cinblock;
				1306	psig->coublock +=
				1307	task_io_get_oublock(p) +
				1308	sig->oublock + sig->coublock;
				1309	maxrss = max(sig->maxrss, sig->cmaxrss);
				1310	if (psig->cmaxrss < maxrss)
				1311	psig->cmaxrss = maxrss;
				1312	task_io_accounting_add(&psig->ioac, &p->ioac);
				1313	task_io_accounting_add(&psig->ioac, &sig->ioac);
				1314	spin_unlock_irq(&p->real_parent->sighand->siglock);
				1315	}
				1316
				1317	/*
				1318	* Now we are sure this task is interesting, and no other
				1319	* thread can reap it because we set its state to EXIT_DEAD.
				1320	*/
				1321	read_unlock(&tasklist_lock);
				1322
				1323	retval = wo->wo_rusage
				1324	? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
				1325	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
				1326	? p->signal->group_exit_code : p->exit_code;
				1327	if (!retval && wo->wo_stat)
				1328	retval = put_user(status, wo->wo_stat);
				1329
				1330	infop = wo->wo_info;
				1331	if (!retval && infop)
				1332	retval = put_user(SIGCHLD, &infop->si_signo);
				1333	if (!retval && infop)
				1334	retval = put_user(0, &infop->si_errno);
				1335	if (!retval && infop) {
				1336	int why;
				1337
				1338	if ((status & 0x7f) == 0) {
				1339	why = CLD_EXITED;
				1340	status >>= 8;
				1341	} else {
				1342	why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
				1343	status &= 0x7f;
				1344	}
				1345	retval = put_user((short)why, &infop->si_code);
				1346	if (!retval)
				1347	retval = put_user(status, &infop->si_status);
				1348	}
				1349	if (!retval && infop)
				1350	retval = put_user(pid, &infop->si_pid);
				1351	if (!retval && infop)
				1352	retval = put_user(uid, &infop->si_uid);
				1353	if (!retval)
				1354	retval = pid;
				1355
				1356	if (traced) {
				1357	write_lock_irq(&tasklist_lock);
				1358	/* We dropped tasklist, ptracer could die and untrace */
				1359	ptrace_unlink(p);
				1360	/*
				1361	* If this is not a sub-thread, notify the parent.
				1362	* If parent wants a zombie, don't release it now.
				1363	*/
				1364	if (thread_group_leader(p) &&
				1365	!do_notify_parent(p, p->exit_signal)) {
				1366	p->exit_state = EXIT_ZOMBIE;
				1367	p = NULL;
				1368	}
				1369	write_unlock_irq(&tasklist_lock);
				1370	}
				1371	if (p != NULL)
				1372	release_task(p);
				1373
				1374	return retval;
				1375	}
				1376
				1377	static int task_stopped_code(struct task_struct p, bool ptrace)
				1378	{
				1379	if (ptrace) {
				1380	if (task_is_stopped_or_traced(p) &&
				1381	!(p->jobctl & JOBCTL_LISTENING))
				1382	return &p->exit_code;
				1383	} else {
				1384	if (p->signal->flags & SIGNAL_STOP_STOPPED)
				1385	return &p->signal->group_exit_code;
				1386	}
				1387	return NULL;
				1388	}
				1389
				1390	/**
				1391	* wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
				1392	* @wo: wait options
				1393	* @ptrace: is the wait for ptrace
				1394	* @p: task to wait for
				1395	*
				1396	* Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
				1397	*
				1398	* CONTEXT:
				1399	* read_lock(&tasklist_lock), which is released if return value is
				1400	* non-zero. Also, grabs and releases @p->sighand->siglock.
				1401	*
				1402	* RETURNS:
				1403	* 0 if wait condition didn't exist and search for other wait conditions
				1404	* should continue. Non-zero return, -errno on failure and @p's pid on
				1405	* success, implies that tasklist_lock is released and wait condition
				1406	* search should terminate.
				1407	*/
				1408	static int wait_task_stopped(struct wait_opts *wo,
				1409	int ptrace, struct task_struct *p)
				1410	{
				1411	struct siginfo __user *infop;
				1412	int retval, exit_code, *p_code, why;
				1413	uid_t uid = 0; /* unneeded, required by compiler */
				1414	pid_t pid;
				1415
				1416	/*
				1417	* Traditionally we see ptrace'd stopped tasks regardless of options.
				1418	*/
				1419	if (!ptrace && !(wo->wo_flags & WUNTRACED))
				1420	return 0;
				1421
				1422	if (!task_stopped_code(p, ptrace))
				1423	return 0;
				1424
				1425	exit_code = 0;
				1426	spin_lock_irq(&p->sighand->siglock);
				1427
				1428	p_code = task_stopped_code(p, ptrace);
				1429	if (unlikely(!p_code))
				1430	goto unlock_sig;
				1431
				1432	exit_code = *p_code;
				1433	if (!exit_code)
				1434	goto unlock_sig;
				1435
				1436	if (!unlikely(wo->wo_flags & WNOWAIT))
				1437	*p_code = 0;
				1438
				1439	uid = task_uid(p);
				1440	unlock_sig:
				1441	spin_unlock_irq(&p->sighand->siglock);
				1442	if (!exit_code)
				1443	return 0;
				1444
				1445	/*
				1446	* Now we are pretty sure this task is interesting.
				1447	* Make sure it doesn't get reaped out from under us while we
				1448	* give up the lock and then examine it below. We don't want to
				1449	* keep holding onto the tasklist_lock while we call getrusage and
				1450	* possibly take page faults for user memory.
				1451	*/
				1452	get_task_struct(p);
				1453	pid = task_pid_vnr(p);
				1454	why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
				1455	read_unlock(&tasklist_lock);
				1456
				1457	if (unlikely(wo->wo_flags & WNOWAIT))
				1458	return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
				1459
				1460	retval = wo->wo_rusage
				1461	? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
				1462	if (!retval && wo->wo_stat)
				1463	retval = put_user((exit_code << 8) \| 0x7f, wo->wo_stat);
				1464
				1465	infop = wo->wo_info;
				1466	if (!retval && infop)
				1467	retval = put_user(SIGCHLD, &infop->si_signo);
				1468	if (!retval && infop)
				1469	retval = put_user(0, &infop->si_errno);
				1470	if (!retval && infop)
				1471	retval = put_user((short)why, &infop->si_code);
				1472	if (!retval && infop)
				1473	retval = put_user(exit_code, &infop->si_status);
				1474	if (!retval && infop)
				1475	retval = put_user(pid, &infop->si_pid);
				1476	if (!retval && infop)
				1477	retval = put_user(uid, &infop->si_uid);
				1478	if (!retval)
				1479	retval = pid;
				1480	put_task_struct(p);
				1481
				1482	BUG_ON(!retval);
				1483	return retval;
				1484	}
				1485
				1486	/*
				1487	* Handle do_wait work for one task in a live, non-stopped state.
				1488	* read_lock(&tasklist_lock) on entry. If we return zero, we still hold
				1489	* the lock and this task is uninteresting. If we return nonzero, we have
				1490	* released the lock and the system call should return.
				1491	*/
				1492	static int wait_task_continued(struct wait_opts wo, struct task_struct p)
				1493	{
				1494	int retval;
				1495	pid_t pid;
				1496	uid_t uid;
				1497
				1498	if (!unlikely(wo->wo_flags & WCONTINUED))
				1499	return 0;
				1500
				1501	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
				1502	return 0;
				1503
				1504	spin_lock_irq(&p->sighand->siglock);
				1505	/* Re-check with the lock held. */
				1506	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
				1507	spin_unlock_irq(&p->sighand->siglock);
				1508	return 0;
				1509	}
				1510	if (!unlikely(wo->wo_flags & WNOWAIT))
				1511	p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
				1512	uid = task_uid(p);
				1513	spin_unlock_irq(&p->sighand->siglock);
				1514
				1515	pid = task_pid_vnr(p);
				1516	get_task_struct(p);
				1517	read_unlock(&tasklist_lock);
				1518
				1519	if (!wo->wo_info) {
				1520	retval = wo->wo_rusage
				1521	? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
				1522	put_task_struct(p);
				1523	if (!retval && wo->wo_stat)
				1524	retval = put_user(0xffff, wo->wo_stat);
				1525	if (!retval)
				1526	retval = pid;
				1527	} else {
				1528	retval = wait_noreap_copyout(wo, p, pid, uid,
				1529	CLD_CONTINUED, SIGCONT);
				1530	BUG_ON(retval == 0);
				1531	}
				1532
				1533	return retval;
				1534	}
				1535
				1536	/*
				1537	* Consider @p for a wait by @parent.
				1538	*
				1539	* -ECHILD should be in ->notask_error before the first call.
				1540	* Returns nonzero for a final return, when we have unlocked tasklist_lock.
				1541	* Returns zero if the search for a child should continue;
				1542	* then ->notask_error is 0 if @p is an eligible child,
				1543	* or another error from security_task_wait(), or still -ECHILD.
				1544	*/
				1545	static int wait_consider_task(struct wait_opts *wo, int ptrace,
				1546	struct task_struct *p)
				1547	{
				1548	int ret = eligible_child(wo, p);
				1549	if (!ret)
				1550	return ret;
				1551
				1552	ret = security_task_wait(p);
				1553	if (unlikely(ret < 0)) {
				1554	/*
				1555	* If we have not yet seen any eligible child,
				1556	* then let this error code replace -ECHILD.
				1557	* A permission error will give the user a clue
				1558	* to look for security policy problems, rather
				1559	* than for mysterious wait bugs.
				1560	*/
				1561	if (wo->notask_error)
				1562	wo->notask_error = ret;
				1563	return 0;
				1564	}
				1565
				1566	/* dead body doesn't have much to contribute */
				1567	if (unlikely(p->exit_state == EXIT_DEAD)) {
				1568	/*
				1569	* But do not ignore this task until the tracer does
				1570	* wait_task_zombie()->do_notify_parent().
				1571	*/
				1572	if (likely(!ptrace) && unlikely(ptrace_reparented(p)))
				1573	wo->notask_error = 0;
				1574	return 0;
				1575	}
				1576
				1577	/* slay zombie? */
				1578	if (p->exit_state == EXIT_ZOMBIE) {
				1579	/*
				1580	* A zombie ptracee is only visible to its ptracer.
				1581	* Notification and reaping will be cascaded to the real
				1582	* parent when the ptracer detaches.
				1583	*/
				1584	if (likely(!ptrace) && unlikely(p->ptrace)) {
				1585	/* it will become visible, clear notask_error */
				1586	wo->notask_error = 0;
				1587	return 0;
				1588	}
				1589
				1590	/* we don't reap group leaders with subthreads */
				1591	if (!delay_group_leader(p))
				1592	return wait_task_zombie(wo, p);
				1593
				1594	/*
				1595	* Allow access to stopped/continued state via zombie by
				1596	* falling through. Clearing of notask_error is complex.
				1597	*
				1598	* When !@ptrace:
				1599	*
				1600	* If WEXITED is set, notask_error should naturally be
				1601	* cleared. If not, subset of WSTOPPED\|WCONTINUED is set,
				1602	* so, if there are live subthreads, there are events to
				1603	* wait for. If all subthreads are dead, it's still safe
				1604	* to clear - this function will be called again in finite
				1605	* amount time once all the subthreads are released and
				1606	* will then return without clearing.
				1607	*
				1608	* When @ptrace:
				1609	*
				1610	* Stopped state is per-task and thus can't change once the
				1611	* target task dies. Only continued and exited can happen.
				1612	* Clear notask_error if WCONTINUED \| WEXITED.
				1613	*/
				1614	if (likely(!ptrace) \|\| (wo->wo_flags & (WCONTINUED \| WEXITED)))
				1615	wo->notask_error = 0;
				1616	} else {
				1617	/*
				1618	* If @p is ptraced by a task in its real parent's group,
				1619	* hide group stop/continued state when looking at @p as
				1620	* the real parent; otherwise, a single stop can be
				1621	* reported twice as group and ptrace stops.
				1622	*
				1623	* If a ptracer wants to distinguish the two events for its
				1624	* own children, it should create a separate process which
				1625	* takes the role of real parent.
				1626	*/
				1627	if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p))
				1628	return 0;
				1629
				1630	/*
				1631	* @p is alive and it's gonna stop, continue or exit, so
				1632	* there always is something to wait for.
				1633	*/
				1634	wo->notask_error = 0;
				1635	}
				1636
				1637	/*
				1638	* Wait for stopped. Depending on @ptrace, different stopped state
				1639	* is used and the two don't interact with each other.
				1640	*/
				1641	ret = wait_task_stopped(wo, ptrace, p);
				1642	if (ret)
				1643	return ret;
				1644
				1645	/*
				1646	* Wait for continued. There's only one continued state and the
				1647	* ptracer can consume it which can confuse the real parent. Don't
				1648	* use WCONTINUED from ptracer. You don't need or want it.
				1649	*/
				1650	return wait_task_continued(wo, p);
				1651	}
				1652
				1653	/*
				1654	* Do the work of do_wait() for one thread in the group, @tsk.
				1655	*
				1656	* -ECHILD should be in ->notask_error before the first call.
				1657	* Returns nonzero for a final return, when we have unlocked tasklist_lock.
				1658	* Returns zero if the search for a child should continue; then
				1659	* ->notask_error is 0 if there were any eligible children,
				1660	* or another error from security_task_wait(), or still -ECHILD.
				1661	*/
				1662	static int do_wait_thread(struct wait_opts wo, struct task_struct tsk)
				1663	{
				1664	struct task_struct *p;
				1665
				1666	list_for_each_entry(p, &tsk->children, sibling) {
				1667	int ret = wait_consider_task(wo, 0, p);
				1668	if (ret)
				1669	return ret;
				1670	}
				1671
				1672	return 0;
				1673	}
				1674
				1675	static int ptrace_do_wait(struct wait_opts wo, struct task_struct tsk)
				1676	{
				1677	struct task_struct *p;
				1678
				1679	list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
				1680	int ret = wait_consider_task(wo, 1, p);
				1681	if (ret)
				1682	return ret;
				1683	}
				1684
				1685	return 0;
				1686	}
				1687
				1688	static int child_wait_callback(wait_queue_t *wait, unsigned mode,
				1689	int sync, void *key)
				1690	{
				1691	struct wait_opts *wo = container_of(wait, struct wait_opts,
				1692	child_wait);
				1693	struct task_struct *p = key;
				1694
				1695	if (!eligible_pid(wo, p))
				1696	return 0;
				1697
				1698	if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
				1699	return 0;
				1700
				1701	return default_wake_function(wait, mode, sync, key);
				1702	}
				1703
				1704	void __wake_up_parent(struct task_struct p, struct task_struct parent)
				1705	{
				1706	__wake_up_sync_key(&parent->signal->wait_chldexit,
				1707	TASK_INTERRUPTIBLE, 1, p);
				1708	}
				1709
				1710	static long do_wait(struct wait_opts *wo)
				1711	{
				1712	struct task_struct *tsk;
				1713	int retval;
				1714
				1715	trace_sched_process_wait(wo->wo_pid);
				1716
				1717	init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
				1718	wo->child_wait.private = current;
				1719	add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
				1720	repeat:
				1721	/*
				1722	* If there is nothing that can match our critiera just get out.
				1723	* We will clear ->notask_error to zero if we see any child that
				1724	* might later match our criteria, even if we are not able to reap
				1725	* it yet.
				1726	*/
				1727	wo->notask_error = -ECHILD;
				1728	if ((wo->wo_type < PIDTYPE_MAX) &&
				1729	(!wo->wo_pid \|\| hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
				1730	goto notask;
				1731
				1732	set_current_state(TASK_INTERRUPTIBLE);
				1733	read_lock(&tasklist_lock);
				1734	tsk = current;
				1735	do {
				1736	retval = do_wait_thread(wo, tsk);
				1737	if (retval)
				1738	goto end;
				1739
				1740	retval = ptrace_do_wait(wo, tsk);
				1741	if (retval)
				1742	goto end;
				1743
				1744	if (wo->wo_flags & __WNOTHREAD)
				1745	break;
				1746	} while_each_thread(current, tsk);
				1747	read_unlock(&tasklist_lock);
				1748
				1749	notask:
				1750	retval = wo->notask_error;
				1751	if (!retval && !(wo->wo_flags & WNOHANG)) {
				1752	retval = -ERESTARTSYS;
				1753	if (!signal_pending(current)) {
				1754	schedule();
				1755	goto repeat;
				1756	}
				1757	}
				1758	end:
				1759	__set_current_state(TASK_RUNNING);
				1760	remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
				1761	return retval;
				1762	}
				1763
				1764	SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
				1765	infop, int, options, struct rusage __user *, ru)
				1766	{
				1767	struct wait_opts wo;
				1768	struct pid *pid = NULL;
				1769	enum pid_type type;
				1770	long ret;
				1771
				1772	if (options & ~(WNOHANG\|WNOWAIT\|WEXITED\|WSTOPPED\|WCONTINUED))
				1773	return -EINVAL;
				1774	if (!(options & (WEXITED\|WSTOPPED\|WCONTINUED)))
				1775	return -EINVAL;
				1776
				1777	switch (which) {
				1778	case P_ALL:
				1779	type = PIDTYPE_MAX;
				1780	break;
				1781	case P_PID:
				1782	type = PIDTYPE_PID;
				1783	if (upid <= 0)
				1784	return -EINVAL;
				1785	break;
				1786	case P_PGID:
				1787	type = PIDTYPE_PGID;
				1788	if (upid <= 0)
				1789	return -EINVAL;
				1790	break;
				1791	default:
				1792	return -EINVAL;
				1793	}
				1794
				1795	if (type < PIDTYPE_MAX)
				1796	pid = find_get_pid(upid);
				1797
				1798	wo.wo_type = type;
				1799	wo.wo_pid = pid;
				1800	wo.wo_flags = options;
				1801	wo.wo_info = infop;
				1802	wo.wo_stat = NULL;
				1803	wo.wo_rusage = ru;
				1804	ret = do_wait(&wo);
				1805
				1806	if (ret > 0) {
				1807	ret = 0;
				1808	} else if (infop) {
				1809	/*
				1810	* For a WNOHANG return, clear out all the fields
				1811	* we would set so the user can easily tell the
				1812	* difference.
				1813	*/
				1814	if (!ret)
				1815	ret = put_user(0, &infop->si_signo);
				1816	if (!ret)
				1817	ret = put_user(0, &infop->si_errno);
				1818	if (!ret)
				1819	ret = put_user(0, &infop->si_code);
				1820	if (!ret)
				1821	ret = put_user(0, &infop->si_pid);
				1822	if (!ret)
				1823	ret = put_user(0, &infop->si_uid);
				1824	if (!ret)
				1825	ret = put_user(0, &infop->si_status);
				1826	}
				1827
				1828	put_pid(pid);
				1829
				1830	/* avoid REGPARM breakage on x86: */
				1831	asmlinkage_protect(5, ret, which, upid, infop, options, ru);
				1832	return ret;
				1833	}
				1834
				1835	SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
				1836	int, options, struct rusage __user *, ru)
				1837	{
				1838	struct wait_opts wo;
				1839	struct pid *pid = NULL;
				1840	enum pid_type type;
				1841	long ret;
				1842
				1843	if (options & ~(WNOHANG\|WUNTRACED\|WCONTINUED\|
				1844	__WNOTHREAD\|__WCLONE\|__WALL))
				1845	return -EINVAL;
				1846
				1847	if (upid == -1)
				1848	type = PIDTYPE_MAX;
				1849	else if (upid < 0) {
				1850	type = PIDTYPE_PGID;
				1851	pid = find_get_pid(-upid);
				1852	} else if (upid == 0) {
				1853	type = PIDTYPE_PGID;
				1854	pid = get_task_pid(current, PIDTYPE_PGID);
				1855	} else /* upid > 0 */ {
				1856	type = PIDTYPE_PID;
				1857	pid = find_get_pid(upid);
				1858	}
				1859
				1860	wo.wo_type = type;
				1861	wo.wo_pid = pid;
				1862	wo.wo_flags = options \| WEXITED;
				1863	wo.wo_info = NULL;
				1864	wo.wo_stat = stat_addr;
				1865	wo.wo_rusage = ru;
				1866	ret = do_wait(&wo);
				1867	put_pid(pid);
				1868
				1869	/* avoid REGPARM breakage on x86: */
				1870	asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
				1871	return ret;
				1872	}
				1873
				1874	#ifdef __ARCH_WANT_SYS_WAITPID
				1875
				1876	/*
				1877	* sys_waitpid() remains for compatibility. waitpid() should be
				1878	* implemented by calling sys_wait4() from libc.a.
				1879	*/
				1880	SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
				1881	{
				1882	return sys_wait4(pid, stat_addr, options, NULL);
				1883	}
				1884
				1885	#endif