Blame - ap/os/linux/linux-3.4.x/fs/exec.c - T106_DC

blob: 7200df02db208c926246d404eecdf21e63cd00c9 [file] [log] [blame]

lh	9ed821d	2023-04-07 01:36:19 -0700	[diff] [blame]	1	/*
				2	* linux/fs/exec.c
				3	*
				4	* Copyright (C) 1991, 1992 Linus Torvalds
				5	*/
				6
				7	/*
				8	* #!-checking implemented by tytso.
				9	*/
				10	/*
				11	* Demand-loading implemented 01.12.91 - no need to read anything but
				12	* the header into memory. The inode of the executable is put into
				13	* "current->executable", and page faults do the actual loading. Clean.
				14	*
				15	* Once more I can proudly say that linux stood up to being changed: it
				16	* was less than 2 hours work to get demand-loading completely implemented.
				17	*
				18	* Demand loading changed July 1993 by Eric Youngdale. Use mmap instead,
				19	* current->executable is only used by the procfs. This allows a dispatch
				20	* table to check for several different types of binary formats. We keep
				21	* trying until we recognize the file or we run out of supported binary
				22	* formats.
				23	*/
				24
				25	#include <linux/slab.h>
				26	#include <linux/file.h>
				27	#include <linux/fdtable.h>
				28	#include <linux/mm.h>
				29	#include <linux/stat.h>
				30	#include <linux/fcntl.h>
				31	#include <linux/swap.h>
				32	#include <linux/string.h>
				33	#include <linux/init.h>
				34	#include <linux/pagemap.h>
				35	#include <linux/perf_event.h>
				36	#include <linux/highmem.h>
				37	#include <linux/spinlock.h>
				38	#include <linux/key.h>
				39	#include <linux/personality.h>
				40	#include <linux/binfmts.h>
				41	#include <linux/utsname.h>
				42	#include <linux/pid_namespace.h>
				43	#include <linux/module.h>
				44	#include <linux/namei.h>
				45	#include <linux/mount.h>
				46	#include <linux/security.h>
				47	#include <linux/syscalls.h>
				48	#include <linux/tsacct_kern.h>
				49	#include <linux/cn_proc.h>
				50	#include <linux/audit.h>
				51	#include <linux/tracehook.h>
				52	#include <linux/kmod.h>
				53	#include <linux/fsnotify.h>
				54	#include <linux/fs_struct.h>
				55	#include <linux/pipe_fs_i.h>
				56	#include <linux/oom.h>
				57	#include <linux/compat.h>
				58
				59	#include <asm/uaccess.h>
				60	#include <asm/mmu_context.h>
				61	#include <asm/tlb.h>
				62	#include <asm/exec.h>
				63
				64	#include <trace/events/task.h>
				65	#include "internal.h"
				66
				67	#include <trace/events/sched.h>
				68
				69	int core_uses_pid;
				70	char core_pattern[CORENAME_MAX_SIZE] = "core";
				71	unsigned int core_pipe_limit;
				72	int suid_dumpable = 0;
				73
				74	struct core_name {
				75	char *corename;
				76	int used, size;
				77	};
				78	static atomic_t call_count = ATOMIC_INIT(1);
				79
				80	/* The maximal length of core_pattern is also specified in sysctl.c */
				81
				82	static LIST_HEAD(formats);
				83	static DEFINE_RWLOCK(binfmt_lock);
				84
				85	void __register_binfmt(struct linux_binfmt * fmt, int insert)
				86	{
				87	BUG_ON(!fmt);
				88	write_lock(&binfmt_lock);
				89	insert ? list_add(&fmt->lh, &formats) :
				90	list_add_tail(&fmt->lh, &formats);
				91	write_unlock(&binfmt_lock);
				92	}
				93
				94	EXPORT_SYMBOL(__register_binfmt);
				95
				96	void unregister_binfmt(struct linux_binfmt * fmt)
				97	{
				98	write_lock(&binfmt_lock);
				99	list_del(&fmt->lh);
				100	write_unlock(&binfmt_lock);
				101	}
				102
				103	EXPORT_SYMBOL(unregister_binfmt);
				104
				105	static inline void put_binfmt(struct linux_binfmt * fmt)
				106	{
				107	module_put(fmt->module);
				108	}
				109
				110	/*
				111	* Note that a shared library must be both readable and executable due to
				112	* security reasons.
				113	*
				114	* Also note that we take the address to load from from the file itself.
				115	*/
				116	SYSCALL_DEFINE1(uselib, const char __user *, library)
				117	{
				118	struct file *file;
				119	char *tmp = getname(library);
				120	int error = PTR_ERR(tmp);
				121	static const struct open_flags uselib_flags = {
				122	.open_flag = O_LARGEFILE \| O_RDONLY \| __FMODE_EXEC,
				123	.acc_mode = MAY_READ \| MAY_EXEC \| MAY_OPEN,
				124	.intent = LOOKUP_OPEN
				125	};
				126
				127	if (IS_ERR(tmp))
				128	goto out;
				129
				130	file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW);
				131	putname(tmp);
				132	error = PTR_ERR(file);
				133	if (IS_ERR(file))
				134	goto out;
				135
				136	error = -EINVAL;
				137	if (!S_ISREG(file->f_path.dentry->d_inode->i_mode))
				138	goto exit;
				139
				140	error = -EACCES;
				141	if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
				142	goto exit;
				143
				144	fsnotify_open(file);
				145
				146	error = -ENOEXEC;
				147	if(file->f_op) {
				148	struct linux_binfmt * fmt;
				149
				150	read_lock(&binfmt_lock);
				151	list_for_each_entry(fmt, &formats, lh) {
				152	if (!fmt->load_shlib)
				153	continue;
				154	if (!try_module_get(fmt->module))
				155	continue;
				156	read_unlock(&binfmt_lock);
				157	error = fmt->load_shlib(file);
				158	read_lock(&binfmt_lock);
				159	put_binfmt(fmt);
				160	if (error != -ENOEXEC)
				161	break;
				162	}
				163	read_unlock(&binfmt_lock);
				164	}
				165	exit:
				166	fput(file);
				167	out:
				168	return error;
				169	}
				170
				171	#ifdef CONFIG_MMU
				172	/*
				173	* The nascent bprm->mm is not visible until exec_mmap() but it can
				174	* use a lot of memory, account these pages in current->mm temporary
				175	* for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
				176	* change the counter back via acct_arg_size(0).
				177	*/
				178	static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
				179	{
				180	struct mm_struct *mm = current->mm;
				181	long diff = (long)(pages - bprm->vma_pages);
				182
				183	if (!mm \|\| !diff)
				184	return;
				185
				186	bprm->vma_pages = pages;
				187	add_mm_counter(mm, MM_ANONPAGES, diff);
				188	}
				189
				190	static struct page get_arg_page(struct linux_binprm bprm, unsigned long pos,
				191	int write)
				192	{
				193	struct page *page;
				194	int ret;
				195
				196	#ifdef CONFIG_STACK_GROWSUP
				197	if (write) {
				198	ret = expand_downwards(bprm->vma, pos);
				199	if (ret < 0)
				200	return NULL;
				201	}
				202	#endif
				203	ret = get_user_pages(current, bprm->mm, pos,
				204	1, write, 1, &page, NULL);
				205	if (ret <= 0)
				206	return NULL;
				207
				208	if (write) {
				209	unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
				210	unsigned long ptr_size;
				211	struct rlimit *rlim;
				212	/fix for HUB CVE-2017-1000365/
				213	/*
				214	* Since the stack will hold pointers to the strings, we
				215	* must account for them as well.
				216	*
				217	* The size calculation is the entire vma while each arg page is
				218	* built, so each time we get here it's calculating how far it
				219	* is currently (rather than each call being just the newly
				220	* added size from the arg page). As a result, we need to
				221	* always add the entire size of the pointers, so that on the
				222	* last call to get_arg_page() we'll actually have the entire
				223	* correct size.
				224	*/
				225	ptr_size = (bprm->argc + bprm->envc) * sizeof(void *);
				226	if (ptr_size > ULONG_MAX - size)
				227	goto fail;
				228	size += ptr_size;
				229
				230	acct_arg_size(bprm, size / PAGE_SIZE);
				231
				232	/*
				233	* We've historically supported up to 32 pages (ARG_MAX)
				234	* of argument strings even with small stacks
				235	*/
				236	if (size <= ARG_MAX)
				237	return page;
				238
				239	/*
				240	* Limit to 1/4-th the stack size for the argv+env strings.
				241	* This ensures that:
				242	* - the remaining binfmt code will not run out of stack space,
				243	* - the program will have a reasonable amount of stack left
				244	* to work from.
				245	*/
				246	rlim = current->signal->rlim;
				247	if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4)
				248	goto fail;
				249	}
				250
				251	return page;
				252
				253	fail:
				254	put_page(page);
				255	return NULL;
				256	}
				257
				258	static void put_arg_page(struct page *page)
				259	{
				260	put_page(page);
				261	}
				262
				263	static void free_arg_page(struct linux_binprm *bprm, int i)
				264	{
				265	}
				266
				267	static void free_arg_pages(struct linux_binprm *bprm)
				268	{
				269	}
				270
				271	static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
				272	struct page *page)
				273	{
				274	flush_cache_page(bprm->vma, pos, page_to_pfn(page));
				275	}
				276
				277	static int __bprm_mm_init(struct linux_binprm *bprm)
				278	{
				279	int err;
				280	struct vm_area_struct *vma = NULL;
				281	struct mm_struct *mm = bprm->mm;
				282
				283	bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
				284	if (!vma)
				285	return -ENOMEM;
				286
				287	down_write(&mm->mmap_sem);
				288	vma->vm_mm = mm;
				289
				290	/*
				291	* Place the stack at the largest stack address the architecture
				292	* supports. Later, we'll move this to an appropriate place. We don't
				293	* use STACK_TOP because that can depend on attributes which aren't
				294	* configured yet.
				295	*/
				296	BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
				297	vma->vm_end = STACK_TOP_MAX;
				298	vma->vm_start = vma->vm_end - PAGE_SIZE;
				299	vma->vm_flags = VM_STACK_FLAGS \| VM_STACK_INCOMPLETE_SETUP;
				300	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
				301	INIT_LIST_HEAD(&vma->anon_vma_chain);
				302
				303	err = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
				304	if (err)
				305	goto err;
				306
				307	err = insert_vm_struct(mm, vma);
				308	if (err)
				309	goto err;
				310
				311	mm->stack_vm = mm->total_vm = 1;
				312	up_write(&mm->mmap_sem);
				313	bprm->p = vma->vm_end - sizeof(void *);
				314	return 0;
				315	err:
				316	up_write(&mm->mmap_sem);
				317	bprm->vma = NULL;
				318	kmem_cache_free(vm_area_cachep, vma);
				319	return err;
				320	}
				321
				322	static bool valid_arg_len(struct linux_binprm *bprm, long len)
				323	{
				324	return len <= MAX_ARG_STRLEN;
				325	}
				326
				327	#else
				328
				329	static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
				330	{
				331	}
				332
				333	static struct page get_arg_page(struct linux_binprm bprm, unsigned long pos,
				334	int write)
				335	{
				336	struct page *page;
				337
				338	page = bprm->page[pos / PAGE_SIZE];
				339	if (!page && write) {
				340	page = alloc_page(GFP_HIGHUSER\|__GFP_ZERO);
				341	if (!page)
				342	return NULL;
				343	bprm->page[pos / PAGE_SIZE] = page;
				344	}
				345
				346	return page;
				347	}
				348
				349	static void put_arg_page(struct page *page)
				350	{
				351	}
				352
				353	static void free_arg_page(struct linux_binprm *bprm, int i)
				354	{
				355	if (bprm->page[i]) {
				356	__free_page(bprm->page[i]);
				357	bprm->page[i] = NULL;
				358	}
				359	}
				360
				361	static void free_arg_pages(struct linux_binprm *bprm)
				362	{
				363	int i;
				364
				365	for (i = 0; i < MAX_ARG_PAGES; i++)
				366	free_arg_page(bprm, i);
				367	}
				368
				369	static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
				370	struct page *page)
				371	{
				372	}
				373
				374	static int __bprm_mm_init(struct linux_binprm *bprm)
				375	{
				376	bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
				377	return 0;
				378	}
				379
				380	static bool valid_arg_len(struct linux_binprm *bprm, long len)
				381	{
				382	return len <= bprm->p;
				383	}
				384
				385	#endif /* CONFIG_MMU */
				386
				387	/*
				388	* Create a new mm_struct and populate it with a temporary stack
				389	* vm_area_struct. We don't have enough context at this point to set the stack
				390	* flags, permissions, and offset, so we use temporary values. We'll update
				391	* them later in setup_arg_pages().
				392	*/
				393	int bprm_mm_init(struct linux_binprm *bprm)
				394	{
				395	int err;
				396	struct mm_struct *mm = NULL;
				397
				398	bprm->mm = mm = mm_alloc();
				399	err = -ENOMEM;
				400	if (!mm)
				401	goto err;
				402
				403	err = init_new_context(current, mm);
				404	if (err)
				405	goto err;
				406
				407	err = __bprm_mm_init(bprm);
				408	if (err)
				409	goto err;
				410
				411	return 0;
				412
				413	err:
				414	if (mm) {
				415	bprm->mm = NULL;
				416	mmdrop(mm);
				417	}
				418
				419	return err;
				420	}
				421
				422	struct user_arg_ptr {
				423	#ifdef CONFIG_COMPAT
				424	bool is_compat;
				425	#endif
				426	union {
				427	const char __user const __user native;
				428	#ifdef CONFIG_COMPAT
				429	compat_uptr_t __user *compat;
				430	#endif
				431	} ptr;
				432	};
				433
				434	static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
				435	{
				436	const char __user *native;
				437
				438	#ifdef CONFIG_COMPAT
				439	if (unlikely(argv.is_compat)) {
				440	compat_uptr_t compat;
				441
				442	if (get_user(compat, argv.ptr.compat + nr))
				443	return ERR_PTR(-EFAULT);
				444
				445	return compat_ptr(compat);
				446	}
				447	#endif
				448
				449	if (get_user(native, argv.ptr.native + nr))
				450	return ERR_PTR(-EFAULT);
				451
				452	return native;
				453	}
				454
				455	/*
				456	* count() counts the number of strings in array ARGV.
				457	*/
				458	static int count(struct user_arg_ptr argv, int max)
				459	{
				460	int i = 0;
				461
				462	if (argv.ptr.native != NULL) {
				463	for (;;) {
				464	const char __user *p = get_user_arg_ptr(argv, i);
				465
				466	if (!p)
				467	break;
				468
				469	if (IS_ERR(p))
				470	return -EFAULT;
				471
				472	if (i++ >= max)
				473	return -E2BIG;
				474
				475	if (fatal_signal_pending(current))
				476	return -ERESTARTNOHAND;
				477	cond_resched();
				478	}
				479	}
				480	return i;
				481	}
				482
				483	/*
				484	* 'copy_strings()' copies argument/environment strings from the old
				485	* processes's memory to the new process's stack. The call to get_user_pages()
				486	* ensures the destination page is created and not swapped out.
				487	*/
				488	static int copy_strings(int argc, struct user_arg_ptr argv,
				489	struct linux_binprm *bprm)
				490	{
				491	struct page *kmapped_page = NULL;
				492	char *kaddr = NULL;
				493	unsigned long kpos = 0;
				494	int ret;
				495
				496	while (argc-- > 0) {
				497	const char __user *str;
				498	int len;
				499	unsigned long pos;
				500
				501	ret = -EFAULT;
				502	str = get_user_arg_ptr(argv, argc);
				503	if (IS_ERR(str))
				504	goto out;
				505
				506	len = strnlen_user(str, MAX_ARG_STRLEN);
				507	if (!len)
				508	goto out;
				509
				510	ret = -E2BIG;
				511	if (!valid_arg_len(bprm, len))
				512	goto out;
				513
				514	/* We're going to work our way backwords. */
				515	pos = bprm->p;
				516	str += len;
				517	bprm->p -= len;
				518
				519	while (len > 0) {
				520	int offset, bytes_to_copy;
				521
				522	if (fatal_signal_pending(current)) {
				523	ret = -ERESTARTNOHAND;
				524	goto out;
				525	}
				526	cond_resched();
				527
				528	offset = pos % PAGE_SIZE;
				529	if (offset == 0)
				530	offset = PAGE_SIZE;
				531
				532	bytes_to_copy = offset;
				533	if (bytes_to_copy > len)
				534	bytes_to_copy = len;
				535
				536	offset -= bytes_to_copy;
				537	pos -= bytes_to_copy;
				538	str -= bytes_to_copy;
				539	len -= bytes_to_copy;
				540
				541	if (!kmapped_page \|\| kpos != (pos & PAGE_MASK)) {
				542	struct page *page;
				543
				544	page = get_arg_page(bprm, pos, 1);
				545	if (!page) {
				546	ret = -E2BIG;
				547	goto out;
				548	}
				549
				550	if (kmapped_page) {
				551	flush_kernel_dcache_page(kmapped_page);
				552	kunmap(kmapped_page);
				553	put_arg_page(kmapped_page);
				554	}
				555	kmapped_page = page;
				556	kaddr = kmap(kmapped_page);
				557	kpos = pos & PAGE_MASK;
				558	flush_arg_page(bprm, kpos, kmapped_page);
				559	}
				560	if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
				561	ret = -EFAULT;
				562	goto out;
				563	}
				564	}
				565	}
				566	ret = 0;
				567	out:
				568	if (kmapped_page) {
				569	flush_kernel_dcache_page(kmapped_page);
				570	kunmap(kmapped_page);
				571	put_arg_page(kmapped_page);
				572	}
				573	return ret;
				574	}
				575
				576	/*
				577	* Like copy_strings, but get argv and its values from kernel memory.
				578	*/
				579	int copy_strings_kernel(int argc, const char const __argv,
				580	struct linux_binprm *bprm)
				581	{
				582	int r;
				583	mm_segment_t oldfs = get_fs();
				584	struct user_arg_ptr argv = {
				585	.ptr.native = (const char __user const __user )__argv,
				586	};
				587
				588	set_fs(KERNEL_DS);
				589	r = copy_strings(argc, argv, bprm);
				590	set_fs(oldfs);
				591
				592	return r;
				593	}
				594	EXPORT_SYMBOL(copy_strings_kernel);
				595
				596	#ifdef CONFIG_MMU
				597
				598	/*
				599	* During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once
				600	* the binfmt code determines where the new stack should reside, we shift it to
				601	* its final location. The process proceeds as follows:
				602	*
				603	* 1) Use shift to calculate the new vma endpoints.
				604	* 2) Extend vma to cover both the old and new ranges. This ensures the
				605	* arguments passed to subsequent functions are consistent.
				606	* 3) Move vma's page tables to the new range.
				607	* 4) Free up any cleared pgd range.
				608	* 5) Shrink the vma to cover only the new range.
				609	*/
				610	static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
				611	{
				612	struct mm_struct *mm = vma->vm_mm;
				613	unsigned long old_start = vma->vm_start;
				614	unsigned long old_end = vma->vm_end;
				615	unsigned long length = old_end - old_start;
				616	unsigned long new_start = old_start - shift;
				617	unsigned long new_end = old_end - shift;
				618	struct mmu_gather tlb;
				619
				620	BUG_ON(new_start > new_end);
				621
				622	/*
				623	* ensure there are no vmas between where we want to go
				624	* and where we are
				625	*/
				626	if (vma != find_vma(mm, new_start))
				627	return -EFAULT;
				628
				629	/*
				630	* cover the whole range: [new_start, old_end)
				631	*/
				632	if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL))
				633	return -ENOMEM;
				634
				635	/*
				636	* move the page tables downwards, on failure we rely on
				637	* process cleanup to remove whatever mess we made.
				638	*/
				639	if (length != move_page_tables(vma, old_start,
				640	vma, new_start, length))
				641	return -ENOMEM;
				642
				643	lru_add_drain();
				644	tlb_gather_mmu(&tlb, mm, 0);
				645	if (new_end > old_start) {
				646	/*
				647	* when the old and new regions overlap clear from new_end.
				648	*/
				649	free_pgd_range(&tlb, new_end, old_end, new_end,
				650	vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
				651	} else {
				652	/*
				653	* otherwise, clean from old_start; this is done to not touch
				654	* the address space in [new_end, old_start) some architectures
				655	* have constraints on va-space that make this illegal (IA64) -
				656	* for the others its just a little faster.
				657	*/
				658	free_pgd_range(&tlb, old_start, old_end, new_end,
				659	vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING);
				660	}
				661	tlb_finish_mmu(&tlb, new_end, old_end);
				662
				663	/*
				664	* Shrink the vma to just the new range. Always succeeds.
				665	*/
				666	vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
				667
				668	return 0;
				669	}
				670
				671	/*
				672	* Finalizes the stack vm_area_struct. The flags and permissions are updated,
				673	* the stack is optionally relocated, and some extra space is added.
				674	*/
				675	int setup_arg_pages(struct linux_binprm *bprm,
				676	unsigned long stack_top,
				677	int executable_stack)
				678	{
				679	unsigned long ret;
				680	unsigned long stack_shift;
				681	struct mm_struct *mm = current->mm;
				682	struct vm_area_struct *vma = bprm->vma;
				683	struct vm_area_struct *prev = NULL;
				684	unsigned long vm_flags;
				685	unsigned long stack_base;
				686	unsigned long stack_size;
				687	unsigned long stack_expand;
				688	unsigned long rlim_stack;
				689
				690	#ifdef CONFIG_STACK_GROWSUP
				691	/* Limit stack size to 1GB */
				692	stack_base = rlimit_max(RLIMIT_STACK);
				693	if (stack_base > (1 << 30))
				694	stack_base = 1 << 30;
				695
				696	/* Make sure we didn't let the argument array grow too large. */
				697	if (vma->vm_end - vma->vm_start > stack_base)
				698	return -ENOMEM;
				699
				700	stack_base = PAGE_ALIGN(stack_top - stack_base);
				701
				702	stack_shift = vma->vm_start - stack_base;
				703	mm->arg_start = bprm->p - stack_shift;
				704	bprm->p = vma->vm_end - stack_shift;
				705	#else
				706	stack_top = arch_align_stack(stack_top);
				707	stack_top = PAGE_ALIGN(stack_top);
				708
				709	if (unlikely(stack_top < mmap_min_addr) \|\|
				710	unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
				711	return -ENOMEM;
				712
				713	stack_shift = vma->vm_end - stack_top;
				714
				715	bprm->p -= stack_shift;
				716	mm->arg_start = bprm->p;
				717	#endif
				718
				719	if (bprm->loader)
				720	bprm->loader -= stack_shift;
				721	bprm->exec -= stack_shift;
				722
				723	down_write(&mm->mmap_sem);
				724	vm_flags = VM_STACK_FLAGS;
				725
				726	/*
				727	* Adjust stack execute permissions; explicitly enable for
				728	* EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
				729	* (arch default) otherwise.
				730	*/
				731	if (unlikely(executable_stack == EXSTACK_ENABLE_X))
				732	vm_flags \|= VM_EXEC;
				733	else if (executable_stack == EXSTACK_DISABLE_X)
				734	vm_flags &= ~VM_EXEC;
				735	vm_flags \|= mm->def_flags;
				736	vm_flags \|= VM_STACK_INCOMPLETE_SETUP;
				737
				738	ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
				739	vm_flags);
				740	if (ret)
				741	goto out_unlock;
				742	BUG_ON(prev != vma);
				743
				744	/* Move stack pages down in memory. */
				745	if (stack_shift) {
				746	ret = shift_arg_pages(vma, stack_shift);
				747	if (ret)
				748	goto out_unlock;
				749	}
				750
				751	/* mprotect_fixup is overkill to remove the temporary stack flags */
				752	vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
				753
				754	stack_expand = 131072UL; /* randomly 324k (or 264k) pages */
				755	stack_size = vma->vm_end - vma->vm_start;
				756	/*
				757	* Align this down to a page boundary as expand_stack
				758	* will align it up.
				759	*/
				760	rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK;
				761	#ifdef CONFIG_STACK_GROWSUP
				762	if (stack_size + stack_expand > rlim_stack)
				763	stack_base = vma->vm_start + rlim_stack;
				764	else
				765	stack_base = vma->vm_end + stack_expand;
				766	#else
				767	if (stack_size + stack_expand > rlim_stack)
				768	stack_base = vma->vm_end - rlim_stack;
				769	else
				770	stack_base = vma->vm_start - stack_expand;
				771	#endif
				772	current->mm->start_stack = bprm->p;
				773	ret = expand_stack(vma, stack_base);
				774	if (ret)
				775	ret = -EFAULT;
				776
				777	out_unlock:
				778	up_write(&mm->mmap_sem);
				779	return ret;
				780	}
				781	EXPORT_SYMBOL(setup_arg_pages);
				782
				783	#endif /* CONFIG_MMU */
				784
				785	struct file open_exec(const char name)
				786	{
				787	struct file *file;
				788	int err;
				789	static const struct open_flags open_exec_flags = {
				790	.open_flag = O_LARGEFILE \| O_RDONLY \| __FMODE_EXEC,
				791	.acc_mode = MAY_EXEC \| MAY_OPEN,
				792	.intent = LOOKUP_OPEN
				793	};
				794
				795	file = do_filp_open(AT_FDCWD, name, &open_exec_flags, LOOKUP_FOLLOW);
				796	if (IS_ERR(file))
				797	goto out;
				798
				799	err = -EACCES;
				800	if (!S_ISREG(file->f_path.dentry->d_inode->i_mode))
				801	goto exit;
				802
				803	if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
				804	goto exit;
				805
				806	fsnotify_open(file);
				807
				808	err = deny_write_access(file);
				809	if (err)
				810	goto exit;
				811
				812	out:
				813	return file;
				814
				815	exit:
				816	fput(file);
				817	return ERR_PTR(err);
				818	}
				819	EXPORT_SYMBOL(open_exec);
				820
				821	int kernel_read(struct file *file, loff_t offset,
				822	char *addr, unsigned long count)
				823	{
				824	mm_segment_t old_fs;
				825	loff_t pos = offset;
				826	int result;
				827
				828	old_fs = get_fs();
				829	set_fs(get_ds());
				830	/* The cast to a user pointer is valid due to the set_fs() */
				831	result = vfs_read(file, (void __user *)addr, count, &pos);
				832	set_fs(old_fs);
				833	return result;
				834	}
				835
				836	EXPORT_SYMBOL(kernel_read);
				837
				838	static int exec_mmap(struct mm_struct *mm)
				839	{
				840	struct task_struct *tsk;
				841	struct mm_struct * old_mm, *active_mm;
				842
				843	/* Notify parent that we're no longer interested in the old VM */
				844	tsk = current;
				845	old_mm = current->mm;
				846	mm_release(tsk, old_mm);
				847
				848	if (old_mm) {
				849	sync_mm_rss(old_mm);
				850	/*
				851	* Make sure that if there is a core dump in progress
				852	* for the old mm, we get out and die instead of going
				853	* through with the exec. We must hold mmap_sem around
				854	* checking core_state and changing tsk->mm.
				855	*/
				856	down_read(&old_mm->mmap_sem);
				857	if (unlikely(old_mm->core_state)) {
				858	up_read(&old_mm->mmap_sem);
				859	return -EINTR;
				860	}
				861	}
				862	task_lock(tsk);
				863	preempt_disable_rt();
				864	active_mm = tsk->active_mm;
				865	tsk->mm = mm;
				866	tsk->active_mm = mm;
				867	activate_mm(active_mm, mm);
				868	preempt_enable_rt();
				869	task_unlock(tsk);
				870	arch_pick_mmap_layout(mm);
				871	if (old_mm) {
				872	up_read(&old_mm->mmap_sem);
				873	BUG_ON(active_mm != old_mm);
				874	setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
				875	mm_update_next_owner(old_mm);
				876	mmput(old_mm);
				877	return 0;
				878	}
				879	mmdrop(active_mm);
				880	return 0;
				881	}
				882
				883	/*
				884	* This function makes sure the current process has its own signal table,
				885	* so that flush_signal_handlers can later reset the handlers without
				886	* disturbing other processes. (Other processes might share the signal
				887	* table via the CLONE_SIGHAND option to clone().)
					*
					* It also turns @tsk into the only thread, and the leader, of its
					* thread group: all other threads are zapped and waited for, and a
					* non-leader @tsk takes over the old leader's pid.
					*
					* Returns 0 on success, -EAGAIN when a group exit is already in
					* progress, and -ENOMEM when a private sighand_struct is needed but
					* cannot be allocated (by then the other threads are already gone).
				888	*/
				889	static int de_thread(struct task_struct *tsk)
				890	{
				891	struct signal_struct *sig = tsk->signal;
				892	struct sighand_struct *oldsighand = tsk->sighand;
				893	spinlock_t *lock = &oldsighand->siglock;
				894
					/* Single-threaded: only the sighand check below is needed. */
				895	if (thread_group_empty(tsk))
				896	goto no_thread_group;
				897
				898	/*
				899	* Kill all other threads in the thread group.
				900	*/
				901	spin_lock_irq(lock);
				902	if (signal_group_exit(sig)) {
				903	/*
				904	* Another group action in progress, just
				905	* return so that the signal is processed.
				906	*/
				907	spin_unlock_irq(lock);
				908	return -EAGAIN;
				909	}
				910
				911	sig->group_exit_task = tsk;
				912	sig->notify_count = zap_other_threads(tsk);
					/*
					* NOTE(review): presumably the leader is taken out of the count
					* here because it is waited for separately further down --
					* confirm against zap_other_threads() and the exit path.
					*/
				913	if (!thread_group_leader(tsk))
				914	sig->notify_count--;
				915
					/*
					* Sleep until notify_count reaches zero.  The siglock is dropped
					* around schedule(); the decrement and wakeup happen outside this
					* function.
					*/
				916	while (sig->notify_count) {
				917	__set_current_state(TASK_UNINTERRUPTIBLE);
				918	spin_unlock_irq(lock);
				919	schedule();
				920	spin_lock_irq(lock);
				921	}
				922	spin_unlock_irq(lock);
				923
				924	/*
				925	* At this point all other threads have exited, all we have to
				926	* do is to wait for the thread group leader to become inactive,
				927	* and to assume its PID:
				928	*/
				929	if (!thread_group_leader(tsk)) {
				930	struct task_struct *leader = tsk->group_leader;
				931
				932	sig->notify_count = -1; /* for exit_notify() */
					/*
					* The loop is left via break with the threadgroup change begun
					* and tasklist_lock write-held; both are released at the end
					* of this block, before release_task().
					*/
				933	for (;;) {
				934	threadgroup_change_begin(tsk);
				935	write_lock_irq(&tasklist_lock);
				936	if (likely(leader->exit_state))
				937	break;
				938	__set_current_state(TASK_UNINTERRUPTIBLE);
				939	write_unlock_irq(&tasklist_lock);
				940	threadgroup_change_end(tsk);
				941	schedule();
				942	}
				943
				944	/*
				945	* The only record we have of the real-time age of a
				946	* process, regardless of execs it's done, is start_time.
				947	* All the past CPU time is accumulated in signal_struct
				948	* from sister threads now dead. But in this non-leader
				949	* exec, nothing survives from the original leader thread,
				950	* whose birth marks the true age of this process now.
				951	* When we take on its identity by switching to its PID, we
				952	* also take its birthdate (always earlier than our own).
				953	*/
				954	tsk->start_time = leader->start_time;
				955
				956	BUG_ON(!same_thread_group(leader, tsk));
				957	BUG_ON(has_group_leader_pid(tsk));
				958	/*
				959	* An exec() starts a new thread group with the
				960	* TGID of the previous thread group. Rehash the
				961	* two threads with a switched PID, and release
				962	* the former thread group leader:
				963	*/
				964
				965	/* Become a process group leader with the old leader's pid.
				966	* The old leader becomes a thread of the this thread group.
				967	* Note: The old leader also uses this pid until release_task
				968	* is called. Odd but simple and correct.
				969	*/
				970	detach_pid(tsk, PIDTYPE_PID);
				971	tsk->pid = leader->pid;
				972	attach_pid(tsk, PIDTYPE_PID, task_pid(leader));
				973	transfer_pid(leader, tsk, PIDTYPE_PGID);
				974	transfer_pid(leader, tsk, PIDTYPE_SID);
				975
					/* Take the old leader's place on the task and sibling lists. */
				976	list_replace_rcu(&leader->tasks, &tsk->tasks);
				977	list_replace_init(&leader->sibling, &tsk->sibling);
				978
				979	tsk->group_leader = tsk;
				980	leader->group_leader = tsk;
				981
				982	tsk->exit_signal = SIGCHLD;
				983	leader->exit_signal = -1;
				984
				985	BUG_ON(leader->exit_state != EXIT_ZOMBIE);
				986	leader->exit_state = EXIT_DEAD;
				987
				988	/*
				989	* We are going to release_task()->ptrace_unlink() silently,
				990	* the tracer can sleep in do_wait(). EXIT_DEAD guarantees
				991	* the tracer won't block again waiting for this thread.
				992	*/
				993	if (unlikely(leader->ptrace))
				994	__wake_up_parent(leader, leader->parent);
				995	write_unlock_irq(&tasklist_lock);
				996	threadgroup_change_end(tsk);
				997
				998	release_task(leader);
				999	}
				1000
				1001	sig->group_exit_task = NULL;
				1002	sig->notify_count = 0;
				1003
				1004	no_thread_group:
				1005	/* we have changed execution domain */
				1006	tsk->exit_signal = SIGCHLD;
				1007
				1008	exit_itimers(sig);
				1009	flush_itimer_signals();
				1010
				1011	if (atomic_read(&oldsighand->count) != 1) {
				1012	struct sighand_struct *newsighand;
				1013	/*
				1014	* This ->sighand is shared with the CLONE_SIGHAND
				1015	* but not CLONE_THREAD task, switch to the new one.
				1016	*/
				1017	newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
				1018	if (!newsighand)
				1019	return -ENOMEM;
				1020
					/* Start the private copy with the current handler table. */
				1021	atomic_set(&newsighand->count, 1);
				1022	memcpy(newsighand->action, oldsighand->action,
				1023	sizeof(newsighand->action));
				1024
					/* Publish it under both tasklist_lock and the old siglock. */
				1025	write_lock_irq(&tasklist_lock);
				1026	spin_lock(&oldsighand->siglock);
				1027	rcu_assign_pointer(tsk->sighand, newsighand);
				1028	spin_unlock(&oldsighand->siglock);
				1029	write_unlock_irq(&tasklist_lock);
				1030
				1031	__cleanup_sighand(oldsighand);
				1032	}
				1033
				1034	BUG_ON(!thread_group_leader(tsk));
				1035	return 0;
				1036	}
				1037
				1038	/*
				1039	* These functions flushes out all traces of the currently running executable
				1040	* so that a new one can be started
				1041	*/
				1042	static void flush_old_files(struct files_struct * files)
				1043	{
				1044	long j = -1;
				1045	struct fdtable *fdt;
				1046
				1047	spin_lock(&files->file_lock);
				1048	for (;;) {
				1049	unsigned long set, i;
				1050
				1051	j++;
				1052	i = j * BITS_PER_LONG;
				1053	fdt = files_fdtable(files);
				1054	if (i >= fdt->max_fds)
				1055	break;
				1056	set = fdt->close_on_exec[j];
				1057	if (!set)
				1058	continue;
				1059	fdt->close_on_exec[j] = 0;
				1060	spin_unlock(&files->file_lock);
				1061	for ( ; set ; i++,set >>= 1) {
				1062	if (set & 1) {
				1063	sys_close(i);
				1064	}
				1065	}
				1066	spin_lock(&files->file_lock);
				1067
				1068	}
				1069	spin_unlock(&files->file_lock);
				1070	}
				1071
				1072	char get_task_comm(char buf, struct task_struct *tsk)
				1073	{
				1074	/* buf must be at least sizeof(tsk->comm) in size */
				1075	task_lock(tsk);
				1076	strncpy(buf, tsk->comm, sizeof(tsk->comm));
				1077	task_unlock(tsk);
				1078	return buf;
				1079	}
				1080	EXPORT_SYMBOL_GPL(get_task_comm);
				1081
				1082	void set_task_comm(struct task_struct tsk, char buf)
				1083	{
				1084	task_lock(tsk);
				1085
				1086	trace_task_rename(tsk, buf);
				1087
				1088	/*
				1089	* Threads may access current->comm without holding
				1090	* the task lock, so write the string carefully.
				1091	* Readers without a lock may see incomplete new
				1092	* names but are safe from non-terminating string reads.
				1093	*/
				1094	memset(tsk->comm, 0, TASK_COMM_LEN);
				1095	wmb();
				1096	strlcpy(tsk->comm, buf, sizeof(tsk->comm));
				1097	task_unlock(tsk);
				1098	perf_event_comm(tsk);
				1099	}
				1100
				/*
				 * Copy the last path component of @fn into @tcomm.
				 *
				 * @tcomm: destination buffer of @len bytes
				 * @fn:    NUL-terminated path
				 * @len:   size of @tcomm; at most @len - 1 characters are stored
				 *
				 * The result is always NUL-terminated when @len is non-zero.  A path that
				 * ends in '/' yields an empty name.
				 */
				static void filename_to_taskname(char *tcomm, const char *fn, unsigned int len)
				{
					unsigned int i = 0;
					int ch;

					/* len - 1 would wrap around below; there is no room for anything */
					if (!len)
						return;

					/* Copies the binary name from after last slash */
					while ((ch = *(fn++)) != '\0') {
						if (ch == '/')
							i = 0; /* overwrite what we wrote */
						else if (i < len - 1)
							tcomm[i++] = ch;
					}
					tcomm[i] = '\0';
				}
				1115
				1116	int flush_old_exec(struct linux_binprm * bprm)
				1117	{
				1118	int retval;
				1119
				1120	/*
				1121	* Make sure we have a private signal table and that
				1122	* we are unassociated from the previous thread group.
				1123	*/
				1124	retval = de_thread(current);
				1125	if (retval)
				1126	goto out;
				1127
				1128	set_mm_exe_file(bprm->mm, bprm->file);
				1129
				1130	filename_to_taskname(bprm->tcomm, bprm->filename, sizeof(bprm->tcomm));
				1131	/*
				1132	* Release all of the old mmap stuff
				1133	*/
				1134	acct_arg_size(bprm, 0);
				1135	retval = exec_mmap(bprm->mm);
				1136	if (retval)
				1137	goto out;
				1138
				1139	bprm->mm = NULL; /* We're using it now */
				1140
				1141	set_fs(USER_DS);
				1142	current->flags &=
				1143	~(PF_RANDOMIZE \| PF_FORKNOEXEC \| PF_KTHREAD \| PF_NOFREEZE);
				1144	flush_thread();
				1145	current->personality &= ~bprm->per_clear;
				1146
				1147	return 0;
				1148
				1149	out:
				1150	return retval;
				1151	}
				1152	EXPORT_SYMBOL(flush_old_exec);
				1153
				1154	void would_dump(struct linux_binprm bprm, struct file file)
				1155	{
				1156	if (inode_permission(file->f_path.dentry->d_inode, MAY_READ) < 0)
				1157	bprm->interp_flags \|= BINPRM_FLAGS_ENFORCE_NONDUMP;
				1158	}
				1159	EXPORT_SYMBOL(would_dump);
				1160
				1161	void setup_new_exec(struct linux_binprm * bprm)
				1162	{
				1163	arch_pick_mmap_layout(current->mm);
				1164
				1165	/* This is the point of no return */
				1166	current->sas_ss_sp = current->sas_ss_size = 0;
				1167
				1168	if (current_euid() == current_uid() && current_egid() == current_gid())
				1169	set_dumpable(current->mm, 1);
				1170	else
				1171	set_dumpable(current->mm, suid_dumpable);
				1172
				1173	set_task_comm(current, bprm->tcomm);
				1174
				1175	/* Set the new mm task size. We have to do that late because it may
				1176	* depend on TIF_32BIT which is only updated in flush_thread() on
				1177	* some architectures like powerpc
				1178	*/
				1179	current->mm->task_size = TASK_SIZE;
				1180
				1181	/* install the new credentials */
				1182	if (bprm->cred->uid != current_euid() \|\|
				1183	bprm->cred->gid != current_egid()) {
				1184	current->pdeath_signal = 0;
				1185	} else {
				1186	would_dump(bprm, bprm->file);
				1187	if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)
				1188	set_dumpable(current->mm, suid_dumpable);
				1189	}
				1190
				1191	/* An exec changes our domain. We are no longer part of the thread
				1192	group */
				1193
				1194	current->self_exec_id++;
				1195
				1196	flush_signal_handlers(current, 0);
				1197	flush_old_files(current->files);
				1198	}
				1199	EXPORT_SYMBOL(setup_new_exec);
				1200
				1201	/*
				1202	* Prepare credentials and lock ->cred_guard_mutex.
				1203	* install_exec_creds() commits the new creds and drops the lock.
				1204	* Or, if exec fails before, free_bprm() should release ->cred and
				1205	* and unlock.
				1206	*/
				1207	int prepare_bprm_creds(struct linux_binprm *bprm)
				1208	{
				1209	if (mutex_lock_interruptible(&current->signal->cred_guard_mutex))
				1210	return -ERESTARTNOINTR;
				1211
				1212	bprm->cred = prepare_exec_creds();
				1213	if (likely(bprm->cred))
				1214	return 0;
				1215
				1216	mutex_unlock(&current->signal->cred_guard_mutex);
				1217	return -ENOMEM;
				1218	}
				1219
				1220	void free_bprm(struct linux_binprm *bprm)
				1221	{
				1222	free_arg_pages(bprm);
				1223	if (bprm->cred) {
				1224	mutex_unlock(&current->signal->cred_guard_mutex);
				1225	abort_creds(bprm->cred);
				1226	}
				1227	/* If a binfmt changed the interp, free it. */
				1228	if (bprm->interp != bprm->filename)
				1229	kfree(bprm->interp);
				1230	kfree(bprm);
				1231	}
				1232
				1233	int bprm_change_interp(char interp, struct linux_binprm bprm)
				1234	{
				1235	/* If a binfmt changed the interp, free it first. */
				1236	if (bprm->interp != bprm->filename)
				1237	kfree(bprm->interp);
				1238	bprm->interp = kstrdup(interp, GFP_KERNEL);
				1239	if (!bprm->interp)
				1240	return -ENOMEM;
				1241	return 0;
				1242	}
				1243	EXPORT_SYMBOL(bprm_change_interp);
				1244
				1245	/*
				1246	* install the new credentials for this executable
				1247	*/
				1248	void install_exec_creds(struct linux_binprm *bprm)
				1249	{
				1250	security_bprm_committing_creds(bprm);
				1251
				1252	commit_creds(bprm->cred);
				1253	bprm->cred = NULL;
				1254
				1255	/*
				1256	* Disable monitoring for regular users
				1257	* when executing setuid binaries. Must
				1258	* wait until new credentials are committed
				1259	* by commit_creds() above
				1260	*/
				1261	if (get_dumpable(current->mm) != SUID_DUMP_USER)
				1262	perf_event_exit_task(current);
				1263	/*
				1264	* cred_guard_mutex must be held at least to this point to prevent
				1265	* ptrace_attach() from altering our determination of the task's
				1266	* credentials; any time after this it may be unlocked.
				1267	*/
				1268	security_bprm_committed_creds(bprm);
				1269	mutex_unlock(&current->signal->cred_guard_mutex);
				1270	}
				1271	EXPORT_SYMBOL(install_exec_creds);
				1272
				1273	static void bprm_fill_uid(struct linux_binprm *bprm)
				1274	{
				1275	struct inode *inode;
				1276	unsigned int mode;
				1277	uid_t uid;
				1278	gid_t gid;
				1279
				1280	/* clear any previous set[ug]id data from a previous binary */
				1281	bprm->cred->euid = current_euid();
				1282	bprm->cred->egid = current_egid();
				1283
				1284	if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
				1285	return;
				1286
				1287	inode = bprm->file->f_path.dentry->d_inode;
				1288	mode = ACCESS_ONCE(inode->i_mode);
				1289	if (!(mode & (S_ISUID\|S_ISGID)))
				1290	return;
				1291
				1292	/* Be careful if suid/sgid is set */
				1293	mutex_lock(&inode->i_mutex);
				1294
				1295	/* reload atomically mode/uid/gid now that lock held */
				1296	mode = inode->i_mode;
				1297	uid = inode->i_uid;
				1298	gid = inode->i_gid;
				1299	mutex_unlock(&inode->i_mutex);
				1300
				1301	if (mode & S_ISUID) {
				1302	bprm->per_clear \|= PER_CLEAR_ON_SETID;
				1303	bprm->cred->euid = uid;
				1304	}
				1305
				1306	if ((mode & (S_ISGID \| S_IXGRP)) == (S_ISGID \| S_IXGRP)) {
				1307	bprm->per_clear \|= PER_CLEAR_ON_SETID;
				1308	bprm->cred->egid = gid;
				1309	}
				1310	}
				1311
				1312	/*
				1313	* determine how safe it is to execute the proposed program
				1314	* - the caller must hold ->cred_guard_mutex to protect against
				1315	* PTRACE_ATTACH
				1316	*/
				1317	static int check_unsafe_exec(struct linux_binprm *bprm)
				1318	{
				1319	struct task_struct p = current, t;
				1320	unsigned n_fs;
				1321	int res = 0;
				1322
				1323	if (p->ptrace) {
				1324	if (p->ptrace & PT_PTRACE_CAP)
				1325	bprm->unsafe \|= LSM_UNSAFE_PTRACE_CAP;
				1326	else
				1327	bprm->unsafe \|= LSM_UNSAFE_PTRACE;
				1328	}
				1329
				1330	n_fs = 1;
				1331	spin_lock(&p->fs->lock);
				1332	rcu_read_lock();
				1333	for (t = next_thread(p); t != p; t = next_thread(t)) {
				1334	if (t->fs == p->fs)
				1335	n_fs++;
				1336	}
				1337	rcu_read_unlock();
				1338
				1339	if (p->fs->users > n_fs) {
				1340	bprm->unsafe \|= LSM_UNSAFE_SHARE;
				1341	} else {
				1342	res = -EAGAIN;
				1343	if (!p->fs->in_exec) {
				1344	p->fs->in_exec = 1;
				1345	res = 1;
				1346	}
				1347	}
				1348	spin_unlock(&p->fs->lock);
				1349
				1350	return res;
				1351	}
				1352
				1353	/*
				1354	* Fill the binprm structure from the inode.
				1355	* Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
				1356	*
				1357	* This may be called multiple times for binary chains (scripts for example).
				1358	*/
				1359	int prepare_binprm(struct linux_binprm *bprm)
				1360	{
				1361	int retval;
				1362
				1363	if (bprm->file->f_op == NULL)
				1364	return -EACCES;
				1365
				1366	bprm_fill_uid(bprm);
				1367
				1368	/* fill in binprm security blob */
				1369	retval = security_bprm_set_creds(bprm);
				1370	if (retval)
				1371	return retval;
				1372	bprm->cred_prepared = 1;
				1373
				1374	memset(bprm->buf, 0, BINPRM_BUF_SIZE);
				1375	return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE);
				1376	}
				1377
				1378	EXPORT_SYMBOL(prepare_binprm);
				1379
				1380	/*
				1381	* Arguments are '\0' separated strings found at the location bprm->p
				1382	* points to; chop off the first by relocating brpm->p to right after
				1383	* the first '\0' encountered.
				1384	*/
				1385	int remove_arg_zero(struct linux_binprm *bprm)
				1386	{
				1387	int ret = 0;
				1388	unsigned long offset;
				1389	char *kaddr;
				1390	struct page *page;
				1391
				1392	if (!bprm->argc)
				1393	return 0;
				1394
				1395	do {
				1396	offset = bprm->p & ~PAGE_MASK;
				1397	page = get_arg_page(bprm, bprm->p, 0);
				1398	if (!page) {
				1399	ret = -EFAULT;
				1400	goto out;
				1401	}
				1402	kaddr = kmap_atomic(page);
				1403
				1404	for (; offset < PAGE_SIZE && kaddr[offset];
				1405	offset++, bprm->p++)
				1406	;
				1407
				1408	kunmap_atomic(kaddr);
				1409	put_arg_page(page);
				1410
				1411	if (offset == PAGE_SIZE)
				1412	free_arg_page(bprm, (bprm->p >> PAGE_SHIFT) - 1);
				1413	} while (offset == PAGE_SIZE);
				1414
				1415	bprm->p++;
				1416	bprm->argc--;
				1417	ret = 0;
				1418
				1419	out:
				1420	return ret;
				1421	}
				1422	EXPORT_SYMBOL(remove_arg_zero);
				1423
				1424	/*
				1425	* cycle the list of binary formats handler, until one recognizes the image
				1426	*/
				1427	int search_binary_handler(struct linux_binprm bprm,struct pt_regs regs)
				1428	{
				1429	unsigned int depth = bprm->recursion_depth;
				1430	int try,retval;
				1431	struct linux_binfmt *fmt;
				1432	pid_t old_pid, old_vpid;
				1433
				1434	/* This allows 4 levels of binfmt rewrites before failing hard. */
				1435	if (depth > 5)
				1436	return -ELOOP;
				1437
				1438	retval = security_bprm_check(bprm);
				1439	if (retval)
				1440	return retval;
				1441
				1442	retval = audit_bprm(bprm);
				1443	if (retval)
				1444	return retval;
				1445
				1446	/* Need to fetch pid before load_binary changes it */
				1447	old_pid = current->pid;
				1448	rcu_read_lock();
				1449	old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
				1450	rcu_read_unlock();
				1451
				1452	retval = -ENOENT;
				1453	for (try=0; try<2; try++) {
				1454	read_lock(&binfmt_lock);
				1455	list_for_each_entry(fmt, &formats, lh) {
				1456	int (fn)(struct linux_binprm , struct pt_regs *) = fmt->load_binary;
				1457	if (!fn)
				1458	continue;
				1459	if (!try_module_get(fmt->module))
				1460	continue;
				1461	read_unlock(&binfmt_lock);
				1462	bprm->recursion_depth = depth + 1;
				1463	retval = fn(bprm, regs);
				1464	bprm->recursion_depth = depth;
				1465	if (retval >= 0) {
				1466	if (depth == 0) {
				1467	trace_sched_process_exec(current, old_pid, bprm);
				1468	ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
				1469	}
				1470	put_binfmt(fmt);
				1471	allow_write_access(bprm->file);
				1472	if (bprm->file)
				1473	fput(bprm->file);
				1474	bprm->file = NULL;
				1475	current->did_exec = 1;
				1476	proc_exec_connector(current);
				1477	return retval;
				1478	}
				1479	read_lock(&binfmt_lock);
				1480	put_binfmt(fmt);
				1481	if (retval != -ENOEXEC \|\| bprm->mm == NULL)
				1482	break;
				1483	if (!bprm->file) {
				1484	read_unlock(&binfmt_lock);
				1485	return retval;
				1486	}
				1487	}
				1488	read_unlock(&binfmt_lock);
				1489	#ifdef CONFIG_MODULES
				1490	if (retval != -ENOEXEC \|\| bprm->mm == NULL) {
				1491	break;
				1492	} else {
				1493	#define printable(c) (((c)=='\t') \|\| ((c)=='\n') \|\| (0x20<=(c) && (c)<=0x7e))
				1494	if (printable(bprm->buf[0]) &&
				1495	printable(bprm->buf[1]) &&
				1496	printable(bprm->buf[2]) &&
				1497	printable(bprm->buf[3]))
				1498	break; /* -ENOEXEC */
				1499	if (try)
				1500	break; /* -ENOEXEC */
				1501	request_module("binfmt-%04x", (unsigned short )(&bprm->buf[2]));
				1502	}
				1503	#else
				1504	break;
				1505	#endif
				1506	}
				1507	return retval;
				1508	}
				1509
				1510	EXPORT_SYMBOL(search_binary_handler);
				1511
				1512	/*
				1513	* sys_execve() executes a new program.
				1514	*/
				1515	static int do_execve_common(const char *filename,
				1516	struct user_arg_ptr argv,
				1517	struct user_arg_ptr envp,
				1518	struct pt_regs *regs)
				1519	{
				1520	struct linux_binprm *bprm;
				1521	struct file *file;
				1522	struct files_struct *displaced;
				1523	bool clear_in_exec;
				1524	int retval;
				1525	const struct cred *cred = current_cred();
				1526
				1527	/*
				1528	* We move the actual failure in case of RLIMIT_NPROC excess from
				1529	* set*uid() to execve() because too many poorly written programs
				1530	* don't check setuid() return code. Here we additionally recheck
				1531	* whether NPROC limit is still exceeded.
				1532	*/
				1533	if ((current->flags & PF_NPROC_EXCEEDED) &&
				1534	atomic_read(&cred->user->processes) > rlimit(RLIMIT_NPROC)) {
				1535	retval = -EAGAIN;
				1536	goto out_ret;
				1537	}
				1538
				1539	/* We're below the limit (still or again), so we don't want to make
				1540	* further execve() calls fail. */
				1541	current->flags &= ~PF_NPROC_EXCEEDED;
				1542
				1543	retval = unshare_files(&displaced);
				1544	if (retval)
				1545	goto out_ret;
				1546
				1547	retval = -ENOMEM;
				1548	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
				1549	if (!bprm)
				1550	goto out_files;
				1551
				1552	retval = prepare_bprm_creds(bprm);
				1553	if (retval)
				1554	goto out_free;
				1555
				1556	retval = check_unsafe_exec(bprm);
				1557	if (retval < 0)
				1558	goto out_free;
				1559	clear_in_exec = retval;
				1560	current->in_execve = 1;
				1561
				1562	file = open_exec(filename);
				1563	retval = PTR_ERR(file);
				1564	if (IS_ERR(file))
				1565	goto out_unmark;
				1566
				1567	sched_exec();
				1568
				1569	bprm->file = file;
				1570	bprm->filename = filename;
				1571	bprm->interp = filename;
				1572
				1573	retval = bprm_mm_init(bprm);
				1574	if (retval)
				1575	goto out_file;
				1576
				1577	bprm->argc = count(argv, MAX_ARG_STRINGS);
				1578	if ((retval = bprm->argc) < 0)
				1579	goto out;
				1580
				1581	bprm->envc = count(envp, MAX_ARG_STRINGS);
				1582	if ((retval = bprm->envc) < 0)
				1583	goto out;
				1584
				1585	retval = prepare_binprm(bprm);
				1586	if (retval < 0)
				1587	goto out;
				1588
				1589	retval = copy_strings_kernel(1, &bprm->filename, bprm);
				1590	if (retval < 0)
				1591	goto out;
				1592
				1593	bprm->exec = bprm->p;
				1594	retval = copy_strings(bprm->envc, envp, bprm);
				1595	if (retval < 0)
				1596	goto out;
				1597
				1598	retval = copy_strings(bprm->argc, argv, bprm);
				1599	if (retval < 0)
				1600	goto out;
				1601
				1602	retval = search_binary_handler(bprm,regs);
				1603	if (retval < 0)
				1604	goto out;
				1605
				1606	/* execve succeeded */
				1607	current->fs->in_exec = 0;
				1608	current->in_execve = 0;
				1609	acct_update_integrals(current);
				1610	free_bprm(bprm);
				1611	if (displaced)
				1612	put_files_struct(displaced);
				1613	return retval;
				1614
				1615	out:
				1616	if (bprm->mm) {
				1617	acct_arg_size(bprm, 0);
				1618	mmput(bprm->mm);
				1619	}
				1620
				1621	out_file:
				1622	if (bprm->file) {
				1623	allow_write_access(bprm->file);
				1624	fput(bprm->file);
				1625	}
				1626
				1627	out_unmark:
				1628	if (clear_in_exec)
				1629	current->fs->in_exec = 0;
				1630	current->in_execve = 0;
				1631
				1632	out_free:
				1633	free_bprm(bprm);
				1634
				1635	out_files:
				1636	if (displaced)
				1637	reset_files_struct(displaced);
				1638	out_ret:
				1639	return retval;
				1640	}
				1641
				1642	int do_execve(const char *filename,
				1643	const char __user const __user __argv,
				1644	const char __user const __user __envp,
				1645	struct pt_regs *regs)
				1646	{
				1647	struct user_arg_ptr argv = { .ptr.native = __argv };
				1648	struct user_arg_ptr envp = { .ptr.native = __envp };
				1649	return do_execve_common(filename, argv, envp, regs);
				1650	}
				1651
				#ifdef CONFIG_COMPAT
				/*
				 * Compat entry point: same as do_execve(), but the argv/envp arrays hold
				 * compat_uptr_t entries, which is flagged through ->is_compat.
				 */
				int compat_do_execve(char *filename,
					compat_uptr_t __user *__argv,
					compat_uptr_t __user *__envp,
					struct pt_regs *regs)
				{
					struct user_arg_ptr argv = { .is_compat = true, .ptr.compat = __argv };
					struct user_arg_ptr envp = { .is_compat = true, .ptr.compat = __envp };

					return do_execve_common(filename, argv, envp, regs);
				}
				#endif
				1669
				1670	void set_binfmt(struct linux_binfmt *new)
				1671	{
				1672	struct mm_struct *mm = current->mm;
				1673
				1674	if (mm->binfmt)
				1675	module_put(mm->binfmt->module);
				1676
				1677	mm->binfmt = new;
				1678	if (new)
				1679	__module_get(new->module);
				1680	}
				1681
				1682	EXPORT_SYMBOL(set_binfmt);
				1683
				1684	static int expand_corename(struct core_name *cn)
				1685	{
				1686	char *old_corename = cn->corename;
				1687
				1688	cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
				1689	cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
				1690
				1691	if (!cn->corename) {
				1692	kfree(old_corename);
				1693	return -ENOMEM;
				1694	}
				1695
				1696	return 0;
				1697	}
				1698
				1699	static int cn_printf(struct core_name cn, const char fmt, ...)
				1700	{
				1701	char *cur;
				1702	int need;
				1703	int ret;
				1704	va_list arg;
				1705
				1706	va_start(arg, fmt);
				1707	need = vsnprintf(NULL, 0, fmt, arg);
				1708	va_end(arg);
				1709
				1710	if (likely(need < cn->size - cn->used - 1))
				1711	goto out_printf;
				1712
				1713	ret = expand_corename(cn);
				1714	if (ret)
				1715	goto expand_fail;
				1716
				1717	out_printf:
				1718	cur = cn->corename + cn->used;
				1719	va_start(arg, fmt);
				1720	vsnprintf(cur, need + 1, fmt, arg);
				1721	va_end(arg);
				1722	cn->used += need;
				1723	return 0;
				1724
				1725	expand_fail:
				1726	return ret;
				1727	}
				1728
				/* Replace every '/' in the NUL-terminated string @str with '!', in place. */
				static void cn_escape(char *str)
				{
					char *p = str;

					while (*p) {
						if (*p == '/')
							*p = '!';
						p++;
					}
				}
				1735
				1736	static int cn_print_exe_file(struct core_name *cn)
				1737	{
				1738	struct file *exe_file;
				1739	char pathbuf, path;
				1740	int ret;
				1741
				1742	exe_file = get_mm_exe_file(current->mm);
				1743	if (!exe_file) {
				1744	char *commstart = cn->corename + cn->used;
				1745	ret = cn_printf(cn, "%s (path unknown)", current->comm);
				1746	cn_escape(commstart);
				1747	return ret;
				1748	}
				1749
				1750	pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
				1751	if (!pathbuf) {
				1752	ret = -ENOMEM;
				1753	goto put_exe_file;
				1754	}
				1755
				1756	path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
				1757	if (IS_ERR(path)) {
				1758	ret = PTR_ERR(path);
				1759	goto free_buf;
				1760	}
				1761
				1762	cn_escape(path);
				1763
				1764	ret = cn_printf(cn, "%s", path);
				1765
				1766	free_buf:
				1767	kfree(pathbuf);
				1768	put_exe_file:
				1769	fput(exe_file);
				1770	return ret;
				1771	}
				1772
				1773	/* format_corename will inspect the pattern parameter, and output a
				1774	* name into corename, which must have space for at least
				1775	* CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
				1776	*/
				1777	static int format_corename(struct core_name *cn, long signr)
				1778	{
				1779	const struct cred *cred = current_cred();
				1780	const char *pat_ptr = core_pattern;
				1781	int ispipe = (*pat_ptr == '\|');
				1782	int pid_in_pattern = 0;
				1783	int err = 0;
				1784
				1785	cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
				1786	cn->corename = kmalloc(cn->size, GFP_KERNEL);
				1787	cn->used = 0;
				1788
				1789	if (!cn->corename)
				1790	return -ENOMEM;
				1791
				1792	/* Repeat as long as we have more pattern to process and more output
				1793	space */
				1794	while (*pat_ptr) {
				1795	if (*pat_ptr != '%') {
				1796	if (*pat_ptr == 0)
				1797	goto out;
				1798	err = cn_printf(cn, "%c", *pat_ptr++);
				1799	} else {
				1800	switch (*++pat_ptr) {
				1801	/* single % at the end, drop that */
				1802	case 0:
				1803	goto out;
				1804	/* Double percent, output one percent */
				1805	case '%':
				1806	err = cn_printf(cn, "%c", '%');
				1807	break;
				1808	/* pid */
				1809	case 'p':
				1810	pid_in_pattern = 1;
				1811	err = cn_printf(cn, "%d",
				1812	task_tgid_vnr(current));
				1813	break;
				1814	/* uid */
				1815	case 'u':
				1816	err = cn_printf(cn, "%d", cred->uid);
				1817	break;
				1818	/* gid */
				1819	case 'g':
				1820	err = cn_printf(cn, "%d", cred->gid);
				1821	break;
				1822	/* signal that caused the coredump */
				1823	case 's':
				1824	err = cn_printf(cn, "%ld", signr);
				1825	break;
				1826	/* UNIX time of coredump */
				1827	case 't': {
				1828	struct timeval tv;
				1829	do_gettimeofday(&tv);
				1830	err = cn_printf(cn, "%lu", tv.tv_sec);
				1831	break;
				1832	}
				1833	/* hostname */
				1834	case 'h': {
				1835	char *namestart = cn->corename + cn->used;
				1836	down_read(&uts_sem);
				1837	err = cn_printf(cn, "%s",
				1838	utsname()->nodename);
				1839	up_read(&uts_sem);
				1840	cn_escape(namestart);
				1841	break;
				1842	}
				1843	/* executable */
				1844	case 'e': {
				1845	char *commstart = cn->corename + cn->used;
				1846	err = cn_printf(cn, "%s", current->comm);
				1847	cn_escape(commstart);
				1848	break;
				1849	}
				1850	case 'E':
				1851	err = cn_print_exe_file(cn);
				1852	break;
				1853	/* core limit size */
				1854	case 'c':
				1855	err = cn_printf(cn, "%lu",
				1856	rlimit(RLIMIT_CORE));
				1857	break;
				1858	default:
				1859	break;
				1860	}
				1861	++pat_ptr;
				1862	}
				1863
				1864	if (err)
				1865	return err;
				1866	}
				1867
				1868	/* Backward compatibility with core_uses_pid:
				1869	*
				1870	* If core_pattern does not include a %p (as is the default)
				1871	* and core_uses_pid is set, then .%pid will be appended to
				1872	* the filename. Do not do this for piped commands. */
				1873	if (!ispipe && !pid_in_pattern && core_uses_pid) {
				1874	err = cn_printf(cn, ".%d", task_tgid_vnr(current));
				1875	if (err)
				1876	return err;
				1877	}
				1878	out:
				1879	return ispipe;
				1880	}
				1881
				1882	static int zap_process(struct task_struct *start, int exit_code)
				1883	{
				1884	struct task_struct *t;
				1885	int nr = 0;
				1886
				1887	start->signal->flags = SIGNAL_GROUP_EXIT;
				1888	start->signal->group_exit_code = exit_code;
				1889	start->signal->group_stop_count = 0;
				1890
				1891	t = start;
				1892	do {
				1893	task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
				1894	if (t != current && t->mm) {
				1895	#ifndef CONFIG_RAMDUMP
				1896	sigaddset(&t->pending.signal, SIGKILL);
				1897	signal_wake_up(t, 1);
				1898	#endif
				1899	nr++;
				1900	}
				1901	} while_each_thread(start, t);
				1902
				1903	return nr;
				1904	}
				1905
				1906	static inline int zap_threads(struct task_struct tsk, struct mm_struct mm,
				1907	struct core_state *core_state, int exit_code)
				1908	{
				1909	struct task_struct g, p;
				1910	unsigned long flags;
				1911	int nr = -EAGAIN;
				1912
				1913	spin_lock_irq(&tsk->sighand->siglock);
				1914	if (!signal_group_exit(tsk->signal)) {
				1915	mm->core_state = core_state;
				1916	nr = zap_process(tsk, exit_code);
				1917	}
				1918	spin_unlock_irq(&tsk->sighand->siglock);
				1919	if (unlikely(nr < 0))
				1920	return nr;
				1921
				1922	if (atomic_read(&mm->mm_users) == nr + 1)
				1923	goto done;
				1924	/*
				1925	* We should find and kill all tasks which use this mm, and we should
				1926	* count them correctly into ->nr_threads. We don't take tasklist
				1927	* lock, but this is safe wrt:
				1928	*
				1929	* fork:
				1930	* None of sub-threads can fork after zap_process(leader). All
				1931	* processes which were created before this point should be
				1932	* visible to zap_threads() because copy_process() adds the new
				1933	* process to the tail of init_task.tasks list, and lock/unlock
				1934	* of ->siglock provides a memory barrier.
				1935	*
				1936	* do_exit:
				1937	* The caller holds mm->mmap_sem. This means that the task which
				1938	* uses this mm can't pass exit_mm(), so it can't exit or clear
				1939	* its ->mm.
				1940	*
				1941	* de_thread:
				1942	* It does list_replace_rcu(&leader->tasks, &current->tasks),
				1943	* we must see either old or new leader, this does not matter.
				1944	* However, it can change p->sighand, so lock_task_sighand(p)
				1945	* must be used. Since p->mm != NULL and we hold ->mmap_sem
				1946	* it can't fail.
				1947	*
				1948	* Note also that "g" can be the old leader with ->mm == NULL
				1949	* and already unhashed and thus removed from ->thread_group.
				1950	* This is OK, __unhash_process()->list_del_rcu() does not
				1951	* clear the ->next pointer, we will find the new leader via
				1952	* next_thread().
				1953	*/
				1954	rcu_read_lock();
				1955	for_each_process(g) {
				1956	if (g == tsk->group_leader)
				1957	continue;
				1958	if (g->flags & PF_KTHREAD)
				1959	continue;
				1960	p = g;
				1961	do {
				1962	if (p->mm) {
				1963	if (unlikely(p->mm == mm)) {
				1964	lock_task_sighand(p, &flags);
				1965	nr += zap_process(p, exit_code);
				1966	unlock_task_sighand(p, &flags);
				1967	}
				1968	break;
				1969	}
				1970	} while_each_thread(g, p);
				1971	}
				1972	rcu_read_unlock();
				1973	done:
				1974	atomic_set(&core_state->nr_threads, nr);
				1975	return nr;
				1976	}
				1977
				1978	static int coredump_wait(int exit_code, struct core_state *core_state)
				1979	{
				1980	struct task_struct *tsk = current;
				1981	struct mm_struct *mm = tsk->mm;
				1982	int core_waiters = -EBUSY;
				1983
				1984	init_completion(&core_state->startup);
				1985	core_state->dumper.task = tsk;
				1986	core_state->dumper.next = NULL;
				1987
				1988	down_write(&mm->mmap_sem);
				1989	if (!mm->core_state)
				1990	core_waiters = zap_threads(tsk, mm, core_state, exit_code);
				1991	up_write(&mm->mmap_sem);
				1992	#ifndef CONFIG_RAMDUMP
				1993	if (core_waiters > 0)
				1994	wait_for_completion(&core_state->startup);
				1995	#endif
				1996
				1997	return core_waiters;
				1998	}
				1999
				2000	static void coredump_finish(struct mm_struct *mm)
				2001	{
				2002	struct core_thread curr, next;
				2003	struct task_struct *task;
				2004
				2005	next = mm->core_state->dumper.next;
				2006	while ((curr = next) != NULL) {
				2007	next = curr->next;
				2008	task = curr->task;
				2009	/*
				2010	* see exit_mm(), curr->task must not see
				2011	* ->task == NULL before we read ->next.
				2012	*/
				2013	smp_mb();
				2014	curr->task = NULL;
				2015	wake_up_process(task);
				2016	}
				2017
				2018	mm->core_state = NULL;
				2019	}
				2020
				2021	/*
				2022	* set_dumpable converts traditional three-value dumpable to two flags and
				2023	* stores them into mm->flags. It modifies lower two bits of mm->flags, but
				2024	* these bits are not changed atomically. So get_dumpable can observe the
				2025	* intermediate state. To avoid doing unexpected behavior, get get_dumpable
				2026	* return either old dumpable or new one by paying attention to the order of
				2027	* modifying the bits.
				2028	*
				2029	* dumpable \| mm->flags (binary)
				2030	* old new \| initial interim final
				2031	* ---------+-----------------------
				2032	* 0 1 \| 00 01 01
				2033	* 0 2 \| 00 10(*) 11
				2034	* 1 0 \| 01 00 00
				2035	* 1 2 \| 01 11 11
				2036	* 2 0 \| 11 10(*) 00
				2037	* 2 1 \| 11 11 01
				2038	*
				2039	* (*) get_dumpable regards interim value of 10 as 11.
				2040	*/
				2041	void set_dumpable(struct mm_struct *mm, int value)
				2042	{
				2043	switch (value) {
				2044	case 0:
				2045	clear_bit(MMF_DUMPABLE, &mm->flags);
				2046	smp_wmb();
				2047	clear_bit(MMF_DUMP_SECURELY, &mm->flags);
				2048	break;
				2049	case 1:
				2050	set_bit(MMF_DUMPABLE, &mm->flags);
				2051	smp_wmb();
				2052	clear_bit(MMF_DUMP_SECURELY, &mm->flags);
				2053	break;
				2054	case 2:
				2055	set_bit(MMF_DUMP_SECURELY, &mm->flags);
				2056	smp_wmb();
				2057	set_bit(MMF_DUMPABLE, &mm->flags);
				2058	break;
				2059	}
				2060	}
				2061
				2062	static int __get_dumpable(unsigned long mm_flags)
				2063	{
				2064	int ret;
				2065
				2066	ret = mm_flags & MMF_DUMPABLE_MASK;
				2067	return (ret >= 2) ? 2 : ret;
				2068	}
				2069
				2070	/*
				2071	* This returns the actual value of the suid_dumpable flag. For things
				2072	* that are using this for checking for privilege transitions, it must
				2073	* test against SUID_DUMP_USER rather than treating it as a boolean
				2074	* value.
				2075	*/
				2076	int get_dumpable(struct mm_struct *mm)
				2077	{
				2078	return __get_dumpable(mm->flags);
				2079	}
				2080
				2081	static void wait_for_dump_helpers(struct file *file)
				2082	{
				2083	struct pipe_inode_info *pipe;
				2084
				2085	pipe = file->f_path.dentry->d_inode->i_pipe;
				2086
				2087	pipe_lock(pipe);
				2088	pipe->readers++;
				2089	pipe->writers--;
				2090
				2091	while ((pipe->readers > 1) && (!signal_pending(current))) {
				2092	wake_up_interruptible_sync(&pipe->wait);
				2093	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
				2094	pipe_wait(pipe);
				2095	}
				2096
				2097	pipe->readers--;
				2098	pipe->writers++;
				2099	pipe_unlock(pipe);
				2100
				2101	}
				2102
				2103
				2104	/*
				2105	* umh_pipe_setup
				2106	* helper function to customize the process used
				2107	* to collect the core in userspace. Specifically
				2108	* it sets up a pipe and installs it as fd 0 (stdin)
				2109	* for the process. Returns 0 on success, or
				2110	* PTR_ERR on failure.
				2111	* Note that it also sets the core limit to 1. This
				2112	* is a special value that we use to trap recursive
				2113	* core dumps
				2114	*/
				2115	static int umh_pipe_setup(struct subprocess_info info, struct cred new)
				2116	{
				2117	struct file rp, wp;
				2118	struct fdtable *fdt;
				2119	struct coredump_params cp = (struct coredump_params )info->data;
				2120	struct files_struct *cf = current->files;
				2121
				2122	wp = create_write_pipe(0);
				2123	if (IS_ERR(wp))
				2124	return PTR_ERR(wp);
				2125
				2126	rp = create_read_pipe(wp, 0);
				2127	if (IS_ERR(rp)) {
				2128	free_write_pipe(wp);
				2129	return PTR_ERR(rp);
				2130	}
				2131
				2132	cp->file = wp;
				2133
				2134	sys_close(0);
				2135	fd_install(0, rp);
				2136	spin_lock(&cf->file_lock);
				2137	fdt = files_fdtable(cf);
				2138	__set_open_fd(0, fdt);
				2139	__clear_close_on_exec(0, fdt);
				2140	spin_unlock(&cf->file_lock);
				2141
				2142	/* and disallow core files too */
				2143	current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
				2144
				2145	return 0;
				2146	}
				2147
				#ifdef CONFIG_RAMDUMP
				/*
				 * User-space ramdump support.
				 *
				 * sysctl_ramdump_on_user defaults to 1 (enabled); do_coredump() tests it
				 * to decide whether a user-space core dump event panics the kernel.
				 */
				int sysctl_ramdump_on_user = 1;

				/* ramdump entry point, defined outside this file */
				extern void ramdump_entry(void);
				#endif
				2153
				2154	void do_coredump(long signr, int exit_code, struct pt_regs *regs)
				2155	{
				2156	struct core_state core_state;
				2157	struct core_name cn;
				2158	struct mm_struct *mm = current->mm;
				2159	struct linux_binfmt * binfmt;
				2160	const struct cred *old_cred;
				2161	struct cred *cred;
				2162	int retval = 0;
				2163	int flag = 0;
				2164	int ispipe;
				2165	static atomic_t core_dump_count = ATOMIC_INIT(0);
				2166	struct coredump_params cprm = {
				2167	.signr = signr,
				2168	.regs = regs,
				2169	.limit = rlimit(RLIMIT_CORE),
				2170	/*
				2171	* We must use the same mm->flags while dumping core to avoid
				2172	* inconsistency of bit flags, since this flag is not protected
				2173	* by any locks.
				2174	*/
				2175	.mm_flags = mm->flags,
				2176	};
				2177
				2178	audit_core_dumps(signr);
				2179
				2180	binfmt = mm->binfmt;
				2181	if (!binfmt \|\| !binfmt->core_dump)
				2182	goto fail;
				2183	if (!__get_dumpable(cprm.mm_flags))
				2184	goto fail;
				2185
				2186	cred = prepare_creds();
				2187	if (!cred)
				2188	goto fail;
				2189	/*
				2190	* We cannot trust fsuid as being the "true" uid of the
				2191	* process nor do we know its entire history. We only know it
				2192	* was tainted so we dump it as root in mode 2.
				2193	*/
				2194	if (__get_dumpable(cprm.mm_flags) == 2) {
				2195	/* Setuid core dump mode */
				2196	flag = O_EXCL; /* Stop rewrite attacks */
				2197	cred->fsuid = 0; /* Dump root private */
				2198	}
				2199
				2200	retval = coredump_wait(exit_code, &core_state);
				2201	if (retval < 0)
				2202	goto fail_creds;
				2203
				2204	old_cred = override_creds(cred);
				2205
				2206	/*
				2207	* Clear any false indication of pending signals that might
				2208	* be seen by the filesystem code called to write the core file.
				2209	*/
				2210	clear_thread_flag(TIF_SIGPENDING);
				2211
				2212	ispipe = format_corename(&cn, signr);
				2213
				2214	if (ispipe) {
				2215	int dump_count;
				2216	char **helper_argv;
				2217
				2218	if (ispipe < 0) {
				2219	printk(KERN_WARNING "format_corename failed\n");
				2220	printk(KERN_WARNING "Aborting core\n");
				2221	goto fail_corename;
				2222	}
				2223
				2224	if (cprm.limit == 1) {
				2225	/*
				2226	* Normally core limits are irrelevant to pipes, since
				2227	* we're not writing to the file system, but we use
				2228	* cprm.limit of 1 here as a speacial value. Any
				2229	* non-1 limit gets set to RLIM_INFINITY below, but
				2230	* a limit of 0 skips the dump. This is a consistent
				2231	* way to catch recursive crashes. We can still crash
				2232	* if the core_pattern binary sets RLIM_CORE = !1
				2233	* but it runs as root, and can do lots of stupid things
				2234	* Note that we use task_tgid_vnr here to grab the pid
				2235	* of the process group leader. That way we get the
				2236	* right pid if a thread in a multi-threaded
				2237	* core_pattern process dies.
				2238	*/
				2239	printk(KERN_WARNING
				2240	"Process %d(%s) has RLIMIT_CORE set to 1\n",
				2241	task_tgid_vnr(current), current->comm);
				2242	printk(KERN_WARNING "Aborting core\n");
				2243	goto fail_unlock;
				2244	}
				2245	cprm.limit = RLIM_INFINITY;
				2246
				2247	dump_count = atomic_inc_return(&core_dump_count);
				2248	if (core_pipe_limit && (core_pipe_limit < dump_count)) {
				2249	printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
				2250	task_tgid_vnr(current), current->comm);
				2251	printk(KERN_WARNING "Skipping core dump\n");
				2252	goto fail_dropcount;
				2253	}
				2254
				2255	helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
				2256	if (!helper_argv) {
				2257	printk(KERN_WARNING "%s failed to allocate memory\n",
				2258	__func__);
				2259	goto fail_dropcount;
				2260	}
				2261
				2262	retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
				2263	NULL, UMH_WAIT_EXEC, umh_pipe_setup,
				2264	NULL, &cprm);
				2265	argv_free(helper_argv);
				2266	if (retval) {
				2267	printk(KERN_INFO "Core dump to %s pipe failed\n",
				2268	cn.corename);
				2269	goto close_fail;
				2270	}
				2271	} else {
				2272	struct inode *inode;
				2273
				2274	if (cprm.limit < binfmt->min_coredump)
				2275	goto fail_unlock;
				2276
				2277	cprm.file = filp_open(cn.corename,
				2278	O_CREAT \| 2 \| O_NOFOLLOW \| O_LARGEFILE \| flag,
				2279	0600);
				2280	if (IS_ERR(cprm.file))
				2281	goto fail_unlock;
				2282
				2283	inode = cprm.file->f_path.dentry->d_inode;
				2284	if (inode->i_nlink > 1)
				2285	goto close_fail;
				2286	if (d_unhashed(cprm.file->f_path.dentry))
				2287	goto close_fail;
				2288	/*
				2289	* AK: actually i see no reason to not allow this for named
				2290	* pipes etc, but keep the previous behaviour for now.
				2291	*/
				2292	if (!S_ISREG(inode->i_mode))
				2293	goto close_fail;
				2294	/*
				2295	* Dont allow local users get cute and trick others to coredump
				2296	* into their pre-created files.
				2297	*/
				2298	if (inode->i_uid != current_fsuid())
				2299	goto close_fail;
				2300	if (!cprm.file->f_op \|\| !cprm.file->f_op->write)
				2301	goto close_fail;
				2302	if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
				2303	goto close_fail;
				2304	}
				2305
				2306	retval = binfmt->core_dump(&cprm);
				2307	if (retval)
				2308	current->signal->group_exit_code \|= 0x80;
				2309
				2310	if (ispipe && core_pipe_limit)
				2311	wait_for_dump_helpers(cprm.file);
				2312	close_fail:
				2313	if (cprm.file)
				2314	filp_close(cprm.file, NULL);
				2315	fail_dropcount:
				2316	if (ispipe)
				2317	atomic_dec(&core_dump_count);
				2318	fail_unlock:
				2319	kfree(cn.corename);
				2320	fail_corename:
				2321	coredump_finish(mm);
				2322	revert_creds(old_cred);
				2323	fail_creds:
				2324	put_cred(cred);
				2325	fail:
				2326	/*
				2327	* user ramdump entry
				2328	*/
				2329	#ifdef CONFIG_RAMDUMP
xf.li	e31de8b	2023-12-26 23:38:58 -0800	[diff] [blame]	2330	if(sysctl_ramdump_on_user && (signr != SIGQUIT))
lh	9ed821d	2023-04-07 01:36:19 -0700	[diff] [blame]	2331	panic("User ramdump enabled, user panic\n");//ramdump_entry();
				2332	else
				2333	printk("User ramdump disabled, current process is: %s, pid is %i!\n", current->comm, current->pid);
				2334	#endif
				2335	return;
				2336	}
				2337
				2338	/*
				2339	* Core dumping helper functions. These are the only things you should
				2340	* do on a core-file: use only these functions to write out all the
				2341	* necessary info.
				2342	*/
				2343	int dump_write(struct file file, const void addr, int nr)
				2344	{
				2345	return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
				2346	}
				2347	EXPORT_SYMBOL(dump_write);
				2348
				2349	int dump_seek(struct file *file, loff_t off)
				2350	{
				2351	int ret = 1;
				2352
				2353	if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
				2354	if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
				2355	return 0;
				2356	} else {
				2357	char buf = (char )get_zeroed_page(GFP_KERNEL);
				2358
				2359	if (!buf)
				2360	return 0;
				2361	while (off > 0) {
				2362	unsigned long n = off;
				2363
				2364	if (n > PAGE_SIZE)
				2365	n = PAGE_SIZE;
				2366	if (!dump_write(file, buf, n)) {
				2367	ret = 0;
				2368	break;
				2369	}
				2370	off -= n;
				2371	}
				2372	free_page((unsigned long)buf);
				2373	}
				2374	return ret;
				2375	}
				2376	EXPORT_SYMBOL(dump_seek);