Blame - src/kernel/linux/v4.19/fs/namei.c - T800

blob: 484cf99414405727a98d09bdb71ea12192292cd3 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* linux/fs/namei.c
				4	*
				5	* Copyright (C) 1991, 1992 Linus Torvalds
				6	*/
				7
				8	/*
				9	* Some corrections by tytso.
				10	*/
				11
				12	/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
				13	* lookup logic.
				14	*/
				15	/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
				16	*/
				17
				18	#include <linux/init.h>
				19	#include <linux/export.h>
				20	#include <linux/kernel.h>
				21	#include <linux/slab.h>
				22	#include <linux/fs.h>
				23	#include <linux/namei.h>
				24	#include <linux/pagemap.h>
				25	#include <linux/fsnotify.h>
				26	#include <linux/personality.h>
				27	#include <linux/security.h>
				28	#include <linux/ima.h>
				29	#include <linux/syscalls.h>
				30	#include <linux/mount.h>
				31	#include <linux/audit.h>
				32	#include <linux/capability.h>
				33	#include <linux/file.h>
				34	#include <linux/fcntl.h>
				35	#include <linux/device_cgroup.h>
				36	#include <linux/fs_struct.h>
				37	#include <linux/posix_acl.h>
				38	#include <linux/hash.h>
				39	#include <linux/bitops.h>
				40	#include <linux/init_task.h>
				41	#include <linux/uaccess.h>
				42	#include <linux/build_bug.h>
				43
				44	#include "internal.h"
				45	#include "mount.h"
				46
				47	#define CREATE_TRACE_POINTS
				48	#include <trace/events/namei.h>
				49
				50	/* [Feb-1997 T. Schoebel-Theuer]
				51	* Fundamental changes in the pathname lookup mechanisms (namei)
				52	* were necessary because of omirr. The reason is that omirr needs
				53	* to know the _real_ pathname, not the user-supplied one, in case
				54	* of symlinks (and also when transname replacements occur).
				55	*
				56	* The new code replaces the old recursive symlink resolution with
				57	* an iterative one (in case of non-nested symlink chains). It does
				58	* this with calls to <fs>_follow_link().
				59	* As a side effect, dir_namei(), _namei() and follow_link() are now
				60	* replaced with a single function lookup_dentry() that can handle all
				61	* the special cases of the former code.
				62	*
				63	* With the new dcache, the pathname is stored at each inode, at least as
				64	* long as the refcount of the inode is positive. As a side effect, the
				65	* size of the dcache depends on the inode cache and thus is dynamic.
				66	*
				67	* [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
				68	* resolution to correspond with current state of the code.
				69	*
				70	* Note that the symlink resolution is not completely iterative.
				71	* There is still a significant amount of tail- and mid- recursion in
				72	* the algorithm. Also, note that <fs>_readlink() is not used in
				73	* lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
				74	* may return different results than <fs>_follow_link(). Many virtual
				75	* filesystems (including /proc) exhibit this behavior.
				76	*/
				77
				78	/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
				79	* New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
				80	* and the name already exists in form of a symlink, try to create the new
				81	* name indicated by the symlink. The old code always complained that the
				82	* name already exists, due to not following the symlink even if its target
				83	* is nonexistent. The new semantics affects also mknod() and link() when
				84	* the name is a symlink pointing to a non-existent name.
				85	*
				86	* I don't know which semantics is the right one, since I have no access
				87	* to standards. But I found by trial that HP-UX 9.0 has the full "new"
				88	* semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
				89	* "old" one. Personally, I think the new semantics is much more logical.
				90	* Note that "ln old new" where "new" is a symlink pointing to a non-existing
				91	* file does succeed in both HP-UX and SunOs, but not in Solaris
				92	* and in the old Linux semantics.
				93	*/
				94
				95	/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
				96	* semantics. See the comments in "open_namei" and "do_link" below.
				97	*
				98	* [10-Sep-98 Alan Modra] Another symlink change.
				99	*/
				100
				101	/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
				102	* inside the path - always follow.
				103	* in the last component in creation/removal/renaming - never follow.
				104	* if LOOKUP_FOLLOW passed - follow.
				105	* if the pathname has trailing slashes - follow.
				106	* otherwise - don't follow.
				107	* (applied in that order).
				108	*
				109	* [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
				110	* restored for 2.4. This is the last surviving part of old 4.2BSD bug.
				111	* During the 2.4 we need to fix the userland stuff depending on it -
				112	* hopefully we will be able to get rid of that wart in 2.5. So far only
				113	* XEmacs seems to be relying on it...
				114	*/
				115	/*
				116	* [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
				117	* implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives
				118	* any extra contention...
				119	*/
				120
				121	/* In order to reduce some races, while at the same time doing additional
				122	* checking and hopefully speeding things up, we copy filenames to the
				123	* kernel data space before using them..
				124	*
				125	* POSIX.1 2.4: an empty pathname is invalid (ENOENT).
				126	* PATH_MAX includes the nul terminator --RR.
				127	*/
				128
				129	#define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname))
				130
				131	struct filename *
				132	getname_flags(const char __user filename, int flags, int empty)
				133	{
				134	struct filename *result;
				135	char *kname;
				136	int len;
				137	BUILD_BUG_ON(offsetof(struct filename, iname) % sizeof(long) != 0);
				138
				139	result = audit_reusename(filename);
				140	if (result)
				141	return result;
				142
				143	result = __getname();
				144	if (unlikely(!result))
				145	return ERR_PTR(-ENOMEM);
				146
				147	/*
				148	* First, try to embed the struct filename inside the names_cache
				149	* allocation
				150	*/
				151	kname = (char *)result->iname;
				152	result->name = kname;
				153
				154	len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
				155	if (unlikely(len < 0)) {
				156	__putname(result);
				157	return ERR_PTR(len);
				158	}
				159
				160	/*
				161	* Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
				162	* separate struct filename so we can dedicate the entire
				163	* names_cache allocation for the pathname, and re-do the copy from
				164	* userland.
				165	*/
				166	if (unlikely(len == EMBEDDED_NAME_MAX)) {
				167	const size_t size = offsetof(struct filename, iname[1]);
				168	kname = (char *)result;
				169
				170	/*
				171	* size is chosen that way we to guarantee that
				172	* result->iname[0] is within the same object and that
				173	* kname can't be equal to result->iname, no matter what.
				174	*/
				175	result = kzalloc(size, GFP_KERNEL);
				176	if (unlikely(!result)) {
				177	__putname(kname);
				178	return ERR_PTR(-ENOMEM);
				179	}
				180	result->name = kname;
				181	len = strncpy_from_user(kname, filename, PATH_MAX);
				182	if (unlikely(len < 0)) {
				183	__putname(kname);
				184	kfree(result);
				185	return ERR_PTR(len);
				186	}
				187	if (unlikely(len == PATH_MAX)) {
				188	__putname(kname);
				189	kfree(result);
				190	return ERR_PTR(-ENAMETOOLONG);
				191	}
				192	}
				193
				194	result->refcnt = 1;
				195	/* The empty path is special. */
				196	if (unlikely(!len)) {
				197	if (empty)
				198	*empty = 1;
				199	if (!(flags & LOOKUP_EMPTY)) {
				200	putname(result);
				201	return ERR_PTR(-ENOENT);
				202	}
				203	}
				204
				205	result->uptr = filename;
				206	result->aname = NULL;
				207	audit_getname(result);
				208	return result;
				209	}
				210
				211	struct filename *
				212	getname(const char __user * filename)
				213	{
				214	return getname_flags(filename, 0, NULL);
				215	}
				216
				217	struct filename *
				218	getname_kernel(const char * filename)
				219	{
				220	struct filename *result;
				221	int len = strlen(filename) + 1;
				222
				223	result = __getname();
				224	if (unlikely(!result))
				225	return ERR_PTR(-ENOMEM);
				226
				227	if (len <= EMBEDDED_NAME_MAX) {
				228	result->name = (char *)result->iname;
				229	} else if (len <= PATH_MAX) {
				230	const size_t size = offsetof(struct filename, iname[1]);
				231	struct filename *tmp;
				232
				233	tmp = kmalloc(size, GFP_KERNEL);
				234	if (unlikely(!tmp)) {
				235	__putname(result);
				236	return ERR_PTR(-ENOMEM);
				237	}
				238	tmp->name = (char *)result;
				239	result = tmp;
				240	} else {
				241	__putname(result);
				242	return ERR_PTR(-ENAMETOOLONG);
				243	}
				244	memcpy((char *)result->name, filename, len);
				245	result->uptr = NULL;
				246	result->aname = NULL;
				247	result->refcnt = 1;
				248	audit_getname(result);
				249
				250	return result;
				251	}
				252
				253	void putname(struct filename *name)
				254	{
				255	BUG_ON(name->refcnt <= 0);
				256
				257	if (--name->refcnt > 0)
				258	return;
				259
				260	if (name->name != name->iname) {
				261	__putname(name->name);
				262	kfree(name);
				263	} else
				264	__putname(name);
				265	}
				266
				267	static int check_acl(struct inode *inode, int mask)
				268	{
				269	#ifdef CONFIG_FS_POSIX_ACL
				270	struct posix_acl *acl;
				271
				272	if (mask & MAY_NOT_BLOCK) {
				273	acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
				274	if (!acl)
				275	return -EAGAIN;
				276	/* no ->get_acl() calls in RCU mode... */
				277	if (is_uncached_acl(acl))
				278	return -ECHILD;
				279	return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
				280	}
				281
				282	acl = get_acl(inode, ACL_TYPE_ACCESS);
				283	if (IS_ERR(acl))
				284	return PTR_ERR(acl);
				285	if (acl) {
				286	int error = posix_acl_permission(inode, acl, mask);
				287	posix_acl_release(acl);
				288	return error;
				289	}
				290	#endif
				291
				292	return -EAGAIN;
				293	}
				294
				295	/*
				296	* This does the basic permission checking
				297	*/
				298	static int acl_permission_check(struct inode *inode, int mask)
				299	{
				300	unsigned int mode = inode->i_mode;
				301
				302	if (likely(uid_eq(current_fsuid(), inode->i_uid)))
				303	mode >>= 6;
				304	else {
				305	if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
				306	int error = check_acl(inode, mask);
				307	if (error != -EAGAIN)
				308	return error;
				309	}
				310
				311	if (in_group_p(inode->i_gid))
				312	mode >>= 3;
				313	}
				314
				315	/*
				316	* If the DACs are ok we don't need any capability check.
				317	*/
				318	if ((mask & ~mode & (MAY_READ \| MAY_WRITE \| MAY_EXEC)) == 0)
				319	return 0;
				320	return -EACCES;
				321	}
				322
				323	/**
				324	* generic_permission - check for access rights on a Posix-like filesystem
				325	* @inode: inode to check access rights for
				326	* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
				327	*
				328	* Used to check for read/write/execute permissions on a file.
				329	* We use "fsuid" for this, letting us set arbitrary permissions
				330	* for filesystem access without changing the "normal" uids which
				331	* are used for other things.
				332	*
				333	* generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
				334	* request cannot be satisfied (eg. requires blocking or too much complexity).
				335	* It would then be called again in ref-walk mode.
				336	*/
				337	int generic_permission(struct inode *inode, int mask)
				338	{
				339	int ret;
				340
				341	/*
				342	* Do the basic permission checks.
				343	*/
				344	ret = acl_permission_check(inode, mask);
				345	if (ret != -EACCES)
				346	return ret;
				347
				348	if (S_ISDIR(inode->i_mode)) {
				349	/* DACs are overridable for directories */
				350	if (!(mask & MAY_WRITE))
				351	if (capable_wrt_inode_uidgid(inode,
				352	CAP_DAC_READ_SEARCH))
				353	return 0;
				354	if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
				355	return 0;
				356	return -EACCES;
				357	}
				358
				359	/*
				360	* Searching includes executable on directories, else just read.
				361	*/
				362	mask &= MAY_READ \| MAY_WRITE \| MAY_EXEC;
				363	if (mask == MAY_READ)
				364	if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
				365	return 0;
				366	/*
				367	* Read/write DACs are always overridable.
				368	* Executable DACs are overridable when there is
				369	* at least one exec bit set.
				370	*/
				371	if (!(mask & MAY_EXEC) \|\| (inode->i_mode & S_IXUGO))
				372	if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
				373	return 0;
				374
				375	return -EACCES;
				376	}
				377	EXPORT_SYMBOL(generic_permission);
				378
				379	/*
				380	* We _really_ want to just do "generic_permission()" without
				381	* even looking at the inode->i_op values. So we keep a cache
				382	* flag in inode->i_opflags, that says "this has not special
				383	* permission function, use the fast case".
				384	*/
				385	static inline int do_inode_permission(struct vfsmount mnt, struct inode inode, int mask)
				386	{
				387	if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
				388	if (likely(mnt && inode->i_op->permission2))
				389	return inode->i_op->permission2(mnt, inode, mask);
				390	if (likely(inode->i_op->permission))
				391	return inode->i_op->permission(inode, mask);
				392
				393	/* This gets set once for the inode lifetime */
				394	spin_lock(&inode->i_lock);
				395	inode->i_opflags \|= IOP_FASTPERM;
				396	spin_unlock(&inode->i_lock);
				397	}
				398	return generic_permission(inode, mask);
				399	}
				400
				401	/**
				402	* sb_permission - Check superblock-level permissions
				403	* @sb: Superblock of inode to check permission on
				404	* @inode: Inode to check permission on
				405	* @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
				406	*
				407	* Separate out file-system wide checks from inode-specific permission checks.
				408	*/
				409	static int sb_permission(struct super_block sb, struct inode inode, int mask)
				410	{
				411	if (unlikely(mask & MAY_WRITE)) {
				412	umode_t mode = inode->i_mode;
				413
				414	/* Nobody gets write access to a read-only fs. */
				415	if (sb_rdonly(sb) && (S_ISREG(mode) \|\| S_ISDIR(mode) \|\| S_ISLNK(mode)))
				416	return -EROFS;
				417	}
				418	return 0;
				419	}
				420
				421	/**
				422	* inode_permission2 - Check for access rights to a given inode
				423	* @mnt:
				424	* @inode: Inode to check permission on
				425	* @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
				426	*
				427	* Check for read/write/execute permissions on an inode. We use fs[ug]id for
				428	* this, letting us set arbitrary permissions for filesystem access without
				429	* changing the "normal" UIDs which are used for other things.
				430	*
				431	* When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
				432	*/
				433	int inode_permission2(struct vfsmount mnt, struct inode inode, int mask)
				434	{
				435	int retval;
				436
				437	retval = sb_permission(inode->i_sb, inode, mask);
				438	if (retval)
				439	return retval;
				440
				441	if (unlikely(mask & MAY_WRITE)) {
				442	/*
				443	* Nobody gets write access to an immutable file.
				444	*/
				445	if (IS_IMMUTABLE(inode))
				446	return -EPERM;
				447
				448	/*
				449	* Updating mtime will likely cause i_uid and i_gid to be
				450	* written back improperly if their true value is unknown
				451	* to the vfs.
				452	*/
				453	if (HAS_UNMAPPED_ID(inode))
				454	return -EACCES;
				455	}
				456
				457	retval = do_inode_permission(mnt, inode, mask);
				458	if (retval)
				459	return retval;
				460
				461	retval = devcgroup_inode_permission(inode, mask);
				462	if (retval)
				463	return retval;
				464
				465	retval = security_inode_permission(inode, mask);
				466	return retval;
				467	}
				468	EXPORT_SYMBOL(inode_permission2);
				469
				470	int inode_permission(struct inode *inode, int mask)
				471	{
				472	return inode_permission2(NULL, inode, mask);
				473	}
				474	EXPORT_SYMBOL(inode_permission);
				475
				476	/**
				477	* path_get - get a reference to a path
				478	* @path: path to get the reference to
				479	*
				480	* Given a path increment the reference count to the dentry and the vfsmount.
				481	*/
				482	void path_get(const struct path *path)
				483	{
				484	mntget(path->mnt);
				485	dget(path->dentry);
				486	}
				487	EXPORT_SYMBOL(path_get);
				488
				489	/**
				490	* path_put - put a reference to a path
				491	* @path: path to put the reference to
				492	*
				493	* Given a path decrement the reference count to the dentry and the vfsmount.
				494	*/
				495	void path_put(const struct path *path)
				496	{
				497	dput(path->dentry);
				498	mntput(path->mnt);
				499	}
				500	EXPORT_SYMBOL(path_put);
				501
				502	#define EMBEDDED_LEVELS 2
				503	struct nameidata {
				504	struct path path;
				505	struct qstr last;
				506	struct path root;
				507	struct inode inode; / path.dentry.d_inode */
				508	unsigned int flags;
				509	unsigned seq, m_seq;
				510	int last_type;
				511	unsigned depth;
				512	int total_link_count;
				513	struct saved {
				514	struct path link;
				515	struct delayed_call done;
				516	const char *name;
				517	unsigned seq;
				518	} *stack, internal[EMBEDDED_LEVELS];
				519	struct filename *name;
				520	struct nameidata *saved;
				521	struct inode *link_inode;
				522	unsigned root_seq;
				523	int dfd;
				524	} __randomize_layout;
				525
				526	static void set_nameidata(struct nameidata p, int dfd, struct filename name)
				527	{
				528	struct nameidata *old = current->nameidata;
				529	p->stack = p->internal;
				530	p->dfd = dfd;
				531	p->name = name;
				532	p->total_link_count = old ? old->total_link_count : 0;
				533	p->saved = old;
				534	current->nameidata = p;
				535	}
				536
				537	static void restore_nameidata(void)
				538	{
				539	struct nameidata now = current->nameidata, old = now->saved;
				540
				541	current->nameidata = old;
				542	if (old)
				543	old->total_link_count = now->total_link_count;
				544	if (now->stack != now->internal)
				545	kfree(now->stack);
				546	}
				547
				548	static int __nd_alloc_stack(struct nameidata *nd)
				549	{
				550	struct saved *p;
				551
				552	if (nd->flags & LOOKUP_RCU) {
				553	p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
				554	GFP_ATOMIC);
				555	if (unlikely(!p))
				556	return -ECHILD;
				557	} else {
				558	p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
				559	GFP_KERNEL);
				560	if (unlikely(!p))
				561	return -ENOMEM;
				562	}
				563	memcpy(p, nd->internal, sizeof(nd->internal));
				564	nd->stack = p;
				565	return 0;
				566	}
				567
				568	/**
				569	* path_connected - Verify that a path->dentry is below path->mnt.mnt_root
				570	* @path: nameidate to verify
				571	*
				572	* Rename can sometimes move a file or directory outside of a bind
				573	* mount, path_connected allows those cases to be detected.
				574	*/
				575	static bool path_connected(const struct path *path)
				576	{
				577	struct vfsmount *mnt = path->mnt;
				578	struct super_block *sb = mnt->mnt_sb;
				579
				580	/* Bind mounts and multi-root filesystems can have disconnected paths */
				581	if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root))
				582	return true;
				583
				584	return is_subdir(path->dentry, mnt->mnt_root);
				585	}
				586
				587	static inline int nd_alloc_stack(struct nameidata *nd)
				588	{
				589	if (likely(nd->depth != EMBEDDED_LEVELS))
				590	return 0;
				591	if (likely(nd->stack != nd->internal))
				592	return 0;
				593	return __nd_alloc_stack(nd);
				594	}
				595
				596	static void drop_links(struct nameidata *nd)
				597	{
				598	int i = nd->depth;
				599	while (i--) {
				600	struct saved *last = nd->stack + i;
				601	do_delayed_call(&last->done);
				602	clear_delayed_call(&last->done);
				603	}
				604	}
				605
				606	static void terminate_walk(struct nameidata *nd)
				607	{
				608	drop_links(nd);
				609	if (!(nd->flags & LOOKUP_RCU)) {
				610	int i;
				611	path_put(&nd->path);
				612	for (i = 0; i < nd->depth; i++)
				613	path_put(&nd->stack[i].link);
				614	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
				615	path_put(&nd->root);
				616	nd->root.mnt = NULL;
				617	}
				618	} else {
				619	nd->flags &= ~LOOKUP_RCU;
				620	if (!(nd->flags & LOOKUP_ROOT))
				621	nd->root.mnt = NULL;
				622	rcu_read_unlock();
				623	}
				624	nd->depth = 0;
				625	}
				626
				627	/* path_put is needed afterwards regardless of success or failure */
				628	static bool legitimize_path(struct nameidata *nd,
				629	struct path *path, unsigned seq)
				630	{
				631	int res = __legitimize_mnt(path->mnt, nd->m_seq);
				632	if (unlikely(res)) {
				633	if (res > 0)
				634	path->mnt = NULL;
				635	path->dentry = NULL;
				636	return false;
				637	}
				638	if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
				639	path->dentry = NULL;
				640	return false;
				641	}
				642	return !read_seqcount_retry(&path->dentry->d_seq, seq);
				643	}
				644
				645	static bool legitimize_links(struct nameidata *nd)
				646	{
				647	int i;
				648	for (i = 0; i < nd->depth; i++) {
				649	struct saved *last = nd->stack + i;
				650	if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
				651	drop_links(nd);
				652	nd->depth = i + 1;
				653	return false;
				654	}
				655	}
				656	return true;
				657	}
				658
				659	/*
				660	* Path walking has 2 modes, rcu-walk and ref-walk (see
				661	* Documentation/filesystems/path-lookup.txt). In situations when we can't
				662	* continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
				663	* normal reference counts on dentries and vfsmounts to transition to ref-walk
				664	* mode. Refcounts are grabbed at the last known good point before rcu-walk
				665	* got stuck, so ref-walk may continue from there. If this is not successful
				666	* (eg. a seqcount has changed), then failure is returned and it's up to caller
				667	* to restart the path walk from the beginning in ref-walk mode.
				668	*/
				669
				670	/**
				671	* unlazy_walk - try to switch to ref-walk mode.
				672	* @nd: nameidata pathwalk data
				673	* Returns: 0 on success, -ECHILD on failure
				674	*
				675	* unlazy_walk attempts to legitimize the current nd->path and nd->root
				676	* for ref-walk mode.
				677	* Must be called from rcu-walk context.
				678	* Nothing should touch nameidata between unlazy_walk() failure and
				679	* terminate_walk().
				680	*/
				681	static int unlazy_walk(struct nameidata *nd)
				682	{
				683	struct dentry *parent = nd->path.dentry;
				684
				685	BUG_ON(!(nd->flags & LOOKUP_RCU));
				686
				687	nd->flags &= ~LOOKUP_RCU;
				688	if (unlikely(!legitimize_links(nd)))
				689	goto out2;
				690	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
				691	goto out1;
				692	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
				693	if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq)))
				694	goto out;
				695	}
				696	rcu_read_unlock();
				697	BUG_ON(nd->inode != parent->d_inode);
				698	return 0;
				699
				700	out2:
				701	nd->path.mnt = NULL;
				702	nd->path.dentry = NULL;
				703	out1:
				704	if (!(nd->flags & LOOKUP_ROOT))
				705	nd->root.mnt = NULL;
				706	out:
				707	rcu_read_unlock();
				708	return -ECHILD;
				709	}
				710
				711	/**
				712	* unlazy_child - try to switch to ref-walk mode.
				713	* @nd: nameidata pathwalk data
				714	* @dentry: child of nd->path.dentry
				715	* @seq: seq number to check dentry against
				716	* Returns: 0 on success, -ECHILD on failure
				717	*
				718	* unlazy_child attempts to legitimize the current nd->path, nd->root and dentry
				719	* for ref-walk mode. @dentry must be a path found by a do_lookup call on
				720	* @nd. Must be called from rcu-walk context.
				721	* Nothing should touch nameidata between unlazy_child() failure and
				722	* terminate_walk().
				723	*/
				724	static int unlazy_child(struct nameidata nd, struct dentry dentry, unsigned seq)
				725	{
				726	BUG_ON(!(nd->flags & LOOKUP_RCU));
				727
				728	nd->flags &= ~LOOKUP_RCU;
				729	if (unlikely(!legitimize_links(nd)))
				730	goto out2;
				731	if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
				732	goto out2;
				733	if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
				734	goto out1;
				735
				736	/*
				737	* We need to move both the parent and the dentry from the RCU domain
				738	* to be properly refcounted. And the sequence number in the dentry
				739	* validates both dentry counters, since we checked the sequence
				740	* number of the parent after we got the child sequence number. So we
				741	* know the parent must still be valid if the child sequence number is
				742	*/
				743	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
				744	goto out;
				745	if (unlikely(read_seqcount_retry(&dentry->d_seq, seq))) {
				746	rcu_read_unlock();
				747	dput(dentry);
				748	goto drop_root_mnt;
				749	}
				750	/*
				751	* Sequence counts matched. Now make sure that the root is
				752	* still valid and get it if required.
				753	*/
				754	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
				755	if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) {
				756	rcu_read_unlock();
				757	dput(dentry);
				758	return -ECHILD;
				759	}
				760	}
				761
				762	rcu_read_unlock();
				763	return 0;
				764
				765	out2:
				766	nd->path.mnt = NULL;
				767	out1:
				768	nd->path.dentry = NULL;
				769	out:
				770	rcu_read_unlock();
				771	drop_root_mnt:
				772	if (!(nd->flags & LOOKUP_ROOT))
				773	nd->root.mnt = NULL;
				774	return -ECHILD;
				775	}
				776
				777	static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
				778	{
				779	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
				780	return dentry->d_op->d_revalidate(dentry, flags);
				781	else
				782	return 1;
				783	}
				784
				785	#define INIT_PATH_SIZE 64
				786
				787	static void success_walk_trace(struct nameidata *nd)
				788	{
				789	struct path *pt = &nd->path;
				790	struct inode *i = nd->inode;
				791	char buf[INIT_PATH_SIZE], *try_buf;
				792	int cur_path_size;
				793	char *p;
				794
				795	/* When eBPF/ tracepoint is disabled, keep overhead low. */
				796	if (!trace_inodepath_enabled())
				797	return;
				798
				799	/* First try stack allocated buffer. */
				800	try_buf = buf;
				801	cur_path_size = INIT_PATH_SIZE;
				802
				803	while (cur_path_size <= PATH_MAX) {
				804	/* Free previous heap allocation if we are now trying
				805	* a second or later heap allocation.
				806	*/
				807	if (try_buf != buf)
				808	kfree(try_buf);
				809
				810	/* All but the first alloc are on the heap. */
				811	if (cur_path_size != INIT_PATH_SIZE) {
				812	try_buf = kmalloc(cur_path_size, GFP_KERNEL);
				813	if (!try_buf) {
				814	try_buf = buf;
				815	sprintf(try_buf, "error:buf_alloc_failed");
				816	break;
				817	}
				818	}
				819
				820	p = d_path(pt, try_buf, cur_path_size);
				821
				822	if (!IS_ERR(p)) {
				823	char *end = mangle_path(try_buf, p, "\n");
				824
				825	if (end) {
				826	try_buf[end - try_buf] = 0;
				827	break;
				828	} else {
				829	/* On mangle errors, double path size
				830	* till PATH_MAX.
				831	*/
				832	cur_path_size = cur_path_size << 1;
				833	continue;
				834	}
				835	}
				836
				837	if (PTR_ERR(p) == -ENAMETOOLONG) {
				838	/* If d_path complains that name is too long,
				839	* then double path size till PATH_MAX.
				840	*/
				841	cur_path_size = cur_path_size << 1;
				842	continue;
				843	}
				844
				845	sprintf(try_buf, "error:d_path_failed_%lu",
				846	-1 * PTR_ERR(p));
				847	break;
				848	}
				849
				850	if (cur_path_size > PATH_MAX)
				851	sprintf(try_buf, "error:d_path_name_too_long");
				852
				853	trace_inodepath(i, try_buf);
				854
				855	if (try_buf != buf)
				856	kfree(try_buf);
				857	return;
				858	}
				859
				860	/**
				861	* complete_walk - successful completion of path walk
				862	* @nd: pointer nameidata
				863	*
				864	* If we had been in RCU mode, drop out of it and legitimize nd->path.
				865	* Revalidate the final result, unless we'd already done that during
				866	* the path walk or the filesystem doesn't ask for it. Return 0 on
				867	* success, -error on failure. In case of failure caller does not
				868	* need to drop nd->path.
				869	*/
				870	static int complete_walk(struct nameidata *nd)
				871	{
				872	struct dentry *dentry = nd->path.dentry;
				873	int status;
				874
				875	if (nd->flags & LOOKUP_RCU) {
				876	if (!(nd->flags & LOOKUP_ROOT))
				877	nd->root.mnt = NULL;
				878	if (unlikely(unlazy_walk(nd)))
				879	return -ECHILD;
				880	}
				881
				882	if (likely(!(nd->flags & LOOKUP_JUMPED))) {
				883	success_walk_trace(nd);
				884	return 0;
				885	}
				886
				887	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE))) {
				888	success_walk_trace(nd);
				889	return 0;
				890	}
				891
				892	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
				893	if (status > 0) {
				894	success_walk_trace(nd);
				895	return 0;
				896	}
				897
				898	if (!status)
				899	status = -ESTALE;
				900
				901	return status;
				902	}
				903
				904	static void set_root(struct nameidata *nd)
				905	{
				906	struct fs_struct *fs = current->fs;
				907
				908	if (nd->flags & LOOKUP_RCU) {
				909	unsigned seq;
				910
				911	do {
				912	seq = read_seqcount_begin(&fs->seq);
				913	nd->root = fs->root;
				914	nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
				915	} while (read_seqcount_retry(&fs->seq, seq));
				916	} else {
				917	get_fs_root(fs, &nd->root);
				918	}
				919	}
				920
				921	static void path_put_conditional(struct path path, struct nameidata nd)
				922	{
				923	dput(path->dentry);
				924	if (path->mnt != nd->path.mnt)
				925	mntput(path->mnt);
				926	}
				927
				928	static inline void path_to_nameidata(const struct path *path,
				929	struct nameidata *nd)
				930	{
				931	if (!(nd->flags & LOOKUP_RCU)) {
				932	dput(nd->path.dentry);
				933	if (nd->path.mnt != path->mnt)
				934	mntput(nd->path.mnt);
				935	}
				936	nd->path.mnt = path->mnt;
				937	nd->path.dentry = path->dentry;
				938	}
				939
				940	static int nd_jump_root(struct nameidata *nd)
				941	{
				942	if (nd->flags & LOOKUP_RCU) {
				943	struct dentry *d;
				944	nd->path = nd->root;
				945	d = nd->path.dentry;
				946	nd->inode = d->d_inode;
				947	nd->seq = nd->root_seq;
				948	if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
				949	return -ECHILD;
				950	} else {
				951	path_put(&nd->path);
				952	nd->path = nd->root;
				953	path_get(&nd->path);
				954	nd->inode = nd->path.dentry->d_inode;
				955	}
				956	nd->flags \|= LOOKUP_JUMPED;
				957	return 0;
				958	}
				959
				960	/*
				961	* Helper to directly jump to a known parsed path from ->get_link,
				962	* caller must have taken a reference to path beforehand.
				963	*/
				964	void nd_jump_link(struct path *path)
				965	{
				966	struct nameidata *nd = current->nameidata;
				967	path_put(&nd->path);
				968
				969	nd->path = *path;
				970	nd->inode = nd->path.dentry->d_inode;
				971	nd->flags \|= LOOKUP_JUMPED;
				972	}
				973
				974	static inline void put_link(struct nameidata *nd)
				975	{
				976	struct saved *last = nd->stack + --nd->depth;
				977	do_delayed_call(&last->done);
				978	if (!(nd->flags & LOOKUP_RCU))
				979	path_put(&last->link);
				980	}
				981
				982	int sysctl_protected_symlinks __read_mostly = 0;
				983	int sysctl_protected_hardlinks __read_mostly = 0;
				984	int sysctl_protected_fifos __read_mostly;
				985	int sysctl_protected_regular __read_mostly;
				986
				987	/**
				988	* may_follow_link - Check symlink following for unsafe situations
				989	* @nd: nameidata pathwalk data
				990	*
				991	* In the case of the sysctl_protected_symlinks sysctl being enabled,
				992	* CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
				993	* in a sticky world-writable directory. This is to protect privileged
				994	* processes from failing races against path names that may change out
				995	* from under them by way of other users creating malicious symlinks.
				996	* It will permit symlinks to be followed only when outside a sticky
				997	* world-writable directory, or when the uid of the symlink and follower
				998	* match, or when the directory owner matches the symlink's owner.
				999	*
				1000	* Returns 0 if following the symlink is allowed, -ve on error.
				1001	*/
				1002	static inline int may_follow_link(struct nameidata *nd)
				1003	{
				1004	const struct inode *inode;
				1005	const struct inode *parent;
				1006	kuid_t puid;
				1007
				1008	if (!sysctl_protected_symlinks)
				1009	return 0;
				1010
				1011	/* Allowed if owner and follower match. */
				1012	inode = nd->link_inode;
				1013	if (uid_eq(current_cred()->fsuid, inode->i_uid))
				1014	return 0;
				1015
				1016	/* Allowed if parent directory not sticky and world-writable. */
				1017	parent = nd->inode;
				1018	if ((parent->i_mode & (S_ISVTX\|S_IWOTH)) != (S_ISVTX\|S_IWOTH))
				1019	return 0;
				1020
				1021	/* Allowed if parent directory and link owner match. */
				1022	puid = parent->i_uid;
				1023	if (uid_valid(puid) && uid_eq(puid, inode->i_uid))
				1024	return 0;
				1025
				1026	if (nd->flags & LOOKUP_RCU)
				1027	return -ECHILD;
				1028
				1029	audit_inode(nd->name, nd->stack[0].link.dentry, 0);
				1030	audit_log_link_denied("follow_link");
				1031	return -EACCES;
				1032	}
				1033
				1034	/**
				1035	* safe_hardlink_source - Check for safe hardlink conditions
				1036	* @inode: the source inode to hardlink from
				1037	*
				1038	* Return false if at least one of the following conditions:
				1039	* - inode is not a regular file
				1040	* - inode is setuid
				1041	* - inode is setgid and group-exec
				1042	* - access failure for read and write
				1043	*
				1044	* Otherwise returns true.
				1045	*/
				1046	static bool safe_hardlink_source(struct inode *inode)
				1047	{
				1048	umode_t mode = inode->i_mode;
				1049
				1050	/* Special files should not get pinned to the filesystem. */
				1051	if (!S_ISREG(mode))
				1052	return false;
				1053
				1054	/* Setuid files should not get pinned to the filesystem. */
				1055	if (mode & S_ISUID)
				1056	return false;
				1057
				1058	/* Executable setgid files should not get pinned to the filesystem. */
				1059	if ((mode & (S_ISGID \| S_IXGRP)) == (S_ISGID \| S_IXGRP))
				1060	return false;
				1061
				1062	/* Hardlinking to unreadable or unwritable sources is dangerous. */
				1063	if (inode_permission(inode, MAY_READ \| MAY_WRITE))
				1064	return false;
				1065
				1066	return true;
				1067	}
				1068
				1069	/**
				1070	* may_linkat - Check permissions for creating a hardlink
				1071	* @link: the source to hardlink from
				1072	*
				1073	* Block hardlink when all of:
				1074	* - sysctl_protected_hardlinks enabled
				1075	* - fsuid does not match inode
				1076	* - hardlink source is unsafe (see safe_hardlink_source() above)
				1077	* - not CAP_FOWNER in a namespace with the inode owner uid mapped
				1078	*
				1079	* Returns 0 if successful, -ve on error.
				1080	*/
				1081	static int may_linkat(struct path *link)
				1082	{
				1083	struct inode *inode = link->dentry->d_inode;
				1084
				1085	/* Inode writeback is not safe when the uid or gid are invalid. */
				1086	if (!uid_valid(inode->i_uid) \|\| !gid_valid(inode->i_gid))
				1087	return -EOVERFLOW;
				1088
				1089	if (!sysctl_protected_hardlinks)
				1090	return 0;
				1091
				1092	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
				1093	* otherwise, it must be a safe source.
				1094	*/
				1095	if (safe_hardlink_source(inode) \|\| inode_owner_or_capable(inode))
				1096	return 0;
				1097
				1098	audit_log_link_denied("linkat");
				1099	return -EPERM;
				1100	}
				1101
				1102	/**
				1103	* may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
				1104	* should be allowed, or not, on files that already
				1105	* exist.
				1106	* @dir_mode: mode bits of directory
				1107	* @dir_uid: owner of directory
				1108	* @inode: the inode of the file to open
				1109	*
				1110	* Block an O_CREAT open of a FIFO (or a regular file) when:
				1111	* - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
				1112	* - the file already exists
				1113	* - we are in a sticky directory
				1114	* - we don't own the file
				1115	* - the owner of the directory doesn't own the file
				1116	* - the directory is world writable
				1117	* If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
				1118	* the directory doesn't have to be world writable: being group writable will
				1119	* be enough.
				1120	*
				1121	* Returns 0 if the open is allowed, -ve on error.
				1122	*/
				1123	static int may_create_in_sticky(umode_t dir_mode, kuid_t dir_uid,
				1124	struct inode * const inode)
				1125	{
				1126	if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) \|\|
				1127	(!sysctl_protected_regular && S_ISREG(inode->i_mode)) \|\|
				1128	likely(!(dir_mode & S_ISVTX)) \|\|
				1129	uid_eq(inode->i_uid, dir_uid) \|\|
				1130	uid_eq(current_fsuid(), inode->i_uid))
				1131	return 0;
				1132
				1133	if (likely(dir_mode & 0002) \|\|
				1134	(dir_mode & 0020 &&
				1135	((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) \|\|
				1136	(sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) {
				1137	return -EACCES;
				1138	}
				1139	return 0;
				1140	}
				1141
				1142	static __always_inline
				1143	const char get_link(struct nameidata nd)
				1144	{
				1145	struct saved *last = nd->stack + nd->depth - 1;
				1146	struct dentry *dentry = last->link.dentry;
				1147	struct inode *inode = nd->link_inode;
				1148	int error;
				1149	const char *res;
				1150
				1151	if (!(nd->flags & LOOKUP_RCU)) {
				1152	touch_atime(&last->link);
				1153	cond_resched();
				1154	} else if (atime_needs_update(&last->link, inode)) {
				1155	if (unlikely(unlazy_walk(nd)))
				1156	return ERR_PTR(-ECHILD);
				1157	touch_atime(&last->link);
				1158	}
				1159
				1160	error = security_inode_follow_link(dentry, inode,
				1161	nd->flags & LOOKUP_RCU);
				1162	if (unlikely(error))
				1163	return ERR_PTR(error);
				1164
				1165	nd->last_type = LAST_BIND;
				1166	res = READ_ONCE(inode->i_link);
				1167	if (!res) {
				1168	const char * (get)(struct dentry , struct inode *,
				1169	struct delayed_call *);
				1170	get = inode->i_op->get_link;
				1171	if (nd->flags & LOOKUP_RCU) {
				1172	res = get(NULL, inode, &last->done);
				1173	if (res == ERR_PTR(-ECHILD)) {
				1174	if (unlikely(unlazy_walk(nd)))
				1175	return ERR_PTR(-ECHILD);
				1176	res = get(dentry, inode, &last->done);
				1177	}
				1178	} else {
				1179	res = get(dentry, inode, &last->done);
				1180	}
				1181	if (IS_ERR_OR_NULL(res))
				1182	return res;
				1183	}
				1184	if (*res == '/') {
				1185	if (!nd->root.mnt)
				1186	set_root(nd);
				1187	if (unlikely(nd_jump_root(nd)))
				1188	return ERR_PTR(-ECHILD);
				1189	while (unlikely(*++res == '/'))
				1190	;
				1191	}
				1192	if (!*res)
				1193	res = NULL;
				1194	return res;
				1195	}
				1196
				1197	/*
				1198	* follow_up - Find the mountpoint of path's vfsmount
				1199	*
				1200	* Given a path, find the mountpoint of its source file system.
				1201	* Replace @path with the path of the mountpoint in the parent mount.
				1202	* Up is towards /.
				1203	*
				1204	* Return 1 if we went up a level and 0 if we were already at the
				1205	* root.
				1206	*/
				1207	int follow_up(struct path *path)
				1208	{
				1209	struct mount *mnt = real_mount(path->mnt);
				1210	struct mount *parent;
				1211	struct dentry *mountpoint;
				1212
				1213	read_seqlock_excl(&mount_lock);
				1214	parent = mnt->mnt_parent;
				1215	if (parent == mnt) {
				1216	read_sequnlock_excl(&mount_lock);
				1217	return 0;
				1218	}
				1219	mntget(&parent->mnt);
				1220	mountpoint = dget(mnt->mnt_mountpoint);
				1221	read_sequnlock_excl(&mount_lock);
				1222	dput(path->dentry);
				1223	path->dentry = mountpoint;
				1224	mntput(path->mnt);
				1225	path->mnt = &parent->mnt;
				1226	return 1;
				1227	}
				1228	EXPORT_SYMBOL(follow_up);
				1229
				1230	/*
				1231	* Perform an automount
				1232	* - return -EISDIR to tell follow_managed() to stop and return the path we
				1233	* were called with.
				1234	*/
				1235	static int follow_automount(struct path path, struct nameidata nd,
				1236	bool *need_mntput)
				1237	{
				1238	struct vfsmount *mnt;
				1239	int err;
				1240
				1241	if (!path->dentry->d_op \|\| !path->dentry->d_op->d_automount)
				1242	return -EREMOTE;
				1243
				1244	/* We don't want to mount if someone's just doing a stat -
				1245	* unless they're stat'ing a directory and appended a '/' to
				1246	* the name.
				1247	*
				1248	* We do, however, want to mount if someone wants to open or
				1249	* create a file of any type under the mountpoint, wants to
				1250	* traverse through the mountpoint or wants to open the
				1251	* mounted directory. Also, autofs may mark negative dentries
				1252	* as being automount points. These will need the attentions
				1253	* of the daemon to instantiate them before they can be used.
				1254	*/
				1255	if (!(nd->flags & (LOOKUP_PARENT \| LOOKUP_DIRECTORY \|
				1256	LOOKUP_OPEN \| LOOKUP_CREATE \| LOOKUP_AUTOMOUNT)) &&
				1257	path->dentry->d_inode)
				1258	return -EISDIR;
				1259
				1260	nd->total_link_count++;
				1261	if (nd->total_link_count >= 40)
				1262	return -ELOOP;
				1263
				1264	mnt = path->dentry->d_op->d_automount(path);
				1265	if (IS_ERR(mnt)) {
				1266	/*
				1267	* The filesystem is allowed to return -EISDIR here to indicate
				1268	* it doesn't want to automount. For instance, autofs would do
				1269	* this so that its userspace daemon can mount on this dentry.
				1270	*
				1271	* However, we can only permit this if it's a terminal point in
				1272	* the path being looked up; if it wasn't then the remainder of
				1273	* the path is inaccessible and we should say so.
				1274	*/
				1275	if (PTR_ERR(mnt) == -EISDIR && (nd->flags & LOOKUP_PARENT))
				1276	return -EREMOTE;
				1277	return PTR_ERR(mnt);
				1278	}
				1279
				1280	if (!mnt) /* mount collision */
				1281	return 0;
				1282
				1283	if (!*need_mntput) {
				1284	/* lock_mount() may release path->mnt on error */
				1285	mntget(path->mnt);
				1286	*need_mntput = true;
				1287	}
				1288	err = finish_automount(mnt, path);
				1289
				1290	switch (err) {
				1291	case -EBUSY:
				1292	/* Someone else made a mount here whilst we were busy */
				1293	return 0;
				1294	case 0:
				1295	path_put(path);
				1296	path->mnt = mnt;
				1297	path->dentry = dget(mnt->mnt_root);
				1298	return 0;
				1299	default:
				1300	return err;
				1301	}
				1302
				1303	}
				1304
				1305	/*
				1306	* Handle a dentry that is managed in some way.
				1307	* - Flagged for transit management (autofs)
				1308	* - Flagged as mountpoint
				1309	* - Flagged as automount point
				1310	*
				1311	* This may only be called in refwalk mode.
				1312	*
				1313	* Serialization is taken care of in namespace.c
				1314	*/
				1315	static int follow_managed(struct path path, struct nameidata nd)
				1316	{
				1317	struct vfsmount mnt = path->mnt; / held by caller, must be left alone */
				1318	unsigned managed;
				1319	bool need_mntput = false;
				1320	int ret = 0;
				1321
				1322	/* Given that we're not holding a lock here, we retain the value in a
				1323	* local variable for each dentry as we look at it so that we don't see
				1324	* the components of that value change under us */
				1325	while (managed = READ_ONCE(path->dentry->d_flags),
				1326	managed &= DCACHE_MANAGED_DENTRY,
				1327	unlikely(managed != 0)) {
				1328	/* Allow the filesystem to manage the transit without i_mutex
				1329	* being held. */
				1330	if (managed & DCACHE_MANAGE_TRANSIT) {
				1331	BUG_ON(!path->dentry->d_op);
				1332	BUG_ON(!path->dentry->d_op->d_manage);
				1333	ret = path->dentry->d_op->d_manage(path, false);
				1334	if (ret < 0)
				1335	break;
				1336	}
				1337
				1338	/* Transit to a mounted filesystem. */
				1339	if (managed & DCACHE_MOUNTED) {
				1340	struct vfsmount *mounted = lookup_mnt(path);
				1341	if (mounted) {
				1342	dput(path->dentry);
				1343	if (need_mntput)
				1344	mntput(path->mnt);
				1345	path->mnt = mounted;
				1346	path->dentry = dget(mounted->mnt_root);
				1347	need_mntput = true;
				1348	continue;
				1349	}
				1350
				1351	/* Something is mounted on this dentry in another
				1352	* namespace and/or whatever was mounted there in this
				1353	* namespace got unmounted before lookup_mnt() could
				1354	* get it */
				1355	}
				1356
				1357	/* Handle an automount point */
				1358	if (managed & DCACHE_NEED_AUTOMOUNT) {
				1359	ret = follow_automount(path, nd, &need_mntput);
				1360	if (ret < 0)
				1361	break;
				1362	continue;
				1363	}
				1364
				1365	/* We didn't change the current path point */
				1366	break;
				1367	}
				1368
				1369	if (need_mntput && path->mnt == mnt)
				1370	mntput(path->mnt);
				1371	if (ret == -EISDIR \|\| !ret)
				1372	ret = 1;
				1373	if (need_mntput)
				1374	nd->flags \|= LOOKUP_JUMPED;
				1375	if (unlikely(ret < 0))
				1376	path_put_conditional(path, nd);
				1377	return ret;
				1378	}
				1379
				1380	int follow_down_one(struct path *path)
				1381	{
				1382	struct vfsmount *mounted;
				1383
				1384	mounted = lookup_mnt(path);
				1385	if (mounted) {
				1386	dput(path->dentry);
				1387	mntput(path->mnt);
				1388	path->mnt = mounted;
				1389	path->dentry = dget(mounted->mnt_root);
				1390	return 1;
				1391	}
				1392	return 0;
				1393	}
				1394	EXPORT_SYMBOL(follow_down_one);
				1395
				1396	static inline int managed_dentry_rcu(const struct path *path)
				1397	{
				1398	return (path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
				1399	path->dentry->d_op->d_manage(path, true) : 0;
				1400	}
				1401
				1402	/*
				1403	* Try to skip to top of mountpoint pile in rcuwalk mode. Fail if
				1404	* we meet a managed dentry that would need blocking.
				1405	*/
				1406	static bool __follow_mount_rcu(struct nameidata nd, struct path path,
				1407	struct inode *inode, unsigned seqp)
				1408	{
				1409	for (;;) {
				1410	struct mount *mounted;
				1411	/*
				1412	* Don't forget we might have a non-mountpoint managed dentry
				1413	* that wants to block transit.
				1414	*/
				1415	switch (managed_dentry_rcu(path)) {
				1416	case -ECHILD:
				1417	default:
				1418	return false;
				1419	case -EISDIR:
				1420	return true;
				1421	case 0:
				1422	break;
				1423	}
				1424
				1425	if (!d_mountpoint(path->dentry))
				1426	return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
				1427
				1428	mounted = __lookup_mnt(path->mnt, path->dentry);
				1429	if (!mounted)
				1430	break;
				1431	path->mnt = &mounted->mnt;
				1432	path->dentry = mounted->mnt.mnt_root;
				1433	nd->flags \|= LOOKUP_JUMPED;
				1434	*seqp = read_seqcount_begin(&path->dentry->d_seq);
				1435	/*
				1436	* Update the inode too. We don't need to re-check the
				1437	* dentry sequence number here after this d_inode read,
				1438	* because a mount-point is always pinned.
				1439	*/
				1440	*inode = path->dentry->d_inode;
				1441	}
				1442	return !read_seqretry(&mount_lock, nd->m_seq) &&
				1443	!(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
				1444	}
				1445
				1446	static int follow_dotdot_rcu(struct nameidata *nd)
				1447	{
				1448	struct inode *inode = nd->inode;
				1449
				1450	while (1) {
				1451	if (path_equal(&nd->path, &nd->root))
				1452	break;
				1453	if (nd->path.dentry != nd->path.mnt->mnt_root) {
				1454	struct dentry *old = nd->path.dentry;
				1455	struct dentry *parent = old->d_parent;
				1456	unsigned seq;
				1457
				1458	inode = parent->d_inode;
				1459	seq = read_seqcount_begin(&parent->d_seq);
				1460	if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
				1461	return -ECHILD;
				1462	nd->path.dentry = parent;
				1463	nd->seq = seq;
				1464	if (unlikely(!path_connected(&nd->path)))
				1465	return -ENOENT;
				1466	break;
				1467	} else {
				1468	struct mount *mnt = real_mount(nd->path.mnt);
				1469	struct mount *mparent = mnt->mnt_parent;
				1470	struct dentry *mountpoint = mnt->mnt_mountpoint;
				1471	struct inode *inode2 = mountpoint->d_inode;
				1472	unsigned seq = read_seqcount_begin(&mountpoint->d_seq);
				1473	if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
				1474	return -ECHILD;
				1475	if (&mparent->mnt == nd->path.mnt)
				1476	break;
				1477	/* we know that mountpoint was pinned */
				1478	nd->path.dentry = mountpoint;
				1479	nd->path.mnt = &mparent->mnt;
				1480	inode = inode2;
				1481	nd->seq = seq;
				1482	}
				1483	}
				1484	while (unlikely(d_mountpoint(nd->path.dentry))) {
				1485	struct mount *mounted;
				1486	mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
				1487	if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
				1488	return -ECHILD;
				1489	if (!mounted)
				1490	break;
				1491	nd->path.mnt = &mounted->mnt;
				1492	nd->path.dentry = mounted->mnt.mnt_root;
				1493	inode = nd->path.dentry->d_inode;
				1494	nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
				1495	}
				1496	nd->inode = inode;
				1497	return 0;
				1498	}
				1499
				1500	/*
				1501	* Follow down to the covering mount currently visible to userspace. At each
				1502	* point, the filesystem owning that dentry may be queried as to whether the
				1503	* caller is permitted to proceed or not.
				1504	*/
				1505	int follow_down(struct path *path)
				1506	{
				1507	unsigned managed;
				1508	int ret;
				1509
				1510	while (managed = READ_ONCE(path->dentry->d_flags),
				1511	unlikely(managed & DCACHE_MANAGED_DENTRY)) {
				1512	/* Allow the filesystem to manage the transit without i_mutex
				1513	* being held.
				1514	*
				1515	* We indicate to the filesystem if someone is trying to mount
				1516	* something here. This gives autofs the chance to deny anyone
				1517	* other than its daemon the right to mount on its
				1518	* superstructure.
				1519	*
				1520	* The filesystem may sleep at this point.
				1521	*/
				1522	if (managed & DCACHE_MANAGE_TRANSIT) {
				1523	BUG_ON(!path->dentry->d_op);
				1524	BUG_ON(!path->dentry->d_op->d_manage);
				1525	ret = path->dentry->d_op->d_manage(path, false);
				1526	if (ret < 0)
				1527	return ret == -EISDIR ? 0 : ret;
				1528	}
				1529
				1530	/* Transit to a mounted filesystem. */
				1531	if (managed & DCACHE_MOUNTED) {
				1532	struct vfsmount *mounted = lookup_mnt(path);
				1533	if (!mounted)
				1534	break;
				1535	dput(path->dentry);
				1536	mntput(path->mnt);
				1537	path->mnt = mounted;
				1538	path->dentry = dget(mounted->mnt_root);
				1539	continue;
				1540	}
				1541
				1542	/* Don't handle automount points here */
				1543	break;
				1544	}
				1545	return 0;
				1546	}
				1547	EXPORT_SYMBOL(follow_down);
				1548
				1549	/*
				1550	* Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
				1551	*/
				1552	static void follow_mount(struct path *path)
				1553	{
				1554	while (d_mountpoint(path->dentry)) {
				1555	struct vfsmount *mounted = lookup_mnt(path);
				1556	if (!mounted)
				1557	break;
				1558	dput(path->dentry);
				1559	mntput(path->mnt);
				1560	path->mnt = mounted;
				1561	path->dentry = dget(mounted->mnt_root);
				1562	}
				1563	}
				1564
				1565	static int path_parent_directory(struct path *path)
				1566	{
				1567	struct dentry *old = path->dentry;
				1568	/* rare case of legitimate dget_parent()... */
				1569	path->dentry = dget_parent(path->dentry);
				1570	dput(old);
				1571	if (unlikely(!path_connected(path)))
				1572	return -ENOENT;
				1573	return 0;
				1574	}
				1575
				1576	static int follow_dotdot(struct nameidata *nd)
				1577	{
				1578	while(1) {
				1579	if (path_equal(&nd->path, &nd->root))
				1580	break;
				1581	if (nd->path.dentry != nd->path.mnt->mnt_root) {
				1582	int ret = path_parent_directory(&nd->path);
				1583	if (ret)
				1584	return ret;
				1585	break;
				1586	}
				1587	if (!follow_up(&nd->path))
				1588	break;
				1589	}
				1590	follow_mount(&nd->path);
				1591	nd->inode = nd->path.dentry->d_inode;
				1592	return 0;
				1593	}
				1594
				1595	/*
				1596	* This looks up the name in dcache and possibly revalidates the found dentry.
				1597	* NULL is returned if the dentry does not exist in the cache.
				1598	*/
				1599	static struct dentry lookup_dcache(const struct qstr name,
				1600	struct dentry *dir,
				1601	unsigned int flags)
				1602	{
				1603	struct dentry *dentry = d_lookup(dir, name);
				1604	if (dentry) {
				1605	int error = d_revalidate(dentry, flags);
				1606	if (unlikely(error <= 0)) {
				1607	if (!error)
				1608	d_invalidate(dentry);
				1609	dput(dentry);
				1610	return ERR_PTR(error);
				1611	}
				1612	}
				1613	return dentry;
				1614	}
				1615
				1616	/*
				1617	* Parent directory has inode locked exclusive. This is one
				1618	* and only case when ->lookup() gets called on non in-lookup
				1619	* dentries - as the matter of fact, this only gets called
				1620	* when directory is guaranteed to have no in-lookup children
				1621	* at all.
				1622	*/
				1623	static struct dentry __lookup_hash(const struct qstr name,
				1624	struct dentry *base, unsigned int flags)
				1625	{
				1626	struct dentry *dentry = lookup_dcache(name, base, flags);
				1627	struct dentry *old;
				1628	struct inode *dir = base->d_inode;
				1629
				1630	if (dentry)
				1631	return dentry;
				1632
				1633	/* Don't create child dentry for a dead directory. */
				1634	if (unlikely(IS_DEADDIR(dir)))
				1635	return ERR_PTR(-ENOENT);
				1636
				1637	dentry = d_alloc(base, name);
				1638	if (unlikely(!dentry))
				1639	return ERR_PTR(-ENOMEM);
				1640
				1641	old = dir->i_op->lookup(dir, dentry, flags);
				1642	if (unlikely(old)) {
				1643	dput(dentry);
				1644	dentry = old;
				1645	}
				1646	return dentry;
				1647	}
				1648
				1649	static int lookup_fast(struct nameidata *nd,
				1650	struct path path, struct inode *inode,
				1651	unsigned *seqp)
				1652	{
				1653	struct vfsmount *mnt = nd->path.mnt;
				1654	struct dentry dentry, parent = nd->path.dentry;
				1655	int status = 1;
				1656	int err;
				1657
				1658	/*
				1659	* Rename seqlock is not required here because in the off chance
				1660	* of a false negative due to a concurrent rename, the caller is
				1661	* going to fall back to non-racy lookup.
				1662	*/
				1663	if (nd->flags & LOOKUP_RCU) {
				1664	unsigned seq;
				1665	bool negative;
				1666	dentry = __d_lookup_rcu(parent, &nd->last, &seq);
				1667	if (unlikely(!dentry)) {
				1668	if (unlazy_walk(nd))
				1669	return -ECHILD;
				1670	return 0;
				1671	}
				1672
				1673	/*
				1674	* This sequence count validates that the inode matches
				1675	* the dentry name information from lookup.
				1676	*/
				1677	*inode = d_backing_inode(dentry);
				1678	negative = d_is_negative(dentry);
				1679	if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
				1680	return -ECHILD;
				1681
				1682	/*
				1683	* This sequence count validates that the parent had no
				1684	* changes while we did the lookup of the dentry above.
				1685	*
				1686	* The memory barrier in read_seqcount_begin of child is
				1687	* enough, we can use __read_seqcount_retry here.
				1688	*/
				1689	if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
				1690	return -ECHILD;
				1691
				1692	*seqp = seq;
				1693	status = d_revalidate(dentry, nd->flags);
				1694	if (likely(status > 0)) {
				1695	/*
				1696	* Note: do negative dentry check after revalidation in
				1697	* case that drops it.
				1698	*/
				1699	if (unlikely(negative))
				1700	return -ENOENT;
				1701	path->mnt = mnt;
				1702	path->dentry = dentry;
				1703	if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
				1704	return 1;
				1705	}
				1706	if (unlazy_child(nd, dentry, seq))
				1707	return -ECHILD;
				1708	if (unlikely(status == -ECHILD))
				1709	/* we'd been told to redo it in non-rcu mode */
				1710	status = d_revalidate(dentry, nd->flags);
				1711	} else {
				1712	dentry = __d_lookup(parent, &nd->last);
				1713	if (unlikely(!dentry))
				1714	return 0;
				1715	status = d_revalidate(dentry, nd->flags);
				1716	}
				1717	if (unlikely(status <= 0)) {
				1718	if (!status)
				1719	d_invalidate(dentry);
				1720	dput(dentry);
				1721	return status;
				1722	}
				1723	if (unlikely(d_is_negative(dentry))) {
				1724	dput(dentry);
				1725	return -ENOENT;
				1726	}
				1727
				1728	path->mnt = mnt;
				1729	path->dentry = dentry;
				1730	err = follow_managed(path, nd);
				1731	if (likely(err > 0))
				1732	*inode = d_backing_inode(path->dentry);
				1733	return err;
				1734	}
				1735
				1736	/* Fast lookup failed, do it the slow way */
				1737	static struct dentry __lookup_slow(const struct qstr name,
				1738	struct dentry *dir,
				1739	unsigned int flags)
				1740	{
				1741	struct dentry dentry, old;
				1742	struct inode *inode = dir->d_inode;
				1743	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
				1744
				1745	/* Don't go there if it's already dead */
				1746	if (unlikely(IS_DEADDIR(inode)))
				1747	return ERR_PTR(-ENOENT);
				1748	again:
				1749	dentry = d_alloc_parallel(dir, name, &wq);
				1750	if (IS_ERR(dentry))
				1751	return dentry;
				1752	if (unlikely(!d_in_lookup(dentry))) {
				1753	if (!(flags & LOOKUP_NO_REVAL)) {
				1754	int error = d_revalidate(dentry, flags);
				1755	if (unlikely(error <= 0)) {
				1756	if (!error) {
				1757	d_invalidate(dentry);
				1758	dput(dentry);
				1759	goto again;
				1760	}
				1761	dput(dentry);
				1762	dentry = ERR_PTR(error);
				1763	}
				1764	}
				1765	} else {
				1766	old = inode->i_op->lookup(inode, dentry, flags);
				1767	d_lookup_done(dentry);
				1768	if (unlikely(old)) {
				1769	dput(dentry);
				1770	dentry = old;
				1771	}
				1772	}
				1773	return dentry;
				1774	}
				1775
				1776	static struct dentry lookup_slow(const struct qstr name,
				1777	struct dentry *dir,
				1778	unsigned int flags)
				1779	{
				1780	struct inode *inode = dir->d_inode;
				1781	struct dentry *res;
				1782	inode_lock_shared(inode);
				1783	res = __lookup_slow(name, dir, flags);
				1784	inode_unlock_shared(inode);
				1785	return res;
				1786	}
				1787
				1788	static inline int may_lookup(struct nameidata *nd)
				1789	{
				1790	if (nd->flags & LOOKUP_RCU) {
				1791	int err = inode_permission2(nd->path.mnt, nd->inode, MAY_EXEC\|MAY_NOT_BLOCK);
				1792	if (err != -ECHILD)
				1793	return err;
				1794	if (unlazy_walk(nd))
				1795	return -ECHILD;
				1796	}
				1797	return inode_permission2(nd->path.mnt, nd->inode, MAY_EXEC);
				1798	}
				1799
				1800	static inline int handle_dots(struct nameidata *nd, int type)
				1801	{
				1802	if (type == LAST_DOTDOT) {
				1803	if (!nd->root.mnt)
				1804	set_root(nd);
				1805	if (nd->flags & LOOKUP_RCU) {
				1806	return follow_dotdot_rcu(nd);
				1807	} else
				1808	return follow_dotdot(nd);
				1809	}
				1810	return 0;
				1811	}
				1812
				1813	static int pick_link(struct nameidata nd, struct path link,
				1814	struct inode *inode, unsigned seq)
				1815	{
				1816	int error;
				1817	struct saved *last;
				1818	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) {
				1819	path_to_nameidata(link, nd);
				1820	return -ELOOP;
				1821	}
				1822	if (!(nd->flags & LOOKUP_RCU)) {
				1823	if (link->mnt == nd->path.mnt)
				1824	mntget(link->mnt);
				1825	}
				1826	error = nd_alloc_stack(nd);
				1827	if (unlikely(error)) {
				1828	if (error == -ECHILD) {
				1829	if (unlikely(!legitimize_path(nd, link, seq))) {
				1830	drop_links(nd);
				1831	nd->depth = 0;
				1832	nd->flags &= ~LOOKUP_RCU;
				1833	nd->path.mnt = NULL;
				1834	nd->path.dentry = NULL;
				1835	if (!(nd->flags & LOOKUP_ROOT))
				1836	nd->root.mnt = NULL;
				1837	rcu_read_unlock();
				1838	} else if (likely(unlazy_walk(nd)) == 0)
				1839	error = nd_alloc_stack(nd);
				1840	}
				1841	if (error) {
				1842	path_put(link);
				1843	return error;
				1844	}
				1845	}
				1846
				1847	last = nd->stack + nd->depth++;
				1848	last->link = *link;
				1849	clear_delayed_call(&last->done);
				1850	nd->link_inode = inode;
				1851	last->seq = seq;
				1852	return 1;
				1853	}
				1854
				1855	enum {WALK_FOLLOW = 1, WALK_MORE = 2};
				1856
				1857	/*
				1858	* Do we need to follow links? We _really_ want to be able
				1859	* to do this check without having to look at inode->i_op,
				1860	* so we keep a cache of "no, this doesn't need follow_link"
				1861	* for the common case.
				1862	*/
				1863	static inline int step_into(struct nameidata nd, struct path path,
				1864	int flags, struct inode *inode, unsigned seq)
				1865	{
				1866	if (!(flags & WALK_MORE) && nd->depth)
				1867	put_link(nd);
				1868	if (likely(!d_is_symlink(path->dentry)) \|\|
				1869	!(flags & WALK_FOLLOW \|\| nd->flags & LOOKUP_FOLLOW)) {
				1870	/* not a symlink or should not follow */
				1871	path_to_nameidata(path, nd);
				1872	nd->inode = inode;
				1873	nd->seq = seq;
				1874	return 0;
				1875	}
				1876	/* make sure that d_is_symlink above matches inode */
				1877	if (nd->flags & LOOKUP_RCU) {
				1878	if (read_seqcount_retry(&path->dentry->d_seq, seq))
				1879	return -ECHILD;
				1880	}
				1881	return pick_link(nd, path, inode, seq);
				1882	}
				1883
				1884	static int walk_component(struct nameidata *nd, int flags)
				1885	{
				1886	struct path path;
				1887	struct inode *inode;
				1888	unsigned seq;
				1889	int err;
				1890	/*
				1891	* "." and ".." are special - ".." especially so because it has
				1892	* to be able to know about the current root directory and
				1893	* parent relationships.
				1894	*/
				1895	if (unlikely(nd->last_type != LAST_NORM)) {
				1896	err = handle_dots(nd, nd->last_type);
				1897	if (!(flags & WALK_MORE) && nd->depth)
				1898	put_link(nd);
				1899	return err;
				1900	}
				1901	err = lookup_fast(nd, &path, &inode, &seq);
				1902	if (unlikely(err <= 0)) {
				1903	if (err < 0)
				1904	return err;
				1905	path.dentry = lookup_slow(&nd->last, nd->path.dentry,
				1906	nd->flags);
				1907	if (IS_ERR(path.dentry))
				1908	return PTR_ERR(path.dentry);
				1909
				1910	path.mnt = nd->path.mnt;
				1911	err = follow_managed(&path, nd);
				1912	if (unlikely(err < 0))
				1913	return err;
				1914
				1915	if (unlikely(d_is_negative(path.dentry))) {
				1916	path_to_nameidata(&path, nd);
				1917	return -ENOENT;
				1918	}
				1919
				1920	seq = 0; /* we are already out of RCU mode */
				1921	inode = d_backing_inode(path.dentry);
				1922	}
				1923
				1924	return step_into(nd, &path, flags, inode, seq);
				1925	}
				1926
				1927	/*
				1928	* We can do the critical dentry name comparison and hashing
				1929	* operations one word at a time, but we are limited to:
				1930	*
				1931	* - Architectures with fast unaligned word accesses. We could
				1932	* do a "get_unaligned()" if this helps and is sufficiently
				1933	* fast.
				1934	*
				1935	* - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
				1936	* do not trap on the (extremely unlikely) case of a page
				1937	* crossing operation).
				1938	*
				1939	* - Furthermore, we need an efficient 64-bit compile for the
				1940	* 64-bit case in order to generate the "number of bytes in
				1941	* the final mask". Again, that could be replaced with an
				1942	* efficient population count instruction or similar.
				1943	*/
				1944	#ifdef CONFIG_DCACHE_WORD_ACCESS
				1945
				1946	#include <asm/word-at-a-time.h>
				1947
				1948	#ifdef HASH_MIX
				1949
				1950	/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
				1951
				1952	#elif defined(CONFIG_64BIT)
				/*
				 * 64-bit mixing step used by the word-at-a-time name hashes.
				 *
				 * Register pressure in the mixing function is an issue, particularly
				 * on 32-bit x86, but almost any function requires one state value and
				 * one temporary.  Instead, use a function designed for two state values
				 * and no temporaries.
				 *
				 * This function cannot create a collision in only two iterations, so
				 * we have two iterations to achieve avalanche.  In those two iterations,
				 * we have six layers of mixing, which is enough to spread one bit's
				 * influence out to 2^6 = 64 state bits.
				 *
				 * Rotate constants are scored by considering either 64 one-bit input
				 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
				 * probability of that delta causing a change to each of the 128 output
				 * bits, using a sample of random initial states.
				 *
				 * The Shannon entropy of the computed probabilities is then summed
				 * to produce a score.  Ideally, any input change has a 50% chance of
				 * toggling any given output bit.
				 *
				 * Mixing scores (in bits) for (12,45):
				 * Input delta: 1-bit      2-bit
				 * 1 round:     713.3    42542.6
				 * 2 rounds:   2753.7   140389.8
				 * 3 rounds:   5954.1   233458.2
				 * 4 rounds:   7862.6   256672.2
				 * Perfect:    8192     258048
				 *            (64*128) (64*63/2 * 128)
				 *
				 * @x and @y are the two state words (modified in place); @a is the input
				 * word folded into the state.
				 */
				#define HASH_MIX(x, y, a)				\
					(	(x) ^= (a),				\
						(y) ^= (x),	(x) = rol64((x), 12),	\
						(x) += (y),	(y) = rol64((y), 45),	\
						(y) *= 9			)
				1987
				1988	/*
				1989	* Fold two longs into one 32-bit hash value. This must be fast, but
				1990	* latency isn't quite as critical, as there is a fair bit of additional
				1991	* work done before the hash value is used.
				1992	*/
				1993	static inline unsigned int fold_hash(unsigned long x, unsigned long y)
				1994	{
				1995	y ^= x * GOLDEN_RATIO_64;
				1996	y *= GOLDEN_RATIO_64;
				1997	return y >> 32;
				1998	}
				1999
				2000	#else /* 32-bit case */
				2001
				/*
				 * 32-bit mixing step; same shape as the 64-bit variant but with
				 * rol32() and rotate constants (7,20).
				 *
				 * Mixing scores (in bits) for (7,20):
				 * Input delta: 1-bit      2-bit
				 * 1 round:     330.3     9201.6
				 * 2 rounds:   1246.4    25475.4
				 * 3 rounds:   1907.1    31295.1
				 * 4 rounds:   2042.3    31718.6
				 * Perfect:    2048      31744
				 *            (32*64)   (32*31/2 * 64)
				 */
				#define HASH_MIX(x, y, a)				\
					(	(x) ^= (a),				\
						(y) ^= (x),	(x) = rol32((x), 7),	\
						(x) += (y),	(y) = rol32((y), 20),	\
						(y) *= 9			)
				2017
				/*
				 * Collapse the two state words into one hash value (32-bit case):
				 * hash @x, xor the result into @y, and hash that once more.
				 */
				static inline unsigned int fold_hash(unsigned long x, unsigned long y)
				{
					/* Use arch-optimized multiply if one exists */
					unsigned long folded = y ^ __hash_32(x);

					return __hash_32(folded);
				}
				2023
				2024	#endif
				2025
				/*
				 * Return the hash of a string of known length.  This is carefully
				 * designed to match hash_name(), which is the more critical function.
				 * In particular, we must end by hashing a final word containing 0..7
				 * payload bytes, to match the way that hash_name() iterates until it
				 * finds the delimiter after the name.
				 *
				 * @salt: pointer whose value seeds the hash state
				 * @name: bytes to hash (need not be NUL-terminated)
				 * @len:  number of bytes of @name to hash
				 */
				unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
				{
					unsigned long a, x = 0, y = (unsigned long)salt;

					while (len) {
						a = load_unaligned_zeropad(name);
						if (len < sizeof(unsigned long)) {
							/* Final partial word: keep only the @len payload bytes. */
							x ^= a & bytemask_from_count(len);
							break;
						}
						HASH_MIX(x, y, a);
						name += sizeof(unsigned long);
						len -= sizeof(unsigned long);
					}
					return fold_hash(x, y);
				}
				EXPORT_SYMBOL(full_name_hash);
				2052
				2053	/* Return the "hash_len" (hash and length) of a null-terminated string */
				2054	u64 hashlen_string(const void salt, const char name)
				2055	{
				2056	unsigned long a = 0, x = 0, y = (unsigned long)salt;
				2057	unsigned long adata, mask, len;
				2058	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
				2059
				2060	len = 0;
				2061	goto inside;
				2062
				2063	do {
				2064	HASH_MIX(x, y, a);
				2065	len += sizeof(unsigned long);
				2066	inside:
				2067	a = load_unaligned_zeropad(name+len);
				2068	} while (!has_zero(a, &adata, &constants));
				2069
				2070	adata = prep_zero_mask(a, adata, &constants);
				2071	mask = create_zero_mask(adata);
				2072	x ^= a & zero_bytemask(mask);
				2073
				2074	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
				2075	}
				2076	EXPORT_SYMBOL(hashlen_string);
				2077
				2078	/*
				2079	* Calculate the length and hash of the path component, and
				2080	* return the "hash_len" as the result.
				2081	*/
				2082	static inline u64 hash_name(const void salt, const char name)
				2083	{
				2084	unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
				2085	unsigned long adata, bdata, mask, len;
				2086	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
				2087
				2088	len = 0;
				2089	goto inside;
				2090
				2091	do {
				2092	HASH_MIX(x, y, a);
				2093	len += sizeof(unsigned long);
				2094	inside:
				2095	a = load_unaligned_zeropad(name+len);
				2096	b = a ^ REPEAT_BYTE('/');
				2097	} while (!(has_zero(a, &adata, &constants) \| has_zero(b, &bdata, &constants)));
				2098
				2099	adata = prep_zero_mask(a, adata, &constants);
				2100	bdata = prep_zero_mask(b, bdata, &constants);
				2101	mask = create_zero_mask(adata \| bdata);
				2102	x ^= a & zero_bytemask(mask);
				2103
				2104	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
				2105	}
				2106
				2107	#else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
				2108
				/*
				 * Return the hash of a string of known length (byte-at-a-time version).
				 *
				 * @salt: seed handed to init_name_hash()
				 * @name: bytes to hash (need not be NUL-terminated)
				 * @len:  number of bytes of @name to hash
				 */
				unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
				{
					unsigned long hash = init_name_hash(salt);

					while (len--)
						hash = partial_name_hash((unsigned char)*name++, hash);
					return end_name_hash(hash);
				}
				EXPORT_SYMBOL(full_name_hash);
				2118
				2119	/* Return the "hash_len" (hash and length) of a null-terminated string */
				2120	u64 hashlen_string(const void salt, const char name)
				2121	{
				2122	unsigned long hash = init_name_hash(salt);
				2123	unsigned long len = 0, c;
				2124
				2125	c = (unsigned char)*name;
				2126	while (c) {
				2127	len++;
				2128	hash = partial_name_hash(c, hash);
				2129	c = (unsigned char)name[len];
				2130	}
				2131	return hashlen_create(end_name_hash(hash), len);
				2132	}
				2133	EXPORT_SYMBOL(hashlen_string);
				2134
				2135	/*
				2136	* We know there's a real path component here of at least
				2137	* one character.
				2138	*/
				2139	static inline u64 hash_name(const void salt, const char name)
				2140	{
				2141	unsigned long hash = init_name_hash(salt);
				2142	unsigned long len = 0, c;
				2143
				2144	c = (unsigned char)*name;
				2145	do {
				2146	len++;
				2147	hash = partial_name_hash(c, hash);
				2148	c = (unsigned char)name[len];
				2149	} while (c && c != '/');
				2150	return hashlen_create(end_name_hash(hash), len);
				2151	}
				2152
				2153	#endif
				2154
				2155	/*
				2156	* Name resolution.
				2157	* This is the basic name resolution function, turning a pathname into
				2158	* the final dentry. We expect 'base' to be positive and a directory.
				2159	*
				2160	* Returns 0 and nd will have valid dentry and mnt on success.
				2161	* Returns error and drops reference to input namei data on failure.
				2162	*/
				2163	static int link_path_walk(const char name, struct nameidata nd)
				2164	{
				2165	int err;
				2166
				2167	if (IS_ERR(name))
				2168	return PTR_ERR(name);
				2169	while (*name=='/')
				2170	name++;
				2171	if (!*name)
				2172	return 0;
				2173
				2174	/* At this point we know we have a real path component. */
				2175	for(;;) {
				2176	u64 hash_len;
				2177	int type;
				2178
				2179	err = may_lookup(nd);
				2180	if (err)
				2181	return err;
				2182
				2183	hash_len = hash_name(nd->path.dentry, name);
				2184
				2185	type = LAST_NORM;
				2186	if (name[0] == '.') switch (hashlen_len(hash_len)) {
				2187	case 2:
				2188	if (name[1] == '.') {
				2189	type = LAST_DOTDOT;
				2190	nd->flags \|= LOOKUP_JUMPED;
				2191	}
				2192	break;
				2193	case 1:
				2194	type = LAST_DOT;
				2195	}
				2196	if (likely(type == LAST_NORM)) {
				2197	struct dentry *parent = nd->path.dentry;
				2198	nd->flags &= ~LOOKUP_JUMPED;
				2199	if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
				2200	struct qstr this = { { .hash_len = hash_len }, .name = name };
				2201	err = parent->d_op->d_hash(parent, &this);
				2202	if (err < 0)
				2203	return err;
				2204	hash_len = this.hash_len;
				2205	name = this.name;
				2206	}
				2207	}
				2208
				2209	nd->last.hash_len = hash_len;
				2210	nd->last.name = name;
				2211	nd->last_type = type;
				2212
				2213	name += hashlen_len(hash_len);
				2214	if (!*name)
				2215	goto OK;
				2216	/*
				2217	* If it wasn't NUL, we know it was '/'. Skip that
				2218	* slash, and continue until no more slashes.
				2219	*/
				2220	do {
				2221	name++;
				2222	} while (unlikely(*name == '/'));
				2223	if (unlikely(!*name)) {
				2224	OK:
				2225	/* pathname body, done */
				2226	if (!nd->depth)
				2227	return 0;
				2228	name = nd->stack[nd->depth - 1].name;
				2229	/* trailing symlink, done */
				2230	if (!name)
				2231	return 0;
				2232	/* last component of nested symlink */
				2233	err = walk_component(nd, WALK_FOLLOW);
				2234	} else {
				2235	/* not the last component */
				2236	err = walk_component(nd, WALK_FOLLOW \| WALK_MORE);
				2237	}
				2238	if (err < 0)
				2239	return err;
				2240
				2241	if (err) {
				2242	const char *s = get_link(nd);
				2243
				2244	if (IS_ERR(s))
				2245	return PTR_ERR(s);
				2246	err = 0;
				2247	if (unlikely(!s)) {
				2248	/* jumped */
				2249	put_link(nd);
				2250	} else {
				2251	nd->stack[nd->depth - 1].name = name;
				2252	name = s;
				2253	continue;
				2254	}
				2255	}
				2256	if (unlikely(!d_can_lookup(nd->path.dentry))) {
				2257	if (nd->flags & LOOKUP_RCU) {
				2258	if (unlazy_walk(nd))
				2259	return -ECHILD;
				2260	}
				2261	return -ENOTDIR;
				2262	}
				2263	}
				2264	}
				2265
				2266	/* must be paired with terminate_walk() */
				2267	static const char path_init(struct nameidata nd, unsigned flags)
				2268	{
				2269	const char *s = nd->name->name;
				2270
				2271	if (!*s)
				2272	flags &= ~LOOKUP_RCU;
				2273	if (flags & LOOKUP_RCU)
				2274	rcu_read_lock();
				2275
				2276	nd->last_type = LAST_ROOT; /* if there are only slashes... */
				2277	nd->flags = flags \| LOOKUP_JUMPED \| LOOKUP_PARENT;
				2278	nd->depth = 0;
				2279	if (flags & LOOKUP_ROOT) {
				2280	struct dentry *root = nd->root.dentry;
				2281	struct inode *inode = root->d_inode;
				2282	if (*s && unlikely(!d_can_lookup(root)))
				2283	return ERR_PTR(-ENOTDIR);
				2284	nd->path = nd->root;
				2285	nd->inode = inode;
				2286	if (flags & LOOKUP_RCU) {
				2287	nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
				2288	nd->root_seq = nd->seq;
				2289	nd->m_seq = read_seqbegin(&mount_lock);
				2290	} else {
				2291	path_get(&nd->path);
				2292	}
				2293	return s;
				2294	}
				2295
				2296	nd->root.mnt = NULL;
				2297	nd->path.mnt = NULL;
				2298	nd->path.dentry = NULL;
				2299
				2300	nd->m_seq = read_seqbegin(&mount_lock);
				2301	if (*s == '/') {
				2302	set_root(nd);
				2303	if (likely(!nd_jump_root(nd)))
				2304	return s;
				2305	return ERR_PTR(-ECHILD);
				2306	} else if (nd->dfd == AT_FDCWD) {
				2307	if (flags & LOOKUP_RCU) {
				2308	struct fs_struct *fs = current->fs;
				2309	unsigned seq;
				2310
				2311	do {
				2312	seq = read_seqcount_begin(&fs->seq);
				2313	nd->path = fs->pwd;
				2314	nd->inode = nd->path.dentry->d_inode;
				2315	nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
				2316	} while (read_seqcount_retry(&fs->seq, seq));
				2317	} else {
				2318	get_fs_pwd(current->fs, &nd->path);
				2319	nd->inode = nd->path.dentry->d_inode;
				2320	}
				2321	return s;
				2322	} else {
				2323	/* Caller must check execute permissions on the starting path component */
				2324	struct fd f = fdget_raw(nd->dfd);
				2325	struct dentry *dentry;
				2326
				2327	if (!f.file)
				2328	return ERR_PTR(-EBADF);
				2329
				2330	dentry = f.file->f_path.dentry;
				2331
				2332	if (*s && unlikely(!d_can_lookup(dentry))) {
				2333	fdput(f);
				2334	return ERR_PTR(-ENOTDIR);
				2335	}
				2336
				2337	nd->path = f.file->f_path;
				2338	if (flags & LOOKUP_RCU) {
				2339	nd->inode = nd->path.dentry->d_inode;
				2340	nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
				2341	} else {
				2342	path_get(&nd->path);
				2343	nd->inode = nd->path.dentry->d_inode;
				2344	}
				2345	fdput(f);
				2346	return s;
				2347	}
				2348	}
				2349
				2350	static const char trailing_symlink(struct nameidata nd)
				2351	{
				2352	const char *s;
				2353	int error = may_follow_link(nd);
				2354	if (unlikely(error))
				2355	return ERR_PTR(error);
				2356	nd->flags \|= LOOKUP_PARENT;
				2357	nd->stack[0].name = NULL;
				2358	s = get_link(nd);
				2359	return s ? s : "";
				2360	}
				2361
				2362	static inline int lookup_last(struct nameidata *nd)
				2363	{
				2364	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
				2365	nd->flags \|= LOOKUP_FOLLOW \| LOOKUP_DIRECTORY;
				2366
				2367	nd->flags &= ~LOOKUP_PARENT;
				2368	return walk_component(nd, 0);
				2369	}
				2370
				2371	static int handle_lookup_down(struct nameidata *nd)
				2372	{
				2373	struct path path = nd->path;
				2374	struct inode *inode = nd->inode;
				2375	unsigned seq = nd->seq;
				2376	int err;
				2377
				2378	if (nd->flags & LOOKUP_RCU) {
				2379	/*
				2380	* don't bother with unlazy_walk on failure - we are
				2381	* at the very beginning of walk, so we lose nothing
				2382	* if we simply redo everything in non-RCU mode
				2383	*/
				2384	if (unlikely(!__follow_mount_rcu(nd, &path, &inode, &seq)))
				2385	return -ECHILD;
				2386	} else {
				2387	dget(path.dentry);
				2388	err = follow_managed(&path, nd);
				2389	if (unlikely(err < 0))
				2390	return err;
				2391	inode = d_backing_inode(path.dentry);
				2392	seq = 0;
				2393	}
				2394	path_to_nameidata(&path, nd);
				2395	nd->inode = inode;
				2396	nd->seq = seq;
				2397	return 0;
				2398	}
				2399
				2400	/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
				2401	static int path_lookupat(struct nameidata nd, unsigned flags, struct path path)
				2402	{
				2403	const char *s = path_init(nd, flags);
				2404	int err;
				2405
				2406	if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) {
				2407	err = handle_lookup_down(nd);
				2408	if (unlikely(err < 0))
				2409	s = ERR_PTR(err);
				2410	}
				2411
				2412	while (!(err = link_path_walk(s, nd))
				2413	&& ((err = lookup_last(nd)) > 0)) {
				2414	s = trailing_symlink(nd);
				2415	}
				2416	if (!err)
				2417	err = complete_walk(nd);
				2418
				2419	if (!err && nd->flags & LOOKUP_DIRECTORY)
				2420	if (!d_can_lookup(nd->path.dentry))
				2421	err = -ENOTDIR;
				2422	if (!err) {
				2423	*path = nd->path;
				2424	nd->path.mnt = NULL;
				2425	nd->path.dentry = NULL;
				2426	}
				2427	terminate_walk(nd);
				2428	return err;
				2429	}
				2430
				2431	static int filename_lookup(int dfd, struct filename *name, unsigned flags,
				2432	struct path path, struct path root)
				2433	{
				2434	int retval;
				2435	struct nameidata nd;
				2436	if (IS_ERR(name))
				2437	return PTR_ERR(name);
				2438	if (unlikely(root)) {
				2439	nd.root = *root;
				2440	flags \|= LOOKUP_ROOT;
				2441	}
				2442	set_nameidata(&nd, dfd, name);
				2443	retval = path_lookupat(&nd, flags \| LOOKUP_RCU, path);
				2444	if (unlikely(retval == -ECHILD))
				2445	retval = path_lookupat(&nd, flags, path);
				2446	if (unlikely(retval == -ESTALE))
				2447	retval = path_lookupat(&nd, flags \| LOOKUP_REVAL, path);
				2448
				2449	if (likely(!retval))
				2450	audit_inode(name, path->dentry, flags & LOOKUP_PARENT);
				2451	restore_nameidata();
				2452	putname(name);
				2453	return retval;
				2454	}
				2455
				2456	/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
				2457	static int path_parentat(struct nameidata *nd, unsigned flags,
				2458	struct path *parent)
				2459	{
				2460	const char *s = path_init(nd, flags);
				2461	int err = link_path_walk(s, nd);
				2462	if (!err)
				2463	err = complete_walk(nd);
				2464	if (!err) {
				2465	*parent = nd->path;
				2466	nd->path.mnt = NULL;
				2467	nd->path.dentry = NULL;
				2468	}
				2469	terminate_walk(nd);
				2470	return err;
				2471	}
				2472
				2473	static struct filename filename_parentat(int dfd, struct filename name,
				2474	unsigned int flags, struct path *parent,
				2475	struct qstr last, int type)
				2476	{
				2477	int retval;
				2478	struct nameidata nd;
				2479
				2480	if (IS_ERR(name))
				2481	return name;
				2482	set_nameidata(&nd, dfd, name);
				2483	retval = path_parentat(&nd, flags \| LOOKUP_RCU, parent);
				2484	if (unlikely(retval == -ECHILD))
				2485	retval = path_parentat(&nd, flags, parent);
				2486	if (unlikely(retval == -ESTALE))
				2487	retval = path_parentat(&nd, flags \| LOOKUP_REVAL, parent);
				2488	if (likely(!retval)) {
				2489	*last = nd.last;
				2490	*type = nd.last_type;
				2491	audit_inode(name, parent->dentry, LOOKUP_PARENT);
				2492	} else {
				2493	putname(name);
				2494	name = ERR_PTR(retval);
				2495	}
				2496	restore_nameidata();
				2497	return name;
				2498	}
				2499
				2500	/* does lookup, returns the object with parent locked */
				2501	struct dentry kern_path_locked(const char name, struct path *path)
				2502	{
				2503	struct filename *filename;
				2504	struct dentry *d;
				2505	struct qstr last;
				2506	int type;
				2507
				2508	filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
				2509	&last, &type);
				2510	if (IS_ERR(filename))
				2511	return ERR_CAST(filename);
				2512	if (unlikely(type != LAST_NORM)) {
				2513	path_put(path);
				2514	putname(filename);
				2515	return ERR_PTR(-EINVAL);
				2516	}
				2517	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
				2518	d = __lookup_hash(&last, path->dentry, 0);
				2519	if (IS_ERR(d)) {
				2520	inode_unlock(path->dentry->d_inode);
				2521	path_put(path);
				2522	}
				2523	putname(filename);
				2524	return d;
				2525	}
				2526
				2527	int kern_path(const char name, unsigned int flags, struct path path)
				2528	{
				2529	return filename_lookup(AT_FDCWD, getname_kernel(name),
				2530	flags, path, NULL);
				2531	}
				2532	EXPORT_SYMBOL(kern_path);
				2533
				2534	/**
				2535	* vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
				2536	* @dentry: pointer to dentry of the base directory
				2537	* @mnt: pointer to vfs mount of the base directory
				2538	* @name: pointer to file name
				2539	* @flags: lookup flags
				2540	* @path: pointer to struct path to fill
				2541	*/
				2542	int vfs_path_lookup(struct dentry dentry, struct vfsmount mnt,
				2543	const char *name, unsigned int flags,
				2544	struct path *path)
				2545	{
				2546	struct path root = {.mnt = mnt, .dentry = dentry};
				2547	/* the first argument of filename_lookup() is ignored with root */
				2548	return filename_lookup(AT_FDCWD, getname_kernel(name),
				2549	flags , path, &root);
				2550	}
				2551	EXPORT_SYMBOL(vfs_path_lookup);
				2552
				2553	static int lookup_one_len_common(const char name, struct vfsmount mnt,
				2554	struct dentry base, int len, struct qstr this)
				2555	{
				2556	this->name = name;
				2557	this->len = len;
				2558	this->hash = full_name_hash(base, name, len);
				2559	if (!len)
				2560	return -EACCES;
				2561
				2562	if (unlikely(name[0] == '.')) {
				2563	if (len < 2 \|\| (len == 2 && name[1] == '.'))
				2564	return -EACCES;
				2565	}
				2566
				2567	while (len--) {
				2568	unsigned int c = (const unsigned char )name++;
				2569	if (c == '/' \|\| c == '\0')
				2570	return -EACCES;
				2571	}
				2572	/*
				2573	* See if the low-level filesystem might want
				2574	* to use its own hash..
				2575	*/
				2576	if (base->d_flags & DCACHE_OP_HASH) {
				2577	int err = base->d_op->d_hash(base, this);
				2578	if (err < 0)
				2579	return err;
				2580	}
				2581
				2582	return inode_permission2(mnt, base->d_inode, MAY_EXEC);
				2583	}
				2584
				2585	/**
				2586	* try_lookup_one_len - filesystem helper to lookup single pathname component
				2587	* @name: pathname component to lookup
				2588	* @base: base directory to lookup from
				2589	* @len: maximum length @len should be interpreted to
				2590	*
				2591	* Look up a dentry by name in the dcache, returning NULL if it does not
				2592	* currently exist. The function does not try to create a dentry.
				2593	*
				2594	* Note that this routine is purely a helper for filesystem usage and should
				2595	* not be called by generic code.
				2596	*
				2597	* The caller must hold base->i_mutex.
				2598	*/
				2599	struct dentry try_lookup_one_len(const char name, struct dentry *base, int len)
				2600	{
				2601	struct qstr this;
				2602	int err;
				2603
				2604	WARN_ON_ONCE(!inode_is_locked(base->d_inode));
				2605
				2606	err = lookup_one_len_common(name, NULL, base, len, &this);
				2607	if (err)
				2608	return ERR_PTR(err);
				2609
				2610	return lookup_dcache(&this, base, 0);
				2611	}
				2612	EXPORT_SYMBOL(try_lookup_one_len);
				2613
				2614	/**
				2615	* lookup_one_len - filesystem helper to lookup single pathname component
				2616	* @name: pathname component to lookup
				2617	* @base: base directory to lookup from
				2618	* @len: maximum length @len should be interpreted to
				2619	*
				2620	* Note that this routine is purely a helper for filesystem usage and should
				2621	* not be called by generic code.
				2622	*
				2623	* The caller must hold base->i_mutex.
				2624	*/
				2625	struct dentry lookup_one_len2(const char name, struct vfsmount mnt, struct dentry base, int len)
				2626	{
				2627	struct dentry *dentry;
				2628	struct qstr this;
				2629	int err;
				2630
				2631	WARN_ON_ONCE(!inode_is_locked(base->d_inode));
				2632
				2633	err = lookup_one_len_common(name, mnt, base, len, &this);
				2634	if (err)
				2635	return ERR_PTR(err);
				2636
				2637	dentry = lookup_dcache(&this, base, 0);
				2638	return dentry ? dentry : __lookup_slow(&this, base, 0);
				2639	}
				2640	EXPORT_SYMBOL(lookup_one_len2);
				2641
				2642	struct dentry lookup_one_len(const char name, struct dentry *base, int len)
				2643	{
				2644	return lookup_one_len2(name, NULL, base, len);
				2645	}
				2646	EXPORT_SYMBOL(lookup_one_len);
				2647
				2648	/**
				2649	* lookup_one_len_unlocked - filesystem helper to lookup single pathname component
				2650	* @name: pathname component to lookup
				2651	* @base: base directory to lookup from
				2652	* @len: maximum length @len should be interpreted to
				2653	*
				2654	* Note that this routine is purely a helper for filesystem usage and should
				2655	* not be called by generic code.
				2656	*
				2657	* Unlike lookup_one_len, it should be called without the parent
				2658	* i_mutex held, and will take the i_mutex itself if necessary.
				2659	*/
				2660	struct dentry lookup_one_len_unlocked(const char name,
				2661	struct dentry *base, int len)
				2662	{
				2663	struct qstr this;
				2664	int err;
				2665	struct dentry *ret;
				2666
				2667	err = lookup_one_len_common(name, NULL, base, len, &this);
				2668	if (err)
				2669	return ERR_PTR(err);
				2670
				2671	ret = lookup_dcache(&this, base, 0);
				2672	if (!ret)
				2673	ret = lookup_slow(&this, base, 0);
				2674	return ret;
				2675	}
				2676	EXPORT_SYMBOL(lookup_one_len_unlocked);
				2677
				2678	#ifdef CONFIG_UNIX98_PTYS
				2679	int path_pts(struct path *path)
				2680	{
				2681	/* Find something mounted on "pts" in the same directory as
				2682	* the input path.
				2683	*/
				2684	struct dentry child, parent;
				2685	struct qstr this;
				2686	int ret;
				2687
				2688	ret = path_parent_directory(path);
				2689	if (ret)
				2690	return ret;
				2691
				2692	parent = path->dentry;
				2693	this.name = "pts";
				2694	this.len = 3;
				2695	child = d_hash_and_lookup(parent, &this);
				2696	if (!child)
				2697	return -ENOENT;
				2698
				2699	path->dentry = child;
				2700	dput(parent);
				2701	follow_mount(path);
				2702	return 0;
				2703	}
				2704	#endif
				2705
				2706	int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
				2707	struct path path, int empty)
				2708	{
				2709	return filename_lookup(dfd, getname_flags(name, flags, empty),
				2710	flags, path, NULL);
				2711	}
				2712	EXPORT_SYMBOL(user_path_at_empty);
				2713
				2714	/**
				2715	* mountpoint_last - look up last component for umount
				2716	* @nd: pathwalk nameidata - currently pointing at parent directory of "last"
				2717	*
				2718	* This is a special lookup_last function just for umount. In this case, we
				2719	* need to resolve the path without doing any revalidation.
				2720	*
				2721	* The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since
				2722	* mountpoints are always pinned in the dcache, their ancestors are too. Thus,
				2723	* in almost all cases, this lookup will be served out of the dcache. The only
				2724	* cases where it won't are if nd->last refers to a symlink or the path is
				2725	* bogus and it doesn't exist.
				2726	*
				2727	* Returns:
				2728	* -error: if there was an error during lookup. This includes -ENOENT if the
				2729	* lookup found a negative dentry.
				2730	*
				2731	* 0: if we successfully resolved nd->last and found it to not to be a
				2732	* symlink that needs to be followed.
				2733	*
				2734	* 1: if we successfully resolved nd->last and found it to be a symlink
				2735	* that needs to be followed.
				2736	*/
				2737	static int
				2738	mountpoint_last(struct nameidata *nd)
				2739	{
				2740	int error = 0;
				2741	struct dentry *dir = nd->path.dentry;
				2742	struct path path;
				2743
				2744	/* If we're in rcuwalk, drop out of it to handle last component */
				2745	if (nd->flags & LOOKUP_RCU) {
				2746	if (unlazy_walk(nd))
				2747	return -ECHILD;
				2748	}
				2749
				2750	nd->flags &= ~LOOKUP_PARENT;
				2751
				2752	if (unlikely(nd->last_type != LAST_NORM)) {
				2753	error = handle_dots(nd, nd->last_type);
				2754	if (error)
				2755	return error;
				2756	path.dentry = dget(nd->path.dentry);
				2757	} else {
				2758	path.dentry = d_lookup(dir, &nd->last);
				2759	if (!path.dentry) {
				2760	/*
				2761	* No cached dentry. Mounted dentries are pinned in the
				2762	* cache, so that means that this dentry is probably
				2763	* a symlink or the path doesn't actually point
				2764	* to a mounted dentry.
				2765	*/
				2766	path.dentry = lookup_slow(&nd->last, dir,
				2767	nd->flags \| LOOKUP_NO_REVAL);
				2768	if (IS_ERR(path.dentry))
				2769	return PTR_ERR(path.dentry);
				2770	}
				2771	}
				2772	if (d_is_negative(path.dentry)) {
				2773	dput(path.dentry);
				2774	return -ENOENT;
				2775	}
				2776	path.mnt = nd->path.mnt;
				2777	return step_into(nd, &path, 0, d_backing_inode(path.dentry), 0);
				2778	}
				2779
				2780	/**
				2781	* path_mountpoint - look up a path to be umounted
				2782	* @nd: lookup context
				2783	* @flags: lookup flags
				2784	* @path: pointer to container for result
				2785	*
				2786	* Look up the given name, but don't attempt to revalidate the last component.
				2787	* Returns 0 and "path" will be valid on success; Returns error otherwise.
				2788	*/
				2789	static int
				2790	path_mountpoint(struct nameidata nd, unsigned flags, struct path path)
				2791	{
				2792	const char *s = path_init(nd, flags);
				2793	int err;
				2794
				2795	while (!(err = link_path_walk(s, nd)) &&
				2796	(err = mountpoint_last(nd)) > 0) {
				2797	s = trailing_symlink(nd);
				2798	}
				2799	if (!err) {
				2800	*path = nd->path;
				2801	nd->path.mnt = NULL;
				2802	nd->path.dentry = NULL;
				2803	follow_mount(path);
				2804	}
				2805	terminate_walk(nd);
				2806	return err;
				2807	}
				2808
				2809	static int
				2810	filename_mountpoint(int dfd, struct filename name, struct path path,
				2811	unsigned int flags)
				2812	{
				2813	struct nameidata nd;
				2814	int error;
				2815	if (IS_ERR(name))
				2816	return PTR_ERR(name);
				2817	set_nameidata(&nd, dfd, name);
				2818	error = path_mountpoint(&nd, flags \| LOOKUP_RCU, path);
				2819	if (unlikely(error == -ECHILD))
				2820	error = path_mountpoint(&nd, flags, path);
				2821	if (unlikely(error == -ESTALE))
				2822	error = path_mountpoint(&nd, flags \| LOOKUP_REVAL, path);
				2823	if (likely(!error))
				2824	audit_inode(name, path->dentry, 0);
				2825	restore_nameidata();
				2826	putname(name);
				2827	return error;
				2828	}
				2829
				2830	/**
				2831	* user_path_mountpoint_at - lookup a path from userland in order to umount it
				2832	* @dfd: directory file descriptor
				2833	* @name: pathname from userland
				2834	* @flags: lookup flags
				2835	* @path: pointer to container to hold result
				2836	*
				2837	* A umount is a special case for path walking. We're not actually interested
				2838	* in the inode in this situation, and ESTALE errors can be a problem. We
				2839	* simply want track down the dentry and vfsmount attached at the mountpoint
				2840	* and avoid revalidating the last component.
				2841	*
				2842	* Returns 0 and populates "path" on success.
				2843	*/
				2844	int
				2845	user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags,
				2846	struct path *path)
				2847	{
				2848	return filename_mountpoint(dfd, getname(name), path, flags);
				2849	}
				2850
				/*
				 * Kernel-space counterpart of user_path_mountpoint_at(): @name is a
				 * kernel string, converted with getname_kernel().
				 * Returns whatever filename_mountpoint() returns.
				 */
				int
				kern_path_mountpoint(int dfd, const char *name, struct path *path,
						     unsigned int flags)
				{
					return filename_mountpoint(dfd, getname_kernel(name), path, flags);
				}
				EXPORT_SYMBOL(kern_path_mountpoint);
				2858
				2859	int __check_sticky(struct inode dir, struct inode inode)
				2860	{
				2861	kuid_t fsuid = current_fsuid();
				2862
				2863	if (uid_eq(inode->i_uid, fsuid))
				2864	return 0;
				2865	if (uid_eq(dir->i_uid, fsuid))
				2866	return 0;
				2867	return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
				2868	}
				2869	EXPORT_SYMBOL(__check_sticky);
				2870
				2871	/*
				2872	* Check whether we can remove a link victim from directory dir, check
				2873	* whether the type of victim is right.
				2874	* 1. We can't do it if dir is read-only (done in permission())
				2875	* 2. We should have write and exec permissions on dir
				2876	* 3. We can't remove anything from append-only dir
				2877	* 4. We can't do anything with immutable dir (done in permission())
				2878	* 5. If the sticky bit on dir is set we should either
				2879	* a. be owner of dir, or
				2880	* b. be owner of victim, or
				2881	* c. have CAP_FOWNER capability
				2882	* 6. If the victim is append-only or immutable we can't do antyhing with
				2883	* links pointing to it.
				2884	* 7. If the victim has an unknown uid or gid we can't change the inode.
				2885	* 8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
				2886	* 9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
				2887	* 10. We can't remove a root or mountpoint.
				2888	* 11. We don't allow removal of NFS sillyrenamed files; it's handled by
				2889	* nfs_async_unlink().
				2890	*/
				2891	static int may_delete(struct vfsmount mnt, struct inode dir, struct dentry *victim, bool isdir)
				2892	{
				2893	struct inode *inode = d_backing_inode(victim);
				2894	int error;
				2895
				2896	if (d_is_negative(victim))
				2897	return -ENOENT;
				2898	BUG_ON(!inode);
				2899
				2900	BUG_ON(victim->d_parent->d_inode != dir);
				2901
				2902	/* Inode writeback is not safe when the uid or gid are invalid. */
				2903	if (!uid_valid(inode->i_uid) \|\| !gid_valid(inode->i_gid))
				2904	return -EOVERFLOW;
				2905
				2906	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
				2907
				2908	error = inode_permission2(mnt, dir, MAY_WRITE \| MAY_EXEC);
				2909	if (error)
				2910	return error;
				2911	if (IS_APPEND(dir))
				2912	return -EPERM;
				2913
				2914	if (check_sticky(dir, inode) \|\| IS_APPEND(inode) \|\|
				2915	IS_IMMUTABLE(inode) \|\| IS_SWAPFILE(inode) \|\| HAS_UNMAPPED_ID(inode))
				2916	return -EPERM;
				2917	if (isdir) {
				2918	if (!d_is_dir(victim))
				2919	return -ENOTDIR;
				2920	if (IS_ROOT(victim))
				2921	return -EBUSY;
				2922	} else if (d_is_dir(victim))
				2923	return -EISDIR;
				2924	if (IS_DEADDIR(dir))
				2925	return -ENOENT;
				2926	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
				2927	return -EBUSY;
				2928	return 0;
				2929	}
				2930
				2931	/* Check whether we can create an object with dentry child in directory
				2932	* dir.
				2933	* 1. We can't do it if child already exists (open has special treatment for
				2934	* this case, but since we are inlined it's OK)
				2935	* 2. We can't do it if dir is read-only (done in permission())
				2936	* 3. We can't do it if the fs can't represent the fsuid or fsgid.
				2937	* 4. We should have write and exec permissions on dir
				2938	* 5. We can't do it if dir is immutable (done in permission())
				2939	*/
				2940	static inline int may_create(struct vfsmount mnt, struct inode dir, struct dentry *child)
				2941	{
				2942	struct user_namespace *s_user_ns;
				2943	audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
				2944	if (child->d_inode)
				2945	return -EEXIST;
				2946	if (IS_DEADDIR(dir))
				2947	return -ENOENT;
				2948	s_user_ns = dir->i_sb->s_user_ns;
				2949	if (!kuid_has_mapping(s_user_ns, current_fsuid()) \|\|
				2950	!kgid_has_mapping(s_user_ns, current_fsgid()))
				2951	return -EOVERFLOW;
				2952	return inode_permission2(mnt, dir, MAY_WRITE \| MAY_EXEC);
				2953	}
				2954
				2955	/*
				2956	* p1 and p2 should be directories on the same fs.
				2957	*/
				2958	struct dentry lock_rename(struct dentry p1, struct dentry *p2)
				2959	{
				2960	struct dentry *p;
				2961
				2962	if (p1 == p2) {
				2963	inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
				2964	return NULL;
				2965	}
				2966
				2967	mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
				2968
				2969	p = d_ancestor(p2, p1);
				2970	if (p) {
				2971	inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
				2972	inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
				2973	return p;
				2974	}
				2975
				2976	p = d_ancestor(p1, p2);
				2977	if (p) {
				2978	inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
				2979	inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
				2980	return p;
				2981	}
				2982
				2983	inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
				2984	inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
				2985	return NULL;
				2986	}
				2987	EXPORT_SYMBOL(lock_rename);
				2988
				2989	void unlock_rename(struct dentry p1, struct dentry p2)
				2990	{
				2991	inode_unlock(p1->d_inode);
				2992	if (p1 != p2) {
				2993	inode_unlock(p2->d_inode);
				2994	mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
				2995	}
				2996	}
				2997	EXPORT_SYMBOL(unlock_rename);
				2998
				2999	int vfs_create2(struct vfsmount mnt, struct inode dir, struct dentry *dentry,
				3000	umode_t mode, bool want_excl)
				3001	{
				3002	int error = may_create(mnt, dir, dentry);
				3003	if (error)
				3004	return error;
				3005
				3006	if (!dir->i_op->create)
				3007	return -EACCES; /* shouldn't it be ENOSYS? */
				3008	mode &= S_IALLUGO;
				3009	mode \|= S_IFREG;
				3010	error = security_inode_create(dir, dentry, mode);
				3011	if (error)
				3012	return error;
				3013	error = dir->i_op->create(dir, dentry, mode, want_excl);
				3014	if (!error)
				3015	fsnotify_create(dir, dentry);
				3016	return error;
				3017	}
				3018	EXPORT_SYMBOL(vfs_create2);
				3019
				3020	int vfs_create(struct inode dir, struct dentry dentry, umode_t mode,
				3021	bool want_excl)
				3022	{
				3023	return vfs_create2(NULL, dir, dentry, mode, want_excl);
				3024	}
				3025	EXPORT_SYMBOL(vfs_create);
				3026
				3027	int vfs_mkobj2(struct vfsmount mnt, struct dentry dentry, umode_t mode,
				3028	int (f)(struct dentry , umode_t, void *),
				3029	void *arg)
				3030	{
				3031	struct inode *dir = dentry->d_parent->d_inode;
				3032	int error = may_create(mnt, dir, dentry);
				3033	if (error)
				3034	return error;
				3035
				3036	mode &= S_IALLUGO;
				3037	mode \|= S_IFREG;
				3038	error = security_inode_create(dir, dentry, mode);
				3039	if (error)
				3040	return error;
				3041	error = f(dentry, mode, arg);
				3042	if (!error)
				3043	fsnotify_create(dir, dentry);
				3044	return error;
				3045	}
				3046	EXPORT_SYMBOL(vfs_mkobj2);
				3047
				3048
				3049	int vfs_mkobj(struct dentry *dentry, umode_t mode,
				3050	int (f)(struct dentry , umode_t, void *),
				3051	void *arg)
				3052	{
				3053	return vfs_mkobj2(NULL, dentry, mode, f, arg);
				3054	}
				3055	EXPORT_SYMBOL(vfs_mkobj);
				3056
				3057	bool may_open_dev(const struct path *path)
				3058	{
				3059	return !(path->mnt->mnt_flags & MNT_NODEV) &&
				3060	!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
				3061	}
				3062
				3063	static int may_open(const struct path *path, int acc_mode, int flag)
				3064	{
				3065	struct dentry *dentry = path->dentry;
				3066	struct vfsmount *mnt = path->mnt;
				3067	struct inode *inode = dentry->d_inode;
				3068	int error;
				3069
				3070	if (!inode)
				3071	return -ENOENT;
				3072
				3073	switch (inode->i_mode & S_IFMT) {
				3074	case S_IFLNK:
				3075	return -ELOOP;
				3076	case S_IFDIR:
				3077	if (acc_mode & MAY_WRITE)
				3078	return -EISDIR;
				3079	break;
				3080	case S_IFBLK:
				3081	case S_IFCHR:
				3082	if (!may_open_dev(path))
				3083	return -EACCES;
				3084	/FALLTHRU/
				3085	case S_IFIFO:
				3086	case S_IFSOCK:
				3087	flag &= ~O_TRUNC;
				3088	break;
				3089	}
				3090
				3091	error = inode_permission2(mnt, inode, MAY_OPEN \| acc_mode);
				3092	if (error)
				3093	return error;
				3094
				3095	/*
				3096	* An append-only file must be opened in append mode for writing.
				3097	*/
				3098	if (IS_APPEND(inode)) {
				3099	if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
				3100	return -EPERM;
				3101	if (flag & O_TRUNC)
				3102	return -EPERM;
				3103	}
				3104
				3105	/* O_NOATIME can only be set by the owner or superuser */
				3106	if (flag & O_NOATIME && !inode_owner_or_capable(inode))
				3107	return -EPERM;
				3108
				3109	return 0;
				3110	}
				3111
				3112	static int handle_truncate(struct file *filp)
				3113	{
				3114	const struct path *path = &filp->f_path;
				3115	struct inode *inode = path->dentry->d_inode;
				3116	int error = get_write_access(inode);
				3117	if (error)
				3118	return error;
				3119	/*
				3120	* Refuse to truncate files with mandatory locks held on them.
				3121	*/
				3122	error = locks_verify_locked(filp);
				3123	if (!error)
				3124	error = security_path_truncate(path);
				3125	if (!error) {
				3126	error = do_truncate2(path->mnt, path->dentry, 0,
				3127	ATTR_MTIME\|ATTR_CTIME\|ATTR_OPEN,
				3128	filp);
				3129	}
				3130	put_write_access(inode);
				3131	return error;
				3132	}
				3133
				/*
				 * open_to_namei_flags - adjust open flags before they are handed to
				 * ->atomic_open(): when the O_ACCMODE bits equal 3 the value is
				 * decremented by one; every other value is returned unchanged.
				 */
				static inline int open_to_namei_flags(int flag)
				{
					return ((flag & O_ACCMODE) == 3) ? flag - 1 : flag;
				}
				3140
				3141	static int may_o_create(const struct path dir, struct dentry dentry, umode_t mode)
				3142	{
				3143	struct user_namespace *s_user_ns;
				3144	int error = security_path_mknod(dir, dentry, mode, 0);
				3145	if (error)
				3146	return error;
				3147
				3148	s_user_ns = dir->dentry->d_sb->s_user_ns;
				3149	if (!kuid_has_mapping(s_user_ns, current_fsuid()) \|\|
				3150	!kgid_has_mapping(s_user_ns, current_fsgid()))
				3151	return -EOVERFLOW;
				3152
				3153	error = inode_permission2(dir->mnt, dir->dentry->d_inode, MAY_WRITE \| MAY_EXEC);
				3154	if (error)
				3155	return error;
				3156
				3157	return security_inode_create(dir->dentry->d_inode, dentry, mode);
				3158	}
				3159
				3160	/*
				3161	* Attempt to atomically look up, create and open a file from a negative
				3162	* dentry.
				3163	*
				3164	* Returns 0 if successful. The file will have been created and attached to
				3165	* @file by the filesystem calling finish_open().
				3166	*
				3167	* If the file was looked up only or didn't need creating, FMODE_OPENED won't
				3168	* be set. The caller will need to perform the open themselves. @path will
				3169	* have been updated to point to the new dentry. This may be negative.
				3170	*
				3171	* Returns an error code otherwise.
					*
					* NOTE(review): in the code below a negative dentry in the "not opened"
					* case yields -ENOENT instead of being returned through @path, so the
					* "This may be negative" remark above looks stale - confirm.
					*
					* The reference on @dentry is consumed: it is dropped with dput() on every
					* path except the "return 0" that stores a dentry in @path.
				3172	*/
				3173	static int atomic_open(struct nameidata nd, struct dentry dentry,
				3174	struct path path, struct file file,
				3175	const struct open_flags *op,
				3176	int open_flag, umode_t mode)
				3177	{
				3178	struct dentry const DENTRY_NOT_SET = (void ) -1UL;
				3179	struct inode *dir = nd->path.dentry->d_inode;
				3180	int error;
				3181
				3182	if (!(~open_flag & (O_EXCL \| O_CREAT))) /* both O_EXCL and O_CREAT */
				3183	open_flag &= ~O_TRUNC;
				3184
				3185	if (nd->flags & LOOKUP_DIRECTORY)
				3186	open_flag \|= O_DIRECTORY;
				3187
					/*
					* Sentinel value: lets the code below detect whether ->atomic_open()
					* stored anything in file->f_path.dentry.
					*/
				3188	file->f_path.dentry = DENTRY_NOT_SET;
				3189	file->f_path.mnt = nd->path.mnt;
				3190	error = dir->i_op->atomic_open(dir, dentry, file,
				3191	open_to_namei_flags(open_flag), mode);
				3192	d_lookup_done(dentry);
				3193	if (!error) {
				3194	if (file->f_mode & FMODE_OPENED) {
				3195	/*
				3196	* We didn't have the inode before the open, so check open
				3197	* permission here.
				3198	*/
				3199	int acc_mode = op->acc_mode;
				3200	if (file->f_mode & FMODE_CREATED) {
				3201	WARN_ON(!(open_flag & O_CREAT));
				3202	fsnotify_create(dir, dentry);
					/* freshly created file: may_open() is called with no access bits */
				3203	acc_mode = 0;
				3204	}
				3205	error = may_open(&file->f_path, acc_mode, open_flag);
					/* a positive value from may_open() is unexpected: warn, use -EINVAL */
				3206	if (WARN_ON(error > 0))
				3207	error = -EINVAL;
				3208	} else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
					/* not opened and the sentinel is still in place: report -EIO */
				3209	error = -EIO;
				3210	} else {
					/* a non-NULL dentry left in file->f_path replaces the one passed in */
				3211	if (file->f_path.dentry) {
				3212	dput(dentry);
				3213	dentry = file->f_path.dentry;
				3214	}
				3215	if (file->f_mode & FMODE_CREATED)
				3216	fsnotify_create(dir, dentry);
				3217	if (unlikely(d_is_negative(dentry))) {
				3218	error = -ENOENT;
				3219	} else {
					/* looked up but not opened: hand the dentry to the caller via @path */
				3220	path->dentry = dentry;
				3221	path->mnt = nd->path.mnt;
				3222	return 0;
				3223	}
				3224	}
				3225	}
					/* reached on errors and in the FMODE_OPENED case: drop our dentry reference */
				3226	dput(dentry);
				3227	return error;
				3228	}
				3229
				3230	/*
				3231	* Look up and maybe create and open the last component.
				3232	*
				3233	* Must be called with parent locked (exclusive in O_CREAT case).
				3234	*
				3235	* Returns 0 on success, that is, if
				3236	* the file was successfully atomically created (if necessary) and opened, or
				3237	* the file was not completely opened at this time, though lookups and
				3238	* creations were performed.
				3239	* These cases are distinguished by presence of FMODE_OPENED on file->f_mode.
				3240	* In the latter case dentry returned in @path might be negative if O_CREAT
				3241	* hadn't been specified.
				3242	*
				3243	* An error code is returned on failure.
					*
					* @got_write tells whether the caller obtained write access to the mount;
					* without it O_CREAT is dropped and -EROFS is remembered as the creation
					* error.
				3244	*/
				3245	static int lookup_open(struct nameidata nd, struct path path,
				3246	struct file *file,
				3247	const struct open_flags *op,
				3248	bool got_write)
				3249	{
				3250	struct dentry *dir = nd->path.dentry;
				3251	struct inode *dir_inode = dir->d_inode;
				3252	int open_flag = op->open_flag;
				3253	struct dentry *dentry;
				3254	int error, create_error = 0;
				3255	umode_t mode = op->mode;
				3256	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
				3257
				3258	if (unlikely(IS_DEADDIR(dir_inode)))
				3259	return -ENOENT;
				3260
				3261	file->f_mode &= ~FMODE_CREATED;
				3262	dentry = d_lookup(dir, &nd->last);
					/*
					* Find or allocate the dentry.  A cached dentry that fails
					* revalidation (d_revalidate() == 0) is invalidated and dropped, and
					* the loop allocates a new one.
					*/
				3263	for (;;) {
				3264	if (!dentry) {
				3265	dentry = d_alloc_parallel(dir, &nd->last, &wq);
				3266	if (IS_ERR(dentry))
				3267	return PTR_ERR(dentry);
				3268	}
					/* dentry is still in the in-lookup state: skip revalidation */
				3269	if (d_in_lookup(dentry))
				3270	break;
				3271
				3272	error = d_revalidate(dentry, nd->flags);
				3273	if (likely(error > 0))
				3274	break;
				3275	if (error)
				3276	goto out_dput;
				3277	d_invalidate(dentry);
				3278	dput(dentry);
				3279	dentry = NULL;
				3280	}
				3281	if (dentry->d_inode) {
				3282	/* Cached positive dentry: will open in f_op->open */
				3283	goto out_no_open;
				3284	}
				3285
				3286	/*
				3287	* Checking write permission is tricky, because we don't know if we are
				3288	* going to actually need it: O_CREAT opens should work as long as the
				3289	* file exists. But checking existence breaks atomicity. The trick is
				3290	* to check access and if not granted clear O_CREAT from the flags.
				3291	*
				3292	* Another problem is returning the "right" error value (e.g. for an
				3293	* O_EXCL open we want to return EEXIST not EROFS).
				3294	*/
				3295	if (open_flag & O_CREAT) {
				3296	if (!IS_POSIXACL(dir->d_inode))
				3297	mode &= ~current_umask();
				3298	if (unlikely(!got_write)) {
					/* remember why creation is impossible; reported later only if needed */
				3299	create_error = -EROFS;
				3300	open_flag &= ~O_CREAT;
				3301	if (open_flag & (O_EXCL \| O_TRUNC))
				3302	goto no_open;
				3303	/* No side effects, safe to clear O_CREAT */
				3304	} else {
				3305	create_error = may_o_create(&nd->path, dentry, mode);
				3306	if (create_error) {
				3307	open_flag &= ~O_CREAT;
				3308	if (open_flag & O_EXCL)
				3309	goto no_open;
				3310	}
				3311	}
				3312	} else if ((open_flag & (O_TRUNC\|O_WRONLY\|O_RDWR)) &&
				3313	unlikely(!got_write)) {
				3314	/*
				3315	* No O_CREATE -> atomicity not a requirement -> fall
				3316	* back to lookup + open
				3317	*/
				3318	goto no_open;
				3319	}
				3320
				3321	if (dir_inode->i_op->atomic_open) {
				3322	error = atomic_open(nd, dentry, path, file, op, open_flag,
				3323	mode);
					/*
					* -ENOENT after O_CREAT had to be cleared: report the reason creation
					* was refused instead.
					*/
				3324	if (unlikely(error == -ENOENT) && create_error)
				3325	error = create_error;
				3326	return error;
				3327	}
				3328
					/* non-atomic path: ->lookup() if still needed, then ->create() for O_CREAT */
				3329	no_open:
				3330	if (d_in_lookup(dentry)) {
				3331	struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
				3332	nd->flags);
				3333	d_lookup_done(dentry);
					/* ->lookup() returned an error or a dentry to use in place of ours */
				3334	if (unlikely(res)) {
				3335	if (IS_ERR(res)) {
				3336	error = PTR_ERR(res);
				3337	goto out_dput;
				3338	}
				3339	dput(dentry);
				3340	dentry = res;
				3341	}
				3342	}
				3343
				3344	/* Negative dentry, just create the file */
				3345	if (!dentry->d_inode && (open_flag & O_CREAT)) {
				3346	file->f_mode \|= FMODE_CREATED;
				3347	audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
				3348	if (!dir_inode->i_op->create) {
				3349	error = -EACCES;
				3350	goto out_dput;
				3351	}
				3352	error = dir_inode->i_op->create(dir_inode, dentry, mode,
				3353	open_flag & O_EXCL);
				3354	if (error)
				3355	goto out_dput;
				3356	fsnotify_create(dir_inode, dentry);
				3357	}
					/* still negative and creation had been refused: report that refusal */
				3358	if (unlikely(create_error) && !dentry->d_inode) {
				3359	error = create_error;
				3360	goto out_dput;
				3361	}
					/* success without opening: the caller gets the dentry through @path */
				3362	out_no_open:
				3363	path->dentry = dentry;
				3364	path->mnt = nd->path.mnt;
				3365	return 0;
				3366
				3367	out_dput:
				3368	dput(dentry);
				3369	return error;
				3370	}
				3371
				3372	/*
				3373	* Handle the last step of open()
					*
					* Returns 0 on success or a negative error code.  A non-zero value coming
					* back from step_into() is returned as is; path_openat() treats a positive
					* return as "continue the walk" and calls trailing_symlink().  A positive
					* value reaching the "out" label is unexpected: it triggers a warning and
					* is turned into -EINVAL.
				3374	*/
				3375	static int do_last(struct nameidata *nd,
				3376	struct file file, const struct open_flags op)
				3377	{
				3378	struct dentry *dir = nd->path.dentry;
				3379	kuid_t dir_uid = nd->inode->i_uid;
				3380	umode_t dir_mode = nd->inode->i_mode;
				3381	int open_flag = op->open_flag;
				3382	bool will_truncate = (open_flag & O_TRUNC) != 0;
				3383	bool got_write = false;
				3384	int acc_mode = op->acc_mode;
				3385	unsigned seq;
				3386	struct inode *inode;
				3387	struct path path;
				3388	int error;
				3389
				3390	nd->flags &= ~LOOKUP_PARENT;
				3391	nd->flags \|= op->intent;
				3392
					/* last component is not a normal name: let handle_dots() deal with it */
				3393	if (nd->last_type != LAST_NORM) {
				3394	error = handle_dots(nd, nd->last_type);
				3395	if (unlikely(error))
				3396	return error;
				3397	goto finish_open;
				3398	}
				3399
				3400	if (!(open_flag & O_CREAT)) {
					/* a character follows the last component (trailing slashes) */
				3401	if (nd->last.name[nd->last.len])
				3402	nd->flags \|= LOOKUP_FOLLOW \| LOOKUP_DIRECTORY;
				3403	/* we _can_ be in RCU mode here */
				3404	error = lookup_fast(nd, &path, &inode, &seq);
				3405	if (likely(error > 0))
				3406	goto finish_lookup;
				3407
				3408	if (error < 0)
				3409	return error;
				3410
				3411	BUG_ON(nd->inode != dir->d_inode);
				3412	BUG_ON(nd->flags & LOOKUP_RCU);
				3413	} else {
				3414	/* create side of things */
				3415	/*
				3416	* This will only deal with leaving RCU mode - LOOKUP_JUMPED
				3417	* has been cleared when we got to the last component we are
				3418	* about to look up
				3419	*/
				3420	error = complete_walk(nd);
				3421	if (error)
				3422	return error;
				3423
				3424	audit_inode(nd->name, dir, LOOKUP_PARENT);
				3425	/* trailing slashes? */
				3426	if (unlikely(nd->last.name[nd->last.len]))
				3427	return -EISDIR;
				3428	}
				3429
				3430	if (open_flag & (O_CREAT \| O_TRUNC \| O_WRONLY \| O_RDWR)) {
				3431	error = mnt_want_write(nd->path.mnt);
				3432	if (!error)
				3433	got_write = true;
				3434	/*
				3435	* do _not_ fail yet - we might not need that or fail with
				3436	* a different error; let lookup_open() decide; we'll be
				3437	* dropping this one anyway.
				3438	*/
				3439	}
					/* parent is locked exclusively for O_CREAT, shared otherwise */
				3440	if (open_flag & O_CREAT)
				3441	inode_lock(dir->d_inode);
				3442	else
				3443	inode_lock_shared(dir->d_inode);
				3444	error = lookup_open(nd, &path, file, op, got_write);
				3445	if (open_flag & O_CREAT)
				3446	inode_unlock(dir->d_inode);
				3447	else
				3448	inode_unlock_shared(dir->d_inode);
				3449
				3450	if (error)
				3451	goto out;
				3452
					/* lookup_open() already opened the file: skip may_open()/vfs_open() */
				3453	if (file->f_mode & FMODE_OPENED) {
				3454	if ((file->f_mode & FMODE_CREATED) \|\|
				3455	!S_ISREG(file_inode(file)->i_mode))
				3456	will_truncate = false;
				3457
				3458	audit_inode(nd->name, file->f_path.dentry, 0);
				3459	goto opened;
				3460	}
				3461
				3462	if (file->f_mode & FMODE_CREATED) {
				3463	/* Don't check for write permission, don't truncate */
				3464	open_flag &= ~O_TRUNC;
				3465	will_truncate = false;
				3466	acc_mode = 0;
				3467	path_to_nameidata(&path, nd);
				3468	goto finish_open_created;
				3469	}
				3470
				3471	/*
				3472	* If atomic_open() acquired write access it is dropped now due to
				3473	* possible mount and symlink following (this might be optimized away if
				3474	* necessary...)
				3475	*/
				3476	if (got_write) {
				3477	mnt_drop_write(nd->path.mnt);
				3478	got_write = false;
				3479	}
				3480
				3481	error = follow_managed(&path, nd);
				3482	if (unlikely(error < 0))
				3483	return error;
				3484
				3485	if (unlikely(d_is_negative(path.dentry))) {
				3486	path_to_nameidata(&path, nd);
				3487	return -ENOENT;
				3488	}
				3489
				3490	/*
				3491	* create/update audit record if it already exists.
				3492	*/
				3493	audit_inode(nd->name, path.dentry, 0);
				3494
					/* the file exists (positive dentry) but O_CREAT|O_EXCL was requested */
				3495	if (unlikely((open_flag & (O_EXCL \| O_CREAT)) == (O_EXCL \| O_CREAT))) {
				3496	path_to_nameidata(&path, nd);
				3497	return -EEXIST;
				3498	}
				3499
				3500	seq = 0; /* out of RCU mode, so the value doesn't matter */
				3501	inode = d_backing_inode(path.dentry);
				3502	finish_lookup:
				3503	error = step_into(nd, &path, 0, inode, seq);
				3504	if (unlikely(error))
				3505	return error;
				3506	finish_open:
				3507	/* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
				3508	error = complete_walk(nd);
				3509	if (error)
				3510	return error;
				3511	audit_inode(nd->name, nd->path.dentry, 0);
				3512	if (open_flag & O_CREAT) {
				3513	error = -EISDIR;
				3514	if (d_is_dir(nd->path.dentry))
				3515	goto out;
					/* uses the parent's mode and owner sampled at function entry */
				3516	error = may_create_in_sticky(dir_mode, dir_uid,
				3517	d_backing_inode(nd->path.dentry));
				3518	if (unlikely(error))
				3519	goto out;
				3520	}
				3521	error = -ENOTDIR;
				3522	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
				3523	goto out;
					/* only regular files get truncated */
				3524	if (!d_is_reg(nd->path.dentry))
				3525	will_truncate = false;
				3526
				3527	if (will_truncate) {
				3528	error = mnt_want_write(nd->path.mnt);
				3529	if (error)
				3530	goto out;
				3531	got_write = true;
				3532	}
				3533	finish_open_created:
				3534	error = may_open(&nd->path, acc_mode, open_flag);
				3535	if (error)
				3536	goto out;
				3537	BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */
				3538	error = vfs_open(&nd->path, file);
				3539	if (error)
				3540	goto out;
				3541	opened:
				3542	error = ima_file_check(file, op->acc_mode);
				3543	if (!error && will_truncate)
				3544	error = handle_truncate(file);
				3545	out:
				3546	if (unlikely(error > 0)) {
				3547	WARN_ON(1);
				3548	error = -EINVAL;
				3549	}
					/* release mount write access if it is still held */
				3550	if (got_write)
				3551	mnt_drop_write(nd->path.mnt);
				3552	return error;
				3553	}
				3554
				3555	struct dentry vfs_tmpfile(struct dentry dentry, umode_t mode, int open_flag)
				3556	{
				3557	struct dentry *child = NULL;
				3558	struct inode *dir = dentry->d_inode;
				3559	struct inode *inode;
				3560	int error;
				3561
				3562	/* we want directory to be writable */
				3563	error = inode_permission2(ERR_PTR(-EOPNOTSUPP), dir,
				3564	MAY_WRITE \| MAY_EXEC);
				3565	if (error)
				3566	goto out_err;
				3567	error = -EOPNOTSUPP;
				3568	if (!dir->i_op->tmpfile)
				3569	goto out_err;
				3570	error = -ENOMEM;
				3571	child = d_alloc(dentry, &slash_name);
				3572	if (unlikely(!child))
				3573	goto out_err;
				3574	error = dir->i_op->tmpfile(dir, child, mode);
				3575	if (error)
				3576	goto out_err;
				3577	error = -ENOENT;
				3578	inode = child->d_inode;
				3579	if (unlikely(!inode))
				3580	goto out_err;
				3581	if (!(open_flag & O_EXCL)) {
				3582	spin_lock(&inode->i_lock);
				3583	inode->i_state \|= I_LINKABLE;
				3584	spin_unlock(&inode->i_lock);
				3585	}
				3586	return child;
				3587
				3588	out_err:
				3589	dput(child);
				3590	return ERR_PTR(error);
				3591	}
				3592	EXPORT_SYMBOL(vfs_tmpfile);
				3593
				3594	static int do_tmpfile(struct nameidata *nd, unsigned flags,
				3595	const struct open_flags *op,
				3596	struct file *file)
				3597	{
				3598	struct dentry *child;
				3599	struct path path;
				3600	int error = path_lookupat(nd, flags \| LOOKUP_DIRECTORY, &path);
				3601	if (unlikely(error))
				3602	return error;
				3603	error = mnt_want_write(path.mnt);
				3604	if (unlikely(error))
				3605	goto out;
				3606	child = vfs_tmpfile(path.dentry, op->mode, op->open_flag);
				3607	error = PTR_ERR(child);
				3608	if (IS_ERR(child))
				3609	goto out2;
				3610	dput(path.dentry);
				3611	path.dentry = child;
				3612	audit_inode(nd->name, child, 0);
				3613	/* Don't check for other permissions, the inode was just created */
				3614	error = may_open(&path, 0, op->open_flag);
				3615	if (error)
				3616	goto out2;
				3617	file->f_path.mnt = path.mnt;
				3618	error = finish_open(file, child, NULL);
				3619	out2:
				3620	mnt_drop_write(path.mnt);
				3621	out:
				3622	path_put(&path);
				3623	return error;
				3624	}
				3625
				3626	static int do_o_path(struct nameidata nd, unsigned flags, struct file file)
				3627	{
				3628	struct path path;
				3629	int error = path_lookupat(nd, flags, &path);
				3630	if (!error) {
				3631	audit_inode(nd->name, path.dentry, 0);
				3632	error = vfs_open(&path, file);
				3633	path_put(&path);
				3634	}
				3635	return error;
				3636	}
				3637
				3638	static struct file path_openat(struct nameidata nd,
				3639	const struct open_flags *op, unsigned flags)
				3640	{
				3641	struct file *file;
				3642	int error;
				3643
				3644	file = alloc_empty_file(op->open_flag, current_cred());
				3645	if (IS_ERR(file))
				3646	return file;
				3647
				3648	if (unlikely(file->f_flags & __O_TMPFILE)) {
				3649	error = do_tmpfile(nd, flags, op, file);
				3650	} else if (unlikely(file->f_flags & O_PATH)) {
				3651	error = do_o_path(nd, flags, file);
				3652	} else {
				3653	const char *s = path_init(nd, flags);
				3654	while (!(error = link_path_walk(s, nd)) &&
				3655	(error = do_last(nd, file, op)) > 0) {
				3656	nd->flags &= ~(LOOKUP_OPEN\|LOOKUP_CREATE\|LOOKUP_EXCL);
				3657	s = trailing_symlink(nd);
				3658	}
				3659	terminate_walk(nd);
				3660	}
				3661	if (likely(!error)) {
				3662	if (likely(file->f_mode & FMODE_OPENED))
				3663	return file;
				3664	WARN_ON(1);
				3665	error = -EINVAL;
				3666	}
				3667	fput(file);
				3668	if (error == -EOPENSTALE) {
				3669	if (flags & LOOKUP_RCU)
				3670	error = -ECHILD;
				3671	else
				3672	error = -ESTALE;
				3673	}
				3674	return ERR_PTR(error);
				3675	}
				3676
				3677	struct file do_filp_open(int dfd, struct filename pathname,
				3678	const struct open_flags *op)
				3679	{
				3680	struct nameidata nd;
				3681	int flags = op->lookup_flags;
				3682	struct file *filp;
				3683
				3684	set_nameidata(&nd, dfd, pathname);
				3685	filp = path_openat(&nd, op, flags \| LOOKUP_RCU);
				3686	if (unlikely(filp == ERR_PTR(-ECHILD)))
				3687	filp = path_openat(&nd, op, flags);
				3688	if (unlikely(filp == ERR_PTR(-ESTALE)))
				3689	filp = path_openat(&nd, op, flags \| LOOKUP_REVAL);
				3690	restore_nameidata();
				3691	return filp;
				3692	}
				3693
				3694	struct file do_file_open_root(struct dentry dentry, struct vfsmount *mnt,
				3695	const char name, const struct open_flags op)
				3696	{
				3697	struct nameidata nd;
				3698	struct file *file;
				3699	struct filename *filename;
				3700	int flags = op->lookup_flags \| LOOKUP_ROOT;
				3701
				3702	nd.root.mnt = mnt;
				3703	nd.root.dentry = dentry;
				3704
				3705	if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
				3706	return ERR_PTR(-ELOOP);
				3707
				3708	filename = getname_kernel(name);
				3709	if (IS_ERR(filename))
				3710	return ERR_CAST(filename);
				3711
				3712	set_nameidata(&nd, -1, filename);
				3713	file = path_openat(&nd, op, flags \| LOOKUP_RCU);
				3714	if (unlikely(file == ERR_PTR(-ECHILD)))
				3715	file = path_openat(&nd, op, flags);
				3716	if (unlikely(file == ERR_PTR(-ESTALE)))
				3717	file = path_openat(&nd, op, flags \| LOOKUP_REVAL);
				3718	restore_nameidata();
				3719	putname(filename);
				3720	return file;
				3721	}
				3722
					/*
					* filename_create - look up the parent of @name and return the dentry of
					* its last component, ready for creation.
					*
					* On success the returned dentry is not positive, the parent inode is
					* locked (I_MUTEX_PARENT), write access to the mount is held and @path
					* refers to the parent; done_path_create() releases all of these.
					*
					* On failure an ERR_PTR() is returned with the lock, the write access and
					* the path reference already released: -EEXIST when the last component is
					* not a normal name or already exists, -ENOENT for a trailing slash
					* without LOOKUP_DIRECTORY, the mnt_want_write() error, or the lookup
					* error.  @name is released with putname() on every path that gets past
					* filename_parentat().
					*/
				3723	static struct dentry filename_create(int dfd, struct filename name,
				3724	struct path *path, unsigned int lookup_flags)
				3725	{
				3726	struct dentry *dentry = ERR_PTR(-EEXIST);
				3727	struct qstr last;
				3728	int type;
				3729	int err2;
				3730	int error;
				3731	bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);
				3732
				3733	/*
				3734	* Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
				3735	* other flags passed in are ignored!
				3736	*/
				3737	lookup_flags &= LOOKUP_REVAL;
				3738
				3739	name = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
				3740	if (IS_ERR(name))
				3741	return ERR_CAST(name);
				3742
				3743	/*
				3744	* Yucky last component or no last component at all?
				3745	* (foo/., foo/.., /////)
				3746	*/
					/* dentry still holds its initial ERR_PTR(-EEXIST) here */
				3747	if (unlikely(type != LAST_NORM))
				3748	goto out;
				3749
				3750	/* don't fail immediately if it's r/o, at least try to report other errors */
				3751	err2 = mnt_want_write(path->mnt);
				3752	/*
				3753	* Do the final lookup.
				3754	*/
				3755	lookup_flags \|= LOOKUP_CREATE \| LOOKUP_EXCL;
				3756	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
				3757	dentry = __lookup_hash(&last, path->dentry, lookup_flags);
				3758	if (IS_ERR(dentry))
				3759	goto unlock;
				3760
				3761	error = -EEXIST;
				3762	if (d_is_positive(dentry))
				3763	goto fail;
				3764
				3765	/*
				3766	* Special case - lookup gave negative, but... we had foo/bar/
				3767	* From the vfs_mknod() POV we just have a negative dentry -
				3768	* all is fine. Let's be bastards - you had / on the end, you've
				3769	* been asking for (non-existent) directory. -ENOENT for you.
				3770	*/
				3771	if (unlikely(!is_dir && last.name[last.len])) {
				3772	error = -ENOENT;
				3773	goto fail;
				3774	}
					/* the deferred mnt_want_write() failure is reported only now */
				3775	if (unlikely(err2)) {
				3776	error = err2;
				3777	goto fail;
				3778	}
					/* success: parent stays locked and mount write access stays held */
				3779	putname(name);
				3780	return dentry;
				3781	fail:
				3782	dput(dentry);
				3783	dentry = ERR_PTR(error);
				3784	unlock:
				3785	inode_unlock(path->dentry->d_inode);
					/* drop write access only if mnt_want_write() had succeeded */
				3786	if (!err2)
				3787	mnt_drop_write(path->mnt);
				3788	out:
				3789	path_put(path);
				3790	putname(name);
				3791	return dentry;
				3792	}
				3793
				/*
				 * kern_path_create - filename_create() for a kernel-space pathname.
				 *
				 * @pathname is wrapped with getname_kernel(); the remaining arguments
				 * and the return value are those of filename_create().
				 */
				struct dentry *kern_path_create(int dfd, const char *pathname,
						struct path *path, unsigned int lookup_flags)
				{
					struct filename *fname = getname_kernel(pathname);

					return filename_create(dfd, fname, path, lookup_flags);
				}
				EXPORT_SYMBOL(kern_path_create);
				3801
				3802	void done_path_create(struct path path, struct dentry dentry)
				3803	{
				3804	dput(dentry);
				3805	inode_unlock(path->dentry->d_inode);
				3806	mnt_drop_write(path->mnt);
				3807	path_put(path);
				3808	}
				3809	EXPORT_SYMBOL(done_path_create);
				3810
				3811	inline struct dentry user_path_create(int dfd, const char __user pathname,
				3812	struct path *path, unsigned int lookup_flags)
				3813	{
				3814	return filename_create(dfd, getname(pathname), path, lookup_flags);
				3815	}
				3816	EXPORT_SYMBOL(user_path_create);
				3817
				3818	int vfs_mknod2(struct vfsmount mnt, struct inode dir, struct dentry *dentry, umode_t mode, dev_t dev)
				3819	{
				3820	int error = may_create(mnt, dir, dentry);
				3821
				3822	if (error)
				3823	return error;
				3824
				3825	if ((S_ISCHR(mode) \|\| S_ISBLK(mode)) && !capable(CAP_MKNOD))
				3826	return -EPERM;
				3827
				3828	if (!dir->i_op->mknod)
				3829	return -EPERM;
				3830
				3831	error = devcgroup_inode_mknod(mode, dev);
				3832	if (error)
				3833	return error;
				3834
				3835	error = security_inode_mknod(dir, dentry, mode, dev);
				3836	if (error)
				3837	return error;
				3838
				3839	error = dir->i_op->mknod(dir, dentry, mode, dev);
				3840	if (!error)
				3841	fsnotify_create(dir, dentry);
				3842	return error;
				3843	}
				3844	EXPORT_SYMBOL(vfs_mknod2);
				3845
				3846	int vfs_mknod(struct inode dir, struct dentry dentry, umode_t mode, dev_t dev)
				3847	{
				3848	return vfs_mknod2(NULL, dir, dentry, mode, dev);
				3849	}
				3850	EXPORT_SYMBOL(vfs_mknod);
				3851
				3852	static int may_mknod(umode_t mode)
				3853	{
				3854	switch (mode & S_IFMT) {
				3855	case S_IFREG:
				3856	case S_IFCHR:
				3857	case S_IFBLK:
				3858	case S_IFIFO:
				3859	case S_IFSOCK:
				3860	case 0: /* zero mode translates to S_IFREG */
				3861	return 0;
				3862	case S_IFDIR:
				3863	return -EPERM;
				3864	default:
				3865	return -EINVAL;
				3866	}
				3867	}
				3868
				3869	long do_mknodat(int dfd, const char __user *filename, umode_t mode,
				3870	unsigned int dev)
				3871	{
				3872	struct dentry *dentry;
				3873	struct path path;
				3874	int error;
				3875	unsigned int lookup_flags = 0;
				3876
				3877	error = may_mknod(mode);
				3878	if (error)
				3879	return error;
				3880	retry:
				3881	dentry = user_path_create(dfd, filename, &path, lookup_flags);
				3882	if (IS_ERR(dentry))
				3883	return PTR_ERR(dentry);
				3884
				3885	if (!IS_POSIXACL(path.dentry->d_inode))
				3886	mode &= ~current_umask();
				3887	error = security_path_mknod(&path, dentry, mode, dev);
				3888	if (error)
				3889	goto out;
				3890	switch (mode & S_IFMT) {
				3891	case 0: case S_IFREG:
				3892	error = vfs_create2(path.mnt, path.dentry->d_inode,dentry,mode,true);
				3893	if (!error)
				3894	ima_post_path_mknod(dentry);
				3895	break;
				3896	case S_IFCHR: case S_IFBLK:
				3897	error = vfs_mknod2(path.mnt, path.dentry->d_inode,dentry,mode,
				3898	new_decode_dev(dev));
				3899	break;
				3900	case S_IFIFO: case S_IFSOCK:
				3901	error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
				3902	break;
				3903	}
				3904	out:
				3905	done_path_create(&path, dentry);
				3906	if (retry_estale(error, lookup_flags)) {
				3907	lookup_flags \|= LOOKUP_REVAL;
				3908	goto retry;
				3909	}
				3910	return error;
				3911	}
				3912
				3913	SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
				3914	unsigned int, dev)
				3915	{
				3916	return do_mknodat(dfd, filename, mode, dev);
				3917	}
				3918
				3919	SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
				3920	{
				3921	return do_mknodat(AT_FDCWD, filename, mode, dev);
				3922	}
				3923
				3924	int vfs_mkdir2(struct vfsmount mnt, struct inode dir, struct dentry *dentry, umode_t mode)
				3925	{
				3926	int error = may_create(mnt, dir, dentry);
				3927	unsigned max_links = dir->i_sb->s_max_links;
				3928
				3929	if (error)
				3930	return error;
				3931
				3932	if (!dir->i_op->mkdir)
				3933	return -EPERM;
				3934
				3935	mode &= (S_IRWXUGO\|S_ISVTX);
				3936	error = security_inode_mkdir(dir, dentry, mode);
				3937	if (error)
				3938	return error;
				3939
				3940	if (max_links && dir->i_nlink >= max_links)
				3941	return -EMLINK;
				3942
				3943	error = dir->i_op->mkdir(dir, dentry, mode);
				3944	if (!error)
				3945	fsnotify_mkdir(dir, dentry);
				3946	return error;
				3947	}
				3948	EXPORT_SYMBOL(vfs_mkdir2);
				3949
				3950	int vfs_mkdir(struct inode dir, struct dentry dentry, umode_t mode)
				3951	{
				3952	return vfs_mkdir2(NULL, dir, dentry, mode);
				3953	}
				3954	EXPORT_SYMBOL(vfs_mkdir);
				3955
				3956	long do_mkdirat(int dfd, const char __user *pathname, umode_t mode)
				3957	{
				3958	struct dentry *dentry;
				3959	struct path path;
				3960	int error;
				3961	unsigned int lookup_flags = LOOKUP_DIRECTORY;
				3962
				3963	retry:
				3964	dentry = user_path_create(dfd, pathname, &path, lookup_flags);
				3965	if (IS_ERR(dentry))
				3966	return PTR_ERR(dentry);
				3967
				3968	if (!IS_POSIXACL(path.dentry->d_inode))
				3969	mode &= ~current_umask();
				3970	error = security_path_mkdir(&path, dentry, mode);
				3971	if (!error)
				3972	error = vfs_mkdir2(path.mnt, path.dentry->d_inode, dentry, mode);
				3973	done_path_create(&path, dentry);
				3974	if (retry_estale(error, lookup_flags)) {
				3975	lookup_flags \|= LOOKUP_REVAL;
				3976	goto retry;
				3977	}
				3978	return error;
				3979	}
				3980
				3981	SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
				3982	{
				3983	return do_mkdirat(dfd, pathname, mode);
				3984	}
				3985
				3986	SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
				3987	{
				3988	return do_mkdirat(AT_FDCWD, pathname, mode);
				3989	}
				3990
				3991	int vfs_rmdir2(struct vfsmount mnt, struct inode dir, struct dentry *dentry)
				3992	{
				3993	int error = may_delete(mnt, dir, dentry, 1);
				3994
				3995	if (error)
				3996	return error;
				3997
				3998	if (!dir->i_op->rmdir)
				3999	return -EPERM;
				4000
				4001	dget(dentry);
				4002	inode_lock(dentry->d_inode);
				4003
				4004	error = -EBUSY;
				4005	if (is_local_mountpoint(dentry))
				4006	goto out;
				4007
				4008	error = security_inode_rmdir(dir, dentry);
				4009	if (error)
				4010	goto out;
				4011
				4012	error = dir->i_op->rmdir(dir, dentry);
				4013	if (error)
				4014	goto out;
				4015
				4016	shrink_dcache_parent(dentry);
				4017	dentry->d_inode->i_flags \|= S_DEAD;
				4018	dont_mount(dentry);
				4019	detach_mounts(dentry);
				4020
				4021	out:
				4022	inode_unlock(dentry->d_inode);
				4023	dput(dentry);
				4024	if (!error)
				4025	d_delete(dentry);
				4026	return error;
				4027	}
				4028	EXPORT_SYMBOL(vfs_rmdir2);
				4029
				4030	int vfs_rmdir(struct inode dir, struct dentry dentry)
				4031	{
				4032	return vfs_rmdir2(NULL, dir, dentry);
				4033	}
				4034	EXPORT_SYMBOL(vfs_rmdir);
				4035
				4036	long do_rmdir(int dfd, const char __user *pathname)
				4037	{
				4038	int error = 0;
				4039	struct filename *name;
				4040	struct dentry *dentry;
				4041	struct path path;
				4042	struct qstr last;
				4043	int type;
				4044	unsigned int lookup_flags = 0;
				4045	retry:
				4046	name = filename_parentat(dfd, getname(pathname), lookup_flags,
				4047	&path, &last, &type);
				4048	if (IS_ERR(name))
				4049	return PTR_ERR(name);
				4050
				4051	switch (type) {
				4052	case LAST_DOTDOT:
				4053	error = -ENOTEMPTY;
				4054	goto exit1;
				4055	case LAST_DOT:
				4056	error = -EINVAL;
				4057	goto exit1;
				4058	case LAST_ROOT:
				4059	error = -EBUSY;
				4060	goto exit1;
				4061	}
				4062
				4063	error = mnt_want_write(path.mnt);
				4064	if (error)
				4065	goto exit1;
				4066
				4067	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
				4068	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
				4069	error = PTR_ERR(dentry);
				4070	if (IS_ERR(dentry))
				4071	goto exit2;
				4072	if (!dentry->d_inode) {
				4073	error = -ENOENT;
				4074	goto exit3;
				4075	}
				4076	error = security_path_rmdir(&path, dentry);
				4077	if (error)
				4078	goto exit3;
				4079	error = vfs_rmdir2(path.mnt, path.dentry->d_inode, dentry);
				4080	exit3:
				4081	dput(dentry);
				4082	exit2:
				4083	inode_unlock(path.dentry->d_inode);
				4084	mnt_drop_write(path.mnt);
				4085	exit1:
				4086	path_put(&path);
				4087	putname(name);
				4088	if (retry_estale(error, lookup_flags)) {
				4089	lookup_flags \|= LOOKUP_REVAL;
				4090	goto retry;
				4091	}
				4092	return error;
				4093	}
				4094
				4095	SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
				4096	{
				4097	return do_rmdir(AT_FDCWD, pathname);
				4098	}
				4099
				4100	/**
				4101	* vfs_unlink - unlink a filesystem object
				4102	* @dir: parent directory
				4103	* @dentry: victim
				4104	* @delegated_inode: returns victim inode, if the inode is delegated.
				4105	*
				4106	* The caller must hold dir->i_mutex.
				4107	*
				4108	* If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
				4109	* return a reference to the inode in delegated_inode. The caller
				4110	* should then break the delegation on that inode and retry. Because
				4111	* breaking a delegation may take a long time, the caller should drop
				4112	* dir->i_mutex before doing so.
				4113	*
				4114	* Alternatively, a caller may pass NULL for delegated_inode. This may
				4115	* be appropriate for callers that expect the underlying filesystem not
				4116	* to be NFS exported.
				4117	*/
				4118	int vfs_unlink2(struct vfsmount mnt, struct inode dir, struct dentry dentry, struct inode *delegated_inode)
				4119	{
				4120	struct inode *target = dentry->d_inode;
				4121	int error = may_delete(mnt, dir, dentry, 0);
				4122
				4123	if (error)
				4124	return error;
				4125
				4126	if (!dir->i_op->unlink)
				4127	return -EPERM;
				4128
				4129	inode_lock(target);
				4130	if (is_local_mountpoint(dentry))
				4131	error = -EBUSY;
				4132	else {
				4133	error = security_inode_unlink(dir, dentry);
				4134	if (!error) {
				4135	error = try_break_deleg(target, delegated_inode);
				4136	if (error)
				4137	goto out;
				4138	error = dir->i_op->unlink(dir, dentry);
				4139	if (!error) {
				4140	dont_mount(dentry);
				4141	detach_mounts(dentry);
				4142	}
				4143	}
				4144	}
				4145	out:
				4146	inode_unlock(target);
				4147
				4148	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
				4149	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
				4150	fsnotify_link_count(target);
				4151	d_delete(dentry);
				4152	}
				4153
				4154	return error;
				4155	}
				4156	EXPORT_SYMBOL(vfs_unlink2);
				4157
				4158	int vfs_unlink(struct inode dir, struct dentry dentry, struct inode **delegated_inode)
				4159	{
				4160	return vfs_unlink2(NULL, dir, dentry, delegated_inode);
				4161	}
				4162	EXPORT_SYMBOL(vfs_unlink);
				4163
				4164	/*
				4165	* Make sure that the actual truncation of the file will occur outside its
				4166	* directory's i_mutex. Truncate can take a long time if there is a lot of
				4167	* writeout happening, and we don't want to prevent access to the directory
				4168	* while waiting on the I/O.
				4169	*/
				4170	long do_unlinkat(int dfd, struct filename *name)
				4171	{
				4172	int error;
				4173	struct dentry *dentry;
				4174	struct path path;
				4175	struct qstr last;
				4176	int type;
				4177	struct inode *inode = NULL;
				4178	struct inode *delegated_inode = NULL;
				4179	unsigned int lookup_flags = 0;
				4180	retry:
				4181	name = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
				4182	if (IS_ERR(name))
				4183	return PTR_ERR(name);
				4184
				4185	error = -EISDIR;
				4186	if (type != LAST_NORM)
				4187	goto exit1;
				4188
				4189	error = mnt_want_write(path.mnt);
				4190	if (error)
				4191	goto exit1;
				4192	retry_deleg:
				4193	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
				4194	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
				4195	error = PTR_ERR(dentry);
				4196	if (!IS_ERR(dentry)) {
				4197	/* Why not before? Because we want correct error value */
				4198	if (last.name[last.len])
				4199	goto slashes;
				4200	inode = dentry->d_inode;
				4201	if (d_is_negative(dentry))
				4202	goto slashes;
				4203	ihold(inode);
				4204	error = security_path_unlink(&path, dentry);
				4205	if (error)
				4206	goto exit2;
				4207	error = vfs_unlink2(path.mnt, path.dentry->d_inode, dentry, &delegated_inode);
				4208	exit2:
				4209	dput(dentry);
				4210	}
				4211	inode_unlock(path.dentry->d_inode);
				4212	if (inode)
				4213	iput(inode); /* truncate the inode here */
				4214	inode = NULL;
				4215	if (delegated_inode) {
				4216	error = break_deleg_wait(&delegated_inode);
				4217	if (!error)
				4218	goto retry_deleg;
				4219	}
				4220	mnt_drop_write(path.mnt);
				4221	exit1:
				4222	path_put(&path);
				4223	if (retry_estale(error, lookup_flags)) {
				4224	lookup_flags \|= LOOKUP_REVAL;
				4225	inode = NULL;
				4226	goto retry;
				4227	}
				4228	putname(name);
				4229	return error;
				4230
				4231	slashes:
				4232	if (d_is_negative(dentry))
				4233	error = -ENOENT;
				4234	else if (d_is_dir(dentry))
				4235	error = -EISDIR;
				4236	else
				4237	error = -ENOTDIR;
				4238	goto exit2;
				4239	}
				4240
				4241	SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
				4242	{
				4243	if ((flag & ~AT_REMOVEDIR) != 0)
				4244	return -EINVAL;
				4245
				4246	if (flag & AT_REMOVEDIR)
				4247	return do_rmdir(dfd, pathname);
				4248
				4249	return do_unlinkat(dfd, getname(pathname));
				4250	}
				4251
				4252	SYSCALL_DEFINE1(unlink, const char __user *, pathname)
				4253	{
				4254	return do_unlinkat(AT_FDCWD, getname(pathname));
				4255	}
				4256
				4257	int vfs_symlink2(struct vfsmount mnt, struct inode dir, struct dentry dentry, const char oldname)
				4258	{
				4259	int error = may_create(mnt, dir, dentry);
				4260
				4261	if (error)
				4262	return error;
				4263
				4264	if (!dir->i_op->symlink)
				4265	return -EPERM;
				4266
				4267	error = security_inode_symlink(dir, dentry, oldname);
				4268	if (error)
				4269	return error;
				4270
				4271	error = dir->i_op->symlink(dir, dentry, oldname);
				4272	if (!error)
				4273	fsnotify_create(dir, dentry);
				4274	return error;
				4275	}
				4276	EXPORT_SYMBOL(vfs_symlink2);
				4277
				4278	int vfs_symlink(struct inode dir, struct dentry dentry, const char *oldname)
				4279	{
				4280	return vfs_symlink2(NULL, dir, dentry, oldname);
				4281	}
				4282	EXPORT_SYMBOL(vfs_symlink);
				4283
				4284	long do_symlinkat(const char __user *oldname, int newdfd,
				4285	const char __user *newname)
				4286	{
				4287	int error;
				4288	struct filename *from;
				4289	struct dentry *dentry;
				4290	struct path path;
				4291	unsigned int lookup_flags = 0;
				4292
				4293	from = getname(oldname);
				4294	if (IS_ERR(from))
				4295	return PTR_ERR(from);
				4296	retry:
				4297	dentry = user_path_create(newdfd, newname, &path, lookup_flags);
				4298	error = PTR_ERR(dentry);
				4299	if (IS_ERR(dentry))
				4300	goto out_putname;
				4301
				4302	error = security_path_symlink(&path, dentry, from->name);
				4303	if (!error)
				4304	error = vfs_symlink2(path.mnt, path.dentry->d_inode, dentry, from->name);
				4305	done_path_create(&path, dentry);
				4306	if (retry_estale(error, lookup_flags)) {
				4307	lookup_flags \|= LOOKUP_REVAL;
				4308	goto retry;
				4309	}
				4310	out_putname:
				4311	putname(from);
				4312	return error;
				4313	}
				4314
				4315	SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
				4316	int, newdfd, const char __user *, newname)
				4317	{
				4318	return do_symlinkat(oldname, newdfd, newname);
				4319	}
				4320
				4321	SYSCALL_DEFINE2(symlink, const char __user , oldname, const char __user , newname)
				4322	{
				4323	return do_symlinkat(oldname, AT_FDCWD, newname);
				4324	}
				4325
				4326	/**
				4327	* vfs_link - create a new link
				4328	* @old_dentry: object to be linked
				4329	* @dir: new parent
				4330	* @new_dentry: where to create the new link
				4331	* @delegated_inode: returns inode needing a delegation break
				4332	*
				4333	* The caller must hold dir->i_mutex
				4334	*
				4335	* If vfs_link discovers a delegation on the to-be-linked file in need
				4336	* of breaking, it will return -EWOULDBLOCK and return a reference to the
				4337	* inode in delegated_inode. The caller should then break the delegation
				4338	* and retry. Because breaking a delegation may take a long time, the
				4339	* caller should drop the i_mutex before doing so.
				4340	*
				4341	* Alternatively, a caller may pass NULL for delegated_inode. This may
				4342	* be appropriate for callers that expect the underlying filesystem not
				4343	* to be NFS exported.
				4344	*/
				4345	int vfs_link2(struct vfsmount mnt, struct dentry old_dentry, struct inode dir, struct dentry new_dentry, struct inode **delegated_inode)
				4346	{
				4347	struct inode *inode = old_dentry->d_inode;
				4348	unsigned max_links = dir->i_sb->s_max_links;
				4349	int error;
				4350
				4351	if (!inode)
				4352	return -ENOENT;
				4353
				4354	error = may_create(mnt, dir, new_dentry);
				4355	if (error)
				4356	return error;
				4357
				4358	if (dir->i_sb != inode->i_sb)
				4359	return -EXDEV;
				4360
				4361	/*
				4362	* A link to an append-only or immutable file cannot be created.
				4363	*/
				4364	if (IS_APPEND(inode) \|\| IS_IMMUTABLE(inode))
				4365	return -EPERM;
				4366	/*
				4367	* Updating the link count will likely cause i_uid and i_gid to
				4368	* be writen back improperly if their true value is unknown to
				4369	* the vfs.
				4370	*/
				4371	if (HAS_UNMAPPED_ID(inode))
				4372	return -EPERM;
				4373	if (!dir->i_op->link)
				4374	return -EPERM;
				4375	if (S_ISDIR(inode->i_mode))
				4376	return -EPERM;
				4377
				4378	error = security_inode_link(old_dentry, dir, new_dentry);
				4379	if (error)
				4380	return error;
				4381
				4382	inode_lock(inode);
				4383	/* Make sure we don't allow creating hardlink to an unlinked file */
				4384	if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
				4385	error = -ENOENT;
				4386	else if (max_links && inode->i_nlink >= max_links)
				4387	error = -EMLINK;
				4388	else {
				4389	error = try_break_deleg(inode, delegated_inode);
				4390	if (!error)
				4391	error = dir->i_op->link(old_dentry, dir, new_dentry);
				4392	}
				4393
				4394	if (!error && (inode->i_state & I_LINKABLE)) {
				4395	spin_lock(&inode->i_lock);
				4396	inode->i_state &= ~I_LINKABLE;
				4397	spin_unlock(&inode->i_lock);
				4398	}
				4399	inode_unlock(inode);
				4400	if (!error)
				4401	fsnotify_link(dir, inode, new_dentry);
				4402	return error;
				4403	}
				4404	EXPORT_SYMBOL(vfs_link2);
				4405
				4406	int vfs_link(struct dentry old_dentry, struct inode dir, struct dentry new_dentry, struct inode *delegated_inode)
				4407	{
				4408	return vfs_link2(NULL, old_dentry, dir, new_dentry, delegated_inode);
				4409	}
				4410	EXPORT_SYMBOL(vfs_link);
				4411
				4412	/*
				4413	* Hardlinks are often used in delicate situations. We avoid
				4414	* security-related surprises by not following symlinks on the
				4415	* newname. --KAB
				4416	*
				4417	* We don't follow them on the oldname either to be compatible
				4418	* with linux 2.0, and to avoid hard-linking to directories
				4419	* and other special files. --ADM
				4420	*/
				4421	int do_linkat(int olddfd, const char __user *oldname, int newdfd,
				4422	const char __user *newname, int flags)
				4423	{
				4424	struct dentry *new_dentry;
				4425	struct path old_path, new_path;
				4426	struct inode *delegated_inode = NULL;
				4427	int how = 0;
				4428	int error;
				4429
				4430	if ((flags & ~(AT_SYMLINK_FOLLOW \| AT_EMPTY_PATH)) != 0)
				4431	return -EINVAL;
				4432	/*
				4433	* To use null names we require CAP_DAC_READ_SEARCH
				4434	* This ensures that not everyone will be able to create
				4435	* handlink using the passed filedescriptor.
				4436	*/
				4437	if (flags & AT_EMPTY_PATH) {
				4438	if (!capable(CAP_DAC_READ_SEARCH))
				4439	return -ENOENT;
				4440	how = LOOKUP_EMPTY;
				4441	}
				4442
				4443	if (flags & AT_SYMLINK_FOLLOW)
				4444	how \|= LOOKUP_FOLLOW;
				4445	retry:
				4446	error = user_path_at(olddfd, oldname, how, &old_path);
				4447	if (error)
				4448	return error;
				4449
				4450	new_dentry = user_path_create(newdfd, newname, &new_path,
				4451	(how & LOOKUP_REVAL));
				4452	error = PTR_ERR(new_dentry);
				4453	if (IS_ERR(new_dentry))
				4454	goto out;
				4455
				4456	error = -EXDEV;
				4457	if (old_path.mnt != new_path.mnt)
				4458	goto out_dput;
				4459	error = may_linkat(&old_path);
				4460	if (unlikely(error))
				4461	goto out_dput;
				4462	error = security_path_link(old_path.dentry, &new_path, new_dentry);
				4463	if (error)
				4464	goto out_dput;
				4465	error = vfs_link2(old_path.mnt, old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
				4466	out_dput:
				4467	done_path_create(&new_path, new_dentry);
				4468	if (delegated_inode) {
				4469	error = break_deleg_wait(&delegated_inode);
				4470	if (!error) {
				4471	path_put(&old_path);
				4472	goto retry;
				4473	}
				4474	}
				4475	if (retry_estale(error, how)) {
				4476	path_put(&old_path);
				4477	how \|= LOOKUP_REVAL;
				4478	goto retry;
				4479	}
				4480	out:
				4481	path_put(&old_path);
				4482
				4483	return error;
				4484	}
				4485
				4486	SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
				4487	int, newdfd, const char __user *, newname, int, flags)
				4488	{
				4489	return do_linkat(olddfd, oldname, newdfd, newname, flags);
				4490	}
				4491
				4492	SYSCALL_DEFINE2(link, const char __user , oldname, const char __user , newname)
				4493	{
				4494	return do_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
				4495	}
				4496
				4497	/**
				4498	* vfs_rename - rename a filesystem object
				4499	* @old_dir: parent of source
				4500	* @old_dentry: source
				4501	* @new_dir: parent of destination
				4502	* @new_dentry: destination
				4503	* @delegated_inode: returns an inode needing a delegation break
				4504	* @flags: rename flags
				4505	*
				4506	* The caller must hold multiple mutexes--see lock_rename()).
				4507	*
				4508	* If vfs_rename discovers a delegation in need of breaking at either
				4509	* the source or destination, it will return -EWOULDBLOCK and return a
				4510	* reference to the inode in delegated_inode. The caller should then
				4511	* break the delegation and retry. Because breaking a delegation may
				4512	* take a long time, the caller should drop all locks before doing
				4513	* so.
				4514	*
				4515	* Alternatively, a caller may pass NULL for delegated_inode. This may
				4516	* be appropriate for callers that expect the underlying filesystem not
				4517	* to be NFS exported.
				4518	*
				4519	* The worst of all namespace operations - renaming directory. "Perverted"
				4520	* doesn't even start to describe it. Somebody in UCB had a heck of a trip...
				4521	* Problems:
				4522	*
				4523	* a) we can get into loop creation.
				4524	* b) race potential - two innocent renames can create a loop together.
				4525	* That's where 4.4 screws up. Current fix: serialization on
				4526	* sb->s_vfs_rename_mutex. We might be more accurate, but that's another
				4527	* story.
				4528	* c) we have to lock _four_ objects - parents and victim (if it exists),
				4529	* and source (if it is not a directory).
				4530	* And that - after we got ->i_mutex on parents (until then we don't know
				4531	* whether the target exists). Solution: try to be smart with locking
				4532	* order for inodes. We rely on the fact that tree topology may change
				4533	* only under ->s_vfs_rename_mutex _and_ that parent of the object we
				4534	* move will be locked. Thus we can rank directories by the tree
				4535	* (ancestors first) and rank all non-directories after them.
				4536	* That works since everybody except rename does "lock parent, lookup,
				4537	* lock child" and rename is under ->s_vfs_rename_mutex.
				4538	* HOWEVER, it relies on the assumption that any object with ->lookup()
				4539	* has no more than 1 dentry. If "hybrid" objects will ever appear,
				4540	* we'd better make sure that there's no link(2) for them.
				4541	* d) conversion from fhandle to dentry may come in the wrong moment - when
				4542	* we are removing the target. Solution: we will have to grab ->i_mutex
				4543	* in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
				4544	* ->i_mutex on parents, which works but leads to some truly excessive
				4545	* locking].
				4546	*/
				4547	int vfs_rename2(struct vfsmount *mnt,
				4548	struct inode old_dir, struct dentry old_dentry,
				4549	struct inode new_dir, struct dentry new_dentry,
				4550	struct inode **delegated_inode, unsigned int flags)
				4551	{
				4552	int error;
				4553	bool is_dir = d_is_dir(old_dentry);
				4554	struct inode *source = old_dentry->d_inode;
				4555	struct inode *target = new_dentry->d_inode;
				4556	bool new_is_dir = false;
				4557	unsigned max_links = new_dir->i_sb->s_max_links;
				4558	struct name_snapshot old_name;
				4559
				4560	if (source == target)
				4561	return 0;
				4562
				4563	error = may_delete(mnt, old_dir, old_dentry, is_dir);
				4564	if (error)
				4565	return error;
				4566
				4567	if (!target) {
				4568	error = may_create(mnt, new_dir, new_dentry);
				4569	} else {
				4570	new_is_dir = d_is_dir(new_dentry);
				4571
				4572	if (!(flags & RENAME_EXCHANGE))
				4573	error = may_delete(mnt, new_dir, new_dentry, is_dir);
				4574	else
				4575	error = may_delete(mnt, new_dir, new_dentry, new_is_dir);
				4576	}
				4577	if (error)
				4578	return error;
				4579
				4580	if (!old_dir->i_op->rename)
				4581	return -EPERM;
				4582
				4583	/*
				4584	* If we are going to change the parent - check write permissions,
				4585	* we'll need to flip '..'.
				4586	*/
				4587	if (new_dir != old_dir) {
				4588	if (is_dir) {
				4589	error = inode_permission2(mnt, source, MAY_WRITE);
				4590	if (error)
				4591	return error;
				4592	}
				4593	if ((flags & RENAME_EXCHANGE) && new_is_dir) {
				4594	error = inode_permission2(mnt, target, MAY_WRITE);
				4595	if (error)
				4596	return error;
				4597	}
				4598	}
				4599
				4600	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
				4601	flags);
				4602	if (error)
				4603	return error;
				4604
				4605	take_dentry_name_snapshot(&old_name, old_dentry);
				4606	dget(new_dentry);
				4607	if (!is_dir \|\| (flags & RENAME_EXCHANGE))
				4608	lock_two_nondirectories(source, target);
				4609	else if (target)
				4610	inode_lock(target);
				4611
				4612	error = -EBUSY;
				4613	if (is_local_mountpoint(old_dentry) \|\| is_local_mountpoint(new_dentry))
				4614	goto out;
				4615
				4616	if (max_links && new_dir != old_dir) {
				4617	error = -EMLINK;
				4618	if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
				4619	goto out;
				4620	if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
				4621	old_dir->i_nlink >= max_links)
				4622	goto out;
				4623	}
				4624	if (!is_dir) {
				4625	error = try_break_deleg(source, delegated_inode);
				4626	if (error)
				4627	goto out;
				4628	}
				4629	if (target && !new_is_dir) {
				4630	error = try_break_deleg(target, delegated_inode);
				4631	if (error)
				4632	goto out;
				4633	}
				4634	error = old_dir->i_op->rename(old_dir, old_dentry,
				4635	new_dir, new_dentry, flags);
				4636	if (error)
				4637	goto out;
				4638
				4639	if (!(flags & RENAME_EXCHANGE) && target) {
				4640	if (is_dir) {
				4641	shrink_dcache_parent(new_dentry);
				4642	target->i_flags \|= S_DEAD;
				4643	}
				4644	dont_mount(new_dentry);
				4645	detach_mounts(new_dentry);
				4646	}
				4647	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
				4648	if (!(flags & RENAME_EXCHANGE))
				4649	d_move(old_dentry, new_dentry);
				4650	else
				4651	d_exchange(old_dentry, new_dentry);
				4652	}
				4653	out:
				4654	if (!is_dir \|\| (flags & RENAME_EXCHANGE))
				4655	unlock_two_nondirectories(source, target);
				4656	else if (target)
				4657	inode_unlock(target);
				4658	dput(new_dentry);
				4659	if (!error) {
				4660	fsnotify_move(old_dir, new_dir, old_name.name, is_dir,
				4661	!(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
				4662	if (flags & RENAME_EXCHANGE) {
				4663	fsnotify_move(new_dir, old_dir, old_dentry->d_name.name,
				4664	new_is_dir, NULL, new_dentry);
				4665	}
				4666	}
				4667	release_dentry_name_snapshot(&old_name);
				4668
				4669	return error;
				4670	}
				4671	EXPORT_SYMBOL(vfs_rename2);
				4672
				4673	int vfs_rename(struct inode old_dir, struct dentry old_dentry,
				4674	struct inode new_dir, struct dentry new_dentry,
				4675	struct inode **delegated_inode, unsigned int flags)
				4676	{
				4677	return vfs_rename2(NULL, old_dir, old_dentry, new_dir, new_dentry, delegated_inode, flags);
				4678	}
				4679	EXPORT_SYMBOL(vfs_rename);
				4680
				4681	static int do_renameat2(int olddfd, const char __user *oldname, int newdfd,
				4682	const char __user *newname, unsigned int flags)
				4683	{
				4684	struct dentry old_dentry, new_dentry;
				4685	struct dentry *trap;
				4686	struct path old_path, new_path;
				4687	struct qstr old_last, new_last;
				4688	int old_type, new_type;
				4689	struct inode *delegated_inode = NULL;
				4690	struct filename *from;
				4691	struct filename *to;
				4692	unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
				4693	bool should_retry = false;
				4694	int error;
				4695
				4696	if (flags & ~(RENAME_NOREPLACE \| RENAME_EXCHANGE \| RENAME_WHITEOUT))
				4697	return -EINVAL;
				4698
				4699	if ((flags & (RENAME_NOREPLACE \| RENAME_WHITEOUT)) &&
				4700	(flags & RENAME_EXCHANGE))
				4701	return -EINVAL;
				4702
				4703	if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD))
				4704	return -EPERM;
				4705
				4706	if (flags & RENAME_EXCHANGE)
				4707	target_flags = 0;
				4708
				4709	retry:
				4710	from = filename_parentat(olddfd, getname(oldname), lookup_flags,
				4711	&old_path, &old_last, &old_type);
				4712	if (IS_ERR(from)) {
				4713	error = PTR_ERR(from);
				4714	goto exit;
				4715	}
				4716
				4717	to = filename_parentat(newdfd, getname(newname), lookup_flags,
				4718	&new_path, &new_last, &new_type);
				4719	if (IS_ERR(to)) {
				4720	error = PTR_ERR(to);
				4721	goto exit1;
				4722	}
				4723
				4724	error = -EXDEV;
				4725	if (old_path.mnt != new_path.mnt)
				4726	goto exit2;
				4727
				4728	error = -EBUSY;
				4729	if (old_type != LAST_NORM)
				4730	goto exit2;
				4731
				4732	if (flags & RENAME_NOREPLACE)
				4733	error = -EEXIST;
				4734	if (new_type != LAST_NORM)
				4735	goto exit2;
				4736
				4737	error = mnt_want_write(old_path.mnt);
				4738	if (error)
				4739	goto exit2;
				4740
				4741	retry_deleg:
				4742	trap = lock_rename(new_path.dentry, old_path.dentry);
				4743
				4744	old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
				4745	error = PTR_ERR(old_dentry);
				4746	if (IS_ERR(old_dentry))
				4747	goto exit3;
				4748	/* source must exist */
				4749	error = -ENOENT;
				4750	if (d_is_negative(old_dentry))
				4751	goto exit4;
				4752	new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags \| target_flags);
				4753	error = PTR_ERR(new_dentry);
				4754	if (IS_ERR(new_dentry))
				4755	goto exit4;
				4756	error = -EEXIST;
				4757	if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
				4758	goto exit5;
				4759	if (flags & RENAME_EXCHANGE) {
				4760	error = -ENOENT;
				4761	if (d_is_negative(new_dentry))
				4762	goto exit5;
				4763
				4764	if (!d_is_dir(new_dentry)) {
				4765	error = -ENOTDIR;
				4766	if (new_last.name[new_last.len])
				4767	goto exit5;
				4768	}
				4769	}
				4770	/* unless the source is a directory trailing slashes give -ENOTDIR */
				4771	if (!d_is_dir(old_dentry)) {
				4772	error = -ENOTDIR;
				4773	if (old_last.name[old_last.len])
				4774	goto exit5;
				4775	if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
				4776	goto exit5;
				4777	}
				4778	/* source should not be ancestor of target */
				4779	error = -EINVAL;
				4780	if (old_dentry == trap)
				4781	goto exit5;
				4782	/* target should not be an ancestor of source */
				4783	if (!(flags & RENAME_EXCHANGE))
				4784	error = -ENOTEMPTY;
				4785	if (new_dentry == trap)
				4786	goto exit5;
				4787
				4788	error = security_path_rename(&old_path, old_dentry,
				4789	&new_path, new_dentry, flags);
				4790	if (error)
				4791	goto exit5;
				4792	error = vfs_rename2(old_path.mnt, old_path.dentry->d_inode, old_dentry,
				4793	new_path.dentry->d_inode, new_dentry,
				4794	&delegated_inode, flags);
				4795	exit5:
				4796	dput(new_dentry);
				4797	exit4:
				4798	dput(old_dentry);
				4799	exit3:
				4800	unlock_rename(new_path.dentry, old_path.dentry);
				4801	if (delegated_inode) {
				4802	error = break_deleg_wait(&delegated_inode);
				4803	if (!error)
				4804	goto retry_deleg;
				4805	}
				4806	mnt_drop_write(old_path.mnt);
				4807	exit2:
				4808	if (retry_estale(error, lookup_flags))
				4809	should_retry = true;
				4810	path_put(&new_path);
				4811	putname(to);
				4812	exit1:
				4813	path_put(&old_path);
				4814	putname(from);
				4815	if (should_retry) {
				4816	should_retry = false;
				4817	lookup_flags \|= LOOKUP_REVAL;
				4818	goto retry;
				4819	}
				4820	exit:
				4821	return error;
				4822	}
				4823
				4824	SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
				4825	int, newdfd, const char __user *, newname, unsigned int, flags)
				4826	{
				4827	return do_renameat2(olddfd, oldname, newdfd, newname, flags);
				4828	}
				4829
				4830	SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
				4831	int, newdfd, const char __user *, newname)
				4832	{
				4833	return do_renameat2(olddfd, oldname, newdfd, newname, 0);
				4834	}
				4835
				4836	SYSCALL_DEFINE2(rename, const char __user , oldname, const char __user , newname)
				4837	{
				4838	return do_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
				4839	}
				4840
				4841	int vfs_whiteout(struct inode dir, struct dentry dentry)
				4842	{
				4843	int error = may_create(NULL, dir, dentry);
				4844	if (error)
				4845	return error;
				4846
				4847	if (!dir->i_op->mknod)
				4848	return -EPERM;
				4849
				4850	return dir->i_op->mknod(dir, dentry,
				4851	S_IFCHR \| WHITEOUT_MODE, WHITEOUT_DEV);
				4852	}
				4853	EXPORT_SYMBOL(vfs_whiteout);
				4854
				4855	int readlink_copy(char __user buffer, int buflen, const char link)
				4856	{
				4857	int len = PTR_ERR(link);
				4858	if (IS_ERR(link))
				4859	goto out;
				4860
				4861	len = strlen(link);
				4862	if (len > (unsigned) buflen)
				4863	len = buflen;
				4864	if (copy_to_user(buffer, link, len))
				4865	len = -EFAULT;
				4866	out:
				4867	return len;
				4868	}
				4869
				4870	/**
				4871	* vfs_readlink - copy symlink body into userspace buffer
				4872	* @dentry: dentry on which to get symbolic link
				4873	* @buffer: user memory pointer
				4874	* @buflen: size of buffer
				4875	*
				4876	* Does not touch atime. That's up to the caller if necessary
				4877	*
				4878	* Does not call security hook.
				4879	*/
				4880	int vfs_readlink(struct dentry dentry, char __user buffer, int buflen)
				4881	{
				4882	struct inode *inode = d_inode(dentry);
				4883	DEFINE_DELAYED_CALL(done);
				4884	const char *link;
				4885	int res;
				4886
				4887	if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
				4888	if (unlikely(inode->i_op->readlink))
				4889	return inode->i_op->readlink(dentry, buffer, buflen);
				4890
				4891	if (!d_is_symlink(dentry))
				4892	return -EINVAL;
				4893
				4894	spin_lock(&inode->i_lock);
				4895	inode->i_opflags \|= IOP_DEFAULT_READLINK;
				4896	spin_unlock(&inode->i_lock);
				4897	}
				4898
				4899	link = READ_ONCE(inode->i_link);
				4900	if (!link) {
				4901	link = inode->i_op->get_link(dentry, inode, &done);
				4902	if (IS_ERR(link))
				4903	return PTR_ERR(link);
				4904	}
				4905	res = readlink_copy(buffer, buflen, link);
				4906	do_delayed_call(&done);
				4907	return res;
				4908	}
				4909	EXPORT_SYMBOL(vfs_readlink);
				4910
				4911	/**
				4912	* vfs_get_link - get symlink body
				4913	* @dentry: dentry on which to get symbolic link
				4914	* @done: caller needs to free returned data with this
				4915	*
				4916	* Calls security hook and i_op->get_link() on the supplied inode.
				4917	*
				4918	* It does not touch atime. That's up to the caller if necessary.
				4919	*
				4920	* Does not work on "special" symlinks like /proc/$$/fd/N
				4921	*/
				4922	const char vfs_get_link(struct dentry dentry, struct delayed_call *done)
				4923	{
				4924	const char *res = ERR_PTR(-EINVAL);
				4925	struct inode *inode = d_inode(dentry);
				4926
				4927	if (d_is_symlink(dentry)) {
				4928	res = ERR_PTR(security_inode_readlink(dentry));
				4929	if (!res)
				4930	res = inode->i_op->get_link(dentry, inode, done);
				4931	}
				4932	return res;
				4933	}
				4934	EXPORT_SYMBOL(vfs_get_link);
				4935
				4936	/* get the link contents into pagecache */
				4937	const char page_get_link(struct dentry dentry, struct inode *inode,
				4938	struct delayed_call *callback)
				4939	{
				4940	char *kaddr;
				4941	struct page *page;
				4942	struct address_space *mapping = inode->i_mapping;
				4943
				4944	if (!dentry) {
				4945	page = find_get_page(mapping, 0);
				4946	if (!page)
				4947	return ERR_PTR(-ECHILD);
				4948	if (!PageUptodate(page)) {
				4949	put_page(page);
				4950	return ERR_PTR(-ECHILD);
				4951	}
				4952	} else {
				4953	page = read_mapping_page(mapping, 0, NULL);
				4954	if (IS_ERR(page))
				4955	return (char*)page;
				4956	}
				4957	set_delayed_call(callback, page_put_link, page);
				4958	BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
				4959	kaddr = page_address(page);
				4960	nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
				4961	return kaddr;
				4962	}
				4963
				4964	EXPORT_SYMBOL(page_get_link);
				4965
				/*
				 * Delayed-call callback installed by page_get_link(): @arg is the
				 * page whose reference is dropped here.
				 */
				void page_put_link(void *arg)
				{
					struct page *page = arg;

					put_page(page);
				}
				EXPORT_SYMBOL(page_put_link);
				4971
				4972	int page_readlink(struct dentry dentry, char __user buffer, int buflen)
				4973	{
				4974	DEFINE_DELAYED_CALL(done);
				4975	int res = readlink_copy(buffer, buflen,
				4976	page_get_link(dentry, d_inode(dentry),
				4977	&done));
				4978	do_delayed_call(&done);
				4979	return res;
				4980	}
				4981	EXPORT_SYMBOL(page_readlink);
				4982
				4983	/*
				4984	* The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
				4985	*/
				4986	int __page_symlink(struct inode inode, const char symname, int len, int nofs)
				4987	{
				4988	struct address_space *mapping = inode->i_mapping;
				4989	struct page *page;
				4990	void *fsdata;
				4991	int err;
				4992	unsigned int flags = 0;
				4993	if (nofs)
				4994	flags \|= AOP_FLAG_NOFS;
				4995
				4996	retry:
				4997	err = pagecache_write_begin(NULL, mapping, 0, len-1,
				4998	flags, &page, &fsdata);
				4999	if (err)
				5000	goto fail;
				5001
				5002	memcpy(page_address(page), symname, len-1);
				5003
				5004	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
				5005	page, fsdata);
				5006	if (err < 0)
				5007	goto fail;
				5008	if (err < len-1)
				5009	goto retry;
				5010
				5011	mark_inode_dirty(inode);
				5012	return 0;
				5013	fail:
				5014	return err;
				5015	}
				5016	EXPORT_SYMBOL(__page_symlink);
				5017
				5018	int page_symlink(struct inode inode, const char symname, int len)
				5019	{
				5020	return __page_symlink(inode, symname, len,
				5021	!mapping_gfp_constraint(inode->i_mapping, __GFP_FS));
				5022	}
				5023	EXPORT_SYMBOL(page_symlink);
				5024
				5025	const struct inode_operations page_symlink_inode_operations = {
				5026	.get_link = page_get_link,
				5027	};
				5028	EXPORT_SYMBOL(page_symlink_inode_operations);