Blame - marvell/linux/fs/namei.c - T108

blob: 48c57bf25591611dd517fafa877d022cece2a3f1 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* linux/fs/namei.c
				4	*
				5	* Copyright (C) 1991, 1992 Linus Torvalds
				6	*/
				7
				8	/*
				9	* Some corrections by tytso.
				10	*/
				11
				12	/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
				13	* lookup logic.
				14	*/
				15	/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
				16	*/
				17
				18	#include <linux/init.h>
				19	#include <linux/export.h>
				20	#include <linux/kernel.h>
				21	#include <linux/slab.h>
				22	#include <linux/fs.h>
				23	#include <linux/namei.h>
				24	#include <linux/pagemap.h>
				25	#include <linux/fsnotify.h>
				26	#include <linux/personality.h>
				27	#include <linux/security.h>
				28	#include <linux/ima.h>
				29	#include <linux/syscalls.h>
				30	#include <linux/mount.h>
				31	#include <linux/audit.h>
				32	#include <linux/capability.h>
				33	#include <linux/file.h>
				34	#include <linux/fcntl.h>
				35	#include <linux/device_cgroup.h>
				36	#include <linux/fs_struct.h>
				37	#include <linux/posix_acl.h>
				38	#include <linux/hash.h>
				39	#include <linux/bitops.h>
				40	#include <linux/init_task.h>
				41	#include <linux/uaccess.h>
				42
				43	#include "internal.h"
				44	#include "mount.h"
				45
				46	#define CREATE_TRACE_POINTS
				47	#include <trace/events/namei.h>
				48
				49	/* [Feb-1997 T. Schoebel-Theuer]
				50	* Fundamental changes in the pathname lookup mechanisms (namei)
				51	* were necessary because of omirr. The reason is that omirr needs
				52	* to know the _real_ pathname, not the user-supplied one, in case
				53	* of symlinks (and also when transname replacements occur).
				54	*
				55	* The new code replaces the old recursive symlink resolution with
				56	* an iterative one (in case of non-nested symlink chains). It does
				57	* this with calls to <fs>_follow_link().
				58	* As a side effect, dir_namei(), _namei() and follow_link() are now
				59	* replaced with a single function lookup_dentry() that can handle all
				60	* the special cases of the former code.
				61	*
				62	* With the new dcache, the pathname is stored at each inode, at least as
				63	* long as the refcount of the inode is positive. As a side effect, the
				64	* size of the dcache depends on the inode cache and thus is dynamic.
				65	*
				66	* [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
				67	* resolution to correspond with current state of the code.
				68	*
				69	* Note that the symlink resolution is not completely iterative.
				70	* There is still a significant amount of tail- and mid- recursion in
				71	* the algorithm. Also, note that <fs>_readlink() is not used in
				72	* lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
				73	* may return different results than <fs>_follow_link(). Many virtual
				74	* filesystems (including /proc) exhibit this behavior.
				75	*/
				76
				77	/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
				78	* New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
				79	* and the name already exists in form of a symlink, try to create the new
				80	* name indicated by the symlink. The old code always complained that the
				81	* name already exists, due to not following the symlink even if its target
				82	* is nonexistent. The new semantics affects also mknod() and link() when
				83	* the name is a symlink pointing to a non-existent name.
				84	*
				85	* I don't know which semantics is the right one, since I have no access
				86	* to standards. But I found by trial that HP-UX 9.0 has the full "new"
				87	* semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
				88	* "old" one. Personally, I think the new semantics is much more logical.
				89	* Note that "ln old new" where "new" is a symlink pointing to a non-existing
				90	* file does succeed in both HP-UX and SunOs, but not in Solaris
				91	* and in the old Linux semantics.
				92	*/
				93
				94	/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
				95	* semantics. See the comments in "open_namei" and "do_link" below.
				96	*
				97	* [10-Sep-98 Alan Modra] Another symlink change.
				98	*/
				99
				100	/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
				101	* inside the path - always follow.
				102	* in the last component in creation/removal/renaming - never follow.
				103	* if LOOKUP_FOLLOW passed - follow.
				104	* if the pathname has trailing slashes - follow.
				105	* otherwise - don't follow.
				106	* (applied in that order).
				107	*
				108	* [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
				109	* restored for 2.4. This is the last surviving part of old 4.2BSD bug.
				110	* During the 2.4 we need to fix the userland stuff depending on it -
				111	* hopefully we will be able to get rid of that wart in 2.5. So far only
				112	* XEmacs seems to be relying on it...
				113	*/
				114	/*
				115	* [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
				116	* implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives
				117	* any extra contention...
				118	*/
				119
				120	/* In order to reduce some races, while at the same time doing additional
				121	* checking and hopefully speeding things up, we copy filenames to the
				122	* kernel data space before using them..
				123	*
				124	* POSIX.1 2.4: an empty pathname is invalid (ENOENT).
				125	* PATH_MAX includes the nul terminator --RR.
				126	*/
				127
					/*
					 * Space left for a pathname that is stored inline in struct filename:
					 * PATH_MAX minus the offset of the iname member.
					 */
				128	#define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname))
				129
				130	struct filename *
				131	getname_flags(const char __user filename, int flags, int empty)
				132	{
				133	struct filename *result;
				134	char *kname;
				135	int len;
				136
				137	result = audit_reusename(filename);
				138	if (result)
				139	return result;
				140
				141	result = __getname();
				142	if (unlikely(!result))
				143	return ERR_PTR(-ENOMEM);
				144
				145	/*
				146	* First, try to embed the struct filename inside the names_cache
				147	* allocation
				148	*/
				149	kname = (char *)result->iname;
				150	result->name = kname;
				151
				152	len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
				153	if (unlikely(len < 0)) {
				154	__putname(result);
				155	return ERR_PTR(len);
				156	}
				157
				158	/*
				159	* Uh-oh. We have a name that's approaching PATH_MAX. Allocate a
				160	* separate struct filename so we can dedicate the entire
				161	* names_cache allocation for the pathname, and re-do the copy from
				162	* userland.
				163	*/
				164	if (unlikely(len == EMBEDDED_NAME_MAX)) {
				165	const size_t size = offsetof(struct filename, iname[1]);
				166	kname = (char *)result;
				167
				168	/*
				169	* size is chosen that way we to guarantee that
				170	* result->iname[0] is within the same object and that
				171	* kname can't be equal to result->iname, no matter what.
				172	*/
				173	result = kzalloc(size, GFP_KERNEL);
				174	if (unlikely(!result)) {
				175	__putname(kname);
				176	return ERR_PTR(-ENOMEM);
				177	}
				178	result->name = kname;
				179	len = strncpy_from_user(kname, filename, PATH_MAX);
				180	if (unlikely(len < 0)) {
				181	__putname(kname);
				182	kfree(result);
				183	return ERR_PTR(len);
				184	}
				185	if (unlikely(len == PATH_MAX)) {
				186	__putname(kname);
				187	kfree(result);
				188	return ERR_PTR(-ENAMETOOLONG);
				189	}
				190	}
				191
				192	result->refcnt = 1;
				193	/* The empty path is special. */
				194	if (unlikely(!len)) {
				195	if (empty)
				196	*empty = 1;
				197	if (!(flags & LOOKUP_EMPTY)) {
				198	putname(result);
				199	return ERR_PTR(-ENOENT);
				200	}
				201	}
				202
				203	result->uptr = filename;
				204	result->aname = NULL;
				205	audit_getname(result);
				206	return result;
				207	}
				208
					/*
					 * getname - copy a pathname from user space with default handling.
					 *
					 * Thin wrapper: calls getname_flags() with flags 0 and a NULL @empty
					 * pointer and returns whatever getname_flags() returns.
					 */
				209	struct filename *
				210	getname(const char __user * filename)
				211	{
				212	return getname_flags(filename, 0, NULL);
				213	}
				214
				215	struct filename *
				216	getname_kernel(const char * filename)
				217	{
				218	struct filename *result;
				219	int len = strlen(filename) + 1;
				220
				221	result = __getname();
				222	if (unlikely(!result))
				223	return ERR_PTR(-ENOMEM);
				224
				225	if (len <= EMBEDDED_NAME_MAX) {
				226	result->name = (char *)result->iname;
				227	} else if (len <= PATH_MAX) {
				228	const size_t size = offsetof(struct filename, iname[1]);
				229	struct filename *tmp;
				230
				231	tmp = kmalloc(size, GFP_KERNEL);
				232	if (unlikely(!tmp)) {
				233	__putname(result);
				234	return ERR_PTR(-ENOMEM);
				235	}
				236	tmp->name = (char *)result;
				237	result = tmp;
				238	} else {
				239	__putname(result);
				240	return ERR_PTR(-ENAMETOOLONG);
				241	}
				242	memcpy((char *)result->name, filename, len);
				243	result->uptr = NULL;
				244	result->aname = NULL;
				245	result->refcnt = 1;
				246	audit_getname(result);
				247
				248	return result;
				249	}
				250
				251	void putname(struct filename *name)
				252	{
				253	BUG_ON(name->refcnt <= 0);
				254
				255	if (--name->refcnt > 0)
				256	return;
				257
				258	if (name->name != name->iname) {
				259	__putname(name->name);
				260	kfree(name);
				261	} else
				262	__putname(name);
				263	}
				264
				265	static int check_acl(struct inode *inode, int mask)
				266	{
				267	#ifdef CONFIG_FS_POSIX_ACL
				268	struct posix_acl *acl;
				269
				270	if (mask & MAY_NOT_BLOCK) {
				271	acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
				272	if (!acl)
				273	return -EAGAIN;
				274	/* no ->get_acl() calls in RCU mode... */
				275	if (is_uncached_acl(acl))
				276	return -ECHILD;
				277	return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
				278	}
				279
				280	acl = get_acl(inode, ACL_TYPE_ACCESS);
				281	if (IS_ERR(acl))
				282	return PTR_ERR(acl);
				283	if (acl) {
				284	int error = posix_acl_permission(inode, acl, mask);
				285	posix_acl_release(acl);
				286	return error;
				287	}
				288	#endif
				289
				290	return -EAGAIN;
				291	}
				292
				293	/*
				294	* This does the basic permission checking
				295	*/
				296	static int acl_permission_check(struct inode *inode, int mask)
				297	{
				298	unsigned int mode = inode->i_mode;
				299
				300	if (likely(uid_eq(current_fsuid(), inode->i_uid)))
				301	mode >>= 6;
				302	else {
				303	if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
				304	int error = check_acl(inode, mask);
				305	if (error != -EAGAIN)
				306	return error;
				307	}
				308
				309	if (in_group_p(inode->i_gid))
				310	mode >>= 3;
				311	}
				312
				313	/*
				314	* If the DACs are ok we don't need any capability check.
				315	*/
				316	if ((mask & ~mode & (MAY_READ \| MAY_WRITE \| MAY_EXEC)) == 0)
				317	return 0;
				318	return -EACCES;
				319	}
				320
				321	/**
				322	* generic_permission - check for access rights on a Posix-like filesystem
				323	* @inode: inode to check access rights for
				324	* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
				325	*
				326	* Used to check for read/write/execute permissions on a file.
				327	* We use "fsuid" for this, letting us set arbitrary permissions
				328	* for filesystem access without changing the "normal" uids which
				329	* are used for other things.
				330	*
				331	* generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
				332	* request cannot be satisfied (eg. requires blocking or too much complexity).
				333	* It would then be called again in ref-walk mode.
				334	*/
				335	int generic_permission(struct inode *inode, int mask)
				336	{
				337	int ret;
				338
				339	/*
				340	* Do the basic permission checks.
				341	*/
				342	ret = acl_permission_check(inode, mask);
				343	if (ret != -EACCES)
				344	return ret;
				345
				346	if (S_ISDIR(inode->i_mode)) {
				347	/* DACs are overridable for directories */
				348	if (!(mask & MAY_WRITE))
				349	if (capable_wrt_inode_uidgid(inode,
				350	CAP_DAC_READ_SEARCH))
				351	return 0;
				352	if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
				353	return 0;
				354	return -EACCES;
				355	}
				356
				357	/*
				358	* Searching includes executable on directories, else just read.
				359	*/
				360	mask &= MAY_READ \| MAY_WRITE \| MAY_EXEC;
				361	if (mask == MAY_READ)
				362	if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH))
				363	return 0;
				364	/*
				365	* Read/write DACs are always overridable.
				366	* Executable DACs are overridable when there is
				367	* at least one exec bit set.
				368	*/
				369	if (!(mask & MAY_EXEC) \|\| (inode->i_mode & S_IXUGO))
				370	if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE))
				371	return 0;
				372
				373	return -EACCES;
				374	}
				375	EXPORT_SYMBOL(generic_permission);
				376
				377	/*
				378	* We _really_ want to just do "generic_permission()" without
				379	* even looking at the inode->i_op values. So we keep a cache
				380	* flag in inode->i_opflags, that says "this has not special
				381	* permission function, use the fast case".
				382	*/
				383	static inline int do_inode_permission(struct inode *inode, int mask)
				384	{
				385	if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
				386	if (likely(inode->i_op->permission))
				387	return inode->i_op->permission(inode, mask);
				388
				389	/* This gets set once for the inode lifetime */
				390	spin_lock(&inode->i_lock);
				391	inode->i_opflags \|= IOP_FASTPERM;
				392	spin_unlock(&inode->i_lock);
				393	}
				394	return generic_permission(inode, mask);
				395	}
				396
				397	/**
				398	* sb_permission - Check superblock-level permissions
				399	* @sb: Superblock of inode to check permission on
				400	* @inode: Inode to check permission on
				401	* @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
				402	*
				403	* Separate out file-system wide checks from inode-specific permission checks.
				404	*/
				405	static int sb_permission(struct super_block sb, struct inode inode, int mask)
				406	{
				407	if (unlikely(mask & MAY_WRITE)) {
				408	umode_t mode = inode->i_mode;
				409
				410	/* Nobody gets write access to a read-only fs. */
				411	if (sb_rdonly(sb) && (S_ISREG(mode) \|\| S_ISDIR(mode) \|\| S_ISLNK(mode)))
				412	return -EROFS;
				413	}
				414	return 0;
				415	}
				416
				417	/**
				418	* inode_permission - Check for access rights to a given inode
				419	* @inode: Inode to check permission on
				420	* @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
				421	*
				422	* Check for read/write/execute permissions on an inode. We use fs[ug]id for
				423	* this, letting us set arbitrary permissions for filesystem access without
				424	* changing the "normal" UIDs which are used for other things.
				425	*
				426	* When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
				427	*/
				428	int inode_permission(struct inode *inode, int mask)
				429	{
				430	int retval;
				431
				432	retval = sb_permission(inode->i_sb, inode, mask);
				433	if (retval)
				434	return retval;
				435
				436	if (unlikely(mask & MAY_WRITE)) {
				437	/*
				438	* Nobody gets write access to an immutable file.
				439	*/
				440	if (IS_IMMUTABLE(inode))
				441	return -EPERM;
				442
				443	/*
				444	* Updating mtime will likely cause i_uid and i_gid to be
				445	* written back improperly if their true value is unknown
				446	* to the vfs.
				447	*/
				448	if (HAS_UNMAPPED_ID(inode))
				449	return -EACCES;
				450	}
				451
				452	retval = do_inode_permission(inode, mask);
				453	if (retval)
				454	return retval;
				455
				456	retval = devcgroup_inode_permission(inode, mask);
				457	if (retval)
				458	return retval;
				459
				460	return security_inode_permission(inode, mask);
				461	}
				462	EXPORT_SYMBOL(inode_permission);
				463
				464	/**
				465	* path_get - get a reference to a path
				466	* @path: path to get the reference to
				467	*
				468	* Given a path, take a reference on both of its components: the vfsmount
					* (via mntget()) and the dentry (via dget()).
				469	*/
				470	void path_get(const struct path *path)
				471	{
				472	mntget(path->mnt);	/* mount first ... */
				473	dget(path->dentry);	/* ... then the dentry */
				474	}
				475	EXPORT_SYMBOL(path_get);
				476
				477	/**
				478	* path_put - put a reference to a path
				479	* @path: path to put the reference to
				480	*
				481	* Given a path, drop the references on its dentry (via dput()) and on its
					* vfsmount (via mntput()), in the reverse order of path_get().
				482	*/
				483	void path_put(const struct path *path)
				484	{
				485	dput(path->dentry);	/* dentry first ... */
				486	mntput(path->mnt);	/* ... then the mount */
				487	}
				488	EXPORT_SYMBOL(path_put);
				489
				490	#define EMBEDDED_LEVELS 2
				491	struct nameidata {
				492	struct path path;
				493	struct qstr last;
				494	struct path root;
				495	struct inode inode; / path.dentry.d_inode */
				496	unsigned int flags;
				497	unsigned seq, m_seq;
				498	int last_type;
				499	unsigned depth;
				500	int total_link_count;
				501	struct saved {
				502	struct path link;
				503	struct delayed_call done;
				504	const char *name;
				505	unsigned seq;
				506	} *stack, internal[EMBEDDED_LEVELS];
				507	struct filename *name;
				508	struct nameidata *saved;
				509	struct inode *link_inode;
				510	unsigned root_seq;
				511	int dfd;
				512	} __randomize_layout;
				513
				514	static void set_nameidata(struct nameidata p, int dfd, struct filename name)
				515	{
				516	struct nameidata *old = current->nameidata;
				517	p->stack = p->internal;
				518	p->dfd = dfd;
				519	p->name = name;
				520	p->total_link_count = old ? old->total_link_count : 0;
				521	p->saved = old;
				522	current->nameidata = p;
				523	}
				524
				525	static void restore_nameidata(void)
				526	{
				527	struct nameidata now = current->nameidata, old = now->saved;
				528
				529	current->nameidata = old;
				530	if (old)
				531	old->total_link_count = now->total_link_count;
				532	if (now->stack != now->internal)
				533	kfree(now->stack);
				534	}
				535
				536	static int __nd_alloc_stack(struct nameidata *nd)
				537	{
				538	struct saved *p;
				539
				540	if (nd->flags & LOOKUP_RCU) {
				541	p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
				542	GFP_ATOMIC);
				543	if (unlikely(!p))
				544	return -ECHILD;
				545	} else {
				546	p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved),
				547	GFP_KERNEL);
				548	if (unlikely(!p))
				549	return -ENOMEM;
				550	}
				551	memcpy(p, nd->internal, sizeof(nd->internal));
				552	nd->stack = p;
				553	return 0;
				554	}
				555
				556	/**
				557	* path_connected - Verify that a path->dentry is below path->mnt.mnt_root
				558	* @path: nameidate to verify
				559	*
				560	* Rename can sometimes move a file or directory outside of a bind
				561	* mount, path_connected allows those cases to be detected.
				562	*/
				563	static bool path_connected(const struct path *path)
				564	{
				565	struct vfsmount *mnt = path->mnt;
				566	struct super_block *sb = mnt->mnt_sb;
				567
				568	/* Bind mounts and multi-root filesystems can have disconnected paths */
				569	if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root))
				570	return true;
				571
				572	return is_subdir(path->dentry, mnt->mnt_root);
				573	}
				574
				575	static inline int nd_alloc_stack(struct nameidata *nd)
				576	{
				577	if (likely(nd->depth != EMBEDDED_LEVELS))
				578	return 0;
				579	if (likely(nd->stack != nd->internal))
				580	return 0;
				581	return __nd_alloc_stack(nd);
				582	}
				583
				584	static void drop_links(struct nameidata *nd)
				585	{
				586	int i = nd->depth;
				587	while (i--) {
				588	struct saved *last = nd->stack + i;
				589	do_delayed_call(&last->done);
				590	clear_delayed_call(&last->done);
				591	}
				592	}
				593
				594	static void terminate_walk(struct nameidata *nd)
				595	{
				596	drop_links(nd);
				597	if (!(nd->flags & LOOKUP_RCU)) {
				598	int i;
				599	path_put(&nd->path);
				600	for (i = 0; i < nd->depth; i++)
				601	path_put(&nd->stack[i].link);
				602	if (nd->flags & LOOKUP_ROOT_GRABBED) {
				603	path_put(&nd->root);
				604	nd->flags &= ~LOOKUP_ROOT_GRABBED;
				605	}
				606	} else {
				607	nd->flags &= ~LOOKUP_RCU;
				608	rcu_read_unlock();
				609	}
				610	nd->depth = 0;
				611	}
				612
				613	/* path_put is needed afterwards regardless of success or failure */
				614	static bool legitimize_path(struct nameidata *nd,
				615	struct path *path, unsigned seq)
				616	{
				617	int res = __legitimize_mnt(path->mnt, nd->m_seq);
				618	if (unlikely(res)) {
				619	if (res > 0)
				620	path->mnt = NULL;
				621	path->dentry = NULL;
				622	return false;
				623	}
				624	if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
				625	path->dentry = NULL;
				626	return false;
				627	}
				628	return !read_seqcount_retry(&path->dentry->d_seq, seq);
				629	}
				630
				631	static bool legitimize_links(struct nameidata *nd)
				632	{
				633	int i;
				634	for (i = 0; i < nd->depth; i++) {
				635	struct saved *last = nd->stack + i;
				636	if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
				637	drop_links(nd);
				638	nd->depth = i + 1;
				639	return false;
				640	}
				641	}
				642	return true;
				643	}
				644
				645	static bool legitimize_root(struct nameidata *nd)
				646	{
				647	if (!nd->root.mnt \|\| (nd->flags & LOOKUP_ROOT))
				648	return true;
				649	nd->flags \|= LOOKUP_ROOT_GRABBED;
				650	return legitimize_path(nd, &nd->root, nd->root_seq);
				651	}
				652
				653	/*
				654	* Path walking has 2 modes, rcu-walk and ref-walk (see
				655	* Documentation/filesystems/path-lookup.txt). In situations when we can't
				656	* continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
				657	* normal reference counts on dentries and vfsmounts to transition to ref-walk
				658	* mode. Refcounts are grabbed at the last known good point before rcu-walk
				659	* got stuck, so ref-walk may continue from there. If this is not successful
				660	* (eg. a seqcount has changed), then failure is returned and it's up to caller
				661	* to restart the path walk from the beginning in ref-walk mode.
				662	*/
				663
				664	/**
				665	* unlazy_walk - try to switch to ref-walk mode.
				666	* @nd: nameidata pathwalk data
				667	* Returns: 0 on success, -ECHILD on failure
				668	*
				669	* unlazy_walk attempts to legitimize the current nd->path and nd->root
				670	* for ref-walk mode.
				671	* Must be called from rcu-walk context.
				672	* Nothing should touch nameidata between unlazy_walk() failure and
				673	* terminate_walk().
				674	*/
				675	static int unlazy_walk(struct nameidata *nd)
				676	{
					/* Snapshot of the current dentry, used only by the sanity check below. */
				677	struct dentry *parent = nd->path.dentry;
				678
				679	BUG_ON(!(nd->flags & LOOKUP_RCU));
				680
					/* Clear the rcu-walk flag first, before any legitimizing is attempted. */
				681	nd->flags &= ~LOOKUP_RCU;
					/* Order matters: stacked links, then the current path, then the root. */
				682	if (unlikely(!legitimize_links(nd)))
				683	goto out1;
				684	if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
				685	goto out;
				686	if (unlikely(!legitimize_root(nd)))
				687	goto out;
				688	rcu_read_unlock();
				689	BUG_ON(nd->inode != parent->d_inode);
				690	return 0;
				691
					/*
					 * Links failed: nd->path was never passed to legitimize_path() on this
					 * route, so both of its pointers are cleared here.
					 */
				692	out1:
				693	nd->path.mnt = NULL;
				694	nd->path.dentry = NULL;
					/* Common failure exit: leave the RCU read section and report -ECHILD. */
				695	out:
				696	rcu_read_unlock();
				697	return -ECHILD;
				698	}
				699
				700	/**
				701	* unlazy_child - try to switch to ref-walk mode.
				702	* @nd: nameidata pathwalk data
				703	* @dentry: child of nd->path.dentry
				704	* @seq: seq number to check dentry against
				705	* Returns: 0 on success, -ECHILD on failure
				706	*
				707	* unlazy_child attempts to legitimize the current nd->path, nd->root and dentry
				708	* for ref-walk mode. @dentry must be a path found by a do_lookup call on
				709	* @nd. Must be called from rcu-walk context.
				710	* Nothing should touch nameidata between unlazy_child() failure and
				711	* terminate_walk().
				712	*/
				713	static int unlazy_child(struct nameidata nd, struct dentry dentry, unsigned seq)
				714	{
				715	BUG_ON(!(nd->flags & LOOKUP_RCU));
				716
				717	nd->flags &= ~LOOKUP_RCU;
				718	if (unlikely(!legitimize_links(nd)))
				719	goto out2;
				720	if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq)))
				721	goto out2;
				722	if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
				723	goto out1;
				724
				725	/*
				726	* We need to move both the parent and the dentry from the RCU domain
				727	* to be properly refcounted. And the sequence number in the dentry
				728	* validates both dentry counters, since we checked the sequence
				729	* number of the parent after we got the child sequence number. So we
				730	* know the parent must still be valid if the child sequence number is
				731	*/
				732	if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
				733	goto out;
				734	if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
				735	goto out_dput;
				736	/*
				737	* Sequence counts matched. Now make sure that the root is
				738	* still valid and get it if required.
				739	*/
				740	if (unlikely(!legitimize_root(nd)))
				741	goto out_dput;
				742	rcu_read_unlock();
				743	return 0;
				744
				745	out2:
				746	nd->path.mnt = NULL;
				747	out1:
				748	nd->path.dentry = NULL;
				749	out:
				750	rcu_read_unlock();
				751	return -ECHILD;
				752	out_dput:
				753	rcu_read_unlock();
				754	dput(dentry);
				755	return -ECHILD;
				756	}
				757
				758	static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
				759	{
				760	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
				761	return dentry->d_op->d_revalidate(dentry, flags);
				762	else
				763	return 1;
				764	}
				765
				766	#define INIT_PATH_SIZE 64
				767
				768	static void success_walk_trace(struct nameidata *nd)
				769	{
				770	struct path *pt = &nd->path;
				771	struct inode *i = nd->inode;
				772	char buf[INIT_PATH_SIZE], *try_buf;
				773	int cur_path_size;
				774	char *p;
				775
				776	/* When eBPF/ tracepoint is disabled, keep overhead low. */
				777	if (!trace_inodepath_enabled())
				778	return;
				779
				780	/* First try stack allocated buffer. */
				781	try_buf = buf;
				782	cur_path_size = INIT_PATH_SIZE;
				783
				784	while (cur_path_size <= PATH_MAX) {
				785	/* Free previous heap allocation if we are now trying
				786	* a second or later heap allocation.
				787	*/
				788	if (try_buf != buf)
				789	kfree(try_buf);
				790
				791	/* All but the first alloc are on the heap. */
				792	if (cur_path_size != INIT_PATH_SIZE) {
				793	try_buf = kmalloc(cur_path_size, GFP_KERNEL);
				794	if (!try_buf) {
				795	try_buf = buf;
				796	sprintf(try_buf, "error:buf_alloc_failed");
				797	break;
				798	}
				799	}
				800
				801	p = d_path(pt, try_buf, cur_path_size);
				802
				803	if (!IS_ERR(p)) {
				804	char *end = mangle_path(try_buf, p, "\n");
				805
				806	if (end) {
				807	try_buf[end - try_buf] = 0;
				808	break;
				809	} else {
				810	/* On mangle errors, double path size
				811	* till PATH_MAX.
				812	*/
				813	cur_path_size = cur_path_size << 1;
				814	continue;
				815	}
				816	}
				817
				818	if (PTR_ERR(p) == -ENAMETOOLONG) {
				819	/* If d_path complains that name is too long,
				820	* then double path size till PATH_MAX.
				821	*/
				822	cur_path_size = cur_path_size << 1;
				823	continue;
				824	}
				825
				826	sprintf(try_buf, "error:d_path_failed_%lu",
				827	-1 * PTR_ERR(p));
				828	break;
				829	}
				830
				831	if (cur_path_size > PATH_MAX)
				832	sprintf(try_buf, "error:d_path_name_too_long");
				833
				834	trace_inodepath(i, try_buf);
				835
				836	if (try_buf != buf)
				837	kfree(try_buf);
				838	return;
				839	}
				840
				841	/**
				842	* complete_walk - successful completion of path walk
				843	* @nd: pointer nameidata
				844	*
				845	* If we had been in RCU mode, drop out of it and legitimize nd->path.
				846	* Revalidate the final result, unless we'd already done that during
				847	* the path walk or the filesystem doesn't ask for it. Return 0 on
				848	* success, -error on failure. In case of failure caller does not
				849	* need to drop nd->path.
				850	*/
				851	static int complete_walk(struct nameidata *nd)
				852	{
				853	struct dentry *dentry = nd->path.dentry;
				854	int status;
				855
				856	if (nd->flags & LOOKUP_RCU) {
				857	if (!(nd->flags & LOOKUP_ROOT))
				858	nd->root.mnt = NULL;
				859	if (unlikely(unlazy_walk(nd)))
				860	return -ECHILD;
				861	}
				862
				863	if (likely(!(nd->flags & LOOKUP_JUMPED))) {
				864	success_walk_trace(nd);
				865	return 0;
				866	}
				867
				868	if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE))) {
				869	success_walk_trace(nd);
				870	return 0;
				871	}
				872
				873	status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
				874	if (status > 0) {
				875	success_walk_trace(nd);
				876	return 0;
				877	}
				878
				879	if (!status)
				880	status = -ESTALE;
				881
				882	return status;
				883	}
				884
				885	static void set_root(struct nameidata *nd)
				886	{
				887	struct fs_struct *fs = current->fs;
				888
				889	if (nd->flags & LOOKUP_RCU) {
				890	unsigned seq;
				891
				892	do {
				893	seq = read_seqcount_begin(&fs->seq);
				894	nd->root = fs->root;
				895	nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
				896	} while (read_seqcount_retry(&fs->seq, seq));
				897	} else {
				898	get_fs_root(fs, &nd->root);
				899	nd->flags \|= LOOKUP_ROOT_GRABBED;
				900	}
				901	}
				902
				903	static void path_put_conditional(struct path path, struct nameidata nd)
				904	{
				905	dput(path->dentry);
				906	if (path->mnt != nd->path.mnt)
				907	mntput(path->mnt);
				908	}
				909
				910	static inline void path_to_nameidata(const struct path *path,
				911	struct nameidata *nd)
				912	{
				913	if (!(nd->flags & LOOKUP_RCU)) {
				914	dput(nd->path.dentry);
				915	if (nd->path.mnt != path->mnt)
				916	mntput(nd->path.mnt);
				917	}
				918	nd->path.mnt = path->mnt;
				919	nd->path.dentry = path->dentry;
				920	}
				921
				922	static int nd_jump_root(struct nameidata *nd)
				923	{
				924	if (nd->flags & LOOKUP_RCU) {
				925	struct dentry *d;
				926	nd->path = nd->root;
				927	d = nd->path.dentry;
				928	nd->inode = d->d_inode;
				929	nd->seq = nd->root_seq;
				930	if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
				931	return -ECHILD;
				932	} else {
				933	path_put(&nd->path);
				934	nd->path = nd->root;
				935	path_get(&nd->path);
				936	nd->inode = nd->path.dentry->d_inode;
				937	}
				938	nd->flags \|= LOOKUP_JUMPED;
				939	return 0;
				940	}
				941
				942	/*
				943	* Helper to directly jump to a known parsed path from ->get_link,
				944	* caller must have taken a reference to path beforehand.
				945	*/
				946	void nd_jump_link(struct path *path)
				947	{
				948	struct nameidata *nd = current->nameidata;
				949	path_put(&nd->path);
				950
				951	nd->path = *path;
				952	nd->inode = nd->path.dentry->d_inode;
				953	nd->flags \|= LOOKUP_JUMPED;
				954	}
				955
				956	static inline void put_link(struct nameidata *nd)
				957	{
				958	struct saved *last = nd->stack + --nd->depth;
				959	do_delayed_call(&last->done);
				960	if (!(nd->flags & LOOKUP_RCU))
				961	path_put(&last->link);
				962	}
				963
					/*
					 * Link/file protection switches.  sysctl_protected_symlinks is tested by
					 * may_follow_link(): while it is zero that check allows every symlink.
					 * The first two are explicitly initialised to 0; the other two carry no
					 * initialiser.
					 */
				964	int sysctl_protected_symlinks __read_mostly = 0;
				965	int sysctl_protected_hardlinks __read_mostly = 0;
				966	int sysctl_protected_fifos __read_mostly;
				967	int sysctl_protected_regular __read_mostly;
				968
				969	/**
				970	* may_follow_link - Check symlink following for unsafe situations
				971	* @nd: nameidata pathwalk data
				972	*
				973	* In the case of the sysctl_protected_symlinks sysctl being enabled,
				974	* CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
				975	* in a sticky world-writable directory. This is to protect privileged
				976	* processes from failing races against path names that may change out
				977	* from under them by way of other users creating malicious symlinks.
				978	* It will permit symlinks to be followed only when outside a sticky
				979	* world-writable directory, or when the uid of the symlink and follower
				980	* match, or when the directory owner matches the symlink's owner.
				981	*
				982	* Returns 0 if following the symlink is allowed, -ve on error.
				983	*/
				984	static inline int may_follow_link(struct nameidata *nd)
				985	{
				986	const struct inode *inode;
				987	const struct inode *parent;
				988	kuid_t puid;
				989
				990	if (!sysctl_protected_symlinks)
				991	return 0;
				992
				993	/* Allowed if owner and follower match. */
				994	inode = nd->link_inode;
				995	if (uid_eq(current_cred()->fsuid, inode->i_uid))
				996	return 0;
				997
				998	/* Allowed if parent directory not sticky and world-writable. */
				999	parent = nd->inode;
				1000	if ((parent->i_mode & (S_ISVTX\|S_IWOTH)) != (S_ISVTX\|S_IWOTH))
				1001	return 0;
				1002
				1003	/* Allowed if parent directory and link owner match. */
				1004	puid = parent->i_uid;
				1005	if (uid_valid(puid) && uid_eq(puid, inode->i_uid))
				1006	return 0;
				1007
				1008	if (nd->flags & LOOKUP_RCU)
				1009	return -ECHILD;
				1010
				1011	audit_inode(nd->name, nd->stack[0].link.dentry, 0);
				1012	audit_log_link_denied("follow_link");
				1013	return -EACCES;
				1014	}
				1015
				1016	/**
				1017	* safe_hardlink_source - Check for safe hardlink conditions
				1018	* @inode: the source inode to hardlink from
				1019	*
				1020	* Return false if at least one of the following conditions:
				1021	* - inode is not a regular file
				1022	* - inode is setuid
				1023	* - inode is setgid and group-exec
				1024	* - access failure for read and write
				1025	*
				1026	* Otherwise returns true.
				1027	*/
				1028	static bool safe_hardlink_source(struct inode *inode)
				1029	{
				1030	umode_t mode = inode->i_mode;
				1031
				1032	/* Special files should not get pinned to the filesystem. */
				1033	if (!S_ISREG(mode))
				1034	return false;
				1035
				1036	/* Setuid files should not get pinned to the filesystem. */
				1037	if (mode & S_ISUID)
				1038	return false;
				1039
				1040	/* Executable setgid files should not get pinned to the filesystem. */
				1041	if ((mode & (S_ISGID \| S_IXGRP)) == (S_ISGID \| S_IXGRP))
				1042	return false;
				1043
				1044	/* Hardlinking to unreadable or unwritable sources is dangerous. */
				1045	if (inode_permission(inode, MAY_READ \| MAY_WRITE))
				1046	return false;
				1047
				1048	return true;
				1049	}
				1050
				1051	/**
				1052	* may_linkat - Check permissions for creating a hardlink
				1053	* @link: the source to hardlink from
				1054	*
				1055	* Block hardlink when all of:
				1056	* - sysctl_protected_hardlinks enabled
				1057	* - fsuid does not match inode
				1058	* - hardlink source is unsafe (see safe_hardlink_source() above)
				1059	* - not CAP_FOWNER in a namespace with the inode owner uid mapped
				1060	*
				1061	* Returns 0 if successful, -ve on error.
				1062	*/
				1063	static int may_linkat(struct path *link)
				1064	{
				1065	struct inode *inode = link->dentry->d_inode;
				1066
				1067	/* Inode writeback is not safe when the uid or gid are invalid. */
				1068	if (!uid_valid(inode->i_uid) \|\| !gid_valid(inode->i_gid))
				1069	return -EOVERFLOW;
				1070
				1071	if (!sysctl_protected_hardlinks)
				1072	return 0;
				1073
				1074	/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
				1075	* otherwise, it must be a safe source.
				1076	*/
				1077	if (safe_hardlink_source(inode) \|\| inode_owner_or_capable(inode))
				1078	return 0;
				1079
				1080	audit_log_link_denied("linkat");
				1081	return -EPERM;
				1082	}
				1083
				1084	/**
				1085	* may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
				1086	* should be allowed, or not, on files that already
				1087	* exist.
				1088	* @dir_mode: mode bits of directory
				1089	* @dir_uid: owner of directory
				1090	* @inode: the inode of the file to open
				1091	*
				1092	* Block an O_CREAT open of a FIFO (or a regular file) when:
				1093	* - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
				1094	* - the file already exists
				1095	* - we are in a sticky directory
				1096	* - we don't own the file
				1097	* - the owner of the directory doesn't own the file
				1098	* - the directory is world writable
				1099	* If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
				1100	* the directory doesn't have to be world writable: being group writable will
				1101	* be enough.
				1102	*
				1103	* Returns 0 if the open is allowed, -ve on error.
				1104	*/
				1105	static int may_create_in_sticky(umode_t dir_mode, kuid_t dir_uid,
				1106	struct inode * const inode)
				1107	{
				1108	if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) \|\|
				1109	(!sysctl_protected_regular && S_ISREG(inode->i_mode)) \|\|
				1110	likely(!(dir_mode & S_ISVTX)) \|\|
				1111	uid_eq(inode->i_uid, dir_uid) \|\|
				1112	uid_eq(current_fsuid(), inode->i_uid))
				1113	return 0;
				1114
				1115	if (likely(dir_mode & 0002) \|\|
				1116	(dir_mode & 0020 &&
				1117	((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) \|\|
				1118	(sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) {
				1119	return -EACCES;
				1120	}
				1121	return 0;
				1122	}
				1123
				1124	static __always_inline
				1125	const char get_link(struct nameidata nd)
				1126	{
				1127	struct saved *last = nd->stack + nd->depth - 1;
				1128	struct dentry *dentry = last->link.dentry;
				1129	struct inode *inode = nd->link_inode;
				1130	int error;
				1131	const char *res;
				1132
				1133	if (!(nd->flags & LOOKUP_RCU)) {
				1134	touch_atime(&last->link);
				1135	cond_resched();
				1136	} else if (atime_needs_update(&last->link, inode)) {
				1137	if (unlikely(unlazy_walk(nd)))
				1138	return ERR_PTR(-ECHILD);
				1139	touch_atime(&last->link);
				1140	}
				1141
				1142	error = security_inode_follow_link(dentry, inode,
				1143	nd->flags & LOOKUP_RCU);
				1144	if (unlikely(error))
				1145	return ERR_PTR(error);
				1146
				1147	nd->last_type = LAST_BIND;
				1148	res = READ_ONCE(inode->i_link);
				1149	if (!res) {
				1150	const char * (get)(struct dentry , struct inode *,
				1151	struct delayed_call *);
				1152	get = inode->i_op->get_link;
				1153	if (nd->flags & LOOKUP_RCU) {
				1154	res = get(NULL, inode, &last->done);
				1155	if (res == ERR_PTR(-ECHILD)) {
				1156	if (unlikely(unlazy_walk(nd)))
				1157	return ERR_PTR(-ECHILD);
				1158	res = get(dentry, inode, &last->done);
				1159	}
				1160	} else {
				1161	res = get(dentry, inode, &last->done);
				1162	}
				1163	if (IS_ERR_OR_NULL(res))
				1164	return res;
				1165	}
				1166	if (*res == '/') {
				1167	if (!nd->root.mnt)
				1168	set_root(nd);
				1169	if (unlikely(nd_jump_root(nd)))
				1170	return ERR_PTR(-ECHILD);
				1171	while (unlikely(*++res == '/'))
				1172	;
				1173	}
				1174	if (!*res)
				1175	res = NULL;
				1176	return res;
				1177	}
				1178
				1179	/*
				1180	* follow_up - Find the mountpoint of path's vfsmount
				1181	*
				1182	* Given a path, find the mountpoint of its source file system.
				1183	* Replace @path with the path of the mountpoint in the parent mount.
				1184	* Up is towards /.
				1185	*
				1186	* Return 1 if we went up a level and 0 if we were already at the
				1187	* root.
				1188	*/
				1189	int follow_up(struct path *path)
				1190	{
				1191	struct mount *mnt = real_mount(path->mnt);
				1192	struct mount *parent;
				1193	struct dentry *mountpoint;
				1194
				1195	read_seqlock_excl(&mount_lock);
				1196	parent = mnt->mnt_parent;
				1197	if (parent == mnt) {
				1198	read_sequnlock_excl(&mount_lock);
				1199	return 0;
				1200	}
				1201	mntget(&parent->mnt);
				1202	mountpoint = dget(mnt->mnt_mountpoint);
				1203	read_sequnlock_excl(&mount_lock);
				1204	dput(path->dentry);
				1205	path->dentry = mountpoint;
				1206	mntput(path->mnt);
				1207	path->mnt = &parent->mnt;
				1208	return 1;
				1209	}
				1210	EXPORT_SYMBOL(follow_up);
				1211
				1212	/*
				1213	* Perform an automount
				1214	* - return -EISDIR to tell follow_managed() to stop and return the path we
				1215	* were called with.
				1216	*/
				1217	static int follow_automount(struct path path, struct nameidata nd,
				1218	bool *need_mntput)
				1219	{
				1220	struct vfsmount *mnt;
				1221	int err;
				1222
				1223	if (!path->dentry->d_op \|\| !path->dentry->d_op->d_automount)
				1224	return -EREMOTE;
				1225
				1226	/* We don't want to mount if someone's just doing a stat -
				1227	* unless they're stat'ing a directory and appended a '/' to
				1228	* the name.
				1229	*
				1230	* We do, however, want to mount if someone wants to open or
				1231	* create a file of any type under the mountpoint, wants to
				1232	* traverse through the mountpoint or wants to open the
				1233	* mounted directory. Also, autofs may mark negative dentries
				1234	* as being automount points. These will need the attentions
				1235	* of the daemon to instantiate them before they can be used.
				1236	*/
				1237	if (!(nd->flags & (LOOKUP_PARENT \| LOOKUP_DIRECTORY \|
				1238	LOOKUP_OPEN \| LOOKUP_CREATE \| LOOKUP_AUTOMOUNT)) &&
				1239	path->dentry->d_inode)
				1240	return -EISDIR;
				1241
				1242	nd->total_link_count++;
				1243	if (nd->total_link_count >= 40)
				1244	return -ELOOP;
				1245
				1246	mnt = path->dentry->d_op->d_automount(path);
				1247	if (IS_ERR(mnt)) {
				1248	/*
				1249	* The filesystem is allowed to return -EISDIR here to indicate
				1250	* it doesn't want to automount. For instance, autofs would do
				1251	* this so that its userspace daemon can mount on this dentry.
				1252	*
				1253	* However, we can only permit this if it's a terminal point in
				1254	* the path being looked up; if it wasn't then the remainder of
				1255	* the path is inaccessible and we should say so.
				1256	*/
				1257	if (PTR_ERR(mnt) == -EISDIR && (nd->flags & LOOKUP_PARENT))
				1258	return -EREMOTE;
				1259	return PTR_ERR(mnt);
				1260	}
				1261
				1262	if (!mnt) /* mount collision */
				1263	return 0;
				1264
				1265	if (!*need_mntput) {
				1266	/* lock_mount() may release path->mnt on error */
				1267	mntget(path->mnt);
				1268	*need_mntput = true;
				1269	}
				1270	err = finish_automount(mnt, path);
				1271
				1272	switch (err) {
				1273	case -EBUSY:
				1274	/* Someone else made a mount here whilst we were busy */
				1275	return 0;
				1276	case 0:
				1277	path_put(path);
				1278	path->mnt = mnt;
				1279	path->dentry = dget(mnt->mnt_root);
				1280	return 0;
				1281	default:
				1282	return err;
				1283	}
				1284
				1285	}
				1286
				1287	/*
				1288	* Handle a dentry that is managed in some way.
				1289	* - Flagged for transit management (autofs)
				1290	* - Flagged as mountpoint
				1291	* - Flagged as automount point
				1292	*
				1293	* This may only be called in refwalk mode.
				1294	*
				1295	* Serialization is taken care of in namespace.c
				1296	*/
				1297	static int follow_managed(struct path path, struct nameidata nd)
				1298	{
				1299	struct vfsmount mnt = path->mnt; / held by caller, must be left alone */
				1300	unsigned managed;
				1301	bool need_mntput = false;
				1302	int ret = 0;
				1303
				1304	/* Given that we're not holding a lock here, we retain the value in a
				1305	* local variable for each dentry as we look at it so that we don't see
				1306	* the components of that value change under us */
				1307	while (managed = READ_ONCE(path->dentry->d_flags),
				1308	managed &= DCACHE_MANAGED_DENTRY,
				1309	unlikely(managed != 0)) {
				1310	/* Allow the filesystem to manage the transit without i_mutex
				1311	* being held. */
				1312	if (managed & DCACHE_MANAGE_TRANSIT) {
				1313	BUG_ON(!path->dentry->d_op);
				1314	BUG_ON(!path->dentry->d_op->d_manage);
				1315	ret = path->dentry->d_op->d_manage(path, false);
				1316	if (ret < 0)
				1317	break;
				1318	}
				1319
				1320	/* Transit to a mounted filesystem. */
				1321	if (managed & DCACHE_MOUNTED) {
				1322	struct vfsmount *mounted = lookup_mnt(path);
				1323	if (mounted) {
				1324	dput(path->dentry);
				1325	if (need_mntput)
				1326	mntput(path->mnt);
				1327	path->mnt = mounted;
				1328	path->dentry = dget(mounted->mnt_root);
				1329	need_mntput = true;
				1330	continue;
				1331	}
				1332
				1333	/* Something is mounted on this dentry in another
				1334	* namespace and/or whatever was mounted there in this
				1335	* namespace got unmounted before lookup_mnt() could
				1336	* get it */
				1337	}
				1338
				1339	/* Handle an automount point */
				1340	if (managed & DCACHE_NEED_AUTOMOUNT) {
				1341	ret = follow_automount(path, nd, &need_mntput);
				1342	if (ret < 0)
				1343	break;
				1344	continue;
				1345	}
				1346
				1347	/* We didn't change the current path point */
				1348	break;
				1349	}
				1350
				1351	if (need_mntput && path->mnt == mnt)
				1352	mntput(path->mnt);
				1353	if (ret == -EISDIR \|\| !ret)
				1354	ret = 1;
				1355	if (need_mntput)
				1356	nd->flags \|= LOOKUP_JUMPED;
				1357	if (unlikely(ret < 0))
				1358	path_put_conditional(path, nd);
				1359	return ret;
				1360	}
				1361
				1362	int follow_down_one(struct path *path)
				1363	{
				1364	struct vfsmount *mounted;
				1365
				1366	mounted = lookup_mnt(path);
				1367	if (mounted) {
				1368	dput(path->dentry);
				1369	mntput(path->mnt);
				1370	path->mnt = mounted;
				1371	path->dentry = dget(mounted->mnt_root);
				1372	return 1;
				1373	}
				1374	return 0;
				1375	}
				1376	EXPORT_SYMBOL(follow_down_one);
				1377
				1378	static inline int managed_dentry_rcu(const struct path *path)
				1379	{
				1380	return (path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) ?
				1381	path->dentry->d_op->d_manage(path, true) : 0;
				1382	}
				1383
				1384	/*
				1385	* Try to skip to top of mountpoint pile in rcuwalk mode. Fail if
				1386	* we meet a managed dentry that would need blocking.
				1387	*/
				1388	static bool __follow_mount_rcu(struct nameidata nd, struct path path,
				1389	struct inode *inode, unsigned seqp)
				1390	{
				1391	for (;;) {
				1392	struct mount *mounted;
				1393	/*
				1394	* Don't forget we might have a non-mountpoint managed dentry
				1395	* that wants to block transit.
				1396	*/
				1397	switch (managed_dentry_rcu(path)) {
				1398	case -ECHILD:
				1399	default:
				1400	return false;
				1401	case -EISDIR:
				1402	return true;
				1403	case 0:
				1404	break;
				1405	}
				1406
				1407	if (!d_mountpoint(path->dentry))
				1408	return !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
				1409
				1410	mounted = __lookup_mnt(path->mnt, path->dentry);
				1411	if (!mounted)
				1412	break;
				1413	path->mnt = &mounted->mnt;
				1414	path->dentry = mounted->mnt.mnt_root;
				1415	nd->flags \|= LOOKUP_JUMPED;
				1416	*seqp = read_seqcount_begin(&path->dentry->d_seq);
				1417	/*
				1418	* Update the inode too. We don't need to re-check the
				1419	* dentry sequence number here after this d_inode read,
				1420	* because a mount-point is always pinned.
				1421	*/
				1422	*inode = path->dentry->d_inode;
				1423	}
				1424	return !read_seqretry(&mount_lock, nd->m_seq) &&
				1425	!(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
				1426	}
				1427
				1428	static int follow_dotdot_rcu(struct nameidata *nd)
				1429	{
				1430	struct inode *inode = nd->inode;
				1431
				1432	while (1) {
				1433	if (path_equal(&nd->path, &nd->root))
				1434	break;
				1435	if (nd->path.dentry != nd->path.mnt->mnt_root) {
				1436	struct dentry *old = nd->path.dentry;
				1437	struct dentry *parent = old->d_parent;
				1438	unsigned seq;
				1439
				1440	inode = parent->d_inode;
				1441	seq = read_seqcount_begin(&parent->d_seq);
				1442	if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq)))
				1443	return -ECHILD;
				1444	nd->path.dentry = parent;
				1445	nd->seq = seq;
				1446	if (unlikely(!path_connected(&nd->path)))
				1447	return -ECHILD;
				1448	break;
				1449	} else {
				1450	struct mount *mnt = real_mount(nd->path.mnt);
				1451	struct mount *mparent = mnt->mnt_parent;
				1452	struct dentry *mountpoint = mnt->mnt_mountpoint;
				1453	struct inode *inode2 = mountpoint->d_inode;
				1454	unsigned seq = read_seqcount_begin(&mountpoint->d_seq);
				1455	if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
				1456	return -ECHILD;
				1457	if (&mparent->mnt == nd->path.mnt)
				1458	break;
				1459	/* we know that mountpoint was pinned */
				1460	nd->path.dentry = mountpoint;
				1461	nd->path.mnt = &mparent->mnt;
				1462	inode = inode2;
				1463	nd->seq = seq;
				1464	}
				1465	}
				1466	while (unlikely(d_mountpoint(nd->path.dentry))) {
				1467	struct mount *mounted;
				1468	mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry);
				1469	if (unlikely(read_seqretry(&mount_lock, nd->m_seq)))
				1470	return -ECHILD;
				1471	if (!mounted)
				1472	break;
				1473	nd->path.mnt = &mounted->mnt;
				1474	nd->path.dentry = mounted->mnt.mnt_root;
				1475	inode = nd->path.dentry->d_inode;
				1476	nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
				1477	}
				1478	nd->inode = inode;
				1479	return 0;
				1480	}
				1481
				1482	/*
				1483	* Follow down to the covering mount currently visible to userspace. At each
				1484	* point, the filesystem owning that dentry may be queried as to whether the
				1485	* caller is permitted to proceed or not.
				1486	*/
				1487	int follow_down(struct path *path)
				1488	{
				1489	unsigned managed;
				1490	int ret;
				1491
				1492	while (managed = READ_ONCE(path->dentry->d_flags),
				1493	unlikely(managed & DCACHE_MANAGED_DENTRY)) {
				1494	/* Allow the filesystem to manage the transit without i_mutex
				1495	* being held.
				1496	*
				1497	* We indicate to the filesystem if someone is trying to mount
				1498	* something here. This gives autofs the chance to deny anyone
				1499	* other than its daemon the right to mount on its
				1500	* superstructure.
				1501	*
				1502	* The filesystem may sleep at this point.
				1503	*/
				1504	if (managed & DCACHE_MANAGE_TRANSIT) {
				1505	BUG_ON(!path->dentry->d_op);
				1506	BUG_ON(!path->dentry->d_op->d_manage);
				1507	ret = path->dentry->d_op->d_manage(path, false);
				1508	if (ret < 0)
				1509	return ret == -EISDIR ? 0 : ret;
				1510	}
				1511
				1512	/* Transit to a mounted filesystem. */
				1513	if (managed & DCACHE_MOUNTED) {
				1514	struct vfsmount *mounted = lookup_mnt(path);
				1515	if (!mounted)
				1516	break;
				1517	dput(path->dentry);
				1518	mntput(path->mnt);
				1519	path->mnt = mounted;
				1520	path->dentry = dget(mounted->mnt_root);
				1521	continue;
				1522	}
				1523
				1524	/* Don't handle automount points here */
				1525	break;
				1526	}
				1527	return 0;
				1528	}
				1529	EXPORT_SYMBOL(follow_down);
				1530
				1531	/*
				1532	* Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
				1533	*/
				1534	static void follow_mount(struct path *path)
				1535	{
				1536	while (d_mountpoint(path->dentry)) {
				1537	struct vfsmount *mounted = lookup_mnt(path);
				1538	if (!mounted)
				1539	break;
				1540	dput(path->dentry);
				1541	mntput(path->mnt);
				1542	path->mnt = mounted;
				1543	path->dentry = dget(mounted->mnt_root);
				1544	}
				1545	}
				1546
				1547	static int path_parent_directory(struct path *path)
				1548	{
				1549	struct dentry *old = path->dentry;
				1550	/* rare case of legitimate dget_parent()... */
				1551	path->dentry = dget_parent(path->dentry);
				1552	dput(old);
				1553	if (unlikely(!path_connected(path)))
				1554	return -ENOENT;
				1555	return 0;
				1556	}
				1557
				1558	static int follow_dotdot(struct nameidata *nd)
				1559	{
				1560	while(1) {
				1561	if (path_equal(&nd->path, &nd->root))
				1562	break;
				1563	if (nd->path.dentry != nd->path.mnt->mnt_root) {
				1564	int ret = path_parent_directory(&nd->path);
				1565	if (ret)
				1566	return ret;
				1567	break;
				1568	}
				1569	if (!follow_up(&nd->path))
				1570	break;
				1571	}
				1572	follow_mount(&nd->path);
				1573	nd->inode = nd->path.dentry->d_inode;
				1574	return 0;
				1575	}
				1576
				1577	/*
				1578	* This looks up the name in dcache and possibly revalidates the found dentry.
				1579	* NULL is returned if the dentry does not exist in the cache.
				1580	*/
				1581	static struct dentry lookup_dcache(const struct qstr name,
				1582	struct dentry *dir,
				1583	unsigned int flags)
				1584	{
				1585	struct dentry *dentry = d_lookup(dir, name);
				1586	if (dentry) {
				1587	int error = d_revalidate(dentry, flags);
				1588	if (unlikely(error <= 0)) {
				1589	if (!error)
				1590	d_invalidate(dentry);
				1591	dput(dentry);
				1592	return ERR_PTR(error);
				1593	}
				1594	}
				1595	return dentry;
				1596	}
				1597
				1598	/*
				1599	* Parent directory has inode locked exclusive. This is one
				1600	* and only case when ->lookup() gets called on non in-lookup
				1601	* dentries - as the matter of fact, this only gets called
				1602	* when directory is guaranteed to have no in-lookup children
				1603	* at all.
				1604	*/
				1605	static struct dentry __lookup_hash(const struct qstr name,
				1606	struct dentry *base, unsigned int flags)
				1607	{
				1608	struct dentry *dentry = lookup_dcache(name, base, flags);
				1609	struct dentry *old;
				1610	struct inode *dir = base->d_inode;
				1611
				1612	if (dentry)
				1613	return dentry;
				1614
				1615	/* Don't create child dentry for a dead directory. */
				1616	if (unlikely(IS_DEADDIR(dir)))
				1617	return ERR_PTR(-ENOENT);
				1618
				1619	dentry = d_alloc(base, name);
				1620	if (unlikely(!dentry))
				1621	return ERR_PTR(-ENOMEM);
				1622
				1623	old = dir->i_op->lookup(dir, dentry, flags);
				1624	if (unlikely(old)) {
				1625	dput(dentry);
				1626	dentry = old;
				1627	}
				1628	return dentry;
				1629	}
				1630
				1631	static int lookup_fast(struct nameidata *nd,
				1632	struct path path, struct inode *inode,
				1633	unsigned *seqp)
				1634	{
				1635	struct vfsmount *mnt = nd->path.mnt;
				1636	struct dentry dentry, parent = nd->path.dentry;
				1637	int status = 1;
				1638	int err;
				1639
				1640	/*
				1641	* Rename seqlock is not required here because in the off chance
				1642	* of a false negative due to a concurrent rename, the caller is
				1643	* going to fall back to non-racy lookup.
				1644	*/
				1645	if (nd->flags & LOOKUP_RCU) {
				1646	unsigned seq;
				1647	bool negative;
				1648	dentry = __d_lookup_rcu(parent, &nd->last, &seq);
				1649	if (unlikely(!dentry)) {
				1650	if (unlazy_walk(nd))
				1651	return -ECHILD;
				1652	return 0;
				1653	}
				1654
				1655	/*
				1656	* This sequence count validates that the inode matches
				1657	* the dentry name information from lookup.
				1658	*/
				1659	*inode = d_backing_inode(dentry);
				1660	negative = d_is_negative(dentry);
				1661	if (unlikely(read_seqcount_retry(&dentry->d_seq, seq)))
				1662	return -ECHILD;
				1663
				1664	/*
				1665	* This sequence count validates that the parent had no
				1666	* changes while we did the lookup of the dentry above.
				1667	*
				1668	* The memory barrier in read_seqcount_begin of child is
				1669	* enough, we can use __read_seqcount_retry here.
				1670	*/
				1671	if (unlikely(__read_seqcount_retry(&parent->d_seq, nd->seq)))
				1672	return -ECHILD;
				1673
				1674	*seqp = seq;
				1675	status = d_revalidate(dentry, nd->flags);
				1676	if (likely(status > 0)) {
				1677	/*
				1678	* Note: do negative dentry check after revalidation in
				1679	* case that drops it.
				1680	*/
				1681	if (unlikely(negative))
				1682	return -ENOENT;
				1683	path->mnt = mnt;
				1684	path->dentry = dentry;
				1685	if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
				1686	return 1;
				1687	}
				1688	if (unlazy_child(nd, dentry, seq))
				1689	return -ECHILD;
				1690	if (unlikely(status == -ECHILD))
				1691	/* we'd been told to redo it in non-rcu mode */
				1692	status = d_revalidate(dentry, nd->flags);
				1693	} else {
				1694	dentry = __d_lookup(parent, &nd->last);
				1695	if (unlikely(!dentry))
				1696	return 0;
				1697	status = d_revalidate(dentry, nd->flags);
				1698	}
				1699	if (unlikely(status <= 0)) {
				1700	if (!status)
				1701	d_invalidate(dentry);
				1702	dput(dentry);
				1703	return status;
				1704	}
				1705	if (unlikely(d_is_negative(dentry))) {
				1706	dput(dentry);
				1707	return -ENOENT;
				1708	}
				1709
				1710	path->mnt = mnt;
				1711	path->dentry = dentry;
				1712	err = follow_managed(path, nd);
				1713	if (likely(err > 0))
				1714	*inode = d_backing_inode(path->dentry);
				1715	return err;
				1716	}
				1717
				1718	/* Fast lookup failed, do it the slow way */
				1719	static struct dentry __lookup_slow(const struct qstr name,
				1720	struct dentry *dir,
				1721	unsigned int flags)
				1722	{
				1723	struct dentry dentry, old;
				1724	struct inode *inode = dir->d_inode;
				1725	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
				1726
				1727	/* Don't go there if it's already dead */
				1728	if (unlikely(IS_DEADDIR(inode)))
				1729	return ERR_PTR(-ENOENT);
				1730	again:
				1731	dentry = d_alloc_parallel(dir, name, &wq);
				1732	if (IS_ERR(dentry))
				1733	return dentry;
				1734	if (unlikely(!d_in_lookup(dentry))) {
				1735	if (!(flags & LOOKUP_NO_REVAL)) {
				1736	int error = d_revalidate(dentry, flags);
				1737	if (unlikely(error <= 0)) {
				1738	if (!error) {
				1739	d_invalidate(dentry);
				1740	dput(dentry);
				1741	goto again;
				1742	}
				1743	dput(dentry);
				1744	dentry = ERR_PTR(error);
				1745	}
				1746	}
				1747	} else {
				1748	old = inode->i_op->lookup(inode, dentry, flags);
				1749	d_lookup_done(dentry);
				1750	if (unlikely(old)) {
				1751	dput(dentry);
				1752	dentry = old;
				1753	}
				1754	}
				1755	return dentry;
				1756	}
				1757
				1758	static struct dentry lookup_slow(const struct qstr name,
				1759	struct dentry *dir,
				1760	unsigned int flags)
				1761	{
				1762	struct inode *inode = dir->d_inode;
				1763	struct dentry *res;
				1764	inode_lock_shared(inode);
				1765	res = __lookup_slow(name, dir, flags);
				1766	inode_unlock_shared(inode);
				1767	return res;
				1768	}
				1769
				1770	static inline int may_lookup(struct nameidata *nd)
				1771	{
				1772	if (nd->flags & LOOKUP_RCU) {
				1773	int err = inode_permission(nd->inode, MAY_EXEC\|MAY_NOT_BLOCK);
				1774	if (err != -ECHILD)
				1775	return err;
				1776	if (unlazy_walk(nd))
				1777	return -ECHILD;
				1778	}
				1779	return inode_permission(nd->inode, MAY_EXEC);
				1780	}
				1781
				1782	static inline int handle_dots(struct nameidata *nd, int type)
				1783	{
				1784	if (type == LAST_DOTDOT) {
				1785	if (!nd->root.mnt)
				1786	set_root(nd);
				1787	if (nd->flags & LOOKUP_RCU) {
				1788	return follow_dotdot_rcu(nd);
				1789	} else
				1790	return follow_dotdot(nd);
				1791	}
				1792	return 0;
				1793	}
				1794
				1795	static int pick_link(struct nameidata nd, struct path link,
				1796	struct inode *inode, unsigned seq)
				1797	{
				1798	int error;
				1799	struct saved *last;
				1800	if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) {
				1801	path_to_nameidata(link, nd);
				1802	return -ELOOP;
				1803	}
				1804	if (!(nd->flags & LOOKUP_RCU)) {
				1805	if (link->mnt == nd->path.mnt)
				1806	mntget(link->mnt);
				1807	}
				1808	error = nd_alloc_stack(nd);
				1809	if (unlikely(error)) {
				1810	if (error == -ECHILD) {
				1811	if (unlikely(!legitimize_path(nd, link, seq))) {
				1812	drop_links(nd);
				1813	nd->depth = 0;
				1814	nd->flags &= ~LOOKUP_RCU;
				1815	nd->path.mnt = NULL;
				1816	nd->path.dentry = NULL;
				1817	rcu_read_unlock();
				1818	} else if (likely(unlazy_walk(nd)) == 0)
				1819	error = nd_alloc_stack(nd);
				1820	}
				1821	if (error) {
				1822	path_put(link);
				1823	return error;
				1824	}
				1825	}
				1826
				1827	last = nd->stack + nd->depth++;
				1828	last->link = *link;
				1829	clear_delayed_call(&last->done);
				1830	nd->link_inode = inode;
				1831	last->seq = seq;
				1832	return 1;
				1833	}
				1834
				1835	enum {WALK_FOLLOW = 1, WALK_MORE = 2};
				1836
				1837	/*
				1838	* Do we need to follow links? We _really_ want to be able
				1839	* to do this check without having to look at inode->i_op,
				1840	* so we keep a cache of "no, this doesn't need follow_link"
				1841	* for the common case.
				1842	*/
				1843	static inline int step_into(struct nameidata nd, struct path path,
				1844	int flags, struct inode *inode, unsigned seq)
				1845	{
				1846	if (!(flags & WALK_MORE) && nd->depth)
				1847	put_link(nd);
				1848	if (likely(!d_is_symlink(path->dentry)) \|\|
				1849	!(flags & WALK_FOLLOW \|\| nd->flags & LOOKUP_FOLLOW)) {
				1850	/* not a symlink or should not follow */
				1851	path_to_nameidata(path, nd);
				1852	nd->inode = inode;
				1853	nd->seq = seq;
				1854	return 0;
				1855	}
				1856	/* make sure that d_is_symlink above matches inode */
				1857	if (nd->flags & LOOKUP_RCU) {
				1858	if (read_seqcount_retry(&path->dentry->d_seq, seq))
				1859	return -ECHILD;
				1860	}
				1861	return pick_link(nd, path, inode, seq);
				1862	}
				1863
				1864	static int walk_component(struct nameidata *nd, int flags)
				1865	{
				1866	struct path path;
				1867	struct inode *inode;
				1868	unsigned seq;
				1869	int err;
				1870	/*
				1871	* "." and ".." are special - ".." especially so because it has
				1872	* to be able to know about the current root directory and
				1873	* parent relationships.
				1874	*/
				1875	if (unlikely(nd->last_type != LAST_NORM)) {
				1876	err = handle_dots(nd, nd->last_type);
				1877	if (!(flags & WALK_MORE) && nd->depth)
				1878	put_link(nd);
				1879	return err;
				1880	}
				1881	err = lookup_fast(nd, &path, &inode, &seq);
				1882	if (unlikely(err <= 0)) {
				1883	if (err < 0)
				1884	return err;
				1885	path.dentry = lookup_slow(&nd->last, nd->path.dentry,
				1886	nd->flags);
				1887	if (IS_ERR(path.dentry))
				1888	return PTR_ERR(path.dentry);
				1889
				1890	path.mnt = nd->path.mnt;
				1891	err = follow_managed(&path, nd);
				1892	if (unlikely(err < 0))
				1893	return err;
				1894
				1895	if (unlikely(d_is_negative(path.dentry))) {
				1896	path_to_nameidata(&path, nd);
				1897	return -ENOENT;
				1898	}
				1899
				1900	seq = 0; /* we are already out of RCU mode */
				1901	inode = d_backing_inode(path.dentry);
				1902	}
				1903
				1904	return step_into(nd, &path, flags, inode, seq);
				1905	}
				1906
				1907	/*
				1908	* We can do the critical dentry name comparison and hashing
				1909	* operations one word at a time, but we are limited to:
				1910	*
				1911	* - Architectures with fast unaligned word accesses. We could
				1912	* do a "get_unaligned()" if this helps and is sufficiently
				1913	* fast.
				1914	*
				1915	* - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
				1916	* do not trap on the (extremely unlikely) case of a page
				1917	* crossing operation.
				1918	*
				1919	* - Furthermore, we need an efficient 64-bit compile for the
				1920	* 64-bit case in order to generate the "number of bytes in
				1921	* the final mask". Again, that could be replaced with a
				1922	* efficient population count instruction or similar.
				1923	*/
				1924	#ifdef CONFIG_DCACHE_WORD_ACCESS
				1925
				1926	#include <asm/word-at-a-time.h>
				1927
				1928	#ifdef HASH_MIX
				1929
				1930	/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
				1931
				1932	#elif defined(CONFIG_64BIT)
				1933	/*
				1934	* Register pressure in the mixing function is an issue, particularly
				1935	* on 32-bit x86, but almost any function requires one state value and
				1936	* one temporary. Instead, use a function designed for two state values
				1937	* and no temporaries.
				1938	*
				1939	* This function cannot create a collision in only two iterations, so
				1940	* we have two iterations to achieve avalanche. In those two iterations,
				1941	* we have six layers of mixing, which is enough to spread one bit's
				1942	* influence out to 2^6 = 64 state bits.
				1943	*
				1944	* Rotate constants are scored by considering either 64 one-bit input
				1945	* deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
				1946	* probability of that delta causing a change to each of the 128 output
				1947	* bits, using a sample of random initial states.
				1948	*
				1949	* The Shannon entropy of the computed probabilities is then summed
				1950	* to produce a score. Ideally, any input change has a 50% chance of
				1951	* toggling any given output bit.
				1952	*
				1953	* Mixing scores (in bits) for (12,45):
				1954	* Input delta: 1-bit 2-bit
				1955	* 1 round: 713.3 42542.6
				1956	* 2 rounds: 2753.7 140389.8
				1957	* 3 rounds: 5954.1 233458.2
				1958	* 4 rounds: 7862.6 256672.2
				1959	* Perfect: 8192 258048
				1960	* (64*128) (64*63/2 * 128)
				1961	*/
				1962	#define HASH_MIX(x, y, a) \
				1963	( x ^= (a), \
				1964	y ^= x, x = rol64(x,12),\
				1965	x += y, y = rol64(y,45),\
				1966	y *= 9 )
				1967
				1968	/*
				1969	* Fold two longs into one 32-bit hash value. This must be fast, but
				1970	* latency isn't quite as critical, as there is a fair bit of additional
				1971	* work done before the hash value is used.
				1972	*/
				1973	static inline unsigned int fold_hash(unsigned long x, unsigned long y)
				1974	{
				1975	y ^= x * GOLDEN_RATIO_64;
				1976	y *= GOLDEN_RATIO_64;
				1977	return y >> 32;
				1978	}
				1979
				1980	#else /* 32-bit case */
				1981
				1982	/*
				1983	* Mixing scores (in bits) for (7,20):
				1984	* Input delta: 1-bit 2-bit
				1985	* 1 round: 330.3 9201.6
				1986	* 2 rounds: 1246.4 25475.4
				1987	* 3 rounds: 1907.1 31295.1
				1988	* 4 rounds: 2042.3 31718.6
				1989	* Perfect: 2048 31744
				1990	* (32*64) (32*31/2 * 64)
				1991	*/
				1992	#define HASH_MIX(x, y, a) \
				1993	( x ^= (a), \
				1994	y ^= x, x = rol32(x, 7),\
				1995	x += y, y = rol32(y,20),\
				1996	y *= 9 )
				1997
				1998	static inline unsigned int fold_hash(unsigned long x, unsigned long y)
				1999	{
				2000	/* Use arch-optimized multiply if one exists */
				2001	return __hash_32(y ^ __hash_32(x));
				2002	}
				2003
				2004	#endif
				2005
				2006	/*
				2007	* Return the hash of a string of known length. This is carfully
				2008	* designed to match hash_name(), which is the more critical function.
				2009	* In particular, we must end by hashing a final word containing 0..7
				2010	* payload bytes, to match the way that hash_name() iterates until it
				2011	* finds the delimiter after the name.
				2012	*/
				2013	unsigned int full_name_hash(const void salt, const char name, unsigned int len)
				2014	{
				2015	unsigned long a, x = 0, y = (unsigned long)salt;
				2016
				2017	for (;;) {
				2018	if (!len)
				2019	goto done;
				2020	a = load_unaligned_zeropad(name);
				2021	if (len < sizeof(unsigned long))
				2022	break;
				2023	HASH_MIX(x, y, a);
				2024	name += sizeof(unsigned long);
				2025	len -= sizeof(unsigned long);
				2026	}
				2027	x ^= a & bytemask_from_count(len);
				2028	done:
				2029	return fold_hash(x, y);
				2030	}
				2031	EXPORT_SYMBOL(full_name_hash);
				2032
				2033	/* Return the "hash_len" (hash and length) of a null-terminated string */
				2034	u64 hashlen_string(const void salt, const char name)
				2035	{
				2036	unsigned long a = 0, x = 0, y = (unsigned long)salt;
				2037	unsigned long adata, mask, len;
				2038	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
				2039
				2040	len = 0;
				2041	goto inside;
				2042
				2043	do {
				2044	HASH_MIX(x, y, a);
				2045	len += sizeof(unsigned long);
				2046	inside:
				2047	a = load_unaligned_zeropad(name+len);
				2048	} while (!has_zero(a, &adata, &constants));
				2049
				2050	adata = prep_zero_mask(a, adata, &constants);
				2051	mask = create_zero_mask(adata);
				2052	x ^= a & zero_bytemask(mask);
				2053
				2054	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
				2055	}
				2056	EXPORT_SYMBOL(hashlen_string);
				2057
				2058	/*
				2059	* Calculate the length and hash of the path component, and
				2060	* return the "hash_len" as the result.
				2061	*/
				2062	static inline u64 hash_name(const void salt, const char name)
				2063	{
				2064	unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
				2065	unsigned long adata, bdata, mask, len;
				2066	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
				2067
				2068	len = 0;
				2069	goto inside;
				2070
				2071	do {
				2072	HASH_MIX(x, y, a);
				2073	len += sizeof(unsigned long);
				2074	inside:
				2075	a = load_unaligned_zeropad(name+len);
				2076	b = a ^ REPEAT_BYTE('/');
				2077	} while (!(has_zero(a, &adata, &constants) \| has_zero(b, &bdata, &constants)));
				2078
				2079	adata = prep_zero_mask(a, adata, &constants);
				2080	bdata = prep_zero_mask(b, bdata, &constants);
				2081	mask = create_zero_mask(adata \| bdata);
				2082	x ^= a & zero_bytemask(mask);
				2083
				2084	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
				2085	}
				2086
				2087	#else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
				2088
				2089	/* Return the hash of a string of known length */
				2090	unsigned int full_name_hash(const void salt, const char name, unsigned int len)
				2091	{
				2092	unsigned long hash = init_name_hash(salt);
				2093	while (len--)
				2094	hash = partial_name_hash((unsigned char)*name++, hash);
				2095	return end_name_hash(hash);
				2096	}
				2097	EXPORT_SYMBOL(full_name_hash);
				2098
				2099	/* Return the "hash_len" (hash and length) of a null-terminated string */
				2100	u64 hashlen_string(const void salt, const char name)
				2101	{
				2102	unsigned long hash = init_name_hash(salt);
				2103	unsigned long len = 0, c;
				2104
				2105	c = (unsigned char)*name;
				2106	while (c) {
				2107	len++;
				2108	hash = partial_name_hash(c, hash);
				2109	c = (unsigned char)name[len];
				2110	}
				2111	return hashlen_create(end_name_hash(hash), len);
				2112	}
				2113	EXPORT_SYMBOL(hashlen_string);
				2114
				2115	/*
				2116	* We know there's a real path component here of at least
				2117	* one character.
				2118	*/
				2119	static inline u64 hash_name(const void salt, const char name)
				2120	{
				2121	unsigned long hash = init_name_hash(salt);
				2122	unsigned long len = 0, c;
				2123
				2124	c = (unsigned char)*name;
				2125	do {
				2126	len++;
				2127	hash = partial_name_hash(c, hash);
				2128	c = (unsigned char)name[len];
				2129	} while (c && c != '/');
				2130	return hashlen_create(end_name_hash(hash), len);
				2131	}
				2132
				2133	#endif
				2134
				2135	/*
				2136	* Name resolution.
				2137	* This is the basic name resolution function, turning a pathname into
				2138	* the final dentry. We expect 'base' to be positive and a directory.
				2139	*
				2140	* Returns 0 and nd will have valid dentry and mnt on success.
				2141	* Returns error and drops reference to input namei data on failure.
				2142	*/
				2143	static int link_path_walk(const char name, struct nameidata nd)
				2144	{
				2145	int err;
				2146
				2147	if (IS_ERR(name))
				2148	return PTR_ERR(name);
				2149	while (*name=='/')
				2150	name++;
				2151	if (!*name)
				2152	return 0;
				2153
				2154	/* At this point we know we have a real path component. */
				2155	for(;;) {
				2156	u64 hash_len;
				2157	int type;
				2158
				2159	err = may_lookup(nd);
				2160	if (err)
				2161	return err;
				2162
				2163	hash_len = hash_name(nd->path.dentry, name);
				2164
				2165	type = LAST_NORM;
				2166	if (name[0] == '.') switch (hashlen_len(hash_len)) {
				2167	case 2:
				2168	if (name[1] == '.') {
				2169	type = LAST_DOTDOT;
				2170	nd->flags \|= LOOKUP_JUMPED;
				2171	}
				2172	break;
				2173	case 1:
				2174	type = LAST_DOT;
				2175	}
				2176	if (likely(type == LAST_NORM)) {
				2177	struct dentry *parent = nd->path.dentry;
				2178	nd->flags &= ~LOOKUP_JUMPED;
				2179	if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
				2180	struct qstr this = { { .hash_len = hash_len }, .name = name };
				2181	err = parent->d_op->d_hash(parent, &this);
				2182	if (err < 0)
				2183	return err;
				2184	hash_len = this.hash_len;
				2185	name = this.name;
				2186	}
				2187	}
				2188
				2189	nd->last.hash_len = hash_len;
				2190	nd->last.name = name;
				2191	nd->last_type = type;
				2192
				2193	name += hashlen_len(hash_len);
				2194	if (!*name)
				2195	goto OK;
				2196	/*
				2197	* If it wasn't NUL, we know it was '/'. Skip that
				2198	* slash, and continue until no more slashes.
				2199	*/
				2200	do {
				2201	name++;
				2202	} while (unlikely(*name == '/'));
				2203	if (unlikely(!*name)) {
				2204	OK:
				2205	/* pathname body, done */
				2206	if (!nd->depth)
				2207	return 0;
				2208	name = nd->stack[nd->depth - 1].name;
				2209	/* trailing symlink, done */
				2210	if (!name)
				2211	return 0;
				2212	/* last component of nested symlink */
				2213	err = walk_component(nd, WALK_FOLLOW);
				2214	} else {
				2215	/* not the last component */
				2216	err = walk_component(nd, WALK_FOLLOW \| WALK_MORE);
				2217	}
				2218	if (err < 0)
				2219	return err;
				2220
				2221	if (err) {
				2222	const char *s = get_link(nd);
				2223
				2224	if (IS_ERR(s))
				2225	return PTR_ERR(s);
				2226	err = 0;
				2227	if (unlikely(!s)) {
				2228	/* jumped */
				2229	put_link(nd);
				2230	} else {
				2231	nd->stack[nd->depth - 1].name = name;
				2232	name = s;
				2233	continue;
				2234	}
				2235	}
				2236	if (unlikely(!d_can_lookup(nd->path.dentry))) {
				2237	if (nd->flags & LOOKUP_RCU) {
				2238	if (unlazy_walk(nd))
				2239	return -ECHILD;
				2240	}
				2241	return -ENOTDIR;
				2242	}
				2243	}
				2244	}
				2245
				2246	/* must be paired with terminate_walk() */
				2247	static const char path_init(struct nameidata nd, unsigned flags)
				2248	{
				2249	const char *s = nd->name->name;
				2250
				2251	if (!*s)
				2252	flags &= ~LOOKUP_RCU;
				2253	if (flags & LOOKUP_RCU)
				2254	rcu_read_lock();
				2255
				2256	nd->last_type = LAST_ROOT; /* if there are only slashes... */
				2257	nd->flags = flags \| LOOKUP_JUMPED \| LOOKUP_PARENT;
				2258	nd->depth = 0;
				2259	if (flags & LOOKUP_ROOT) {
				2260	struct dentry *root = nd->root.dentry;
				2261	struct inode *inode = root->d_inode;
				2262	if (*s && unlikely(!d_can_lookup(root)))
				2263	return ERR_PTR(-ENOTDIR);
				2264	nd->path = nd->root;
				2265	nd->inode = inode;
				2266	if (flags & LOOKUP_RCU) {
				2267	nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
				2268	nd->root_seq = nd->seq;
				2269	nd->m_seq = read_seqbegin(&mount_lock);
				2270	} else {
				2271	path_get(&nd->path);
				2272	}
				2273	return s;
				2274	}
				2275
				2276	nd->root.mnt = NULL;
				2277	nd->path.mnt = NULL;
				2278	nd->path.dentry = NULL;
				2279
				2280	nd->m_seq = read_seqbegin(&mount_lock);
				2281	if (*s == '/') {
				2282	set_root(nd);
				2283	if (likely(!nd_jump_root(nd)))
				2284	return s;
				2285	return ERR_PTR(-ECHILD);
				2286	} else if (nd->dfd == AT_FDCWD) {
				2287	if (flags & LOOKUP_RCU) {
				2288	struct fs_struct *fs = current->fs;
				2289	unsigned seq;
				2290
				2291	do {
				2292	seq = read_seqcount_begin(&fs->seq);
				2293	nd->path = fs->pwd;
				2294	nd->inode = nd->path.dentry->d_inode;
				2295	nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
				2296	} while (read_seqcount_retry(&fs->seq, seq));
				2297	} else {
				2298	get_fs_pwd(current->fs, &nd->path);
				2299	nd->inode = nd->path.dentry->d_inode;
				2300	}
				2301	return s;
				2302	} else {
				2303	/* Caller must check execute permissions on the starting path component */
				2304	struct fd f = fdget_raw(nd->dfd);
				2305	struct dentry *dentry;
				2306
				2307	if (!f.file)
				2308	return ERR_PTR(-EBADF);
				2309
				2310	dentry = f.file->f_path.dentry;
				2311
				2312	if (*s && unlikely(!d_can_lookup(dentry))) {
				2313	fdput(f);
				2314	return ERR_PTR(-ENOTDIR);
				2315	}
				2316
				2317	nd->path = f.file->f_path;
				2318	if (flags & LOOKUP_RCU) {
				2319	nd->inode = nd->path.dentry->d_inode;
				2320	nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
				2321	} else {
				2322	path_get(&nd->path);
				2323	nd->inode = nd->path.dentry->d_inode;
				2324	}
				2325	fdput(f);
				2326	return s;
				2327	}
				2328	}
				2329
				2330	static const char trailing_symlink(struct nameidata nd)
				2331	{
				2332	const char *s;
				2333	int error = may_follow_link(nd);
				2334	if (unlikely(error))
				2335	return ERR_PTR(error);
				2336	nd->flags \|= LOOKUP_PARENT;
				2337	nd->stack[0].name = NULL;
				2338	s = get_link(nd);
				2339	return s ? s : "";
				2340	}
				2341
				2342	static inline int lookup_last(struct nameidata *nd)
				2343	{
				2344	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
				2345	nd->flags \|= LOOKUP_FOLLOW \| LOOKUP_DIRECTORY;
				2346
				2347	nd->flags &= ~LOOKUP_PARENT;
				2348	return walk_component(nd, 0);
				2349	}
				2350
				2351	static int handle_lookup_down(struct nameidata *nd)
				2352	{
				2353	struct path path = nd->path;
				2354	struct inode *inode = nd->inode;
				2355	unsigned seq = nd->seq;
				2356	int err;
				2357
				2358	if (nd->flags & LOOKUP_RCU) {
				2359	/*
				2360	* don't bother with unlazy_walk on failure - we are
				2361	* at the very beginning of walk, so we lose nothing
				2362	* if we simply redo everything in non-RCU mode
				2363	*/
				2364	if (unlikely(!__follow_mount_rcu(nd, &path, &inode, &seq)))
				2365	return -ECHILD;
				2366	} else {
				2367	dget(path.dentry);
				2368	err = follow_managed(&path, nd);
				2369	if (unlikely(err < 0))
				2370	return err;
				2371	inode = d_backing_inode(path.dentry);
				2372	seq = 0;
				2373	}
				2374	path_to_nameidata(&path, nd);
				2375	nd->inode = inode;
				2376	nd->seq = seq;
				2377	return 0;
				2378	}
				2379
				2380	/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
				2381	static int path_lookupat(struct nameidata nd, unsigned flags, struct path path)
				2382	{
				2383	const char *s = path_init(nd, flags);
				2384	int err;
				2385
				2386	if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) {
				2387	err = handle_lookup_down(nd);
				2388	if (unlikely(err < 0))
				2389	s = ERR_PTR(err);
				2390	}
				2391
				2392	while (!(err = link_path_walk(s, nd))
				2393	&& ((err = lookup_last(nd)) > 0)) {
				2394	s = trailing_symlink(nd);
				2395	}
				2396	if (!err)
				2397	err = complete_walk(nd);
				2398
				2399	if (!err && nd->flags & LOOKUP_DIRECTORY)
				2400	if (!d_can_lookup(nd->path.dentry))
				2401	err = -ENOTDIR;
				2402	if (!err) {
				2403	*path = nd->path;
				2404	nd->path.mnt = NULL;
				2405	nd->path.dentry = NULL;
				2406	}
				2407	terminate_walk(nd);
				2408	return err;
				2409	}
				2410
				2411	int filename_lookup(int dfd, struct filename *name, unsigned flags,
				2412	struct path path, struct path root)
				2413	{
				2414	int retval;
				2415	struct nameidata nd;
				2416	if (IS_ERR(name))
				2417	return PTR_ERR(name);
				2418	if (unlikely(root)) {
				2419	nd.root = *root;
				2420	flags \|= LOOKUP_ROOT;
				2421	}
				2422	set_nameidata(&nd, dfd, name);
				2423	retval = path_lookupat(&nd, flags \| LOOKUP_RCU, path);
				2424	if (unlikely(retval == -ECHILD))
				2425	retval = path_lookupat(&nd, flags, path);
				2426	if (unlikely(retval == -ESTALE))
				2427	retval = path_lookupat(&nd, flags \| LOOKUP_REVAL, path);
				2428
				2429	if (likely(!retval))
				2430	audit_inode(name, path->dentry, 0);
				2431	restore_nameidata();
				2432	putname(name);
				2433	return retval;
				2434	}
				2435
				2436	/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
				2437	static int path_parentat(struct nameidata *nd, unsigned flags,
				2438	struct path *parent)
				2439	{
				2440	const char *s = path_init(nd, flags);
				2441	int err = link_path_walk(s, nd);
				2442	if (!err)
				2443	err = complete_walk(nd);
				2444	if (!err) {
				2445	*parent = nd->path;
				2446	nd->path.mnt = NULL;
				2447	nd->path.dentry = NULL;
				2448	}
				2449	terminate_walk(nd);
				2450	return err;
				2451	}
				2452
				2453	static struct filename filename_parentat(int dfd, struct filename name,
				2454	unsigned int flags, struct path *parent,
				2455	struct qstr last, int type)
				2456	{
				2457	int retval;
				2458	struct nameidata nd;
				2459
				2460	if (IS_ERR(name))
				2461	return name;
				2462	set_nameidata(&nd, dfd, name);
				2463	retval = path_parentat(&nd, flags \| LOOKUP_RCU, parent);
				2464	if (unlikely(retval == -ECHILD))
				2465	retval = path_parentat(&nd, flags, parent);
				2466	if (unlikely(retval == -ESTALE))
				2467	retval = path_parentat(&nd, flags \| LOOKUP_REVAL, parent);
				2468	if (likely(!retval)) {
				2469	*last = nd.last;
				2470	*type = nd.last_type;
				2471	audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
				2472	} else {
				2473	putname(name);
				2474	name = ERR_PTR(retval);
				2475	}
				2476	restore_nameidata();
				2477	return name;
				2478	}
				2479
				2480	/* does lookup, returns the object with parent locked */
				2481	struct dentry kern_path_locked(const char name, struct path *path)
				2482	{
				2483	struct filename *filename;
				2484	struct dentry *d;
				2485	struct qstr last;
				2486	int type;
				2487
				2488	filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path,
				2489	&last, &type);
				2490	if (IS_ERR(filename))
				2491	return ERR_CAST(filename);
				2492	if (unlikely(type != LAST_NORM)) {
				2493	path_put(path);
				2494	putname(filename);
				2495	return ERR_PTR(-EINVAL);
				2496	}
				2497	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
				2498	d = __lookup_hash(&last, path->dentry, 0);
				2499	if (IS_ERR(d)) {
				2500	inode_unlock(path->dentry->d_inode);
				2501	path_put(path);
				2502	}
				2503	putname(filename);
				2504	return d;
				2505	}
				2506
				2507	int kern_path(const char name, unsigned int flags, struct path path)
				2508	{
				2509	return filename_lookup(AT_FDCWD, getname_kernel(name),
				2510	flags, path, NULL);
				2511	}
				2512	EXPORT_SYMBOL(kern_path);
				2513
				2514	/**
				2515	* vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
				2516	* @dentry: pointer to dentry of the base directory
				2517	* @mnt: pointer to vfs mount of the base directory
				2518	* @name: pointer to file name
				2519	* @flags: lookup flags
				2520	* @path: pointer to struct path to fill
				2521	*/
				2522	int vfs_path_lookup(struct dentry dentry, struct vfsmount mnt,
				2523	const char *name, unsigned int flags,
				2524	struct path *path)
				2525	{
				2526	struct path root = {.mnt = mnt, .dentry = dentry};
				2527	/* the first argument of filename_lookup() is ignored with root */
				2528	return filename_lookup(AT_FDCWD, getname_kernel(name),
				2529	flags , path, &root);
				2530	}
				2531	EXPORT_SYMBOL(vfs_path_lookup);
				2532
				2533	static int lookup_one_len_common(const char name, struct dentry base,
				2534	int len, struct qstr *this)
				2535	{
				2536	this->name = name;
				2537	this->len = len;
				2538	this->hash = full_name_hash(base, name, len);
				2539	if (!len)
				2540	return -EACCES;
				2541
				2542	if (unlikely(name[0] == '.')) {
				2543	if (len < 2 \|\| (len == 2 && name[1] == '.'))
				2544	return -EACCES;
				2545	}
				2546
				2547	while (len--) {
				2548	unsigned int c = (const unsigned char )name++;
				2549	if (c == '/' \|\| c == '\0')
				2550	return -EACCES;
				2551	}
				2552	/*
				2553	* See if the low-level filesystem might want
				2554	* to use its own hash..
				2555	*/
				2556	if (base->d_flags & DCACHE_OP_HASH) {
				2557	int err = base->d_op->d_hash(base, this);
				2558	if (err < 0)
				2559	return err;
				2560	}
				2561
				2562	return inode_permission(base->d_inode, MAY_EXEC);
				2563	}
				2564
				2565	/**
				2566	* try_lookup_one_len - filesystem helper to lookup single pathname component
				2567	* @name: pathname component to lookup
				2568	* @base: base directory to lookup from
				2569	* @len: maximum length @len should be interpreted to
				2570	*
				2571	* Look up a dentry by name in the dcache, returning NULL if it does not
				2572	* currently exist. The function does not try to create a dentry.
				2573	*
				2574	* Note that this routine is purely a helper for filesystem usage and should
				2575	* not be called by generic code.
				2576	*
				2577	* The caller must hold base->i_mutex.
				2578	*/
				2579	struct dentry try_lookup_one_len(const char name, struct dentry *base, int len)
				2580	{
				2581	struct qstr this;
				2582	int err;
				2583
				2584	WARN_ON_ONCE(!inode_is_locked(base->d_inode));
				2585
				2586	err = lookup_one_len_common(name, base, len, &this);
				2587	if (err)
				2588	return ERR_PTR(err);
				2589
				2590	return lookup_dcache(&this, base, 0);
				2591	}
				2592	EXPORT_SYMBOL(try_lookup_one_len);
				2593
				2594	/**
				2595	* lookup_one_len - filesystem helper to lookup single pathname component
				2596	* @name: pathname component to lookup
				2597	* @base: base directory to lookup from
				2598	* @len: maximum length @len should be interpreted to
				2599	*
				2600	* Note that this routine is purely a helper for filesystem usage and should
				2601	* not be called by generic code.
				2602	*
				2603	* The caller must hold base->i_mutex.
				2604	*/
				2605	struct dentry lookup_one_len(const char name, struct dentry *base, int len)
				2606	{
				2607	struct dentry *dentry;
				2608	struct qstr this;
				2609	int err;
				2610
				2611	WARN_ON_ONCE(!inode_is_locked(base->d_inode));
				2612
				2613	err = lookup_one_len_common(name, base, len, &this);
				2614	if (err)
				2615	return ERR_PTR(err);
				2616
				2617	dentry = lookup_dcache(&this, base, 0);
				2618	return dentry ? dentry : __lookup_slow(&this, base, 0);
				2619	}
				2620	EXPORT_SYMBOL(lookup_one_len);
				2621
				2622	/**
				2623	* lookup_one_len_unlocked - filesystem helper to lookup single pathname component
				2624	* @name: pathname component to lookup
				2625	* @base: base directory to lookup from
				2626	* @len: maximum length @len should be interpreted to
				2627	*
				2628	* Note that this routine is purely a helper for filesystem usage and should
				2629	* not be called by generic code.
				2630	*
				2631	* Unlike lookup_one_len, it should be called without the parent
				2632	* i_mutex held, and will take the i_mutex itself if necessary.
				2633	*/
				2634	struct dentry lookup_one_len_unlocked(const char name,
				2635	struct dentry *base, int len)
				2636	{
				2637	struct qstr this;
				2638	int err;
				2639	struct dentry *ret;
				2640
				2641	err = lookup_one_len_common(name, base, len, &this);
				2642	if (err)
				2643	return ERR_PTR(err);
				2644
				2645	ret = lookup_dcache(&this, base, 0);
				2646	if (!ret)
				2647	ret = lookup_slow(&this, base, 0);
				2648	return ret;
				2649	}
				2650	EXPORT_SYMBOL(lookup_one_len_unlocked);
				2651
				2652	/*
				2653	* Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT)
				2654	* on negatives. Returns known positive or ERR_PTR(); that's what
				2655	* most of the users want. Note that pinned negative with unlocked parent
				2656	* _can_ become positive at any time, so callers of lookup_one_len_unlocked()
				2657	* need to be very careful; pinned positives have ->d_inode stable, so
				2658	* this one avoids such problems.
				2659	*/
				2660	struct dentry lookup_positive_unlocked(const char name,
				2661	struct dentry *base, int len)
				2662	{
				2663	struct dentry *ret = lookup_one_len_unlocked(name, base, len);
				2664	if (!IS_ERR(ret) && d_is_negative(ret)) {
				2665	dput(ret);
				2666	ret = ERR_PTR(-ENOENT);
				2667	}
				2668	return ret;
				2669	}
				2670	EXPORT_SYMBOL(lookup_positive_unlocked);
				2671
				2672	#ifdef CONFIG_UNIX98_PTYS
				2673	int path_pts(struct path *path)
				2674	{
				2675	/* Find something mounted on "pts" in the same directory as
				2676	* the input path.
				2677	*/
				2678	struct dentry child, parent;
				2679	struct qstr this;
				2680	int ret;
				2681
				2682	ret = path_parent_directory(path);
				2683	if (ret)
				2684	return ret;
				2685
				2686	parent = path->dentry;
				2687	this.name = "pts";
				2688	this.len = 3;
				2689	child = d_hash_and_lookup(parent, &this);
				2690	if (IS_ERR_OR_NULL(child))
				2691	return -ENOENT;
				2692
				2693	path->dentry = child;
				2694	dput(parent);
				2695	follow_mount(path);
				2696	return 0;
				2697	}
				2698	#endif
				2699
				2700	int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
				2701	struct path path, int empty)
				2702	{
				2703	return filename_lookup(dfd, getname_flags(name, flags, empty),
				2704	flags, path, NULL);
				2705	}
				2706	EXPORT_SYMBOL(user_path_at_empty);
				2707
				2708	/**
				2709	* mountpoint_last - look up last component for umount
				2710	* @nd: pathwalk nameidata - currently pointing at parent directory of "last"
				2711	*
				2712	* This is a special lookup_last function just for umount. In this case, we
				2713	* need to resolve the path without doing any revalidation.
				2714	*
				2715	* The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since
				2716	* mountpoints are always pinned in the dcache, their ancestors are too. Thus,
				2717	* in almost all cases, this lookup will be served out of the dcache. The only
				2718	* cases where it won't are if nd->last refers to a symlink or the path is
				2719	* bogus and it doesn't exist.
				2720	*
				2721	* Returns:
				2722	* -error: if there was an error during lookup. This includes -ENOENT if the
				2723	* lookup found a negative dentry.
				2724	*
				2725	* 0: if we successfully resolved nd->last and found it to not to be a
				2726	* symlink that needs to be followed.
				2727	*
				2728	* 1: if we successfully resolved nd->last and found it to be a symlink
				2729	* that needs to be followed.
				2730	*/
				2731	static int
				2732	mountpoint_last(struct nameidata *nd)
				2733	{
				2734	int error = 0;
				2735	struct dentry *dir = nd->path.dentry;
				2736	struct path path;
				2737
				2738	/* If we're in rcuwalk, drop out of it to handle last component */
				2739	if (nd->flags & LOOKUP_RCU) {
				2740	if (unlazy_walk(nd))
				2741	return -ECHILD;
				2742	}
				2743
				2744	nd->flags &= ~LOOKUP_PARENT;
				2745
				2746	if (unlikely(nd->last_type != LAST_NORM)) {
				2747	error = handle_dots(nd, nd->last_type);
				2748	if (error)
				2749	return error;
				2750	path.dentry = dget(nd->path.dentry);
				2751	} else {
				2752	path.dentry = d_lookup(dir, &nd->last);
				2753	if (!path.dentry) {
				2754	/*
				2755	* No cached dentry. Mounted dentries are pinned in the
				2756	* cache, so that means that this dentry is probably
				2757	* a symlink or the path doesn't actually point
				2758	* to a mounted dentry.
				2759	*/
				2760	path.dentry = lookup_slow(&nd->last, dir,
				2761	nd->flags \| LOOKUP_NO_REVAL);
				2762	if (IS_ERR(path.dentry))
				2763	return PTR_ERR(path.dentry);
				2764	}
				2765	}
				2766	if (d_is_negative(path.dentry)) {
				2767	dput(path.dentry);
				2768	return -ENOENT;
				2769	}
				2770	path.mnt = nd->path.mnt;
				2771	return step_into(nd, &path, 0, d_backing_inode(path.dentry), 0);
				2772	}
				2773
				2774	/**
				2775	* path_mountpoint - look up a path to be umounted
				2776	* @nd: lookup context
				2777	* @flags: lookup flags
				2778	* @path: pointer to container for result
				2779	*
				2780	* Look up the given name, but don't attempt to revalidate the last component.
				2781	* Returns 0 and "path" will be valid on success; Returns error otherwise.
				2782	*/
				2783	static int
				2784	path_mountpoint(struct nameidata nd, unsigned flags, struct path path)
				2785	{
				2786	const char *s = path_init(nd, flags);
				2787	int err;
				2788
				2789	while (!(err = link_path_walk(s, nd)) &&
				2790	(err = mountpoint_last(nd)) > 0) {
				2791	s = trailing_symlink(nd);
				2792	}
				2793	if (!err) {
				2794	*path = nd->path;
				2795	nd->path.mnt = NULL;
				2796	nd->path.dentry = NULL;
				2797	follow_mount(path);
				2798	}
				2799	terminate_walk(nd);
				2800	return err;
				2801	}
				2802
				2803	static int
				2804	filename_mountpoint(int dfd, struct filename name, struct path path,
				2805	unsigned int flags)
				2806	{
				2807	struct nameidata nd;
				2808	int error;
				2809	if (IS_ERR(name))
				2810	return PTR_ERR(name);
				2811	set_nameidata(&nd, dfd, name);
				2812	error = path_mountpoint(&nd, flags \| LOOKUP_RCU, path);
				2813	if (unlikely(error == -ECHILD))
				2814	error = path_mountpoint(&nd, flags, path);
				2815	if (unlikely(error == -ESTALE))
				2816	error = path_mountpoint(&nd, flags \| LOOKUP_REVAL, path);
				2817	if (likely(!error))
				2818	audit_inode(name, path->dentry, AUDIT_INODE_NOEVAL);
				2819	restore_nameidata();
				2820	putname(name);
				2821	return error;
				2822	}
				2823
				2824	/**
				2825	* user_path_mountpoint_at - lookup a path from userland in order to umount it
				2826	* @dfd: directory file descriptor
				2827	* @name: pathname from userland
				2828	* @flags: lookup flags
				2829	* @path: pointer to container to hold result
				2830	*
				2831	* A umount is a special case for path walking. We're not actually interested
				2832	* in the inode in this situation, and ESTALE errors can be a problem. We
				2833	* simply want track down the dentry and vfsmount attached at the mountpoint
				2834	* and avoid revalidating the last component.
				2835	*
				2836	* Returns 0 and populates "path" on success.
				2837	*/
				2838	int
				2839	user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags,
				2840	struct path *path)
				2841	{
				2842	return filename_mountpoint(dfd, getname(name), path, flags);
				2843	}
				2844
				2845	int
				2846	kern_path_mountpoint(int dfd, const char name, struct path path,
				2847	unsigned int flags)
				2848	{
				2849	return filename_mountpoint(dfd, getname_kernel(name), path, flags);
				2850	}
				2851	EXPORT_SYMBOL(kern_path_mountpoint);
				2852
				2853	int __check_sticky(struct inode dir, struct inode inode)
				2854	{
				2855	kuid_t fsuid = current_fsuid();
				2856
				2857	if (uid_eq(inode->i_uid, fsuid))
				2858	return 0;
				2859	if (uid_eq(dir->i_uid, fsuid))
				2860	return 0;
				2861	return !capable_wrt_inode_uidgid(inode, CAP_FOWNER);
				2862	}
				2863	EXPORT_SYMBOL(__check_sticky);
				2864
				2865	/*
				2866	* Check whether we can remove a link victim from directory dir, check
				2867	* whether the type of victim is right.
				2868	* 1. We can't do it if dir is read-only (done in permission())
				2869	* 2. We should have write and exec permissions on dir
				2870	* 3. We can't remove anything from append-only dir
				2871	* 4. We can't do anything with immutable dir (done in permission())
				2872	* 5. If the sticky bit on dir is set we should either
				2873	* a. be owner of dir, or
				2874	* b. be owner of victim, or
				2875	* c. have CAP_FOWNER capability
				2876	* 6. If the victim is append-only or immutable we can't do antyhing with
				2877	* links pointing to it.
				2878	* 7. If the victim has an unknown uid or gid we can't change the inode.
				2879	* 8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
				2880	* 9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
				2881	* 10. We can't remove a root or mountpoint.
				2882	* 11. We don't allow removal of NFS sillyrenamed files; it's handled by
				2883	* nfs_async_unlink().
				2884	*/
				2885	static int may_delete(struct inode dir, struct dentry victim, bool isdir)
				2886	{
				2887	struct inode *inode = d_backing_inode(victim);
				2888	int error;
				2889
				2890	if (d_is_negative(victim))
				2891	return -ENOENT;
				2892	BUG_ON(!inode);
				2893
				2894	BUG_ON(victim->d_parent->d_inode != dir);
				2895
				2896	/* Inode writeback is not safe when the uid or gid are invalid. */
				2897	if (!uid_valid(inode->i_uid) \|\| !gid_valid(inode->i_gid))
				2898	return -EOVERFLOW;
				2899
				2900	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
				2901
				2902	error = inode_permission(dir, MAY_WRITE \| MAY_EXEC);
				2903	if (error)
				2904	return error;
				2905	if (IS_APPEND(dir))
				2906	return -EPERM;
				2907
				2908	if (check_sticky(dir, inode) \|\| IS_APPEND(inode) \|\|
				2909	IS_IMMUTABLE(inode) \|\| IS_SWAPFILE(inode) \|\| HAS_UNMAPPED_ID(inode))
				2910	return -EPERM;
				2911	if (isdir) {
				2912	if (!d_is_dir(victim))
				2913	return -ENOTDIR;
				2914	if (IS_ROOT(victim))
				2915	return -EBUSY;
				2916	} else if (d_is_dir(victim))
				2917	return -EISDIR;
				2918	if (IS_DEADDIR(dir))
				2919	return -ENOENT;
				2920	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
				2921	return -EBUSY;
				2922	return 0;
				2923	}
				2924
				2925	/* Check whether we can create an object with dentry child in directory
				2926	* dir.
				2927	* 1. We can't do it if child already exists (open has special treatment for
				2928	* this case, but since we are inlined it's OK)
				2929	* 2. We can't do it if dir is read-only (done in permission())
				2930	* 3. We can't do it if the fs can't represent the fsuid or fsgid.
				2931	* 4. We should have write and exec permissions on dir
				2932	* 5. We can't do it if dir is immutable (done in permission())
				2933	*/
				2934	static inline int may_create(struct inode dir, struct dentry child)
				2935	{
				2936	struct user_namespace *s_user_ns;
				2937	audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
				2938	if (child->d_inode)
				2939	return -EEXIST;
				2940	if (IS_DEADDIR(dir))
				2941	return -ENOENT;
				2942	s_user_ns = dir->i_sb->s_user_ns;
				2943	if (!kuid_has_mapping(s_user_ns, current_fsuid()) \|\|
				2944	!kgid_has_mapping(s_user_ns, current_fsgid()))
				2945	return -EOVERFLOW;
				2946	return inode_permission(dir, MAY_WRITE \| MAY_EXEC);
				2947	}
				2948
				2949	/*
				2950	* p1 and p2 should be directories on the same fs.
				2951	*/
				2952	struct dentry lock_rename(struct dentry p1, struct dentry *p2)
				2953	{
				2954	struct dentry *p;
				2955
				2956	if (p1 == p2) {
				2957	inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
				2958	return NULL;
				2959	}
				2960
				2961	mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
				2962
				2963	p = d_ancestor(p2, p1);
				2964	if (p) {
				2965	inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
				2966	inode_lock_nested(p1->d_inode, I_MUTEX_PARENT2);
				2967	return p;
				2968	}
				2969
				2970	p = d_ancestor(p1, p2);
				2971	inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
				2972	inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
				2973	return p;
				2974	}
				2975	EXPORT_SYMBOL(lock_rename);
				2976
				2977	void unlock_rename(struct dentry p1, struct dentry p2)
				2978	{
				2979	inode_unlock(p1->d_inode);
				2980	if (p1 != p2) {
				2981	inode_unlock(p2->d_inode);
				2982	mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
				2983	}
				2984	}
				2985	EXPORT_SYMBOL(unlock_rename);
				2986
				2987	/**
				2988	* mode_strip_umask - handle vfs umask stripping
				2989	* @dir: parent directory of the new inode
				2990	* @mode: mode of the new inode to be created in @dir
				2991	*
				2992	* Umask stripping depends on whether or not the filesystem supports POSIX
				2993	* ACLs. If the filesystem doesn't support it umask stripping is done directly
				2994	* in here. If the filesystem does support POSIX ACLs umask stripping is
				2995	* deferred until the filesystem calls posix_acl_create().
				2996	*
				2997	* Returns: mode
				2998	*/
				2999	static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode)
				3000	{
				3001	if (!IS_POSIXACL(dir))
				3002	mode &= ~current_umask();
				3003	return mode;
				3004	}
				3005
				3006	/**
				3007	* vfs_prepare_mode - prepare the mode to be used for a new inode
				3008	* @dir: parent directory of the new inode
				3009	* @mode: mode of the new inode
				3010	* @mask_perms: allowed permission by the vfs
				3011	* @type: type of file to be created
				3012	*
				3013	* This helper consolidates and enforces vfs restrictions on the @mode of a new
				3014	* object to be created.
				3015	*
				3016	* Umask stripping depends on whether the filesystem supports POSIX ACLs (see
				3017	* the kernel documentation for mode_strip_umask()). Moving umask stripping
				3018	* after setgid stripping allows the same ordering for both non-POSIX ACL and
				3019	* POSIX ACL supporting filesystems.
				3020	*
				3021	* Note that it's currently valid for @type to be 0 if a directory is created.
				3022	* Filesystems raise that flag individually and we need to check whether each
				3023	* filesystem can deal with receiving S_IFDIR from the vfs before we enforce a
				3024	* non-zero type.
				3025	*
				3026	* Returns: mode to be passed to the filesystem
				3027	*/
				3028	static inline umode_t vfs_prepare_mode(const struct inode *dir, umode_t mode,
				3029	umode_t mask_perms, umode_t type)
				3030	{
				3031	mode = mode_strip_sgid(dir, mode);
				3032	mode = mode_strip_umask(dir, mode);
				3033
				3034	/*
				3035	* Apply the vfs mandated allowed permission mask and set the type of
				3036	* file to be created before we call into the filesystem.
				3037	*/
				3038	mode &= (mask_perms & ~S_IFMT);
				3039	mode \|= (type & S_IFMT);
				3040
				3041	return mode;
				3042	}
				3043
				3044	int vfs_create(struct inode dir, struct dentry dentry, umode_t mode,
				3045	bool want_excl)
				3046	{
				3047	int error = may_create(dir, dentry);
				3048	if (error)
				3049	return error;
				3050
				3051	if (!dir->i_op->create)
				3052	return -EACCES; /* shouldn't it be ENOSYS? */
				3053
				3054	mode = vfs_prepare_mode(dir, mode, S_IALLUGO, S_IFREG);
				3055	error = security_inode_create(dir, dentry, mode);
				3056	if (error)
				3057	return error;
				3058	error = dir->i_op->create(dir, dentry, mode, want_excl);
				3059	if (!error)
				3060	fsnotify_create(dir, dentry);
				3061	return error;
				3062	}
				3063	EXPORT_SYMBOL(vfs_create);
				3064
				3065	int vfs_mkobj(struct dentry *dentry, umode_t mode,
				3066	int (f)(struct dentry , umode_t, void *),
				3067	void *arg)
				3068	{
				3069	struct inode *dir = dentry->d_parent->d_inode;
				3070	int error = may_create(dir, dentry);
				3071	if (error)
				3072	return error;
				3073
				3074	mode &= S_IALLUGO;
				3075	mode \|= S_IFREG;
				3076	error = security_inode_create(dir, dentry, mode);
				3077	if (error)
				3078	return error;
				3079	error = f(dentry, mode, arg);
				3080	if (!error)
				3081	fsnotify_create(dir, dentry);
				3082	return error;
				3083	}
				3084	EXPORT_SYMBOL(vfs_mkobj);
				3085
				3086	bool may_open_dev(const struct path *path)
				3087	{
				3088	return !(path->mnt->mnt_flags & MNT_NODEV) &&
				3089	!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
				3090	}
				3091
				3092	static int may_open(const struct path *path, int acc_mode, int flag)
				3093	{
				3094	struct dentry *dentry = path->dentry;
				3095	struct inode *inode = dentry->d_inode;
				3096	int error;
				3097
				3098	if (!inode)
				3099	return -ENOENT;
				3100
				3101	switch (inode->i_mode & S_IFMT) {
				3102	case S_IFLNK:
				3103	return -ELOOP;
				3104	case S_IFDIR:
				3105	if (acc_mode & MAY_WRITE)
				3106	return -EISDIR;
				3107	break;
				3108	case S_IFBLK:
				3109	case S_IFCHR:
				3110	if (!may_open_dev(path))
				3111	return -EACCES;
				3112	/FALLTHRU/
				3113	case S_IFIFO:
				3114	case S_IFSOCK:
				3115	flag &= ~O_TRUNC;
				3116	break;
				3117	}
				3118
				3119	error = inode_permission(inode, MAY_OPEN \| acc_mode);
				3120	if (error)
				3121	return error;
				3122
				3123	/*
				3124	* An append-only file must be opened in append mode for writing.
				3125	*/
				3126	if (IS_APPEND(inode)) {
				3127	if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
				3128	return -EPERM;
				3129	if (flag & O_TRUNC)
				3130	return -EPERM;
				3131	}
				3132
				3133	/* O_NOATIME can only be set by the owner or superuser */
				3134	if (flag & O_NOATIME && !inode_owner_or_capable(inode))
				3135	return -EPERM;
				3136
				3137	return 0;
				3138	}
				3139
				3140	static int handle_truncate(struct file *filp)
				3141	{
				3142	const struct path *path = &filp->f_path;
				3143	struct inode *inode = path->dentry->d_inode;
				3144	int error = get_write_access(inode);
				3145	if (error)
				3146	return error;
				3147	/*
				3148	* Refuse to truncate files with mandatory locks held on them.
				3149	*/
				3150	error = locks_verify_locked(filp);
				3151	if (!error)
				3152	error = security_path_truncate(path);
				3153	if (!error) {
				3154	error = do_truncate(path->dentry, 0,
				3155	ATTR_MTIME\|ATTR_CTIME\|ATTR_OPEN,
				3156	filp);
				3157	}
				3158	put_write_access(inode);
				3159	return error;
				3160	}
				3161
				3162	static inline int open_to_namei_flags(int flag)
				3163	{
				3164	if ((flag & O_ACCMODE) == 3)
				3165	flag--;
				3166	return flag;
				3167	}
				3168
				3169	static int may_o_create(const struct path dir, struct dentry dentry, umode_t mode)
				3170	{
				3171	struct user_namespace *s_user_ns;
				3172	int error = security_path_mknod(dir, dentry, mode, 0);
				3173	if (error)
				3174	return error;
				3175
				3176	s_user_ns = dir->dentry->d_sb->s_user_ns;
				3177	if (!kuid_has_mapping(s_user_ns, current_fsuid()) \|\|
				3178	!kgid_has_mapping(s_user_ns, current_fsgid()))
				3179	return -EOVERFLOW;
				3180
				3181	error = inode_permission(dir->dentry->d_inode, MAY_WRITE \| MAY_EXEC);
				3182	if (error)
				3183	return error;
				3184
				3185	return security_inode_create(dir->dentry->d_inode, dentry, mode);
				3186	}
				3187
				3188	/*
				3189	* Attempt to atomically look up, create and open a file from a negative
				3190	* dentry.
				3191	*
				3192	* Returns 0 if successful. The file will have been created and attached to
				3193	* @file by the filesystem calling finish_open().
				3194	*
				3195	* If the file was looked up only or didn't need creating, FMODE_OPENED won't
				3196	* be set. The caller will need to perform the open themselves. @path will
				3197	* have been updated to point to the new dentry. This may be negative.
				3198	*
				3199	* Returns an error code otherwise.
				3200	*/
				3201	static int atomic_open(struct nameidata nd, struct dentry dentry,
				3202	struct path path, struct file file,
				3203	const struct open_flags *op,
				3204	int open_flag, umode_t mode)
				3205	{
				3206	struct dentry const DENTRY_NOT_SET = (void ) -1UL;
				3207	struct inode *dir = nd->path.dentry->d_inode;
				3208	int error;
				3209
				3210	if (!(~open_flag & (O_EXCL \| O_CREAT))) /* both O_EXCL and O_CREAT */
				3211	open_flag &= ~O_TRUNC;
				3212
				3213	if (nd->flags & LOOKUP_DIRECTORY)
				3214	open_flag \|= O_DIRECTORY;
				3215
				3216	file->f_path.dentry = DENTRY_NOT_SET;
				3217	file->f_path.mnt = nd->path.mnt;
				3218	error = dir->i_op->atomic_open(dir, dentry, file,
				3219	open_to_namei_flags(open_flag), mode);
				3220	d_lookup_done(dentry);
				3221	if (!error) {
				3222	if (file->f_mode & FMODE_OPENED) {
				3223	/*
				3224	* We didn't have the inode before the open, so check open
				3225	* permission here.
				3226	*/
				3227	int acc_mode = op->acc_mode;
				3228	if (file->f_mode & FMODE_CREATED) {
				3229	WARN_ON(!(open_flag & O_CREAT));
				3230	fsnotify_create(dir, dentry);
				3231	acc_mode = 0;
				3232	}
				3233	error = may_open(&file->f_path, acc_mode, open_flag);
				3234	if (WARN_ON(error > 0))
				3235	error = -EINVAL;
				3236	} else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
				3237	error = -EIO;
				3238	} else {
				3239	if (file->f_path.dentry) {
				3240	dput(dentry);
				3241	dentry = file->f_path.dentry;
				3242	}
				3243	if (file->f_mode & FMODE_CREATED)
				3244	fsnotify_create(dir, dentry);
				3245	if (unlikely(d_is_negative(dentry))) {
				3246	error = -ENOENT;
				3247	} else {
				3248	path->dentry = dentry;
				3249	path->mnt = nd->path.mnt;
				3250	return 0;
				3251	}
				3252	}
				3253	}
				3254	dput(dentry);
				3255	return error;
				3256	}
				3257
				3258	/*
				3259	* Look up and maybe create and open the last component.
				3260	*
				3261	* Must be called with parent locked (exclusive in O_CREAT case).
				3262	*
				3263	* Returns 0 on success, that is, if
				3264	* the file was successfully atomically created (if necessary) and opened, or
				3265	* the file was not completely opened at this time, though lookups and
				3266	* creations were performed.
				3267	* These case are distinguished by presence of FMODE_OPENED on file->f_mode.
				3268	* In the latter case dentry returned in @path might be negative if O_CREAT
				3269	* hadn't been specified.
				3270	*
				3271	* An error code is returned on failure.
				3272	*/
				3273	static int lookup_open(struct nameidata nd, struct path path,
				3274	struct file *file,
				3275	const struct open_flags *op,
				3276	bool got_write)
				3277	{
				3278	struct dentry *dir = nd->path.dentry;
				3279	struct inode *dir_inode = dir->d_inode;
				3280	int open_flag = op->open_flag;
				3281	struct dentry *dentry;
				3282	int error, create_error = 0;
				3283	umode_t mode = op->mode;
				3284	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
				3285
				3286	if (unlikely(IS_DEADDIR(dir_inode)))
				3287	return -ENOENT;
				3288
				3289	file->f_mode &= ~FMODE_CREATED;
				3290	dentry = d_lookup(dir, &nd->last);
				3291	for (;;) {
				3292	if (!dentry) {
				3293	dentry = d_alloc_parallel(dir, &nd->last, &wq);
				3294	if (IS_ERR(dentry))
				3295	return PTR_ERR(dentry);
				3296	}
				3297	if (d_in_lookup(dentry))
				3298	break;
				3299
				3300	error = d_revalidate(dentry, nd->flags);
				3301	if (likely(error > 0))
				3302	break;
				3303	if (error)
				3304	goto out_dput;
				3305	d_invalidate(dentry);
				3306	dput(dentry);
				3307	dentry = NULL;
				3308	}
				3309	if (dentry->d_inode) {
				3310	/* Cached positive dentry: will open in f_op->open */
				3311	goto out_no_open;
				3312	}
				3313
				3314	/*
				3315	* Checking write permission is tricky, bacuse we don't know if we are
				3316	* going to actually need it: O_CREAT opens should work as long as the
				3317	* file exists. But checking existence breaks atomicity. The trick is
				3318	* to check access and if not granted clear O_CREAT from the flags.
				3319	*
				3320	* Another problem is returing the "right" error value (e.g. for an
				3321	* O_EXCL open we want to return EEXIST not EROFS).
				3322	*/
				3323	if (open_flag & O_CREAT) {
				3324	mode = vfs_prepare_mode(dir->d_inode, mode, mode, mode);
				3325	if (unlikely(!got_write)) {
				3326	create_error = -EROFS;
				3327	open_flag &= ~O_CREAT;
				3328	if (open_flag & (O_EXCL \| O_TRUNC))
				3329	goto no_open;
				3330	/* No side effects, safe to clear O_CREAT */
				3331	} else {
				3332	create_error = may_o_create(&nd->path, dentry, mode);
				3333	if (create_error) {
				3334	open_flag &= ~O_CREAT;
				3335	if (open_flag & O_EXCL)
				3336	goto no_open;
				3337	}
				3338	}
				3339	} else if ((open_flag & (O_TRUNC\|O_WRONLY\|O_RDWR)) &&
				3340	unlikely(!got_write)) {
				3341	/*
				3342	* No O_CREATE -> atomicity not a requirement -> fall
				3343	* back to lookup + open
				3344	*/
				3345	goto no_open;
				3346	}
				3347
				3348	if (dir_inode->i_op->atomic_open) {
				3349	error = atomic_open(nd, dentry, path, file, op, open_flag,
				3350	mode);
				3351	if (unlikely(error == -ENOENT) && create_error)
				3352	error = create_error;
				3353	return error;
				3354	}
				3355
				3356	no_open:
				3357	if (d_in_lookup(dentry)) {
				3358	struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
				3359	nd->flags);
				3360	d_lookup_done(dentry);
				3361	if (unlikely(res)) {
				3362	if (IS_ERR(res)) {
				3363	error = PTR_ERR(res);
				3364	goto out_dput;
				3365	}
				3366	dput(dentry);
				3367	dentry = res;
				3368	}
				3369	}
				3370
				3371	/* Negative dentry, just create the file */
				3372	if (!dentry->d_inode && (open_flag & O_CREAT)) {
				3373	file->f_mode \|= FMODE_CREATED;
				3374	audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
				3375	if (!dir_inode->i_op->create) {
				3376	error = -EACCES;
				3377	goto out_dput;
				3378	}
				3379	error = dir_inode->i_op->create(dir_inode, dentry, mode,
				3380	open_flag & O_EXCL);
				3381	if (error)
				3382	goto out_dput;
				3383	fsnotify_create(dir_inode, dentry);
				3384	}
				3385	if (unlikely(create_error) && !dentry->d_inode) {
				3386	error = create_error;
				3387	goto out_dput;
				3388	}
				3389	out_no_open:
				3390	path->dentry = dentry;
				3391	path->mnt = nd->path.mnt;
				3392	return 0;
				3393
				3394	out_dput:
				3395	dput(dentry);
				3396	return error;
				3397	}
				3398
				3399	/*
				3400	* Handle the last step of open()
				3401	*/
				3402	static int do_last(struct nameidata *nd,
				3403	struct file file, const struct open_flags op)
				3404	{
				3405	struct dentry *dir = nd->path.dentry;
				3406	kuid_t dir_uid = nd->inode->i_uid;
				3407	umode_t dir_mode = nd->inode->i_mode;
				3408	int open_flag = op->open_flag;
				3409	bool will_truncate = (open_flag & O_TRUNC) != 0;
				3410	bool got_write = false;
				3411	int acc_mode = op->acc_mode;
				3412	unsigned seq;
				3413	struct inode *inode;
				3414	struct path path;
				3415	int error;
				3416
				3417	nd->flags &= ~LOOKUP_PARENT;
				3418	nd->flags \|= op->intent;
				3419
				3420	if (nd->last_type != LAST_NORM) {
				3421	error = handle_dots(nd, nd->last_type);
				3422	if (unlikely(error))
				3423	return error;
				3424	goto finish_open;
				3425	}
				3426
				3427	if (!(open_flag & O_CREAT)) {
				3428	if (nd->last.name[nd->last.len])
				3429	nd->flags \|= LOOKUP_FOLLOW \| LOOKUP_DIRECTORY;
				3430	/* we _can_ be in RCU mode here */
				3431	error = lookup_fast(nd, &path, &inode, &seq);
				3432	if (likely(error > 0))
				3433	goto finish_lookup;
				3434
				3435	if (error < 0)
				3436	return error;
				3437
				3438	BUG_ON(nd->inode != dir->d_inode);
				3439	BUG_ON(nd->flags & LOOKUP_RCU);
				3440	} else {
				3441	/* create side of things */
				3442	/*
				3443	* This will only deal with leaving RCU mode - LOOKUP_JUMPED
				3444	* has been cleared when we got to the last component we are
				3445	* about to look up
				3446	*/
				3447	error = complete_walk(nd);
				3448	if (error)
				3449	return error;
				3450
				3451	audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
				3452	/* trailing slashes? */
				3453	if (unlikely(nd->last.name[nd->last.len]))
				3454	return -EISDIR;
				3455	}
				3456
				3457	if (open_flag & (O_CREAT \| O_TRUNC \| O_WRONLY \| O_RDWR)) {
				3458	error = mnt_want_write(nd->path.mnt);
				3459	if (!error)
				3460	got_write = true;
				3461	/*
				3462	* do _not_ fail yet - we might not need that or fail with
				3463	* a different error; let lookup_open() decide; we'll be
				3464	* dropping this one anyway.
				3465	*/
				3466	}
				3467	if (open_flag & O_CREAT)
				3468	inode_lock(dir->d_inode);
				3469	else
				3470	inode_lock_shared(dir->d_inode);
				3471	error = lookup_open(nd, &path, file, op, got_write);
				3472	if (open_flag & O_CREAT)
				3473	inode_unlock(dir->d_inode);
				3474	else
				3475	inode_unlock_shared(dir->d_inode);
				3476
				3477	if (error)
				3478	goto out;
				3479
				3480	if (file->f_mode & FMODE_OPENED) {
				3481	if ((file->f_mode & FMODE_CREATED) \|\|
				3482	!S_ISREG(file_inode(file)->i_mode))
				3483	will_truncate = false;
				3484
				3485	audit_inode(nd->name, file->f_path.dentry, 0);
				3486	goto opened;
				3487	}
				3488
				3489	if (file->f_mode & FMODE_CREATED) {
				3490	/* Don't check for write permission, don't truncate */
				3491	open_flag &= ~O_TRUNC;
				3492	will_truncate = false;
				3493	acc_mode = 0;
				3494	path_to_nameidata(&path, nd);
				3495	goto finish_open_created;
				3496	}
				3497
				3498	/*
				3499	* If atomic_open() acquired write access it is dropped now due to
				3500	* possible mount and symlink following (this might be optimized away if
				3501	* necessary...)
				3502	*/
				3503	if (got_write) {
				3504	mnt_drop_write(nd->path.mnt);
				3505	got_write = false;
				3506	}
				3507
				3508	error = follow_managed(&path, nd);
				3509	if (unlikely(error < 0))
				3510	return error;
				3511
				3512	if (unlikely(d_is_negative(path.dentry))) {
				3513	path_to_nameidata(&path, nd);
				3514	return -ENOENT;
				3515	}
				3516
				3517	/*
				3518	* create/update audit record if it already exists.
				3519	*/
				3520	audit_inode(nd->name, path.dentry, 0);
				3521
				3522	if (unlikely((open_flag & (O_EXCL \| O_CREAT)) == (O_EXCL \| O_CREAT))) {
				3523	path_to_nameidata(&path, nd);
				3524	return -EEXIST;
				3525	}
				3526
				3527	seq = 0; /* out of RCU mode, so the value doesn't matter */
				3528	inode = d_backing_inode(path.dentry);
				3529	finish_lookup:
				3530	error = step_into(nd, &path, 0, inode, seq);
				3531	if (unlikely(error))
				3532	return error;
				3533	finish_open:
				3534	/* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
				3535	error = complete_walk(nd);
				3536	if (error)
				3537	return error;
				3538	audit_inode(nd->name, nd->path.dentry, 0);
				3539	if (open_flag & O_CREAT) {
				3540	error = -EISDIR;
				3541	if (d_is_dir(nd->path.dentry))
				3542	goto out;
				3543	error = may_create_in_sticky(dir_mode, dir_uid,
				3544	d_backing_inode(nd->path.dentry));
				3545	if (unlikely(error))
				3546	goto out;
				3547	}
				3548	error = -ENOTDIR;
				3549	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
				3550	goto out;
				3551	if (!d_is_reg(nd->path.dentry))
				3552	will_truncate = false;
				3553
				3554	if (will_truncate) {
				3555	error = mnt_want_write(nd->path.mnt);
				3556	if (error)
				3557	goto out;
				3558	got_write = true;
				3559	}
				3560	finish_open_created:
				3561	error = may_open(&nd->path, acc_mode, open_flag);
				3562	if (error)
				3563	goto out;
				3564	BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */
				3565	error = vfs_open(&nd->path, file);
				3566	if (error)
				3567	goto out;
				3568	opened:
				3569	error = ima_file_check(file, op->acc_mode);
				3570	if (!error && will_truncate)
				3571	error = handle_truncate(file);
				3572	out:
				3573	if (unlikely(error > 0)) {
				3574	WARN_ON(1);
				3575	error = -EINVAL;
				3576	}
				3577	if (got_write)
				3578	mnt_drop_write(nd->path.mnt);
				3579	return error;
				3580	}
				3581
				3582	struct dentry vfs_tmpfile(struct dentry dentry, umode_t mode, int open_flag)
				3583	{
				3584	struct dentry *child = NULL;
				3585	struct inode *dir = dentry->d_inode;
				3586	struct inode *inode;
				3587	int error;
				3588
				3589	/* we want directory to be writable */
				3590	error = inode_permission(dir, MAY_WRITE \| MAY_EXEC);
				3591	if (error)
				3592	goto out_err;
				3593	error = -EOPNOTSUPP;
				3594	if (!dir->i_op->tmpfile)
				3595	goto out_err;
				3596	error = -ENOMEM;
				3597	child = d_alloc(dentry, &slash_name);
				3598	if (unlikely(!child))
				3599	goto out_err;
				3600	mode = vfs_prepare_mode(dir, mode, mode, mode);
				3601	error = dir->i_op->tmpfile(dir, child, mode);
				3602	if (error)
				3603	goto out_err;
				3604	error = -ENOENT;
				3605	inode = child->d_inode;
				3606	if (unlikely(!inode))
				3607	goto out_err;
				3608	if (!(open_flag & O_EXCL)) {
				3609	spin_lock(&inode->i_lock);
				3610	inode->i_state \|= I_LINKABLE;
				3611	spin_unlock(&inode->i_lock);
				3612	}
				3613	ima_post_create_tmpfile(inode);
				3614	return child;
				3615
				3616	out_err:
				3617	dput(child);
				3618	return ERR_PTR(error);
				3619	}
				3620	EXPORT_SYMBOL(vfs_tmpfile);
				3621
				3622	static int do_tmpfile(struct nameidata *nd, unsigned flags,
				3623	const struct open_flags *op,
				3624	struct file *file)
				3625	{
				3626	struct dentry *child;
				3627	struct path path;
				3628	int error = path_lookupat(nd, flags \| LOOKUP_DIRECTORY, &path);
				3629	if (unlikely(error))
				3630	return error;
				3631	error = mnt_want_write(path.mnt);
				3632	if (unlikely(error))
				3633	goto out;
				3634	child = vfs_tmpfile(path.dentry, op->mode, op->open_flag);
				3635	error = PTR_ERR(child);
				3636	if (IS_ERR(child))
				3637	goto out2;
				3638	dput(path.dentry);
				3639	path.dentry = child;
				3640	audit_inode(nd->name, child, 0);
				3641	/* Don't check for other permissions, the inode was just created */
				3642	error = may_open(&path, 0, op->open_flag);
				3643	if (error)
				3644	goto out2;
				3645	file->f_path.mnt = path.mnt;
				3646	error = finish_open(file, child, NULL);
				3647	out2:
				3648	mnt_drop_write(path.mnt);
				3649	out:
				3650	path_put(&path);
				3651	return error;
				3652	}
				3653
				3654	static int do_o_path(struct nameidata nd, unsigned flags, struct file file)
				3655	{
				3656	struct path path;
				3657	int error = path_lookupat(nd, flags, &path);
				3658	if (!error) {
				3659	audit_inode(nd->name, path.dentry, 0);
				3660	error = vfs_open(&path, file);
				3661	path_put(&path);
				3662	}
				3663	return error;
				3664	}
				3665
				3666	static struct file path_openat(struct nameidata nd,
				3667	const struct open_flags *op, unsigned flags)
				3668	{
				3669	struct file *file;
				3670	int error;
				3671
				3672	file = alloc_empty_file(op->open_flag, current_cred());
				3673	if (IS_ERR(file))
				3674	return file;
				3675
				3676	if (unlikely(file->f_flags & __O_TMPFILE)) {
				3677	error = do_tmpfile(nd, flags, op, file);
				3678	} else if (unlikely(file->f_flags & O_PATH)) {
				3679	error = do_o_path(nd, flags, file);
				3680	} else {
				3681	const char *s = path_init(nd, flags);
				3682	while (!(error = link_path_walk(s, nd)) &&
				3683	(error = do_last(nd, file, op)) > 0) {
				3684	nd->flags &= ~(LOOKUP_OPEN\|LOOKUP_CREATE\|LOOKUP_EXCL);
				3685	s = trailing_symlink(nd);
				3686	}
				3687	terminate_walk(nd);
				3688	}
				3689	if (likely(!error)) {
				3690	if (likely(file->f_mode & FMODE_OPENED))
				3691	return file;
				3692	WARN_ON(1);
				3693	error = -EINVAL;
				3694	}
				3695	fput(file);
				3696	if (error == -EOPENSTALE) {
				3697	if (flags & LOOKUP_RCU)
				3698	error = -ECHILD;
				3699	else
				3700	error = -ESTALE;
				3701	}
				3702	return ERR_PTR(error);
				3703	}
				3704
				3705	struct file do_filp_open(int dfd, struct filename pathname,
				3706	const struct open_flags *op)
				3707	{
				3708	struct nameidata nd;
				3709	int flags = op->lookup_flags;
				3710	struct file *filp;
				3711
				3712	set_nameidata(&nd, dfd, pathname);
				3713	filp = path_openat(&nd, op, flags \| LOOKUP_RCU);
				3714	if (unlikely(filp == ERR_PTR(-ECHILD)))
				3715	filp = path_openat(&nd, op, flags);
				3716	if (unlikely(filp == ERR_PTR(-ESTALE)))
				3717	filp = path_openat(&nd, op, flags \| LOOKUP_REVAL);
				3718	restore_nameidata();
				3719	return filp;
				3720	}
				3721
				3722	struct file do_file_open_root(struct dentry dentry, struct vfsmount *mnt,
				3723	const char name, const struct open_flags op)
				3724	{
				3725	struct nameidata nd;
				3726	struct file *file;
				3727	struct filename *filename;
				3728	int flags = op->lookup_flags \| LOOKUP_ROOT;
				3729
				3730	nd.root.mnt = mnt;
				3731	nd.root.dentry = dentry;
				3732
				3733	if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
				3734	return ERR_PTR(-ELOOP);
				3735
				3736	filename = getname_kernel(name);
				3737	if (IS_ERR(filename))
				3738	return ERR_CAST(filename);
				3739
				3740	set_nameidata(&nd, -1, filename);
				3741	file = path_openat(&nd, op, flags \| LOOKUP_RCU);
				3742	if (unlikely(file == ERR_PTR(-ECHILD)))
				3743	file = path_openat(&nd, op, flags);
				3744	if (unlikely(file == ERR_PTR(-ESTALE)))
				3745	file = path_openat(&nd, op, flags \| LOOKUP_REVAL);
				3746	restore_nameidata();
				3747	putname(filename);
				3748	return file;
				3749	}
				3750
				3751	static struct dentry filename_create(int dfd, struct filename name,
				3752	struct path *path, unsigned int lookup_flags)
				3753	{
				3754	struct dentry *dentry = ERR_PTR(-EEXIST);
				3755	struct qstr last;
				3756	int type;
				3757	int err2;
				3758	int error;
				3759	bool is_dir = (lookup_flags & LOOKUP_DIRECTORY);
				3760
				3761	/*
				3762	* Note that only LOOKUP_REVAL and LOOKUP_DIRECTORY matter here. Any
				3763	* other flags passed in are ignored!
				3764	*/
				3765	lookup_flags &= LOOKUP_REVAL;
				3766
				3767	name = filename_parentat(dfd, name, lookup_flags, path, &last, &type);
				3768	if (IS_ERR(name))
				3769	return ERR_CAST(name);
				3770
				3771	/*
				3772	* Yucky last component or no last component at all?
				3773	* (foo/., foo/.., /////)
				3774	*/
				3775	if (unlikely(type != LAST_NORM))
				3776	goto out;
				3777
				3778	/* don't fail immediately if it's r/o, at least try to report other errors */
				3779	err2 = mnt_want_write(path->mnt);
				3780	/*
				3781	* Do the final lookup.
				3782	*/
				3783	lookup_flags \|= LOOKUP_CREATE \| LOOKUP_EXCL;
				3784	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
				3785	dentry = __lookup_hash(&last, path->dentry, lookup_flags);
				3786	if (IS_ERR(dentry))
				3787	goto unlock;
				3788
				3789	error = -EEXIST;
				3790	if (d_is_positive(dentry))
				3791	goto fail;
				3792
				3793	/*
				3794	* Special case - lookup gave negative, but... we had foo/bar/
				3795	* From the vfs_mknod() POV we just have a negative dentry -
				3796	* all is fine. Let's be bastards - you had / on the end, you've
				3797	* been asking for (non-existent) directory. -ENOENT for you.
				3798	*/
				3799	if (unlikely(!is_dir && last.name[last.len])) {
				3800	error = -ENOENT;
				3801	goto fail;
				3802	}
				3803	if (unlikely(err2)) {
				3804	error = err2;
				3805	goto fail;
				3806	}
				3807	putname(name);
				3808	return dentry;
				3809	fail:
				3810	dput(dentry);
				3811	dentry = ERR_PTR(error);
				3812	unlock:
				3813	inode_unlock(path->dentry->d_inode);
				3814	if (!err2)
				3815	mnt_drop_write(path->mnt);
				3816	out:
				3817	path_put(path);
				3818	putname(name);
				3819	return dentry;
				3820	}
				3821
				3822	struct dentry kern_path_create(int dfd, const char pathname,
				3823	struct path *path, unsigned int lookup_flags)
				3824	{
				3825	return filename_create(dfd, getname_kernel(pathname),
				3826	path, lookup_flags);
				3827	}
				3828	EXPORT_SYMBOL(kern_path_create);
				3829
				3830	void done_path_create(struct path path, struct dentry dentry)
				3831	{
				3832	dput(dentry);
				3833	inode_unlock(path->dentry->d_inode);
				3834	mnt_drop_write(path->mnt);
				3835	path_put(path);
				3836	}
				3837	EXPORT_SYMBOL(done_path_create);
				3838
				3839	inline struct dentry user_path_create(int dfd, const char __user pathname,
				3840	struct path *path, unsigned int lookup_flags)
				3841	{
				3842	return filename_create(dfd, getname(pathname), path, lookup_flags);
				3843	}
				3844	EXPORT_SYMBOL(user_path_create);
				3845
				3846	int vfs_mknod(struct inode dir, struct dentry dentry, umode_t mode, dev_t dev)
				3847	{
				3848	int error = may_create(dir, dentry);
				3849
				3850	if (error)
				3851	return error;
				3852
				3853	if ((S_ISCHR(mode) \|\| S_ISBLK(mode)) && !capable(CAP_MKNOD))
				3854	return -EPERM;
				3855
				3856	if (!dir->i_op->mknod)
				3857	return -EPERM;
				3858
				3859	mode = vfs_prepare_mode(dir, mode, mode, mode);
				3860	error = devcgroup_inode_mknod(mode, dev);
				3861	if (error)
				3862	return error;
				3863
				3864	error = security_inode_mknod(dir, dentry, mode, dev);
				3865	if (error)
				3866	return error;
				3867
				3868	error = dir->i_op->mknod(dir, dentry, mode, dev);
				3869	if (!error)
				3870	fsnotify_create(dir, dentry);
				3871	return error;
				3872	}
				3873	EXPORT_SYMBOL(vfs_mknod);
				3874
				3875	static int may_mknod(umode_t mode)
				3876	{
				3877	switch (mode & S_IFMT) {
				3878	case S_IFREG:
				3879	case S_IFCHR:
				3880	case S_IFBLK:
				3881	case S_IFIFO:
				3882	case S_IFSOCK:
				3883	case 0: /* zero mode translates to S_IFREG */
				3884	return 0;
				3885	case S_IFDIR:
				3886	return -EPERM;
				3887	default:
				3888	return -EINVAL;
				3889	}
				3890	}
				3891
				3892	long do_mknodat(int dfd, const char __user *filename, umode_t mode,
				3893	unsigned int dev)
				3894	{
				3895	struct dentry *dentry;
				3896	struct path path;
				3897	int error;
				3898	unsigned int lookup_flags = 0;
				3899
				3900	error = may_mknod(mode);
				3901	if (error)
				3902	return error;
				3903	retry:
				3904	dentry = user_path_create(dfd, filename, &path, lookup_flags);
				3905	if (IS_ERR(dentry))
				3906	return PTR_ERR(dentry);
				3907
				3908	error = security_path_mknod(&path, dentry,
				3909	mode_strip_umask(path.dentry->d_inode, mode), dev);
				3910	if (error)
				3911	goto out;
				3912	switch (mode & S_IFMT) {
				3913	case 0: case S_IFREG:
				3914	error = vfs_create(path.dentry->d_inode,dentry,mode,true);
				3915	if (!error)
				3916	ima_post_path_mknod(dentry);
				3917	break;
				3918	case S_IFCHR: case S_IFBLK:
				3919	error = vfs_mknod(path.dentry->d_inode,dentry,mode,
				3920	new_decode_dev(dev));
				3921	break;
				3922	case S_IFIFO: case S_IFSOCK:
				3923	error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
				3924	break;
				3925	}
				3926	out:
				3927	done_path_create(&path, dentry);
				3928	if (retry_estale(error, lookup_flags)) {
				3929	lookup_flags \|= LOOKUP_REVAL;
				3930	goto retry;
				3931	}
				3932	return error;
				3933	}
				3934
				3935	SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
				3936	unsigned int, dev)
				3937	{
				3938	return do_mknodat(dfd, filename, mode, dev);
				3939	}
				3940
				3941	SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
				3942	{
				3943	return do_mknodat(AT_FDCWD, filename, mode, dev);
				3944	}
				3945
				3946	int vfs_mkdir(struct inode dir, struct dentry dentry, umode_t mode)
				3947	{
				3948	int error = may_create(dir, dentry);
				3949	unsigned max_links = dir->i_sb->s_max_links;
				3950
				3951	if (error)
				3952	return error;
				3953
				3954	if (!dir->i_op->mkdir)
				3955	return -EPERM;
				3956
				3957	mode = vfs_prepare_mode(dir, mode, S_IRWXUGO \| S_ISVTX, 0);
				3958	error = security_inode_mkdir(dir, dentry, mode);
				3959	if (error)
				3960	return error;
				3961
				3962	if (max_links && dir->i_nlink >= max_links)
				3963	return -EMLINK;
				3964
				3965	error = dir->i_op->mkdir(dir, dentry, mode);
				3966	if (!error)
				3967	fsnotify_mkdir(dir, dentry);
				3968	return error;
				3969	}
				3970	EXPORT_SYMBOL(vfs_mkdir);
				3971
				3972	long do_mkdirat(int dfd, const char __user *pathname, umode_t mode)
				3973	{
				3974	struct dentry *dentry;
				3975	struct path path;
				3976	int error;
				3977	unsigned int lookup_flags = LOOKUP_DIRECTORY;
				3978
				3979	retry:
				3980	dentry = user_path_create(dfd, pathname, &path, lookup_flags);
				3981	if (IS_ERR(dentry))
				3982	return PTR_ERR(dentry);
				3983
				3984	error = security_path_mkdir(&path, dentry,
				3985	mode_strip_umask(path.dentry->d_inode, mode));
				3986	if (!error)
				3987	error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
				3988	done_path_create(&path, dentry);
				3989	if (retry_estale(error, lookup_flags)) {
				3990	lookup_flags \|= LOOKUP_REVAL;
				3991	goto retry;
				3992	}
				3993	return error;
				3994	}
				3995
				3996	SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
				3997	{
				3998	return do_mkdirat(dfd, pathname, mode);
				3999	}
				4000
				4001	SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
				4002	{
				4003	return do_mkdirat(AT_FDCWD, pathname, mode);
				4004	}
				4005
				4006	int vfs_rmdir(struct inode dir, struct dentry dentry)
				4007	{
				4008	int error = may_delete(dir, dentry, 1);
				4009
				4010	if (error)
				4011	return error;
				4012
				4013	if (!dir->i_op->rmdir)
				4014	return -EPERM;
				4015
				4016	dget(dentry);
				4017	inode_lock(dentry->d_inode);
				4018
				4019	error = -EBUSY;
				4020	if (is_local_mountpoint(dentry))
				4021	goto out;
				4022
				4023	error = security_inode_rmdir(dir, dentry);
				4024	if (error)
				4025	goto out;
				4026
				4027	error = dir->i_op->rmdir(dir, dentry);
				4028	if (error)
				4029	goto out;
				4030
				4031	shrink_dcache_parent(dentry);
				4032	dentry->d_inode->i_flags \|= S_DEAD;
				4033	dont_mount(dentry);
				4034	detach_mounts(dentry);
				4035
				4036	out:
				4037	inode_unlock(dentry->d_inode);
				4038	dput(dentry);
				4039	if (!error)
				4040	d_delete_notify(dir, dentry);
				4041	return error;
				4042	}
				4043	EXPORT_SYMBOL(vfs_rmdir);
				4044
				4045	long do_rmdir(int dfd, const char __user *pathname)
				4046	{
				4047	int error = 0;
				4048	struct filename *name;
				4049	struct dentry *dentry;
				4050	struct path path;
				4051	struct qstr last;
				4052	int type;
				4053	unsigned int lookup_flags = 0;
				4054	retry:
				4055	name = filename_parentat(dfd, getname(pathname), lookup_flags,
				4056	&path, &last, &type);
				4057	if (IS_ERR(name))
				4058	return PTR_ERR(name);
				4059
				4060	switch (type) {
				4061	case LAST_DOTDOT:
				4062	error = -ENOTEMPTY;
				4063	goto exit1;
				4064	case LAST_DOT:
				4065	error = -EINVAL;
				4066	goto exit1;
				4067	case LAST_ROOT:
				4068	error = -EBUSY;
				4069	goto exit1;
				4070	}
				4071
				4072	error = mnt_want_write(path.mnt);
				4073	if (error)
				4074	goto exit1;
				4075
				4076	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
				4077	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
				4078	error = PTR_ERR(dentry);
				4079	if (IS_ERR(dentry))
				4080	goto exit2;
				4081	if (!dentry->d_inode) {
				4082	error = -ENOENT;
				4083	goto exit3;
				4084	}
				4085	error = security_path_rmdir(&path, dentry);
				4086	if (error)
				4087	goto exit3;
				4088	error = vfs_rmdir(path.dentry->d_inode, dentry);
				4089	exit3:
				4090	dput(dentry);
				4091	exit2:
				4092	inode_unlock(path.dentry->d_inode);
				4093	mnt_drop_write(path.mnt);
				4094	exit1:
				4095	path_put(&path);
				4096	putname(name);
				4097	if (retry_estale(error, lookup_flags)) {
				4098	lookup_flags \|= LOOKUP_REVAL;
				4099	goto retry;
				4100	}
				4101	return error;
				4102	}
				4103
				4104	SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
				4105	{
				4106	return do_rmdir(AT_FDCWD, pathname);
				4107	}
				4108
				4109	/**
				4110	* vfs_unlink - unlink a filesystem object
				4111	* @dir: parent directory
				4112	* @dentry: victim
				4113	* @delegated_inode: returns victim inode, if the inode is delegated.
				4114	*
				4115	* The caller must hold dir->i_mutex.
				4116	*
				4117	* If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
				4118	* return a reference to the inode in delegated_inode. The caller
				4119	* should then break the delegation on that inode and retry. Because
				4120	* breaking a delegation may take a long time, the caller should drop
				4121	* dir->i_mutex before doing so.
				4122	*
				4123	* Alternatively, a caller may pass NULL for delegated_inode. This may
				4124	* be appropriate for callers that expect the underlying filesystem not
				4125	* to be NFS exported.
				4126	*/
				4127	int vfs_unlink(struct inode dir, struct dentry dentry, struct inode **delegated_inode)
				4128	{
				4129	struct inode *target = dentry->d_inode;
				4130	int error = may_delete(dir, dentry, 0);
				4131
				4132	if (error)
				4133	return error;
				4134
				4135	if (!dir->i_op->unlink)
				4136	return -EPERM;
				4137
				4138	inode_lock(target);
				4139	if (is_local_mountpoint(dentry))
				4140	error = -EBUSY;
				4141	else {
				4142	error = security_inode_unlink(dir, dentry);
				4143	if (!error) {
				4144	error = try_break_deleg(target, delegated_inode);
				4145	if (error)
				4146	goto out;
				4147	error = dir->i_op->unlink(dir, dentry);
				4148	if (!error) {
				4149	dont_mount(dentry);
				4150	detach_mounts(dentry);
				4151	}
				4152	}
				4153	}
				4154	out:
				4155	inode_unlock(target);
				4156
				4157	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
				4158	if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) {
				4159	fsnotify_unlink(dir, dentry);
				4160	} else if (!error) {
				4161	fsnotify_link_count(target);
				4162	d_delete_notify(dir, dentry);
				4163	}
				4164
				4165	return error;
				4166	}
				4167	EXPORT_SYMBOL(vfs_unlink);
				4168
				4169	/*
				4170	* Make sure that the actual truncation of the file will occur outside its
				4171	* directory's i_mutex. Truncate can take a long time if there is a lot of
				4172	* writeout happening, and we don't want to prevent access to the directory
				4173	* while waiting on the I/O.
				4174	*/
				4175	long do_unlinkat(int dfd, struct filename *name)
				4176	{
				4177	int error;
				4178	struct dentry *dentry;
				4179	struct path path;
				4180	struct qstr last;
				4181	int type;
				4182	struct inode *inode = NULL;
				4183	struct inode *delegated_inode = NULL;
				4184	unsigned int lookup_flags = 0;
				4185	retry:
				4186	name = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
				4187	if (IS_ERR(name))
				4188	return PTR_ERR(name);
				4189
				4190	error = -EISDIR;
				4191	if (type != LAST_NORM)
				4192	goto exit1;
				4193
				4194	error = mnt_want_write(path.mnt);
				4195	if (error)
				4196	goto exit1;
				4197	retry_deleg:
				4198	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
				4199	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
				4200	error = PTR_ERR(dentry);
				4201	if (!IS_ERR(dentry)) {
				4202	/* Why not before? Because we want correct error value */
				4203	if (last.name[last.len])
				4204	goto slashes;
				4205	inode = dentry->d_inode;
				4206	if (d_is_negative(dentry))
				4207	goto slashes;
				4208	ihold(inode);
				4209	error = security_path_unlink(&path, dentry);
				4210	if (error)
				4211	goto exit2;
				4212	error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
				4213	exit2:
				4214	dput(dentry);
				4215	}
				4216	inode_unlock(path.dentry->d_inode);
				4217	if (inode)
				4218	iput(inode); /* truncate the inode here */
				4219	inode = NULL;
				4220	if (delegated_inode) {
				4221	error = break_deleg_wait(&delegated_inode);
				4222	if (!error)
				4223	goto retry_deleg;
				4224	}
				4225	mnt_drop_write(path.mnt);
				4226	exit1:
				4227	path_put(&path);
				4228	if (retry_estale(error, lookup_flags)) {
				4229	lookup_flags \|= LOOKUP_REVAL;
				4230	inode = NULL;
				4231	goto retry;
				4232	}
				4233	putname(name);
				4234	return error;
				4235
				4236	slashes:
				4237	if (d_is_negative(dentry))
				4238	error = -ENOENT;
				4239	else if (d_is_dir(dentry))
				4240	error = -EISDIR;
				4241	else
				4242	error = -ENOTDIR;
				4243	goto exit2;
				4244	}
				4245
				4246	SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
				4247	{
				4248	if ((flag & ~AT_REMOVEDIR) != 0)
				4249	return -EINVAL;
				4250
				4251	if (flag & AT_REMOVEDIR)
				4252	return do_rmdir(dfd, pathname);
				4253
				4254	return do_unlinkat(dfd, getname(pathname));
				4255	}
				4256
				4257	SYSCALL_DEFINE1(unlink, const char __user *, pathname)
				4258	{
				4259	return do_unlinkat(AT_FDCWD, getname(pathname));
				4260	}
				4261
				4262	int vfs_symlink(struct inode dir, struct dentry dentry, const char *oldname)
				4263	{
				4264	int error = may_create(dir, dentry);
				4265
				4266	if (error)
				4267	return error;
				4268
				4269	if (!dir->i_op->symlink)
				4270	return -EPERM;
				4271
				4272	error = security_inode_symlink(dir, dentry, oldname);
				4273	if (error)
				4274	return error;
				4275
				4276	error = dir->i_op->symlink(dir, dentry, oldname);
				4277	if (!error)
				4278	fsnotify_create(dir, dentry);
				4279	return error;
				4280	}
				4281	EXPORT_SYMBOL(vfs_symlink);
				4282
				4283	long do_symlinkat(const char __user *oldname, int newdfd,
				4284	const char __user *newname)
				4285	{
				4286	int error;
				4287	struct filename *from;
				4288	struct dentry *dentry;
				4289	struct path path;
				4290	unsigned int lookup_flags = 0;
				4291
				4292	from = getname(oldname);
				4293	if (IS_ERR(from))
				4294	return PTR_ERR(from);
				4295	retry:
				4296	dentry = user_path_create(newdfd, newname, &path, lookup_flags);
				4297	error = PTR_ERR(dentry);
				4298	if (IS_ERR(dentry))
				4299	goto out_putname;
				4300
				4301	error = security_path_symlink(&path, dentry, from->name);
				4302	if (!error)
				4303	error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
				4304	done_path_create(&path, dentry);
				4305	if (retry_estale(error, lookup_flags)) {
				4306	lookup_flags \|= LOOKUP_REVAL;
				4307	goto retry;
				4308	}
				4309	out_putname:
				4310	putname(from);
				4311	return error;
				4312	}
				4313
				4314	SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
				4315	int, newdfd, const char __user *, newname)
				4316	{
				4317	return do_symlinkat(oldname, newdfd, newname);
				4318	}
				4319
				4320	SYSCALL_DEFINE2(symlink, const char __user , oldname, const char __user , newname)
				4321	{
				4322	return do_symlinkat(oldname, AT_FDCWD, newname);
				4323	}
				4324
				4325	/**
				4326	* vfs_link - create a new link
				4327	* @old_dentry: object to be linked
				4328	* @dir: new parent
				4329	* @new_dentry: where to create the new link
				4330	* @delegated_inode: returns inode needing a delegation break
				4331	*
				4332	* The caller must hold dir->i_mutex
				4333	*
				4334	* If vfs_link discovers a delegation on the to-be-linked file in need
				4335	* of breaking, it will return -EWOULDBLOCK and return a reference to the
				4336	* inode in delegated_inode. The caller should then break the delegation
				4337	* and retry. Because breaking a delegation may take a long time, the
				4338	* caller should drop the i_mutex before doing so.
				4339	*
				4340	* Alternatively, a caller may pass NULL for delegated_inode. This may
				4341	* be appropriate for callers that expect the underlying filesystem not
				4342	* to be NFS exported.
				4343	*/
				4344	int vfs_link(struct dentry old_dentry, struct inode dir, struct dentry new_dentry, struct inode *delegated_inode)
				4345	{
				4346	struct inode *inode = old_dentry->d_inode;
				4347	unsigned max_links = dir->i_sb->s_max_links;
				4348	int error;
				4349
				4350	if (!inode)
				4351	return -ENOENT;
				4352
				4353	error = may_create(dir, new_dentry);
				4354	if (error)
				4355	return error;
				4356
				4357	if (dir->i_sb != inode->i_sb)
				4358	return -EXDEV;
				4359
				4360	/*
				4361	* A link to an append-only or immutable file cannot be created.
				4362	*/
				4363	if (IS_APPEND(inode) \|\| IS_IMMUTABLE(inode))
				4364	return -EPERM;
				4365	/*
				4366	* Updating the link count will likely cause i_uid and i_gid to
				4367	* be writen back improperly if their true value is unknown to
				4368	* the vfs.
				4369	*/
				4370	if (HAS_UNMAPPED_ID(inode))
				4371	return -EPERM;
				4372	if (!dir->i_op->link)
				4373	return -EPERM;
				4374	if (S_ISDIR(inode->i_mode))
				4375	return -EPERM;
				4376
				4377	error = security_inode_link(old_dentry, dir, new_dentry);
				4378	if (error)
				4379	return error;
				4380
				4381	inode_lock(inode);
				4382	/* Make sure we don't allow creating hardlink to an unlinked file */
				4383	if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
				4384	error = -ENOENT;
				4385	else if (max_links && inode->i_nlink >= max_links)
				4386	error = -EMLINK;
				4387	else {
				4388	error = try_break_deleg(inode, delegated_inode);
				4389	if (!error)
				4390	error = dir->i_op->link(old_dentry, dir, new_dentry);
				4391	}
				4392
				4393	if (!error && (inode->i_state & I_LINKABLE)) {
				4394	spin_lock(&inode->i_lock);
				4395	inode->i_state &= ~I_LINKABLE;
				4396	spin_unlock(&inode->i_lock);
				4397	}
				4398	inode_unlock(inode);
				4399	if (!error)
				4400	fsnotify_link(dir, inode, new_dentry);
				4401	return error;
				4402	}
				4403	EXPORT_SYMBOL(vfs_link);
				4404
				4405	/*
				4406	* Hardlinks are often used in delicate situations. We avoid
				4407	* security-related surprises by not following symlinks on the
				4408	* newname. --KAB
				4409	*
				4410	* We don't follow them on the oldname either to be compatible
				4411	* with linux 2.0, and to avoid hard-linking to directories
				4412	* and other special files. --ADM
				4413	*/
				4414	int do_linkat(int olddfd, const char __user *oldname, int newdfd,
				4415	const char __user *newname, int flags)
				4416	{
				4417	struct dentry *new_dentry;
				4418	struct path old_path, new_path;
				4419	struct inode *delegated_inode = NULL;
				4420	int how = 0;
				4421	int error;
				4422
				4423	if ((flags & ~(AT_SYMLINK_FOLLOW \| AT_EMPTY_PATH)) != 0)
				4424	return -EINVAL;
				4425	/*
				4426	* To use null names we require CAP_DAC_READ_SEARCH
				4427	* This ensures that not everyone will be able to create
				4428	* handlink using the passed filedescriptor.
				4429	*/
				4430	if (flags & AT_EMPTY_PATH) {
				4431	if (!capable(CAP_DAC_READ_SEARCH))
				4432	return -ENOENT;
				4433	how = LOOKUP_EMPTY;
				4434	}
				4435
				4436	if (flags & AT_SYMLINK_FOLLOW)
				4437	how \|= LOOKUP_FOLLOW;
				4438	retry:
				4439	error = user_path_at(olddfd, oldname, how, &old_path);
				4440	if (error)
				4441	return error;
				4442
				4443	new_dentry = user_path_create(newdfd, newname, &new_path,
				4444	(how & LOOKUP_REVAL));
				4445	error = PTR_ERR(new_dentry);
				4446	if (IS_ERR(new_dentry))
				4447	goto out;
				4448
				4449	error = -EXDEV;
				4450	if (old_path.mnt != new_path.mnt)
				4451	goto out_dput;
				4452	error = may_linkat(&old_path);
				4453	if (unlikely(error))
				4454	goto out_dput;
				4455	error = security_path_link(old_path.dentry, &new_path, new_dentry);
				4456	if (error)
				4457	goto out_dput;
				4458	error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
				4459	out_dput:
				4460	done_path_create(&new_path, new_dentry);
				4461	if (delegated_inode) {
				4462	error = break_deleg_wait(&delegated_inode);
				4463	if (!error) {
				4464	path_put(&old_path);
				4465	goto retry;
				4466	}
				4467	}
				4468	if (retry_estale(error, how)) {
				4469	path_put(&old_path);
				4470	how \|= LOOKUP_REVAL;
				4471	goto retry;
				4472	}
				4473	out:
				4474	path_put(&old_path);
				4475
				4476	return error;
				4477	}
				4478
				4479	SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
				4480	int, newdfd, const char __user *, newname, int, flags)
				4481	{
				4482	return do_linkat(olddfd, oldname, newdfd, newname, flags);
				4483	}
				4484
				4485	SYSCALL_DEFINE2(link, const char __user , oldname, const char __user , newname)
				4486	{
				4487	return do_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
				4488	}
				4489
				4490	/**
				4491	* vfs_rename - rename a filesystem object
				4492	* @old_dir: parent of source
				4493	* @old_dentry: source
				4494	* @new_dir: parent of destination
				4495	* @new_dentry: destination
				4496	* @delegated_inode: returns an inode needing a delegation break
				4497	* @flags: rename flags
				4498	*
				4499	* The caller must hold multiple mutexes--see lock_rename()).
				4500	*
				4501	* If vfs_rename discovers a delegation in need of breaking at either
				4502	* the source or destination, it will return -EWOULDBLOCK and return a
				4503	* reference to the inode in delegated_inode. The caller should then
				4504	* break the delegation and retry. Because breaking a delegation may
				4505	* take a long time, the caller should drop all locks before doing
				4506	* so.
				4507	*
				4508	* Alternatively, a caller may pass NULL for delegated_inode. This may
				4509	* be appropriate for callers that expect the underlying filesystem not
				4510	* to be NFS exported.
				4511	*
				4512	* The worst of all namespace operations - renaming directory. "Perverted"
				4513	* doesn't even start to describe it. Somebody in UCB had a heck of a trip...
				4514	* Problems:
				4515	*
				4516	* a) we can get into loop creation.
				4517	* b) race potential - two innocent renames can create a loop together.
				4518	* That's where 4.4BSD screws up. Current fix: serialization on
				4519	* sb->s_vfs_rename_mutex. We might be more accurate, but that's another
				4520	* story.
				4521	* c) we may have to lock up to _four_ objects - parents and victim (if it exists),
				4522	* and source (if it's a non-directory or a subdirectory that moves to
				4523	* different parent).
				4524	* And that - after we got ->i_mutex on parents (until then we don't know
				4525	* whether the target exists). Solution: try to be smart with locking
				4526	* order for inodes. We rely on the fact that tree topology may change
				4527	* only under ->s_vfs_rename_mutex _and_ that parent of the object we
				4528	* move will be locked. Thus we can rank directories by the tree
				4529	* (ancestors first) and rank all non-directories after them.
				4530	* That works since everybody except rename does "lock parent, lookup,
				4531	* lock child" and rename is under ->s_vfs_rename_mutex.
				4532	* HOWEVER, it relies on the assumption that any object with ->lookup()
				4533	* has no more than 1 dentry. If "hybrid" objects will ever appear,
				4534	* we'd better make sure that there's no link(2) for them.
				4535	* d) conversion from fhandle to dentry may come in the wrong moment - when
				4536	* we are removing the target. Solution: we will have to grab ->i_mutex
				4537	* in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
				4538	* ->i_mutex on parents, which works but leads to some truly excessive
				4539	* locking].
				4540	*/
				4541	int vfs_rename(struct inode old_dir, struct dentry old_dentry,
				4542	struct inode new_dir, struct dentry new_dentry,
				4543	struct inode **delegated_inode, unsigned int flags)
				4544	{
				4545	int error;
				4546	bool is_dir = d_is_dir(old_dentry);
				4547	struct inode *source = old_dentry->d_inode;
				4548	struct inode *target = new_dentry->d_inode;
				4549	bool new_is_dir = false;
				4550	unsigned max_links = new_dir->i_sb->s_max_links;
				4551	struct name_snapshot old_name;
				4552	bool lock_old_subdir, lock_new_subdir;
				4553
				4554	if (source == target)
				4555	return 0;
				4556
				4557	error = may_delete(old_dir, old_dentry, is_dir);
				4558	if (error)
				4559	return error;
				4560
				4561	if (!target) {
				4562	error = may_create(new_dir, new_dentry);
				4563	} else {
				4564	new_is_dir = d_is_dir(new_dentry);
				4565
				4566	if (!(flags & RENAME_EXCHANGE))
				4567	error = may_delete(new_dir, new_dentry, is_dir);
				4568	else
				4569	error = may_delete(new_dir, new_dentry, new_is_dir);
				4570	}
				4571	if (error)
				4572	return error;
				4573
				4574	if (!old_dir->i_op->rename)
				4575	return -EPERM;
				4576
				4577	/*
				4578	* If we are going to change the parent - check write permissions,
				4579	* we'll need to flip '..'.
				4580	*/
				4581	if (new_dir != old_dir) {
				4582	if (is_dir) {
				4583	error = inode_permission(source, MAY_WRITE);
				4584	if (error)
				4585	return error;
				4586	}
				4587	if ((flags & RENAME_EXCHANGE) && new_is_dir) {
				4588	error = inode_permission(target, MAY_WRITE);
				4589	if (error)
				4590	return error;
				4591	}
				4592	}
				4593
				4594	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
				4595	flags);
				4596	if (error)
				4597	return error;
				4598
				4599	take_dentry_name_snapshot(&old_name, old_dentry);
				4600	dget(new_dentry);
				4601	/*
				4602	* Lock children.
				4603	* The source subdirectory needs to be locked on cross-directory
				4604	* rename or cross-directory exchange since its parent changes.
				4605	* The target subdirectory needs to be locked on cross-directory
				4606	* exchange due to parent change and on any rename due to becoming
				4607	* a victim.
				4608	* Non-directories need locking in all cases (for NFS reasons);
				4609	* they get locked after any subdirectories (in inode address order).
				4610	*
				4611	* NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE.
				4612	* NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex.
				4613	*/
				4614	lock_old_subdir = new_dir != old_dir;
				4615	lock_new_subdir = new_dir != old_dir \|\| !(flags & RENAME_EXCHANGE);
				4616	if (is_dir) {
				4617	if (lock_old_subdir)
				4618	inode_lock_nested(source, I_MUTEX_CHILD);
				4619	if (target && (!new_is_dir \|\| lock_new_subdir))
				4620	inode_lock(target);
				4621	} else if (new_is_dir) {
				4622	if (lock_new_subdir)
				4623	inode_lock_nested(target, I_MUTEX_CHILD);
				4624	inode_lock(source);
				4625	} else {
				4626	lock_two_nondirectories(source, target);
				4627	}
				4628
				4629	error = -EBUSY;
				4630	if (is_local_mountpoint(old_dentry) \|\| is_local_mountpoint(new_dentry))
				4631	goto out;
				4632
				4633	if (max_links && new_dir != old_dir) {
				4634	error = -EMLINK;
				4635	if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
				4636	goto out;
				4637	if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
				4638	old_dir->i_nlink >= max_links)
				4639	goto out;
				4640	}
				4641	if (!is_dir) {
				4642	error = try_break_deleg(source, delegated_inode);
				4643	if (error)
				4644	goto out;
				4645	}
				4646	if (target && !new_is_dir) {
				4647	error = try_break_deleg(target, delegated_inode);
				4648	if (error)
				4649	goto out;
				4650	}
				4651	error = old_dir->i_op->rename(old_dir, old_dentry,
				4652	new_dir, new_dentry, flags);
				4653	if (error)
				4654	goto out;
				4655
				4656	if (!(flags & RENAME_EXCHANGE) && target) {
				4657	if (is_dir) {
				4658	shrink_dcache_parent(new_dentry);
				4659	target->i_flags \|= S_DEAD;
				4660	}
				4661	dont_mount(new_dentry);
				4662	detach_mounts(new_dentry);
				4663	}
				4664	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
				4665	if (!(flags & RENAME_EXCHANGE))
				4666	d_move(old_dentry, new_dentry);
				4667	else
				4668	d_exchange(old_dentry, new_dentry);
				4669	}
				4670	out:
				4671	if (!is_dir \|\| lock_old_subdir)
				4672	inode_unlock(source);
				4673	if (target && (!new_is_dir \|\| lock_new_subdir))
				4674	inode_unlock(target);
				4675	dput(new_dentry);
				4676	if (!error) {
				4677	fsnotify_move(old_dir, new_dir, &old_name.name, is_dir,
				4678	!(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
				4679	if (flags & RENAME_EXCHANGE) {
				4680	fsnotify_move(new_dir, old_dir, &old_dentry->d_name,
				4681	new_is_dir, NULL, new_dentry);
				4682	}
				4683	}
				4684	release_dentry_name_snapshot(&old_name);
				4685
				4686	return error;
				4687	}
				4688	EXPORT_SYMBOL(vfs_rename);
				4689
				4690	static int do_renameat2(int olddfd, const char __user *oldname, int newdfd,
				4691	const char __user *newname, unsigned int flags)
				4692	{
				4693	struct dentry old_dentry, new_dentry;
				4694	struct dentry *trap;
				4695	struct path old_path, new_path;
				4696	struct qstr old_last, new_last;
				4697	int old_type, new_type;
				4698	struct inode *delegated_inode = NULL;
				4699	struct filename *from;
				4700	struct filename *to;
				4701	unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET;
				4702	bool should_retry = false;
				4703	int error;
				4704
				4705	if (flags & ~(RENAME_NOREPLACE \| RENAME_EXCHANGE \| RENAME_WHITEOUT))
				4706	return -EINVAL;
				4707
				4708	if ((flags & (RENAME_NOREPLACE \| RENAME_WHITEOUT)) &&
				4709	(flags & RENAME_EXCHANGE))
				4710	return -EINVAL;
				4711
				4712	if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD))
				4713	return -EPERM;
				4714
				4715	if (flags & RENAME_EXCHANGE)
				4716	target_flags = 0;
				4717
				4718	retry:
				4719	from = filename_parentat(olddfd, getname(oldname), lookup_flags,
				4720	&old_path, &old_last, &old_type);
				4721	if (IS_ERR(from)) {
				4722	error = PTR_ERR(from);
				4723	goto exit;
				4724	}
				4725
				4726	to = filename_parentat(newdfd, getname(newname), lookup_flags,
				4727	&new_path, &new_last, &new_type);
				4728	if (IS_ERR(to)) {
				4729	error = PTR_ERR(to);
				4730	goto exit1;
				4731	}
				4732
				4733	error = -EXDEV;
				4734	if (old_path.mnt != new_path.mnt)
				4735	goto exit2;
				4736
				4737	error = -EBUSY;
				4738	if (old_type != LAST_NORM)
				4739	goto exit2;
				4740
				4741	if (flags & RENAME_NOREPLACE)
				4742	error = -EEXIST;
				4743	if (new_type != LAST_NORM)
				4744	goto exit2;
				4745
				4746	error = mnt_want_write(old_path.mnt);
				4747	if (error)
				4748	goto exit2;
				4749
				4750	retry_deleg:
				4751	trap = lock_rename(new_path.dentry, old_path.dentry);
				4752
				4753	old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags);
				4754	error = PTR_ERR(old_dentry);
				4755	if (IS_ERR(old_dentry))
				4756	goto exit3;
				4757	/* source must exist */
				4758	error = -ENOENT;
				4759	if (d_is_negative(old_dentry))
				4760	goto exit4;
				4761	new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags \| target_flags);
				4762	error = PTR_ERR(new_dentry);
				4763	if (IS_ERR(new_dentry))
				4764	goto exit4;
				4765	error = -EEXIST;
				4766	if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry))
				4767	goto exit5;
				4768	if (flags & RENAME_EXCHANGE) {
				4769	error = -ENOENT;
				4770	if (d_is_negative(new_dentry))
				4771	goto exit5;
				4772
				4773	if (!d_is_dir(new_dentry)) {
				4774	error = -ENOTDIR;
				4775	if (new_last.name[new_last.len])
				4776	goto exit5;
				4777	}
				4778	}
				4779	/* unless the source is a directory trailing slashes give -ENOTDIR */
				4780	if (!d_is_dir(old_dentry)) {
				4781	error = -ENOTDIR;
				4782	if (old_last.name[old_last.len])
				4783	goto exit5;
				4784	if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
				4785	goto exit5;
				4786	}
				4787	/* source should not be ancestor of target */
				4788	error = -EINVAL;
				4789	if (old_dentry == trap)
				4790	goto exit5;
				4791	/* target should not be an ancestor of source */
				4792	if (!(flags & RENAME_EXCHANGE))
				4793	error = -ENOTEMPTY;
				4794	if (new_dentry == trap)
				4795	goto exit5;
				4796
				4797	error = security_path_rename(&old_path, old_dentry,
				4798	&new_path, new_dentry, flags);
				4799	if (error)
				4800	goto exit5;
				4801	error = vfs_rename(old_path.dentry->d_inode, old_dentry,
				4802	new_path.dentry->d_inode, new_dentry,
				4803	&delegated_inode, flags);
				4804	exit5:
				4805	dput(new_dentry);
				4806	exit4:
				4807	dput(old_dentry);
				4808	exit3:
				4809	unlock_rename(new_path.dentry, old_path.dentry);
				4810	if (delegated_inode) {
				4811	error = break_deleg_wait(&delegated_inode);
				4812	if (!error)
				4813	goto retry_deleg;
				4814	}
				4815	mnt_drop_write(old_path.mnt);
				4816	exit2:
				4817	if (retry_estale(error, lookup_flags))
				4818	should_retry = true;
				4819	path_put(&new_path);
				4820	putname(to);
				4821	exit1:
				4822	path_put(&old_path);
				4823	putname(from);
				4824	if (should_retry) {
				4825	should_retry = false;
				4826	lookup_flags \|= LOOKUP_REVAL;
				4827	goto retry;
				4828	}
				4829	exit:
				4830	return error;
				4831	}
				4832
				4833	SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
				4834	int, newdfd, const char __user *, newname, unsigned int, flags)
				4835	{
				4836	return do_renameat2(olddfd, oldname, newdfd, newname, flags);
				4837	}
				4838
				4839	SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
				4840	int, newdfd, const char __user *, newname)
				4841	{
				4842	return do_renameat2(olddfd, oldname, newdfd, newname, 0);
				4843	}
				4844
				4845	SYSCALL_DEFINE2(rename, const char __user , oldname, const char __user , newname)
				4846	{
				4847	return do_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
				4848	}
				4849
				4850	int vfs_whiteout(struct inode dir, struct dentry dentry)
				4851	{
				4852	int error = may_create(dir, dentry);
				4853	if (error)
				4854	return error;
				4855
				4856	if (!dir->i_op->mknod)
				4857	return -EPERM;
				4858
				4859	return dir->i_op->mknod(dir, dentry,
				4860	S_IFCHR \| WHITEOUT_MODE, WHITEOUT_DEV);
				4861	}
				4862	EXPORT_SYMBOL(vfs_whiteout);
				4863
				4864	int readlink_copy(char __user buffer, int buflen, const char link)
				4865	{
				4866	int len = PTR_ERR(link);
				4867	if (IS_ERR(link))
				4868	goto out;
				4869
				4870	len = strlen(link);
				4871	if (len > (unsigned) buflen)
				4872	len = buflen;
				4873	if (copy_to_user(buffer, link, len))
				4874	len = -EFAULT;
				4875	out:
				4876	return len;
				4877	}
				4878
				4879	/**
				4880	* vfs_readlink - copy symlink body into userspace buffer
				4881	* @dentry: dentry on which to get symbolic link
				4882	* @buffer: user memory pointer
				4883	* @buflen: size of buffer
				4884	*
				4885	* Does not touch atime. That's up to the caller if necessary
				4886	*
				4887	* Does not call security hook.
				4888	*/
				4889	int vfs_readlink(struct dentry dentry, char __user buffer, int buflen)
				4890	{
				4891	struct inode *inode = d_inode(dentry);
				4892	DEFINE_DELAYED_CALL(done);
				4893	const char *link;
				4894	int res;
				4895
				4896	if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
				4897	if (unlikely(inode->i_op->readlink))
				4898	return inode->i_op->readlink(dentry, buffer, buflen);
				4899
				4900	if (!d_is_symlink(dentry))
				4901	return -EINVAL;
				4902
				4903	spin_lock(&inode->i_lock);
				4904	inode->i_opflags \|= IOP_DEFAULT_READLINK;
				4905	spin_unlock(&inode->i_lock);
				4906	}
				4907
				4908	link = READ_ONCE(inode->i_link);
				4909	if (!link) {
				4910	link = inode->i_op->get_link(dentry, inode, &done);
				4911	if (IS_ERR(link))
				4912	return PTR_ERR(link);
				4913	}
				4914	res = readlink_copy(buffer, buflen, link);
				4915	do_delayed_call(&done);
				4916	return res;
				4917	}
				4918	EXPORT_SYMBOL(vfs_readlink);
				4919
				4920	/**
				4921	* vfs_get_link - get symlink body
				4922	* @dentry: dentry on which to get symbolic link
				4923	* @done: caller needs to free returned data with this
				4924	*
				4925	* Calls security hook and i_op->get_link() on the supplied inode.
				4926	*
				4927	* It does not touch atime. That's up to the caller if necessary.
				4928	*
				4929	* Does not work on "special" symlinks like /proc/$$/fd/N
				4930	*/
				4931	const char vfs_get_link(struct dentry dentry, struct delayed_call *done)
				4932	{
				4933	const char *res = ERR_PTR(-EINVAL);
				4934	struct inode *inode = d_inode(dentry);
				4935
				4936	if (d_is_symlink(dentry)) {
				4937	res = ERR_PTR(security_inode_readlink(dentry));
				4938	if (!res)
				4939	res = inode->i_op->get_link(dentry, inode, done);
				4940	}
				4941	return res;
				4942	}
				4943	EXPORT_SYMBOL(vfs_get_link);
				4944
				4945	/* get the link contents into pagecache */
				4946	const char page_get_link(struct dentry dentry, struct inode *inode,
				4947	struct delayed_call *callback)
				4948	{
				4949	char *kaddr;
				4950	struct page *page;
				4951	struct address_space *mapping = inode->i_mapping;
				4952
				4953	if (!dentry) {
				4954	page = find_get_page(mapping, 0);
				4955	if (!page)
				4956	return ERR_PTR(-ECHILD);
				4957	if (!PageUptodate(page)) {
				4958	put_page(page);
				4959	return ERR_PTR(-ECHILD);
				4960	}
				4961	} else {
				4962	page = read_mapping_page(mapping, 0, NULL);
				4963	if (IS_ERR(page))
				4964	return (char*)page;
				4965	}
				4966	set_delayed_call(callback, page_put_link, page);
				4967	BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
				4968	kaddr = page_address(page);
				4969	nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
				4970	return kaddr;
				4971	}
				4972
				4973	EXPORT_SYMBOL(page_get_link);
				4974
				4975	void page_put_link(void *arg)
				4976	{
				4977	put_page(arg);
				4978	}
				4979	EXPORT_SYMBOL(page_put_link);
				4980
				4981	int page_readlink(struct dentry dentry, char __user buffer, int buflen)
				4982	{
				4983	DEFINE_DELAYED_CALL(done);
				4984	int res = readlink_copy(buffer, buflen,
				4985	page_get_link(dentry, d_inode(dentry),
				4986	&done));
				4987	do_delayed_call(&done);
				4988	return res;
				4989	}
				4990	EXPORT_SYMBOL(page_readlink);
				4991
				4992	/*
				4993	* The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
				4994	*/
				4995	int __page_symlink(struct inode inode, const char symname, int len, int nofs)
				4996	{
				4997	struct address_space *mapping = inode->i_mapping;
				4998	struct page *page;
				4999	void *fsdata = NULL;
				5000	int err;
				5001	unsigned int flags = 0;
				5002	if (nofs)
				5003	flags \|= AOP_FLAG_NOFS;
				5004
				5005	retry:
				5006	err = pagecache_write_begin(NULL, mapping, 0, len-1,
				5007	flags, &page, &fsdata);
				5008	if (err)
				5009	goto fail;
				5010
				5011	memcpy(page_address(page), symname, len-1);
				5012
				5013	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
				5014	page, fsdata);
				5015	if (err < 0)
				5016	goto fail;
				5017	if (err < len-1)
				5018	goto retry;
				5019
				5020	mark_inode_dirty(inode);
				5021	return 0;
				5022	fail:
				5023	return err;
				5024	}
				5025	EXPORT_SYMBOL(__page_symlink);
				5026
				5027	int page_symlink(struct inode inode, const char symname, int len)
				5028	{
				5029	return __page_symlink(inode, symname, len,
				5030	!mapping_gfp_constraint(inode->i_mapping, __GFP_FS));
				5031	}
				5032	EXPORT_SYMBOL(page_symlink);
				5033
				5034	const struct inode_operations page_symlink_inode_operations = {
				5035	.get_link = page_get_link,
				5036	};
				5037	EXPORT_SYMBOL(page_symlink_inode_operations);