Blame - ap/os/linux/linux-3.4.x/fs/namei.c - T106_DC

blob: 4fc034ffd20908cc245102f690ce77ec8d9457d0 [file] [log] [blame]

lh	9ed821d	2023-04-07 01:36:19 -0700	[diff] [blame]	1	/*
				2	* linux/fs/namei.c
				3	*
				4	* Copyright (C) 1991, 1992 Linus Torvalds
				5	*/
				6
				7	/*
				8	* Some corrections by tytso.
				9	*/
				10
				11	/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
				12	* lookup logic.
				13	*/
				14	/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
				15	*/
				16
				17	#include <linux/init.h>
				18	#include <linux/export.h>
				19	#include <linux/slab.h>
				20	#include <linux/fs.h>
				21	#include <linux/namei.h>
				22	#include <linux/pagemap.h>
				23	#include <linux/fsnotify.h>
				24	#include <linux/personality.h>
				25	#include <linux/security.h>
				26	#include <linux/ima.h>
				27	#include <linux/syscalls.h>
				28	#include <linux/mount.h>
				29	#include <linux/audit.h>
				30	#include <linux/capability.h>
				31	#include <linux/file.h>
				32	#include <linux/fcntl.h>
				33	#include <linux/device_cgroup.h>
				34	#include <linux/fs_struct.h>
				35	#include <linux/posix_acl.h>
				36	#include <asm/uaccess.h>
				37
				38	#include "internal.h"
				39	#include "mount.h"
				40
				41	/* [Feb-1997 T. Schoebel-Theuer]
				42	* Fundamental changes in the pathname lookup mechanisms (namei)
				43	* were necessary because of omirr. The reason is that omirr needs
				44	* to know the _real_ pathname, not the user-supplied one, in case
				45	* of symlinks (and also when transname replacements occur).
				46	*
				47	* The new code replaces the old recursive symlink resolution with
				48	* an iterative one (in case of non-nested symlink chains). It does
				49	* this with calls to <fs>_follow_link().
				50	* As a side effect, dir_namei(), _namei() and follow_link() are now
				51	* replaced with a single function lookup_dentry() that can handle all
				52	* the special cases of the former code.
				53	*
				54	* With the new dcache, the pathname is stored at each inode, at least as
				55	* long as the refcount of the inode is positive. As a side effect, the
				56	* size of the dcache depends on the inode cache and thus is dynamic.
				57	*
				58	* [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
				59	* resolution to correspond with current state of the code.
				60	*
				61	* Note that the symlink resolution is not completely iterative.
				62	* There is still a significant amount of tail- and mid- recursion in
				63	* the algorithm. Also, note that <fs>_readlink() is not used in
				64	* lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
				65	* may return different results than <fs>_follow_link(). Many virtual
				66	* filesystems (including /proc) exhibit this behavior.
				67	*/
				68
				69	/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
				70	* New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
				71	* and the name already exists in form of a symlink, try to create the new
				72	* name indicated by the symlink. The old code always complained that the
				73	* name already exists, due to not following the symlink even if its target
				74	* is nonexistent. The new semantics affects also mknod() and link() when
				75	* the name is a symlink pointing to a non-existent name.
				76	*
				77	* I don't know which semantics is the right one, since I have no access
				78	* to standards. But I found by trial that HP-UX 9.0 has the full "new"
				79	* semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
				80	* "old" one. Personally, I think the new semantics is much more logical.
				81	* Note that "ln old new" where "new" is a symlink pointing to a non-existing
				82	* file does succeed in both HP-UX and SunOs, but not in Solaris
				83	* and in the old Linux semantics.
				84	*/
				85
				86	/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
				87	* semantics. See the comments in "open_namei" and "do_link" below.
				88	*
				89	* [10-Sep-98 Alan Modra] Another symlink change.
				90	*/
				91
				92	/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
				93	* inside the path - always follow.
				94	* in the last component in creation/removal/renaming - never follow.
				95	* if LOOKUP_FOLLOW passed - follow.
				96	* if the pathname has trailing slashes - follow.
				97	* otherwise - don't follow.
				98	* (applied in that order).
				99	*
				100	* [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
				101	* restored for 2.4. This is the last surviving part of old 4.2BSD bug.
				102	* During the 2.4 we need to fix the userland stuff depending on it -
				103	* hopefully we will be able to get rid of that wart in 2.5. So far only
				104	* XEmacs seems to be relying on it...
				105	*/
				106	/*
				107	* [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
				108	* implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives
				109	* any extra contention...
				110	*/
				111
				112	/* In order to reduce some races, while at the same time doing additional
				113	* checking and hopefully speeding things up, we copy filenames to the
				114	* kernel data space before using them..
				115	*
				116	* POSIX.1 2.4: an empty pathname is invalid (ENOENT).
				117	* PATH_MAX includes the nul terminator --RR.
				118	*/
				119	static int do_getname(const char __user filename, char page)
				120	{
				121	int retval;
				122	unsigned long len = PATH_MAX;
				123
				124	if (!segment_eq(get_fs(), KERNEL_DS)) {
				125	if ((unsigned long) filename >= TASK_SIZE)
				126	return -EFAULT;
				127	if (TASK_SIZE - (unsigned long) filename < PATH_MAX)
				128	len = TASK_SIZE - (unsigned long) filename;
				129	}
				130
				131	retval = strncpy_from_user(page, filename, len);
				132	if (retval > 0) {
				133	if (retval < len)
				134	return 0;
				135	return -ENAMETOOLONG;
				136	} else if (!retval)
				137	retval = -ENOENT;
				138	return retval;
				139	}
				140
				141	static char getname_flags(const char __user filename, int flags, int *empty)
				142	{
				143	char *result = __getname();
				144	int retval;
				145
				146	if (!result)
				147	return ERR_PTR(-ENOMEM);
				148
				149	retval = do_getname(filename, result);
				150	if (retval < 0) {
				151	if (retval == -ENOENT && empty)
				152	*empty = 1;
				153	if (retval != -ENOENT \|\| !(flags & LOOKUP_EMPTY)) {
				154	__putname(result);
				155	return ERR_PTR(retval);
				156	}
				157	}
				158	audit_getname(result);
				159	return result;
				160	}
				161
				162	char getname(const char __user filename)
				163	{
				164	return getname_flags(filename, 0, NULL);
				165	}
				166
				167	#ifdef CONFIG_AUDITSYSCALL
				168	void putname(const char *name)
				169	{
				170	if (unlikely(!audit_dummy_context()))
				171	audit_putname(name);
				172	else
				173	__putname(name);
				174	}
				175	EXPORT_SYMBOL(putname);
				176	#endif
				177
				178	static int check_acl(struct inode *inode, int mask)
				179	{
				180	#ifdef CONFIG_FS_POSIX_ACL
				181	struct posix_acl *acl;
				182
				183	if (mask & MAY_NOT_BLOCK) {
				184	acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
				185	if (!acl)
				186	return -EAGAIN;
				187	/* no ->get_acl() calls in RCU mode... */
				188	if (acl == ACL_NOT_CACHED)
				189	return -ECHILD;
				190	return posix_acl_permission(inode, acl, mask & ~MAY_NOT_BLOCK);
				191	}
				192
				193	acl = get_cached_acl(inode, ACL_TYPE_ACCESS);
				194
				195	/*
				196	* A filesystem can force a ACL callback by just never filling the
				197	* ACL cache. But normally you'd fill the cache either at inode
				198	* instantiation time, or on the first ->get_acl call.
				199	*
				200	* If the filesystem doesn't have a get_acl() function at all, we'll
				201	* just create the negative cache entry.
				202	*/
				203	if (acl == ACL_NOT_CACHED) {
				204	if (inode->i_op->get_acl) {
				205	acl = inode->i_op->get_acl(inode, ACL_TYPE_ACCESS);
				206	if (IS_ERR(acl))
				207	return PTR_ERR(acl);
				208	} else {
				209	set_cached_acl(inode, ACL_TYPE_ACCESS, NULL);
				210	return -EAGAIN;
				211	}
				212	}
				213
				214	if (acl) {
				215	int error = posix_acl_permission(inode, acl, mask);
				216	posix_acl_release(acl);
				217	return error;
				218	}
				219	#endif
				220
				221	return -EAGAIN;
				222	}
				223
				224	/*
				225	* This does the basic permission checking
				226	*/
				227	static int acl_permission_check(struct inode *inode, int mask)
				228	{
				229	unsigned int mode = inode->i_mode;
				230
				231	if (current_user_ns() != inode_userns(inode))
				232	goto other_perms;
				233
				234	if (likely(current_fsuid() == inode->i_uid))
				235	mode >>= 6;
				236	else {
				237	if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
				238	int error = check_acl(inode, mask);
				239	if (error != -EAGAIN)
				240	return error;
				241	}
				242
				243	if (in_group_p(inode->i_gid))
				244	mode >>= 3;
				245	}
				246
				247	other_perms:
				248	/*
				249	* If the DACs are ok we don't need any capability check.
				250	*/
				251	if ((mask & ~mode & (MAY_READ \| MAY_WRITE \| MAY_EXEC)) == 0)
				252	return 0;
				253	return -EACCES;
				254	}
				255
				256	/**
				257	* generic_permission - check for access rights on a Posix-like filesystem
				258	* @inode: inode to check access rights for
				259	* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
				260	*
				261	* Used to check for read/write/execute permissions on a file.
				262	* We use "fsuid" for this, letting us set arbitrary permissions
				263	* for filesystem access without changing the "normal" uids which
				264	* are used for other things.
				265	*
				266	* generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
				267	* request cannot be satisfied (eg. requires blocking or too much complexity).
				268	* It would then be called again in ref-walk mode.
				269	*/
				270	int generic_permission(struct inode *inode, int mask)
				271	{
				272	int ret;
				273
				274	/*
				275	* Do the basic permission checks.
				276	*/
				277	ret = acl_permission_check(inode, mask);
				278	if (ret != -EACCES)
				279	return ret;
				280
				281	if (S_ISDIR(inode->i_mode)) {
				282	/* DACs are overridable for directories */
				283	if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE))
				284	return 0;
				285	if (!(mask & MAY_WRITE))
				286	if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH))
				287	return 0;
				288	return -EACCES;
				289	}
				290	/*
				291	* Read/write DACs are always overridable.
				292	* Executable DACs are overridable when there is
				293	* at least one exec bit set.
				294	*/
				295	if (!(mask & MAY_EXEC) \|\| (inode->i_mode & S_IXUGO))
				296	if (ns_capable(inode_userns(inode), CAP_DAC_OVERRIDE))
				297	return 0;
				298
				299	/*
				300	* Searching includes executable on directories, else just read.
				301	*/
				302	mask &= MAY_READ \| MAY_WRITE \| MAY_EXEC;
				303	if (mask == MAY_READ)
				304	if (ns_capable(inode_userns(inode), CAP_DAC_READ_SEARCH))
				305	return 0;
				306
				307	return -EACCES;
				308	}
				309
				310	/*
				311	* We _really_ want to just do "generic_permission()" without
				312	* even looking at the inode->i_op values. So we keep a cache
				313	* flag in inode->i_opflags, that says "this has no special
				314	* permission function, use the fast case".
				315	*/
				316	static inline int do_inode_permission(struct inode *inode, int mask)
				317	{
				318	if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
				319	if (likely(inode->i_op->permission))
				320	return inode->i_op->permission(inode, mask);
				321
				322	/* This gets set once for the inode lifetime */
				323	spin_lock(&inode->i_lock);
				324	inode->i_opflags \|= IOP_FASTPERM;
				325	spin_unlock(&inode->i_lock);
				326	}
				327	return generic_permission(inode, mask);
				328	}
				329
				330	/**
				331	* inode_permission - check for access rights to a given inode
				332	* @inode: inode to check permission on
				333	* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, ...)
				334	*
				335	* Used to check for read/write/execute permissions on an inode.
				336	* We use "fsuid" for this, letting us set arbitrary permissions
				337	* for filesystem access without changing the "normal" uids which
				338	* are used for other things.
				339	*
				340	* When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
				341	*/
				342	int inode_permission(struct inode *inode, int mask)
				343	{
				344	int retval;
				345
				346	if (unlikely(mask & MAY_WRITE)) {
				347	umode_t mode = inode->i_mode;
				348
				349	/*
				350	* Nobody gets write access to a read-only fs.
				351	*/
				352	if (IS_RDONLY(inode) &&
				353	(S_ISREG(mode) \|\| S_ISDIR(mode) \|\| S_ISLNK(mode)))
				354	return -EROFS;
				355
				356	/*
				357	* Nobody gets write access to an immutable file.
				358	*/
				359	if (IS_IMMUTABLE(inode))
				360	return -EACCES;
				361	}
				362
				363	retval = do_inode_permission(inode, mask);
				364	if (retval)
				365	return retval;
				366
				367	retval = devcgroup_inode_permission(inode, mask);
				368	if (retval)
				369	return retval;
				370
				371	return security_inode_permission(inode, mask);
				372	}
				373
				374	/**
				375	* path_get - get a reference to a path
				376	* @path: path to get the reference to
				377	*
				378	* Given a path increment the reference count to the dentry and the vfsmount.
				379	*/
				380	void path_get(struct path *path)
				381	{
				382	mntget(path->mnt);
				383	dget(path->dentry);
				384	}
				385	EXPORT_SYMBOL(path_get);
				386
				387	/**
				388	* path_put - put a reference to a path
				389	* @path: path to put the reference to
				390	*
				391	* Given a path decrement the reference count to the dentry and the vfsmount.
				392	*/
				393	void path_put(struct path *path)
				394	{
				395	dput(path->dentry);
				396	mntput(path->mnt);
				397	}
				398	EXPORT_SYMBOL(path_put);
				399
				400	/**
				401	* path_connected - Verify that a path->dentry is below path->mnt.mnt_root
				402	* @path: path to verify
				403	*
				404	* Rename can sometimes move a file or directory outside of a bind
				405	* mount, path_connected allows those cases to be detected.
				406	*/
				407	static bool path_connected(const struct path *path)
				408	{
				409	struct vfsmount *mnt = path->mnt;
				410
				411	/* Only bind mounts can have disconnected paths */
				412	if (mnt->mnt_root == mnt->mnt_sb->s_root)
				413	return true;
				414
				415	return is_subdir(path->dentry, mnt->mnt_root);
				416	}
				417
				418	/*
				419	* Path walking has 2 modes, rcu-walk and ref-walk (see
				420	* Documentation/filesystems/path-lookup.txt). In situations when we can't
				421	* continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
				422	* normal reference counts on dentries and vfsmounts to transition to ref-walk
				423	* mode. Refcounts are grabbed at the last known good point before rcu-walk
				424	* got stuck, so ref-walk may continue from there. If this is not successful
				425	* (eg. a seqcount has changed), then failure is returned and it's up to caller
				426	* to restart the path walk from the beginning in ref-walk mode.
				427	*/
				428
				429	/**
				430	* unlazy_walk - try to switch to ref-walk mode.
				431	* @nd: nameidata pathwalk data
				432	* @dentry: child of nd->path.dentry or NULL
				433	* Returns: 0 on success, -ECHILD on failure
				434	*
				435	* unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry
				436	* for ref-walk mode. @dentry must be a path found by a do_lookup call on
				437	* @nd or NULL. Must be called from rcu-walk context.
				438	*/
				439	static int unlazy_walk(struct nameidata nd, struct dentry dentry)
				440	{
				441	struct fs_struct *fs = current->fs;
				442	struct dentry *parent = nd->path.dentry;
				443	int want_root = 0;
				444
				445	BUG_ON(!(nd->flags & LOOKUP_RCU));
				446	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
				447	want_root = 1;
				448	spin_lock(&fs->lock);
				449	if (nd->root.mnt != fs->root.mnt \|\|
				450	nd->root.dentry != fs->root.dentry)
				451	goto err_root;
				452	}
				453	spin_lock(&parent->d_lock);
				454	if (!dentry) {
				455	if (!__d_rcu_to_refcount(parent, nd->seq))
				456	goto err_parent;
				457	BUG_ON(nd->inode != parent->d_inode);
				458	} else {
				459	if (dentry->d_parent != parent)
				460	goto err_parent;
				461	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
				462	if (!__d_rcu_to_refcount(dentry, nd->seq))
				463	goto err_child;
				464	/*
				465	* If the sequence check on the child dentry passed, then
				466	* the child has not been removed from its parent. This
				467	* means the parent dentry must be valid and able to take
				468	* a reference at this point.
				469	*/
				470	BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
				471	BUG_ON(!parent->d_count);
				472	parent->d_count++;
				473	spin_unlock(&dentry->d_lock);
				474	}
				475	spin_unlock(&parent->d_lock);
				476	if (want_root) {
				477	path_get(&nd->root);
				478	spin_unlock(&fs->lock);
				479	}
				480	mntget(nd->path.mnt);
				481
				482	rcu_read_unlock();
				483	br_read_unlock(vfsmount_lock);
				484	nd->flags &= ~LOOKUP_RCU;
				485	return 0;
				486
				487	err_child:
				488	spin_unlock(&dentry->d_lock);
				489	err_parent:
				490	spin_unlock(&parent->d_lock);
				491	err_root:
				492	if (want_root)
				493	spin_unlock(&fs->lock);
				494	return -ECHILD;
				495	}
				496
				497	/**
				498	* release_open_intent - free up open intent resources
				499	* @nd: pointer to nameidata
				500	*/
				501	void release_open_intent(struct nameidata *nd)
				502	{
				503	struct file *file = nd->intent.open.file;
				504
				505	if (file && !IS_ERR(file)) {
				506	if (file->f_path.dentry == NULL)
				507	put_filp(file);
				508	else
				509	fput(file);
				510	}
				511	}
				512
				513	static inline int d_revalidate(struct dentry dentry, struct nameidata nd)
				514	{
				515	return dentry->d_op->d_revalidate(dentry, nd);
				516	}
				517
				518	/**
				519	* complete_walk - successful completion of path walk
				520	* @nd: pointer nameidata
				521	*
				522	* If we had been in RCU mode, drop out of it and legitimize nd->path.
				523	* Revalidate the final result, unless we'd already done that during
				524	* the path walk or the filesystem doesn't ask for it. Return 0 on
				525	* success, -error on failure. In case of failure caller does not
				526	* need to drop nd->path.
				527	*/
				528	static int complete_walk(struct nameidata *nd)
				529	{
				530	struct dentry *dentry = nd->path.dentry;
				531	int status;
				532
				533	if (nd->flags & LOOKUP_RCU) {
				534	nd->flags &= ~LOOKUP_RCU;
				535	if (!(nd->flags & LOOKUP_ROOT))
				536	nd->root.mnt = NULL;
				537	spin_lock(&dentry->d_lock);
				538	if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
				539	spin_unlock(&dentry->d_lock);
				540	rcu_read_unlock();
				541	br_read_unlock(vfsmount_lock);
				542	return -ECHILD;
				543	}
				544	BUG_ON(nd->inode != dentry->d_inode);
				545	spin_unlock(&dentry->d_lock);
				546	mntget(nd->path.mnt);
				547	rcu_read_unlock();
				548	br_read_unlock(vfsmount_lock);
				549	}
				550
				551	if (likely(!(nd->flags & LOOKUP_JUMPED)))
				552	return 0;
				553
				554	if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
				555	return 0;
				556
				557	if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
				558	return 0;
				559
				560	/* Note: we do not d_invalidate() */
				561	status = d_revalidate(dentry, nd);
				562	if (status > 0)
				563	return 0;
				564
				565	if (!status)
				566	status = -ESTALE;
				567
				568	path_put(&nd->path);
				569	return status;
				570	}
				571
				572	static __always_inline void set_root(struct nameidata *nd)
				573	{
				574	get_fs_root(current->fs, &nd->root);
				575	}
				576
				577	static int link_path_walk(const char , struct nameidata );
				578
				579	static __always_inline unsigned set_root_rcu(struct nameidata *nd)
				580	{
				581	struct fs_struct *fs = current->fs;
				582	unsigned seq, res;
				583
				584	do {
				585	seq = read_seqcount_begin(&fs->seq);
				586	nd->root = fs->root;
				587	res = __read_seqcount_begin(&nd->root.dentry->d_seq);
				588	} while (read_seqcount_retry(&fs->seq, seq));
				589	return res;
				590	}
				591
				592	static __always_inline int __vfs_follow_link(struct nameidata nd, const char link)
				593	{
				594	int ret;
				595
				596	if (IS_ERR(link))
				597	goto fail;
				598
				599	if (*link == '/') {
				600	if (!nd->root.mnt)
				601	set_root(nd);
				602	path_put(&nd->path);
				603	nd->path = nd->root;
				604	path_get(&nd->root);
				605	nd->flags \|= LOOKUP_JUMPED;
				606	}
				607	nd->inode = nd->path.dentry->d_inode;
				608
				609	ret = link_path_walk(link, nd);
				610	return ret;
				611	fail:
				612	path_put(&nd->path);
				613	return PTR_ERR(link);
				614	}
				615
				616	static void path_put_conditional(struct path path, struct nameidata nd)
				617	{
				618	dput(path->dentry);
				619	if (path->mnt != nd->path.mnt)
				620	mntput(path->mnt);
				621	}
				622
				623	static inline void path_to_nameidata(const struct path *path,
				624	struct nameidata *nd)
				625	{
				626	if (!(nd->flags & LOOKUP_RCU)) {
				627	dput(nd->path.dentry);
				628	if (nd->path.mnt != path->mnt)
				629	mntput(nd->path.mnt);
				630	}
				631	nd->path.mnt = path->mnt;
				632	nd->path.dentry = path->dentry;
				633	}
				634
				635	static inline void put_link(struct nameidata nd, struct path link, void *cookie)
				636	{
				637	struct inode *inode = link->dentry->d_inode;
				638	if (!IS_ERR(cookie) && inode->i_op->put_link)
				639	inode->i_op->put_link(link->dentry, nd, cookie);
				640	path_put(link);
				641	}
				642
				643	static __always_inline int
				644	follow_link(struct path link, struct nameidata nd, void **p)
				645	{
				646	int error;
				647	struct dentry *dentry = link->dentry;
				648
				649	BUG_ON(nd->flags & LOOKUP_RCU);
				650
				651	if (link->mnt == nd->path.mnt)
				652	mntget(link->mnt);
				653
				654	if (unlikely(current->total_link_count >= 40)) {
				655	p = ERR_PTR(-ELOOP); / no ->put_link(), please */
				656	path_put(&nd->path);
				657	return -ELOOP;
				658	}
				659	cond_resched();
				660	current->total_link_count++;
				661
				662	touch_atime(link);
				663	nd_set_link(nd, NULL);
				664
				665	error = security_inode_follow_link(link->dentry, nd);
				666	if (error) {
				667	p = ERR_PTR(error); / no ->put_link(), please */
				668	path_put(&nd->path);
				669	return error;
				670	}
				671
				672	nd->last_type = LAST_BIND;
				673	*p = dentry->d_inode->i_op->follow_link(dentry, nd);
				674	error = PTR_ERR(*p);
				675	if (!IS_ERR(*p)) {
				676	char *s = nd_get_link(nd);
				677	error = 0;
				678	if (s)
				679	error = __vfs_follow_link(nd, s);
				680	else if (nd->last_type == LAST_BIND) {
				681	nd->flags \|= LOOKUP_JUMPED;
				682	nd->inode = nd->path.dentry->d_inode;
				683	if (nd->inode->i_op->follow_link) {
				684	/* stepped on a _really_ weird one */
				685	path_put(&nd->path);
				686	error = -ELOOP;
				687	}
				688	}
				689	}
				690	return error;
				691	}
				692
				693	static int follow_up_rcu(struct path *path)
				694	{
				695	struct mount *mnt = real_mount(path->mnt);
				696	struct mount *parent;
				697	struct dentry *mountpoint;
				698
				699	parent = mnt->mnt_parent;
				700	if (&parent->mnt == path->mnt)
				701	return 0;
				702	mountpoint = mnt->mnt_mountpoint;
				703	path->dentry = mountpoint;
				704	path->mnt = &parent->mnt;
				705	return 1;
				706	}
				707
				708	int follow_up(struct path *path)
				709	{
				710	struct mount *mnt = real_mount(path->mnt);
				711	struct mount *parent;
				712	struct dentry *mountpoint;
				713
				714	br_read_lock(vfsmount_lock);
				715	parent = mnt->mnt_parent;
				716	if (&parent->mnt == path->mnt) {
				717	br_read_unlock(vfsmount_lock);
				718	return 0;
				719	}
				720	mntget(&parent->mnt);
				721	mountpoint = dget(mnt->mnt_mountpoint);
				722	br_read_unlock(vfsmount_lock);
				723	dput(path->dentry);
				724	path->dentry = mountpoint;
				725	mntput(path->mnt);
				726	path->mnt = &parent->mnt;
				727	return 1;
				728	}
				729
				730	/*
				731	* Perform an automount
				732	* - return -EISDIR to tell follow_managed() to stop and return the path we
				733	* were called with.
				734	*/
				735	static int follow_automount(struct path *path, unsigned flags,
				736	bool *need_mntput)
				737	{
				738	struct vfsmount *mnt;
				739	int err;
				740
				741	if (!path->dentry->d_op \|\| !path->dentry->d_op->d_automount)
				742	return -EREMOTE;
				743
				744	/* We don't want to mount if someone's just doing a stat -
				745	* unless they're stat'ing a directory and appended a '/' to
				746	* the name.
				747	*
				748	* We do, however, want to mount if someone wants to open or
				749	* create a file of any type under the mountpoint, wants to
				750	* traverse through the mountpoint or wants to open the
				751	* mounted directory. Also, autofs may mark negative dentries
				752	* as being automount points. These will need the attentions
				753	* of the daemon to instantiate them before they can be used.
				754	*/
				755	if (!(flags & (LOOKUP_PARENT \| LOOKUP_DIRECTORY \|
				756	LOOKUP_OPEN \| LOOKUP_CREATE \| LOOKUP_AUTOMOUNT)) &&
				757	path->dentry->d_inode)
				758	return -EISDIR;
				759
				760	current->total_link_count++;
				761	if (current->total_link_count >= 40)
				762	return -ELOOP;
				763
				764	mnt = path->dentry->d_op->d_automount(path);
				765	if (IS_ERR(mnt)) {
				766	/*
				767	* The filesystem is allowed to return -EISDIR here to indicate
				768	* it doesn't want to automount. For instance, autofs would do
				769	* this so that its userspace daemon can mount on this dentry.
				770	*
				771	* However, we can only permit this if it's a terminal point in
				772	* the path being looked up; if it wasn't then the remainder of
				773	* the path is inaccessible and we should say so.
				774	*/
				775	if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_PARENT))
				776	return -EREMOTE;
				777	return PTR_ERR(mnt);
				778	}
				779
				780	if (!mnt) /* mount collision */
				781	return 0;
				782
				783	if (!*need_mntput) {
				784	/* lock_mount() may release path->mnt on error */
				785	mntget(path->mnt);
				786	*need_mntput = true;
				787	}
				788	err = finish_automount(mnt, path);
				789
				790	switch (err) {
				791	case -EBUSY:
				792	/* Someone else made a mount here whilst we were busy */
				793	return 0;
				794	case 0:
				795	path_put(path);
				796	path->mnt = mnt;
				797	path->dentry = dget(mnt->mnt_root);
				798	return 0;
				799	default:
				800	return err;
				801	}
				802
				803	}
				804
				805	/*
				806	* Handle a dentry that is managed in some way.
				807	* - Flagged for transit management (autofs)
				808	* - Flagged as mountpoint
				809	* - Flagged as automount point
				810	*
				811	* This may only be called in refwalk mode.
				812	*
				813	* Serialization is taken care of in namespace.c
				814	*/
				815	static int follow_managed(struct path *path, unsigned flags)
				816	{
				817	struct vfsmount mnt = path->mnt; / held by caller, must be left alone */
				818	unsigned managed;
				819	bool need_mntput = false;
				820	int ret = 0;
				821
				822	/* Given that we're not holding a lock here, we retain the value in a
				823	* local variable for each dentry as we look at it so that we don't see
				824	* the components of that value change under us */
				825	while (managed = ACCESS_ONCE(path->dentry->d_flags),
				826	managed &= DCACHE_MANAGED_DENTRY,
				827	unlikely(managed != 0)) {
				828	/* Allow the filesystem to manage the transit without i_mutex
				829	* being held. */
				830	if (managed & DCACHE_MANAGE_TRANSIT) {
				831	BUG_ON(!path->dentry->d_op);
				832	BUG_ON(!path->dentry->d_op->d_manage);
				833	ret = path->dentry->d_op->d_manage(path->dentry, false);
				834	if (ret < 0)
				835	break;
				836	}
				837
				838	/* Transit to a mounted filesystem. */
				839	if (managed & DCACHE_MOUNTED) {
				840	struct vfsmount *mounted = lookup_mnt(path);
				841	if (mounted) {
				842	dput(path->dentry);
				843	if (need_mntput)
				844	mntput(path->mnt);
				845	path->mnt = mounted;
				846	path->dentry = dget(mounted->mnt_root);
				847	need_mntput = true;
				848	continue;
				849	}
				850
				851	/* Something is mounted on this dentry in another
				852	* namespace and/or whatever was mounted there in this
				853	* namespace got unmounted before we managed to get the
				854	* vfsmount_lock */
				855	}
				856
				857	/* Handle an automount point */
				858	if (managed & DCACHE_NEED_AUTOMOUNT) {
				859	ret = follow_automount(path, flags, &need_mntput);
				860	if (ret < 0)
				861	break;
				862	continue;
				863	}
				864
				865	/* We didn't change the current path point */
				866	break;
				867	}
				868
				869	if (need_mntput && path->mnt == mnt)
				870	mntput(path->mnt);
				871	if (ret == -EISDIR)
				872	ret = 0;
				873	return ret < 0 ? ret : need_mntput;
				874	}
				875
				876	int follow_down_one(struct path *path)
				877	{
				878	struct vfsmount *mounted;
				879
				880	mounted = lookup_mnt(path);
				881	if (mounted) {
				882	dput(path->dentry);
				883	mntput(path->mnt);
				884	path->mnt = mounted;
				885	path->dentry = dget(mounted->mnt_root);
				886	return 1;
				887	}
				888	return 0;
				889	}
				890
				891	static inline bool managed_dentry_might_block(struct dentry *dentry)
				892	{
				893	return (dentry->d_flags & DCACHE_MANAGE_TRANSIT &&
				894	dentry->d_op->d_manage(dentry, true) < 0);
				895	}
				896
				897	/*
				898	* Try to skip to top of mountpoint pile in rcuwalk mode. Fail if
				899	* we meet a managed dentry that would need blocking.
				900	*/
				901	static bool __follow_mount_rcu(struct nameidata nd, struct path path,
				902	struct inode **inode)
				903	{
				904	for (;;) {
				905	struct mount *mounted;
				906	/*
				907	* Don't forget we might have a non-mountpoint managed dentry
				908	* that wants to block transit.
				909	*/
				910	if (unlikely(managed_dentry_might_block(path->dentry)))
				911	return false;
				912
				913	if (!d_mountpoint(path->dentry))
				914	break;
				915
				916	mounted = __lookup_mnt(path->mnt, path->dentry, 1);
				917	if (!mounted)
				918	break;
				919	path->mnt = &mounted->mnt;
				920	path->dentry = mounted->mnt.mnt_root;
				921	nd->flags \|= LOOKUP_JUMPED;
				922	nd->seq = read_seqcount_begin(&path->dentry->d_seq);
				923	/*
				924	* Update the inode too. We don't need to re-check the
				925	* dentry sequence number here after this d_inode read,
				926	* because a mount-point is always pinned.
				927	*/
				928	*inode = path->dentry->d_inode;
				929	}
				930	return true;
				931	}
				932
				933	static void follow_mount_rcu(struct nameidata *nd)
				934	{
				935	while (d_mountpoint(nd->path.dentry)) {
				936	struct mount *mounted;
				937	mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry, 1);
				938	if (!mounted)
				939	break;
				940	nd->path.mnt = &mounted->mnt;
				941	nd->path.dentry = mounted->mnt.mnt_root;
				942	nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
				943	}
				944	}
				945
				946	static int follow_dotdot_rcu(struct nameidata *nd)
				947	{
				948	if (!nd->root.mnt)
				949	set_root_rcu(nd);
				950
				951	while (1) {
				952	if (nd->path.dentry == nd->root.dentry &&
				953	nd->path.mnt == nd->root.mnt) {
				954	break;
				955	}
				956	if (nd->path.dentry != nd->path.mnt->mnt_root) {
				957	struct dentry *old = nd->path.dentry;
				958	struct dentry *parent = old->d_parent;
				959	unsigned seq;
				960
				961	seq = read_seqcount_begin(&parent->d_seq);
				962	if (read_seqcount_retry(&old->d_seq, nd->seq))
				963	goto failed;
				964	nd->path.dentry = parent;
				965	nd->seq = seq;
				966	if (unlikely(!path_connected(&nd->path)))
				967	goto failed;
				968	break;
				969	}
				970	if (!follow_up_rcu(&nd->path))
				971	break;
				972	nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
				973	}
				974	follow_mount_rcu(nd);
				975	nd->inode = nd->path.dentry->d_inode;
				976	return 0;
				977
				978	failed:
				979	nd->flags &= ~LOOKUP_RCU;
				980	if (!(nd->flags & LOOKUP_ROOT))
				981	nd->root.mnt = NULL;
				982	rcu_read_unlock();
				983	br_read_unlock(vfsmount_lock);
				984	return -ECHILD;
				985	}
				986
				987	/*
				988	* Follow down to the covering mount currently visible to userspace. At each
				989	* point, the filesystem owning that dentry may be queried as to whether the
				990	* caller is permitted to proceed or not.
				991	*/
				992	int follow_down(struct path *path)
				993	{
				994	unsigned managed;
				995	int ret;
				996
				997	while (managed = ACCESS_ONCE(path->dentry->d_flags),
				998	unlikely(managed & DCACHE_MANAGED_DENTRY)) {
				999	/* Allow the filesystem to manage the transit without i_mutex
				1000	* being held.
				1001	*
				1002	* We indicate to the filesystem if someone is trying to mount
				1003	* something here. This gives autofs the chance to deny anyone
				1004	* other than its daemon the right to mount on its
				1005	* superstructure.
				1006	*
				1007	* The filesystem may sleep at this point.
				1008	*/
				1009	if (managed & DCACHE_MANAGE_TRANSIT) {
				1010	BUG_ON(!path->dentry->d_op);
				1011	BUG_ON(!path->dentry->d_op->d_manage);
				1012	ret = path->dentry->d_op->d_manage(
				1013	path->dentry, false);
				1014	if (ret < 0)
				1015	return ret == -EISDIR ? 0 : ret;
				1016	}
				1017
				1018	/* Transit to a mounted filesystem. */
				1019	if (managed & DCACHE_MOUNTED) {
				1020	struct vfsmount *mounted = lookup_mnt(path);
				1021	if (!mounted)
				1022	break;
				1023	dput(path->dentry);
				1024	mntput(path->mnt);
				1025	path->mnt = mounted;
				1026	path->dentry = dget(mounted->mnt_root);
				1027	continue;
				1028	}
				1029
				1030	/* Don't handle automount points here */
				1031	break;
				1032	}
				1033	return 0;
				1034	}
				1035
				1036	/*
				1037	* Skip to top of mountpoint pile in refwalk mode for follow_dotdot()
				1038	*/
				1039	static void follow_mount(struct path *path)
				1040	{
				1041	while (d_mountpoint(path->dentry)) {
				1042	struct vfsmount *mounted = lookup_mnt(path);
				1043	if (!mounted)
				1044	break;
				1045	dput(path->dentry);
				1046	mntput(path->mnt);
				1047	path->mnt = mounted;
				1048	path->dentry = dget(mounted->mnt_root);
				1049	}
				1050	}
				1051
				1052	static int follow_dotdot(struct nameidata *nd)
				1053	{
				1054	if (!nd->root.mnt)
				1055	set_root(nd);
				1056
				1057	while(1) {
				1058	struct dentry *old = nd->path.dentry;
				1059
				1060	if (nd->path.dentry == nd->root.dentry &&
				1061	nd->path.mnt == nd->root.mnt) {
				1062	break;
				1063	}
				1064	if (nd->path.dentry != nd->path.mnt->mnt_root) {
				1065	/* rare case of legitimate dget_parent()... */
				1066	nd->path.dentry = dget_parent(nd->path.dentry);
				1067	dput(old);
				1068	if (unlikely(!path_connected(&nd->path))) {
				1069	path_put(&nd->path);
				1070	return -ENOENT;
				1071	}
				1072	break;
				1073	}
				1074	if (!follow_up(&nd->path))
				1075	break;
				1076	}
				1077	follow_mount(&nd->path);
				1078	nd->inode = nd->path.dentry->d_inode;
				1079	return 0;
				1080	}
				1081
				1082	/*
				1083	* This looks up the name in dcache, possibly revalidates the old dentry and
				1084	* allocates a new one if not found or not valid. In the need_lookup argument
				1085	* returns whether i_op->lookup is necessary.
				1086	*
				1087	* dir->d_inode->i_mutex must be held
				1088	*/
				1089	static struct dentry lookup_dcache(struct qstr name, struct dentry *dir,
				1090	struct nameidata nd, bool need_lookup)
				1091	{
				1092	struct dentry *dentry;
				1093	int error;
				1094
				1095	*need_lookup = false;
				1096	dentry = d_lookup(dir, name);
				1097	if (dentry) {
				1098	if (d_need_lookup(dentry)) {
				1099	*need_lookup = true;
				1100	} else if (dentry->d_flags & DCACHE_OP_REVALIDATE) {
				1101	error = d_revalidate(dentry, nd);
				1102	if (unlikely(error <= 0)) {
				1103	if (error < 0) {
				1104	dput(dentry);
				1105	return ERR_PTR(error);
				1106	} else if (!d_invalidate(dentry)) {
				1107	dput(dentry);
				1108	dentry = NULL;
				1109	}
				1110	}
				1111	}
				1112	}
				1113
				1114	if (!dentry) {
				1115	dentry = d_alloc(dir, name);
				1116	if (unlikely(!dentry))
				1117	return ERR_PTR(-ENOMEM);
				1118
				1119	*need_lookup = true;
				1120	}
				1121	return dentry;
				1122	}
				1123
				1124	/*
				1125	* Call i_op->lookup on the dentry. The dentry must be negative but may be
				1126	* hashed if it was pouplated with DCACHE_NEED_LOOKUP.
				1127	*
				1128	* dir->d_inode->i_mutex must be held
				1129	*/
				1130	static struct dentry lookup_real(struct inode dir, struct dentry *dentry,
				1131	struct nameidata *nd)
				1132	{
				1133	struct dentry *old;
				1134
				1135	/* Don't create child dentry for a dead directory. */
				1136	if (unlikely(IS_DEADDIR(dir))) {
				1137	dput(dentry);
				1138	return ERR_PTR(-ENOENT);
				1139	}
				1140
				1141	old = dir->i_op->lookup(dir, dentry, nd);
				1142	if (unlikely(old)) {
				1143	dput(dentry);
				1144	dentry = old;
				1145	}
				1146	return dentry;
				1147	}
				1148
				1149	static struct dentry __lookup_hash(struct qstr name,
				1150	struct dentry base, struct nameidata nd)
				1151	{
				1152	bool need_lookup;
				1153	struct dentry *dentry;
				1154
				1155	dentry = lookup_dcache(name, base, nd, &need_lookup);
				1156	if (!need_lookup)
				1157	return dentry;
				1158
				1159	return lookup_real(base->d_inode, dentry, nd);
				1160	}
				1161
				1162	/*
				1163	* It's more convoluted than I'd like it to be, but... it's still fairly
				1164	* small and for now I'd prefer to have fast path as straight as possible.
				1165	* It _is_ time-critical.
				1166	*/
				1167	static int do_lookup(struct nameidata nd, struct qstr name,
				1168	struct path path, struct inode *inode)
				1169	{
				1170	struct vfsmount *mnt = nd->path.mnt;
				1171	struct dentry dentry, parent = nd->path.dentry;
				1172	int need_reval = 1;
				1173	int status = 1;
				1174	int err;
				1175
				1176	/*
				1177	* Rename seqlock is not required here because in the off chance
				1178	* of a false negative due to a concurrent rename, we're going to
				1179	* do the non-racy lookup, below.
				1180	*/
				1181	if (nd->flags & LOOKUP_RCU) {
				1182	unsigned seq;
				1183	*inode = nd->inode;
				1184	dentry = __d_lookup_rcu(parent, name, &seq, inode);
				1185	if (!dentry)
				1186	goto unlazy;
				1187
				1188	/* Memory barrier in read_seqcount_begin of child is enough */
				1189	if (__read_seqcount_retry(&parent->d_seq, nd->seq))
				1190	return -ECHILD;
				1191	nd->seq = seq;
				1192
				1193	if (unlikely(d_need_lookup(dentry)))
				1194	goto unlazy;
				1195	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
				1196	status = d_revalidate(dentry, nd);
				1197	if (unlikely(status <= 0)) {
				1198	if (status != -ECHILD)
				1199	need_reval = 0;
				1200	goto unlazy;
				1201	}
				1202	}
				1203	path->mnt = mnt;
				1204	path->dentry = dentry;
				1205	if (unlikely(!__follow_mount_rcu(nd, path, inode)))
				1206	goto unlazy;
				1207	if (unlikely(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
				1208	goto unlazy;
				1209	return 0;
				1210	unlazy:
				1211	if (unlazy_walk(nd, dentry))
				1212	return -ECHILD;
				1213	} else {
				1214	dentry = __d_lookup(parent, name);
				1215	}
				1216
				1217	if (unlikely(!dentry))
				1218	goto need_lookup;
				1219
				1220	if (unlikely(d_need_lookup(dentry))) {
				1221	dput(dentry);
				1222	goto need_lookup;
				1223	}
				1224
				1225	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
				1226	status = d_revalidate(dentry, nd);
				1227	if (unlikely(status <= 0)) {
				1228	if (status < 0) {
				1229	dput(dentry);
				1230	return status;
				1231	}
				1232	if (!d_invalidate(dentry)) {
				1233	dput(dentry);
				1234	goto need_lookup;
				1235	}
				1236	}
				1237	done:
				1238	path->mnt = mnt;
				1239	path->dentry = dentry;
				1240	err = follow_managed(path, nd->flags);
				1241	if (unlikely(err < 0)) {
				1242	path_put_conditional(path, nd);
				1243	return err;
				1244	}
				1245	if (err)
				1246	nd->flags \|= LOOKUP_JUMPED;
				1247	*inode = path->dentry->d_inode;
				1248	return 0;
				1249
				1250	need_lookup:
				1251	BUG_ON(nd->inode != parent->d_inode);
				1252
				1253	mutex_lock(&parent->d_inode->i_mutex);
				1254	dentry = __lookup_hash(name, parent, nd);
				1255	mutex_unlock(&parent->d_inode->i_mutex);
				1256	if (IS_ERR(dentry))
				1257	return PTR_ERR(dentry);
				1258	goto done;
				1259	}
				1260
				1261	static inline int may_lookup(struct nameidata *nd)
				1262	{
				1263	if (nd->flags & LOOKUP_RCU) {
				1264	int err = inode_permission(nd->inode, MAY_EXEC\|MAY_NOT_BLOCK);
				1265	if (err != -ECHILD)
				1266	return err;
				1267	if (unlazy_walk(nd, NULL))
				1268	return -ECHILD;
				1269	}
				1270	return inode_permission(nd->inode, MAY_EXEC);
				1271	}
				1272
				1273	static inline int handle_dots(struct nameidata *nd, int type)
				1274	{
				1275	if (type == LAST_DOTDOT) {
				1276	if (nd->flags & LOOKUP_RCU) {
				1277	if (follow_dotdot_rcu(nd))
				1278	return -ECHILD;
				1279	} else
				1280	return follow_dotdot(nd);
				1281	}
				1282	return 0;
				1283	}
				1284
				1285	static void terminate_walk(struct nameidata *nd)
				1286	{
				1287	if (!(nd->flags & LOOKUP_RCU)) {
				1288	path_put(&nd->path);
				1289	} else {
				1290	nd->flags &= ~LOOKUP_RCU;
				1291	if (!(nd->flags & LOOKUP_ROOT))
				1292	nd->root.mnt = NULL;
				1293	rcu_read_unlock();
				1294	br_read_unlock(vfsmount_lock);
				1295	}
				1296	}
				1297
				1298	/*
				1299	* Do we need to follow links? We _really_ want to be able
				1300	* to do this check without having to look at inode->i_op,
				1301	* so we keep a cache of "no, this doesn't need follow_link"
				1302	* for the common case.
				1303	*/
				1304	static inline int should_follow_link(struct inode *inode, int follow)
				1305	{
				1306	if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
				1307	if (likely(inode->i_op->follow_link))
				1308	return follow;
				1309
				1310	/* This gets set once for the inode lifetime */
				1311	spin_lock(&inode->i_lock);
				1312	inode->i_opflags \|= IOP_NOFOLLOW;
				1313	spin_unlock(&inode->i_lock);
				1314	}
				1315	return 0;
				1316	}
				1317
				1318	static inline int walk_component(struct nameidata nd, struct path path,
				1319	struct qstr *name, int type, int follow)
				1320	{
				1321	struct inode *inode;
				1322	int err;
				1323	/*
				1324	* "." and ".." are special - ".." especially so because it has
				1325	* to be able to know about the current root directory and
				1326	* parent relationships.
				1327	*/
				1328	if (unlikely(type != LAST_NORM))
				1329	return handle_dots(nd, type);
				1330	err = do_lookup(nd, name, path, &inode);
				1331	if (unlikely(err)) {
				1332	terminate_walk(nd);
				1333	return err;
				1334	}
				1335	if (!inode) {
				1336	path_to_nameidata(path, nd);
				1337	terminate_walk(nd);
				1338	return -ENOENT;
				1339	}
				1340	if (should_follow_link(inode, follow)) {
				1341	if (nd->flags & LOOKUP_RCU) {
				1342	if (unlikely(nd->path.mnt != path->mnt \|\|
				1343	unlazy_walk(nd, path->dentry))) {
				1344	terminate_walk(nd);
				1345	return -ECHILD;
				1346	}
				1347	}
				1348	BUG_ON(inode != path->dentry->d_inode);
				1349	return 1;
				1350	}
				1351	path_to_nameidata(path, nd);
				1352	nd->inode = inode;
				1353	return 0;
				1354	}
				1355
				1356	/*
				1357	* This limits recursive symlink follows to 8, while
				1358	* limiting consecutive symlinks to 40.
				1359	*
				1360	* Without that kind of total limit, nasty chains of consecutive
				1361	* symlinks can cause almost arbitrarily long lookups.
				1362	*/
				1363	static inline int nested_symlink(struct path path, struct nameidata nd)
				1364	{
				1365	int res;
				1366
				1367	if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
				1368	path_put_conditional(path, nd);
				1369	path_put(&nd->path);
				1370	return -ELOOP;
				1371	}
				1372	BUG_ON(nd->depth >= MAX_NESTED_LINKS);
				1373
				1374	nd->depth++;
				1375	current->link_count++;
				1376
				1377	do {
				1378	struct path link = *path;
				1379	void *cookie;
				1380
				1381	res = follow_link(&link, nd, &cookie);
				1382	if (!res)
				1383	res = walk_component(nd, path, &nd->last,
				1384	nd->last_type, LOOKUP_FOLLOW);
				1385	put_link(nd, &link, cookie);
				1386	} while (res > 0);
				1387
				1388	current->link_count--;
				1389	nd->depth--;
				1390	return res;
				1391	}
				1392
				1393	/*
				1394	* We really don't want to look at inode->i_op->lookup
				1395	* when we don't have to. So we keep a cache bit in
				1396	* the inode ->i_opflags field that says "yes, we can
				1397	* do lookup on this inode".
				1398	*/
				1399	static inline int can_lookup(struct inode *inode)
				1400	{
				1401	if (likely(inode->i_opflags & IOP_LOOKUP))
				1402	return 1;
				1403	if (likely(!inode->i_op->lookup))
				1404	return 0;
				1405
				1406	/* We do this once for the lifetime of the inode */
				1407	spin_lock(&inode->i_lock);
				1408	inode->i_opflags \|= IOP_LOOKUP;
				1409	spin_unlock(&inode->i_lock);
				1410	return 1;
				1411	}
				1412
				1413	/*
				1414	* We can do the critical dentry name comparison and hashing
				1415	* operations one word at a time, but we are limited to:
				1416	*
				1417	* - Architectures with fast unaligned word accesses. We could
				1418	* do a "get_unaligned()" if this helps and is sufficiently
				1419	* fast.
				1420	*
				1421	* - Little-endian machines (so that we can generate the mask
				1422	* of low bytes efficiently). Again, we could do a byte
				1423	* swapping load on big-endian architectures if that is not
				1424	* expensive enough to make the optimization worthless.
				1425	*
				1426	* - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
				1427	* do not trap on the (extremely unlikely) case of a page
				1428	* crossing operation.
				1429	*
				1430	* - Furthermore, we need an efficient 64-bit compile for the
				1431	* 64-bit case in order to generate the "number of bytes in
				1432	* the final mask". Again, that could be replaced with a
				1433	* efficient population count instruction or similar.
				1434	*/
				1435	#ifdef CONFIG_DCACHE_WORD_ACCESS
				1436
				1437	#include <asm/word-at-a-time.h>
				1438
				1439	#ifdef CONFIG_64BIT
				1440
				1441	static inline unsigned int fold_hash(unsigned long hash)
				1442	{
				1443	hash += hash >> (8*sizeof(int));
				1444	return hash;
				1445	}
				1446
				1447	#else /* 32-bit case */
				1448
				1449	#define fold_hash(x) (x)
				1450
				1451	#endif
				1452
				1453	unsigned int full_name_hash(const unsigned char *name, unsigned int len)
				1454	{
				1455	unsigned long a, mask;
				1456	unsigned long hash = 0;
				1457
				1458	for (;;) {
				1459	a = load_unaligned_zeropad(name);
				1460	if (len < sizeof(unsigned long))
				1461	break;
				1462	hash += a;
				1463	hash *= 9;
				1464	name += sizeof(unsigned long);
				1465	len -= sizeof(unsigned long);
				1466	if (!len)
				1467	goto done;
				1468	}
				1469	mask = ~(~0ul << len*8);
				1470	hash += mask & a;
				1471	done:
				1472	return fold_hash(hash);
				1473	}
				1474	EXPORT_SYMBOL(full_name_hash);
				1475
				1476	/*
				1477	* Calculate the length and hash of the path component, and
				1478	* return the length of the component;
				1479	*/
				1480	static inline unsigned long hash_name(const char name, unsigned int hashp)
				1481	{
				1482	unsigned long a, mask, hash, len;
				1483
				1484	hash = a = 0;
				1485	len = -sizeof(unsigned long);
				1486	do {
				1487	hash = (hash + a) * 9;
				1488	len += sizeof(unsigned long);
				1489	a = load_unaligned_zeropad(name+len);
				1490	/* Do we have any NUL or '/' bytes in this word? */
				1491	mask = has_zero(a) \| has_zero(a ^ REPEAT_BYTE('/'));
				1492	} while (!mask);
				1493
				1494	/* The mask below the first high bit set */
				1495	mask = (mask - 1) & ~mask;
				1496	mask >>= 7;
				1497	hash += a & mask;
				1498	*hashp = fold_hash(hash);
				1499
				1500	return len + count_masked_bytes(mask);
				1501	}
				1502
				1503	#else
				1504
				1505	unsigned int full_name_hash(const unsigned char *name, unsigned int len)
				1506	{
				1507	unsigned long hash = init_name_hash();
				1508	while (len--)
				1509	hash = partial_name_hash(*name++, hash);
				1510	return end_name_hash(hash);
				1511	}
				1512	EXPORT_SYMBOL(full_name_hash);
				1513
				1514	/*
				1515	* We know there's a real path component here of at least
				1516	* one character.
				1517	*/
				1518	static inline unsigned long hash_name(const char name, unsigned int hashp)
				1519	{
				1520	unsigned long hash = init_name_hash();
				1521	unsigned long len = 0, c;
				1522
				1523	c = (unsigned char)*name;
				1524	do {
				1525	len++;
				1526	hash = partial_name_hash(c, hash);
				1527	c = (unsigned char)name[len];
				1528	} while (c && c != '/');
				1529	*hashp = end_name_hash(hash);
				1530	return len;
				1531	}
				1532
				1533	#endif
				1534
				1535	/*
				1536	* Name resolution.
				1537	* This is the basic name resolution function, turning a pathname into
				1538	* the final dentry. We expect 'base' to be positive and a directory.
				1539	*
				1540	* Returns 0 and nd will have valid dentry and mnt on success.
				1541	* Returns error and drops reference to input namei data on failure.
				1542	*/
				1543	static int link_path_walk(const char name, struct nameidata nd)
				1544	{
				1545	struct path next;
				1546	int err;
				1547
				1548	while (*name=='/')
				1549	name++;
				1550	if (!*name)
				1551	return 0;
				1552
				1553	/* At this point we know we have a real path component. */
				1554	for(;;) {
				1555	struct qstr this;
				1556	long len;
				1557	int type;
				1558
				1559	err = may_lookup(nd);
				1560	if (err)
				1561	break;
				1562
				1563	len = hash_name(name, &this.hash);
				1564	this.name = name;
				1565	this.len = len;
				1566
				1567	type = LAST_NORM;
				1568	if (name[0] == '.') switch (len) {
				1569	case 2:
				1570	if (name[1] == '.') {
				1571	type = LAST_DOTDOT;
				1572	nd->flags \|= LOOKUP_JUMPED;
				1573	}
				1574	break;
				1575	case 1:
				1576	type = LAST_DOT;
				1577	}
				1578	if (likely(type == LAST_NORM)) {
				1579	struct dentry *parent = nd->path.dentry;
				1580	nd->flags &= ~LOOKUP_JUMPED;
				1581	if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
				1582	err = parent->d_op->d_hash(parent, nd->inode,
				1583	&this);
				1584	if (err < 0)
				1585	break;
				1586	}
				1587	}
				1588
				1589	if (!name[len])
				1590	goto last_component;
				1591	/*
				1592	* If it wasn't NUL, we know it was '/'. Skip that
				1593	* slash, and continue until no more slashes.
				1594	*/
				1595	do {
				1596	len++;
				1597	} while (unlikely(name[len] == '/'));
				1598	if (!name[len])
				1599	goto last_component;
				1600	name += len;
				1601
				1602	err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
				1603	if (err < 0)
				1604	return err;
				1605
				1606	if (err) {
				1607	err = nested_symlink(&next, nd);
				1608	if (err)
				1609	return err;
				1610	}
				1611	if (can_lookup(nd->inode))
				1612	continue;
				1613	err = -ENOTDIR;
				1614	break;
				1615	/* here ends the main loop */
				1616
				1617	last_component:
				1618	nd->last = this;
				1619	nd->last_type = type;
				1620	return 0;
				1621	}
				1622	terminate_walk(nd);
				1623	return err;
				1624	}
				1625
				1626	static int path_init(int dfd, const char *name, unsigned int flags,
				1627	struct nameidata nd, struct file *fp)
				1628	{
				1629	int retval = 0;
				1630	int fput_needed;
				1631	struct file *file;
				1632
				1633	nd->last_type = LAST_ROOT; /* if there are only slashes... */
				1634	nd->flags = flags \| LOOKUP_JUMPED;
				1635	nd->depth = 0;
				1636	if (flags & LOOKUP_ROOT) {
				1637	struct inode *inode = nd->root.dentry->d_inode;
				1638	if (*name) {
				1639	if (!inode->i_op->lookup)
				1640	return -ENOTDIR;
				1641	retval = inode_permission(inode, MAY_EXEC);
				1642	if (retval)
				1643	return retval;
				1644	}
				1645	nd->path = nd->root;
				1646	nd->inode = inode;
				1647	if (flags & LOOKUP_RCU) {
				1648	br_read_lock(vfsmount_lock);
				1649	rcu_read_lock();
				1650	nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
				1651	} else {
				1652	path_get(&nd->path);
				1653	}
				1654	return 0;
				1655	}
				1656
				1657	nd->root.mnt = NULL;
				1658
				1659	if (*name=='/') {
				1660	if (flags & LOOKUP_RCU) {
				1661	br_read_lock(vfsmount_lock);
				1662	rcu_read_lock();
				1663	nd->seq = set_root_rcu(nd);
				1664	} else {
				1665	set_root(nd);
				1666	path_get(&nd->root);
				1667	}
				1668	nd->path = nd->root;
				1669	} else if (dfd == AT_FDCWD) {
				1670	if (flags & LOOKUP_RCU) {
				1671	struct fs_struct *fs = current->fs;
				1672	unsigned seq;
				1673
				1674	br_read_lock(vfsmount_lock);
				1675	rcu_read_lock();
				1676
				1677	do {
				1678	seq = read_seqcount_begin(&fs->seq);
				1679	nd->path = fs->pwd;
				1680	nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
				1681	} while (read_seqcount_retry(&fs->seq, seq));
				1682	} else {
				1683	get_fs_pwd(current->fs, &nd->path);
				1684	}
				1685	} else {
				1686	struct dentry *dentry;
				1687
				1688	file = fget_raw_light(dfd, &fput_needed);
				1689	retval = -EBADF;
				1690	if (!file)
				1691	goto out_fail;
				1692
				1693	dentry = file->f_path.dentry;
				1694
				1695	if (*name) {
				1696	retval = -ENOTDIR;
				1697	if (!S_ISDIR(dentry->d_inode->i_mode))
				1698	goto fput_fail;
				1699
				1700	retval = inode_permission(dentry->d_inode, MAY_EXEC);
				1701	if (retval)
				1702	goto fput_fail;
				1703	}
				1704
				1705	nd->path = file->f_path;
				1706	if (flags & LOOKUP_RCU) {
				1707	if (fput_needed)
				1708	*fp = file;
				1709	nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
				1710	br_read_lock(vfsmount_lock);
				1711	rcu_read_lock();
				1712	} else {
				1713	path_get(&file->f_path);
				1714	fput_light(file, fput_needed);
				1715	}
				1716	}
				1717
				1718	nd->inode = nd->path.dentry->d_inode;
				1719	return 0;
				1720
				1721	fput_fail:
				1722	fput_light(file, fput_needed);
				1723	out_fail:
				1724	return retval;
				1725	}
				1726
				1727	static inline int lookup_last(struct nameidata nd, struct path path)
				1728	{
				1729	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
				1730	nd->flags \|= LOOKUP_FOLLOW \| LOOKUP_DIRECTORY;
				1731
				1732	nd->flags &= ~LOOKUP_PARENT;
				1733	return walk_component(nd, path, &nd->last, nd->last_type,
				1734	nd->flags & LOOKUP_FOLLOW);
				1735	}
				1736
				1737	/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
				1738	static int path_lookupat(int dfd, const char *name,
				1739	unsigned int flags, struct nameidata *nd)
				1740	{
				1741	struct file *base = NULL;
				1742	struct path path;
				1743	int err;
				1744
				1745	/*
				1746	* Path walking is largely split up into 2 different synchronisation
				1747	* schemes, rcu-walk and ref-walk (explained in
				1748	* Documentation/filesystems/path-lookup.txt). These share much of the
				1749	* path walk code, but some things particularly setup, cleanup, and
				1750	* following mounts are sufficiently divergent that functions are
				1751	* duplicated. Typically there is a function foo(), and its RCU
				1752	* analogue, foo_rcu().
				1753	*
				1754	* -ECHILD is the error number of choice (just to avoid clashes) that
				1755	* is returned if some aspect of an rcu-walk fails. Such an error must
				1756	* be handled by restarting a traditional ref-walk (which will always
				1757	* be able to complete).
				1758	*/
				1759	err = path_init(dfd, name, flags \| LOOKUP_PARENT, nd, &base);
				1760
				1761	if (unlikely(err))
				1762	return err;
				1763
				1764	current->total_link_count = 0;
				1765	err = link_path_walk(name, nd);
				1766
				1767	if (!err && !(flags & LOOKUP_PARENT)) {
				1768	err = lookup_last(nd, &path);
				1769	while (err > 0) {
				1770	void *cookie;
				1771	struct path link = path;
				1772	nd->flags \|= LOOKUP_PARENT;
				1773	err = follow_link(&link, nd, &cookie);
				1774	if (!err)
				1775	err = lookup_last(nd, &path);
				1776	put_link(nd, &link, cookie);
				1777	}
				1778	}
				1779
				1780	if (!err)
				1781	err = complete_walk(nd);
				1782
				1783	if (!err && nd->flags & LOOKUP_DIRECTORY) {
				1784	if (!nd->inode->i_op->lookup) {
				1785	path_put(&nd->path);
				1786	err = -ENOTDIR;
				1787	}
				1788	}
				1789
				1790	if (base)
				1791	fput(base);
				1792
				1793	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
				1794	path_put(&nd->root);
				1795	nd->root.mnt = NULL;
				1796	}
				1797	return err;
				1798	}
				1799
				1800	static int do_path_lookup(int dfd, const char *name,
				1801	unsigned int flags, struct nameidata *nd)
				1802	{
				1803	int retval = path_lookupat(dfd, name, flags \| LOOKUP_RCU, nd);
				1804	if (unlikely(retval == -ECHILD))
				1805	retval = path_lookupat(dfd, name, flags, nd);
				1806	if (unlikely(retval == -ESTALE))
				1807	retval = path_lookupat(dfd, name, flags \| LOOKUP_REVAL, nd);
				1808
				1809	if (likely(!retval)) {
				1810	if (unlikely(!audit_dummy_context())) {
				1811	if (nd->path.dentry && nd->inode)
				1812	audit_inode(name, nd->path.dentry);
				1813	}
				1814	}
				1815	return retval;
				1816	}
				1817
				1818	int kern_path_parent(const char name, struct nameidata nd)
				1819	{
				1820	return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd);
				1821	}
				1822
				1823	int kern_path(const char name, unsigned int flags, struct path path)
				1824	{
				1825	struct nameidata nd;
				1826	int res = do_path_lookup(AT_FDCWD, name, flags, &nd);
				1827	if (!res)
				1828	*path = nd.path;
				1829	return res;
				1830	}
				1831
				1832	/**
				1833	* vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
				1834	* @dentry: pointer to dentry of the base directory
				1835	* @mnt: pointer to vfs mount of the base directory
				1836	* @name: pointer to file name
				1837	* @flags: lookup flags
				1838	* @path: pointer to struct path to fill
				1839	*/
				1840	int vfs_path_lookup(struct dentry dentry, struct vfsmount mnt,
				1841	const char *name, unsigned int flags,
				1842	struct path *path)
				1843	{
				1844	struct nameidata nd;
				1845	int err;
				1846	nd.root.dentry = dentry;
				1847	nd.root.mnt = mnt;
				1848	BUG_ON(flags & LOOKUP_PARENT);
				1849	/* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
				1850	err = do_path_lookup(AT_FDCWD, name, flags \| LOOKUP_ROOT, &nd);
				1851	if (!err)
				1852	*path = nd.path;
				1853	return err;
				1854	}
				1855
				1856	/*
				1857	* Restricted form of lookup. Doesn't follow links, single-component only,
				1858	* needs parent already locked. Doesn't follow mounts.
				1859	* SMP-safe.
				1860	*/
				1861	static struct dentry lookup_hash(struct nameidata nd)
				1862	{
				1863	return __lookup_hash(&nd->last, nd->path.dentry, nd);
				1864	}
				1865
				1866	/**
				1867	* lookup_one_len - filesystem helper to lookup single pathname component
				1868	* @name: pathname component to lookup
				1869	* @base: base directory to lookup from
				1870	* @len: maximum length @len should be interpreted to
				1871	*
				1872	* Note that this routine is purely a helper for filesystem usage and should
				1873	* not be called by generic code. Also note that by using this function the
				1874	* nameidata argument is passed to the filesystem methods and a filesystem
				1875	* using this helper needs to be prepared for that.
				1876	*/
				1877	struct dentry lookup_one_len(const char name, struct dentry *base, int len)
				1878	{
				1879	struct qstr this;
				1880	unsigned int c;
				1881	int err;
				1882
				1883	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
				1884
				1885	this.name = name;
				1886	this.len = len;
				1887	this.hash = full_name_hash(name, len);
				1888	if (!len)
				1889	return ERR_PTR(-EACCES);
				1890
				1891	while (len--) {
				1892	c = (const unsigned char )name++;
				1893	if (c == '/' \|\| c == '\0')
				1894	return ERR_PTR(-EACCES);
				1895	}
				1896	/*
				1897	* See if the low-level filesystem might want
				1898	* to use its own hash..
				1899	*/
				1900	if (base->d_flags & DCACHE_OP_HASH) {
				1901	int err = base->d_op->d_hash(base, base->d_inode, &this);
				1902	if (err < 0)
				1903	return ERR_PTR(err);
				1904	}
				1905
				1906	err = inode_permission(base->d_inode, MAY_EXEC);
				1907	if (err)
				1908	return ERR_PTR(err);
				1909
				1910	return __lookup_hash(&this, base, NULL);
				1911	}
				1912
				1913	int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
				1914	struct path path, int empty)
				1915	{
				1916	struct nameidata nd;
				1917	char *tmp = getname_flags(name, flags, empty);
				1918	int err = PTR_ERR(tmp);
				1919	if (!IS_ERR(tmp)) {
				1920
				1921	BUG_ON(flags & LOOKUP_PARENT);
				1922
				1923	err = do_path_lookup(dfd, tmp, flags, &nd);
				1924	putname(tmp);
				1925	if (!err)
				1926	*path = nd.path;
				1927	}
				1928	return err;
				1929	}
				1930
				1931	int user_path_at(int dfd, const char __user *name, unsigned flags,
				1932	struct path *path)
				1933	{
				1934	return user_path_at_empty(dfd, name, flags, path, NULL);
				1935	}
				1936
				1937	static int user_path_parent(int dfd, const char __user *path,
				1938	struct nameidata nd, char *name)
				1939	{
				1940	char *s = getname(path);
				1941	int error;
				1942
				1943	if (IS_ERR(s))
				1944	return PTR_ERR(s);
				1945
				1946	error = do_path_lookup(dfd, s, LOOKUP_PARENT, nd);
				1947	if (error)
				1948	putname(s);
				1949	else
				1950	*name = s;
				1951
				1952	return error;
				1953	}
				1954
				1955	/*
				1956	* It's inline, so penalty for filesystems that don't use sticky bit is
				1957	* minimal.
				1958	*/
				1959	static inline int check_sticky(struct inode dir, struct inode inode)
				1960	{
				1961	uid_t fsuid = current_fsuid();
				1962
				1963	if (!(dir->i_mode & S_ISVTX))
				1964	return 0;
				1965	if (current_user_ns() != inode_userns(inode))
				1966	goto other_userns;
				1967	if (inode->i_uid == fsuid)
				1968	return 0;
				1969	if (dir->i_uid == fsuid)
				1970	return 0;
				1971
				1972	other_userns:
				1973	return !ns_capable(inode_userns(inode), CAP_FOWNER);
				1974	}
				1975
				1976	/*
				1977	* Check whether we can remove a link victim from directory dir, check
				1978	* whether the type of victim is right.
				1979	* 1. We can't do it if dir is read-only (done in permission())
				1980	* 2. We should have write and exec permissions on dir
				1981	* 3. We can't remove anything from append-only dir
				1982	* 4. We can't do anything with immutable dir (done in permission())
				1983	* 5. If the sticky bit on dir is set we should either
				1984	* a. be owner of dir, or
				1985	* b. be owner of victim, or
				1986	* c. have CAP_FOWNER capability
				1987	* 6. If the victim is append-only or immutable we can't do antyhing with
				1988	* links pointing to it.
				1989	* 7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
				1990	* 8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
				1991	* 9. We can't remove a root or mountpoint.
				1992	* 10. We don't allow removal of NFS sillyrenamed files; it's handled by
				1993	* nfs_async_unlink().
				1994	*/
				1995	static int may_delete(struct inode dir,struct dentry victim,int isdir)
				1996	{
				1997	int error;
				1998
				1999	if (!victim->d_inode)
				2000	return -ENOENT;
				2001
				2002	BUG_ON(victim->d_parent->d_inode != dir);
				2003	audit_inode_child(victim, dir);
				2004
				2005	error = inode_permission(dir, MAY_WRITE \| MAY_EXEC);
				2006	if (error)
				2007	return error;
				2008	if (IS_APPEND(dir))
				2009	return -EPERM;
				2010	if (check_sticky(dir, victim->d_inode)\|\|IS_APPEND(victim->d_inode)\|\|
				2011	IS_IMMUTABLE(victim->d_inode) \|\| IS_SWAPFILE(victim->d_inode))
				2012	return -EPERM;
				2013	if (isdir) {
				2014	if (!S_ISDIR(victim->d_inode->i_mode))
				2015	return -ENOTDIR;
				2016	if (IS_ROOT(victim))
				2017	return -EBUSY;
				2018	} else if (S_ISDIR(victim->d_inode->i_mode))
				2019	return -EISDIR;
				2020	if (IS_DEADDIR(dir))
				2021	return -ENOENT;
				2022	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
				2023	return -EBUSY;
				2024	return 0;
				2025	}
				2026
				2027	/* Check whether we can create an object with dentry child in directory
				2028	* dir.
				2029	* 1. We can't do it if child already exists (open has special treatment for
				2030	* this case, but since we are inlined it's OK)
				2031	* 2. We can't do it if dir is read-only (done in permission())
				2032	* 3. We should have write and exec permissions on dir
				2033	* 4. We can't do it if dir is immutable (done in permission())
				2034	*/
				2035	static inline int may_create(struct inode dir, struct dentry child)
				2036	{
				2037	if (child->d_inode)
				2038	return -EEXIST;
				2039	if (IS_DEADDIR(dir))
				2040	return -ENOENT;
				2041	return inode_permission(dir, MAY_WRITE \| MAY_EXEC);
				2042	}
				2043
				2044	/*
				2045	* p1 and p2 should be directories on the same fs.
				2046	*/
				2047	struct dentry lock_rename(struct dentry p1, struct dentry *p2)
				2048	{
				2049	struct dentry *p;
				2050
				2051	if (p1 == p2) {
				2052	mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
				2053	return NULL;
				2054	}
				2055
				2056	mutex_lock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
				2057
				2058	p = d_ancestor(p2, p1);
				2059	if (p) {
				2060	mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
				2061	mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
				2062	return p;
				2063	}
				2064
				2065	p = d_ancestor(p1, p2);
				2066	if (p) {
				2067	mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
				2068	mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
				2069	return p;
				2070	}
				2071
				2072	mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
				2073	mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
				2074	return NULL;
				2075	}
				2076
				2077	void unlock_rename(struct dentry p1, struct dentry p2)
				2078	{
				2079	mutex_unlock(&p1->d_inode->i_mutex);
				2080	if (p1 != p2) {
				2081	mutex_unlock(&p2->d_inode->i_mutex);
				2082	mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
				2083	}
				2084	}
				2085
				2086	int vfs_create(struct inode dir, struct dentry dentry, umode_t mode,
				2087	struct nameidata *nd)
				2088	{
				2089	int error = may_create(dir, dentry);
				2090
				2091	if (error)
				2092	return error;
				2093
				2094	if (!dir->i_op->create)
				2095	return -EACCES; /* shouldn't it be ENOSYS? */
				2096	mode &= S_IALLUGO;
				2097	mode \|= S_IFREG;
				2098	error = security_inode_create(dir, dentry, mode);
				2099	if (error)
				2100	return error;
				2101	error = dir->i_op->create(dir, dentry, mode, nd);
				2102	if (!error)
				2103	fsnotify_create(dir, dentry);
				2104	return error;
				2105	}
				2106
				2107	static int may_open(struct path *path, int acc_mode, int flag)
				2108	{
				2109	struct dentry *dentry = path->dentry;
				2110	struct inode *inode = dentry->d_inode;
				2111	int error;
				2112
				2113	/* O_PATH? */
				2114	if (!acc_mode)
				2115	return 0;
				2116
				2117	if (!inode)
				2118	return -ENOENT;
				2119
				2120	switch (inode->i_mode & S_IFMT) {
				2121	case S_IFLNK:
				2122	return -ELOOP;
				2123	case S_IFDIR:
				2124	if (acc_mode & MAY_WRITE)
				2125	return -EISDIR;
				2126	break;
				2127	case S_IFBLK:
				2128	case S_IFCHR:
				2129	if (path->mnt->mnt_flags & MNT_NODEV)
				2130	return -EACCES;
				2131	/FALLTHRU/
				2132	case S_IFIFO:
				2133	case S_IFSOCK:
				2134	flag &= ~O_TRUNC;
				2135	break;
				2136	}
				2137
				2138	error = inode_permission(inode, acc_mode);
				2139	if (error)
				2140	return error;
				2141
				2142	/*
				2143	* An append-only file must be opened in append mode for writing.
				2144	*/
				2145	if (IS_APPEND(inode)) {
				2146	if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
				2147	return -EPERM;
				2148	if (flag & O_TRUNC)
				2149	return -EPERM;
				2150	}
				2151
				2152	/* O_NOATIME can only be set by the owner or superuser */
				2153	if (flag & O_NOATIME && !inode_owner_or_capable(inode))
				2154	return -EPERM;
				2155
				2156	return 0;
				2157	}
				2158
				2159	static int handle_truncate(struct file *filp)
				2160	{
				2161	struct path *path = &filp->f_path;
				2162	struct inode *inode = path->dentry->d_inode;
				2163	int error = get_write_access(inode);
				2164	if (error)
				2165	return error;
				2166	/*
				2167	* Refuse to truncate files with mandatory locks held on them.
				2168	*/
				2169	error = locks_verify_locked(inode);
				2170	if (!error)
				2171	error = security_path_truncate(path);
				2172	if (!error) {
				2173	error = do_truncate(path->dentry, 0,
				2174	ATTR_MTIME\|ATTR_CTIME\|ATTR_OPEN,
				2175	filp);
				2176	}
				2177	put_write_access(inode);
				2178	return error;
				2179	}
				2180
				/*
				 * Convert open(2) flags to the form stored in the lookup intent: when
				 * both O_ACCMODE bits are set (access mode 3) the value is lowered by
				 * one; any other value is returned unchanged.
				 */
				static inline int open_to_namei_flags(int flag)
				{
					return ((flag & O_ACCMODE) == 3) ? flag - 1 : flag;
				}
				2187
				2188	/*
				2189	* Handle the last step of open()
				2190	*/
				2191	static struct file do_last(struct nameidata nd, struct path *path,
				2192	const struct open_flags op, const char pathname)
				2193	{
				2194	struct dentry *dir = nd->path.dentry;
				2195	struct dentry *dentry;
				2196	int open_flag = op->open_flag;
				2197	int will_truncate = open_flag & O_TRUNC;
				2198	int want_write = 0;
				2199	int acc_mode = op->acc_mode;
				2200	struct file *filp;
				2201	int error;
				2202
				2203	nd->flags &= ~LOOKUP_PARENT;
				2204	nd->flags \|= op->intent;
				2205
				2206	switch (nd->last_type) {
				2207	case LAST_DOTDOT:
				2208	case LAST_DOT:
				2209	error = handle_dots(nd, nd->last_type);
				2210	if (error)
				2211	return ERR_PTR(error);
				2212	/* fallthrough */
				2213	case LAST_ROOT:
				2214	error = complete_walk(nd);
				2215	if (error)
				2216	return ERR_PTR(error);
				2217	audit_inode(pathname, nd->path.dentry);
				2218	if (open_flag & O_CREAT) {
				2219	error = -EISDIR;
				2220	goto exit;
				2221	}
				2222	goto ok;
				2223	case LAST_BIND:
				2224	error = complete_walk(nd);
				2225	if (error)
				2226	return ERR_PTR(error);
				2227	audit_inode(pathname, dir);
				2228	goto ok;
				2229	}
				2230
				2231	if (!(open_flag & O_CREAT)) {
				2232	int symlink_ok = 0;
				2233	if (nd->last.name[nd->last.len])
				2234	nd->flags \|= LOOKUP_FOLLOW \| LOOKUP_DIRECTORY;
				2235	if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
				2236	symlink_ok = 1;
				2237	/* we _can_ be in RCU mode here */
				2238	error = walk_component(nd, path, &nd->last, LAST_NORM,
				2239	!symlink_ok);
				2240	if (error < 0)
				2241	return ERR_PTR(error);
				2242	if (error) /* symlink */
				2243	return NULL;
				2244	/* sayonara */
				2245	error = complete_walk(nd);
				2246	if (error)
				2247	return ERR_PTR(error);
				2248
				2249	error = -ENOTDIR;
				2250	if (nd->flags & LOOKUP_DIRECTORY) {
				2251	if (!nd->inode->i_op->lookup)
				2252	goto exit;
				2253	}
				2254	audit_inode(pathname, nd->path.dentry);
				2255	goto ok;
				2256	}
				2257
				2258	/* create side of things */
				2259	/*
				2260	* This will only deal with leaving RCU mode - LOOKUP_JUMPED has been
				2261	* cleared when we got to the last component we are about to look up
				2262	*/
				2263	error = complete_walk(nd);
				2264	if (error)
				2265	return ERR_PTR(error);
				2266
				2267	audit_inode(pathname, dir);
				2268	error = -EISDIR;
				2269	/* trailing slashes? */
				2270	if (nd->last.name[nd->last.len])
				2271	goto exit;
				2272
				2273	mutex_lock(&dir->d_inode->i_mutex);
				2274
				2275	dentry = lookup_hash(nd);
				2276	error = PTR_ERR(dentry);
				2277	if (IS_ERR(dentry)) {
				2278	mutex_unlock(&dir->d_inode->i_mutex);
				2279	goto exit;
				2280	}
				2281
				2282	path->dentry = dentry;
				2283	path->mnt = nd->path.mnt;
				2284
				2285	/* Negative dentry, just create the file */
				2286	if (!dentry->d_inode) {
				2287	umode_t mode = op->mode;
				2288	if (!IS_POSIXACL(dir->d_inode))
				2289	mode &= ~current_umask();
				2290	/*
				2291	* This write is needed to ensure that a
				2292	* rw->ro transition does not occur between
				2293	* the time when the file is created and when
				2294	* a permanent write count is taken through
				2295	* the 'struct file' in nameidata_to_filp().
				2296	*/
				2297	error = mnt_want_write(nd->path.mnt);
				2298	if (error)
				2299	goto exit_mutex_unlock;
				2300	want_write = 1;
				2301	/* Don't check for write permission, don't truncate */
				2302	open_flag &= ~O_TRUNC;
				2303	will_truncate = 0;
				2304	acc_mode = MAY_OPEN;
				2305	error = security_path_mknod(&nd->path, dentry, mode, 0);
				2306	if (error)
				2307	goto exit_mutex_unlock;
				2308	error = vfs_create(dir->d_inode, dentry, mode, nd);
				2309	if (error)
				2310	goto exit_mutex_unlock;
				2311	mutex_unlock(&dir->d_inode->i_mutex);
				2312	dput(nd->path.dentry);
				2313	nd->path.dentry = dentry;
				2314	goto common;
				2315	}
				2316
				2317	/*
				2318	* It already exists.
				2319	*/
				2320	mutex_unlock(&dir->d_inode->i_mutex);
				2321	audit_inode(pathname, path->dentry);
				2322
				2323	error = -EEXIST;
				2324	if (open_flag & O_EXCL)
				2325	goto exit_dput;
				2326
				2327	error = follow_managed(path, nd->flags);
				2328	if (error < 0)
				2329	goto exit_dput;
				2330
				2331	if (error)
				2332	nd->flags \|= LOOKUP_JUMPED;
				2333
				2334	error = -ENOENT;
				2335	if (!path->dentry->d_inode)
				2336	goto exit_dput;
				2337
				2338	if (path->dentry->d_inode->i_op->follow_link)
				2339	return NULL;
				2340
				2341	path_to_nameidata(path, nd);
				2342	nd->inode = path->dentry->d_inode;
				2343	/* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */
				2344	error = complete_walk(nd);
				2345	if (error)
				2346	return ERR_PTR(error);
				2347	error = -EISDIR;
				2348	if (S_ISDIR(nd->inode->i_mode))
				2349	goto exit;
				2350	ok:
				2351	if (!S_ISREG(nd->inode->i_mode))
				2352	will_truncate = 0;
				2353
				2354	if (will_truncate) {
				2355	error = mnt_want_write(nd->path.mnt);
				2356	if (error)
				2357	goto exit;
				2358	want_write = 1;
				2359	}
				2360	common:
				2361	error = may_open(&nd->path, acc_mode, open_flag);
				2362	if (error)
				2363	goto exit;
				2364	filp = nameidata_to_filp(nd);
				2365	if (!IS_ERR(filp)) {
				2366	error = ima_file_check(filp, op->acc_mode);
				2367	if (error) {
				2368	fput(filp);
				2369	filp = ERR_PTR(error);
				2370	}
				2371	}
				2372	if (!IS_ERR(filp)) {
				2373	if (will_truncate) {
				2374	error = handle_truncate(filp);
				2375	if (error) {
				2376	fput(filp);
				2377	filp = ERR_PTR(error);
				2378	}
				2379	}
				2380	}
				2381	out:
				2382	if (want_write)
				2383	mnt_drop_write(nd->path.mnt);
				2384	path_put(&nd->path);
				2385	return filp;
				2386
				2387	exit_mutex_unlock:
				2388	mutex_unlock(&dir->d_inode->i_mutex);
				2389	exit_dput:
				2390	path_put_conditional(path, nd);
				2391	exit:
				2392	filp = ERR_PTR(error);
				2393	goto out;
				2394	}
				2395
				2396	static struct file path_openat(int dfd, const char pathname,
				2397	struct nameidata nd, const struct open_flags op, int flags)
				2398	{
				2399	struct file *base = NULL;
				2400	struct file *filp;
				2401	struct path path;
				2402	int error;
				2403
				2404	filp = get_empty_filp();
				2405	if (!filp)
				2406	return ERR_PTR(-ENFILE);
				2407
				2408	filp->f_flags = op->open_flag;
				2409	nd->intent.open.file = filp;
				2410	nd->intent.open.flags = open_to_namei_flags(op->open_flag);
				2411	nd->intent.open.create_mode = op->mode;
				2412
				2413	error = path_init(dfd, pathname, flags \| LOOKUP_PARENT, nd, &base);
				2414	if (unlikely(error))
				2415	goto out_filp;
				2416
				2417	current->total_link_count = 0;
				2418	error = link_path_walk(pathname, nd);
				2419	if (unlikely(error))
				2420	goto out_filp;
				2421
				2422	filp = do_last(nd, &path, op, pathname);
				2423	while (unlikely(!filp)) { /* trailing symlink */
				2424	struct path link = path;
				2425	void *cookie;
				2426	if (!(nd->flags & LOOKUP_FOLLOW)) {
				2427	path_put_conditional(&path, nd);
				2428	path_put(&nd->path);
				2429	filp = ERR_PTR(-ELOOP);
				2430	break;
				2431	}
				2432	nd->flags \|= LOOKUP_PARENT;
				2433	nd->flags &= ~(LOOKUP_OPEN\|LOOKUP_CREATE\|LOOKUP_EXCL);
				2434	error = follow_link(&link, nd, &cookie);
				2435	if (unlikely(error))
				2436	filp = ERR_PTR(error);
				2437	else
				2438	filp = do_last(nd, &path, op, pathname);
				2439	put_link(nd, &link, cookie);
				2440	}
				2441	out:
				2442	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
				2443	path_put(&nd->root);
				2444	if (base)
				2445	fput(base);
				2446	release_open_intent(nd);
				2447	return filp;
				2448
				2449	out_filp:
				2450	filp = ERR_PTR(error);
				2451	goto out;
				2452	}
				2453
				2454	struct file do_filp_open(int dfd, const char pathname,
				2455	const struct open_flags *op, int flags)
				2456	{
				2457	struct nameidata nd;
				2458	struct file *filp;
				2459
				2460	filp = path_openat(dfd, pathname, &nd, op, flags \| LOOKUP_RCU);
				2461	if (unlikely(filp == ERR_PTR(-ECHILD)))
				2462	filp = path_openat(dfd, pathname, &nd, op, flags);
				2463	if (unlikely(filp == ERR_PTR(-ESTALE)))
				2464	filp = path_openat(dfd, pathname, &nd, op, flags \| LOOKUP_REVAL);
				2465	return filp;
				2466	}
				2467
				2468	struct file do_file_open_root(struct dentry dentry, struct vfsmount *mnt,
				2469	const char name, const struct open_flags op, int flags)
				2470	{
				2471	struct nameidata nd;
				2472	struct file *file;
				2473
				2474	nd.root.mnt = mnt;
				2475	nd.root.dentry = dentry;
				2476
				2477	flags \|= LOOKUP_ROOT;
				2478
				2479	if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
				2480	return ERR_PTR(-ELOOP);
				2481
				2482	file = path_openat(-1, name, &nd, op, flags \| LOOKUP_RCU);
				2483	if (unlikely(file == ERR_PTR(-ECHILD)))
				2484	file = path_openat(-1, name, &nd, op, flags);
				2485	if (unlikely(file == ERR_PTR(-ESTALE)))
				2486	file = path_openat(-1, name, &nd, op, flags \| LOOKUP_REVAL);
				2487	return file;
				2488	}
				2489
				2490	struct dentry kern_path_create(int dfd, const char pathname, struct path *path, int is_dir)
				2491	{
				2492	struct dentry *dentry = ERR_PTR(-EEXIST);
				2493	struct nameidata nd;
				2494	int error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd);
				2495	if (error)
				2496	return ERR_PTR(error);
				2497
				2498	/*
				2499	* Yucky last component or no last component at all?
				2500	* (foo/., foo/.., /////)
				2501	*/
				2502	if (nd.last_type != LAST_NORM)
				2503	goto out;
				2504	nd.flags &= ~LOOKUP_PARENT;
				2505	nd.flags \|= LOOKUP_CREATE \| LOOKUP_EXCL;
				2506	nd.intent.open.flags = O_EXCL;
				2507
				2508	/*
				2509	* Do the final lookup.
				2510	*/
				2511	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
				2512	dentry = lookup_hash(&nd);
				2513	if (IS_ERR(dentry))
				2514	goto fail;
				2515
				2516	if (dentry->d_inode)
				2517	goto eexist;
				2518	/*
				2519	* Special case - lookup gave negative, but... we had foo/bar/
				2520	* From the vfs_mknod() POV we just have a negative dentry -
				2521	* all is fine. Let's be bastards - you had / on the end, you've
				2522	* been asking for (non-existent) directory. -ENOENT for you.
				2523	*/
				2524	if (unlikely(!is_dir && nd.last.name[nd.last.len])) {
				2525	dput(dentry);
				2526	dentry = ERR_PTR(-ENOENT);
				2527	goto fail;
				2528	}
				2529	*path = nd.path;
				2530	return dentry;
				2531	eexist:
				2532	dput(dentry);
				2533	dentry = ERR_PTR(-EEXIST);
				2534	fail:
				2535	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
				2536	out:
				2537	path_put(&nd.path);
				2538	return dentry;
				2539	}
				2540	EXPORT_SYMBOL(kern_path_create);
				2541
				2542	struct dentry user_path_create(int dfd, const char __user pathname, struct path *path, int is_dir)
				2543	{
				2544	char *tmp = getname(pathname);
				2545	struct dentry *res;
				2546	if (IS_ERR(tmp))
				2547	return ERR_CAST(tmp);
				2548	res = kern_path_create(dfd, tmp, path, is_dir);
				2549	putname(tmp);
				2550	return res;
				2551	}
				2552	EXPORT_SYMBOL(user_path_create);
				2553
				2554	int vfs_mknod(struct inode dir, struct dentry dentry, umode_t mode, dev_t dev)
				2555	{
				2556	int error = may_create(dir, dentry);
				2557
				2558	if (error)
				2559	return error;
				2560
				2561	if ((S_ISCHR(mode) \|\| S_ISBLK(mode)) &&
				2562	!ns_capable(inode_userns(dir), CAP_MKNOD))
				2563	return -EPERM;
				2564
				2565	if (!dir->i_op->mknod)
				2566	return -EPERM;
				2567
				2568	error = devcgroup_inode_mknod(mode, dev);
				2569	if (error)
				2570	return error;
				2571
				2572	error = security_inode_mknod(dir, dentry, mode, dev);
				2573	if (error)
				2574	return error;
				2575
				2576	error = dir->i_op->mknod(dir, dentry, mode, dev);
				2577	if (!error)
				2578	fsnotify_create(dir, dentry);
				2579	return error;
				2580	}
				2581
				2582	static int may_mknod(umode_t mode)
				2583	{
				2584	switch (mode & S_IFMT) {
				2585	case S_IFREG:
				2586	case S_IFCHR:
				2587	case S_IFBLK:
				2588	case S_IFIFO:
				2589	case S_IFSOCK:
				2590	case 0: /* zero mode translates to S_IFREG */
				2591	return 0;
				2592	case S_IFDIR:
				2593	return -EPERM;
				2594	default:
				2595	return -EINVAL;
				2596	}
				2597	}
				2598
				2599	SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
				2600	unsigned, dev)
				2601	{
				2602	struct dentry *dentry;
				2603	struct path path;
				2604	int error;
				2605
				2606	if (S_ISDIR(mode))
				2607	return -EPERM;
				2608
				2609	dentry = user_path_create(dfd, filename, &path, 0);
				2610	if (IS_ERR(dentry))
				2611	return PTR_ERR(dentry);
				2612
				2613	if (!IS_POSIXACL(path.dentry->d_inode))
				2614	mode &= ~current_umask();
				2615	error = may_mknod(mode);
				2616	if (error)
				2617	goto out_dput;
				2618	error = mnt_want_write(path.mnt);
				2619	if (error)
				2620	goto out_dput;
				2621	error = security_path_mknod(&path, dentry, mode, dev);
				2622	if (error)
				2623	goto out_drop_write;
				2624	switch (mode & S_IFMT) {
				2625	case 0: case S_IFREG:
				2626	error = vfs_create(path.dentry->d_inode,dentry,mode,NULL);
				2627	break;
				2628	case S_IFCHR: case S_IFBLK:
				2629	error = vfs_mknod(path.dentry->d_inode,dentry,mode,
				2630	new_decode_dev(dev));
				2631	break;
				2632	case S_IFIFO: case S_IFSOCK:
				2633	error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
				2634	break;
				2635	}
				2636	out_drop_write:
				2637	mnt_drop_write(path.mnt);
				2638	out_dput:
				2639	dput(dentry);
				2640	mutex_unlock(&path.dentry->d_inode->i_mutex);
				2641	path_put(&path);
				2642
				2643	return error;
				2644	}
				2645
				2646	SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
				2647	{
				2648	return sys_mknodat(AT_FDCWD, filename, mode, dev);
				2649	}
				2650
				2651	int vfs_mkdir(struct inode dir, struct dentry dentry, umode_t mode)
				2652	{
				2653	int error = may_create(dir, dentry);
				2654	unsigned max_links = dir->i_sb->s_max_links;
				2655
				2656	if (error)
				2657	return error;
				2658
				2659	if (!dir->i_op->mkdir)
				2660	return -EPERM;
				2661
				2662	mode &= (S_IRWXUGO\|S_ISVTX);
				2663	error = security_inode_mkdir(dir, dentry, mode);
				2664	if (error)
				2665	return error;
				2666
				2667	if (max_links && dir->i_nlink >= max_links)
				2668	return -EMLINK;
				2669
				2670	error = dir->i_op->mkdir(dir, dentry, mode);
				2671	if (!error)
				2672	fsnotify_mkdir(dir, dentry);
				2673	return error;
				2674	}
				2675
				2676	SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
				2677	{
				2678	struct dentry *dentry;
				2679	struct path path;
				2680	int error;
				2681
				2682	dentry = user_path_create(dfd, pathname, &path, 1);
				2683	if (IS_ERR(dentry))
				2684	return PTR_ERR(dentry);
				2685
				2686	if (!IS_POSIXACL(path.dentry->d_inode))
				2687	mode &= ~current_umask();
				2688	error = mnt_want_write(path.mnt);
				2689	if (error)
				2690	goto out_dput;
				2691	error = security_path_mkdir(&path, dentry, mode);
				2692	if (error)
				2693	goto out_drop_write;
				2694	error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
				2695	out_drop_write:
				2696	mnt_drop_write(path.mnt);
				2697	out_dput:
				2698	dput(dentry);
				2699	mutex_unlock(&path.dentry->d_inode->i_mutex);
				2700	path_put(&path);
				2701	return error;
				2702	}
				2703
				2704	SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
				2705	{
				2706	return sys_mkdirat(AT_FDCWD, pathname, mode);
				2707	}
				2708
				2709	/*
				2710	* The dentry_unhash() helper will try to drop the dentry early: we
				2711	* should have a usage count of 1 if we're the only user of this
				2712	* dentry, and if that is true (possibly after pruning the dcache),
				2713	* then we drop the dentry now.
				2714	*
				2715	* A low-level filesystem can, if it choses, legally
				2716	* do a
				2717	*
				2718	* if (!d_unhashed(dentry))
				2719	* return -EBUSY;
				2720	*
				2721	* if it cannot handle the case of removing a directory
				2722	* that is still in use by something else..
				2723	*/
				2724	void dentry_unhash(struct dentry *dentry)
				2725	{
				2726	shrink_dcache_parent(dentry);
				2727	spin_lock(&dentry->d_lock);
				2728	if (dentry->d_count == 1)
				2729	__d_drop(dentry);
				2730	spin_unlock(&dentry->d_lock);
				2731	}
				2732
				2733	int vfs_rmdir(struct inode dir, struct dentry dentry)
				2734	{
				2735	int error = may_delete(dir, dentry, 1);
				2736
				2737	if (error)
				2738	return error;
				2739
				2740	if (!dir->i_op->rmdir)
				2741	return -EPERM;
				2742
				2743	dget(dentry);
				2744	mutex_lock(&dentry->d_inode->i_mutex);
				2745
				2746	error = -EBUSY;
				2747	if (d_mountpoint(dentry))
				2748	goto out;
				2749
				2750	error = security_inode_rmdir(dir, dentry);
				2751	if (error)
				2752	goto out;
				2753
				2754	shrink_dcache_parent(dentry);
				2755	error = dir->i_op->rmdir(dir, dentry);
				2756	if (error)
				2757	goto out;
				2758
				2759	dentry->d_inode->i_flags \|= S_DEAD;
				2760	dont_mount(dentry);
				2761
				2762	out:
				2763	mutex_unlock(&dentry->d_inode->i_mutex);
				2764	dput(dentry);
				2765	if (!error)
				2766	d_delete(dentry);
				2767	return error;
				2768	}
				2769
				2770	static long do_rmdir(int dfd, const char __user *pathname)
				2771	{
				2772	int error = 0;
				2773	char * name;
				2774	struct dentry *dentry;
				2775	struct nameidata nd;
				2776
				2777	error = user_path_parent(dfd, pathname, &nd, &name);
				2778	if (error)
				2779	return error;
				2780
				2781	switch(nd.last_type) {
				2782	case LAST_DOTDOT:
				2783	error = -ENOTEMPTY;
				2784	goto exit1;
				2785	case LAST_DOT:
				2786	error = -EINVAL;
				2787	goto exit1;
				2788	case LAST_ROOT:
				2789	error = -EBUSY;
				2790	goto exit1;
				2791	}
				2792
				2793	nd.flags &= ~LOOKUP_PARENT;
				2794
				2795	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
				2796	dentry = lookup_hash(&nd);
				2797	error = PTR_ERR(dentry);
				2798	if (IS_ERR(dentry))
				2799	goto exit2;
				2800	if (!dentry->d_inode) {
				2801	error = -ENOENT;
				2802	goto exit3;
				2803	}
				2804	error = mnt_want_write(nd.path.mnt);
				2805	if (error)
				2806	goto exit3;
				2807	error = security_path_rmdir(&nd.path, dentry);
				2808	if (error)
				2809	goto exit4;
				2810	error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
				2811	exit4:
				2812	mnt_drop_write(nd.path.mnt);
				2813	exit3:
				2814	dput(dentry);
				2815	exit2:
				2816	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
				2817	exit1:
				2818	path_put(&nd.path);
				2819	putname(name);
				2820	return error;
				2821	}
				2822
				2823	SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
				2824	{
				2825	return do_rmdir(AT_FDCWD, pathname);
				2826	}
				2827
				2828	int vfs_unlink(struct inode dir, struct dentry dentry)
				2829	{
				2830	int error = may_delete(dir, dentry, 0);
				2831
				2832	if (error)
				2833	return error;
				2834
				2835	if (!dir->i_op->unlink)
				2836	return -EPERM;
				2837
				2838	mutex_lock(&dentry->d_inode->i_mutex);
				2839	if (d_mountpoint(dentry))
				2840	error = -EBUSY;
				2841	else {
				2842	error = security_inode_unlink(dir, dentry);
				2843	if (!error) {
				2844	error = dir->i_op->unlink(dir, dentry);
				2845	if (!error)
				2846	dont_mount(dentry);
				2847	}
				2848	}
				2849	mutex_unlock(&dentry->d_inode->i_mutex);
				2850
				2851	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
				2852	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
				2853	fsnotify_link_count(dentry->d_inode);
				2854	d_delete(dentry);
				2855	}
				2856
				2857	return error;
				2858	}
				2859
				2860	/*
				2861	* Make sure that the actual truncation of the file will occur outside its
				2862	* directory's i_mutex. Truncate can take a long time if there is a lot of
				2863	* writeout happening, and we don't want to prevent access to the directory
				2864	* while waiting on the I/O.
				2865	*/
				2866	static long do_unlinkat(int dfd, const char __user *pathname)
				2867	{
				2868	int error;
				2869	char *name;
				2870	struct dentry *dentry;
				2871	struct nameidata nd;
				2872	struct inode *inode = NULL;
				2873
				2874	error = user_path_parent(dfd, pathname, &nd, &name);
				2875	if (error)
				2876	return error;
				2877
				2878	error = -EISDIR;
				2879	if (nd.last_type != LAST_NORM)
				2880	goto exit1;
				2881
				2882	nd.flags &= ~LOOKUP_PARENT;
				2883
				2884	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
				2885	dentry = lookup_hash(&nd);
				2886	error = PTR_ERR(dentry);
				2887	if (!IS_ERR(dentry)) {
				2888	/* Why not before? Because we want correct error value */
				2889	if (nd.last.name[nd.last.len])
				2890	goto slashes;
				2891	inode = dentry->d_inode;
				2892	if (!inode)
				2893	goto slashes;
				2894	ihold(inode);
				2895	error = mnt_want_write(nd.path.mnt);
				2896	if (error)
				2897	goto exit2;
				2898	error = security_path_unlink(&nd.path, dentry);
				2899	if (error)
				2900	goto exit3;
				2901	error = vfs_unlink(nd.path.dentry->d_inode, dentry);
				2902	exit3:
				2903	mnt_drop_write(nd.path.mnt);
				2904	exit2:
				2905	dput(dentry);
				2906	}
				2907	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
				2908	if (inode)
				2909	iput(inode); /* truncate the inode here */
				2910	exit1:
				2911	path_put(&nd.path);
				2912	putname(name);
				2913	return error;
				2914
				2915	slashes:
				2916	error = !dentry->d_inode ? -ENOENT :
				2917	S_ISDIR(dentry->d_inode->i_mode) ? -EISDIR : -ENOTDIR;
				2918	goto exit2;
				2919	}
				2920
				2921	SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
				2922	{
				2923	if ((flag & ~AT_REMOVEDIR) != 0)
				2924	return -EINVAL;
				2925
				2926	if (flag & AT_REMOVEDIR)
				2927	return do_rmdir(dfd, pathname);
				2928
				2929	return do_unlinkat(dfd, pathname);
				2930	}
				2931
				2932	SYSCALL_DEFINE1(unlink, const char __user *, pathname)
				2933	{
				2934	return do_unlinkat(AT_FDCWD, pathname);
				2935	}
				2936
				2937	int vfs_symlink(struct inode dir, struct dentry dentry, const char *oldname)
				2938	{
				2939	int error = may_create(dir, dentry);
				2940
				2941	if (error)
				2942	return error;
				2943
				2944	if (!dir->i_op->symlink)
				2945	return -EPERM;
				2946
				2947	error = security_inode_symlink(dir, dentry, oldname);
				2948	if (error)
				2949	return error;
				2950
				2951	error = dir->i_op->symlink(dir, dentry, oldname);
				2952	if (!error)
				2953	fsnotify_create(dir, dentry);
				2954	return error;
				2955	}
				2956
				2957	SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
				2958	int, newdfd, const char __user *, newname)
				2959	{
				2960	int error;
				2961	char *from;
				2962	struct dentry *dentry;
				2963	struct path path;
				2964
				2965	from = getname(oldname);
				2966	if (IS_ERR(from))
				2967	return PTR_ERR(from);
				2968
				2969	dentry = user_path_create(newdfd, newname, &path, 0);
				2970	error = PTR_ERR(dentry);
				2971	if (IS_ERR(dentry))
				2972	goto out_putname;
				2973
				2974	error = mnt_want_write(path.mnt);
				2975	if (error)
				2976	goto out_dput;
				2977	error = security_path_symlink(&path, dentry, from);
				2978	if (error)
				2979	goto out_drop_write;
				2980	error = vfs_symlink(path.dentry->d_inode, dentry, from);
				2981	out_drop_write:
				2982	mnt_drop_write(path.mnt);
				2983	out_dput:
				2984	dput(dentry);
				2985	mutex_unlock(&path.dentry->d_inode->i_mutex);
				2986	path_put(&path);
				2987	out_putname:
				2988	putname(from);
				2989	return error;
				2990	}
				2991
				2992	SYSCALL_DEFINE2(symlink, const char __user , oldname, const char __user , newname)
				2993	{
				2994	return sys_symlinkat(oldname, AT_FDCWD, newname);
				2995	}
				2996
				2997	int vfs_link(struct dentry old_dentry, struct inode dir, struct dentry *new_dentry)
				2998	{
				2999	struct inode *inode = old_dentry->d_inode;
				3000	unsigned max_links = dir->i_sb->s_max_links;
				3001	int error;
				3002
				3003	if (!inode)
				3004	return -ENOENT;
				3005
				3006	error = may_create(dir, new_dentry);
				3007	if (error)
				3008	return error;
				3009
				3010	if (dir->i_sb != inode->i_sb)
				3011	return -EXDEV;
				3012
				3013	/*
				3014	* A link to an append-only or immutable file cannot be created.
				3015	*/
				3016	if (IS_APPEND(inode) \|\| IS_IMMUTABLE(inode))
				3017	return -EPERM;
				3018	if (!dir->i_op->link)
				3019	return -EPERM;
				3020	if (S_ISDIR(inode->i_mode))
				3021	return -EPERM;
				3022
				3023	error = security_inode_link(old_dentry, dir, new_dentry);
				3024	if (error)
				3025	return error;
				3026
				3027	mutex_lock(&inode->i_mutex);
				3028	/* Make sure we don't allow creating hardlink to an unlinked file */
				3029	if (inode->i_nlink == 0)
				3030	error = -ENOENT;
				3031	else if (max_links && inode->i_nlink >= max_links)
				3032	error = -EMLINK;
				3033	else
				3034	error = dir->i_op->link(old_dentry, dir, new_dentry);
				3035	mutex_unlock(&inode->i_mutex);
				3036	if (!error)
				3037	fsnotify_link(dir, inode, new_dentry);
				3038	return error;
				3039	}
				3040
				3041	/*
				3042	* Hardlinks are often used in delicate situations. We avoid
				3043	* security-related surprises by not following symlinks on the
				3044	* newname. --KAB
				3045	*
				3046	* We don't follow them on the oldname either to be compatible
				3047	* with linux 2.0, and to avoid hard-linking to directories
				3048	* and other special files. --ADM
				3049	*/
				3050	SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
				3051	int, newdfd, const char __user *, newname, int, flags)
				3052	{
				3053	struct dentry *new_dentry;
				3054	struct path old_path, new_path;
				3055	int how = 0;
				3056	int error;
				3057
				3058	if ((flags & ~(AT_SYMLINK_FOLLOW \| AT_EMPTY_PATH)) != 0)
				3059	return -EINVAL;
				3060	/*
				3061	* To use null names we require CAP_DAC_READ_SEARCH
				3062	* This ensures that not everyone will be able to create
				3063	* handlink using the passed filedescriptor.
				3064	*/
				3065	if (flags & AT_EMPTY_PATH) {
				3066	if (!capable(CAP_DAC_READ_SEARCH))
				3067	return -ENOENT;
				3068	how = LOOKUP_EMPTY;
				3069	}
				3070
				3071	if (flags & AT_SYMLINK_FOLLOW)
				3072	how \|= LOOKUP_FOLLOW;
				3073
				3074	error = user_path_at(olddfd, oldname, how, &old_path);
				3075	if (error)
				3076	return error;
				3077
				3078	new_dentry = user_path_create(newdfd, newname, &new_path, 0);
				3079	error = PTR_ERR(new_dentry);
				3080	if (IS_ERR(new_dentry))
				3081	goto out;
				3082
				3083	error = -EXDEV;
				3084	if (old_path.mnt != new_path.mnt)
				3085	goto out_dput;
				3086	error = mnt_want_write(new_path.mnt);
				3087	if (error)
				3088	goto out_dput;
				3089	error = security_path_link(old_path.dentry, &new_path, new_dentry);
				3090	if (error)
				3091	goto out_drop_write;
				3092	error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry);
				3093	out_drop_write:
				3094	mnt_drop_write(new_path.mnt);
				3095	out_dput:
				3096	dput(new_dentry);
				3097	mutex_unlock(&new_path.dentry->d_inode->i_mutex);
				3098	path_put(&new_path);
				3099	out:
				3100	path_put(&old_path);
				3101
				3102	return error;
				3103	}
				3104
				3105	SYSCALL_DEFINE2(link, const char __user , oldname, const char __user , newname)
				3106	{
				3107	return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
				3108	}
				3109
				3110	/*
				3111	* The worst of all namespace operations - renaming directory. "Perverted"
				3112	* doesn't even start to describe it. Somebody in UCB had a heck of a trip...
				3113	* Problems:
				3114	* a) we can get into loop creation. Check is done in is_subdir().
				3115	* b) race potential - two innocent renames can create a loop together.
				3116	* That's where 4.4 screws up. Current fix: serialization on
				3117	* sb->s_vfs_rename_mutex. We might be more accurate, but that's another
				3118	* story.
				3119	* c) we have to lock _three_ objects - parents and victim (if it exists).
				3120	* And that - after we got ->i_mutex on parents (until then we don't know
				3121	* whether the target exists). Solution: try to be smart with locking
				3122	* order for inodes. We rely on the fact that tree topology may change
				3123	* only under ->s_vfs_rename_mutex _and_ that parent of the object we
				3124	* move will be locked. Thus we can rank directories by the tree
				3125	* (ancestors first) and rank all non-directories after them.
				3126	* That works since everybody except rename does "lock parent, lookup,
				3127	* lock child" and rename is under ->s_vfs_rename_mutex.
				3128	* HOWEVER, it relies on the assumption that any object with ->lookup()
				3129	* has no more than 1 dentry. If "hybrid" objects will ever appear,
				3130	* we'd better make sure that there's no link(2) for them.
				3131	* d) conversion from fhandle to dentry may come in the wrong moment - when
				3132	* we are removing the target. Solution: we will have to grab ->i_mutex
				3133	* in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
				3134	* ->i_mutex on parents, which works but leads to some truly excessive
				3135	* locking].
				3136	*/
				3137	static int vfs_rename_dir(struct inode old_dir, struct dentry old_dentry,
				3138	struct inode new_dir, struct dentry new_dentry)
				3139	{
				3140	int error = 0;
				3141	struct inode *target = new_dentry->d_inode;
				3142	unsigned max_links = new_dir->i_sb->s_max_links;
				3143
				3144	/*
				3145	* If we are going to change the parent - check write permissions,
				3146	* we'll need to flip '..'.
				3147	*/
				3148	if (new_dir != old_dir) {
				3149	error = inode_permission(old_dentry->d_inode, MAY_WRITE);
				3150	if (error)
				3151	return error;
				3152	}
				3153
				3154	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
				3155	if (error)
				3156	return error;
				3157
				3158	dget(new_dentry);
				3159	if (target)
				3160	mutex_lock(&target->i_mutex);
				3161
				3162	error = -EBUSY;
				3163	if (d_mountpoint(old_dentry) \|\| d_mountpoint(new_dentry))
				3164	goto out;
				3165
				3166	error = -EMLINK;
				3167	if (max_links && !target && new_dir != old_dir &&
				3168	new_dir->i_nlink >= max_links)
				3169	goto out;
				3170
				3171	if (target)
				3172	shrink_dcache_parent(new_dentry);
				3173	error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
				3174	if (error)
				3175	goto out;
				3176
				3177	if (target) {
				3178	target->i_flags \|= S_DEAD;
				3179	dont_mount(new_dentry);
				3180	}
				3181	out:
				3182	if (target)
				3183	mutex_unlock(&target->i_mutex);
				3184	dput(new_dentry);
				3185	if (!error)
				3186	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
				3187	d_move(old_dentry,new_dentry);
				3188	return error;
				3189	}
				3190
				3191	static int vfs_rename_other(struct inode old_dir, struct dentry old_dentry,
				3192	struct inode new_dir, struct dentry new_dentry)
				3193	{
				3194	struct inode *target = new_dentry->d_inode;
				3195	int error;
				3196
				3197	error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
				3198	if (error)
				3199	return error;
				3200
				3201	dget(new_dentry);
				3202	if (target)
				3203	mutex_lock(&target->i_mutex);
				3204
				3205	error = -EBUSY;
				3206	if (d_mountpoint(old_dentry)\|\|d_mountpoint(new_dentry))
				3207	goto out;
				3208
				3209	error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry);
				3210	if (error)
				3211	goto out;
				3212
				3213	if (target)
				3214	dont_mount(new_dentry);
				3215	if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
				3216	d_move(old_dentry, new_dentry);
				3217	out:
				3218	if (target)
				3219	mutex_unlock(&target->i_mutex);
				3220	dput(new_dentry);
				3221	return error;
				3222	}
				3223
				3224	int vfs_rename(struct inode old_dir, struct dentry old_dentry,
				3225	struct inode new_dir, struct dentry new_dentry)
				3226	{
				3227	int error;
				3228	int is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
				3229	const unsigned char *old_name;
				3230
				3231	if (old_dentry->d_inode == new_dentry->d_inode)
				3232	return 0;
				3233
				3234	error = may_delete(old_dir, old_dentry, is_dir);
				3235	if (error)
				3236	return error;
				3237
				3238	if (!new_dentry->d_inode)
				3239	error = may_create(new_dir, new_dentry);
				3240	else
				3241	error = may_delete(new_dir, new_dentry, is_dir);
				3242	if (error)
				3243	return error;
				3244
				3245	if (!old_dir->i_op->rename)
				3246	return -EPERM;
				3247
				3248	old_name = fsnotify_oldname_init(old_dentry->d_name.name);
				3249
				3250	if (is_dir)
				3251	error = vfs_rename_dir(old_dir,old_dentry,new_dir,new_dentry);
				3252	else
				3253	error = vfs_rename_other(old_dir,old_dentry,new_dir,new_dentry);
				3254	if (!error)
				3255	fsnotify_move(old_dir, new_dir, old_name, is_dir,
				3256	new_dentry->d_inode, old_dentry);
				3257	fsnotify_oldname_free(old_name);
				3258
				3259	return error;
				3260	}
				3261
				3262	SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
				3263	int, newdfd, const char __user *, newname)
				3264	{
				3265	struct dentry old_dir, new_dir;
				3266	struct dentry old_dentry, new_dentry;
				3267	struct dentry *trap;
				3268	struct nameidata oldnd, newnd;
				3269	char *from;
				3270	char *to;
				3271	int error;
				3272
				3273	error = user_path_parent(olddfd, oldname, &oldnd, &from);
				3274	if (error)
				3275	goto exit;
				3276
				3277	error = user_path_parent(newdfd, newname, &newnd, &to);
				3278	if (error)
				3279	goto exit1;
				3280
				3281	error = -EXDEV;
				3282	if (oldnd.path.mnt != newnd.path.mnt)
				3283	goto exit2;
				3284
				3285	old_dir = oldnd.path.dentry;
				3286	error = -EBUSY;
				3287	if (oldnd.last_type != LAST_NORM)
				3288	goto exit2;
				3289
				3290	new_dir = newnd.path.dentry;
				3291	if (newnd.last_type != LAST_NORM)
				3292	goto exit2;
				3293
				3294	oldnd.flags &= ~LOOKUP_PARENT;
				3295	newnd.flags &= ~LOOKUP_PARENT;
				3296	newnd.flags \|= LOOKUP_RENAME_TARGET;
				3297
				3298	trap = lock_rename(new_dir, old_dir);
				3299
				3300	old_dentry = lookup_hash(&oldnd);
				3301	error = PTR_ERR(old_dentry);
				3302	if (IS_ERR(old_dentry))
				3303	goto exit3;
				3304	/* source must exist */
				3305	error = -ENOENT;
				3306	if (!old_dentry->d_inode)
				3307	goto exit4;
				3308	/* unless the source is a directory trailing slashes give -ENOTDIR */
				3309	if (!S_ISDIR(old_dentry->d_inode->i_mode)) {
				3310	error = -ENOTDIR;
				3311	if (oldnd.last.name[oldnd.last.len])
				3312	goto exit4;
				3313	if (newnd.last.name[newnd.last.len])
				3314	goto exit4;
				3315	}
				3316	/* source should not be ancestor of target */
				3317	error = -EINVAL;
				3318	if (old_dentry == trap)
				3319	goto exit4;
				3320	new_dentry = lookup_hash(&newnd);
				3321	error = PTR_ERR(new_dentry);
				3322	if (IS_ERR(new_dentry))
				3323	goto exit4;
				3324	/* target should not be an ancestor of source */
				3325	error = -ENOTEMPTY;
				3326	if (new_dentry == trap)
				3327	goto exit5;
				3328
				3329	error = mnt_want_write(oldnd.path.mnt);
				3330	if (error)
				3331	goto exit5;
				3332	error = security_path_rename(&oldnd.path, old_dentry,
				3333	&newnd.path, new_dentry);
				3334	if (error)
				3335	goto exit6;
				3336	error = vfs_rename(old_dir->d_inode, old_dentry,
				3337	new_dir->d_inode, new_dentry);
				3338	exit6:
				3339	mnt_drop_write(oldnd.path.mnt);
				3340	exit5:
				3341	dput(new_dentry);
				3342	exit4:
				3343	dput(old_dentry);
				3344	exit3:
				3345	unlock_rename(new_dir, old_dir);
				3346	exit2:
				3347	path_put(&newnd.path);
				3348	putname(to);
				3349	exit1:
				3350	path_put(&oldnd.path);
				3351	putname(from);
				3352	exit:
				3353	return error;
				3354	}
				3355
				3356	SYSCALL_DEFINE2(rename, const char __user , oldname, const char __user , newname)
				3357	{
				3358	return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname);
				3359	}
				3360
				3361	int vfs_readlink(struct dentry dentry, char __user buffer, int buflen, const char *link)
				3362	{
				3363	int len;
				3364
				3365	len = PTR_ERR(link);
				3366	if (IS_ERR(link))
				3367	goto out;
				3368
				3369	len = strlen(link);
				3370	if (len > (unsigned) buflen)
				3371	len = buflen;
				3372	if (copy_to_user(buffer, link, len))
				3373	len = -EFAULT;
				3374	out:
				3375	return len;
				3376	}
				3377
				3378	/*
				3379	* A helper for ->readlink(). This should be used ONLY for symlinks that
				3380	* have ->follow_link() touching nd only in nd_set_link(). Using (or not
				3381	* using) it for any given inode is up to filesystem.
				3382	*/
				3383	int generic_readlink(struct dentry dentry, char __user buffer, int buflen)
				3384	{
				3385	struct nameidata nd;
				3386	void *cookie;
				3387	int res;
				3388
				3389	nd.depth = 0;
				3390	cookie = dentry->d_inode->i_op->follow_link(dentry, &nd);
				3391	if (IS_ERR(cookie))
				3392	return PTR_ERR(cookie);
				3393
				3394	res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd));
				3395	if (dentry->d_inode->i_op->put_link)
				3396	dentry->d_inode->i_op->put_link(dentry, &nd, cookie);
				3397	return res;
				3398	}
				3399
				/* Exported wrapper: passes @nd and @link straight to __vfs_follow_link(). */
				int vfs_follow_link(struct nameidata *nd, const char *link)
				{
					return __vfs_follow_link(nd, link);
				}
				3404
				3405	/* get the link contents into pagecache */
				3406	static char page_getlink(struct dentry dentry, struct page **ppage)
				3407	{
				3408	char *kaddr;
				3409	struct page *page;
				3410	struct address_space *mapping = dentry->d_inode->i_mapping;
				3411	page = read_mapping_page(mapping, 0, NULL);
				3412	if (IS_ERR(page))
				3413	return (char*)page;
				3414	*ppage = page;
				3415	kaddr = kmap(page);
				3416	nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
				3417	return kaddr;
				3418	}
				3419
				3420	int page_readlink(struct dentry dentry, char __user buffer, int buflen)
				3421	{
				3422	struct page *page = NULL;
				3423	char *s = page_getlink(dentry, &page);
				3424	int res = vfs_readlink(dentry,buffer,buflen,s);
				3425	if (page) {
				3426	kunmap(page);
				3427	page_cache_release(page);
				3428	}
				3429	return res;
				3430	}
				3431
				3432	void page_follow_link_light(struct dentry dentry, struct nameidata *nd)
				3433	{
				3434	struct page *page = NULL;
				3435	nd_set_link(nd, page_getlink(dentry, &page));
				3436	return page;
				3437	}
				3438
				/*
				 * ->put_link() counterpart of page_follow_link_light(): @cookie is
				 * treated as a struct page and, when non-NULL, is unmapped and released.
				 */
				void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
				{
					struct page *page = cookie;

					if (page) {
						kunmap(page);
						page_cache_release(page);
					}
				}
				3448
				3449	/*
				3450	* The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
				3451	*/
				3452	int __page_symlink(struct inode inode, const char symname, int len, int nofs)
				3453	{
				3454	struct address_space *mapping = inode->i_mapping;
				3455	struct page *page;
				3456	void *fsdata;
				3457	int err;
				3458	char *kaddr;
				3459	unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
				3460	if (nofs)
				3461	flags \|= AOP_FLAG_NOFS;
				3462
				3463	retry:
				3464	err = pagecache_write_begin(NULL, mapping, 0, len-1,
				3465	flags, &page, &fsdata);
				3466	if (err)
				3467	goto fail;
				3468
				3469	kaddr = kmap_atomic(page);
				3470	memcpy(kaddr, symname, len-1);
				3471	kunmap_atomic(kaddr);
				3472
				3473	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
				3474	page, fsdata);
				3475	if (err < 0)
				3476	goto fail;
				3477	if (err < len-1)
				3478	goto retry;
				3479
				3480	mark_inode_dirty(inode);
				3481	return 0;
				3482	fail:
				3483	return err;
				3484	}
				3485
				3486	int page_symlink(struct inode inode, const char symname, int len)
				3487	{
				3488	return __page_symlink(inode, symname, len,
				3489	!(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
				3490	}
				3491
				3492	const struct inode_operations page_symlink_inode_operations = {
				3493	.readlink = generic_readlink,
				3494	.follow_link = page_follow_link_light,
				3495	.put_link = page_put_link,
				3496	};
				3497
				/* Symbols from this file made available through EXPORT_SYMBOL(). */
				EXPORT_SYMBOL(user_path_at);
				EXPORT_SYMBOL(follow_down_one);
				EXPORT_SYMBOL(follow_down);
				EXPORT_SYMBOL(follow_up);
				EXPORT_SYMBOL(get_write_access); /* binfmt_aout */
				EXPORT_SYMBOL(getname);
				EXPORT_SYMBOL(lock_rename);
				EXPORT_SYMBOL(lookup_one_len);
				EXPORT_SYMBOL(page_follow_link_light);
				EXPORT_SYMBOL(page_put_link);
				EXPORT_SYMBOL(page_readlink);
				EXPORT_SYMBOL(__page_symlink);
				EXPORT_SYMBOL(page_symlink);
				EXPORT_SYMBOL(page_symlink_inode_operations);
				EXPORT_SYMBOL(kern_path);
				EXPORT_SYMBOL(vfs_path_lookup);
				EXPORT_SYMBOL(inode_permission);
				EXPORT_SYMBOL(unlock_rename);
				EXPORT_SYMBOL(vfs_create);
				EXPORT_SYMBOL(vfs_follow_link);
				EXPORT_SYMBOL(vfs_link);
				EXPORT_SYMBOL(vfs_mkdir);
				EXPORT_SYMBOL(vfs_mknod);
				EXPORT_SYMBOL(generic_permission);
				EXPORT_SYMBOL(vfs_readlink);
				EXPORT_SYMBOL(vfs_rename);
				EXPORT_SYMBOL(vfs_rmdir);
				EXPORT_SYMBOL(vfs_symlink);
				EXPORT_SYMBOL(vfs_unlink);
				EXPORT_SYMBOL(dentry_unhash);
				EXPORT_SYMBOL(generic_readlink);