Blame - src/kernel/linux/v4.19/fs/open.c - T800

blob: 886da562e40547a5b521f0d71aa57d15eebab1f5 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	/*
				2	* linux/fs/open.c
				3	*
				4	* Copyright (C) 1991, 1992 Linus Torvalds
				5	*/
				6
				7	#include <linux/string.h>
				8	#include <linux/mm.h>
				9	#include <linux/file.h>
				10	#include <linux/fdtable.h>
				11	#include <linux/fsnotify.h>
				12	#include <linux/module.h>
				13	#include <linux/tty.h>
				14	#include <linux/namei.h>
				15	#include <linux/backing-dev.h>
				16	#include <linux/capability.h>
				17	#include <linux/securebits.h>
				18	#include <linux/security.h>
				19	#include <linux/mount.h>
				20	#include <linux/fcntl.h>
				21	#include <linux/slab.h>
				22	#include <linux/uaccess.h>
				23	#include <linux/fs.h>
				24	#include <linux/personality.h>
				25	#include <linux/pagemap.h>
				26	#include <linux/syscalls.h>
				27	#include <linux/rcupdate.h>
				28	#include <linux/audit.h>
				29	#include <linux/falloc.h>
				30	#include <linux/fs_struct.h>
				31	#include <linux/ima.h>
				32	#include <linux/dnotify.h>
				33	#include <linux/compat.h>
				34
				35	#include "internal.h"
				36
				37	int do_truncate2(struct vfsmount mnt, struct dentry dentry, loff_t length,
				38	unsigned int time_attrs, struct file *filp)
				39	{
				40	int ret;
				41	struct iattr newattrs;
				42
				43	/* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */
				44	if (length < 0)
				45	return -EINVAL;
				46
				47	newattrs.ia_size = length;
				48	newattrs.ia_valid = ATTR_SIZE \| time_attrs;
				49	if (filp) {
				50	newattrs.ia_file = filp;
				51	newattrs.ia_valid \|= ATTR_FILE;
				52	}
				53
				54	/* Remove suid, sgid, and file capabilities on truncate too */
				55	ret = dentry_needs_remove_privs(dentry);
				56	if (ret < 0)
				57	return ret;
				58	if (ret)
				59	newattrs.ia_valid \|= ret \| ATTR_FORCE;
				60
				61	inode_lock(dentry->d_inode);
				62	/* Note any delegations or leases have already been broken: */
				63	ret = notify_change2(mnt, dentry, &newattrs, NULL);
				64	inode_unlock(dentry->d_inode);
				65	return ret;
				66	}
				67	int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
				68	struct file *filp)
				69	{
				70	return do_truncate2(NULL, dentry, length, time_attrs, filp);
				71	}
				72
				73	long vfs_truncate(const struct path *path, loff_t length)
				74	{
				75	struct inode *inode;
				76	struct vfsmount *mnt;
				77	long error;
				78
				79	inode = path->dentry->d_inode;
				80	mnt = path->mnt;
				81
				82	/* For directories it's -EISDIR, for other non-regulars - -EINVAL */
				83	if (S_ISDIR(inode->i_mode))
				84	return -EISDIR;
				85	if (!S_ISREG(inode->i_mode))
				86	return -EINVAL;
				87
				88	error = mnt_want_write(path->mnt);
				89	if (error)
				90	goto out;
				91
				92	error = inode_permission2(mnt, inode, MAY_WRITE);
				93	if (error)
				94	goto mnt_drop_write_and_out;
				95
				96	error = -EPERM;
				97	if (IS_APPEND(inode))
				98	goto mnt_drop_write_and_out;
				99
				100	error = get_write_access(inode);
				101	if (error)
				102	goto mnt_drop_write_and_out;
				103
				104	/*
				105	* Make sure that there are no leases. get_write_access() protects
				106	* against the truncate racing with a lease-granting setlease().
				107	*/
				108	error = break_lease(inode, O_WRONLY);
				109	if (error)
				110	goto put_write_and_out;
				111
				112	error = locks_verify_truncate(inode, NULL, length);
				113	if (!error)
				114	error = security_path_truncate(path);
				115	if (!error)
				116	error = do_truncate2(mnt, path->dentry, length, 0, NULL);
				117
				118	put_write_and_out:
				119	put_write_access(inode);
				120	mnt_drop_write_and_out:
				121	mnt_drop_write(path->mnt);
				122	out:
				123	return error;
				124	}
				125	EXPORT_SYMBOL_GPL(vfs_truncate);
				126
				127	long do_sys_truncate(const char __user *pathname, loff_t length)
				128	{
				129	unsigned int lookup_flags = LOOKUP_FOLLOW;
				130	struct path path;
				131	int error;
				132
				133	if (length < 0) /* sorry, but loff_t says... */
				134	return -EINVAL;
				135
				136	retry:
				137	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
				138	if (!error) {
				139	error = vfs_truncate(&path, length);
				140	path_put(&path);
				141	}
				142	if (retry_estale(error, lookup_flags)) {
				143	lookup_flags \|= LOOKUP_REVAL;
				144	goto retry;
				145	}
				146	return error;
				147	}
				148
				149	SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
				150	{
				151	return do_sys_truncate(path, length);
				152	}
				153
				154	#ifdef CONFIG_COMPAT
				155	COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length)
				156	{
				157	return do_sys_truncate(path, length);
				158	}
				159	#endif
				160
				161	long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
				162	{
				163	struct inode *inode;
				164	struct dentry *dentry;
				165	struct vfsmount *mnt;
				166	struct fd f;
				167	int error;
				168
				169	error = -EINVAL;
				170	if (length < 0)
				171	goto out;
				172	error = -EBADF;
				173	f = fdget(fd);
				174	if (!f.file)
				175	goto out;
				176
				177	/* explicitly opened as large or we are on 64-bit box */
				178	if (f.file->f_flags & O_LARGEFILE)
				179	small = 0;
				180
				181	dentry = f.file->f_path.dentry;
				182	mnt = f.file->f_path.mnt;
				183	inode = dentry->d_inode;
				184	error = -EINVAL;
				185	if (!S_ISREG(inode->i_mode) \|\| !(f.file->f_mode & FMODE_WRITE))
				186	goto out_putf;
				187
				188	error = -EINVAL;
				189	/* Cannot ftruncate over 2^31 bytes without large file support */
				190	if (small && length > MAX_NON_LFS)
				191	goto out_putf;
				192
				193	error = -EPERM;
				194	/* Check IS_APPEND on real upper inode */
				195	if (IS_APPEND(file_inode(f.file)))
				196	goto out_putf;
				197
				198	sb_start_write(inode->i_sb);
				199	error = locks_verify_truncate(inode, f.file, length);
				200	if (!error)
				201	error = security_path_truncate(&f.file->f_path);
				202	if (!error)
				203	error = do_truncate2(mnt, dentry, length, ATTR_MTIME\|ATTR_CTIME, f.file);
				204	sb_end_write(inode->i_sb);
				205	out_putf:
				206	fdput(f);
				207	out:
				208	return error;
				209	}
				210
				211	SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length)
				212	{
				213	return do_sys_ftruncate(fd, length, 1);
				214	}
				215
				216	#ifdef CONFIG_COMPAT
				217	COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length)
				218	{
				219	return do_sys_ftruncate(fd, length, 1);
				220	}
				221	#endif
				222
				223	/* LFS versions of truncate are only needed on 32 bit machines */
				224	#if BITS_PER_LONG == 32
				225	SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length)
				226	{
				227	return do_sys_truncate(path, length);
				228	}
				229
				230	SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length)
				231	{
				232	return do_sys_ftruncate(fd, length, 0);
				233	}
				234	#endif /* BITS_PER_LONG == 32 */
				235
				236
				237	int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
				238	{
				239	struct inode *inode = file_inode(file);
				240	long ret;
				241
				242	if (offset < 0 \|\| len <= 0)
				243	return -EINVAL;
				244
				245	/* Return error if mode is not supported */
				246	if (mode & ~FALLOC_FL_SUPPORTED_MASK)
				247	return -EOPNOTSUPP;
				248
				249	/* Punch hole and zero range are mutually exclusive */
				250	if ((mode & (FALLOC_FL_PUNCH_HOLE \| FALLOC_FL_ZERO_RANGE)) ==
				251	(FALLOC_FL_PUNCH_HOLE \| FALLOC_FL_ZERO_RANGE))
				252	return -EOPNOTSUPP;
				253
				254	/* Punch hole must have keep size set */
				255	if ((mode & FALLOC_FL_PUNCH_HOLE) &&
				256	!(mode & FALLOC_FL_KEEP_SIZE))
				257	return -EOPNOTSUPP;
				258
				259	/* Collapse range should only be used exclusively. */
				260	if ((mode & FALLOC_FL_COLLAPSE_RANGE) &&
				261	(mode & ~FALLOC_FL_COLLAPSE_RANGE))
				262	return -EINVAL;
				263
				264	/* Insert range should only be used exclusively. */
				265	if ((mode & FALLOC_FL_INSERT_RANGE) &&
				266	(mode & ~FALLOC_FL_INSERT_RANGE))
				267	return -EINVAL;
				268
				269	/* Unshare range should only be used with allocate mode. */
				270	if ((mode & FALLOC_FL_UNSHARE_RANGE) &&
				271	(mode & ~(FALLOC_FL_UNSHARE_RANGE \| FALLOC_FL_KEEP_SIZE)))
				272	return -EINVAL;
				273
				274	if (!(file->f_mode & FMODE_WRITE))
				275	return -EBADF;
				276
				277	/*
				278	* We can only allow pure fallocate on append only files
				279	*/
				280	if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode))
				281	return -EPERM;
				282
				283	if (IS_IMMUTABLE(inode))
				284	return -EPERM;
				285
				286	/*
				287	* We cannot allow any fallocate operation on an active swapfile
				288	*/
				289	if (IS_SWAPFILE(inode))
				290	return -ETXTBSY;
				291
				292	/*
				293	* Revalidate the write permissions, in case security policy has
				294	* changed since the files were opened.
				295	*/
				296	ret = security_file_permission(file, MAY_WRITE);
				297	if (ret)
				298	return ret;
				299
				300	if (S_ISFIFO(inode->i_mode))
				301	return -ESPIPE;
				302
				303	if (S_ISDIR(inode->i_mode))
				304	return -EISDIR;
				305
				306	if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
				307	return -ENODEV;
				308
				309	/* Check for wrap through zero too */
				310	if (((offset + len) > inode->i_sb->s_maxbytes) \|\| ((offset + len) < 0))
				311	return -EFBIG;
				312
				313	if (!file->f_op->fallocate)
				314	return -EOPNOTSUPP;
				315
				316	file_start_write(file);
				317	ret = file->f_op->fallocate(file, mode, offset, len);
				318
				319	/*
				320	* Create inotify and fanotify events.
				321	*
				322	* To keep the logic simple always create events if fallocate succeeds.
				323	* This implies that events are even created if the file size remains
				324	* unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE.
				325	*/
				326	if (ret == 0)
				327	fsnotify_modify(file);
				328
				329	file_end_write(file);
				330	return ret;
				331	}
				332	EXPORT_SYMBOL_GPL(vfs_fallocate);
				333
				334	int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len)
				335	{
				336	struct fd f = fdget(fd);
				337	int error = -EBADF;
				338
				339	if (f.file) {
				340	error = vfs_fallocate(f.file, mode, offset, len);
				341	fdput(f);
				342	}
				343	return error;
				344	}
				345
				346	SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
				347	{
				348	return ksys_fallocate(fd, mode, offset, len);
				349	}
				350
				351	/*
				352	* access() needs to use the real uid/gid, not the effective uid/gid.
				353	* We do this by temporarily clearing all FS-related capabilities and
				354	* switching the fsuid/fsgid around to the real ones.
				355	*/
				356	long do_faccessat(int dfd, const char __user *filename, int mode)
				357	{
				358	const struct cred *old_cred;
				359	struct cred *override_cred;
				360	struct path path;
				361	struct inode *inode;
				362	struct vfsmount *mnt;
				363	int res;
				364	unsigned int lookup_flags = LOOKUP_FOLLOW;
				365
				366	if (mode & ~S_IRWXO) /* where's F_OK, X_OK, W_OK, R_OK? */
				367	return -EINVAL;
				368
				369	override_cred = prepare_creds();
				370	if (!override_cred)
				371	return -ENOMEM;
				372
				373	override_cred->fsuid = override_cred->uid;
				374	override_cred->fsgid = override_cred->gid;
				375
				376	if (!issecure(SECURE_NO_SETUID_FIXUP)) {
				377	/* Clear the capabilities if we switch to a non-root user */
				378	kuid_t root_uid = make_kuid(override_cred->user_ns, 0);
				379	if (!uid_eq(override_cred->uid, root_uid))
				380	cap_clear(override_cred->cap_effective);
				381	else
				382	override_cred->cap_effective =
				383	override_cred->cap_permitted;
				384	}
				385
				386	/*
				387	* The new set of credentials can only be used in
				388	* task-synchronous circumstances, and does not need
				389	* RCU freeing, unless somebody then takes a separate
				390	* reference to it.
				391	*
				392	* NOTE! This is _only_ true because this credential
				393	* is used purely for override_creds() that installs
				394	* it as the subjective cred. Other threads will be
				395	* accessing ->real_cred, not the subjective cred.
				396	*
				397	* If somebody _does_ make a copy of this (using the
				398	* 'get_current_cred()' function), that will clear the
				399	* non_rcu field, because now that other user may be
				400	* expecting RCU freeing. But normal thread-synchronous
				401	* cred accesses will keep things non-RCY.
				402	*/
				403	override_cred->non_rcu = 1;
				404
				405	old_cred = override_creds(override_cred);
				406	retry:
				407	res = user_path_at(dfd, filename, lookup_flags, &path);
				408	if (res)
				409	goto out;
				410
				411	inode = d_backing_inode(path.dentry);
				412	mnt = path.mnt;
				413
				414	if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
				415	/*
				416	* MAY_EXEC on regular files is denied if the fs is mounted
				417	* with the "noexec" flag.
				418	*/
				419	res = -EACCES;
				420	if (path_noexec(&path))
				421	goto out_path_release;
				422	}
				423
				424	res = inode_permission2(mnt, inode, mode \| MAY_ACCESS);
				425	/* SuS v2 requires we report a read only fs too */
				426	if (res \|\| !(mode & S_IWOTH) \|\| special_file(inode->i_mode))
				427	goto out_path_release;
				428	/*
				429	* This is a rare case where using __mnt_is_readonly()
				430	* is OK without a mnt_want/drop_write() pair. Since
				431	* no actual write to the fs is performed here, we do
				432	* not need to telegraph to that to anyone.
				433	*
				434	* By doing this, we accept that this access is
				435	* inherently racy and know that the fs may change
				436	* state before we even see this result.
				437	*/
				438	if (__mnt_is_readonly(path.mnt))
				439	res = -EROFS;
				440
				441	out_path_release:
				442	path_put(&path);
				443	if (retry_estale(res, lookup_flags)) {
				444	lookup_flags \|= LOOKUP_REVAL;
				445	goto retry;
				446	}
				447	out:
				448	revert_creds(old_cred);
				449	put_cred(override_cred);
				450	return res;
				451	}
				452
				453	SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
				454	{
				455	return do_faccessat(dfd, filename, mode);
				456	}
				457
				458	SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
				459	{
				460	return do_faccessat(AT_FDCWD, filename, mode);
				461	}
				462
				463	int ksys_chdir(const char __user *filename)
				464	{
				465	struct path path;
				466	int error;
				467	unsigned int lookup_flags = LOOKUP_FOLLOW \| LOOKUP_DIRECTORY;
				468	retry:
				469	error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
				470	if (error)
				471	goto out;
				472
				473	error = inode_permission2(path.mnt, path.dentry->d_inode, MAY_EXEC \| MAY_CHDIR);
				474	if (error)
				475	goto dput_and_out;
				476
				477	set_fs_pwd(current->fs, &path);
				478
				479	dput_and_out:
				480	path_put(&path);
				481	if (retry_estale(error, lookup_flags)) {
				482	lookup_flags \|= LOOKUP_REVAL;
				483	goto retry;
				484	}
				485	out:
				486	return error;
				487	}
				488
				489	SYSCALL_DEFINE1(chdir, const char __user *, filename)
				490	{
				491	return ksys_chdir(filename);
				492	}
				493
				494	SYSCALL_DEFINE1(fchdir, unsigned int, fd)
				495	{
				496	struct fd f = fdget_raw(fd);
				497	int error;
				498
				499	error = -EBADF;
				500	if (!f.file)
				501	goto out;
				502
				503	error = -ENOTDIR;
				504	if (!d_can_lookup(f.file->f_path.dentry))
				505	goto out_putf;
				506
				507	error = inode_permission2(f.file->f_path.mnt, file_inode(f.file),
				508	MAY_EXEC \| MAY_CHDIR);
				509	if (!error)
				510	set_fs_pwd(current->fs, &f.file->f_path);
				511	out_putf:
				512	fdput(f);
				513	out:
				514	return error;
				515	}
				516
				517	int ksys_chroot(const char __user *filename)
				518	{
				519	struct path path;
				520	int error;
				521	unsigned int lookup_flags = LOOKUP_FOLLOW \| LOOKUP_DIRECTORY;
				522	retry:
				523	error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
				524	if (error)
				525	goto out;
				526
				527	error = inode_permission2(path.mnt, path.dentry->d_inode, MAY_EXEC \| MAY_CHDIR);
				528	if (error)
				529	goto dput_and_out;
				530
				531	error = -EPERM;
				532	if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT))
				533	goto dput_and_out;
				534	error = security_path_chroot(&path);
				535	if (error)
				536	goto dput_and_out;
				537
				538	set_fs_root(current->fs, &path);
				539	error = 0;
				540	dput_and_out:
				541	path_put(&path);
				542	if (retry_estale(error, lookup_flags)) {
				543	lookup_flags \|= LOOKUP_REVAL;
				544	goto retry;
				545	}
				546	out:
				547	return error;
				548	}
				549
				550	SYSCALL_DEFINE1(chroot, const char __user *, filename)
				551	{
				552	return ksys_chroot(filename);
				553	}
				554
				555	static int chmod_common(const struct path *path, umode_t mode)
				556	{
				557	struct inode *inode = path->dentry->d_inode;
				558	struct inode *delegated_inode = NULL;
				559	struct iattr newattrs;
				560	int error;
				561
				562	error = mnt_want_write(path->mnt);
				563	if (error)
				564	return error;
				565	retry_deleg:
				566	inode_lock(inode);
				567	error = security_path_chmod(path, mode);
				568	if (error)
				569	goto out_unlock;
				570	newattrs.ia_mode = (mode & S_IALLUGO) \| (inode->i_mode & ~S_IALLUGO);
				571	newattrs.ia_valid = ATTR_MODE \| ATTR_CTIME;
				572	error = notify_change2(path->mnt, path->dentry, &newattrs, &delegated_inode);
				573	out_unlock:
				574	inode_unlock(inode);
				575	if (delegated_inode) {
				576	error = break_deleg_wait(&delegated_inode);
				577	if (!error)
				578	goto retry_deleg;
				579	}
				580	mnt_drop_write(path->mnt);
				581	return error;
				582	}
				583
				584	int ksys_fchmod(unsigned int fd, umode_t mode)
				585	{
				586	struct fd f = fdget(fd);
				587	int err = -EBADF;
				588
				589	if (f.file) {
				590	audit_file(f.file);
				591	err = chmod_common(&f.file->f_path, mode);
				592	fdput(f);
				593	}
				594	return err;
				595	}
				596
				597	SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
				598	{
				599	return ksys_fchmod(fd, mode);
				600	}
				601
				602	int do_fchmodat(int dfd, const char __user *filename, umode_t mode)
				603	{
				604	struct path path;
				605	int error;
				606	unsigned int lookup_flags = LOOKUP_FOLLOW;
				607	retry:
				608	error = user_path_at(dfd, filename, lookup_flags, &path);
				609	if (!error) {
				610	error = chmod_common(&path, mode);
				611	path_put(&path);
				612	if (retry_estale(error, lookup_flags)) {
				613	lookup_flags \|= LOOKUP_REVAL;
				614	goto retry;
				615	}
				616	}
				617	return error;
				618	}
				619
				620	SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename,
				621	umode_t, mode)
				622	{
				623	return do_fchmodat(dfd, filename, mode);
				624	}
				625
				626	SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
				627	{
				628	return do_fchmodat(AT_FDCWD, filename, mode);
				629	}
				630
				631	static int chown_common(const struct path *path, uid_t user, gid_t group)
				632	{
				633	struct inode *inode = path->dentry->d_inode;
				634	struct inode *delegated_inode = NULL;
				635	int error;
				636	struct iattr newattrs;
				637	kuid_t uid;
				638	kgid_t gid;
				639
				640	uid = make_kuid(current_user_ns(), user);
				641	gid = make_kgid(current_user_ns(), group);
				642
				643	retry_deleg:
				644	newattrs.ia_valid = ATTR_CTIME;
				645	if (user != (uid_t) -1) {
				646	if (!uid_valid(uid))
				647	return -EINVAL;
				648	newattrs.ia_valid \|= ATTR_UID;
				649	newattrs.ia_uid = uid;
				650	}
				651	if (group != (gid_t) -1) {
				652	if (!gid_valid(gid))
				653	return -EINVAL;
				654	newattrs.ia_valid \|= ATTR_GID;
				655	newattrs.ia_gid = gid;
				656	}
				657	if (!S_ISDIR(inode->i_mode))
				658	newattrs.ia_valid \|=
				659	ATTR_KILL_SUID \| ATTR_KILL_SGID \| ATTR_KILL_PRIV;
				660	inode_lock(inode);
				661	error = security_path_chown(path, uid, gid);
				662	if (!error)
				663	error = notify_change2(path->mnt, path->dentry, &newattrs, &delegated_inode);
				664	inode_unlock(inode);
				665	if (delegated_inode) {
				666	error = break_deleg_wait(&delegated_inode);
				667	if (!error)
				668	goto retry_deleg;
				669	}
				670	return error;
				671	}
				672
				673	int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
				674	int flag)
				675	{
				676	struct path path;
				677	int error = -EINVAL;
				678	int lookup_flags;
				679
				680	if ((flag & ~(AT_SYMLINK_NOFOLLOW \| AT_EMPTY_PATH)) != 0)
				681	goto out;
				682
				683	lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
				684	if (flag & AT_EMPTY_PATH)
				685	lookup_flags \|= LOOKUP_EMPTY;
				686	retry:
				687	error = user_path_at(dfd, filename, lookup_flags, &path);
				688	if (error)
				689	goto out;
				690	error = mnt_want_write(path.mnt);
				691	if (error)
				692	goto out_release;
				693	error = chown_common(&path, user, group);
				694	mnt_drop_write(path.mnt);
				695	out_release:
				696	path_put(&path);
				697	if (retry_estale(error, lookup_flags)) {
				698	lookup_flags \|= LOOKUP_REVAL;
				699	goto retry;
				700	}
				701	out:
				702	return error;
				703	}
				704
				705	SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
				706	gid_t, group, int, flag)
				707	{
				708	return do_fchownat(dfd, filename, user, group, flag);
				709	}
				710
				711	SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
				712	{
				713	return do_fchownat(AT_FDCWD, filename, user, group, 0);
				714	}
				715
				716	SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
				717	{
				718	return do_fchownat(AT_FDCWD, filename, user, group,
				719	AT_SYMLINK_NOFOLLOW);
				720	}
				721
				722	int ksys_fchown(unsigned int fd, uid_t user, gid_t group)
				723	{
				724	struct fd f = fdget(fd);
				725	int error = -EBADF;
				726
				727	if (!f.file)
				728	goto out;
				729
				730	error = mnt_want_write_file(f.file);
				731	if (error)
				732	goto out_fput;
				733	audit_file(f.file);
				734	error = chown_common(&f.file->f_path, user, group);
				735	mnt_drop_write_file(f.file);
				736	out_fput:
				737	fdput(f);
				738	out:
				739	return error;
				740	}
				741
				742	SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
				743	{
				744	return ksys_fchown(fd, user, group);
				745	}
				746
				747	static int do_dentry_open(struct file *f,
				748	struct inode *inode,
				749	int (open)(struct inode , struct file *))
				750	{
				751	static const struct file_operations empty_fops = {};
				752	int error;
				753
				754	path_get(&f->f_path);
				755	f->f_inode = inode;
				756	f->f_mapping = inode->i_mapping;
				757
				758	/* Ensure that we skip any errors that predate opening of the file */
				759	f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
				760
				761	if (unlikely(f->f_flags & O_PATH)) {
				762	f->f_mode = FMODE_PATH \| FMODE_OPENED;
				763	f->f_op = &empty_fops;
				764	return 0;
				765	}
				766
				767	/* Any file opened for execve()/uselib() has to be a regular file. */
				768	if (unlikely(f->f_flags & FMODE_EXEC && !S_ISREG(inode->i_mode))) {
				769	error = -EACCES;
				770	goto cleanup_file;
				771	}
				772
				773	if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
				774	error = get_write_access(inode);
				775	if (unlikely(error))
				776	goto cleanup_file;
				777	error = __mnt_want_write(f->f_path.mnt);
				778	if (unlikely(error)) {
				779	put_write_access(inode);
				780	goto cleanup_file;
				781	}
				782	f->f_mode \|= FMODE_WRITER;
				783	}
				784
				785	/* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
				786	if (S_ISREG(inode->i_mode) \|\| S_ISDIR(inode->i_mode))
				787	f->f_mode \|= FMODE_ATOMIC_POS;
				788
				789	f->f_op = fops_get(inode->i_fop);
				790	if (unlikely(WARN_ON(!f->f_op))) {
				791	error = -ENODEV;
				792	goto cleanup_all;
				793	}
				794
				795	error = security_file_open(f);
				796	if (error)
				797	goto cleanup_all;
				798
				799	error = break_lease(locks_inode(f), f->f_flags);
				800	if (error)
				801	goto cleanup_all;
				802
				803	/* normally all 3 are set; ->open() can clear them if needed */
				804	f->f_mode \|= FMODE_LSEEK \| FMODE_PREAD \| FMODE_PWRITE;
				805	if (!open)
				806	open = f->f_op->open;
				807	if (open) {
				808	error = open(inode, f);
				809	if (error)
				810	goto cleanup_all;
				811	}
				812	f->f_mode \|= FMODE_OPENED;
				813	if ((f->f_mode & (FMODE_READ \| FMODE_WRITE)) == FMODE_READ)
				814	i_readcount_inc(inode);
				815	if ((f->f_mode & FMODE_READ) &&
				816	likely(f->f_op->read \|\| f->f_op->read_iter))
				817	f->f_mode \|= FMODE_CAN_READ;
				818	if ((f->f_mode & FMODE_WRITE) &&
				819	likely(f->f_op->write \|\| f->f_op->write_iter))
				820	f->f_mode \|= FMODE_CAN_WRITE;
				821
				822	f->f_write_hint = WRITE_LIFE_NOT_SET;
				823	f->f_flags &= ~(O_CREAT \| O_EXCL \| O_NOCTTY \| O_TRUNC);
				824
				825	file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
				826
				827	/* NB: we're sure to have correct a_ops only after f_op->open */
				828	if (f->f_flags & O_DIRECT) {
				829	if (!f->f_mapping->a_ops \|\| !f->f_mapping->a_ops->direct_IO)
				830	return -EINVAL;
				831	}
				832	return 0;
				833
				834	cleanup_all:
				835	if (WARN_ON_ONCE(error > 0))
				836	error = -EINVAL;
				837	fops_put(f->f_op);
				838	if (f->f_mode & FMODE_WRITER) {
				839	put_write_access(inode);
				840	__mnt_drop_write(f->f_path.mnt);
				841	}
				842	cleanup_file:
				843	path_put(&f->f_path);
				844	f->f_path.mnt = NULL;
				845	f->f_path.dentry = NULL;
				846	f->f_inode = NULL;
				847	return error;
				848	}
				849
				850	/**
				851	* finish_open - finish opening a file
				852	* @file: file pointer
				853	* @dentry: pointer to dentry
				854	* @open: open callback
				855	* @opened: state of open
				856	*
				857	* This can be used to finish opening a file passed to i_op->atomic_open().
				858	*
				859	* If the open callback is set to NULL, then the standard f_op->open()
				860	* filesystem callback is substituted.
				861	*
				862	* NB: the dentry reference is _not_ consumed. If, for example, the dentry is
				863	* the return value of d_splice_alias(), then the caller needs to perform dput()
				864	* on it after finish_open().
				865	*
				866	* On successful return @file is a fully instantiated open file. After this, if
				867	* an error occurs in ->atomic_open(), it needs to clean up with fput().
				868	*
				869	* Returns zero on success or -errno if the open failed.
				870	*/
				871	int finish_open(struct file file, struct dentry dentry,
				872	int (open)(struct inode , struct file *))
				873	{
				874	BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */
				875
				876	file->f_path.dentry = dentry;
				877	return do_dentry_open(file, d_backing_inode(dentry), open);
				878	}
				879	EXPORT_SYMBOL(finish_open);
				880
				881	/**
				882	* finish_no_open - finish ->atomic_open() without opening the file
				883	*
				884	* @file: file pointer
				885	* @dentry: dentry or NULL (as returned from ->lookup())
				886	*
				887	* This can be used to set the result of a successful lookup in ->atomic_open().
				888	*
				889	* NB: unlike finish_open() this function does consume the dentry reference and
				890	* the caller need not dput() it.
				891	*
				892	* Returns "0" which must be the return value of ->atomic_open() after having
				893	* called this function.
				894	*/
				895	int finish_no_open(struct file file, struct dentry dentry)
				896	{
				897	file->f_path.dentry = dentry;
				898	return 0;
				899	}
				900	EXPORT_SYMBOL(finish_no_open);
				901
				902	char file_path(struct file filp, char *buf, int buflen)
				903	{
				904	return d_path(&filp->f_path, buf, buflen);
				905	}
				906	EXPORT_SYMBOL(file_path);
				907
				908	/**
				909	* vfs_open - open the file at the given path
				910	* @path: path to open
				911	* @file: newly allocated file with f_flag initialized
				912	* @cred: credentials to use
				913	*/
				914	int vfs_open(const struct path path, struct file file)
				915	{
				916	file->f_path = *path;
				917	return do_dentry_open(file, d_backing_inode(path->dentry), NULL);
				918	}
				919
				920	struct file dentry_open(const struct path path, int flags,
				921	const struct cred *cred)
				922	{
				923	int error;
				924	struct file *f;
				925
				926	validate_creds(cred);
				927
				928	/* We must always pass in a valid mount pointer. */
				929	BUG_ON(!path->mnt);
				930
				931	f = alloc_empty_file(flags, cred);
				932	if (!IS_ERR(f)) {
				933	error = vfs_open(path, f);
				934	if (error) {
				935	fput(f);
				936	f = ERR_PTR(error);
				937	}
				938	}
				939	return f;
				940	}
				941	EXPORT_SYMBOL(dentry_open);
				942
				943	struct file open_with_fake_path(const struct path path, int flags,
				944	struct inode inode, const struct cred cred)
				945	{
				946	struct file *f = alloc_empty_file_noaccount(flags, cred);
				947	if (!IS_ERR(f)) {
				948	int error;
				949
				950	f->f_path = *path;
				951	error = do_dentry_open(f, inode, NULL);
				952	if (error) {
				953	fput(f);
				954	f = ERR_PTR(error);
				955	}
				956	}
				957	return f;
				958	}
				959	EXPORT_SYMBOL(open_with_fake_path);
				960
				961	static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
				962	{
				963	int lookup_flags = 0;
				964	int acc_mode = ACC_MODE(flags);
				965
				966	/*
				967	* Clear out all open flags we don't know about so that we don't report
				968	* them in fcntl(F_GETFD) or similar interfaces.
				969	*/
				970	flags &= VALID_OPEN_FLAGS;
				971
				972	if (flags & (O_CREAT \| __O_TMPFILE))
				973	op->mode = (mode & S_IALLUGO) \| S_IFREG;
				974	else
				975	op->mode = 0;
				976
				977	/* Must never be set by userspace */
				978	flags &= ~FMODE_NONOTIFY & ~O_CLOEXEC;
				979
				980	/*
				981	* O_SYNC is implemented as __O_SYNC\|O_DSYNC. As many places only
				982	* check for O_DSYNC if the need any syncing at all we enforce it's
				983	* always set instead of having to deal with possibly weird behaviour
				984	* for malicious applications setting only __O_SYNC.
				985	*/
				986	if (flags & __O_SYNC)
				987	flags \|= O_DSYNC;
				988
				989	if (flags & __O_TMPFILE) {
				990	if ((flags & O_TMPFILE_MASK) != O_TMPFILE)
				991	return -EINVAL;
				992	if (!(acc_mode & MAY_WRITE))
				993	return -EINVAL;
				994	} else if (flags & O_PATH) {
				995	/*
				996	* If we have O_PATH in the open flag. Then we
				997	* cannot have anything other than the below set of flags
				998	*/
				999	flags &= O_DIRECTORY \| O_NOFOLLOW \| O_PATH;
				1000	acc_mode = 0;
				1001	}
				1002
				1003	op->open_flag = flags;
				1004
				1005	/* O_TRUNC implies we need access checks for write permissions */
				1006	if (flags & O_TRUNC)
				1007	acc_mode \|= MAY_WRITE;
				1008
				1009	/* Allow the LSM permission hook to distinguish append
				1010	access from general write access. */
				1011	if (flags & O_APPEND)
				1012	acc_mode \|= MAY_APPEND;
				1013
				1014	op->acc_mode = acc_mode;
				1015
				1016	op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
				1017
				1018	if (flags & O_CREAT) {
				1019	op->intent \|= LOOKUP_CREATE;
				1020	if (flags & O_EXCL)
				1021	op->intent \|= LOOKUP_EXCL;
				1022	}
				1023
				1024	if (flags & O_DIRECTORY)
				1025	lookup_flags \|= LOOKUP_DIRECTORY;
				1026	if (!(flags & O_NOFOLLOW))
				1027	lookup_flags \|= LOOKUP_FOLLOW;
				1028	op->lookup_flags = lookup_flags;
				1029	return 0;
				1030	}
				1031
				1032	/**
				1033	* file_open_name - open file and return file pointer
				1034	*
				1035	* @name: struct filename containing path to open
				1036	* @flags: open flags as per the open(2) second argument
				1037	* @mode: mode for the new file if O_CREAT is set, else ignored
				1038	*
				1039	* This is the helper to open a file from kernelspace if you really
				1040	* have to. But in generally you should not do this, so please move
				1041	* along, nothing to see here..
				1042	*/
				1043	struct file file_open_name(struct filename name, int flags, umode_t mode)
				1044	{
				1045	struct open_flags op;
				1046	int err = build_open_flags(flags, mode, &op);
				1047	return err ? ERR_PTR(err) : do_filp_open(AT_FDCWD, name, &op);
				1048	}
				1049
				1050	/**
				1051	* filp_open - open file and return file pointer
				1052	*
				1053	* @filename: path to open
				1054	* @flags: open flags as per the open(2) second argument
				1055	* @mode: mode for the new file if O_CREAT is set, else ignored
				1056	*
				1057	* This is the helper to open a file from kernelspace if you really
				1058	* have to. But in generally you should not do this, so please move
				1059	* along, nothing to see here..
				1060	*/
				1061	struct file filp_open(const char filename, int flags, umode_t mode)
				1062	{
				1063	struct filename *name = getname_kernel(filename);
				1064	struct file *file = ERR_CAST(name);
				1065
				1066	if (!IS_ERR(name)) {
				1067	file = file_open_name(name, flags, mode);
				1068	putname(name);
				1069	}
				1070	return file;
				1071	}
				1072	EXPORT_SYMBOL(filp_open);
				1073
				1074	struct file file_open_root(struct dentry dentry, struct vfsmount *mnt,
				1075	const char *filename, int flags, umode_t mode)
				1076	{
				1077	struct open_flags op;
				1078	int err = build_open_flags(flags, mode, &op);
				1079	if (err)
				1080	return ERR_PTR(err);
				1081	return do_file_open_root(dentry, mnt, filename, &op);
				1082	}
				1083	EXPORT_SYMBOL(file_open_root);
				1084
				1085	long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
				1086	{
				1087	struct open_flags op;
				1088	int fd = build_open_flags(flags, mode, &op);
				1089	struct filename *tmp;
				1090
				1091	if (fd)
				1092	return fd;
				1093
				1094	tmp = getname(filename);
				1095	if (IS_ERR(tmp))
				1096	return PTR_ERR(tmp);
				1097
				1098	fd = get_unused_fd_flags(flags);
				1099	if (fd >= 0) {
				1100	struct file *f = do_filp_open(dfd, tmp, &op);
				1101	if (IS_ERR(f)) {
				1102	put_unused_fd(fd);
				1103	fd = PTR_ERR(f);
				1104	} else {
				1105	fsnotify_open(f);
				1106	fd_install(fd, f);
				1107	}
				1108	}
				1109	putname(tmp);
				1110	return fd;
				1111	}
				1112
				1113	SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
				1114	{
				1115	if (force_o_largefile())
				1116	flags \|= O_LARGEFILE;
				1117
				1118	return do_sys_open(AT_FDCWD, filename, flags, mode);
				1119	}
				1120
				1121	SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
				1122	umode_t, mode)
				1123	{
				1124	if (force_o_largefile())
				1125	flags \|= O_LARGEFILE;
				1126
				1127	return do_sys_open(dfd, filename, flags, mode);
				1128	}
				1129
				1130	#ifdef CONFIG_COMPAT
				1131	/*
				1132	* Exactly like sys_open(), except that it doesn't set the
				1133	* O_LARGEFILE flag.
				1134	*/
				1135	COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
				1136	{
				1137	return do_sys_open(AT_FDCWD, filename, flags, mode);
				1138	}
				1139
				1140	/*
				1141	* Exactly like sys_openat(), except that it doesn't set the
				1142	* O_LARGEFILE flag.
				1143	*/
				1144	COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode)
				1145	{
				1146	return do_sys_open(dfd, filename, flags, mode);
				1147	}
				1148	#endif
				1149
				1150	#ifndef __alpha__
				1151
				1152	/*
				1153	* For backward compatibility? Maybe this should be moved
				1154	* into arch/i386 instead?
				1155	*/
				1156	SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode)
				1157	{
				1158	return ksys_open(pathname, O_CREAT \| O_WRONLY \| O_TRUNC, mode);
				1159	}
				1160
				1161	#endif
				1162
				1163	/*
				1164	* "id" is the POSIX thread ID. We use the
				1165	* files pointer for this..
				1166	*/
				1167	int filp_close(struct file *filp, fl_owner_t id)
				1168	{
				1169	int retval = 0;
				1170
				1171	if (!file_count(filp)) {
				1172	printk(KERN_ERR "VFS: Close: file count is 0\n");
				1173	return 0;
				1174	}
				1175
				1176	if (filp->f_op->flush)
				1177	retval = filp->f_op->flush(filp, id);
				1178
				1179	if (likely(!(filp->f_mode & FMODE_PATH))) {
				1180	dnotify_flush(filp, id);
				1181	locks_remove_posix(filp, id);
				1182	}
				1183	fput(filp);
				1184	return retval;
				1185	}
				1186
				1187	EXPORT_SYMBOL(filp_close);
				1188
				1189	/*
				1190	* Careful here! We test whether the file pointer is NULL before
				1191	* releasing the fd. This ensures that one clone task can't release
				1192	* an fd while another clone is opening it.
				1193	*/
				1194	SYSCALL_DEFINE1(close, unsigned int, fd)
				1195	{
				1196	int retval = __close_fd(current->files, fd);
				1197
				1198	/* can't restart close syscall because file table entry was cleared */
				1199	if (unlikely(retval == -ERESTARTSYS \|\|
				1200	retval == -ERESTARTNOINTR \|\|
				1201	retval == -ERESTARTNOHAND \|\|
				1202	retval == -ERESTART_RESTARTBLOCK))
				1203	retval = -EINTR;
				1204
				1205	return retval;
				1206	}
				1207
				1208	/*
				1209	* This routine simulates a hangup on the tty, to arrange that users
				1210	* are given clean terminals at login time.
				1211	*/
				1212	SYSCALL_DEFINE0(vhangup)
				1213	{
				1214	if (capable(CAP_SYS_TTY_CONFIG)) {
				1215	tty_vhangup_self();
				1216	return 0;
				1217	}
				1218	return -EPERM;
				1219	}
				1220
				1221	/*
				1222	* Called when an inode is about to be open.
				1223	* We use this to disallow opening large files on 32bit systems if
				1224	* the caller didn't specify O_LARGEFILE. On 64bit systems we force
				1225	* on this flag in sys_open.
				1226	*/
				1227	int generic_file_open(struct inode * inode, struct file * filp)
				1228	{
				1229	if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
				1230	return -EOVERFLOW;
				1231	return 0;
				1232	}
				1233
				1234	EXPORT_SYMBOL(generic_file_open);
				1235
				1236	/*
				1237	* This is used by subsystems that don't want seekable
				1238	* file descriptors. The function is not supposed to ever fail, the only
				1239	* reason it returns an 'int' and not 'void' is so that it can be plugged
				1240	* directly into file_operations structure.
				1241	*/
				1242	int nonseekable_open(struct inode inode, struct file filp)
				1243	{
				1244	filp->f_mode &= ~(FMODE_LSEEK \| FMODE_PREAD \| FMODE_PWRITE);
				1245	return 0;
				1246	}
				1247
				1248	EXPORT_SYMBOL(nonseekable_open);
				1249
				1250	/*
				1251	* stream_open is used by subsystems that want stream-like file descriptors.
				1252	* Such file descriptors are not seekable and don't have notion of position
				1253	* (file.f_pos is always 0). Contrary to file descriptors of other regular
				1254	* files, .read() and .write() can run simultaneously.
				1255	*
				1256	* stream_open never fails and is marked to return int so that it could be
				1257	* directly used as file_operations.open .
				1258	*/
				1259	int stream_open(struct inode inode, struct file filp)
				1260	{
				1261	filp->f_mode &= ~(FMODE_LSEEK \| FMODE_PREAD \| FMODE_PWRITE \| FMODE_ATOMIC_POS);
				1262	filp->f_mode \|= FMODE_STREAM;
				1263	return 0;
				1264	}
				1265
				1266	EXPORT_SYMBOL(stream_open);