Blame - src/kernel/linux/v4.19/fs/btrfs/ioctl.c - T800

blob: 199c70b8f7d8e705d00a68babd0251f767a205f8 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Copyright (C) 2007 Oracle. All rights reserved.
				4	*/
				5
				6	#include <linux/kernel.h>
				7	#include <linux/bio.h>
				8	#include <linux/file.h>
				9	#include <linux/fs.h>
				10	#include <linux/fsnotify.h>
				11	#include <linux/pagemap.h>
				12	#include <linux/highmem.h>
				13	#include <linux/time.h>
				14	#include <linux/string.h>
				15	#include <linux/backing-dev.h>
				16	#include <linux/mount.h>
				17	#include <linux/namei.h>
				18	#include <linux/writeback.h>
				19	#include <linux/compat.h>
				20	#include <linux/security.h>
				21	#include <linux/xattr.h>
				22	#include <linux/mm.h>
				23	#include <linux/slab.h>
				24	#include <linux/blkdev.h>
				25	#include <linux/uuid.h>
				26	#include <linux/btrfs.h>
				27	#include <linux/uaccess.h>
				28	#include <linux/iversion.h>
				29	#include "ctree.h"
				30	#include "disk-io.h"
				31	#include "transaction.h"
				32	#include "btrfs_inode.h"
				33	#include "print-tree.h"
				34	#include "volumes.h"
				35	#include "locking.h"
				36	#include "inode-map.h"
				37	#include "backref.h"
				38	#include "rcu-string.h"
				39	#include "send.h"
				40	#include "dev-replace.h"
				41	#include "props.h"
				42	#include "sysfs.h"
				43	#include "qgroup.h"
				44	#include "tree-log.h"
				45	#include "compression.h"
				46
				#ifdef CONFIG_64BIT
				/*
				 * If we have a 32-bit userspace and 64-bit kernel, then the UAPI
				 * structures are incorrect, as the timespec structure from userspace
				 * is 4 bytes too small. We define these alternatives here to teach
				 * the kernel about the 32-bit struct packing.
				 */

				/* Packed timespec as laid out by a 32-bit caller: 8-byte sec, 4-byte nsec. */
				struct btrfs_ioctl_timespec_32 {
					__u64 sec;
					__u32 nsec;
				} __attribute__ ((__packed__));

				/* 32-bit layout of the "set received subvolume" ioctl argument. */
				struct btrfs_ioctl_received_subvol_args_32 {
					char	uuid[BTRFS_UUID_SIZE];	/* in */
					__u64	stransid;		/* in */
					__u64	rtransid;		/* out */
					struct btrfs_ioctl_timespec_32 stime; /* in */
					struct btrfs_ioctl_timespec_32 rtime; /* out */
					__u64	flags;			/* in */
					__u64	reserved[16];		/* in */
				} __attribute__ ((__packed__));

				/* Ioctl number for the 32-bit layout above (command 37, read/write). */
				#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
								struct btrfs_ioctl_received_subvol_args_32)
				#endif
				71
				#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
				/*
				 * Packed layout of the send ioctl argument for compat callers: the
				 * clone_sources pointer is carried as a compat_uptr_t.
				 */
				struct btrfs_ioctl_send_args_32 {
					__s64 send_fd;			/* in */
					__u64 clone_sources_count;	/* in */
					compat_uptr_t clone_sources;	/* in */
					__u64 parent_root;		/* in */
					__u64 flags;			/* in */
					__u64 reserved[4];		/* in */
				} __attribute__ ((__packed__));

				/* Ioctl number for the compat layout above (command 38, write). */
				#define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
							       struct btrfs_ioctl_send_args_32)
				#endif
				85
				86	static int btrfs_clone(struct inode src, struct inode inode,
				87	u64 off, u64 olen, u64 olen_aligned, u64 destoff,
				88	int no_time_update);
				89
				90	/* Mask out flags that are inappropriate for the given type of inode. */
				91	static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
				92	unsigned int flags)
				93	{
				94	if (S_ISDIR(inode->i_mode))
				95	return flags;
				96	else if (S_ISREG(inode->i_mode))
				97	return flags & ~FS_DIRSYNC_FL;
				98	else
				99	return flags & (FS_NODUMP_FL \| FS_NOATIME_FL);
				100	}
				101
				102	/*
				103	* Export internal inode flags to the format expected by the FS_IOC_GETFLAGS
				104	* ioctl.
				105	*/
				106	static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags)
				107	{
				108	unsigned int iflags = 0;
				109
				110	if (flags & BTRFS_INODE_SYNC)
				111	iflags \|= FS_SYNC_FL;
				112	if (flags & BTRFS_INODE_IMMUTABLE)
				113	iflags \|= FS_IMMUTABLE_FL;
				114	if (flags & BTRFS_INODE_APPEND)
				115	iflags \|= FS_APPEND_FL;
				116	if (flags & BTRFS_INODE_NODUMP)
				117	iflags \|= FS_NODUMP_FL;
				118	if (flags & BTRFS_INODE_NOATIME)
				119	iflags \|= FS_NOATIME_FL;
				120	if (flags & BTRFS_INODE_DIRSYNC)
				121	iflags \|= FS_DIRSYNC_FL;
				122	if (flags & BTRFS_INODE_NODATACOW)
				123	iflags \|= FS_NOCOW_FL;
				124
				125	if (flags & BTRFS_INODE_NOCOMPRESS)
				126	iflags \|= FS_NOCOMP_FL;
				127	else if (flags & BTRFS_INODE_COMPRESS)
				128	iflags \|= FS_COMPR_FL;
				129
				130	return iflags;
				131	}
				132
				133	/*
				134	* Update inode->i_flags based on the btrfs internal flags.
				135	*/
				136	void btrfs_sync_inode_flags_to_i_flags(struct inode *inode)
				137	{
				138	struct btrfs_inode *binode = BTRFS_I(inode);
				139	unsigned int new_fl = 0;
				140
				141	if (binode->flags & BTRFS_INODE_SYNC)
				142	new_fl \|= S_SYNC;
				143	if (binode->flags & BTRFS_INODE_IMMUTABLE)
				144	new_fl \|= S_IMMUTABLE;
				145	if (binode->flags & BTRFS_INODE_APPEND)
				146	new_fl \|= S_APPEND;
				147	if (binode->flags & BTRFS_INODE_NOATIME)
				148	new_fl \|= S_NOATIME;
				149	if (binode->flags & BTRFS_INODE_DIRSYNC)
				150	new_fl \|= S_DIRSYNC;
				151
				152	set_mask_bits(&inode->i_flags,
				153	S_SYNC \| S_APPEND \| S_IMMUTABLE \| S_NOATIME \| S_DIRSYNC,
				154	new_fl);
				155	}
				156
				157	static int btrfs_ioctl_getflags(struct file file, void __user arg)
				158	{
				159	struct btrfs_inode *binode = BTRFS_I(file_inode(file));
				160	unsigned int flags = btrfs_inode_flags_to_fsflags(binode->flags);
				161
				162	if (copy_to_user(arg, &flags, sizeof(flags)))
				163	return -EFAULT;
				164	return 0;
				165	}
				166
				167	/* Check if @flags are a supported and valid set of FS__FL flags /
				168	static int check_fsflags(unsigned int flags)
				169	{
				170	if (flags & ~(FS_IMMUTABLE_FL \| FS_APPEND_FL \| \
				171	FS_NOATIME_FL \| FS_NODUMP_FL \| \
				172	FS_SYNC_FL \| FS_DIRSYNC_FL \| \
				173	FS_NOCOMP_FL \| FS_COMPR_FL \|
				174	FS_NOCOW_FL))
				175	return -EOPNOTSUPP;
				176
				177	if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL))
				178	return -EINVAL;
				179
				180	return 0;
				181	}
				182
				183	static int btrfs_ioctl_setflags(struct file file, void __user arg)
				184	{
				185	struct inode *inode = file_inode(file);
				186	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				187	struct btrfs_inode *binode = BTRFS_I(inode);
				188	struct btrfs_root *root = binode->root;
				189	struct btrfs_trans_handle *trans;
				190	unsigned int fsflags, old_fsflags;
				191	int ret;
				192	u64 old_flags;
				193	unsigned int old_i_flags;
				194	umode_t mode;
				195
				196	if (!inode_owner_or_capable(inode))
				197	return -EPERM;
				198
				199	if (btrfs_root_readonly(root))
				200	return -EROFS;
				201
				202	if (copy_from_user(&fsflags, arg, sizeof(fsflags)))
				203	return -EFAULT;
				204
				205	ret = check_fsflags(fsflags);
				206	if (ret)
				207	return ret;
				208
				209	ret = mnt_want_write_file(file);
				210	if (ret)
				211	return ret;
				212
				213	inode_lock(inode);
				214
				215	old_flags = binode->flags;
				216	old_i_flags = inode->i_flags;
				217	mode = inode->i_mode;
				218
				219	fsflags = btrfs_mask_fsflags_for_type(inode, fsflags);
				220	old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags);
				221	if ((fsflags ^ old_fsflags) & (FS_APPEND_FL \| FS_IMMUTABLE_FL)) {
				222	if (!capable(CAP_LINUX_IMMUTABLE)) {
				223	ret = -EPERM;
				224	goto out_unlock;
				225	}
				226	}
				227
				228	if (fsflags & FS_SYNC_FL)
				229	binode->flags \|= BTRFS_INODE_SYNC;
				230	else
				231	binode->flags &= ~BTRFS_INODE_SYNC;
				232	if (fsflags & FS_IMMUTABLE_FL)
				233	binode->flags \|= BTRFS_INODE_IMMUTABLE;
				234	else
				235	binode->flags &= ~BTRFS_INODE_IMMUTABLE;
				236	if (fsflags & FS_APPEND_FL)
				237	binode->flags \|= BTRFS_INODE_APPEND;
				238	else
				239	binode->flags &= ~BTRFS_INODE_APPEND;
				240	if (fsflags & FS_NODUMP_FL)
				241	binode->flags \|= BTRFS_INODE_NODUMP;
				242	else
				243	binode->flags &= ~BTRFS_INODE_NODUMP;
				244	if (fsflags & FS_NOATIME_FL)
				245	binode->flags \|= BTRFS_INODE_NOATIME;
				246	else
				247	binode->flags &= ~BTRFS_INODE_NOATIME;
				248	if (fsflags & FS_DIRSYNC_FL)
				249	binode->flags \|= BTRFS_INODE_DIRSYNC;
				250	else
				251	binode->flags &= ~BTRFS_INODE_DIRSYNC;
				252	if (fsflags & FS_NOCOW_FL) {
				253	if (S_ISREG(mode)) {
				254	/*
				255	* It's safe to turn csums off here, no extents exist.
				256	* Otherwise we want the flag to reflect the real COW
				257	* status of the file and will not set it.
				258	*/
				259	if (inode->i_size == 0)
				260	binode->flags \|= BTRFS_INODE_NODATACOW
				261	\| BTRFS_INODE_NODATASUM;
				262	} else {
				263	binode->flags \|= BTRFS_INODE_NODATACOW;
				264	}
				265	} else {
				266	/*
				267	* Revert back under same assumptions as above
				268	*/
				269	if (S_ISREG(mode)) {
				270	if (inode->i_size == 0)
				271	binode->flags &= ~(BTRFS_INODE_NODATACOW
				272	\| BTRFS_INODE_NODATASUM);
				273	} else {
				274	binode->flags &= ~BTRFS_INODE_NODATACOW;
				275	}
				276	}
				277
				278	/*
				279	* The COMPRESS flag can only be changed by users, while the NOCOMPRESS
				280	* flag may be changed automatically if compression code won't make
				281	* things smaller.
				282	*/
				283	if (fsflags & FS_NOCOMP_FL) {
				284	binode->flags &= ~BTRFS_INODE_COMPRESS;
				285	binode->flags \|= BTRFS_INODE_NOCOMPRESS;
				286
				287	ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
				288	if (ret && ret != -ENODATA)
				289	goto out_drop;
				290	} else if (fsflags & FS_COMPR_FL) {
				291	const char *comp;
				292
				293	binode->flags \|= BTRFS_INODE_COMPRESS;
				294	binode->flags &= ~BTRFS_INODE_NOCOMPRESS;
				295
				296	comp = btrfs_compress_type2str(fs_info->compress_type);
				297	if (!comp \|\| comp[0] == 0)
				298	comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB);
				299
				300	ret = btrfs_set_prop(inode, "btrfs.compression",
				301	comp, strlen(comp), 0);
				302	if (ret)
				303	goto out_drop;
				304
				305	} else {
				306	ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
				307	if (ret && ret != -ENODATA)
				308	goto out_drop;
				309	binode->flags &= ~(BTRFS_INODE_COMPRESS \| BTRFS_INODE_NOCOMPRESS);
				310	}
				311
				312	trans = btrfs_start_transaction(root, 1);
				313	if (IS_ERR(trans)) {
				314	ret = PTR_ERR(trans);
				315	goto out_drop;
				316	}
				317
				318	btrfs_sync_inode_flags_to_i_flags(inode);
				319	inode_inc_iversion(inode);
				320	inode->i_ctime = current_time(inode);
				321	ret = btrfs_update_inode(trans, root, inode);
				322
				323	btrfs_end_transaction(trans);
				324	out_drop:
				325	if (ret) {
				326	binode->flags = old_flags;
				327	inode->i_flags = old_i_flags;
				328	}
				329
				330	out_unlock:
				331	inode_unlock(inode);
				332	mnt_drop_write_file(file);
				333	return ret;
				334	}
				335
				336	/*
				337	* Translate btrfs internal inode flags to xflags as expected by the
				338	* FS_IOC_FSGETXATT ioctl. Filter only the supported ones, unknown flags are
				339	* silently dropped.
				340	*/
				341	static unsigned int btrfs_inode_flags_to_xflags(unsigned int flags)
				342	{
				343	unsigned int xflags = 0;
				344
				345	if (flags & BTRFS_INODE_APPEND)
				346	xflags \|= FS_XFLAG_APPEND;
				347	if (flags & BTRFS_INODE_IMMUTABLE)
				348	xflags \|= FS_XFLAG_IMMUTABLE;
				349	if (flags & BTRFS_INODE_NOATIME)
				350	xflags \|= FS_XFLAG_NOATIME;
				351	if (flags & BTRFS_INODE_NODUMP)
				352	xflags \|= FS_XFLAG_NODUMP;
				353	if (flags & BTRFS_INODE_SYNC)
				354	xflags \|= FS_XFLAG_SYNC;
				355
				356	return xflags;
				357	}
				358
				359	/* Check if @flags are a supported and valid set of FS_XFLAGS_* flags */
				360	static int check_xflags(unsigned int flags)
				361	{
				362	if (flags & ~(FS_XFLAG_APPEND \| FS_XFLAG_IMMUTABLE \| FS_XFLAG_NOATIME \|
				363	FS_XFLAG_NODUMP \| FS_XFLAG_SYNC))
				364	return -EOPNOTSUPP;
				365	return 0;
				366	}
				367
				368	/*
				369	* Set the xflags from the internal inode flags. The remaining items of fsxattr
				370	* are zeroed.
				371	*/
				372	static int btrfs_ioctl_fsgetxattr(struct file file, void __user arg)
				373	{
				374	struct btrfs_inode *binode = BTRFS_I(file_inode(file));
				375	struct fsxattr fa;
				376
				377	memset(&fa, 0, sizeof(fa));
				378	fa.fsx_xflags = btrfs_inode_flags_to_xflags(binode->flags);
				379
				380	if (copy_to_user(arg, &fa, sizeof(fa)))
				381	return -EFAULT;
				382
				383	return 0;
				384	}
				385
				386	static int btrfs_ioctl_fssetxattr(struct file file, void __user arg)
				387	{
				388	struct inode *inode = file_inode(file);
				389	struct btrfs_inode *binode = BTRFS_I(inode);
				390	struct btrfs_root *root = binode->root;
				391	struct btrfs_trans_handle *trans;
				392	struct fsxattr fa;
				393	unsigned old_flags;
				394	unsigned old_i_flags;
				395	int ret = 0;
				396
				397	if (!inode_owner_or_capable(inode))
				398	return -EPERM;
				399
				400	if (btrfs_root_readonly(root))
				401	return -EROFS;
				402
				403	memset(&fa, 0, sizeof(fa));
				404	if (copy_from_user(&fa, arg, sizeof(fa)))
				405	return -EFAULT;
				406
				407	ret = check_xflags(fa.fsx_xflags);
				408	if (ret)
				409	return ret;
				410
				411	if (fa.fsx_extsize != 0 \|\| fa.fsx_projid != 0 \|\| fa.fsx_cowextsize != 0)
				412	return -EOPNOTSUPP;
				413
				414	ret = mnt_want_write_file(file);
				415	if (ret)
				416	return ret;
				417
				418	inode_lock(inode);
				419
				420	old_flags = binode->flags;
				421	old_i_flags = inode->i_flags;
				422
				423	/* We need the capabilities to change append-only or immutable inode */
				424	if (((old_flags & (BTRFS_INODE_APPEND \| BTRFS_INODE_IMMUTABLE)) \|\|
				425	(fa.fsx_xflags & (FS_XFLAG_APPEND \| FS_XFLAG_IMMUTABLE))) &&
				426	!capable(CAP_LINUX_IMMUTABLE)) {
				427	ret = -EPERM;
				428	goto out_unlock;
				429	}
				430
				431	if (fa.fsx_xflags & FS_XFLAG_SYNC)
				432	binode->flags \|= BTRFS_INODE_SYNC;
				433	else
				434	binode->flags &= ~BTRFS_INODE_SYNC;
				435	if (fa.fsx_xflags & FS_XFLAG_IMMUTABLE)
				436	binode->flags \|= BTRFS_INODE_IMMUTABLE;
				437	else
				438	binode->flags &= ~BTRFS_INODE_IMMUTABLE;
				439	if (fa.fsx_xflags & FS_XFLAG_APPEND)
				440	binode->flags \|= BTRFS_INODE_APPEND;
				441	else
				442	binode->flags &= ~BTRFS_INODE_APPEND;
				443	if (fa.fsx_xflags & FS_XFLAG_NODUMP)
				444	binode->flags \|= BTRFS_INODE_NODUMP;
				445	else
				446	binode->flags &= ~BTRFS_INODE_NODUMP;
				447	if (fa.fsx_xflags & FS_XFLAG_NOATIME)
				448	binode->flags \|= BTRFS_INODE_NOATIME;
				449	else
				450	binode->flags &= ~BTRFS_INODE_NOATIME;
				451
				452	/* 1 item for the inode */
				453	trans = btrfs_start_transaction(root, 1);
				454	if (IS_ERR(trans)) {
				455	ret = PTR_ERR(trans);
				456	goto out_unlock;
				457	}
				458
				459	btrfs_sync_inode_flags_to_i_flags(inode);
				460	inode_inc_iversion(inode);
				461	inode->i_ctime = current_time(inode);
				462	ret = btrfs_update_inode(trans, root, inode);
				463
				464	btrfs_end_transaction(trans);
				465
				466	out_unlock:
				467	if (ret) {
				468	binode->flags = old_flags;
				469	inode->i_flags = old_i_flags;
				470	}
				471
				472	inode_unlock(inode);
				473	mnt_drop_write_file(file);
				474
				475	return ret;
				476	}
				477
				478	static int btrfs_ioctl_getversion(struct file file, int __user arg)
				479	{
				480	struct inode *inode = file_inode(file);
				481
				482	return put_user(inode->i_generation, arg);
				483	}
				484
				485	static noinline int btrfs_ioctl_fitrim(struct file file, void __user arg)
				486	{
				487	struct inode *inode = file_inode(file);
				488	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				489	struct btrfs_device *device;
				490	struct request_queue *q;
				491	struct fstrim_range range;
				492	u64 minlen = ULLONG_MAX;
				493	u64 num_devices = 0;
				494	int ret;
				495
				496	if (!capable(CAP_SYS_ADMIN))
				497	return -EPERM;
				498
				499	/*
				500	* If the fs is mounted with nologreplay, which requires it to be
				501	* mounted in RO mode as well, we can not allow discard on free space
				502	* inside block groups, because log trees refer to extents that are not
				503	* pinned in a block group's free space cache (pinning the extents is
				504	* precisely the first phase of replaying a log tree).
				505	*/
				506	if (btrfs_test_opt(fs_info, NOLOGREPLAY))
				507	return -EROFS;
				508
				509	rcu_read_lock();
				510	list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
				511	dev_list) {
				512	if (!device->bdev)
				513	continue;
				514	q = bdev_get_queue(device->bdev);
				515	if (blk_queue_discard(q)) {
				516	num_devices++;
				517	minlen = min_t(u64, q->limits.discard_granularity,
				518	minlen);
				519	}
				520	}
				521	rcu_read_unlock();
				522
				523	if (!num_devices)
				524	return -EOPNOTSUPP;
				525	if (copy_from_user(&range, arg, sizeof(range)))
				526	return -EFAULT;
				527
				528	/*
				529	* NOTE: Don't truncate the range using super->total_bytes. Bytenr of
				530	* block group is in the logical address space, which can be any
				531	* sectorsize aligned bytenr in the range [0, U64_MAX].
				532	*/
				533	if (range.len < fs_info->sb->s_blocksize)
				534	return -EINVAL;
				535
				536	range.minlen = max(range.minlen, minlen);
				537	ret = btrfs_trim_fs(fs_info, &range);
				538	if (ret < 0)
				539	return ret;
				540
				541	if (copy_to_user(arg, &range, sizeof(range)))
				542	return -EFAULT;
				543
				544	return 0;
				545	}
				546
				547	int btrfs_is_empty_uuid(u8 *uuid)
				548	{
				549	int i;
				550
				551	for (i = 0; i < BTRFS_UUID_SIZE; i++) {
				552	if (uuid[i])
				553	return 0;
				554	}
				555	return 1;
				556	}
				557
				558	static noinline int create_subvol(struct inode *dir,
				559	struct dentry *dentry,
				560	const char *name, int namelen,
				561	u64 *async_transid,
				562	struct btrfs_qgroup_inherit *inherit)
				563	{
				564	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
				565	struct btrfs_trans_handle *trans;
				566	struct btrfs_key key;
				567	struct btrfs_root_item *root_item;
				568	struct btrfs_inode_item *inode_item;
				569	struct extent_buffer *leaf;
				570	struct btrfs_root *root = BTRFS_I(dir)->root;
				571	struct btrfs_root *new_root;
				572	struct btrfs_block_rsv block_rsv;
				573	struct timespec64 cur_time = current_time(dir);
				574	struct inode *inode;
				575	int ret;
				576	int err;
				577	u64 objectid;
				578	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
				579	u64 index = 0;
				580	uuid_le new_uuid;
				581
				582	root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
				583	if (!root_item)
				584	return -ENOMEM;
				585
				586	ret = btrfs_find_free_objectid(fs_info->tree_root, &objectid);
				587	if (ret)
				588	goto fail_free;
				589
				590	/*
				591	* Don't create subvolume whose level is not zero. Or qgroup will be
				592	* screwed up since it assumes subvolume qgroup's level to be 0.
				593	*/
				594	if (btrfs_qgroup_level(objectid)) {
				595	ret = -ENOSPC;
				596	goto fail_free;
				597	}
				598
				599	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
				600	/*
				601	* The same as the snapshot creation, please see the comment
				602	* of create_snapshot().
				603	*/
				604	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, false);
				605	if (ret)
				606	goto fail_free;
				607
				608	trans = btrfs_start_transaction(root, 0);
				609	if (IS_ERR(trans)) {
				610	ret = PTR_ERR(trans);
				611	btrfs_subvolume_release_metadata(fs_info, &block_rsv);
				612	goto fail_free;
				613	}
				614	trans->block_rsv = &block_rsv;
				615	trans->bytes_reserved = block_rsv.size;
				616
				617	ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit);
				618	if (ret)
				619	goto fail;
				620
				621	leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
				622	if (IS_ERR(leaf)) {
				623	ret = PTR_ERR(leaf);
				624	goto fail;
				625	}
				626
				627	btrfs_mark_buffer_dirty(leaf);
				628
				629	inode_item = &root_item->inode;
				630	btrfs_set_stack_inode_generation(inode_item, 1);
				631	btrfs_set_stack_inode_size(inode_item, 3);
				632	btrfs_set_stack_inode_nlink(inode_item, 1);
				633	btrfs_set_stack_inode_nbytes(inode_item,
				634	fs_info->nodesize);
				635	btrfs_set_stack_inode_mode(inode_item, S_IFDIR \| 0755);
				636
				637	btrfs_set_root_flags(root_item, 0);
				638	btrfs_set_root_limit(root_item, 0);
				639	btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT);
				640
				641	btrfs_set_root_bytenr(root_item, leaf->start);
				642	btrfs_set_root_generation(root_item, trans->transid);
				643	btrfs_set_root_level(root_item, 0);
				644	btrfs_set_root_refs(root_item, 1);
				645	btrfs_set_root_used(root_item, leaf->len);
				646	btrfs_set_root_last_snapshot(root_item, 0);
				647
				648	btrfs_set_root_generation_v2(root_item,
				649	btrfs_root_generation(root_item));
				650	uuid_le_gen(&new_uuid);
				651	memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
				652	btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec);
				653	btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec);
				654	root_item->ctime = root_item->otime;
				655	btrfs_set_root_ctransid(root_item, trans->transid);
				656	btrfs_set_root_otransid(root_item, trans->transid);
				657
				658	btrfs_tree_unlock(leaf);
				659	free_extent_buffer(leaf);
				660	leaf = NULL;
				661
				662	btrfs_set_root_dirid(root_item, new_dirid);
				663
				664	key.objectid = objectid;
				665	key.offset = 0;
				666	key.type = BTRFS_ROOT_ITEM_KEY;
				667	ret = btrfs_insert_root(trans, fs_info->tree_root, &key,
				668	root_item);
				669	if (ret)
				670	goto fail;
				671
				672	key.offset = (u64)-1;
				673	new_root = btrfs_read_fs_root_no_name(fs_info, &key);
				674	if (IS_ERR(new_root)) {
				675	ret = PTR_ERR(new_root);
				676	btrfs_abort_transaction(trans, ret);
				677	goto fail;
				678	}
				679
				680	btrfs_record_root_in_trans(trans, new_root);
				681
				682	ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid);
				683	if (ret) {
				684	/* We potentially lose an unused inode item here */
				685	btrfs_abort_transaction(trans, ret);
				686	goto fail;
				687	}
				688
				689	mutex_lock(&new_root->objectid_mutex);
				690	new_root->highest_objectid = new_dirid;
				691	mutex_unlock(&new_root->objectid_mutex);
				692
				693	/*
				694	* insert the directory item
				695	*/
				696	ret = btrfs_set_inode_index(BTRFS_I(dir), &index);
				697	if (ret) {
				698	btrfs_abort_transaction(trans, ret);
				699	goto fail;
				700	}
				701
				702	ret = btrfs_insert_dir_item(trans, root,
				703	name, namelen, BTRFS_I(dir), &key,
				704	BTRFS_FT_DIR, index);
				705	if (ret) {
				706	btrfs_abort_transaction(trans, ret);
				707	goto fail;
				708	}
				709
				710	btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2);
				711	ret = btrfs_update_inode(trans, root, dir);
				712	if (ret) {
				713	btrfs_abort_transaction(trans, ret);
				714	goto fail;
				715	}
				716
				717	ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid,
				718	btrfs_ino(BTRFS_I(dir)), index, name, namelen);
				719	if (ret) {
				720	btrfs_abort_transaction(trans, ret);
				721	goto fail;
				722	}
				723
				724	ret = btrfs_uuid_tree_add(trans, root_item->uuid,
				725	BTRFS_UUID_KEY_SUBVOL, objectid);
				726	if (ret)
				727	btrfs_abort_transaction(trans, ret);
				728
				729	fail:
				730	kfree(root_item);
				731	trans->block_rsv = NULL;
				732	trans->bytes_reserved = 0;
				733	btrfs_subvolume_release_metadata(fs_info, &block_rsv);
				734
				735	if (async_transid) {
				736	*async_transid = trans->transid;
				737	err = btrfs_commit_transaction_async(trans, 1);
				738	if (err)
				739	err = btrfs_commit_transaction(trans);
				740	} else {
				741	err = btrfs_commit_transaction(trans);
				742	}
				743	if (err && !ret)
				744	ret = err;
				745
				746	if (!ret) {
				747	inode = btrfs_lookup_dentry(dir, dentry);
				748	if (IS_ERR(inode))
				749	return PTR_ERR(inode);
				750	d_instantiate(dentry, inode);
				751	}
				752	return ret;
				753
				754	fail_free:
				755	kfree(root_item);
				756	return ret;
				757	}
				758
				759	static int create_snapshot(struct btrfs_root root, struct inode dir,
				760	struct dentry *dentry,
				761	u64 *async_transid, bool readonly,
				762	struct btrfs_qgroup_inherit *inherit)
				763	{
				764	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
				765	struct inode *inode;
				766	struct btrfs_pending_snapshot *pending_snapshot;
				767	struct btrfs_trans_handle *trans;
				768	int ret;
				769	bool snapshot_force_cow = false;
				770
				771	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
				772	return -EINVAL;
				773
				774	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_KERNEL);
				775	if (!pending_snapshot)
				776	return -ENOMEM;
				777
				778	pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
				779	GFP_KERNEL);
				780	pending_snapshot->path = btrfs_alloc_path();
				781	if (!pending_snapshot->root_item \|\| !pending_snapshot->path) {
				782	ret = -ENOMEM;
				783	goto free_pending;
				784	}
				785
				786	/*
				787	* Force new buffered writes to reserve space even when NOCOW is
				788	* possible. This is to avoid later writeback (running dealloc) to
				789	* fallback to COW mode and unexpectedly fail with ENOSPC.
				790	*/
				791	atomic_inc(&root->will_be_snapshotted);
				792	smp_mb__after_atomic();
				793	/* wait for no snapshot writes */
				794	wait_event(root->subv_writers->wait,
				795	percpu_counter_sum(&root->subv_writers->counter) == 0);
				796
				797	ret = btrfs_start_delalloc_snapshot(root);
				798	if (ret)
				799	goto dec_and_free;
				800
				801	/*
				802	* All previous writes have started writeback in NOCOW mode, so now
				803	* we force future writes to fallback to COW mode during snapshot
				804	* creation.
				805	*/
				806	atomic_inc(&root->snapshot_force_cow);
				807	snapshot_force_cow = true;
				808
				809	btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
				810
				811	btrfs_init_block_rsv(&pending_snapshot->block_rsv,
				812	BTRFS_BLOCK_RSV_TEMP);
				813	/*
				814	* 1 - parent dir inode
				815	* 2 - dir entries
				816	* 1 - root item
				817	* 2 - root ref/backref
				818	* 1 - root of snapshot
				819	* 1 - UUID item
				820	*/
				821	ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
				822	&pending_snapshot->block_rsv, 8,
				823	false);
				824	if (ret)
				825	goto dec_and_free;
				826
				827	pending_snapshot->dentry = dentry;
				828	pending_snapshot->root = root;
				829	pending_snapshot->readonly = readonly;
				830	pending_snapshot->dir = dir;
				831	pending_snapshot->inherit = inherit;
				832
				833	trans = btrfs_start_transaction(root, 0);
				834	if (IS_ERR(trans)) {
				835	ret = PTR_ERR(trans);
				836	goto fail;
				837	}
				838
				839	spin_lock(&fs_info->trans_lock);
				840	list_add(&pending_snapshot->list,
				841	&trans->transaction->pending_snapshots);
				842	spin_unlock(&fs_info->trans_lock);
				843	if (async_transid) {
				844	*async_transid = trans->transid;
				845	ret = btrfs_commit_transaction_async(trans, 1);
				846	if (ret)
				847	ret = btrfs_commit_transaction(trans);
				848	} else {
				849	ret = btrfs_commit_transaction(trans);
				850	}
				851	if (ret)
				852	goto fail;
				853
				854	ret = pending_snapshot->error;
				855	if (ret)
				856	goto fail;
				857
				858	ret = btrfs_orphan_cleanup(pending_snapshot->snap);
				859	if (ret)
				860	goto fail;
				861
				862	inode = btrfs_lookup_dentry(d_inode(dentry->d_parent), dentry);
				863	if (IS_ERR(inode)) {
				864	ret = PTR_ERR(inode);
				865	goto fail;
				866	}
				867
				868	d_instantiate(dentry, inode);
				869	ret = 0;
				870	fail:
				871	btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
				872	dec_and_free:
				873	if (snapshot_force_cow)
				874	atomic_dec(&root->snapshot_force_cow);
				875	if (atomic_dec_and_test(&root->will_be_snapshotted))
				876	wake_up_var(&root->will_be_snapshotted);
				877	free_pending:
				878	kfree(pending_snapshot->root_item);
				879	btrfs_free_path(pending_snapshot->path);
				880	kfree(pending_snapshot);
				881
				882	return ret;
				883	}
				884
				885	/* copy of may_delete in fs/namei.c()
				886	* Check whether we can remove a link victim from directory dir, check
				887	* whether the type of victim is right.
				888	* 1. We can't do it if dir is read-only (done in permission())
				889	* 2. We should have write and exec permissions on dir
				890	* 3. We can't remove anything from append-only dir
				891	* 4. We can't do anything with immutable dir (done in permission())
				892	* 5. If the sticky bit on dir is set we should either
				893	* a. be owner of dir, or
				894	* b. be owner of victim, or
				895	* c. have CAP_FOWNER capability
				896	* 6. If the victim is append-only or immutable we can't do anything with
				897	* links pointing to it.
				898	* 7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
				899	* 8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
				900	* 9. We can't remove a root or mountpoint.
				901	* 10. We don't allow removal of NFS sillyrenamed files; it's handled by
				902	* nfs_async_unlink().
				903	*/
				904
				905	static int btrfs_may_delete(struct inode dir, struct dentry victim, int isdir)
				906	{
				907	int error;
				908
				909	if (d_really_is_negative(victim))
				910	return -ENOENT;
				911
				912	BUG_ON(d_inode(victim->d_parent) != dir);
				913	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
				914
				915	error = inode_permission(dir, MAY_WRITE \| MAY_EXEC);
				916	if (error)
				917	return error;
				918	if (IS_APPEND(dir))
				919	return -EPERM;
				920	if (check_sticky(dir, d_inode(victim)) \|\| IS_APPEND(d_inode(victim)) \|\|
				921	IS_IMMUTABLE(d_inode(victim)) \|\| IS_SWAPFILE(d_inode(victim)))
				922	return -EPERM;
				923	if (isdir) {
				924	if (!d_is_dir(victim))
				925	return -ENOTDIR;
				926	if (IS_ROOT(victim))
				927	return -EBUSY;
				928	} else if (d_is_dir(victim))
				929	return -EISDIR;
				930	if (IS_DEADDIR(dir))
				931	return -ENOENT;
				932	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
				933	return -EBUSY;
				934	return 0;
				935	}
				936
				937	/* copy of may_create in fs/namei.c() */
				938	static inline int btrfs_may_create(struct inode dir, struct dentry child)
				939	{
				940	if (d_really_is_positive(child))
				941	return -EEXIST;
				942	if (IS_DEADDIR(dir))
				943	return -ENOENT;
				944	return inode_permission(dir, MAY_WRITE \| MAY_EXEC);
				945	}
				946
				947	/*
				948	* Create a new subvolume below @parent. This is largely modeled after
				949	* sys_mkdirat and vfs_mkdir, but we only do a single component lookup
				950	* inside this filesystem so it's quite a bit simpler.
				951	*/
				952	static noinline int btrfs_mksubvol(const struct path *parent,
				953	const char *name, int namelen,
				954	struct btrfs_root *snap_src,
				955	u64 *async_transid, bool readonly,
				956	struct btrfs_qgroup_inherit *inherit)
				957	{
				958	struct inode *dir = d_inode(parent->dentry);
				959	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
				960	struct dentry *dentry;
				961	int error;
				962
				963	error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
				964	if (error == -EINTR)
				965	return error;
				966
				967	dentry = lookup_one_len(name, parent->dentry, namelen);
				968	error = PTR_ERR(dentry);
				969	if (IS_ERR(dentry))
				970	goto out_unlock;
				971
				972	error = btrfs_may_create(dir, dentry);
				973	if (error)
				974	goto out_dput;
				975
				976	/*
				977	* even if this name doesn't exist, we may get hash collisions.
				978	* check for them now when we can safely fail
				979	*/
				980	error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
				981	dir->i_ino, name,
				982	namelen);
				983	if (error)
				984	goto out_dput;
				985
				986	down_read(&fs_info->subvol_sem);
				987
				988	if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
				989	goto out_up_read;
				990
				991	if (snap_src) {
				992	error = create_snapshot(snap_src, dir, dentry,
				993	async_transid, readonly, inherit);
				994	} else {
				995	error = create_subvol(dir, dentry, name, namelen,
				996	async_transid, inherit);
				997	}
				998	if (!error)
				999	fsnotify_mkdir(dir, dentry);
				1000	out_up_read:
				1001	up_read(&fs_info->subvol_sem);
				1002	out_dput:
				1003	dput(dentry);
				1004	out_unlock:
				1005	inode_unlock(dir);
				1006	return error;
				1007	}
				1008
				1009	/*
				1010	* When we're defragging a range, we don't want to kick it off again
				1011	* if it is really just waiting for delalloc to send it down.
				1012	* If we find a nice big extent or delalloc range for the bytes in the
				1013	* file you want to defrag, we return 0 to let you know to skip this
				1014	* part of the file
				1015	*/
				1016	static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh)
				1017	{
				1018	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
				1019	struct extent_map *em = NULL;
				1020	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
				1021	u64 end;
				1022
				1023	read_lock(&em_tree->lock);
				1024	em = lookup_extent_mapping(em_tree, offset, PAGE_SIZE);
				1025	read_unlock(&em_tree->lock);
				1026
				1027	if (em) {
				1028	end = extent_map_end(em);
				1029	free_extent_map(em);
				1030	if (end - offset > thresh)
				1031	return 0;
				1032	}
				1033	/* if we already have a nice delalloc here, just stop */
				1034	thresh /= 2;
				1035	end = count_range_bits(io_tree, &offset, offset + thresh,
				1036	thresh, EXTENT_DELALLOC, 1);
				1037	if (end >= thresh)
				1038	return 0;
				1039	return 1;
				1040	}
				1041
				1042	/*
				1043	* helper function to walk through a file and find extents
				1044	* newer than a specific transid, and smaller than thresh.
				1045	*
				1046	* This is used by the defragging code to find new and small
				1047	* extents
				1048	*/
				1049	static int find_new_extents(struct btrfs_root *root,
				1050	struct inode *inode, u64 newer_than,
				1051	u64 *off, u32 thresh)
				1052	{
				1053	struct btrfs_path *path;
				1054	struct btrfs_key min_key;
				1055	struct extent_buffer *leaf;
				1056	struct btrfs_file_extent_item *extent;
				1057	int type;
				1058	int ret;
				1059	u64 ino = btrfs_ino(BTRFS_I(inode));
				1060
				1061	path = btrfs_alloc_path();
				1062	if (!path)
				1063	return -ENOMEM;
				1064
				1065	min_key.objectid = ino;
				1066	min_key.type = BTRFS_EXTENT_DATA_KEY;
				1067	min_key.offset = *off;
				1068
				1069	while (1) {
				1070	ret = btrfs_search_forward(root, &min_key, path, newer_than);
				1071	if (ret != 0)
				1072	goto none;
				1073	process_slot:
				1074	if (min_key.objectid != ino)
				1075	goto none;
				1076	if (min_key.type != BTRFS_EXTENT_DATA_KEY)
				1077	goto none;
				1078
				1079	leaf = path->nodes[0];
				1080	extent = btrfs_item_ptr(leaf, path->slots[0],
				1081	struct btrfs_file_extent_item);
				1082
				1083	type = btrfs_file_extent_type(leaf, extent);
				1084	if (type == BTRFS_FILE_EXTENT_REG &&
				1085	btrfs_file_extent_num_bytes(leaf, extent) < thresh &&
				1086	check_defrag_in_cache(inode, min_key.offset, thresh)) {
				1087	*off = min_key.offset;
				1088	btrfs_free_path(path);
				1089	return 0;
				1090	}
				1091
				1092	path->slots[0]++;
				1093	if (path->slots[0] < btrfs_header_nritems(leaf)) {
				1094	btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
				1095	goto process_slot;
				1096	}
				1097
				1098	if (min_key.offset == (u64)-1)
				1099	goto none;
				1100
				1101	min_key.offset++;
				1102	btrfs_release_path(path);
				1103	}
				1104	none:
				1105	btrfs_free_path(path);
				1106	return -ENOENT;
				1107	}
				1108
				1109	static struct extent_map defrag_lookup_extent(struct inode inode, u64 start)
				1110	{
				1111	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
				1112	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
				1113	struct extent_map *em;
				1114	u64 len = PAGE_SIZE;
				1115
				1116	/*
				1117	* hopefully we have this extent in the tree already, try without
				1118	* the full extent lock
				1119	*/
				1120	read_lock(&em_tree->lock);
				1121	em = lookup_extent_mapping(em_tree, start, len);
				1122	read_unlock(&em_tree->lock);
				1123
				1124	if (!em) {
				1125	struct extent_state *cached = NULL;
				1126	u64 end = start + len - 1;
				1127
				1128	/* get the big lock and read metadata off disk */
				1129	lock_extent_bits(io_tree, start, end, &cached);
				1130	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
				1131	unlock_extent_cached(io_tree, start, end, &cached);
				1132
				1133	if (IS_ERR(em))
				1134	return NULL;
				1135	}
				1136
				1137	return em;
				1138	}
				1139
				1140	static bool defrag_check_next_extent(struct inode inode, struct extent_map em)
				1141	{
				1142	struct extent_map *next;
				1143	bool ret = true;
				1144
				1145	/* this is the last extent */
				1146	if (em->start + em->len >= i_size_read(inode))
				1147	return false;
				1148
				1149	next = defrag_lookup_extent(inode, em->start + em->len);
				1150	if (!next \|\| next->block_start >= EXTENT_MAP_LAST_BYTE)
				1151	ret = false;
				1152	else if ((em->block_start + em->block_len == next->block_start) &&
				1153	(em->block_len > SZ_128K && next->block_len > SZ_128K))
				1154	ret = false;
				1155
				1156	free_extent_map(next);
				1157	return ret;
				1158	}
				1159
				1160	static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
				1161	u64 last_len, u64 skip, u64 *defrag_end,
				1162	int compress)
				1163	{
				1164	struct extent_map *em;
				1165	int ret = 1;
				1166	bool next_mergeable = true;
				1167	bool prev_mergeable = true;
				1168
				1169	/*
				1170	* make sure that once we start defragging an extent, we keep on
				1171	* defragging it
				1172	*/
				1173	if (start < *defrag_end)
				1174	return 1;
				1175
				1176	*skip = 0;
				1177
				1178	em = defrag_lookup_extent(inode, start);
				1179	if (!em)
				1180	return 0;
				1181
				1182	/* this will cover holes, and inline extents */
				1183	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
				1184	ret = 0;
				1185	goto out;
				1186	}
				1187
				1188	if (!*defrag_end)
				1189	prev_mergeable = false;
				1190
				1191	next_mergeable = defrag_check_next_extent(inode, em);
				1192	/*
				1193	* we hit a real extent, if it is big or the next extent is not a
				1194	* real extent, don't bother defragging it
				1195	*/
				1196	if (!compress && (last_len == 0 \|\| last_len >= thresh) &&
				1197	(em->len >= thresh \|\| (!next_mergeable && !prev_mergeable)))
				1198	ret = 0;
				1199	out:
				1200	/*
				1201	* last_len ends up being a counter of how many bytes we've defragged.
				1202	* every time we choose not to defrag an extent, we reset *last_len
				1203	* so that the next tiny extent will force a defrag.
				1204	*
				1205	* The end result of this is that tiny extents before a single big
				1206	* extent will force at least part of that big extent to be defragged.
				1207	*/
				1208	if (ret) {
				1209	*defrag_end = extent_map_end(em);
				1210	} else {
				1211	*last_len = 0;
				1212	*skip = extent_map_end(em);
				1213	*defrag_end = 0;
				1214	}
				1215
				1216	free_extent_map(em);
				1217	return ret;
				1218	}
				1219
				1220	/*
				1221	* it doesn't do much good to defrag one or two pages
				1222	* at a time. This pulls in a nice chunk of pages
				1223	* to COW and defrag.
				1224	*
				1225	* It also makes sure the delalloc code has enough
				1226	* dirty data to avoid making new small extents as part
				1227	* of the defrag
				1228	*
				1229	* It's a good idea to start RA on this range
				1230	* before calling this.
				1231	*/
				1232	static int cluster_pages_for_defrag(struct inode *inode,
				1233	struct page **pages,
				1234	unsigned long start_index,
				1235	unsigned long num_pages)
				1236	{
				1237	unsigned long file_end;
				1238	u64 isize = i_size_read(inode);
				1239	u64 page_start;
				1240	u64 page_end;
				1241	u64 page_cnt;
				1242	int ret;
				1243	int i;
				1244	int i_done;
				1245	struct btrfs_ordered_extent *ordered;
				1246	struct extent_state *cached_state = NULL;
				1247	struct extent_io_tree *tree;
				1248	struct extent_changeset *data_reserved = NULL;
				1249	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
				1250
				1251	file_end = (isize - 1) >> PAGE_SHIFT;
				1252	if (!isize \|\| start_index > file_end)
				1253	return 0;
				1254
				1255	page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
				1256
				1257	ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
				1258	start_index << PAGE_SHIFT,
				1259	page_cnt << PAGE_SHIFT);
				1260	if (ret)
				1261	return ret;
				1262	i_done = 0;
				1263	tree = &BTRFS_I(inode)->io_tree;
				1264
				1265	/* step one, lock all the pages */
				1266	for (i = 0; i < page_cnt; i++) {
				1267	struct page *page;
				1268	again:
				1269	page = find_or_create_page(inode->i_mapping,
				1270	start_index + i, mask);
				1271	if (!page)
				1272	break;
				1273
				1274	page_start = page_offset(page);
				1275	page_end = page_start + PAGE_SIZE - 1;
				1276	while (1) {
				1277	lock_extent_bits(tree, page_start, page_end,
				1278	&cached_state);
				1279	ordered = btrfs_lookup_ordered_extent(inode,
				1280	page_start);
				1281	unlock_extent_cached(tree, page_start, page_end,
				1282	&cached_state);
				1283	if (!ordered)
				1284	break;
				1285
				1286	unlock_page(page);
				1287	btrfs_start_ordered_extent(inode, ordered, 1);
				1288	btrfs_put_ordered_extent(ordered);
				1289	lock_page(page);
				1290	/*
				1291	* we unlocked the page above, so we need check if
				1292	* it was released or not.
				1293	*/
				1294	if (page->mapping != inode->i_mapping) {
				1295	unlock_page(page);
				1296	put_page(page);
				1297	goto again;
				1298	}
				1299	}
				1300
				1301	if (!PageUptodate(page)) {
				1302	btrfs_readpage(NULL, page);
				1303	lock_page(page);
				1304	if (!PageUptodate(page)) {
				1305	unlock_page(page);
				1306	put_page(page);
				1307	ret = -EIO;
				1308	break;
				1309	}
				1310	}
				1311
				1312	if (page->mapping != inode->i_mapping) {
				1313	unlock_page(page);
				1314	put_page(page);
				1315	goto again;
				1316	}
				1317
				1318	pages[i] = page;
				1319	i_done++;
				1320	}
				1321	if (!i_done \|\| ret)
				1322	goto out;
				1323
				1324	if (!(inode->i_sb->s_flags & SB_ACTIVE))
				1325	goto out;
				1326
				1327	/*
				1328	* so now we have a nice long stream of locked
				1329	* and up to date pages, lets wait on them
				1330	*/
				1331	for (i = 0; i < i_done; i++)
				1332	wait_on_page_writeback(pages[i]);
				1333
				1334	page_start = page_offset(pages[0]);
				1335	page_end = page_offset(pages[i_done - 1]) + PAGE_SIZE;
				1336
				1337	lock_extent_bits(&BTRFS_I(inode)->io_tree,
				1338	page_start, page_end - 1, &cached_state);
				1339	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
				1340	page_end - 1, EXTENT_DIRTY \| EXTENT_DELALLOC \|
				1341	EXTENT_DO_ACCOUNTING \| EXTENT_DEFRAG, 0, 0,
				1342	&cached_state);
				1343
				1344	if (i_done != page_cnt) {
				1345	spin_lock(&BTRFS_I(inode)->lock);
				1346	btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
				1347	spin_unlock(&BTRFS_I(inode)->lock);
				1348	btrfs_delalloc_release_space(inode, data_reserved,
				1349	start_index << PAGE_SHIFT,
				1350	(page_cnt - i_done) << PAGE_SHIFT, true);
				1351	}
				1352
				1353
				1354	set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
				1355	&cached_state);
				1356
				1357	unlock_extent_cached(&BTRFS_I(inode)->io_tree,
				1358	page_start, page_end - 1, &cached_state);
				1359
				1360	for (i = 0; i < i_done; i++) {
				1361	clear_page_dirty_for_io(pages[i]);
				1362	ClearPageChecked(pages[i]);
				1363	set_page_extent_mapped(pages[i]);
				1364	set_page_dirty(pages[i]);
				1365	unlock_page(pages[i]);
				1366	put_page(pages[i]);
				1367	}
				1368	btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
				1369	extent_changeset_free(data_reserved);
				1370	return i_done;
				1371	out:
				1372	for (i = 0; i < i_done; i++) {
				1373	unlock_page(pages[i]);
				1374	put_page(pages[i]);
				1375	}
				1376	btrfs_delalloc_release_space(inode, data_reserved,
				1377	start_index << PAGE_SHIFT,
				1378	page_cnt << PAGE_SHIFT, true);
				1379	btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
				1380	extent_changeset_free(data_reserved);
				1381	return ret;
				1382
				1383	}
				1384
				1385	int btrfs_defrag_file(struct inode inode, struct file file,
				1386	struct btrfs_ioctl_defrag_range_args *range,
				1387	u64 newer_than, unsigned long max_to_defrag)
				1388	{
				1389	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				1390	struct btrfs_root *root = BTRFS_I(inode)->root;
				1391	struct file_ra_state *ra = NULL;
				1392	unsigned long last_index;
				1393	u64 isize = i_size_read(inode);
				1394	u64 last_len = 0;
				1395	u64 skip = 0;
				1396	u64 defrag_end = 0;
				1397	u64 newer_off = range->start;
				1398	unsigned long i;
				1399	unsigned long ra_index = 0;
				1400	int ret;
				1401	int defrag_count = 0;
				1402	int compress_type = BTRFS_COMPRESS_ZLIB;
				1403	u32 extent_thresh = range->extent_thresh;
				1404	unsigned long max_cluster = SZ_256K >> PAGE_SHIFT;
				1405	unsigned long cluster = max_cluster;
				1406	u64 new_align = ~((u64)SZ_128K - 1);
				1407	struct page **pages = NULL;
				1408	bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
				1409
				1410	if (isize == 0)
				1411	return 0;
				1412
				1413	if (range->start >= isize)
				1414	return -EINVAL;
				1415
				1416	if (do_compress) {
				1417	if (range->compress_type > BTRFS_COMPRESS_TYPES)
				1418	return -EINVAL;
				1419	if (range->compress_type)
				1420	compress_type = range->compress_type;
				1421	}
				1422
				1423	if (extent_thresh == 0)
				1424	extent_thresh = SZ_256K;
				1425
				1426	/*
				1427	* If we were not given a file, allocate a readahead context. As
				1428	* readahead is just an optimization, defrag will work without it so
				1429	* we don't error out.
				1430	*/
				1431	if (!file) {
				1432	ra = kzalloc(sizeof(*ra), GFP_KERNEL);
				1433	if (ra)
				1434	file_ra_state_init(ra, inode->i_mapping);
				1435	} else {
				1436	ra = &file->f_ra;
				1437	}
				1438
				1439	pages = kmalloc_array(max_cluster, sizeof(struct page *), GFP_KERNEL);
				1440	if (!pages) {
				1441	ret = -ENOMEM;
				1442	goto out_ra;
				1443	}
				1444
				1445	/* find the last page to defrag */
				1446	if (range->start + range->len > range->start) {
				1447	last_index = min_t(u64, isize - 1,
				1448	range->start + range->len - 1) >> PAGE_SHIFT;
				1449	} else {
				1450	last_index = (isize - 1) >> PAGE_SHIFT;
				1451	}
				1452
				1453	if (newer_than) {
				1454	ret = find_new_extents(root, inode, newer_than,
				1455	&newer_off, SZ_64K);
				1456	if (!ret) {
				1457	range->start = newer_off;
				1458	/*
				1459	* we always align our defrag to help keep
				1460	* the extents in the file evenly spaced
				1461	*/
				1462	i = (newer_off & new_align) >> PAGE_SHIFT;
				1463	} else
				1464	goto out_ra;
				1465	} else {
				1466	i = range->start >> PAGE_SHIFT;
				1467	}
				1468	if (!max_to_defrag)
				1469	max_to_defrag = last_index - i + 1;
				1470
				1471	/*
				1472	* make writeback starts from i, so the defrag range can be
				1473	* written sequentially.
				1474	*/
				1475	if (i < inode->i_mapping->writeback_index)
				1476	inode->i_mapping->writeback_index = i;
				1477
				1478	while (i <= last_index && defrag_count < max_to_defrag &&
				1479	(i < DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) {
				1480	/*
				1481	* make sure we stop running if someone unmounts
				1482	* the FS
				1483	*/
				1484	if (!(inode->i_sb->s_flags & SB_ACTIVE))
				1485	break;
				1486
				1487	if (btrfs_defrag_cancelled(fs_info)) {
				1488	btrfs_debug(fs_info, "defrag_file cancelled");
				1489	ret = -EAGAIN;
				1490	break;
				1491	}
				1492
				1493	if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT,
				1494	extent_thresh, &last_len, &skip,
				1495	&defrag_end, do_compress)){
				1496	unsigned long next;
				1497	/*
				1498	* the should_defrag function tells us how much to skip
				1499	* bump our counter by the suggested amount
				1500	*/
				1501	next = DIV_ROUND_UP(skip, PAGE_SIZE);
				1502	i = max(i + 1, next);
				1503	continue;
				1504	}
				1505
				1506	if (!newer_than) {
				1507	cluster = (PAGE_ALIGN(defrag_end) >>
				1508	PAGE_SHIFT) - i;
				1509	cluster = min(cluster, max_cluster);
				1510	} else {
				1511	cluster = max_cluster;
				1512	}
				1513
				1514	if (i + cluster > ra_index) {
				1515	ra_index = max(i, ra_index);
				1516	if (ra)
				1517	page_cache_sync_readahead(inode->i_mapping, ra,
				1518	file, ra_index, cluster);
				1519	ra_index += cluster;
				1520	}
				1521
				1522	inode_lock(inode);
				1523	if (do_compress)
				1524	BTRFS_I(inode)->defrag_compress = compress_type;
				1525	ret = cluster_pages_for_defrag(inode, pages, i, cluster);
				1526	if (ret < 0) {
				1527	inode_unlock(inode);
				1528	goto out_ra;
				1529	}
				1530
				1531	defrag_count += ret;
				1532	balance_dirty_pages_ratelimited(inode->i_mapping);
				1533	inode_unlock(inode);
				1534
				1535	if (newer_than) {
				1536	if (newer_off == (u64)-1)
				1537	break;
				1538
				1539	if (ret > 0)
				1540	i += ret;
				1541
				1542	newer_off = max(newer_off + 1,
				1543	(u64)i << PAGE_SHIFT);
				1544
				1545	ret = find_new_extents(root, inode, newer_than,
				1546	&newer_off, SZ_64K);
				1547	if (!ret) {
				1548	range->start = newer_off;
				1549	i = (newer_off & new_align) >> PAGE_SHIFT;
				1550	} else {
				1551	break;
				1552	}
				1553	} else {
				1554	if (ret > 0) {
				1555	i += ret;
				1556	last_len += ret << PAGE_SHIFT;
				1557	} else {
				1558	i++;
				1559	last_len = 0;
				1560	}
				1561	}
				1562	}
				1563
				1564	if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
				1565	filemap_flush(inode->i_mapping);
				1566	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
				1567	&BTRFS_I(inode)->runtime_flags))
				1568	filemap_flush(inode->i_mapping);
				1569	}
				1570
				1571	if (range->compress_type == BTRFS_COMPRESS_LZO) {
				1572	btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
				1573	} else if (range->compress_type == BTRFS_COMPRESS_ZSTD) {
				1574	btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
				1575	}
				1576
				1577	ret = defrag_count;
				1578
				1579	out_ra:
				1580	if (do_compress) {
				1581	inode_lock(inode);
				1582	BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
				1583	inode_unlock(inode);
				1584	}
				1585	if (!file)
				1586	kfree(ra);
				1587	kfree(pages);
				1588	return ret;
				1589	}
				1590
				1591	static noinline int btrfs_ioctl_resize(struct file *file,
				1592	void __user *arg)
				1593	{
				1594	struct inode *inode = file_inode(file);
				1595	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				1596	u64 new_size;
				1597	u64 old_size;
				1598	u64 devid = 1;
				1599	struct btrfs_root *root = BTRFS_I(inode)->root;
				1600	struct btrfs_ioctl_vol_args *vol_args;
				1601	struct btrfs_trans_handle *trans;
				1602	struct btrfs_device *device = NULL;
				1603	char *sizestr;
				1604	char *retptr;
				1605	char *devstr = NULL;
				1606	int ret = 0;
				1607	int mod = 0;
				1608
				1609	if (!capable(CAP_SYS_ADMIN))
				1610	return -EPERM;
				1611
				1612	ret = mnt_want_write_file(file);
				1613	if (ret)
				1614	return ret;
				1615
				1616	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
				1617	mnt_drop_write_file(file);
				1618	return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
				1619	}
				1620
				1621	vol_args = memdup_user(arg, sizeof(*vol_args));
				1622	if (IS_ERR(vol_args)) {
				1623	ret = PTR_ERR(vol_args);
				1624	goto out;
				1625	}
				1626
				1627	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
				1628
				1629	sizestr = vol_args->name;
				1630	devstr = strchr(sizestr, ':');
				1631	if (devstr) {
				1632	sizestr = devstr + 1;
				1633	*devstr = '\0';
				1634	devstr = vol_args->name;
				1635	ret = kstrtoull(devstr, 10, &devid);
				1636	if (ret)
				1637	goto out_free;
				1638	if (!devid) {
				1639	ret = -EINVAL;
				1640	goto out_free;
				1641	}
				1642	btrfs_info(fs_info, "resizing devid %llu", devid);
				1643	}
				1644
				1645	device = btrfs_find_device(fs_info, devid, NULL, NULL);
				1646	if (!device) {
				1647	btrfs_info(fs_info, "resizer unable to find device %llu",
				1648	devid);
				1649	ret = -ENODEV;
				1650	goto out_free;
				1651	}
				1652
				1653	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
				1654	btrfs_info(fs_info,
				1655	"resizer unable to apply on readonly device %llu",
				1656	devid);
				1657	ret = -EPERM;
				1658	goto out_free;
				1659	}
				1660
				1661	if (!strcmp(sizestr, "max"))
				1662	new_size = device->bdev->bd_inode->i_size;
				1663	else {
				1664	if (sizestr[0] == '-') {
				1665	mod = -1;
				1666	sizestr++;
				1667	} else if (sizestr[0] == '+') {
				1668	mod = 1;
				1669	sizestr++;
				1670	}
				1671	new_size = memparse(sizestr, &retptr);
				1672	if (*retptr != '\0' \|\| new_size == 0) {
				1673	ret = -EINVAL;
				1674	goto out_free;
				1675	}
				1676	}
				1677
				1678	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
				1679	ret = -EPERM;
				1680	goto out_free;
				1681	}
				1682
				1683	old_size = btrfs_device_get_total_bytes(device);
				1684
				1685	if (mod < 0) {
				1686	if (new_size > old_size) {
				1687	ret = -EINVAL;
				1688	goto out_free;
				1689	}
				1690	new_size = old_size - new_size;
				1691	} else if (mod > 0) {
				1692	if (new_size > ULLONG_MAX - old_size) {
				1693	ret = -ERANGE;
				1694	goto out_free;
				1695	}
				1696	new_size = old_size + new_size;
				1697	}
				1698
				1699	if (new_size < SZ_256M) {
				1700	ret = -EINVAL;
				1701	goto out_free;
				1702	}
				1703	if (new_size > device->bdev->bd_inode->i_size) {
				1704	ret = -EFBIG;
				1705	goto out_free;
				1706	}
				1707
				1708	new_size = round_down(new_size, fs_info->sectorsize);
				1709
				1710	btrfs_info_in_rcu(fs_info, "new size for %s is %llu",
				1711	rcu_str_deref(device->name), new_size);
				1712
				1713	if (new_size > old_size) {
				1714	trans = btrfs_start_transaction(root, 0);
				1715	if (IS_ERR(trans)) {
				1716	ret = PTR_ERR(trans);
				1717	goto out_free;
				1718	}
				1719	ret = btrfs_grow_device(trans, device, new_size);
				1720	btrfs_commit_transaction(trans);
				1721	} else if (new_size < old_size) {
				1722	ret = btrfs_shrink_device(device, new_size);
				1723	} /* equal, nothing need to do */
				1724
				1725	out_free:
				1726	kfree(vol_args);
				1727	out:
				1728	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
				1729	mnt_drop_write_file(file);
				1730	return ret;
				1731	}
				1732
				1733	static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
				1734	const char *name, unsigned long fd, int subvol,
				1735	u64 *transid, bool readonly,
				1736	struct btrfs_qgroup_inherit *inherit)
				1737	{
				1738	int namelen;
				1739	int ret = 0;
				1740
				1741	if (!S_ISDIR(file_inode(file)->i_mode))
				1742	return -ENOTDIR;
				1743
				1744	ret = mnt_want_write_file(file);
				1745	if (ret)
				1746	goto out;
				1747
				1748	namelen = strlen(name);
				1749	if (strchr(name, '/')) {
				1750	ret = -EINVAL;
				1751	goto out_drop_write;
				1752	}
				1753
				1754	if (name[0] == '.' &&
				1755	(namelen == 1 \|\| (name[1] == '.' && namelen == 2))) {
				1756	ret = -EEXIST;
				1757	goto out_drop_write;
				1758	}
				1759
				1760	if (subvol) {
				1761	ret = btrfs_mksubvol(&file->f_path, name, namelen,
				1762	NULL, transid, readonly, inherit);
				1763	} else {
				1764	struct fd src = fdget(fd);
				1765	struct inode *src_inode;
				1766	if (!src.file) {
				1767	ret = -EINVAL;
				1768	goto out_drop_write;
				1769	}
				1770
				1771	src_inode = file_inode(src.file);
				1772	if (src_inode->i_sb != file_inode(file)->i_sb) {
				1773	btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
				1774	"Snapshot src from another FS");
				1775	ret = -EXDEV;
				1776	} else if (!inode_owner_or_capable(src_inode)) {
				1777	/*
				1778	* Subvolume creation is not restricted, but snapshots
				1779	* are limited to own subvolumes only
				1780	*/
				1781	ret = -EPERM;
				1782	} else {
				1783	ret = btrfs_mksubvol(&file->f_path, name, namelen,
				1784	BTRFS_I(src_inode)->root,
				1785	transid, readonly, inherit);
				1786	}
				1787	fdput(src);
				1788	}
				1789	out_drop_write:
				1790	mnt_drop_write_file(file);
				1791	out:
				1792	return ret;
				1793	}
				1794
				1795	static noinline int btrfs_ioctl_snap_create(struct file *file,
				1796	void __user *arg, int subvol)
				1797	{
				1798	struct btrfs_ioctl_vol_args *vol_args;
				1799	int ret;
				1800
				1801	if (!S_ISDIR(file_inode(file)->i_mode))
				1802	return -ENOTDIR;
				1803
				1804	vol_args = memdup_user(arg, sizeof(*vol_args));
				1805	if (IS_ERR(vol_args))
				1806	return PTR_ERR(vol_args);
				1807	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
				1808
				1809	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
				1810	vol_args->fd, subvol,
				1811	NULL, false, NULL);
				1812
				1813	kfree(vol_args);
				1814	return ret;
				1815	}
				1816
				1817	static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
				1818	void __user *arg, int subvol)
				1819	{
				1820	struct btrfs_ioctl_vol_args_v2 *vol_args;
				1821	int ret;
				1822	u64 transid = 0;
				1823	u64 *ptr = NULL;
				1824	bool readonly = false;
				1825	struct btrfs_qgroup_inherit *inherit = NULL;
				1826
				1827	if (!S_ISDIR(file_inode(file)->i_mode))
				1828	return -ENOTDIR;
				1829
				1830	vol_args = memdup_user(arg, sizeof(*vol_args));
				1831	if (IS_ERR(vol_args))
				1832	return PTR_ERR(vol_args);
				1833	vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
				1834
				1835	if (vol_args->flags &
				1836	~(BTRFS_SUBVOL_CREATE_ASYNC \| BTRFS_SUBVOL_RDONLY \|
				1837	BTRFS_SUBVOL_QGROUP_INHERIT)) {
				1838	ret = -EOPNOTSUPP;
				1839	goto free_args;
				1840	}
				1841
				1842	if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
				1843	ptr = &transid;
				1844	if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
				1845	readonly = true;
				1846	if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
				1847	if (vol_args->size > PAGE_SIZE) {
				1848	ret = -EINVAL;
				1849	goto free_args;
				1850	}
				1851	inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
				1852	if (IS_ERR(inherit)) {
				1853	ret = PTR_ERR(inherit);
				1854	goto free_args;
				1855	}
				1856	}
				1857
				1858	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
				1859	vol_args->fd, subvol, ptr,
				1860	readonly, inherit);
				1861	if (ret)
				1862	goto free_inherit;
				1863
				1864	if (ptr && copy_to_user(arg +
				1865	offsetof(struct btrfs_ioctl_vol_args_v2,
				1866	transid),
				1867	ptr, sizeof(*ptr)))
				1868	ret = -EFAULT;
				1869
				1870	free_inherit:
				1871	kfree(inherit);
				1872	free_args:
				1873	kfree(vol_args);
				1874	return ret;
				1875	}
				1876
				1877	static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
				1878	void __user *arg)
				1879	{
				1880	struct inode *inode = file_inode(file);
				1881	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				1882	struct btrfs_root *root = BTRFS_I(inode)->root;
				1883	int ret = 0;
				1884	u64 flags = 0;
				1885
				1886	if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID)
				1887	return -EINVAL;
				1888
				1889	down_read(&fs_info->subvol_sem);
				1890	if (btrfs_root_readonly(root))
				1891	flags \|= BTRFS_SUBVOL_RDONLY;
				1892	up_read(&fs_info->subvol_sem);
				1893
				1894	if (copy_to_user(arg, &flags, sizeof(flags)))
				1895	ret = -EFAULT;
				1896
				1897	return ret;
				1898	}
				1899
				1900	static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
				1901	void __user *arg)
				1902	{
				1903	struct inode *inode = file_inode(file);
				1904	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				1905	struct btrfs_root *root = BTRFS_I(inode)->root;
				1906	struct btrfs_trans_handle *trans;
				1907	u64 root_flags;
				1908	u64 flags;
				1909	int ret = 0;
				1910
				1911	if (!inode_owner_or_capable(inode))
				1912	return -EPERM;
				1913
				1914	ret = mnt_want_write_file(file);
				1915	if (ret)
				1916	goto out;
				1917
				1918	if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
				1919	ret = -EINVAL;
				1920	goto out_drop_write;
				1921	}
				1922
				1923	if (copy_from_user(&flags, arg, sizeof(flags))) {
				1924	ret = -EFAULT;
				1925	goto out_drop_write;
				1926	}
				1927
				1928	if (flags & BTRFS_SUBVOL_CREATE_ASYNC) {
				1929	ret = -EINVAL;
				1930	goto out_drop_write;
				1931	}
				1932
				1933	if (flags & ~BTRFS_SUBVOL_RDONLY) {
				1934	ret = -EOPNOTSUPP;
				1935	goto out_drop_write;
				1936	}
				1937
				1938	down_write(&fs_info->subvol_sem);
				1939
				1940	/* nothing to do */
				1941	if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
				1942	goto out_drop_sem;
				1943
				1944	root_flags = btrfs_root_flags(&root->root_item);
				1945	if (flags & BTRFS_SUBVOL_RDONLY) {
				1946	btrfs_set_root_flags(&root->root_item,
				1947	root_flags \| BTRFS_ROOT_SUBVOL_RDONLY);
				1948	} else {
				1949	/*
				1950	* Block RO -> RW transition if this subvolume is involved in
				1951	* send
				1952	*/
				1953	spin_lock(&root->root_item_lock);
				1954	if (root->send_in_progress == 0) {
				1955	btrfs_set_root_flags(&root->root_item,
				1956	root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
				1957	spin_unlock(&root->root_item_lock);
				1958	} else {
				1959	spin_unlock(&root->root_item_lock);
				1960	btrfs_warn(fs_info,
				1961	"Attempt to set subvolume %llu read-write during send",
				1962	root->root_key.objectid);
				1963	ret = -EPERM;
				1964	goto out_drop_sem;
				1965	}
				1966	}
				1967
				1968	trans = btrfs_start_transaction(root, 1);
				1969	if (IS_ERR(trans)) {
				1970	ret = PTR_ERR(trans);
				1971	goto out_reset;
				1972	}
				1973
				1974	ret = btrfs_update_root(trans, fs_info->tree_root,
				1975	&root->root_key, &root->root_item);
				1976	if (ret < 0) {
				1977	btrfs_end_transaction(trans);
				1978	goto out_reset;
				1979	}
				1980
				1981	ret = btrfs_commit_transaction(trans);
				1982
				1983	out_reset:
				1984	if (ret)
				1985	btrfs_set_root_flags(&root->root_item, root_flags);
				1986	out_drop_sem:
				1987	up_write(&fs_info->subvol_sem);
				1988	out_drop_write:
				1989	mnt_drop_write_file(file);
				1990	out:
				1991	return ret;
				1992	}
				1993
				1994	static noinline int key_in_sk(struct btrfs_key *key,
				1995	struct btrfs_ioctl_search_key *sk)
				1996	{
				1997	struct btrfs_key test;
				1998	int ret;
				1999
				2000	test.objectid = sk->min_objectid;
				2001	test.type = sk->min_type;
				2002	test.offset = sk->min_offset;
				2003
				2004	ret = btrfs_comp_cpu_keys(key, &test);
				2005	if (ret < 0)
				2006	return 0;
				2007
				2008	test.objectid = sk->max_objectid;
				2009	test.type = sk->max_type;
				2010	test.offset = sk->max_offset;
				2011
				2012	ret = btrfs_comp_cpu_keys(key, &test);
				2013	if (ret > 0)
				2014	return 0;
				2015	return 1;
				2016	}
				2017
				2018	static noinline int copy_to_sk(struct btrfs_path *path,
				2019	struct btrfs_key *key,
				2020	struct btrfs_ioctl_search_key *sk,
				2021	size_t *buf_size,
				2022	char __user *ubuf,
				2023	unsigned long *sk_offset,
				2024	int *num_found)
				2025	{
				2026	u64 found_transid;
				2027	struct extent_buffer *leaf;
				2028	struct btrfs_ioctl_search_header sh;
				2029	struct btrfs_key test;
				2030	unsigned long item_off;
				2031	unsigned long item_len;
				2032	int nritems;
				2033	int i;
				2034	int slot;
				2035	int ret = 0;
				2036
				2037	leaf = path->nodes[0];
				2038	slot = path->slots[0];
				2039	nritems = btrfs_header_nritems(leaf);
				2040
				2041	if (btrfs_header_generation(leaf) > sk->max_transid) {
				2042	i = nritems;
				2043	goto advance_key;
				2044	}
				2045	found_transid = btrfs_header_generation(leaf);
				2046
				2047	for (i = slot; i < nritems; i++) {
				2048	item_off = btrfs_item_ptr_offset(leaf, i);
				2049	item_len = btrfs_item_size_nr(leaf, i);
				2050
				2051	btrfs_item_key_to_cpu(leaf, key, i);
				2052	if (!key_in_sk(key, sk))
				2053	continue;
				2054
				2055	if (sizeof(sh) + item_len > *buf_size) {
				2056	if (*num_found) {
				2057	ret = 1;
				2058	goto out;
				2059	}
				2060
				2061	/*
				2062	* return one empty item back for v1, which does not
				2063	* handle -EOVERFLOW
				2064	*/
				2065
				2066	*buf_size = sizeof(sh) + item_len;
				2067	item_len = 0;
				2068	ret = -EOVERFLOW;
				2069	}
				2070
				2071	if (sizeof(sh) + item_len + sk_offset > buf_size) {
				2072	ret = 1;
				2073	goto out;
				2074	}
				2075
				2076	sh.objectid = key->objectid;
				2077	sh.offset = key->offset;
				2078	sh.type = key->type;
				2079	sh.len = item_len;
				2080	sh.transid = found_transid;
				2081
				2082	/* copy search result header */
				2083	if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) {
				2084	ret = -EFAULT;
				2085	goto out;
				2086	}
				2087
				2088	*sk_offset += sizeof(sh);
				2089
				2090	if (item_len) {
				2091	char __user up = ubuf + sk_offset;
				2092	/* copy the item */
				2093	if (read_extent_buffer_to_user(leaf, up,
				2094	item_off, item_len)) {
				2095	ret = -EFAULT;
				2096	goto out;
				2097	}
				2098
				2099	*sk_offset += item_len;
				2100	}
				2101	(*num_found)++;
				2102
				2103	if (ret) /* -EOVERFLOW from above */
				2104	goto out;
				2105
				2106	if (*num_found >= sk->nr_items) {
				2107	ret = 1;
				2108	goto out;
				2109	}
				2110	}
				2111	advance_key:
				2112	ret = 0;
				2113	test.objectid = sk->max_objectid;
				2114	test.type = sk->max_type;
				2115	test.offset = sk->max_offset;
				2116	if (btrfs_comp_cpu_keys(key, &test) >= 0)
				2117	ret = 1;
				2118	else if (key->offset < (u64)-1)
				2119	key->offset++;
				2120	else if (key->type < (u8)-1) {
				2121	key->offset = 0;
				2122	key->type++;
				2123	} else if (key->objectid < (u64)-1) {
				2124	key->offset = 0;
				2125	key->type = 0;
				2126	key->objectid++;
				2127	} else
				2128	ret = 1;
				2129	out:
				2130	/*
				2131	* 0: all items from this leaf copied, continue with next
				2132	* 1: * more items can be copied, but unused buffer is too small
				2133	* * all items were found
				2134	* Either way, it will stops the loop which iterates to the next
				2135	* leaf
				2136	* -EOVERFLOW: item was to large for buffer
				2137	* -EFAULT: could not copy extent buffer back to userspace
				2138	*/
				2139	return ret;
				2140	}
				2141
				2142	static noinline int search_ioctl(struct inode *inode,
				2143	struct btrfs_ioctl_search_key *sk,
				2144	size_t *buf_size,
				2145	char __user *ubuf)
				2146	{
				2147	struct btrfs_fs_info *info = btrfs_sb(inode->i_sb);
				2148	struct btrfs_root *root;
				2149	struct btrfs_key key;
				2150	struct btrfs_path *path;
				2151	int ret;
				2152	int num_found = 0;
				2153	unsigned long sk_offset = 0;
				2154
				2155	if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) {
				2156	*buf_size = sizeof(struct btrfs_ioctl_search_header);
				2157	return -EOVERFLOW;
				2158	}
				2159
				2160	path = btrfs_alloc_path();
				2161	if (!path)
				2162	return -ENOMEM;
				2163
				2164	if (sk->tree_id == 0) {
				2165	/* search the root of the inode that was passed */
				2166	root = BTRFS_I(inode)->root;
				2167	} else {
				2168	key.objectid = sk->tree_id;
				2169	key.type = BTRFS_ROOT_ITEM_KEY;
				2170	key.offset = (u64)-1;
				2171	root = btrfs_read_fs_root_no_name(info, &key);
				2172	if (IS_ERR(root)) {
				2173	btrfs_free_path(path);
				2174	return PTR_ERR(root);
				2175	}
				2176	}
				2177
				2178	key.objectid = sk->min_objectid;
				2179	key.type = sk->min_type;
				2180	key.offset = sk->min_offset;
				2181
				2182	while (1) {
				2183	ret = btrfs_search_forward(root, &key, path, sk->min_transid);
				2184	if (ret != 0) {
				2185	if (ret > 0)
				2186	ret = 0;
				2187	goto err;
				2188	}
				2189	ret = copy_to_sk(path, &key, sk, buf_size, ubuf,
				2190	&sk_offset, &num_found);
				2191	btrfs_release_path(path);
				2192	if (ret)
				2193	break;
				2194
				2195	}
				2196	if (ret > 0)
				2197	ret = 0;
				2198	err:
				2199	sk->nr_items = num_found;
				2200	btrfs_free_path(path);
				2201	return ret;
				2202	}
				2203
				2204	static noinline int btrfs_ioctl_tree_search(struct file *file,
				2205	void __user *argp)
				2206	{
				2207	struct btrfs_ioctl_search_args __user *uargs;
				2208	struct btrfs_ioctl_search_key sk;
				2209	struct inode *inode;
				2210	int ret;
				2211	size_t buf_size;
				2212
				2213	if (!capable(CAP_SYS_ADMIN))
				2214	return -EPERM;
				2215
				2216	uargs = (struct btrfs_ioctl_search_args __user *)argp;
				2217
				2218	if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
				2219	return -EFAULT;
				2220
				2221	buf_size = sizeof(uargs->buf);
				2222
				2223	inode = file_inode(file);
				2224	ret = search_ioctl(inode, &sk, &buf_size, uargs->buf);
				2225
				2226	/*
				2227	* In the origin implementation an overflow is handled by returning a
				2228	* search header with a len of zero, so reset ret.
				2229	*/
				2230	if (ret == -EOVERFLOW)
				2231	ret = 0;
				2232
				2233	if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk)))
				2234	ret = -EFAULT;
				2235	return ret;
				2236	}
				2237
				2238	static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
				2239	void __user *argp)
				2240	{
				2241	struct btrfs_ioctl_search_args_v2 __user *uarg;
				2242	struct btrfs_ioctl_search_args_v2 args;
				2243	struct inode *inode;
				2244	int ret;
				2245	size_t buf_size;
				2246	const size_t buf_limit = SZ_16M;
				2247
				2248	if (!capable(CAP_SYS_ADMIN))
				2249	return -EPERM;
				2250
				2251	/* copy search header and buffer size */
				2252	uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp;
				2253	if (copy_from_user(&args, uarg, sizeof(args)))
				2254	return -EFAULT;
				2255
				2256	buf_size = args.buf_size;
				2257
				2258	/* limit result size to 16MB */
				2259	if (buf_size > buf_limit)
				2260	buf_size = buf_limit;
				2261
				2262	inode = file_inode(file);
				2263	ret = search_ioctl(inode, &args.key, &buf_size,
				2264	(char __user *)(&uarg->buf[0]));
				2265	if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
				2266	ret = -EFAULT;
				2267	else if (ret == -EOVERFLOW &&
				2268	copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size)))
				2269	ret = -EFAULT;
				2270
				2271	return ret;
				2272	}
				2273
				2274	/*
				2275	* Search INODE_REFs to identify path name of 'dirid' directory
				2276	* in a 'tree_id' tree. and sets path name to 'name'.
				2277	*/
				2278	static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
				2279	u64 tree_id, u64 dirid, char *name)
				2280	{
				2281	struct btrfs_root *root;
				2282	struct btrfs_key key;
				2283	char *ptr;
				2284	int ret = -1;
				2285	int slot;
				2286	int len;
				2287	int total_len = 0;
				2288	struct btrfs_inode_ref *iref;
				2289	struct extent_buffer *l;
				2290	struct btrfs_path *path;
				2291
				2292	if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
				2293	name[0]='\0';
				2294	return 0;
				2295	}
				2296
				2297	path = btrfs_alloc_path();
				2298	if (!path)
				2299	return -ENOMEM;
				2300
				2301	ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX - 1];
				2302
				2303	key.objectid = tree_id;
				2304	key.type = BTRFS_ROOT_ITEM_KEY;
				2305	key.offset = (u64)-1;
				2306	root = btrfs_read_fs_root_no_name(info, &key);
				2307	if (IS_ERR(root)) {
				2308	ret = PTR_ERR(root);
				2309	goto out;
				2310	}
				2311
				2312	key.objectid = dirid;
				2313	key.type = BTRFS_INODE_REF_KEY;
				2314	key.offset = (u64)-1;
				2315
				2316	while (1) {
				2317	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				2318	if (ret < 0)
				2319	goto out;
				2320	else if (ret > 0) {
				2321	ret = btrfs_previous_item(root, path, dirid,
				2322	BTRFS_INODE_REF_KEY);
				2323	if (ret < 0)
				2324	goto out;
				2325	else if (ret > 0) {
				2326	ret = -ENOENT;
				2327	goto out;
				2328	}
				2329	}
				2330
				2331	l = path->nodes[0];
				2332	slot = path->slots[0];
				2333	btrfs_item_key_to_cpu(l, &key, slot);
				2334
				2335	iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
				2336	len = btrfs_inode_ref_name_len(l, iref);
				2337	ptr -= len + 1;
				2338	total_len += len + 1;
				2339	if (ptr < name) {
				2340	ret = -ENAMETOOLONG;
				2341	goto out;
				2342	}
				2343
				2344	*(ptr + len) = '/';
				2345	read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len);
				2346
				2347	if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
				2348	break;
				2349
				2350	btrfs_release_path(path);
				2351	key.objectid = key.offset;
				2352	key.offset = (u64)-1;
				2353	dirid = key.objectid;
				2354	}
				2355	memmove(name, ptr, total_len);
				2356	name[total_len] = '\0';
				2357	ret = 0;
				2358	out:
				2359	btrfs_free_path(path);
				2360	return ret;
				2361	}
				2362
				2363	static int btrfs_search_path_in_tree_user(struct inode *inode,
				2364	struct btrfs_ioctl_ino_lookup_user_args *args)
				2365	{
				2366	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
				2367	struct super_block *sb = inode->i_sb;
				2368	struct btrfs_key upper_limit = BTRFS_I(inode)->location;
				2369	u64 treeid = BTRFS_I(inode)->root->root_key.objectid;
				2370	u64 dirid = args->dirid;
				2371	unsigned long item_off;
				2372	unsigned long item_len;
				2373	struct btrfs_inode_ref *iref;
				2374	struct btrfs_root_ref *rref;
				2375	struct btrfs_root *root;
				2376	struct btrfs_path *path;
				2377	struct btrfs_key key, key2;
				2378	struct extent_buffer *leaf;
				2379	struct inode *temp_inode;
				2380	char *ptr;
				2381	int slot;
				2382	int len;
				2383	int total_len = 0;
				2384	int ret;
				2385
				2386	path = btrfs_alloc_path();
				2387	if (!path)
				2388	return -ENOMEM;
				2389
				2390	/*
				2391	* If the bottom subvolume does not exist directly under upper_limit,
				2392	* construct the path in from the bottom up.
				2393	*/
				2394	if (dirid != upper_limit.objectid) {
				2395	ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1];
				2396
				2397	key.objectid = treeid;
				2398	key.type = BTRFS_ROOT_ITEM_KEY;
				2399	key.offset = (u64)-1;
				2400	root = btrfs_read_fs_root_no_name(fs_info, &key);
				2401	if (IS_ERR(root)) {
				2402	ret = PTR_ERR(root);
				2403	goto out;
				2404	}
				2405
				2406	key.objectid = dirid;
				2407	key.type = BTRFS_INODE_REF_KEY;
				2408	key.offset = (u64)-1;
				2409	while (1) {
				2410	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				2411	if (ret < 0) {
				2412	goto out;
				2413	} else if (ret > 0) {
				2414	ret = btrfs_previous_item(root, path, dirid,
				2415	BTRFS_INODE_REF_KEY);
				2416	if (ret < 0) {
				2417	goto out;
				2418	} else if (ret > 0) {
				2419	ret = -ENOENT;
				2420	goto out;
				2421	}
				2422	}
				2423
				2424	leaf = path->nodes[0];
				2425	slot = path->slots[0];
				2426	btrfs_item_key_to_cpu(leaf, &key, slot);
				2427
				2428	iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref);
				2429	len = btrfs_inode_ref_name_len(leaf, iref);
				2430	ptr -= len + 1;
				2431	total_len += len + 1;
				2432	if (ptr < args->path) {
				2433	ret = -ENAMETOOLONG;
				2434	goto out;
				2435	}
				2436
				2437	*(ptr + len) = '/';
				2438	read_extent_buffer(leaf, ptr,
				2439	(unsigned long)(iref + 1), len);
				2440
				2441	/* Check the read+exec permission of this directory */
				2442	ret = btrfs_previous_item(root, path, dirid,
				2443	BTRFS_INODE_ITEM_KEY);
				2444	if (ret < 0) {
				2445	goto out;
				2446	} else if (ret > 0) {
				2447	ret = -ENOENT;
				2448	goto out;
				2449	}
				2450
				2451	leaf = path->nodes[0];
				2452	slot = path->slots[0];
				2453	btrfs_item_key_to_cpu(leaf, &key2, slot);
				2454	if (key2.objectid != dirid) {
				2455	ret = -ENOENT;
				2456	goto out;
				2457	}
				2458
				2459	temp_inode = btrfs_iget(sb, &key2, root, NULL);
				2460	if (IS_ERR(temp_inode)) {
				2461	ret = PTR_ERR(temp_inode);
				2462	goto out;
				2463	}
				2464	ret = inode_permission(temp_inode, MAY_READ \| MAY_EXEC);
				2465	iput(temp_inode);
				2466	if (ret) {
				2467	ret = -EACCES;
				2468	goto out;
				2469	}
				2470
				2471	if (key.offset == upper_limit.objectid)
				2472	break;
				2473	if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) {
				2474	ret = -EACCES;
				2475	goto out;
				2476	}
				2477
				2478	btrfs_release_path(path);
				2479	key.objectid = key.offset;
				2480	key.offset = (u64)-1;
				2481	dirid = key.objectid;
				2482	}
				2483
				2484	memmove(args->path, ptr, total_len);
				2485	args->path[total_len] = '\0';
				2486	btrfs_release_path(path);
				2487	}
				2488
				2489	/* Get the bottom subvolume's name from ROOT_REF */
				2490	root = fs_info->tree_root;
				2491	key.objectid = treeid;
				2492	key.type = BTRFS_ROOT_REF_KEY;
				2493	key.offset = args->treeid;
				2494	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				2495	if (ret < 0) {
				2496	goto out;
				2497	} else if (ret > 0) {
				2498	ret = -ENOENT;
				2499	goto out;
				2500	}
				2501
				2502	leaf = path->nodes[0];
				2503	slot = path->slots[0];
				2504	btrfs_item_key_to_cpu(leaf, &key, slot);
				2505
				2506	item_off = btrfs_item_ptr_offset(leaf, slot);
				2507	item_len = btrfs_item_size_nr(leaf, slot);
				2508	/* Check if dirid in ROOT_REF corresponds to passed dirid */
				2509	rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
				2510	if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) {
				2511	ret = -EINVAL;
				2512	goto out;
				2513	}
				2514
				2515	/* Copy subvolume's name */
				2516	item_off += sizeof(struct btrfs_root_ref);
				2517	item_len -= sizeof(struct btrfs_root_ref);
				2518	read_extent_buffer(leaf, args->name, item_off, item_len);
				2519	args->name[item_len] = 0;
				2520
				2521	out:
				2522	btrfs_free_path(path);
				2523	return ret;
				2524	}
				2525
				2526	static noinline int btrfs_ioctl_ino_lookup(struct file *file,
				2527	void __user *argp)
				2528	{
				2529	struct btrfs_ioctl_ino_lookup_args *args;
				2530	struct inode *inode;
				2531	int ret = 0;
				2532
				2533	args = memdup_user(argp, sizeof(*args));
				2534	if (IS_ERR(args))
				2535	return PTR_ERR(args);
				2536
				2537	inode = file_inode(file);
				2538
				2539	/*
				2540	* Unprivileged query to obtain the containing subvolume root id. The
				2541	* path is reset so it's consistent with btrfs_search_path_in_tree.
				2542	*/
				2543	if (args->treeid == 0)
				2544	args->treeid = BTRFS_I(inode)->root->root_key.objectid;
				2545
				2546	if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
				2547	args->name[0] = 0;
				2548	goto out;
				2549	}
				2550
				2551	if (!capable(CAP_SYS_ADMIN)) {
				2552	ret = -EPERM;
				2553	goto out;
				2554	}
				2555
				2556	ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
				2557	args->treeid, args->objectid,
				2558	args->name);
				2559
				2560	out:
				2561	if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
				2562	ret = -EFAULT;
				2563
				2564	kfree(args);
				2565	return ret;
				2566	}
				2567
				2568	/*
				2569	* Version of ino_lookup ioctl (unprivileged)
				2570	*
				2571	* The main differences from ino_lookup ioctl are:
				2572	*
				2573	* 1. Read + Exec permission will be checked using inode_permission() during
				2574	* path construction. -EACCES will be returned in case of failure.
				2575	* 2. Path construction will be stopped at the inode number which corresponds
				2576	* to the fd with which this ioctl is called. If constructed path does not
				2577	* exist under fd's inode, -EACCES will be returned.
				2578	* 3. The name of bottom subvolume is also searched and filled.
				2579	*/
				2580	static int btrfs_ioctl_ino_lookup_user(struct file file, void __user argp)
				2581	{
				2582	struct btrfs_ioctl_ino_lookup_user_args *args;
				2583	struct inode *inode;
				2584	int ret;
				2585
				2586	args = memdup_user(argp, sizeof(*args));
				2587	if (IS_ERR(args))
				2588	return PTR_ERR(args);
				2589
				2590	inode = file_inode(file);
				2591
				2592	if (args->dirid == BTRFS_FIRST_FREE_OBJECTID &&
				2593	BTRFS_I(inode)->location.objectid != BTRFS_FIRST_FREE_OBJECTID) {
				2594	/*
				2595	* The subvolume does not exist under fd with which this is
				2596	* called
				2597	*/
				2598	kfree(args);
				2599	return -EACCES;
				2600	}
				2601
				2602	ret = btrfs_search_path_in_tree_user(inode, args);
				2603
				2604	if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
				2605	ret = -EFAULT;
				2606
				2607	kfree(args);
				2608	return ret;
				2609	}
				2610
				2611	/* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */
				2612	static int btrfs_ioctl_get_subvol_info(struct file file, void __user argp)
				2613	{
				2614	struct btrfs_ioctl_get_subvol_info_args *subvol_info;
				2615	struct btrfs_fs_info *fs_info;
				2616	struct btrfs_root *root;
				2617	struct btrfs_path *path;
				2618	struct btrfs_key key;
				2619	struct btrfs_root_item *root_item;
				2620	struct btrfs_root_ref *rref;
				2621	struct extent_buffer *leaf;
				2622	unsigned long item_off;
				2623	unsigned long item_len;
				2624	struct inode *inode;
				2625	int slot;
				2626	int ret = 0;
				2627
				2628	path = btrfs_alloc_path();
				2629	if (!path)
				2630	return -ENOMEM;
				2631
				2632	subvol_info = kzalloc(sizeof(*subvol_info), GFP_KERNEL);
				2633	if (!subvol_info) {
				2634	btrfs_free_path(path);
				2635	return -ENOMEM;
				2636	}
				2637
				2638	inode = file_inode(file);
				2639	fs_info = BTRFS_I(inode)->root->fs_info;
				2640
				2641	/* Get root_item of inode's subvolume */
				2642	key.objectid = BTRFS_I(inode)->root->root_key.objectid;
				2643	key.type = BTRFS_ROOT_ITEM_KEY;
				2644	key.offset = (u64)-1;
				2645	root = btrfs_read_fs_root_no_name(fs_info, &key);
				2646	if (IS_ERR(root)) {
				2647	ret = PTR_ERR(root);
				2648	goto out;
				2649	}
				2650	root_item = &root->root_item;
				2651
				2652	subvol_info->treeid = key.objectid;
				2653
				2654	subvol_info->generation = btrfs_root_generation(root_item);
				2655	subvol_info->flags = btrfs_root_flags(root_item);
				2656
				2657	memcpy(subvol_info->uuid, root_item->uuid, BTRFS_UUID_SIZE);
				2658	memcpy(subvol_info->parent_uuid, root_item->parent_uuid,
				2659	BTRFS_UUID_SIZE);
				2660	memcpy(subvol_info->received_uuid, root_item->received_uuid,
				2661	BTRFS_UUID_SIZE);
				2662
				2663	subvol_info->ctransid = btrfs_root_ctransid(root_item);
				2664	subvol_info->ctime.sec = btrfs_stack_timespec_sec(&root_item->ctime);
				2665	subvol_info->ctime.nsec = btrfs_stack_timespec_nsec(&root_item->ctime);
				2666
				2667	subvol_info->otransid = btrfs_root_otransid(root_item);
				2668	subvol_info->otime.sec = btrfs_stack_timespec_sec(&root_item->otime);
				2669	subvol_info->otime.nsec = btrfs_stack_timespec_nsec(&root_item->otime);
				2670
				2671	subvol_info->stransid = btrfs_root_stransid(root_item);
				2672	subvol_info->stime.sec = btrfs_stack_timespec_sec(&root_item->stime);
				2673	subvol_info->stime.nsec = btrfs_stack_timespec_nsec(&root_item->stime);
				2674
				2675	subvol_info->rtransid = btrfs_root_rtransid(root_item);
				2676	subvol_info->rtime.sec = btrfs_stack_timespec_sec(&root_item->rtime);
				2677	subvol_info->rtime.nsec = btrfs_stack_timespec_nsec(&root_item->rtime);
				2678
				2679	if (key.objectid != BTRFS_FS_TREE_OBJECTID) {
				2680	/* Search root tree for ROOT_BACKREF of this subvolume */
				2681	root = fs_info->tree_root;
				2682
				2683	key.type = BTRFS_ROOT_BACKREF_KEY;
				2684	key.offset = 0;
				2685	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				2686	if (ret < 0) {
				2687	goto out;
				2688	} else if (path->slots[0] >=
				2689	btrfs_header_nritems(path->nodes[0])) {
				2690	ret = btrfs_next_leaf(root, path);
				2691	if (ret < 0) {
				2692	goto out;
				2693	} else if (ret > 0) {
				2694	ret = -EUCLEAN;
				2695	goto out;
				2696	}
				2697	}
				2698
				2699	leaf = path->nodes[0];
				2700	slot = path->slots[0];
				2701	btrfs_item_key_to_cpu(leaf, &key, slot);
				2702	if (key.objectid == subvol_info->treeid &&
				2703	key.type == BTRFS_ROOT_BACKREF_KEY) {
				2704	subvol_info->parent_id = key.offset;
				2705
				2706	rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
				2707	subvol_info->dirid = btrfs_root_ref_dirid(leaf, rref);
				2708
				2709	item_off = btrfs_item_ptr_offset(leaf, slot)
				2710	+ sizeof(struct btrfs_root_ref);
				2711	item_len = btrfs_item_size_nr(leaf, slot)
				2712	- sizeof(struct btrfs_root_ref);
				2713	read_extent_buffer(leaf, subvol_info->name,
				2714	item_off, item_len);
				2715	} else {
				2716	ret = -ENOENT;
				2717	goto out;
				2718	}
				2719	}
				2720
				2721	if (copy_to_user(argp, subvol_info, sizeof(*subvol_info)))
				2722	ret = -EFAULT;
				2723
				2724	out:
				2725	btrfs_free_path(path);
				2726	kzfree(subvol_info);
				2727	return ret;
				2728	}
				2729
				2730	/*
				2731	* Return ROOT_REF information of the subvolume containing this inode
				2732	* except the subvolume name.
				2733	*/
				2734	static int btrfs_ioctl_get_subvol_rootref(struct file file, void __user argp)
				2735	{
				2736	struct btrfs_ioctl_get_subvol_rootref_args *rootrefs;
				2737	struct btrfs_root_ref *rref;
				2738	struct btrfs_root *root;
				2739	struct btrfs_path *path;
				2740	struct btrfs_key key;
				2741	struct extent_buffer *leaf;
				2742	struct inode *inode;
				2743	u64 objectid;
				2744	int slot;
				2745	int ret;
				2746	u8 found;
				2747
				2748	path = btrfs_alloc_path();
				2749	if (!path)
				2750	return -ENOMEM;
				2751
				2752	rootrefs = memdup_user(argp, sizeof(*rootrefs));
				2753	if (IS_ERR(rootrefs)) {
				2754	btrfs_free_path(path);
				2755	return PTR_ERR(rootrefs);
				2756	}
				2757
				2758	inode = file_inode(file);
				2759	root = BTRFS_I(inode)->root->fs_info->tree_root;
				2760	objectid = BTRFS_I(inode)->root->root_key.objectid;
				2761
				2762	key.objectid = objectid;
				2763	key.type = BTRFS_ROOT_REF_KEY;
				2764	key.offset = rootrefs->min_treeid;
				2765	found = 0;
				2766
				2767	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				2768	if (ret < 0) {
				2769	goto out;
				2770	} else if (path->slots[0] >=
				2771	btrfs_header_nritems(path->nodes[0])) {
				2772	ret = btrfs_next_leaf(root, path);
				2773	if (ret < 0) {
				2774	goto out;
				2775	} else if (ret > 0) {
				2776	ret = -EUCLEAN;
				2777	goto out;
				2778	}
				2779	}
				2780	while (1) {
				2781	leaf = path->nodes[0];
				2782	slot = path->slots[0];
				2783
				2784	btrfs_item_key_to_cpu(leaf, &key, slot);
				2785	if (key.objectid != objectid \|\| key.type != BTRFS_ROOT_REF_KEY) {
				2786	ret = 0;
				2787	goto out;
				2788	}
				2789
				2790	if (found == BTRFS_MAX_ROOTREF_BUFFER_NUM) {
				2791	ret = -EOVERFLOW;
				2792	goto out;
				2793	}
				2794
				2795	rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
				2796	rootrefs->rootref[found].treeid = key.offset;
				2797	rootrefs->rootref[found].dirid =
				2798	btrfs_root_ref_dirid(leaf, rref);
				2799	found++;
				2800
				2801	ret = btrfs_next_item(root, path);
				2802	if (ret < 0) {
				2803	goto out;
				2804	} else if (ret > 0) {
				2805	ret = -EUCLEAN;
				2806	goto out;
				2807	}
				2808	}
				2809
				2810	out:
				2811	if (!ret \|\| ret == -EOVERFLOW) {
				2812	rootrefs->num_items = found;
				2813	/* update min_treeid for next search */
				2814	if (found)
				2815	rootrefs->min_treeid =
				2816	rootrefs->rootref[found - 1].treeid + 1;
				2817	if (copy_to_user(argp, rootrefs, sizeof(*rootrefs)))
				2818	ret = -EFAULT;
				2819	}
				2820
				2821	kfree(rootrefs);
				2822	btrfs_free_path(path);
				2823
				2824	return ret;
				2825	}
				2826
				2827	static noinline int btrfs_ioctl_snap_destroy(struct file *file,
				2828	void __user *arg)
				2829	{
				2830	struct dentry *parent = file->f_path.dentry;
				2831	struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb);
				2832	struct dentry *dentry;
				2833	struct inode *dir = d_inode(parent);
				2834	struct inode *inode;
				2835	struct btrfs_root *root = BTRFS_I(dir)->root;
				2836	struct btrfs_root *dest = NULL;
				2837	struct btrfs_ioctl_vol_args *vol_args;
				2838	int namelen;
				2839	int err = 0;
				2840
				2841	if (!S_ISDIR(dir->i_mode))
				2842	return -ENOTDIR;
				2843
				2844	vol_args = memdup_user(arg, sizeof(*vol_args));
				2845	if (IS_ERR(vol_args))
				2846	return PTR_ERR(vol_args);
				2847
				2848	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
				2849	namelen = strlen(vol_args->name);
				2850	if (strchr(vol_args->name, '/') \|\|
				2851	strncmp(vol_args->name, "..", namelen) == 0) {
				2852	err = -EINVAL;
				2853	goto out;
				2854	}
				2855
				2856	err = mnt_want_write_file(file);
				2857	if (err)
				2858	goto out;
				2859
				2860
				2861	err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
				2862	if (err == -EINTR)
				2863	goto out_drop_write;
				2864	dentry = lookup_one_len(vol_args->name, parent, namelen);
				2865	if (IS_ERR(dentry)) {
				2866	err = PTR_ERR(dentry);
				2867	goto out_unlock_dir;
				2868	}
				2869
				2870	if (d_really_is_negative(dentry)) {
				2871	err = -ENOENT;
				2872	goto out_dput;
				2873	}
				2874
				2875	inode = d_inode(dentry);
				2876	dest = BTRFS_I(inode)->root;
				2877	if (!capable(CAP_SYS_ADMIN)) {
				2878	/*
				2879	* Regular user. Only allow this with a special mount
				2880	* option, when the user has write+exec access to the
				2881	* subvol root, and when rmdir(2) would have been
				2882	* allowed.
				2883	*
				2884	* Note that this is _not_ check that the subvol is
				2885	* empty or doesn't contain data that we wouldn't
				2886	* otherwise be able to delete.
				2887	*
				2888	* Users who want to delete empty subvols should try
				2889	* rmdir(2).
				2890	*/
				2891	err = -EPERM;
				2892	if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED))
				2893	goto out_dput;
				2894
				2895	/*
				2896	* Do not allow deletion if the parent dir is the same
				2897	* as the dir to be deleted. That means the ioctl
				2898	* must be called on the dentry referencing the root
				2899	* of the subvol, not a random directory contained
				2900	* within it.
				2901	*/
				2902	err = -EINVAL;
				2903	if (root == dest)
				2904	goto out_dput;
				2905
				2906	err = inode_permission(inode, MAY_WRITE \| MAY_EXEC);
				2907	if (err)
				2908	goto out_dput;
				2909	}
				2910
				2911	/* check if subvolume may be deleted by a user */
				2912	err = btrfs_may_delete(dir, dentry, 1);
				2913	if (err)
				2914	goto out_dput;
				2915
				2916	if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
				2917	err = -EINVAL;
				2918	goto out_dput;
				2919	}
				2920
				2921	inode_lock(inode);
				2922	err = btrfs_delete_subvolume(dir, dentry);
				2923	inode_unlock(inode);
				2924	if (!err)
				2925	d_delete(dentry);
				2926
				2927	out_dput:
				2928	dput(dentry);
				2929	out_unlock_dir:
				2930	inode_unlock(dir);
				2931	out_drop_write:
				2932	mnt_drop_write_file(file);
				2933	out:
				2934	kfree(vol_args);
				2935	return err;
				2936	}
				2937
				2938	static int btrfs_ioctl_defrag(struct file file, void __user argp)
				2939	{
				2940	struct inode *inode = file_inode(file);
				2941	struct btrfs_root *root = BTRFS_I(inode)->root;
				2942	struct btrfs_ioctl_defrag_range_args *range;
				2943	int ret;
				2944
				2945	ret = mnt_want_write_file(file);
				2946	if (ret)
				2947	return ret;
				2948
				2949	if (btrfs_root_readonly(root)) {
				2950	ret = -EROFS;
				2951	goto out;
				2952	}
				2953
				2954	switch (inode->i_mode & S_IFMT) {
				2955	case S_IFDIR:
				2956	if (!capable(CAP_SYS_ADMIN)) {
				2957	ret = -EPERM;
				2958	goto out;
				2959	}
				2960	ret = btrfs_defrag_root(root);
				2961	break;
				2962	case S_IFREG:
				2963	/*
				2964	* Note that this does not check the file descriptor for write
				2965	* access. This prevents defragmenting executables that are
				2966	* running and allows defrag on files open in read-only mode.
				2967	*/
				2968	if (!capable(CAP_SYS_ADMIN) &&
				2969	inode_permission(inode, MAY_WRITE)) {
				2970	ret = -EPERM;
				2971	goto out;
				2972	}
				2973
				2974	range = kzalloc(sizeof(*range), GFP_KERNEL);
				2975	if (!range) {
				2976	ret = -ENOMEM;
				2977	goto out;
				2978	}
				2979
				2980	if (argp) {
				2981	if (copy_from_user(range, argp,
				2982	sizeof(*range))) {
				2983	ret = -EFAULT;
				2984	kfree(range);
				2985	goto out;
				2986	}
				2987	/* compression requires us to start the IO */
				2988	if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
				2989	range->flags \|= BTRFS_DEFRAG_RANGE_START_IO;
				2990	range->extent_thresh = (u32)-1;
				2991	}
				2992	} else {
				2993	/* the rest are all set to zero by kzalloc */
				2994	range->len = (u64)-1;
				2995	}
				2996	ret = btrfs_defrag_file(file_inode(file), file,
				2997	range, BTRFS_OLDEST_GENERATION, 0);
				2998	if (ret > 0)
				2999	ret = 0;
				3000	kfree(range);
				3001	break;
				3002	default:
				3003	ret = -EINVAL;
				3004	}
				3005	out:
				3006	mnt_drop_write_file(file);
				3007	return ret;
				3008	}
				3009
				3010	static long btrfs_ioctl_add_dev(struct btrfs_fs_info fs_info, void __user arg)
				3011	{
				3012	struct btrfs_ioctl_vol_args *vol_args;
				3013	int ret;
				3014
				3015	if (!capable(CAP_SYS_ADMIN))
				3016	return -EPERM;
				3017
				3018	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
				3019	return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
				3020
				3021	vol_args = memdup_user(arg, sizeof(*vol_args));
				3022	if (IS_ERR(vol_args)) {
				3023	ret = PTR_ERR(vol_args);
				3024	goto out;
				3025	}
				3026
				3027	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
				3028	ret = btrfs_init_new_device(fs_info, vol_args->name);
				3029
				3030	if (!ret)
				3031	btrfs_info(fs_info, "disk added %s", vol_args->name);
				3032
				3033	kfree(vol_args);
				3034	out:
				3035	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
				3036	return ret;
				3037	}
				3038
				3039	static long btrfs_ioctl_rm_dev_v2(struct file file, void __user arg)
				3040	{
				3041	struct inode *inode = file_inode(file);
				3042	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				3043	struct btrfs_ioctl_vol_args_v2 *vol_args;
				3044	int ret;
				3045
				3046	if (!capable(CAP_SYS_ADMIN))
				3047	return -EPERM;
				3048
				3049	ret = mnt_want_write_file(file);
				3050	if (ret)
				3051	return ret;
				3052
				3053	vol_args = memdup_user(arg, sizeof(*vol_args));
				3054	if (IS_ERR(vol_args)) {
				3055	ret = PTR_ERR(vol_args);
				3056	goto err_drop;
				3057	}
				3058
				3059	/* Check for compatibility reject unknown flags */
				3060	if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) {
				3061	ret = -EOPNOTSUPP;
				3062	goto out;
				3063	}
				3064
				3065	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
				3066	ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
				3067	goto out;
				3068	}
				3069
				3070	if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
				3071	ret = btrfs_rm_device(fs_info, NULL, vol_args->devid);
				3072	} else {
				3073	vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
				3074	ret = btrfs_rm_device(fs_info, vol_args->name, 0);
				3075	}
				3076	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
				3077
				3078	if (!ret) {
				3079	if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
				3080	btrfs_info(fs_info, "device deleted: id %llu",
				3081	vol_args->devid);
				3082	else
				3083	btrfs_info(fs_info, "device deleted: %s",
				3084	vol_args->name);
				3085	}
				3086	out:
				3087	kfree(vol_args);
				3088	err_drop:
				3089	mnt_drop_write_file(file);
				3090	return ret;
				3091	}
				3092
				3093	static long btrfs_ioctl_rm_dev(struct file file, void __user arg)
				3094	{
				3095	struct inode *inode = file_inode(file);
				3096	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				3097	struct btrfs_ioctl_vol_args *vol_args;
				3098	int ret;
				3099
				3100	if (!capable(CAP_SYS_ADMIN))
				3101	return -EPERM;
				3102
				3103	ret = mnt_want_write_file(file);
				3104	if (ret)
				3105	return ret;
				3106
				3107	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
				3108	ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
				3109	goto out_drop_write;
				3110	}
				3111
				3112	vol_args = memdup_user(arg, sizeof(*vol_args));
				3113	if (IS_ERR(vol_args)) {
				3114	ret = PTR_ERR(vol_args);
				3115	goto out;
				3116	}
				3117
				3118	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
				3119	ret = btrfs_rm_device(fs_info, vol_args->name, 0);
				3120
				3121	if (!ret)
				3122	btrfs_info(fs_info, "disk deleted %s", vol_args->name);
				3123	kfree(vol_args);
				3124	out:
				3125	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
				3126	out_drop_write:
				3127	mnt_drop_write_file(file);
				3128
				3129	return ret;
				3130	}
				3131
				3132	static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
				3133	void __user *arg)
				3134	{
				3135	struct btrfs_ioctl_fs_info_args *fi_args;
				3136	struct btrfs_device *device;
				3137	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				3138	int ret = 0;
				3139
				3140	fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL);
				3141	if (!fi_args)
				3142	return -ENOMEM;
				3143
				3144	rcu_read_lock();
				3145	fi_args->num_devices = fs_devices->num_devices;
				3146
				3147	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
				3148	if (device->devid > fi_args->max_id)
				3149	fi_args->max_id = device->devid;
				3150	}
				3151	rcu_read_unlock();
				3152
				3153	memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid));
				3154	fi_args->nodesize = fs_info->nodesize;
				3155	fi_args->sectorsize = fs_info->sectorsize;
				3156	fi_args->clone_alignment = fs_info->sectorsize;
				3157
				3158	if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
				3159	ret = -EFAULT;
				3160
				3161	kfree(fi_args);
				3162	return ret;
				3163	}
				3164
				3165	static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
				3166	void __user *arg)
				3167	{
				3168	struct btrfs_ioctl_dev_info_args *di_args;
				3169	struct btrfs_device *dev;
				3170	int ret = 0;
				3171	char *s_uuid = NULL;
				3172
				3173	di_args = memdup_user(arg, sizeof(*di_args));
				3174	if (IS_ERR(di_args))
				3175	return PTR_ERR(di_args);
				3176
				3177	if (!btrfs_is_empty_uuid(di_args->uuid))
				3178	s_uuid = di_args->uuid;
				3179
				3180	rcu_read_lock();
				3181	dev = btrfs_find_device(fs_info, di_args->devid, s_uuid, NULL);
				3182
				3183	if (!dev) {
				3184	ret = -ENODEV;
				3185	goto out;
				3186	}
				3187
				3188	di_args->devid = dev->devid;
				3189	di_args->bytes_used = btrfs_device_get_bytes_used(dev);
				3190	di_args->total_bytes = btrfs_device_get_total_bytes(dev);
				3191	memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
				3192	if (dev->name) {
				3193	strncpy(di_args->path, rcu_str_deref(dev->name),
				3194	sizeof(di_args->path) - 1);
				3195	di_args->path[sizeof(di_args->path) - 1] = 0;
				3196	} else {
				3197	di_args->path[0] = '\0';
				3198	}
				3199
				3200	out:
				3201	rcu_read_unlock();
				3202	if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
				3203	ret = -EFAULT;
				3204
				3205	kfree(di_args);
				3206	return ret;
				3207	}
				3208
				3209	static struct page extent_same_get_page(struct inode inode, pgoff_t index)
				3210	{
				3211	struct page *page;
				3212
				3213	page = grab_cache_page(inode->i_mapping, index);
				3214	if (!page)
				3215	return ERR_PTR(-ENOMEM);
				3216
				3217	if (!PageUptodate(page)) {
				3218	int ret;
				3219
				3220	ret = btrfs_readpage(NULL, page);
				3221	if (ret)
				3222	return ERR_PTR(ret);
				3223	lock_page(page);
				3224	if (!PageUptodate(page)) {
				3225	unlock_page(page);
				3226	put_page(page);
				3227	return ERR_PTR(-EIO);
				3228	}
				3229	if (page->mapping != inode->i_mapping) {
				3230	unlock_page(page);
				3231	put_page(page);
				3232	return ERR_PTR(-EAGAIN);
				3233	}
				3234	}
				3235
				3236	return page;
				3237	}
				3238
				3239	static int gather_extent_pages(struct inode inode, struct page *pages,
				3240	int num_pages, u64 off)
				3241	{
				3242	int i;
				3243	pgoff_t index = off >> PAGE_SHIFT;
				3244
				3245	for (i = 0; i < num_pages; i++) {
				3246	again:
				3247	pages[i] = extent_same_get_page(inode, index + i);
				3248	if (IS_ERR(pages[i])) {
				3249	int err = PTR_ERR(pages[i]);
				3250
				3251	if (err == -EAGAIN)
				3252	goto again;
				3253	pages[i] = NULL;
				3254	return err;
				3255	}
				3256	}
				3257	return 0;
				3258	}
				3259
				3260	static int lock_extent_range(struct inode *inode, u64 off, u64 len,
				3261	bool retry_range_locking)
				3262	{
				3263	/*
				3264	* Do any pending delalloc/csum calculations on inode, one way or
				3265	* another, and lock file content.
				3266	* The locking order is:
				3267	*
				3268	* 1) pages
				3269	* 2) range in the inode's io tree
				3270	*/
				3271	while (1) {
				3272	struct btrfs_ordered_extent *ordered;
				3273	lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
				3274	ordered = btrfs_lookup_first_ordered_extent(inode,
				3275	off + len - 1);
				3276	if ((!ordered \|\|
				3277	ordered->file_offset + ordered->len <= off \|\|
				3278	ordered->file_offset >= off + len) &&
				3279	!test_range_bit(&BTRFS_I(inode)->io_tree, off,
				3280	off + len - 1, EXTENT_DELALLOC, 0, NULL)) {
				3281	if (ordered)
				3282	btrfs_put_ordered_extent(ordered);
				3283	break;
				3284	}
				3285	unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
				3286	if (ordered)
				3287	btrfs_put_ordered_extent(ordered);
				3288	if (!retry_range_locking)
				3289	return -EAGAIN;
				3290	btrfs_wait_ordered_range(inode, off, len);
				3291	}
				3292	return 0;
				3293	}
				3294
				/* Unlock both inodes that were locked with btrfs_double_inode_lock(). */
				static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2)
				{
					inode_unlock(inode1);
					inode_unlock(inode2);
				}
				3300
				3301	static void btrfs_double_inode_lock(struct inode inode1, struct inode inode2)
				3302	{
				3303	if (inode1 < inode2)
				3304	swap(inode1, inode2);
				3305
				3306	inode_lock_nested(inode1, I_MUTEX_PARENT);
				3307	inode_lock_nested(inode2, I_MUTEX_CHILD);
				3308	}
				3309
				3310	static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
				3311	struct inode *inode2, u64 loff2, u64 len)
				3312	{
				3313	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
				3314	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
				3315	}
				3316
				3317	static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
				3318	struct inode *inode2, u64 loff2, u64 len,
				3319	bool retry_range_locking)
				3320	{
				3321	int ret;
				3322
				3323	if (inode1 < inode2) {
				3324	swap(inode1, inode2);
				3325	swap(loff1, loff2);
				3326	}
				3327	ret = lock_extent_range(inode1, loff1, len, retry_range_locking);
				3328	if (ret)
				3329	return ret;
				3330	ret = lock_extent_range(inode2, loff2, len, retry_range_locking);
				3331	if (ret)
				3332	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1,
				3333	loff1 + len - 1);
				3334	return ret;
				3335	}
				3336
				/*
				 * Pages gathered from the source and destination ranges so that their
				 * contents can be compared.
				 */
				struct cmp_pages {
					/* number of slots in use in each of the two arrays */
					int num_pages;
					/* pages of the source range; unused slots are NULL */
					struct page **src_pages;
					/* pages of the destination range; unused slots are NULL */
					struct page **dst_pages;
				};
				3342
				3343	static void btrfs_cmp_data_free(struct cmp_pages *cmp)
				3344	{
				3345	int i;
				3346	struct page *pg;
				3347
				3348	for (i = 0; i < cmp->num_pages; i++) {
				3349	pg = cmp->src_pages[i];
				3350	if (pg) {
				3351	unlock_page(pg);
				3352	put_page(pg);
				3353	cmp->src_pages[i] = NULL;
				3354	}
				3355	pg = cmp->dst_pages[i];
				3356	if (pg) {
				3357	unlock_page(pg);
				3358	put_page(pg);
				3359	cmp->dst_pages[i] = NULL;
				3360	}
				3361	}
				3362	}
				3363
				3364	static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
				3365	struct inode *dst, u64 dst_loff,
				3366	u64 len, struct cmp_pages *cmp)
				3367	{
				3368	int ret;
				3369	int num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT;
				3370
				3371	cmp->num_pages = num_pages;
				3372
				3373	ret = gather_extent_pages(src, cmp->src_pages, num_pages, loff);
				3374	if (ret)
				3375	goto out;
				3376
				3377	ret = gather_extent_pages(dst, cmp->dst_pages, num_pages, dst_loff);
				3378
				3379	out:
				3380	if (ret)
				3381	btrfs_cmp_data_free(cmp);
				3382	return ret;
				3383	}
				3384
				3385	static int btrfs_cmp_data(u64 len, struct cmp_pages *cmp)
				3386	{
				3387	int ret = 0;
				3388	int i;
				3389	struct page src_page, dst_page;
				3390	unsigned int cmp_len = PAGE_SIZE;
				3391	void addr, dst_addr;
				3392
				3393	i = 0;
				3394	while (len) {
				3395	if (len < PAGE_SIZE)
				3396	cmp_len = len;
				3397
				3398	BUG_ON(i >= cmp->num_pages);
				3399
				3400	src_page = cmp->src_pages[i];
				3401	dst_page = cmp->dst_pages[i];
				3402	ASSERT(PageLocked(src_page));
				3403	ASSERT(PageLocked(dst_page));
				3404
				3405	addr = kmap_atomic(src_page);
				3406	dst_addr = kmap_atomic(dst_page);
				3407
				3408	flush_dcache_page(src_page);
				3409	flush_dcache_page(dst_page);
				3410
				3411	if (memcmp(addr, dst_addr, cmp_len))
				3412	ret = -EBADE;
				3413
				3414	kunmap_atomic(addr);
				3415	kunmap_atomic(dst_addr);
				3416
				3417	if (ret)
				3418	break;
				3419
				3420	len -= cmp_len;
				3421	i++;
				3422	}
				3423
				3424	return ret;
				3425	}
				3426
				3427	static int extent_same_check_offsets(struct inode inode, u64 off, u64 plen,
				3428	u64 olen)
				3429	{
				3430	u64 len = *plen;
				3431	u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize;
				3432
				3433	if (off + olen > inode->i_size \|\| off + olen < off)
				3434	return -EINVAL;
				3435
				3436	/* if we extend to eof, continue to block boundary */
				3437	if (off + len == inode->i_size)
				3438	*plen = len = ALIGN(inode->i_size, bs) - off;
				3439
				3440	/* Check that we are block aligned - btrfs_clone() requires this */
				3441	if (!IS_ALIGNED(off, bs) \|\| !IS_ALIGNED(off + len, bs))
				3442	return -EINVAL;
				3443
				3444	return 0;
				3445	}
				3446
				/*
				 * Compare one chunk of two file ranges and, if the bytes are identical,
				 * clone the source extents over the destination range.
				 *
				 * @src, @loff:     source inode and start offset
				 * @olen:           length requested for this chunk
				 * @dst, @dst_loff: destination inode and start offset
				 * @cmp:            caller provided page arrays used to hold the locked pages
				 *
				 * Pages are gathered (and locked) first, then the io tree ranges are
				 * locked.  If range locking reports -EAGAIN, the pages are released, the
				 * ordered extents in the ranges are waited for and the whole sequence is
				 * retried.
				 *
				 * Returns 0 on success (including the case where rounding down leaves
				 * nothing to do), -EINVAL for rejected ranges, -EBADE from
				 * btrfs_cmp_data() when the contents differ, or an error from page
				 * gathering or btrfs_clone().
				 */
				3447	static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen,
				3448	struct inode *dst, u64 dst_loff,
				3449	struct cmp_pages *cmp)
				3450	{
				3451	int ret;
				3452	u64 len = olen;
				3453	bool same_inode = (src == dst);
				3454	u64 same_lock_start = 0;
				3455	u64 same_lock_len = 0;
				3456
				/* Both sides are validated; either call may extend len at i_size. */
				3457	ret = extent_same_check_offsets(src, loff, &len, olen);
				3458	if (ret)
				3459	return ret;
				3460
				3461	ret = extent_same_check_offsets(dst, dst_loff, &len, olen);
				3462	if (ret)
				3463	return ret;
				3464
				3465	if (same_inode) {
				3466	/*
				3467	* Single inode case wants the same checks, except we
				3468	* don't want our length pushed out past i_size as
				3469	* comparing that data range makes no sense.
				3470	*
				3471	* extent_same_check_offsets() will do this for an
				3472	* unaligned length at i_size, so catch it here and
				3473	* reject the request.
				3474	*
				3475	* This effectively means we require aligned extents
				3476	* for the single-inode case, whereas the other cases
				3477	* allow an unaligned length so long as it ends at
				3478	* i_size.
				3479	*/
				3480	if (len != olen)
				3481	return -EINVAL;
				3482
				3483	/* Check for overlapping ranges */
				3484	if (dst_loff + len > loff && dst_loff < loff + len)
				3485	return -EINVAL;
				3486
				/* One lock covering both ranges, from the lower start to the higher end. */
				3487	same_lock_start = min_t(u64, loff, dst_loff);
				3488	same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start;
				3489	} else {
				3490	/*
				3491	* If the source and destination inodes are different, the
				3492	* source's range end offset matches the source's i_size, that
				3493	* i_size is not a multiple of the sector size, and the
				3494	* destination range does not go past the destination's i_size,
				3495	* we must round down the length to the nearest sector size
				3496	* multiple. If we don't do this adjustment we end up replacing
				3497	* with zeroes the bytes in the range that starts at the
				3498	* deduplication range's end offset and ends at the next sector
				3499	* size multiple.
				3500	*/
				3501	if (loff + olen == i_size_read(src) &&
				3502	dst_loff + len < i_size_read(dst)) {
				3503	const u64 sz = BTRFS_I(src)->root->fs_info->sectorsize;
				3504
				3505	len = round_down(i_size_read(src), sz) - loff;
				3506	if (len == 0)
				3507	return 0;
				3508	olen = len;
				3509	}
				3510	}
				3511
				3512	again:
				/* Step 1: gather and lock the pages of both ranges. */
				3513	ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, cmp);
				3514	if (ret)
				3515	return ret;
				3516
				/* Step 2: lock the io tree range(s) without waiting (retry flag is false). */
				3517	if (same_inode)
				3518	ret = lock_extent_range(src, same_lock_start, same_lock_len,
				3519	false);
				3520	else
				3521	ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len,
				3522	false);
				3523	/*
				3524	* If one of the inodes has dirty pages in the respective range or
				3525	* ordered extents, we need to flush delalloc and wait for all ordered
				3526	* extents in the range. We must unlock the pages and the ranges in the
				3527	* io trees to avoid deadlocks when flushing delalloc (requires locking
				3528	* pages) and when waiting for ordered extents to complete (they require
				3529	* range locking).
				3530	*/
				3531	if (ret == -EAGAIN) {
				3532	/*
				3533	* Ranges in the io trees already unlocked. Now unlock all
				3534	* pages before waiting for all IO to complete.
				3535	*/
				3536	btrfs_cmp_data_free(cmp);
				3537	if (same_inode) {
				3538	btrfs_wait_ordered_range(src, same_lock_start,
				3539	same_lock_len);
				3540	} else {
				3541	btrfs_wait_ordered_range(src, loff, len);
				3542	btrfs_wait_ordered_range(dst, dst_loff, len);
				3543	}
				3544	goto again;
				3545	}
				/* Any other non-zero result is unexpected here. */
				3546	ASSERT(ret == 0);
				3547	if (WARN_ON(ret)) {
				3548	/* ranges in the io trees already unlocked */
				3549	btrfs_cmp_data_free(cmp);
				3550	return ret;
				3551	}
				3552
				3553	/* pass original length for comparison so we stay within i_size */
				3554	ret = btrfs_cmp_data(olen, cmp);
				3555	if (ret == 0)
				3556	ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
				3557
				/* Release in reverse order: io tree ranges first, then the pages. */
				3558	if (same_inode)
				3559	unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start,
				3560	same_lock_start + same_lock_len - 1);
				3561	else
				3562	btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
				3563
				3564	btrfs_cmp_data_free(cmp);
				3565
				3566	return ret;
				3567	}
				3568
				3569	#define BTRFS_MAX_DEDUPE_LEN SZ_16M
				3570
				3571	static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
				3572	struct inode *dst, u64 dst_loff)
				3573	{
				3574	int ret;
				3575	struct cmp_pages cmp;
				3576	int num_pages = PAGE_ALIGN(BTRFS_MAX_DEDUPE_LEN) >> PAGE_SHIFT;
				3577	bool same_inode = (src == dst);
				3578	u64 i, tail_len, chunk_count;
				3579
				3580	if (olen == 0)
				3581	return 0;
				3582
				3583	if (same_inode)
				3584	inode_lock(src);
				3585	else
				3586	btrfs_double_inode_lock(src, dst);
				3587
				3588	/* don't make the dst file partly checksummed */
				3589	if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
				3590	(BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) {
				3591	ret = -EINVAL;
				3592	goto out_unlock;
				3593	}
				3594
				3595	tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
				3596	chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
				3597	if (chunk_count == 0)
				3598	num_pages = PAGE_ALIGN(tail_len) >> PAGE_SHIFT;
				3599
				3600	/*
				3601	* If deduping ranges in the same inode, locking rules make it
				3602	* mandatory to always lock pages in ascending order to avoid deadlocks
				3603	* with concurrent tasks (such as starting writeback/delalloc).
				3604	*/
				3605	if (same_inode && dst_loff < loff)
				3606	swap(loff, dst_loff);
				3607
				3608	/*
				3609	* We must gather up all the pages before we initiate our extent
				3610	* locking. We use an array for the page pointers. Size of the array is
				3611	* bounded by len, which is in turn bounded by BTRFS_MAX_DEDUPE_LEN.
				3612	*/
				3613	cmp.src_pages = kvmalloc_array(num_pages, sizeof(struct page *),
				3614	GFP_KERNEL \| __GFP_ZERO);
				3615	cmp.dst_pages = kvmalloc_array(num_pages, sizeof(struct page *),
				3616	GFP_KERNEL \| __GFP_ZERO);
				3617	if (!cmp.src_pages \|\| !cmp.dst_pages) {
				3618	ret = -ENOMEM;
				3619	goto out_free;
				3620	}
				3621
				3622	for (i = 0; i < chunk_count; i++) {
				3623	ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
				3624	dst, dst_loff, &cmp);
				3625	if (ret)
				3626	goto out_free;
				3627
				3628	loff += BTRFS_MAX_DEDUPE_LEN;
				3629	dst_loff += BTRFS_MAX_DEDUPE_LEN;
				3630	}
				3631
				3632	if (tail_len > 0)
				3633	ret = btrfs_extent_same_range(src, loff, tail_len, dst,
				3634	dst_loff, &cmp);
				3635
				3636	out_free:
				3637	kvfree(cmp.src_pages);
				3638	kvfree(cmp.dst_pages);
				3639
				3640	out_unlock:
				3641	if (same_inode)
				3642	inode_unlock(src);
				3643	else
				3644	btrfs_double_inode_unlock(src, dst);
				3645
				3646	return ret;
				3647	}
				3648
				3649	int btrfs_dedupe_file_range(struct file *src_file, loff_t src_loff,
				3650	struct file *dst_file, loff_t dst_loff,
				3651	u64 olen)
				3652	{
				3653	struct inode *src = file_inode(src_file);
				3654	struct inode *dst = file_inode(dst_file);
				3655	u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
				3656
				3657	if (WARN_ON_ONCE(bs < PAGE_SIZE)) {
				3658	/*
				3659	* Btrfs does not support blocksize < page_size. As a
				3660	* result, btrfs_cmp_data() won't correctly handle
				3661	* this situation without an update.
				3662	*/
				3663	return -EINVAL;
				3664	}
				3665
				3666	return btrfs_extent_same(src, src_loff, olen, dst, dst_loff);
				3667	}
				3668
				3669	static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
				3670	struct inode *inode,
				3671	u64 endoff,
				3672	const u64 destoff,
				3673	const u64 olen,
				3674	int no_time_update)
				3675	{
				3676	struct btrfs_root *root = BTRFS_I(inode)->root;
				3677	int ret;
				3678
				3679	inode_inc_iversion(inode);
				3680	if (!no_time_update)
				3681	inode->i_mtime = inode->i_ctime = current_time(inode);
				3682	/*
				3683	* We round up to the block size at eof when determining which
				3684	* extents to clone above, but shouldn't round up the file size.
				3685	*/
				3686	if (endoff > destoff + olen)
				3687	endoff = destoff + olen;
				3688	if (endoff > inode->i_size)
				3689	btrfs_i_size_write(BTRFS_I(inode), endoff);
				3690
				3691	ret = btrfs_update_inode(trans, root, inode);
				3692	if (ret) {
				3693	btrfs_abort_transaction(trans, ret);
				3694	btrfs_end_transaction(trans);
				3695	goto out;
				3696	}
				3697	ret = btrfs_end_transaction(trans);
				3698	out:
				3699	return ret;
				3700	}
				3701
				3702	static void clone_update_extent_map(struct btrfs_inode *inode,
				3703	const struct btrfs_trans_handle *trans,
				3704	const struct btrfs_path *path,
				3705	const u64 hole_offset,
				3706	const u64 hole_len)
				3707	{
				3708	struct extent_map_tree *em_tree = &inode->extent_tree;
				3709	struct extent_map *em;
				3710	int ret;
				3711
				3712	em = alloc_extent_map();
				3713	if (!em) {
				3714	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
				3715	return;
				3716	}
				3717
				3718	if (path) {
				3719	struct btrfs_file_extent_item *fi;
				3720
				3721	fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
				3722	struct btrfs_file_extent_item);
				3723	btrfs_extent_item_to_extent_map(inode, path, fi, false, em);
				3724	em->generation = -1;
				3725	if (btrfs_file_extent_type(path->nodes[0], fi) ==
				3726	BTRFS_FILE_EXTENT_INLINE)
				3727	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
				3728	&inode->runtime_flags);
				3729	} else {
				3730	em->start = hole_offset;
				3731	em->len = hole_len;
				3732	em->ram_bytes = em->len;
				3733	em->orig_start = hole_offset;
				3734	em->block_start = EXTENT_MAP_HOLE;
				3735	em->block_len = 0;
				3736	em->orig_block_len = 0;
				3737	em->compress_type = BTRFS_COMPRESS_NONE;
				3738	em->generation = trans->transid;
				3739	}
				3740
				3741	while (1) {
				3742	write_lock(&em_tree->lock);
				3743	ret = add_extent_mapping(em_tree, em, 1);
				3744	write_unlock(&em_tree->lock);
				3745	if (ret != -EEXIST) {
				3746	free_extent_map(em);
				3747	break;
				3748	}
				3749	btrfs_drop_extent_cache(inode, em->start,
				3750	em->start + em->len - 1, 0);
				3751	}
				3752
				3753	if (ret)
				3754	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
				3755	}
				3756
				3757	/*
				3758	* Make sure we do not end up inserting an inline extent into a file that has
				3759	* already other (non-inline) extents. If a file has an inline extent it can
				3760	* not have any other extents and the (single) inline extent must start at the
				3761	* file offset 0. Failing to respect these rules will lead to file corruption,
				3762	* resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc
				3763	*
				3764	* We can have extents that have been already written to disk or we can have
				3765	* dirty ranges still in delalloc, in which case the extent maps and items are
				3766	* created only when we run delalloc, and the delalloc ranges might fall outside
				3767	* the range we are currently locking in the inode's io tree. So we check the
				3768	* inode's i_size because of that (i_size updates are done while holding the
				3769	* i_mutex, which we are holding here).
				3770	* We also check to see if the inode has a size not greater than "datal" but has
				3771	* extents beyond it, due to a fallocate with FALLOC_FL_KEEP_SIZE (and we are
				3772	* protected against such concurrent fallocate calls by the i_mutex).
				3773	*
				3774	* If the file has no extents but a size greater than datal, do not allow the
				3775	* copy because we would need to turn the inline extent into a non-inline one (even
				3776	* with NO_HOLES enabled). If we find our destination inode only has one inline
				3777	* extent, just overwrite it with the source inline extent if its size is less
				3778	* than the source extent's size, or we could copy the source inline extent's
				3779	* data into the destination inode's inline extent if the latter is greater than
				3780	* the former.
				*
				* @dst:         destination inode
				* @trans:       transaction used to drop extents and insert the new item
				* @path:        path used for the lookups and for the insertion
				* @new_key:     key of the item to insert; a non-zero offset is rejected
				* @drop_start:  start of the range handed to btrfs_drop_extents()
				* @datal:       data length of the source inline extent
				* @skip:        number of data bytes to skip at the start of the inline data
				* @size:        size of the item to insert
				* @inline_data: buffer holding the item contents to write
				*
				* Returns 0 on success, -EOPNOTSUPP for the unsupported layouts described
				* above, or a negative error from the tree operations.
				3781	*/
				3782	static int clone_copy_inline_extent(struct inode *dst,
				3783	struct btrfs_trans_handle *trans,
				3784	struct btrfs_path *path,
				3785	struct btrfs_key *new_key,
				3786	const u64 drop_start,
				3787	const u64 datal,
				3788	const u64 skip,
				3789	const u64 size,
				3790	char *inline_data)
				3791	{
				3792	struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
				3793	struct btrfs_root *root = BTRFS_I(dst)->root;
				3794	const u64 aligned_end = ALIGN(new_key->offset + datal,
				3795	fs_info->sectorsize);
				3796	int ret;
				3797	struct btrfs_key key;
				3798
				/* An inline extent is only accepted at file offset 0. */
				3799	if (new_key->offset > 0)
				3800	return -EOPNOTSUPP;
				3801
				/* Look for an existing extent item at offset 0 of the destination. */
				3802	key.objectid = btrfs_ino(BTRFS_I(dst));
				3803	key.type = BTRFS_EXTENT_DATA_KEY;
				3804	key.offset = 0;
				3805	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				3806	if (ret < 0) {
				3807	return ret;
				3808	} else if (ret > 0) {
				/* No item at offset 0: reject if the next item is an extent of this inode. */
				3809	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
				3810	ret = btrfs_next_leaf(root, path);
				3811	if (ret < 0)
				3812	return ret;
				3813	else if (ret > 0)
				3814	goto copy_inline_extent;
				3815	}
				3816	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				3817	if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
				3818	key.type == BTRFS_EXTENT_DATA_KEY) {
				3819	ASSERT(key.offset > 0);
				3820	return -EOPNOTSUPP;
				3821	}
				3822	} else if (i_size_read(dst) <= datal) {
				3823	struct btrfs_file_extent_item *ei;
				3824	u64 ext_len;
				3825
				3826	/*
				3827	* If the file size is <= datal, make sure there are no other
				3828	* extents following (can happen due to a fallocate call with
				3829	* the flag FALLOC_FL_KEEP_SIZE).
				3830	*/
				3831	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
				3832	struct btrfs_file_extent_item);
				3833	/*
				3834	* If it's an inline extent, it can not have other extents
				3835	* following it.
				3836	*/
				3837	if (btrfs_file_extent_type(path->nodes[0], ei) ==
				3838	BTRFS_FILE_EXTENT_INLINE)
				3839	goto copy_inline_extent;
				3840
				3841	ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
				3842	if (ext_len > aligned_end)
				3843	return -EOPNOTSUPP;
				3844
				3845	ret = btrfs_next_item(root, path);
				3846	if (ret < 0) {
				3847	return ret;
				3848	} else if (ret == 0) {
				3849	btrfs_item_key_to_cpu(path->nodes[0], &key,
				3850	path->slots[0]);
				3851	if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
				3852	key.type == BTRFS_EXTENT_DATA_KEY)
				3853	return -EOPNOTSUPP;
				3854	}
				3855	}
				3856
				3857	copy_inline_extent:
				3858	/*
				3859	* We have no extent items, or we have an extent at offset 0 which may
				3860	* or may not be inlined. All these cases are dealt the same way.
				3861	*/
				3862	if (i_size_read(dst) > datal) {
				3863	/*
				3864	* If the destination inode has an inline extent...
				3865	* This would require copying the data from the source inline
				3866	* extent into the beginning of the destination's inline extent.
				3867	* But this is really complex, both extents can be compressed
				3868	* or just one of them, which would require decompressing and
				3869	* re-compressing data (which could increase the new compressed
				3870	* size, not allowing the compressed data to fit anymore in an
				3871	* inline extent).
				3872	* So just don't support this case for now (it should be rare,
				3873	* we are not really saving space when cloning inline extents).
				3874	*/
				3875	return -EOPNOTSUPP;
				3876	}
				3877
				/* Drop whatever the destination has in the range, then insert the new item. */
				3878	btrfs_release_path(path);
				3879	ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
				3880	if (ret)
				3881	return ret;
				3882	ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
				3883	if (ret)
				3884	return ret;
				3885
				3886	if (skip) {
				/* Shift the data left over the skipped bytes; the item header stays in place. */
				3887	const u32 start = btrfs_file_extent_calc_inline_size(0);
				3888
				3889	memmove(inline_data + start, inline_data + start + skip, datal);
				3890	}
				3891
				3892	write_extent_buffer(path->nodes[0], inline_data,
				3893	btrfs_item_ptr_offset(path->nodes[0],
				3894	path->slots[0]),
				3895	size);
				3896	inode_add_bytes(dst, datal);
				3897
				3898	return 0;
				3899	}
				3900
				3901	/**
				3902	* btrfs_clone() - clone a range from inode file to another
				3903	*
				3904	* @src: Inode to clone from
				3905	* @inode: Inode to clone to
				3906	* @off: Offset within source to start clone from
				3907	* @olen: Original length, passed by user, of range to clone
				3908	* @olen_aligned: Block-aligned value of olen
				3909	* @destoff: Offset within @inode to start clone
				3910	* @no_time_update: Whether to update mtime/ctime on the target inode
				3911	*/
				3912	static int btrfs_clone(struct inode src, struct inode inode,
				3913	const u64 off, const u64 olen, const u64 olen_aligned,
				3914	const u64 destoff, int no_time_update)
				3915	{
				3916	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				3917	struct btrfs_root *root = BTRFS_I(inode)->root;
				3918	struct btrfs_path *path = NULL;
				3919	struct extent_buffer *leaf;
				3920	struct btrfs_trans_handle *trans;
				3921	char *buf = NULL;
				3922	struct btrfs_key key;
				3923	u32 nritems;
				3924	int slot;
				3925	int ret;
				3926	const u64 len = olen_aligned;
				3927	u64 last_dest_end = destoff;
				3928
				3929	ret = -ENOMEM;
				3930	buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
				3931	if (!buf)
				3932	return ret;
				3933
				3934	path = btrfs_alloc_path();
				3935	if (!path) {
				3936	kvfree(buf);
				3937	return ret;
				3938	}
				3939
				3940	path->reada = READA_FORWARD;
				3941	/* clone data */
				3942	key.objectid = btrfs_ino(BTRFS_I(src));
				3943	key.type = BTRFS_EXTENT_DATA_KEY;
				3944	key.offset = off;
				3945
				3946	while (1) {
				3947	u64 next_key_min_offset = key.offset + 1;
				3948
				3949	/*
				3950	* note the key will change type as we walk through the
				3951	* tree.
				3952	*/
				3953	path->leave_spinning = 1;
				3954	ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
				3955	0, 0);
				3956	if (ret < 0)
				3957	goto out;
				3958	/*
				3959	* First search, if no extent item that starts at offset off was
				3960	* found but the previous item is an extent item, it's possible
				3961	* it might overlap our target range, therefore process it.
				3962	*/
				3963	if (key.offset == off && ret > 0 && path->slots[0] > 0) {
				3964	btrfs_item_key_to_cpu(path->nodes[0], &key,
				3965	path->slots[0] - 1);
				3966	if (key.type == BTRFS_EXTENT_DATA_KEY)
				3967	path->slots[0]--;
				3968	}
				3969
				3970	nritems = btrfs_header_nritems(path->nodes[0]);
				3971	process_slot:
				3972	if (path->slots[0] >= nritems) {
				3973	ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
				3974	if (ret < 0)
				3975	goto out;
				3976	if (ret > 0)
				3977	break;
				3978	nritems = btrfs_header_nritems(path->nodes[0]);
				3979	}
				3980	leaf = path->nodes[0];
				3981	slot = path->slots[0];
				3982
				3983	btrfs_item_key_to_cpu(leaf, &key, slot);
				3984	if (key.type > BTRFS_EXTENT_DATA_KEY \|\|
				3985	key.objectid != btrfs_ino(BTRFS_I(src)))
				3986	break;
				3987
				3988	if (key.type == BTRFS_EXTENT_DATA_KEY) {
				3989	struct btrfs_file_extent_item *extent;
				3990	int type;
				3991	u32 size;
				3992	struct btrfs_key new_key;
				3993	u64 disko = 0, diskl = 0;
				3994	u64 datao = 0, datal = 0;
				3995	u8 comp;
				3996	u64 drop_start;
				3997
				3998	extent = btrfs_item_ptr(leaf, slot,
				3999	struct btrfs_file_extent_item);
				4000	comp = btrfs_file_extent_compression(leaf, extent);
				4001	type = btrfs_file_extent_type(leaf, extent);
				4002	if (type == BTRFS_FILE_EXTENT_REG \|\|
				4003	type == BTRFS_FILE_EXTENT_PREALLOC) {
				4004	disko = btrfs_file_extent_disk_bytenr(leaf,
				4005	extent);
				4006	diskl = btrfs_file_extent_disk_num_bytes(leaf,
				4007	extent);
				4008	datao = btrfs_file_extent_offset(leaf, extent);
				4009	datal = btrfs_file_extent_num_bytes(leaf,
				4010	extent);
				4011	} else if (type == BTRFS_FILE_EXTENT_INLINE) {
				4012	/* take upper bound, may be compressed */
				4013	datal = btrfs_file_extent_ram_bytes(leaf,
				4014	extent);
				4015	}
				4016
				4017	/*
				4018	* The first search might have left us at an extent
				4019	* item that ends before our target range's start, can
				4020	* happen if we have holes and NO_HOLES feature enabled.
				4021	*/
				4022	if (key.offset + datal <= off) {
				4023	path->slots[0]++;
				4024	goto process_slot;
				4025	} else if (key.offset >= off + len) {
				4026	break;
				4027	}
				4028	next_key_min_offset = key.offset + datal;
				4029	size = btrfs_item_size_nr(leaf, slot);
				4030	read_extent_buffer(leaf, buf,
				4031	btrfs_item_ptr_offset(leaf, slot),
				4032	size);
				4033
				4034	btrfs_release_path(path);
				4035	path->leave_spinning = 0;
				4036
				4037	memcpy(&new_key, &key, sizeof(new_key));
				4038	new_key.objectid = btrfs_ino(BTRFS_I(inode));
				4039	if (off <= key.offset)
				4040	new_key.offset = key.offset + destoff - off;
				4041	else
				4042	new_key.offset = destoff;
				4043
				4044	/*
				4045	* Deal with a hole that doesn't have an extent item
				4046	* that represents it (NO_HOLES feature enabled).
				4047	* This hole is either in the middle of the cloning
				4048	* range or at the beginning (fully overlaps it or
				4049	* partially overlaps it).
				4050	*/
				4051	if (new_key.offset != last_dest_end)
				4052	drop_start = last_dest_end;
				4053	else
				4054	drop_start = new_key.offset;
				4055
				4056	/*
				4057	* 1 - adjusting old extent (we may have to split it)
				4058	* 1 - add new extent
				4059	* 1 - inode update
				4060	*/
				4061	trans = btrfs_start_transaction(root, 3);
				4062	if (IS_ERR(trans)) {
				4063	ret = PTR_ERR(trans);
				4064	goto out;
				4065	}
				4066
				4067	if (type == BTRFS_FILE_EXTENT_REG \|\|
				4068	type == BTRFS_FILE_EXTENT_PREALLOC) {
				4069	/*
				4070	* a \| --- range to clone ---\| b
				4071	* \| ------------- extent ------------- \|
				4072	*/
				4073
				4074	/* subtract range b */
				4075	if (key.offset + datal > off + len)
				4076	datal = off + len - key.offset;
				4077
				4078	/* subtract range a */
				4079	if (off > key.offset) {
				4080	datao += off - key.offset;
				4081	datal -= off - key.offset;
				4082	}
				4083
				4084	ret = btrfs_drop_extents(trans, root, inode,
				4085	drop_start,
				4086	new_key.offset + datal,
				4087	1);
				4088	if (ret) {
				4089	if (ret != -EOPNOTSUPP)
				4090	btrfs_abort_transaction(trans,
				4091	ret);
				4092	btrfs_end_transaction(trans);
				4093	goto out;
				4094	}
				4095
				4096	ret = btrfs_insert_empty_item(trans, root, path,
				4097	&new_key, size);
				4098	if (ret) {
				4099	btrfs_abort_transaction(trans, ret);
				4100	btrfs_end_transaction(trans);
				4101	goto out;
				4102	}
				4103
				4104	leaf = path->nodes[0];
				4105	slot = path->slots[0];
				4106	write_extent_buffer(leaf, buf,
				4107	btrfs_item_ptr_offset(leaf, slot),
				4108	size);
				4109
				4110	extent = btrfs_item_ptr(leaf, slot,
				4111	struct btrfs_file_extent_item);
				4112
				4113	/* disko == 0 means it's a hole */
				4114	if (!disko)
				4115	datao = 0;
				4116
				4117	btrfs_set_file_extent_offset(leaf, extent,
				4118	datao);
				4119	btrfs_set_file_extent_num_bytes(leaf, extent,
				4120	datal);
				4121
				4122	if (disko) {
				4123	inode_add_bytes(inode, datal);
				4124	ret = btrfs_inc_extent_ref(trans,
				4125	root,
				4126	disko, diskl, 0,
				4127	root->root_key.objectid,
				4128	btrfs_ino(BTRFS_I(inode)),
				4129	new_key.offset - datao);
				4130	if (ret) {
				4131	btrfs_abort_transaction(trans,
				4132	ret);
				4133	btrfs_end_transaction(trans);
				4134	goto out;
				4135
				4136	}
				4137	}
				4138	} else if (type == BTRFS_FILE_EXTENT_INLINE) {
				4139	u64 skip = 0;
				4140	u64 trim = 0;
				4141
				4142	if (off > key.offset) {
				4143	skip = off - key.offset;
				4144	new_key.offset += skip;
				4145	}
				4146
				4147	if (key.offset + datal > off + len)
				4148	trim = key.offset + datal - (off + len);
				4149
				4150	if (comp && (skip \|\| trim)) {
				4151	ret = -EINVAL;
				4152	btrfs_end_transaction(trans);
				4153	goto out;
				4154	}
				4155	size -= skip + trim;
				4156	datal -= skip + trim;
				4157
				4158	ret = clone_copy_inline_extent(inode,
				4159	trans, path,
				4160	&new_key,
				4161	drop_start,
				4162	datal,
				4163	skip, size, buf);
				4164	if (ret) {
				4165	if (ret != -EOPNOTSUPP)
				4166	btrfs_abort_transaction(trans,
				4167	ret);
				4168	btrfs_end_transaction(trans);
				4169	goto out;
				4170	}
				4171	leaf = path->nodes[0];
				4172	slot = path->slots[0];
				4173	}
				4174
				4175	/* If we have an implicit hole (NO_HOLES feature). */
				4176	if (drop_start < new_key.offset)
				4177	clone_update_extent_map(BTRFS_I(inode), trans,
				4178	NULL, drop_start,
				4179	new_key.offset - drop_start);
				4180
				4181	clone_update_extent_map(BTRFS_I(inode), trans,
				4182	path, 0, 0);
				4183
				4184	btrfs_mark_buffer_dirty(leaf);
				4185	btrfs_release_path(path);
				4186
				4187	last_dest_end = ALIGN(new_key.offset + datal,
				4188	fs_info->sectorsize);
				4189	ret = clone_finish_inode_update(trans, inode,
				4190	last_dest_end,
				4191	destoff, olen,
				4192	no_time_update);
				4193	if (ret)
				4194	goto out;
				4195	if (new_key.offset + datal >= destoff + len)
				4196	break;
				4197	}
				4198	btrfs_release_path(path);
				4199	key.offset = next_key_min_offset;
				4200
				4201	if (fatal_signal_pending(current)) {
				4202	ret = -EINTR;
				4203	goto out;
				4204	}
				4205	}
				4206	ret = 0;
				4207
				4208	if (last_dest_end < destoff + len) {
				4209	/*
				4210	* We have an implicit hole (NO_HOLES feature is enabled) that
				4211	* fully or partially overlaps our cloning range at its end.
				4212	*/
				4213	btrfs_release_path(path);
				4214
				4215	/*
				4216	* 1 - remove extent(s)
				4217	* 1 - inode update
				4218	*/
				4219	trans = btrfs_start_transaction(root, 2);
				4220	if (IS_ERR(trans)) {
				4221	ret = PTR_ERR(trans);
				4222	goto out;
				4223	}
				4224	ret = btrfs_drop_extents(trans, root, inode,
				4225	last_dest_end, destoff + len, 1);
				4226	if (ret) {
				4227	if (ret != -EOPNOTSUPP)
				4228	btrfs_abort_transaction(trans, ret);
				4229	btrfs_end_transaction(trans);
				4230	goto out;
				4231	}
				4232	clone_update_extent_map(BTRFS_I(inode), trans, NULL,
				4233	last_dest_end,
				4234	destoff + len - last_dest_end);
				4235	ret = clone_finish_inode_update(trans, inode, destoff + len,
				4236	destoff, olen, no_time_update);
				4237	}
				4238
				4239	out:
				4240	btrfs_free_path(path);
				4241	kvfree(buf);
				4242	return ret;
				4243	}
				4244
				4245	static noinline int btrfs_clone_files(struct file file, struct file file_src,
				4246	u64 off, u64 olen, u64 destoff)
				4247	{
				4248	struct inode *inode = file_inode(file);
				4249	struct inode *src = file_inode(file_src);
				4250	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				4251	struct btrfs_root *root = BTRFS_I(inode)->root;
				4252	int ret;
				4253	u64 len = olen;
				4254	u64 bs = fs_info->sb->s_blocksize;
				4255	int same_inode = src == inode;
				4256
				4257	/*
				4258	* TODO:
				4259	* - split compressed inline extents. annoying: we need to
				4260	* decompress into destination's address_space (the file offset
				4261	* may change, so source mapping won't do), then recompress (or
				4262	* otherwise reinsert) a subrange.
				4263	*
				4264	* - split destination inode's inline extents. The inline extents can
				4265	* be either compressed or non-compressed.
				4266	*/
				4267
				4268	if (btrfs_root_readonly(root))
				4269	return -EROFS;
				4270
				4271	if (file_src->f_path.mnt != file->f_path.mnt \|\|
				4272	src->i_sb != inode->i_sb)
				4273	return -EXDEV;
				4274
				4275	if (S_ISDIR(src->i_mode) \|\| S_ISDIR(inode->i_mode))
				4276	return -EISDIR;
				4277
				4278	if (!same_inode) {
				4279	btrfs_double_inode_lock(src, inode);
				4280	} else {
				4281	inode_lock(src);
				4282	}
				4283
				4284	/* don't make the dst file partly checksummed */
				4285	if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
				4286	(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
				4287	ret = -EINVAL;
				4288	goto out_unlock;
				4289	}
				4290
				4291	/* determine range to clone */
				4292	ret = -EINVAL;
				4293	if (off + len > src->i_size \|\| off + len < off)
				4294	goto out_unlock;
				4295	if (len == 0)
				4296	olen = len = src->i_size - off;
				4297	/*
				4298	* If we extend to eof, continue to block boundary if and only if the
				4299	* destination end offset matches the destination file's size, otherwise
				4300	* we would be corrupting data by placing the eof block into the middle
				4301	* of a file.
				4302	*/
				4303	if (off + len == src->i_size) {
				4304	if (!IS_ALIGNED(len, bs) && destoff + len < inode->i_size)
				4305	goto out_unlock;
				4306	len = ALIGN(src->i_size, bs) - off;
				4307	}
				4308
				4309	if (len == 0) {
				4310	ret = 0;
				4311	goto out_unlock;
				4312	}
				4313
				4314	/* verify the end result is block aligned */
				4315	if (!IS_ALIGNED(off, bs) \|\| !IS_ALIGNED(off + len, bs) \|\|
				4316	!IS_ALIGNED(destoff, bs))
				4317	goto out_unlock;
				4318
				4319	/* verify if ranges are overlapped within the same file */
				4320	if (same_inode) {
				4321	if (destoff + len > off && destoff < off + len)
				4322	goto out_unlock;
				4323	}
				4324
				4325	if (destoff > inode->i_size) {
				4326	ret = btrfs_cont_expand(inode, inode->i_size, destoff);
				4327	if (ret)
				4328	goto out_unlock;
				4329	}
				4330
				4331	/*
				4332	* Lock the target range too. Right after we replace the file extent
				4333	* items in the fs tree (which now point to the cloned data), we might
				4334	* have a worker replace them with extent items relative to a write
				4335	* operation that was issued before this clone operation (i.e. confront
				4336	* with inode.c:btrfs_finish_ordered_io).
				4337	*/
				4338	if (same_inode) {
				4339	u64 lock_start = min_t(u64, off, destoff);
				4340	u64 lock_len = max_t(u64, off, destoff) + len - lock_start;
				4341
				4342	ret = lock_extent_range(src, lock_start, lock_len, true);
				4343	} else {
				4344	ret = btrfs_double_extent_lock(src, off, inode, destoff, len,
				4345	true);
				4346	}
				4347	ASSERT(ret == 0);
				4348	if (WARN_ON(ret)) {
				4349	/* ranges in the io trees already unlocked */
				4350	goto out_unlock;
				4351	}
				4352
				4353	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
				4354
				4355	if (same_inode) {
				4356	u64 lock_start = min_t(u64, off, destoff);
				4357	u64 lock_end = max_t(u64, off, destoff) + len - 1;
				4358
				4359	unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end);
				4360	} else {
				4361	btrfs_double_extent_unlock(src, off, inode, destoff, len);
				4362	}
				4363	/*
				4364	* Truncate page cache pages so that future reads will see the cloned
				4365	* data immediately and not the previous data.
				4366	*/
				4367	truncate_inode_pages_range(&inode->i_data,
				4368	round_down(destoff, PAGE_SIZE),
				4369	round_up(destoff + len, PAGE_SIZE) - 1);
				4370	out_unlock:
				4371	if (!same_inode)
				4372	btrfs_double_inode_unlock(src, inode);
				4373	else
				4374	inode_unlock(src);
				4375	return ret;
				4376	}
				4377
				4378	int btrfs_clone_file_range(struct file *src_file, loff_t off,
				4379	struct file *dst_file, loff_t destoff, u64 len)
				4380	{
				4381	return btrfs_clone_files(dst_file, src_file, off, len, destoff);
				4382	}
				4383
				4384	static long btrfs_ioctl_default_subvol(struct file file, void __user argp)
				4385	{
				4386	struct inode *inode = file_inode(file);
				4387	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				4388	struct btrfs_root *root = BTRFS_I(inode)->root;
				4389	struct btrfs_root *new_root;
				4390	struct btrfs_dir_item *di;
				4391	struct btrfs_trans_handle *trans;
				4392	struct btrfs_path *path;
				4393	struct btrfs_key location;
				4394	struct btrfs_disk_key disk_key;
				4395	u64 objectid = 0;
				4396	u64 dir_id;
				4397	int ret;
				4398
				4399	if (!capable(CAP_SYS_ADMIN))
				4400	return -EPERM;
				4401
				4402	ret = mnt_want_write_file(file);
				4403	if (ret)
				4404	return ret;
				4405
				4406	if (copy_from_user(&objectid, argp, sizeof(objectid))) {
				4407	ret = -EFAULT;
				4408	goto out;
				4409	}
				4410
				4411	if (!objectid)
				4412	objectid = BTRFS_FS_TREE_OBJECTID;
				4413
				4414	location.objectid = objectid;
				4415	location.type = BTRFS_ROOT_ITEM_KEY;
				4416	location.offset = (u64)-1;
				4417
				4418	new_root = btrfs_read_fs_root_no_name(fs_info, &location);
				4419	if (IS_ERR(new_root)) {
				4420	ret = PTR_ERR(new_root);
				4421	goto out;
				4422	}
				4423	if (!is_fstree(new_root->objectid)) {
				4424	ret = -ENOENT;
				4425	goto out;
				4426	}
				4427
				4428	path = btrfs_alloc_path();
				4429	if (!path) {
				4430	ret = -ENOMEM;
				4431	goto out;
				4432	}
				4433	path->leave_spinning = 1;
				4434
				4435	trans = btrfs_start_transaction(root, 1);
				4436	if (IS_ERR(trans)) {
				4437	btrfs_free_path(path);
				4438	ret = PTR_ERR(trans);
				4439	goto out;
				4440	}
				4441
				4442	dir_id = btrfs_super_root_dir(fs_info->super_copy);
				4443	di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path,
				4444	dir_id, "default", 7, 1);
				4445	if (IS_ERR_OR_NULL(di)) {
				4446	btrfs_free_path(path);
				4447	btrfs_end_transaction(trans);
				4448	btrfs_err(fs_info,
				4449	"Umm, you don't have the default diritem, this isn't going to work");
				4450	ret = -ENOENT;
				4451	goto out;
				4452	}
				4453
				4454	btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
				4455	btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
				4456	btrfs_mark_buffer_dirty(path->nodes[0]);
				4457	btrfs_free_path(path);
				4458
				4459	btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL);
				4460	btrfs_end_transaction(trans);
				4461	out:
				4462	mnt_drop_write_file(file);
				4463	return ret;
				4464	}
				4465
				4466	static void get_block_group_info(struct list_head *groups_list,
				4467	struct btrfs_ioctl_space_info *space)
				4468	{
				4469	struct btrfs_block_group_cache *block_group;
				4470
				4471	space->total_bytes = 0;
				4472	space->used_bytes = 0;
				4473	space->flags = 0;
				4474	list_for_each_entry(block_group, groups_list, list) {
				4475	space->flags = block_group->flags;
				4476	space->total_bytes += block_group->key.offset;
				4477	space->used_bytes +=
				4478	btrfs_block_group_used(&block_group->item);
				4479	}
				4480	}
				4481
				4482	static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
				4483	void __user *arg)
				4484	{
				4485	struct btrfs_ioctl_space_args space_args;
				4486	struct btrfs_ioctl_space_info space;
				4487	struct btrfs_ioctl_space_info *dest;
				4488	struct btrfs_ioctl_space_info *dest_orig;
				4489	struct btrfs_ioctl_space_info __user *user_dest;
				4490	struct btrfs_space_info *info;
				4491	static const u64 types[] = {
				4492	BTRFS_BLOCK_GROUP_DATA,
				4493	BTRFS_BLOCK_GROUP_SYSTEM,
				4494	BTRFS_BLOCK_GROUP_METADATA,
				4495	BTRFS_BLOCK_GROUP_DATA \| BTRFS_BLOCK_GROUP_METADATA
				4496	};
				4497	int num_types = 4;
				4498	int alloc_size;
				4499	int ret = 0;
				4500	u64 slot_count = 0;
				4501	int i, c;
				4502
				4503	if (copy_from_user(&space_args,
				4504	(struct btrfs_ioctl_space_args __user *)arg,
				4505	sizeof(space_args)))
				4506	return -EFAULT;
				4507
				4508	for (i = 0; i < num_types; i++) {
				4509	struct btrfs_space_info *tmp;
				4510
				4511	info = NULL;
				4512	rcu_read_lock();
				4513	list_for_each_entry_rcu(tmp, &fs_info->space_info,
				4514	list) {
				4515	if (tmp->flags == types[i]) {
				4516	info = tmp;
				4517	break;
				4518	}
				4519	}
				4520	rcu_read_unlock();
				4521
				4522	if (!info)
				4523	continue;
				4524
				4525	down_read(&info->groups_sem);
				4526	for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
				4527	if (!list_empty(&info->block_groups[c]))
				4528	slot_count++;
				4529	}
				4530	up_read(&info->groups_sem);
				4531	}
				4532
				4533	/*
				4534	* Global block reserve, exported as a space_info
				4535	*/
				4536	slot_count++;
				4537
				4538	/* space_slots == 0 means they are asking for a count */
				4539	if (space_args.space_slots == 0) {
				4540	space_args.total_spaces = slot_count;
				4541	goto out;
				4542	}
				4543
				4544	slot_count = min_t(u64, space_args.space_slots, slot_count);
				4545
				4546	alloc_size = sizeof(dest) slot_count;
				4547
				4548	/* we generally have at most 6 or so space infos, one for each raid
				4549	* level. So, a whole page should be more than enough for everyone
				4550	*/
				4551	if (alloc_size > PAGE_SIZE)
				4552	return -ENOMEM;
				4553
				4554	space_args.total_spaces = 0;
				4555	dest = kmalloc(alloc_size, GFP_KERNEL);
				4556	if (!dest)
				4557	return -ENOMEM;
				4558	dest_orig = dest;
				4559
				4560	/* now we have a buffer to copy into */
				4561	for (i = 0; i < num_types; i++) {
				4562	struct btrfs_space_info *tmp;
				4563
				4564	if (!slot_count)
				4565	break;
				4566
				4567	info = NULL;
				4568	rcu_read_lock();
				4569	list_for_each_entry_rcu(tmp, &fs_info->space_info,
				4570	list) {
				4571	if (tmp->flags == types[i]) {
				4572	info = tmp;
				4573	break;
				4574	}
				4575	}
				4576	rcu_read_unlock();
				4577
				4578	if (!info)
				4579	continue;
				4580	down_read(&info->groups_sem);
				4581	for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
				4582	if (!list_empty(&info->block_groups[c])) {
				4583	get_block_group_info(&info->block_groups[c],
				4584	&space);
				4585	memcpy(dest, &space, sizeof(space));
				4586	dest++;
				4587	space_args.total_spaces++;
				4588	slot_count--;
				4589	}
				4590	if (!slot_count)
				4591	break;
				4592	}
				4593	up_read(&info->groups_sem);
				4594	}
				4595
				4596	/*
				4597	* Add global block reserve
				4598	*/
				4599	if (slot_count) {
				4600	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
				4601
				4602	spin_lock(&block_rsv->lock);
				4603	space.total_bytes = block_rsv->size;
				4604	space.used_bytes = block_rsv->size - block_rsv->reserved;
				4605	spin_unlock(&block_rsv->lock);
				4606	space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV;
				4607	memcpy(dest, &space, sizeof(space));
				4608	space_args.total_spaces++;
				4609	}
				4610
				4611	user_dest = (struct btrfs_ioctl_space_info __user *)
				4612	(arg + sizeof(struct btrfs_ioctl_space_args));
				4613
				4614	if (copy_to_user(user_dest, dest_orig, alloc_size))
				4615	ret = -EFAULT;
				4616
				4617	kfree(dest_orig);
				4618	out:
				4619	if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args)))
				4620	ret = -EFAULT;
				4621
				4622	return ret;
				4623	}
				4624
				4625	static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
				4626	void __user *argp)
				4627	{
				4628	struct btrfs_trans_handle *trans;
				4629	u64 transid;
				4630	int ret;
				4631
				4632	trans = btrfs_attach_transaction_barrier(root);
				4633	if (IS_ERR(trans)) {
				4634	if (PTR_ERR(trans) != -ENOENT)
				4635	return PTR_ERR(trans);
				4636
				4637	/* No running transaction, don't bother */
				4638	transid = root->fs_info->last_trans_committed;
				4639	goto out;
				4640	}
				4641	transid = trans->transid;
				4642	ret = btrfs_commit_transaction_async(trans, 0);
				4643	if (ret) {
				4644	btrfs_end_transaction(trans);
				4645	return ret;
				4646	}
				4647	out:
				4648	if (argp)
				4649	if (copy_to_user(argp, &transid, sizeof(transid)))
				4650	return -EFAULT;
				4651	return 0;
				4652	}
				4653
				4654	static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info,
				4655	void __user *argp)
				4656	{
				4657	u64 transid;
				4658
				4659	if (argp) {
				4660	if (copy_from_user(&transid, argp, sizeof(transid)))
				4661	return -EFAULT;
				4662	} else {
				4663	transid = 0; /* current trans */
				4664	}
				4665	return btrfs_wait_for_commit(fs_info, transid);
				4666	}
				4667
				4668	static long btrfs_ioctl_scrub(struct file file, void __user arg)
				4669	{
				4670	struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
				4671	struct btrfs_ioctl_scrub_args *sa;
				4672	int ret;
				4673
				4674	if (!capable(CAP_SYS_ADMIN))
				4675	return -EPERM;
				4676
				4677	sa = memdup_user(arg, sizeof(*sa));
				4678	if (IS_ERR(sa))
				4679	return PTR_ERR(sa);
				4680
				4681	if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
				4682	ret = mnt_want_write_file(file);
				4683	if (ret)
				4684	goto out;
				4685	}
				4686
				4687	ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end,
				4688	&sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
				4689	0);
				4690
				4691	if (copy_to_user(arg, sa, sizeof(*sa)))
				4692	ret = -EFAULT;
				4693
				4694	if (!(sa->flags & BTRFS_SCRUB_READONLY))
				4695	mnt_drop_write_file(file);
				4696	out:
				4697	kfree(sa);
				4698	return ret;
				4699	}
				4700
				4701	static long btrfs_ioctl_scrub_cancel(struct btrfs_fs_info *fs_info)
				4702	{
				4703	if (!capable(CAP_SYS_ADMIN))
				4704	return -EPERM;
				4705
				4706	return btrfs_scrub_cancel(fs_info);
				4707	}
				4708
				4709	static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info,
				4710	void __user *arg)
				4711	{
				4712	struct btrfs_ioctl_scrub_args *sa;
				4713	int ret;
				4714
				4715	if (!capable(CAP_SYS_ADMIN))
				4716	return -EPERM;
				4717
				4718	sa = memdup_user(arg, sizeof(*sa));
				4719	if (IS_ERR(sa))
				4720	return PTR_ERR(sa);
				4721
				4722	ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress);
				4723
				4724	if (copy_to_user(arg, sa, sizeof(*sa)))
				4725	ret = -EFAULT;
				4726
				4727	kfree(sa);
				4728	return ret;
				4729	}
				4730
				4731	static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info,
				4732	void __user *arg)
				4733	{
				4734	struct btrfs_ioctl_get_dev_stats *sa;
				4735	int ret;
				4736
				4737	sa = memdup_user(arg, sizeof(*sa));
				4738	if (IS_ERR(sa))
				4739	return PTR_ERR(sa);
				4740
				4741	if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) {
				4742	kfree(sa);
				4743	return -EPERM;
				4744	}
				4745
				4746	ret = btrfs_get_dev_stats(fs_info, sa);
				4747
				4748	if (copy_to_user(arg, sa, sizeof(*sa)))
				4749	ret = -EFAULT;
				4750
				4751	kfree(sa);
				4752	return ret;
				4753	}
				4754
				4755	static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
				4756	void __user *arg)
				4757	{
				4758	struct btrfs_ioctl_dev_replace_args *p;
				4759	int ret;
				4760
				4761	if (!capable(CAP_SYS_ADMIN))
				4762	return -EPERM;
				4763
				4764	p = memdup_user(arg, sizeof(*p));
				4765	if (IS_ERR(p))
				4766	return PTR_ERR(p);
				4767
				4768	switch (p->cmd) {
				4769	case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
				4770	if (sb_rdonly(fs_info->sb)) {
				4771	ret = -EROFS;
				4772	goto out;
				4773	}
				4774	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
				4775	ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
				4776	} else {
				4777	ret = btrfs_dev_replace_by_ioctl(fs_info, p);
				4778	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
				4779	}
				4780	break;
				4781	case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
				4782	btrfs_dev_replace_status(fs_info, p);
				4783	ret = 0;
				4784	break;
				4785	case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
				4786	p->result = btrfs_dev_replace_cancel(fs_info);
				4787	ret = 0;
				4788	break;
				4789	default:
				4790	ret = -EINVAL;
				4791	break;
				4792	}
				4793
				4794	if (copy_to_user(arg, p, sizeof(*p)))
				4795	ret = -EFAULT;
				4796	out:
				4797	kfree(p);
				4798	return ret;
				4799	}
				4800
				4801	static long btrfs_ioctl_ino_to_path(struct btrfs_root root, void __user arg)
				4802	{
				4803	int ret = 0;
				4804	int i;
				4805	u64 rel_ptr;
				4806	int size;
				4807	struct btrfs_ioctl_ino_path_args *ipa = NULL;
				4808	struct inode_fs_paths *ipath = NULL;
				4809	struct btrfs_path *path;
				4810
				4811	if (!capable(CAP_DAC_READ_SEARCH))
				4812	return -EPERM;
				4813
				4814	path = btrfs_alloc_path();
				4815	if (!path) {
				4816	ret = -ENOMEM;
				4817	goto out;
				4818	}
				4819
				4820	ipa = memdup_user(arg, sizeof(*ipa));
				4821	if (IS_ERR(ipa)) {
				4822	ret = PTR_ERR(ipa);
				4823	ipa = NULL;
				4824	goto out;
				4825	}
				4826
				4827	size = min_t(u32, ipa->size, 4096);
				4828	ipath = init_ipath(size, root, path);
				4829	if (IS_ERR(ipath)) {
				4830	ret = PTR_ERR(ipath);
				4831	ipath = NULL;
				4832	goto out;
				4833	}
				4834
				4835	ret = paths_from_inode(ipa->inum, ipath);
				4836	if (ret < 0)
				4837	goto out;
				4838
				4839	for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
				4840	rel_ptr = ipath->fspath->val[i] -
				4841	(u64)(unsigned long)ipath->fspath->val;
				4842	ipath->fspath->val[i] = rel_ptr;
				4843	}
				4844
				4845	ret = copy_to_user((void __user *)(unsigned long)ipa->fspath,
				4846	ipath->fspath, size);
				4847	if (ret) {
				4848	ret = -EFAULT;
				4849	goto out;
				4850	}
				4851
				4852	out:
				4853	btrfs_free_path(path);
				4854	free_ipath(ipath);
				4855	kfree(ipa);
				4856
				4857	return ret;
				4858	}
				4859
				4860	static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
				4861	{
				4862	struct btrfs_data_container *inodes = ctx;
				4863	const size_t c = 3 * sizeof(u64);
				4864
				4865	if (inodes->bytes_left >= c) {
				4866	inodes->bytes_left -= c;
				4867	inodes->val[inodes->elem_cnt] = inum;
				4868	inodes->val[inodes->elem_cnt + 1] = offset;
				4869	inodes->val[inodes->elem_cnt + 2] = root;
				4870	inodes->elem_cnt += 3;
				4871	} else {
				4872	inodes->bytes_missing += c - inodes->bytes_left;
				4873	inodes->bytes_left = 0;
				4874	inodes->elem_missed += 3;
				4875	}
				4876
				4877	return 0;
				4878	}
				4879
				4880	static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
				4881	void __user *arg, int version)
				4882	{
				4883	int ret = 0;
				4884	int size;
				4885	struct btrfs_ioctl_logical_ino_args *loi;
				4886	struct btrfs_data_container *inodes = NULL;
				4887	struct btrfs_path *path = NULL;
				4888	bool ignore_offset;
				4889
				4890	if (!capable(CAP_SYS_ADMIN))
				4891	return -EPERM;
				4892
				4893	loi = memdup_user(arg, sizeof(*loi));
				4894	if (IS_ERR(loi))
				4895	return PTR_ERR(loi);
				4896
				4897	if (version == 1) {
				4898	ignore_offset = false;
				4899	size = min_t(u32, loi->size, SZ_64K);
				4900	} else {
				4901	/* All reserved bits must be 0 for now */
				4902	if (memchr_inv(loi->reserved, 0, sizeof(loi->reserved))) {
				4903	ret = -EINVAL;
				4904	goto out_loi;
				4905	}
				4906	/* Only accept flags we have defined so far */
				4907	if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) {
				4908	ret = -EINVAL;
				4909	goto out_loi;
				4910	}
				4911	ignore_offset = loi->flags & BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET;
				4912	size = min_t(u32, loi->size, SZ_16M);
				4913	}
				4914
				4915	path = btrfs_alloc_path();
				4916	if (!path) {
				4917	ret = -ENOMEM;
				4918	goto out;
				4919	}
				4920
				4921	inodes = init_data_container(size);
				4922	if (IS_ERR(inodes)) {
				4923	ret = PTR_ERR(inodes);
				4924	inodes = NULL;
				4925	goto out;
				4926	}
				4927
				4928	ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
				4929	build_ino_list, inodes, ignore_offset);
				4930	if (ret == -EINVAL)
				4931	ret = -ENOENT;
				4932	if (ret < 0)
				4933	goto out;
				4934
				4935	ret = copy_to_user((void __user *)(unsigned long)loi->inodes, inodes,
				4936	size);
				4937	if (ret)
				4938	ret = -EFAULT;
				4939
				4940	out:
				4941	btrfs_free_path(path);
				4942	kvfree(inodes);
				4943	out_loi:
				4944	kfree(loi);
				4945
				4946	return ret;
				4947	}
				4948
				4949	void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
				4950	struct btrfs_ioctl_balance_args *bargs)
				4951	{
				4952	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
				4953
				4954	bargs->flags = bctl->flags;
				4955
				4956	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags))
				4957	bargs->state \|= BTRFS_BALANCE_STATE_RUNNING;
				4958	if (atomic_read(&fs_info->balance_pause_req))
				4959	bargs->state \|= BTRFS_BALANCE_STATE_PAUSE_REQ;
				4960	if (atomic_read(&fs_info->balance_cancel_req))
				4961	bargs->state \|= BTRFS_BALANCE_STATE_CANCEL_REQ;
				4962
				4963	memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
				4964	memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
				4965	memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
				4966
				4967	spin_lock(&fs_info->balance_lock);
				4968	memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
				4969	spin_unlock(&fs_info->balance_lock);
				4970	}
				4971
				4972	static long btrfs_ioctl_balance(struct file file, void __user arg)
				4973	{
				4974	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
				4975	struct btrfs_fs_info *fs_info = root->fs_info;
				4976	struct btrfs_ioctl_balance_args *bargs;
				4977	struct btrfs_balance_control *bctl;
				4978	bool need_unlock; /* for mut. excl. ops lock */
				4979	int ret;
				4980
				4981	if (!capable(CAP_SYS_ADMIN))
				4982	return -EPERM;
				4983
				4984	ret = mnt_want_write_file(file);
				4985	if (ret)
				4986	return ret;
				4987
				4988	again:
				4989	if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
				4990	mutex_lock(&fs_info->balance_mutex);
				4991	need_unlock = true;
				4992	goto locked;
				4993	}
				4994
				4995	/*
				4996	* mut. excl. ops lock is locked. Three possibilities:
				4997	* (1) some other op is running
				4998	* (2) balance is running
				4999	* (3) balance is paused -- special case (think resume)
				5000	*/
				5001	mutex_lock(&fs_info->balance_mutex);
				5002	if (fs_info->balance_ctl) {
				5003	/* this is either (2) or (3) */
				5004	if (!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
				5005	mutex_unlock(&fs_info->balance_mutex);
				5006	/*
				5007	* Lock released to allow other waiters to continue,
				5008	* we'll reexamine the status again.
				5009	*/
				5010	mutex_lock(&fs_info->balance_mutex);
				5011
				5012	if (fs_info->balance_ctl &&
				5013	!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
				5014	/* this is (3) */
				5015	need_unlock = false;
				5016	goto locked;
				5017	}
				5018
				5019	mutex_unlock(&fs_info->balance_mutex);
				5020	goto again;
				5021	} else {
				5022	/* this is (2) */
				5023	mutex_unlock(&fs_info->balance_mutex);
				5024	ret = -EINPROGRESS;
				5025	goto out;
				5026	}
				5027	} else {
				5028	/* this is (1) */
				5029	mutex_unlock(&fs_info->balance_mutex);
				5030	ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
				5031	goto out;
				5032	}
				5033
				5034	locked:
				5035	BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
				5036
				5037	if (arg) {
				5038	bargs = memdup_user(arg, sizeof(*bargs));
				5039	if (IS_ERR(bargs)) {
				5040	ret = PTR_ERR(bargs);
				5041	goto out_unlock;
				5042	}
				5043
				5044	if (bargs->flags & BTRFS_BALANCE_RESUME) {
				5045	if (!fs_info->balance_ctl) {
				5046	ret = -ENOTCONN;
				5047	goto out_bargs;
				5048	}
				5049
				5050	bctl = fs_info->balance_ctl;
				5051	spin_lock(&fs_info->balance_lock);
				5052	bctl->flags \|= BTRFS_BALANCE_RESUME;
				5053	spin_unlock(&fs_info->balance_lock);
				5054
				5055	goto do_balance;
				5056	}
				5057	} else {
				5058	bargs = NULL;
				5059	}
				5060
				5061	if (fs_info->balance_ctl) {
				5062	ret = -EINPROGRESS;
				5063	goto out_bargs;
				5064	}
				5065
				5066	bctl = kzalloc(sizeof(*bctl), GFP_KERNEL);
				5067	if (!bctl) {
				5068	ret = -ENOMEM;
				5069	goto out_bargs;
				5070	}
				5071
				5072	if (arg) {
				5073	memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
				5074	memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
				5075	memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
				5076
				5077	bctl->flags = bargs->flags;
				5078	} else {
				5079	/* balance everything - no filters */
				5080	bctl->flags \|= BTRFS_BALANCE_TYPE_MASK;
				5081	}
				5082
				5083	if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK \| BTRFS_BALANCE_TYPE_MASK)) {
				5084	ret = -EINVAL;
				5085	goto out_bctl;
				5086	}
				5087
				5088	do_balance:
				5089	/*
				5090	* Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP goes to
				5091	* btrfs_balance. bctl is freed in reset_balance_state, or, if
				5092	* restriper was paused all the way until unmount, in free_fs_info.
				5093	* The flag should be cleared after reset_balance_state.
				5094	*/
				5095	need_unlock = false;
				5096
				5097	ret = btrfs_balance(fs_info, bctl, bargs);
				5098	bctl = NULL;
				5099
				5100	if (arg) {
				5101	if (copy_to_user(arg, bargs, sizeof(*bargs)))
				5102	ret = -EFAULT;
				5103	}
				5104
				5105	out_bctl:
				5106	kfree(bctl);
				5107	out_bargs:
				5108	kfree(bargs);
				5109	out_unlock:
				5110	mutex_unlock(&fs_info->balance_mutex);
				5111	if (need_unlock)
				5112	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
				5113	out:
				5114	mnt_drop_write_file(file);
				5115	return ret;
				5116	}
				5117
				5118	static long btrfs_ioctl_balance_ctl(struct btrfs_fs_info *fs_info, int cmd)
				5119	{
				5120	if (!capable(CAP_SYS_ADMIN))
				5121	return -EPERM;
				5122
				5123	switch (cmd) {
				5124	case BTRFS_BALANCE_CTL_PAUSE:
				5125	return btrfs_pause_balance(fs_info);
				5126	case BTRFS_BALANCE_CTL_CANCEL:
				5127	return btrfs_cancel_balance(fs_info);
				5128	}
				5129
				5130	return -EINVAL;
				5131	}
				5132
				5133	static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info,
				5134	void __user *arg)
				5135	{
				5136	struct btrfs_ioctl_balance_args *bargs;
				5137	int ret = 0;
				5138
				5139	if (!capable(CAP_SYS_ADMIN))
				5140	return -EPERM;
				5141
				5142	mutex_lock(&fs_info->balance_mutex);
				5143	if (!fs_info->balance_ctl) {
				5144	ret = -ENOTCONN;
				5145	goto out;
				5146	}
				5147
				5148	bargs = kzalloc(sizeof(*bargs), GFP_KERNEL);
				5149	if (!bargs) {
				5150	ret = -ENOMEM;
				5151	goto out;
				5152	}
				5153
				5154	btrfs_update_ioctl_balance_args(fs_info, bargs);
				5155
				5156	if (copy_to_user(arg, bargs, sizeof(*bargs)))
				5157	ret = -EFAULT;
				5158
				5159	kfree(bargs);
				5160	out:
				5161	mutex_unlock(&fs_info->balance_mutex);
				5162	return ret;
				5163	}
				5164
				5165	static long btrfs_ioctl_quota_ctl(struct file file, void __user arg)
				5166	{
				5167	struct inode *inode = file_inode(file);
				5168	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				5169	struct btrfs_ioctl_quota_ctl_args *sa;
				5170	int ret;
				5171
				5172	if (!capable(CAP_SYS_ADMIN))
				5173	return -EPERM;
				5174
				5175	ret = mnt_want_write_file(file);
				5176	if (ret)
				5177	return ret;
				5178
				5179	sa = memdup_user(arg, sizeof(*sa));
				5180	if (IS_ERR(sa)) {
				5181	ret = PTR_ERR(sa);
				5182	goto drop_write;
				5183	}
				5184
				5185	down_write(&fs_info->subvol_sem);
				5186
				5187	switch (sa->cmd) {
				5188	case BTRFS_QUOTA_CTL_ENABLE:
				5189	ret = btrfs_quota_enable(fs_info);
				5190	break;
				5191	case BTRFS_QUOTA_CTL_DISABLE:
				5192	ret = btrfs_quota_disable(fs_info);
				5193	break;
				5194	default:
				5195	ret = -EINVAL;
				5196	break;
				5197	}
				5198
				5199	kfree(sa);
				5200	up_write(&fs_info->subvol_sem);
				5201	drop_write:
				5202	mnt_drop_write_file(file);
				5203	return ret;
				5204	}
				5205
				5206	static long btrfs_ioctl_qgroup_assign(struct file file, void __user arg)
				5207	{
				5208	struct inode *inode = file_inode(file);
				5209	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				5210	struct btrfs_root *root = BTRFS_I(inode)->root;
				5211	struct btrfs_ioctl_qgroup_assign_args *sa;
				5212	struct btrfs_trans_handle *trans;
				5213	int ret;
				5214	int err;
				5215
				5216	if (!capable(CAP_SYS_ADMIN))
				5217	return -EPERM;
				5218
				5219	ret = mnt_want_write_file(file);
				5220	if (ret)
				5221	return ret;
				5222
				5223	sa = memdup_user(arg, sizeof(*sa));
				5224	if (IS_ERR(sa)) {
				5225	ret = PTR_ERR(sa);
				5226	goto drop_write;
				5227	}
				5228
				5229	trans = btrfs_join_transaction(root);
				5230	if (IS_ERR(trans)) {
				5231	ret = PTR_ERR(trans);
				5232	goto out;
				5233	}
				5234
				5235	if (sa->assign) {
				5236	ret = btrfs_add_qgroup_relation(trans, sa->src, sa->dst);
				5237	} else {
				5238	ret = btrfs_del_qgroup_relation(trans, sa->src, sa->dst);
				5239	}
				5240
				5241	/* update qgroup status and info */
				5242	err = btrfs_run_qgroups(trans);
				5243	if (err < 0)
				5244	btrfs_handle_fs_error(fs_info, err,
				5245	"failed to update qgroup status and info");
				5246	err = btrfs_end_transaction(trans);
				5247	if (err && !ret)
				5248	ret = err;
				5249
				5250	out:
				5251	kfree(sa);
				5252	drop_write:
				5253	mnt_drop_write_file(file);
				5254	return ret;
				5255	}
				5256
				5257	static long btrfs_ioctl_qgroup_create(struct file file, void __user arg)
				5258	{
				5259	struct inode *inode = file_inode(file);
				5260	struct btrfs_root *root = BTRFS_I(inode)->root;
				5261	struct btrfs_ioctl_qgroup_create_args *sa;
				5262	struct btrfs_trans_handle *trans;
				5263	int ret;
				5264	int err;
				5265
				5266	if (!capable(CAP_SYS_ADMIN))
				5267	return -EPERM;
				5268
				5269	ret = mnt_want_write_file(file);
				5270	if (ret)
				5271	return ret;
				5272
				5273	sa = memdup_user(arg, sizeof(*sa));
				5274	if (IS_ERR(sa)) {
				5275	ret = PTR_ERR(sa);
				5276	goto drop_write;
				5277	}
				5278
				5279	if (!sa->qgroupid) {
				5280	ret = -EINVAL;
				5281	goto out;
				5282	}
				5283
				5284	trans = btrfs_join_transaction(root);
				5285	if (IS_ERR(trans)) {
				5286	ret = PTR_ERR(trans);
				5287	goto out;
				5288	}
				5289
				5290	if (sa->create) {
				5291	ret = btrfs_create_qgroup(trans, sa->qgroupid);
				5292	} else {
				5293	ret = btrfs_remove_qgroup(trans, sa->qgroupid);
				5294	}
				5295
				5296	err = btrfs_end_transaction(trans);
				5297	if (err && !ret)
				5298	ret = err;
				5299
				5300	out:
				5301	kfree(sa);
				5302	drop_write:
				5303	mnt_drop_write_file(file);
				5304	return ret;
				5305	}
				5306
				5307	static long btrfs_ioctl_qgroup_limit(struct file file, void __user arg)
				5308	{
				5309	struct inode *inode = file_inode(file);
				5310	struct btrfs_root *root = BTRFS_I(inode)->root;
				5311	struct btrfs_ioctl_qgroup_limit_args *sa;
				5312	struct btrfs_trans_handle *trans;
				5313	int ret;
				5314	int err;
				5315	u64 qgroupid;
				5316
				5317	if (!capable(CAP_SYS_ADMIN))
				5318	return -EPERM;
				5319
				5320	ret = mnt_want_write_file(file);
				5321	if (ret)
				5322	return ret;
				5323
				5324	sa = memdup_user(arg, sizeof(*sa));
				5325	if (IS_ERR(sa)) {
				5326	ret = PTR_ERR(sa);
				5327	goto drop_write;
				5328	}
				5329
				5330	trans = btrfs_join_transaction(root);
				5331	if (IS_ERR(trans)) {
				5332	ret = PTR_ERR(trans);
				5333	goto out;
				5334	}
				5335
				5336	qgroupid = sa->qgroupid;
				5337	if (!qgroupid) {
				5338	/* take the current subvol as qgroup */
				5339	qgroupid = root->root_key.objectid;
				5340	}
				5341
				5342	ret = btrfs_limit_qgroup(trans, qgroupid, &sa->lim);
				5343
				5344	err = btrfs_end_transaction(trans);
				5345	if (err && !ret)
				5346	ret = err;
				5347
				5348	out:
				5349	kfree(sa);
				5350	drop_write:
				5351	mnt_drop_write_file(file);
				5352	return ret;
				5353	}
				5354
				5355	static long btrfs_ioctl_quota_rescan(struct file file, void __user arg)
				5356	{
				5357	struct inode *inode = file_inode(file);
				5358	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				5359	struct btrfs_ioctl_quota_rescan_args *qsa;
				5360	int ret;
				5361
				5362	if (!capable(CAP_SYS_ADMIN))
				5363	return -EPERM;
				5364
				5365	ret = mnt_want_write_file(file);
				5366	if (ret)
				5367	return ret;
				5368
				5369	qsa = memdup_user(arg, sizeof(*qsa));
				5370	if (IS_ERR(qsa)) {
				5371	ret = PTR_ERR(qsa);
				5372	goto drop_write;
				5373	}
				5374
				5375	if (qsa->flags) {
				5376	ret = -EINVAL;
				5377	goto out;
				5378	}
				5379
				5380	ret = btrfs_qgroup_rescan(fs_info);
				5381
				5382	out:
				5383	kfree(qsa);
				5384	drop_write:
				5385	mnt_drop_write_file(file);
				5386	return ret;
				5387	}
				5388
				5389	static long btrfs_ioctl_quota_rescan_status(struct file file, void __user arg)
				5390	{
				5391	struct inode *inode = file_inode(file);
				5392	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				5393	struct btrfs_ioctl_quota_rescan_args *qsa;
				5394	int ret = 0;
				5395
				5396	if (!capable(CAP_SYS_ADMIN))
				5397	return -EPERM;
				5398
				5399	qsa = kzalloc(sizeof(*qsa), GFP_KERNEL);
				5400	if (!qsa)
				5401	return -ENOMEM;
				5402
				5403	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
				5404	qsa->flags = 1;
				5405	qsa->progress = fs_info->qgroup_rescan_progress.objectid;
				5406	}
				5407
				5408	if (copy_to_user(arg, qsa, sizeof(*qsa)))
				5409	ret = -EFAULT;
				5410
				5411	kfree(qsa);
				5412	return ret;
				5413	}
				5414
				5415	static long btrfs_ioctl_quota_rescan_wait(struct file file, void __user arg)
				5416	{
				5417	struct inode *inode = file_inode(file);
				5418	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				5419
				5420	if (!capable(CAP_SYS_ADMIN))
				5421	return -EPERM;
				5422
				5423	return btrfs_qgroup_wait_for_completion(fs_info, true);
				5424	}
				5425
				5426	static long _btrfs_ioctl_set_received_subvol(struct file *file,
				5427	struct btrfs_ioctl_received_subvol_args *sa)
				5428	{
				5429	struct inode *inode = file_inode(file);
				5430	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				5431	struct btrfs_root *root = BTRFS_I(inode)->root;
				5432	struct btrfs_root_item *root_item = &root->root_item;
				5433	struct btrfs_trans_handle *trans;
				5434	struct timespec64 ct = current_time(inode);
				5435	int ret = 0;
				5436	int received_uuid_changed;
				5437
				5438	if (!inode_owner_or_capable(inode))
				5439	return -EPERM;
				5440
				5441	ret = mnt_want_write_file(file);
				5442	if (ret < 0)
				5443	return ret;
				5444
				5445	down_write(&fs_info->subvol_sem);
				5446
				5447	if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
				5448	ret = -EINVAL;
				5449	goto out;
				5450	}
				5451
				5452	if (btrfs_root_readonly(root)) {
				5453	ret = -EROFS;
				5454	goto out;
				5455	}
				5456
				5457	/*
				5458	* 1 - root item
				5459	* 2 - uuid items (received uuid + subvol uuid)
				5460	*/
				5461	trans = btrfs_start_transaction(root, 3);
				5462	if (IS_ERR(trans)) {
				5463	ret = PTR_ERR(trans);
				5464	trans = NULL;
				5465	goto out;
				5466	}
				5467
				5468	sa->rtransid = trans->transid;
				5469	sa->rtime.sec = ct.tv_sec;
				5470	sa->rtime.nsec = ct.tv_nsec;
				5471
				5472	received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid,
				5473	BTRFS_UUID_SIZE);
				5474	if (received_uuid_changed &&
				5475	!btrfs_is_empty_uuid(root_item->received_uuid)) {
				5476	ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid,
				5477	BTRFS_UUID_KEY_RECEIVED_SUBVOL,
				5478	root->root_key.objectid);
				5479	if (ret && ret != -ENOENT) {
				5480	btrfs_abort_transaction(trans, ret);
				5481	btrfs_end_transaction(trans);
				5482	goto out;
				5483	}
				5484	}
				5485	memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE);
				5486	btrfs_set_root_stransid(root_item, sa->stransid);
				5487	btrfs_set_root_rtransid(root_item, sa->rtransid);
				5488	btrfs_set_stack_timespec_sec(&root_item->stime, sa->stime.sec);
				5489	btrfs_set_stack_timespec_nsec(&root_item->stime, sa->stime.nsec);
				5490	btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec);
				5491	btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec);
				5492
				5493	ret = btrfs_update_root(trans, fs_info->tree_root,
				5494	&root->root_key, &root->root_item);
				5495	if (ret < 0) {
				5496	btrfs_end_transaction(trans);
				5497	goto out;
				5498	}
				5499	if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) {
				5500	ret = btrfs_uuid_tree_add(trans, sa->uuid,
				5501	BTRFS_UUID_KEY_RECEIVED_SUBVOL,
				5502	root->root_key.objectid);
				5503	if (ret < 0 && ret != -EEXIST) {
				5504	btrfs_abort_transaction(trans, ret);
				5505	btrfs_end_transaction(trans);
				5506	goto out;
				5507	}
				5508	}
				5509	ret = btrfs_commit_transaction(trans);
				5510	out:
				5511	up_write(&fs_info->subvol_sem);
				5512	mnt_drop_write_file(file);
				5513	return ret;
				5514	}
				5515
				#ifdef CONFIG_64BIT
				/*
				 * Variant of the SET_RECEIVED_SUBVOL ioctl for callers that use the packed
				 * struct btrfs_ioctl_received_subvol_args_32 layout.
				 *
				 * The user structure is converted field by field into the native layout,
				 * passed to _btrfs_ioctl_set_received_subvol(), and on success converted
				 * back and copied out to @arg. Returns 0, the helper's error, -ENOMEM, the
				 * memdup_user() error, or -EFAULT when the copy-out fails.
				 */
				static long btrfs_ioctl_set_received_subvol_32(struct file *file,
										void __user *arg)
				{
					struct btrfs_ioctl_received_subvol_args_32 *compat;
					struct btrfs_ioctl_received_subvol_args *native;
					int ret;

					compat = memdup_user(arg, sizeof(*compat));
					if (IS_ERR(compat))
						return PTR_ERR(compat);

					native = kmalloc(sizeof(*native), GFP_KERNEL);
					if (!native) {
						kfree(compat);
						return -ENOMEM;
					}

					/* Packed user layout -> native layout. */
					memcpy(native->uuid, compat->uuid, BTRFS_UUID_SIZE);
					native->stransid = compat->stransid;
					native->rtransid = compat->rtransid;
					native->stime.sec = compat->stime.sec;
					native->stime.nsec = compat->stime.nsec;
					native->rtime.sec = compat->rtime.sec;
					native->rtime.nsec = compat->rtime.nsec;
					native->flags = compat->flags;

					ret = _btrfs_ioctl_set_received_subvol(file, native);
					if (!ret) {
						/* Native layout -> packed user layout, then hand it back. */
						memcpy(compat->uuid, native->uuid, BTRFS_UUID_SIZE);
						compat->stransid = native->stransid;
						compat->rtransid = native->rtransid;
						compat->stime.sec = native->stime.sec;
						compat->stime.nsec = native->stime.nsec;
						compat->rtime.sec = native->rtime.sec;
						compat->rtime.nsec = native->rtime.nsec;
						compat->flags = native->flags;

						if (copy_to_user(arg, compat, sizeof(*compat)))
							ret = -EFAULT;
					}

					kfree(native);
					kfree(compat);
					return ret;
				}
				#endif
				5566
				5567	static long btrfs_ioctl_set_received_subvol(struct file *file,
				5568	void __user *arg)
				5569	{
				5570	struct btrfs_ioctl_received_subvol_args *sa = NULL;
				5571	int ret = 0;
				5572
				5573	sa = memdup_user(arg, sizeof(*sa));
				5574	if (IS_ERR(sa))
				5575	return PTR_ERR(sa);
				5576
				5577	ret = _btrfs_ioctl_set_received_subvol(file, sa);
				5578
				5579	if (ret)
				5580	goto out;
				5581
				5582	ret = copy_to_user(arg, sa, sizeof(*sa));
				5583	if (ret)
				5584	ret = -EFAULT;
				5585
				5586	out:
				5587	kfree(sa);
				5588	return ret;
				5589	}
				5590
				5591	static int btrfs_ioctl_get_fslabel(struct file file, void __user arg)
				5592	{
				5593	struct inode *inode = file_inode(file);
				5594	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				5595	size_t len;
				5596	int ret;
				5597	char label[BTRFS_LABEL_SIZE];
				5598
				5599	spin_lock(&fs_info->super_lock);
				5600	memcpy(label, fs_info->super_copy->label, BTRFS_LABEL_SIZE);
				5601	spin_unlock(&fs_info->super_lock);
				5602
				5603	len = strnlen(label, BTRFS_LABEL_SIZE);
				5604
				5605	if (len == BTRFS_LABEL_SIZE) {
				5606	btrfs_warn(fs_info,
				5607	"label is too long, return the first %zu bytes",
				5608	--len);
				5609	}
				5610
				5611	ret = copy_to_user(arg, label, len);
				5612
				5613	return ret ? -EFAULT : 0;
				5614	}
				5615
				5616	static int btrfs_ioctl_set_fslabel(struct file file, void __user arg)
				5617	{
				5618	struct inode *inode = file_inode(file);
				5619	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				5620	struct btrfs_root *root = BTRFS_I(inode)->root;
				5621	struct btrfs_super_block *super_block = fs_info->super_copy;
				5622	struct btrfs_trans_handle *trans;
				5623	char label[BTRFS_LABEL_SIZE];
				5624	int ret;
				5625
				5626	if (!capable(CAP_SYS_ADMIN))
				5627	return -EPERM;
				5628
				5629	if (copy_from_user(label, arg, sizeof(label)))
				5630	return -EFAULT;
				5631
				5632	if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
				5633	btrfs_err(fs_info,
				5634	"unable to set label with more than %d bytes",
				5635	BTRFS_LABEL_SIZE - 1);
				5636	return -EINVAL;
				5637	}
				5638
				5639	ret = mnt_want_write_file(file);
				5640	if (ret)
				5641	return ret;
				5642
				5643	trans = btrfs_start_transaction(root, 0);
				5644	if (IS_ERR(trans)) {
				5645	ret = PTR_ERR(trans);
				5646	goto out_unlock;
				5647	}
				5648
				5649	spin_lock(&fs_info->super_lock);
				5650	strcpy(super_block->label, label);
				5651	spin_unlock(&fs_info->super_lock);
				5652	ret = btrfs_commit_transaction(trans);
				5653
				5654	out_unlock:
				5655	mnt_drop_write_file(file);
				5656	return ret;
				5657	}
				5658
				/*
				 * Initializer for one struct btrfs_ioctl_feature_flags entry, built from the
				 * BTRFS_FEATURE_{COMPAT,COMPAT_RO,INCOMPAT}_<kind> constants.
				 */
				#define INIT_FEATURE_FLAGS(kind)					\
					{								\
						.compat_flags = BTRFS_FEATURE_COMPAT_##kind,		\
						.compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##kind,	\
						.incompat_flags = BTRFS_FEATURE_INCOMPAT_##kind,	\
					}
				5663
				5664	int btrfs_ioctl_get_supported_features(void __user *arg)
				5665	{
				5666	static const struct btrfs_ioctl_feature_flags features[3] = {
				5667	INIT_FEATURE_FLAGS(SUPP),
				5668	INIT_FEATURE_FLAGS(SAFE_SET),
				5669	INIT_FEATURE_FLAGS(SAFE_CLEAR)
				5670	};
				5671
				5672	if (copy_to_user(arg, &features, sizeof(features)))
				5673	return -EFAULT;
				5674
				5675	return 0;
				5676	}
				5677
				5678	static int btrfs_ioctl_get_features(struct file file, void __user arg)
				5679	{
				5680	struct inode *inode = file_inode(file);
				5681	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				5682	struct btrfs_super_block *super_block = fs_info->super_copy;
				5683	struct btrfs_ioctl_feature_flags features;
				5684
				5685	features.compat_flags = btrfs_super_compat_flags(super_block);
				5686	features.compat_ro_flags = btrfs_super_compat_ro_flags(super_block);
				5687	features.incompat_flags = btrfs_super_incompat_flags(super_block);
				5688
				5689	if (copy_to_user(arg, &features, sizeof(features)))
				5690	return -EFAULT;
				5691
				5692	return 0;
				5693	}
				5694
				5695	static int check_feature_bits(struct btrfs_fs_info *fs_info,
				5696	enum btrfs_feature_set set,
				5697	u64 change_mask, u64 flags, u64 supported_flags,
				5698	u64 safe_set, u64 safe_clear)
				5699	{
				5700	const char *type = btrfs_feature_set_names[set];
				5701	char *names;
				5702	u64 disallowed, unsupported;
				5703	u64 set_mask = flags & change_mask;
				5704	u64 clear_mask = ~flags & change_mask;
				5705
				5706	unsupported = set_mask & ~supported_flags;
				5707	if (unsupported) {
				5708	names = btrfs_printable_features(set, unsupported);
				5709	if (names) {
				5710	btrfs_warn(fs_info,
				5711	"this kernel does not support the %s feature bit%s",
				5712	names, strchr(names, ',') ? "s" : "");
				5713	kfree(names);
				5714	} else
				5715	btrfs_warn(fs_info,
				5716	"this kernel does not support %s bits 0x%llx",
				5717	type, unsupported);
				5718	return -EOPNOTSUPP;
				5719	}
				5720
				5721	disallowed = set_mask & ~safe_set;
				5722	if (disallowed) {
				5723	names = btrfs_printable_features(set, disallowed);
				5724	if (names) {
				5725	btrfs_warn(fs_info,
				5726	"can't set the %s feature bit%s while mounted",
				5727	names, strchr(names, ',') ? "s" : "");
				5728	kfree(names);
				5729	} else
				5730	btrfs_warn(fs_info,
				5731	"can't set %s bits 0x%llx while mounted",
				5732	type, disallowed);
				5733	return -EPERM;
				5734	}
				5735
				5736	disallowed = clear_mask & ~safe_clear;
				5737	if (disallowed) {
				5738	names = btrfs_printable_features(set, disallowed);
				5739	if (names) {
				5740	btrfs_warn(fs_info,
				5741	"can't clear the %s feature bit%s while mounted",
				5742	names, strchr(names, ',') ? "s" : "");
				5743	kfree(names);
				5744	} else
				5745	btrfs_warn(fs_info,
				5746	"can't clear %s bits 0x%llx while mounted",
				5747	type, disallowed);
				5748	return -EPERM;
				5749	}
				5750
				5751	return 0;
				5752	}
				5753
				/*
				 * Call check_feature_bits() for the feature set named by @base (COMPAT,
				 * COMPAT_RO or INCOMPAT), supplying that set's FEAT_ index and its _SUPP,
				 * _SAFE_SET and _SAFE_CLEAR masks.
				 */
				#define check_feature(info, mask, bits, base)				\
					check_feature_bits((info), FEAT_##base, (mask), (bits),		\
							   BTRFS_FEATURE_##base##_SUPP,			\
							   BTRFS_FEATURE_##base##_SAFE_SET,		\
							   BTRFS_FEATURE_##base##_SAFE_CLEAR)
				5759
				5760	static int btrfs_ioctl_set_features(struct file file, void __user arg)
				5761	{
				5762	struct inode *inode = file_inode(file);
				5763	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				5764	struct btrfs_root *root = BTRFS_I(inode)->root;
				5765	struct btrfs_super_block *super_block = fs_info->super_copy;
				5766	struct btrfs_ioctl_feature_flags flags[2];
				5767	struct btrfs_trans_handle *trans;
				5768	u64 newflags;
				5769	int ret;
				5770
				5771	if (!capable(CAP_SYS_ADMIN))
				5772	return -EPERM;
				5773
				5774	if (copy_from_user(flags, arg, sizeof(flags)))
				5775	return -EFAULT;
				5776
				5777	/* Nothing to do */
				5778	if (!flags[0].compat_flags && !flags[0].compat_ro_flags &&
				5779	!flags[0].incompat_flags)
				5780	return 0;
				5781
				5782	ret = check_feature(fs_info, flags[0].compat_flags,
				5783	flags[1].compat_flags, COMPAT);
				5784	if (ret)
				5785	return ret;
				5786
				5787	ret = check_feature(fs_info, flags[0].compat_ro_flags,
				5788	flags[1].compat_ro_flags, COMPAT_RO);
				5789	if (ret)
				5790	return ret;
				5791
				5792	ret = check_feature(fs_info, flags[0].incompat_flags,
				5793	flags[1].incompat_flags, INCOMPAT);
				5794	if (ret)
				5795	return ret;
				5796
				5797	ret = mnt_want_write_file(file);
				5798	if (ret)
				5799	return ret;
				5800
				5801	trans = btrfs_start_transaction(root, 0);
				5802	if (IS_ERR(trans)) {
				5803	ret = PTR_ERR(trans);
				5804	goto out_drop_write;
				5805	}
				5806
				5807	spin_lock(&fs_info->super_lock);
				5808	newflags = btrfs_super_compat_flags(super_block);
				5809	newflags \|= flags[0].compat_flags & flags[1].compat_flags;
				5810	newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags);
				5811	btrfs_set_super_compat_flags(super_block, newflags);
				5812
				5813	newflags = btrfs_super_compat_ro_flags(super_block);
				5814	newflags \|= flags[0].compat_ro_flags & flags[1].compat_ro_flags;
				5815	newflags &= ~(flags[0].compat_ro_flags & ~flags[1].compat_ro_flags);
				5816	btrfs_set_super_compat_ro_flags(super_block, newflags);
				5817
				5818	newflags = btrfs_super_incompat_flags(super_block);
				5819	newflags \|= flags[0].incompat_flags & flags[1].incompat_flags;
				5820	newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags);
				5821	btrfs_set_super_incompat_flags(super_block, newflags);
				5822	spin_unlock(&fs_info->super_lock);
				5823
				5824	ret = btrfs_commit_transaction(trans);
				5825	out_drop_write:
				5826	mnt_drop_write_file(file);
				5827
				5828	return ret;
				5829	}
				5830
				5831	static int _btrfs_ioctl_send(struct file file, void __user argp, bool compat)
				5832	{
				5833	struct btrfs_ioctl_send_args *arg;
				5834	int ret;
				5835
				5836	if (compat) {
				5837	#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
				5838	struct btrfs_ioctl_send_args_32 args32;
				5839
				5840	ret = copy_from_user(&args32, argp, sizeof(args32));
				5841	if (ret)
				5842	return -EFAULT;
				5843	arg = kzalloc(sizeof(*arg), GFP_KERNEL);
				5844	if (!arg)
				5845	return -ENOMEM;
				5846	arg->send_fd = args32.send_fd;
				5847	arg->clone_sources_count = args32.clone_sources_count;
				5848	arg->clone_sources = compat_ptr(args32.clone_sources);
				5849	arg->parent_root = args32.parent_root;
				5850	arg->flags = args32.flags;
				5851	memcpy(arg->reserved, args32.reserved,
				5852	sizeof(args32.reserved));
				5853	#else
				5854	return -ENOTTY;
				5855	#endif
				5856	} else {
				5857	arg = memdup_user(argp, sizeof(*arg));
				5858	if (IS_ERR(arg))
				5859	return PTR_ERR(arg);
				5860	}
				5861	ret = btrfs_ioctl_send(file, arg);
				5862	kfree(arg);
				5863	return ret;
				5864	}
				5865
				5866	long btrfs_ioctl(struct file *file, unsigned int
				5867	cmd, unsigned long arg)
				5868	{
				5869	struct inode *inode = file_inode(file);
				5870	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				5871	struct btrfs_root *root = BTRFS_I(inode)->root;
				5872	void __user argp = (void __user )arg;
				5873
				5874	switch (cmd) {
				5875	case FS_IOC_GETFLAGS:
				5876	return btrfs_ioctl_getflags(file, argp);
				5877	case FS_IOC_SETFLAGS:
				5878	return btrfs_ioctl_setflags(file, argp);
				5879	case FS_IOC_GETVERSION:
				5880	return btrfs_ioctl_getversion(file, argp);
				5881	case FITRIM:
				5882	return btrfs_ioctl_fitrim(file, argp);
				5883	case BTRFS_IOC_SNAP_CREATE:
				5884	return btrfs_ioctl_snap_create(file, argp, 0);
				5885	case BTRFS_IOC_SNAP_CREATE_V2:
				5886	return btrfs_ioctl_snap_create_v2(file, argp, 0);
				5887	case BTRFS_IOC_SUBVOL_CREATE:
				5888	return btrfs_ioctl_snap_create(file, argp, 1);
				5889	case BTRFS_IOC_SUBVOL_CREATE_V2:
				5890	return btrfs_ioctl_snap_create_v2(file, argp, 1);
				5891	case BTRFS_IOC_SNAP_DESTROY:
				5892	return btrfs_ioctl_snap_destroy(file, argp);
				5893	case BTRFS_IOC_SUBVOL_GETFLAGS:
				5894	return btrfs_ioctl_subvol_getflags(file, argp);
				5895	case BTRFS_IOC_SUBVOL_SETFLAGS:
				5896	return btrfs_ioctl_subvol_setflags(file, argp);
				5897	case BTRFS_IOC_DEFAULT_SUBVOL:
				5898	return btrfs_ioctl_default_subvol(file, argp);
				5899	case BTRFS_IOC_DEFRAG:
				5900	return btrfs_ioctl_defrag(file, NULL);
				5901	case BTRFS_IOC_DEFRAG_RANGE:
				5902	return btrfs_ioctl_defrag(file, argp);
				5903	case BTRFS_IOC_RESIZE:
				5904	return btrfs_ioctl_resize(file, argp);
				5905	case BTRFS_IOC_ADD_DEV:
				5906	return btrfs_ioctl_add_dev(fs_info, argp);
				5907	case BTRFS_IOC_RM_DEV:
				5908	return btrfs_ioctl_rm_dev(file, argp);
				5909	case BTRFS_IOC_RM_DEV_V2:
				5910	return btrfs_ioctl_rm_dev_v2(file, argp);
				5911	case BTRFS_IOC_FS_INFO:
				5912	return btrfs_ioctl_fs_info(fs_info, argp);
				5913	case BTRFS_IOC_DEV_INFO:
				5914	return btrfs_ioctl_dev_info(fs_info, argp);
				5915	case BTRFS_IOC_BALANCE:
				5916	return btrfs_ioctl_balance(file, NULL);
				5917	case BTRFS_IOC_TREE_SEARCH:
				5918	return btrfs_ioctl_tree_search(file, argp);
				5919	case BTRFS_IOC_TREE_SEARCH_V2:
				5920	return btrfs_ioctl_tree_search_v2(file, argp);
				5921	case BTRFS_IOC_INO_LOOKUP:
				5922	return btrfs_ioctl_ino_lookup(file, argp);
				5923	case BTRFS_IOC_INO_PATHS:
				5924	return btrfs_ioctl_ino_to_path(root, argp);
				5925	case BTRFS_IOC_LOGICAL_INO:
				5926	return btrfs_ioctl_logical_to_ino(fs_info, argp, 1);
				5927	case BTRFS_IOC_LOGICAL_INO_V2:
				5928	return btrfs_ioctl_logical_to_ino(fs_info, argp, 2);
				5929	case BTRFS_IOC_SPACE_INFO:
				5930	return btrfs_ioctl_space_info(fs_info, argp);
				5931	case BTRFS_IOC_SYNC: {
				5932	int ret;
				5933
				5934	ret = btrfs_start_delalloc_roots(fs_info, -1);
				5935	if (ret)
				5936	return ret;
				5937	ret = btrfs_sync_fs(inode->i_sb, 1);
				5938	/*
				5939	* The transaction thread may want to do more work,
				5940	* namely it pokes the cleaner kthread that will start
				5941	* processing uncleaned subvols.
				5942	*/
				5943	wake_up_process(fs_info->transaction_kthread);
				5944	return ret;
				5945	}
				5946	case BTRFS_IOC_START_SYNC:
				5947	return btrfs_ioctl_start_sync(root, argp);
				5948	case BTRFS_IOC_WAIT_SYNC:
				5949	return btrfs_ioctl_wait_sync(fs_info, argp);
				5950	case BTRFS_IOC_SCRUB:
				5951	return btrfs_ioctl_scrub(file, argp);
				5952	case BTRFS_IOC_SCRUB_CANCEL:
				5953	return btrfs_ioctl_scrub_cancel(fs_info);
				5954	case BTRFS_IOC_SCRUB_PROGRESS:
				5955	return btrfs_ioctl_scrub_progress(fs_info, argp);
				5956	case BTRFS_IOC_BALANCE_V2:
				5957	return btrfs_ioctl_balance(file, argp);
				5958	case BTRFS_IOC_BALANCE_CTL:
				5959	return btrfs_ioctl_balance_ctl(fs_info, arg);
				5960	case BTRFS_IOC_BALANCE_PROGRESS:
				5961	return btrfs_ioctl_balance_progress(fs_info, argp);
				5962	case BTRFS_IOC_SET_RECEIVED_SUBVOL:
				5963	return btrfs_ioctl_set_received_subvol(file, argp);
				5964	#ifdef CONFIG_64BIT
				5965	case BTRFS_IOC_SET_RECEIVED_SUBVOL_32:
				5966	return btrfs_ioctl_set_received_subvol_32(file, argp);
				5967	#endif
				5968	case BTRFS_IOC_SEND:
				5969	return _btrfs_ioctl_send(file, argp, false);
				5970	#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
				5971	case BTRFS_IOC_SEND_32:
				5972	return _btrfs_ioctl_send(file, argp, true);
				5973	#endif
				5974	case BTRFS_IOC_GET_DEV_STATS:
				5975	return btrfs_ioctl_get_dev_stats(fs_info, argp);
				5976	case BTRFS_IOC_QUOTA_CTL:
				5977	return btrfs_ioctl_quota_ctl(file, argp);
				5978	case BTRFS_IOC_QGROUP_ASSIGN:
				5979	return btrfs_ioctl_qgroup_assign(file, argp);
				5980	case BTRFS_IOC_QGROUP_CREATE:
				5981	return btrfs_ioctl_qgroup_create(file, argp);
				5982	case BTRFS_IOC_QGROUP_LIMIT:
				5983	return btrfs_ioctl_qgroup_limit(file, argp);
				5984	case BTRFS_IOC_QUOTA_RESCAN:
				5985	return btrfs_ioctl_quota_rescan(file, argp);
				5986	case BTRFS_IOC_QUOTA_RESCAN_STATUS:
				5987	return btrfs_ioctl_quota_rescan_status(file, argp);
				5988	case BTRFS_IOC_QUOTA_RESCAN_WAIT:
				5989	return btrfs_ioctl_quota_rescan_wait(file, argp);
				5990	case BTRFS_IOC_DEV_REPLACE:
				5991	return btrfs_ioctl_dev_replace(fs_info, argp);
				5992	case BTRFS_IOC_GET_FSLABEL:
				5993	return btrfs_ioctl_get_fslabel(file, argp);
				5994	case BTRFS_IOC_SET_FSLABEL:
				5995	return btrfs_ioctl_set_fslabel(file, argp);
				5996	case BTRFS_IOC_GET_SUPPORTED_FEATURES:
				5997	return btrfs_ioctl_get_supported_features(argp);
				5998	case BTRFS_IOC_GET_FEATURES:
				5999	return btrfs_ioctl_get_features(file, argp);
				6000	case BTRFS_IOC_SET_FEATURES:
				6001	return btrfs_ioctl_set_features(file, argp);
				6002	case FS_IOC_FSGETXATTR:
				6003	return btrfs_ioctl_fsgetxattr(file, argp);
				6004	case FS_IOC_FSSETXATTR:
				6005	return btrfs_ioctl_fssetxattr(file, argp);
				6006	case BTRFS_IOC_GET_SUBVOL_INFO:
				6007	return btrfs_ioctl_get_subvol_info(file, argp);
				6008	case BTRFS_IOC_GET_SUBVOL_ROOTREF:
				6009	return btrfs_ioctl_get_subvol_rootref(file, argp);
				6010	case BTRFS_IOC_INO_LOOKUP_USER:
				6011	return btrfs_ioctl_ino_lookup_user(file, argp);
				6012	}
				6013
				6014	return -ENOTTY;
				6015	}
				6016
				#ifdef CONFIG_COMPAT
				/*
				 * Compat ioctl entry point: translate the three FS_IOC32_* command codes to
				 * their FS_IOC_* equivalents and forward everything to btrfs_ioctl(), with
				 * @arg converted through compat_ptr().
				 */
				long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
				{
					/*
					 * The translated commands all access 32-bit values anyway, so
					 * renaming the command code is the only handling they need.
					 */
					if (cmd == FS_IOC32_GETFLAGS)
						cmd = FS_IOC_GETFLAGS;
					else if (cmd == FS_IOC32_SETFLAGS)
						cmd = FS_IOC_SETFLAGS;
					else if (cmd == FS_IOC32_GETVERSION)
						cmd = FS_IOC_GETVERSION;

					return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
				}
				#endif