Blame - src/kernel/linux/v4.14/fs/btrfs/ioctl.c - T103

blob: 73a0fc60e395ac0358635b6b6dec047b3bd451fb [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* Copyright (C) 2007 Oracle. All rights reserved.
				3	*
				4	* This program is free software; you can redistribute it and/or
				5	* modify it under the terms of the GNU General Public
				6	* License v2 as published by the Free Software Foundation.
				7	*
				8	* This program is distributed in the hope that it will be useful,
				9	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				11	* General Public License for more details.
				12	*
				13	* You should have received a copy of the GNU General Public
				14	* License along with this program; if not, write to the
				15	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				16	* Boston, MA 02111-1307, USA.
				17	*/
				18
				19	#include <linux/kernel.h>
				20	#include <linux/bio.h>
				21	#include <linux/buffer_head.h>
				22	#include <linux/file.h>
				23	#include <linux/fs.h>
				24	#include <linux/fsnotify.h>
				25	#include <linux/pagemap.h>
				26	#include <linux/highmem.h>
				27	#include <linux/time.h>
				28	#include <linux/init.h>
				29	#include <linux/string.h>
				30	#include <linux/backing-dev.h>
				31	#include <linux/mount.h>
				32	#include <linux/mpage.h>
				33	#include <linux/namei.h>
				34	#include <linux/swap.h>
				35	#include <linux/writeback.h>
				36	#include <linux/compat.h>
				37	#include <linux/bit_spinlock.h>
				38	#include <linux/security.h>
				39	#include <linux/xattr.h>
				40	#include <linux/mm.h>
				41	#include <linux/slab.h>
				42	#include <linux/blkdev.h>
				43	#include <linux/uuid.h>
				44	#include <linux/btrfs.h>
				45	#include <linux/uaccess.h>
				46	#include "ctree.h"
				47	#include "disk-io.h"
				48	#include "transaction.h"
				49	#include "btrfs_inode.h"
				50	#include "print-tree.h"
				51	#include "volumes.h"
				52	#include "locking.h"
				53	#include "inode-map.h"
				54	#include "backref.h"
				55	#include "rcu-string.h"
				56	#include "send.h"
				57	#include "dev-replace.h"
				58	#include "props.h"
				59	#include "sysfs.h"
				60	#include "qgroup.h"
				61	#include "tree-log.h"
				62	#include "compression.h"
				63
#ifdef CONFIG_64BIT
/*
 * If we have a 32-bit userspace and 64-bit kernel, then the UAPI
 * structures are incorrect, as the timespec structure from userspace
 * is 4 bytes too small. We define these alternatives here to teach
 * the kernel about the 32-bit struct packing.
 */

/*
 * Packed timespec layout: a 64-bit seconds field followed directly by a
 * 32-bit nanoseconds field, with no trailing padding.
 */
struct btrfs_ioctl_timespec_32 {
	__u64 sec;
	__u32 nsec;
} __attribute__ ((__packed__));

/*
 * Packed argument structure for BTRFS_IOC_SET_RECEIVED_SUBVOL_32. The
 * per-field comments mark which members are inputs and which are outputs.
 */
struct btrfs_ioctl_received_subvol_args_32 {
	char	uuid[BTRFS_UUID_SIZE];	/* in */
	__u64	stransid;		/* in */
	__u64	rtransid;		/* out */
	struct btrfs_ioctl_timespec_32 stime; /* in */
	struct btrfs_ioctl_timespec_32 rtime; /* out */
	__u64	flags;			/* in */
	__u64	reserved[16];		/* in */
} __attribute__ ((__packed__));

/* Read/write ioctl number 37 in the btrfs magic space, sized for the packed struct. */
#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
				struct btrfs_ioctl_received_subvol_args_32)
#endif
				88
				89
				90	static int btrfs_clone(struct inode src, struct inode inode,
				91	u64 off, u64 olen, u64 olen_aligned, u64 destoff,
				92	int no_time_update);
				93
				94	/* Mask out flags that are inappropriate for the given type of inode. */
				95	static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
				96	{
				97	if (S_ISDIR(mode))
				98	return flags;
				99	else if (S_ISREG(mode))
				100	return flags & ~FS_DIRSYNC_FL;
				101	else
				102	return flags & (FS_NODUMP_FL \| FS_NOATIME_FL);
				103	}
				104
				105	/*
				106	* Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl.
				107	*/
				108	static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
				109	{
				110	unsigned int iflags = 0;
				111
				112	if (flags & BTRFS_INODE_SYNC)
				113	iflags \|= FS_SYNC_FL;
				114	if (flags & BTRFS_INODE_IMMUTABLE)
				115	iflags \|= FS_IMMUTABLE_FL;
				116	if (flags & BTRFS_INODE_APPEND)
				117	iflags \|= FS_APPEND_FL;
				118	if (flags & BTRFS_INODE_NODUMP)
				119	iflags \|= FS_NODUMP_FL;
				120	if (flags & BTRFS_INODE_NOATIME)
				121	iflags \|= FS_NOATIME_FL;
				122	if (flags & BTRFS_INODE_DIRSYNC)
				123	iflags \|= FS_DIRSYNC_FL;
				124	if (flags & BTRFS_INODE_NODATACOW)
				125	iflags \|= FS_NOCOW_FL;
				126
				127	if (flags & BTRFS_INODE_NOCOMPRESS)
				128	iflags \|= FS_NOCOMP_FL;
				129	else if (flags & BTRFS_INODE_COMPRESS)
				130	iflags \|= FS_COMPR_FL;
				131
				132	return iflags;
				133	}
				134
				135	/*
				136	* Update inode->i_flags based on the btrfs internal flags.
				137	*/
				138	void btrfs_update_iflags(struct inode *inode)
				139	{
				140	struct btrfs_inode *ip = BTRFS_I(inode);
				141	unsigned int new_fl = 0;
				142
				143	if (ip->flags & BTRFS_INODE_SYNC)
				144	new_fl \|= S_SYNC;
				145	if (ip->flags & BTRFS_INODE_IMMUTABLE)
				146	new_fl \|= S_IMMUTABLE;
				147	if (ip->flags & BTRFS_INODE_APPEND)
				148	new_fl \|= S_APPEND;
				149	if (ip->flags & BTRFS_INODE_NOATIME)
				150	new_fl \|= S_NOATIME;
				151	if (ip->flags & BTRFS_INODE_DIRSYNC)
				152	new_fl \|= S_DIRSYNC;
				153
				154	set_mask_bits(&inode->i_flags,
				155	S_SYNC \| S_APPEND \| S_IMMUTABLE \| S_NOATIME \| S_DIRSYNC,
				156	new_fl);
				157	}
				158
				159	static int btrfs_ioctl_getflags(struct file file, void __user arg)
				160	{
				161	struct btrfs_inode *ip = BTRFS_I(file_inode(file));
				162	unsigned int flags = btrfs_flags_to_ioctl(ip->flags);
				163
				164	if (copy_to_user(arg, &flags, sizeof(flags)))
				165	return -EFAULT;
				166	return 0;
				167	}
				168
				169	static int check_flags(unsigned int flags)
				170	{
				171	if (flags & ~(FS_IMMUTABLE_FL \| FS_APPEND_FL \| \
				172	FS_NOATIME_FL \| FS_NODUMP_FL \| \
				173	FS_SYNC_FL \| FS_DIRSYNC_FL \| \
				174	FS_NOCOMP_FL \| FS_COMPR_FL \|
				175	FS_NOCOW_FL))
				176	return -EOPNOTSUPP;
				177
				178	if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL))
				179	return -EINVAL;
				180
				181	return 0;
				182	}
				183
				184	static int btrfs_ioctl_setflags(struct file file, void __user arg)
				185	{
				186	struct inode *inode = file_inode(file);
				187	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				188	struct btrfs_inode *ip = BTRFS_I(inode);
				189	struct btrfs_root *root = ip->root;
				190	struct btrfs_trans_handle *trans;
				191	unsigned int flags, oldflags;
				192	int ret;
				193	u64 ip_oldflags;
				194	unsigned int i_oldflags;
				195	umode_t mode;
				196
				197	if (!inode_owner_or_capable(inode))
				198	return -EPERM;
				199
				200	if (btrfs_root_readonly(root))
				201	return -EROFS;
				202
				203	if (copy_from_user(&flags, arg, sizeof(flags)))
				204	return -EFAULT;
				205
				206	ret = check_flags(flags);
				207	if (ret)
				208	return ret;
				209
				210	ret = mnt_want_write_file(file);
				211	if (ret)
				212	return ret;
				213
				214	inode_lock(inode);
				215
				216	ip_oldflags = ip->flags;
				217	i_oldflags = inode->i_flags;
				218	mode = inode->i_mode;
				219
				220	flags = btrfs_mask_flags(inode->i_mode, flags);
				221	oldflags = btrfs_flags_to_ioctl(ip->flags);
				222	if ((flags ^ oldflags) & (FS_APPEND_FL \| FS_IMMUTABLE_FL)) {
				223	if (!capable(CAP_LINUX_IMMUTABLE)) {
				224	ret = -EPERM;
				225	goto out_unlock;
				226	}
				227	}
				228
				229	if (flags & FS_SYNC_FL)
				230	ip->flags \|= BTRFS_INODE_SYNC;
				231	else
				232	ip->flags &= ~BTRFS_INODE_SYNC;
				233	if (flags & FS_IMMUTABLE_FL)
				234	ip->flags \|= BTRFS_INODE_IMMUTABLE;
				235	else
				236	ip->flags &= ~BTRFS_INODE_IMMUTABLE;
				237	if (flags & FS_APPEND_FL)
				238	ip->flags \|= BTRFS_INODE_APPEND;
				239	else
				240	ip->flags &= ~BTRFS_INODE_APPEND;
				241	if (flags & FS_NODUMP_FL)
				242	ip->flags \|= BTRFS_INODE_NODUMP;
				243	else
				244	ip->flags &= ~BTRFS_INODE_NODUMP;
				245	if (flags & FS_NOATIME_FL)
				246	ip->flags \|= BTRFS_INODE_NOATIME;
				247	else
				248	ip->flags &= ~BTRFS_INODE_NOATIME;
				249	if (flags & FS_DIRSYNC_FL)
				250	ip->flags \|= BTRFS_INODE_DIRSYNC;
				251	else
				252	ip->flags &= ~BTRFS_INODE_DIRSYNC;
				253	if (flags & FS_NOCOW_FL) {
				254	if (S_ISREG(mode)) {
				255	/*
				256	* It's safe to turn csums off here, no extents exist.
				257	* Otherwise we want the flag to reflect the real COW
				258	* status of the file and will not set it.
				259	*/
				260	if (inode->i_size == 0)
				261	ip->flags \|= BTRFS_INODE_NODATACOW
				262	\| BTRFS_INODE_NODATASUM;
				263	} else {
				264	ip->flags \|= BTRFS_INODE_NODATACOW;
				265	}
				266	} else {
				267	/*
				268	* Revert back under same assumptions as above
				269	*/
				270	if (S_ISREG(mode)) {
				271	if (inode->i_size == 0)
				272	ip->flags &= ~(BTRFS_INODE_NODATACOW
				273	\| BTRFS_INODE_NODATASUM);
				274	} else {
				275	ip->flags &= ~BTRFS_INODE_NODATACOW;
				276	}
				277	}
				278
				279	/*
				280	* The COMPRESS flag can only be changed by users, while the NOCOMPRESS
				281	* flag may be changed automatically if compression code won't make
				282	* things smaller.
				283	*/
				284	if (flags & FS_NOCOMP_FL) {
				285	ip->flags &= ~BTRFS_INODE_COMPRESS;
				286	ip->flags \|= BTRFS_INODE_NOCOMPRESS;
				287
				288	ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
				289	if (ret && ret != -ENODATA)
				290	goto out_drop;
				291	} else if (flags & FS_COMPR_FL) {
				292	const char *comp;
				293
				294	ip->flags \|= BTRFS_INODE_COMPRESS;
				295	ip->flags &= ~BTRFS_INODE_NOCOMPRESS;
				296
				297	if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
				298	comp = "lzo";
				299	else if (fs_info->compress_type == BTRFS_COMPRESS_ZLIB)
				300	comp = "zlib";
				301	else
				302	comp = "zstd";
				303	ret = btrfs_set_prop(inode, "btrfs.compression",
				304	comp, strlen(comp), 0);
				305	if (ret)
				306	goto out_drop;
				307
				308	} else {
				309	ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0);
				310	if (ret && ret != -ENODATA)
				311	goto out_drop;
				312	ip->flags &= ~(BTRFS_INODE_COMPRESS \| BTRFS_INODE_NOCOMPRESS);
				313	}
				314
				315	trans = btrfs_start_transaction(root, 1);
				316	if (IS_ERR(trans)) {
				317	ret = PTR_ERR(trans);
				318	goto out_drop;
				319	}
				320
				321	btrfs_update_iflags(inode);
				322	inode_inc_iversion(inode);
				323	inode->i_ctime = current_time(inode);
				324	ret = btrfs_update_inode(trans, root, inode);
				325
				326	btrfs_end_transaction(trans);
				327	out_drop:
				328	if (ret) {
				329	ip->flags = ip_oldflags;
				330	inode->i_flags = i_oldflags;
				331	}
				332
				333	out_unlock:
				334	inode_unlock(inode);
				335	mnt_drop_write_file(file);
				336	return ret;
				337	}
				338
				339	static int btrfs_ioctl_getversion(struct file file, int __user arg)
				340	{
				341	struct inode *inode = file_inode(file);
				342
				343	return put_user(inode->i_generation, arg);
				344	}
				345
				346	static noinline int btrfs_ioctl_fitrim(struct file file, void __user arg)
				347	{
				348	struct inode *inode = file_inode(file);
				349	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				350	struct btrfs_device *device;
				351	struct request_queue *q;
				352	struct fstrim_range range;
				353	u64 minlen = ULLONG_MAX;
				354	u64 num_devices = 0;
				355	int ret;
				356
				357	if (!capable(CAP_SYS_ADMIN))
				358	return -EPERM;
				359
				360	/*
				361	* If the fs is mounted with nologreplay, which requires it to be
				362	* mounted in RO mode as well, we can not allow discard on free space
				363	* inside block groups, because log trees refer to extents that are not
				364	* pinned in a block group's free space cache (pinning the extents is
				365	* precisely the first phase of replaying a log tree).
				366	*/
				367	if (btrfs_test_opt(fs_info, NOLOGREPLAY))
				368	return -EROFS;
				369
				370	rcu_read_lock();
				371	list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
				372	dev_list) {
				373	if (!device->bdev)
				374	continue;
				375	q = bdev_get_queue(device->bdev);
				376	if (blk_queue_discard(q)) {
				377	num_devices++;
				378	minlen = min_t(u64, q->limits.discard_granularity,
				379	minlen);
				380	}
				381	}
				382	rcu_read_unlock();
				383
				384	if (!num_devices)
				385	return -EOPNOTSUPP;
				386	if (copy_from_user(&range, arg, sizeof(range)))
				387	return -EFAULT;
				388
				389	/*
				390	* NOTE: Don't truncate the range using super->total_bytes. Bytenr of
				391	* block group is in the logical address space, which can be any
				392	* sectorsize aligned bytenr in the range [0, U64_MAX].
				393	*/
				394	if (range.len < fs_info->sb->s_blocksize)
				395	return -EINVAL;
				396
				397	range.minlen = max(range.minlen, minlen);
				398	ret = btrfs_trim_fs(fs_info, &range);
				399	if (ret < 0)
				400	return ret;
				401
				402	if (copy_to_user(arg, &range, sizeof(range)))
				403	return -EFAULT;
				404
				405	return 0;
				406	}
				407
				408	int btrfs_is_empty_uuid(u8 *uuid)
				409	{
				410	int i;
				411
				412	for (i = 0; i < BTRFS_UUID_SIZE; i++) {
				413	if (uuid[i])
				414	return 0;
				415	}
				416	return 1;
				417	}
				418
				419	static noinline int create_subvol(struct inode *dir,
				420	struct dentry *dentry,
				421	const char *name, int namelen,
				422	u64 *async_transid,
				423	struct btrfs_qgroup_inherit *inherit)
				424	{
				425	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
				426	struct btrfs_trans_handle *trans;
				427	struct btrfs_key key;
				428	struct btrfs_root_item *root_item;
				429	struct btrfs_inode_item *inode_item;
				430	struct extent_buffer *leaf;
				431	struct btrfs_root *root = BTRFS_I(dir)->root;
				432	struct btrfs_root *new_root;
				433	struct btrfs_block_rsv block_rsv;
				434	struct timespec cur_time = current_time(dir);
				435	struct inode *inode;
				436	int ret;
				437	int err;
				438	u64 objectid;
				439	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
				440	u64 index = 0;
				441	u64 qgroup_reserved;
				442	uuid_le new_uuid;
				443
				444	root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
				445	if (!root_item)
				446	return -ENOMEM;
				447
				448	ret = btrfs_find_free_objectid(fs_info->tree_root, &objectid);
				449	if (ret)
				450	goto fail_free;
				451
				452	/*
				453	* Don't create subvolume whose level is not zero. Or qgroup will be
				454	* screwed up since it assumes subvolume qgroup's level to be 0.
				455	*/
				456	if (btrfs_qgroup_level(objectid)) {
				457	ret = -ENOSPC;
				458	goto fail_free;
				459	}
				460
				461	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
				462	/*
				463	* The same as the snapshot creation, please see the comment
				464	* of create_snapshot().
				465	*/
				466	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
				467	8, &qgroup_reserved, false);
				468	if (ret)
				469	goto fail_free;
				470
				471	trans = btrfs_start_transaction(root, 0);
				472	if (IS_ERR(trans)) {
				473	ret = PTR_ERR(trans);
				474	btrfs_subvolume_release_metadata(fs_info, &block_rsv);
				475	goto fail_free;
				476	}
				477	trans->block_rsv = &block_rsv;
				478	trans->bytes_reserved = block_rsv.size;
				479
				480	ret = btrfs_qgroup_inherit(trans, fs_info, 0, objectid, inherit);
				481	if (ret)
				482	goto fail;
				483
				484	leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
				485	if (IS_ERR(leaf)) {
				486	ret = PTR_ERR(leaf);
				487	goto fail;
				488	}
				489
				490	memzero_extent_buffer(leaf, 0, sizeof(struct btrfs_header));
				491	btrfs_set_header_bytenr(leaf, leaf->start);
				492	btrfs_set_header_generation(leaf, trans->transid);
				493	btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
				494	btrfs_set_header_owner(leaf, objectid);
				495
				496	write_extent_buffer_fsid(leaf, fs_info->fsid);
				497	write_extent_buffer_chunk_tree_uuid(leaf, fs_info->chunk_tree_uuid);
				498	btrfs_mark_buffer_dirty(leaf);
				499
				500	inode_item = &root_item->inode;
				501	btrfs_set_stack_inode_generation(inode_item, 1);
				502	btrfs_set_stack_inode_size(inode_item, 3);
				503	btrfs_set_stack_inode_nlink(inode_item, 1);
				504	btrfs_set_stack_inode_nbytes(inode_item,
				505	fs_info->nodesize);
				506	btrfs_set_stack_inode_mode(inode_item, S_IFDIR \| 0755);
				507
				508	btrfs_set_root_flags(root_item, 0);
				509	btrfs_set_root_limit(root_item, 0);
				510	btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT);
				511
				512	btrfs_set_root_bytenr(root_item, leaf->start);
				513	btrfs_set_root_generation(root_item, trans->transid);
				514	btrfs_set_root_level(root_item, 0);
				515	btrfs_set_root_refs(root_item, 1);
				516	btrfs_set_root_used(root_item, leaf->len);
				517	btrfs_set_root_last_snapshot(root_item, 0);
				518
				519	btrfs_set_root_generation_v2(root_item,
				520	btrfs_root_generation(root_item));
				521	uuid_le_gen(&new_uuid);
				522	memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
				523	btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec);
				524	btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec);
				525	root_item->ctime = root_item->otime;
				526	btrfs_set_root_ctransid(root_item, trans->transid);
				527	btrfs_set_root_otransid(root_item, trans->transid);
				528
				529	btrfs_tree_unlock(leaf);
				530	free_extent_buffer(leaf);
				531	leaf = NULL;
				532
				533	btrfs_set_root_dirid(root_item, new_dirid);
				534
				535	key.objectid = objectid;
				536	key.offset = 0;
				537	key.type = BTRFS_ROOT_ITEM_KEY;
				538	ret = btrfs_insert_root(trans, fs_info->tree_root, &key,
				539	root_item);
				540	if (ret)
				541	goto fail;
				542
				543	key.offset = (u64)-1;
				544	new_root = btrfs_read_fs_root_no_name(fs_info, &key);
				545	if (IS_ERR(new_root)) {
				546	ret = PTR_ERR(new_root);
				547	btrfs_abort_transaction(trans, ret);
				548	goto fail;
				549	}
				550
				551	btrfs_record_root_in_trans(trans, new_root);
				552
				553	ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid);
				554	if (ret) {
				555	/* We potentially lose an unused inode item here */
				556	btrfs_abort_transaction(trans, ret);
				557	goto fail;
				558	}
				559
				560	mutex_lock(&new_root->objectid_mutex);
				561	new_root->highest_objectid = new_dirid;
				562	mutex_unlock(&new_root->objectid_mutex);
				563
				564	/*
				565	* insert the directory item
				566	*/
				567	ret = btrfs_set_inode_index(BTRFS_I(dir), &index);
				568	if (ret) {
				569	btrfs_abort_transaction(trans, ret);
				570	goto fail;
				571	}
				572
				573	ret = btrfs_insert_dir_item(trans, root,
				574	name, namelen, BTRFS_I(dir), &key,
				575	BTRFS_FT_DIR, index);
				576	if (ret) {
				577	btrfs_abort_transaction(trans, ret);
				578	goto fail;
				579	}
				580
				581	btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2);
				582	ret = btrfs_update_inode(trans, root, dir);
				583	if (ret) {
				584	btrfs_abort_transaction(trans, ret);
				585	goto fail;
				586	}
				587
				588	ret = btrfs_add_root_ref(trans, fs_info,
				589	objectid, root->root_key.objectid,
				590	btrfs_ino(BTRFS_I(dir)), index, name, namelen);
				591	if (ret) {
				592	btrfs_abort_transaction(trans, ret);
				593	goto fail;
				594	}
				595
				596	ret = btrfs_uuid_tree_add(trans, fs_info, root_item->uuid,
				597	BTRFS_UUID_KEY_SUBVOL, objectid);
				598	if (ret)
				599	btrfs_abort_transaction(trans, ret);
				600
				601	fail:
				602	kfree(root_item);
				603	trans->block_rsv = NULL;
				604	trans->bytes_reserved = 0;
				605	btrfs_subvolume_release_metadata(fs_info, &block_rsv);
				606
				607	if (async_transid) {
				608	*async_transid = trans->transid;
				609	err = btrfs_commit_transaction_async(trans, 1);
				610	if (err)
				611	err = btrfs_commit_transaction(trans);
				612	} else {
				613	err = btrfs_commit_transaction(trans);
				614	}
				615	if (err && !ret)
				616	ret = err;
				617
				618	if (!ret) {
				619	inode = btrfs_lookup_dentry(dir, dentry);
				620	if (IS_ERR(inode))
				621	return PTR_ERR(inode);
				622	d_instantiate(dentry, inode);
				623	}
				624	return ret;
				625
				626	fail_free:
				627	kfree(root_item);
				628	return ret;
				629	}
				630
/*
 * Block, uninterruptibly, until the summed root->subv_writers counter
 * reads zero. The task is queued on the subv_writers wait queue before
 * the counter is sampled, and the counter is re-sampled after every
 * schedule().
 */
static void btrfs_wait_for_no_snapshotting_writes(struct btrfs_root *root)
{
	s64 writers;
	DEFINE_WAIT(wait);

	do {
		prepare_to_wait(&root->subv_writers->wait, &wait,
				TASK_UNINTERRUPTIBLE);

		writers = percpu_counter_sum(&root->subv_writers->counter);
		if (writers)
			schedule();

		finish_wait(&root->subv_writers->wait, &wait);
	} while (writers);
}
				647
				648	static int create_snapshot(struct btrfs_root root, struct inode dir,
				649	struct dentry *dentry,
				650	u64 *async_transid, bool readonly,
				651	struct btrfs_qgroup_inherit *inherit)
				652	{
				653	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
				654	struct inode *inode;
				655	struct btrfs_pending_snapshot *pending_snapshot;
				656	struct btrfs_trans_handle *trans;
				657	int ret;
				658
				659	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
				660	return -EINVAL;
				661
				662	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_KERNEL);
				663	if (!pending_snapshot)
				664	return -ENOMEM;
				665
				666	pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
				667	GFP_KERNEL);
				668	pending_snapshot->path = btrfs_alloc_path();
				669	if (!pending_snapshot->root_item \|\| !pending_snapshot->path) {
				670	ret = -ENOMEM;
				671	goto free_pending;
				672	}
				673
				674	atomic_inc(&root->will_be_snapshotted);
				675	smp_mb__after_atomic();
				676	btrfs_wait_for_no_snapshotting_writes(root);
				677
				678	ret = btrfs_start_delalloc_inodes(root, 0);
				679	if (ret)
				680	goto dec_and_free;
				681
				682	btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
				683
				684	btrfs_init_block_rsv(&pending_snapshot->block_rsv,
				685	BTRFS_BLOCK_RSV_TEMP);
				686	/*
				687	* 1 - parent dir inode
				688	* 2 - dir entries
				689	* 1 - root item
				690	* 2 - root ref/backref
				691	* 1 - root of snapshot
				692	* 1 - UUID item
				693	*/
				694	ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
				695	&pending_snapshot->block_rsv, 8,
				696	&pending_snapshot->qgroup_reserved,
				697	false);
				698	if (ret)
				699	goto dec_and_free;
				700
				701	pending_snapshot->dentry = dentry;
				702	pending_snapshot->root = root;
				703	pending_snapshot->readonly = readonly;
				704	pending_snapshot->dir = dir;
				705	pending_snapshot->inherit = inherit;
				706
				707	trans = btrfs_start_transaction(root, 0);
				708	if (IS_ERR(trans)) {
				709	ret = PTR_ERR(trans);
				710	goto fail;
				711	}
				712
				713	spin_lock(&fs_info->trans_lock);
				714	list_add(&pending_snapshot->list,
				715	&trans->transaction->pending_snapshots);
				716	spin_unlock(&fs_info->trans_lock);
				717	if (async_transid) {
				718	*async_transid = trans->transid;
				719	ret = btrfs_commit_transaction_async(trans, 1);
				720	if (ret)
				721	ret = btrfs_commit_transaction(trans);
				722	} else {
				723	ret = btrfs_commit_transaction(trans);
				724	}
				725	if (ret)
				726	goto fail;
				727
				728	ret = pending_snapshot->error;
				729	if (ret)
				730	goto fail;
				731
				732	ret = btrfs_orphan_cleanup(pending_snapshot->snap);
				733	if (ret)
				734	goto fail;
				735
				736	inode = btrfs_lookup_dentry(d_inode(dentry->d_parent), dentry);
				737	if (IS_ERR(inode)) {
				738	ret = PTR_ERR(inode);
				739	goto fail;
				740	}
				741
				742	d_instantiate(dentry, inode);
				743	ret = 0;
				744	fail:
				745	btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
				746	dec_and_free:
				747	if (atomic_dec_and_test(&root->will_be_snapshotted))
				748	wake_up_atomic_t(&root->will_be_snapshotted);
				749	free_pending:
				750	kfree(pending_snapshot->root_item);
				751	btrfs_free_path(pending_snapshot->path);
				752	kfree(pending_snapshot);
				753
				754	return ret;
				755	}
				756
				757	/* copy of may_delete in fs/namei.c()
				758	* Check whether we can remove a link victim from directory dir, check
				759	* whether the type of victim is right.
				760	* 1. We can't do it if dir is read-only (done in permission())
				761	* 2. We should have write and exec permissions on dir
				762	* 3. We can't remove anything from append-only dir
				763	* 4. We can't do anything with immutable dir (done in permission())
				764	* 5. If the sticky bit on dir is set we should either
				765	* a. be owner of dir, or
				766	* b. be owner of victim, or
				767	* c. have CAP_FOWNER capability
				768	* 6. If the victim is append-only or immutable we can't do anything with
				769	* links pointing to it.
				770	* 7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
				771	* 8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
				772	* 9. We can't remove a root or mountpoint.
				773	* 10. We don't allow removal of NFS sillyrenamed files; it's handled by
				774	* nfs_async_unlink().
				775	*/
				776
				777	static int btrfs_may_delete(struct inode dir, struct dentry victim, int isdir)
				778	{
				779	int error;
				780
				781	if (d_really_is_negative(victim))
				782	return -ENOENT;
				783
				784	BUG_ON(d_inode(victim->d_parent) != dir);
				785	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
				786
				787	error = inode_permission(dir, MAY_WRITE \| MAY_EXEC);
				788	if (error)
				789	return error;
				790	if (IS_APPEND(dir))
				791	return -EPERM;
				792	if (check_sticky(dir, d_inode(victim)) \|\| IS_APPEND(d_inode(victim)) \|\|
				793	IS_IMMUTABLE(d_inode(victim)) \|\| IS_SWAPFILE(d_inode(victim)))
				794	return -EPERM;
				795	if (isdir) {
				796	if (!d_is_dir(victim))
				797	return -ENOTDIR;
				798	if (IS_ROOT(victim))
				799	return -EBUSY;
				800	} else if (d_is_dir(victim))
				801	return -EISDIR;
				802	if (IS_DEADDIR(dir))
				803	return -ENOENT;
				804	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
				805	return -EBUSY;
				806	return 0;
				807	}
				808
				809	/* copy of may_create in fs/namei.c() */
				810	static inline int btrfs_may_create(struct inode dir, struct dentry child)
				811	{
				812	if (d_really_is_positive(child))
				813	return -EEXIST;
				814	if (IS_DEADDIR(dir))
				815	return -ENOENT;
				816	return inode_permission(dir, MAY_WRITE \| MAY_EXEC);
				817	}
				818
/*
 * Create a new subvolume below @parent.  This is largely modeled after
 * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
 * inside this filesystem so it's quite a bit simpler.
 *
 * @parent:         path of the directory that receives the new entry.
 * @name, @namelen: name of the new entry.
 * @snap_src:       if non-NULL, the new entry is a snapshot of this root
 *                  (create_snapshot()); otherwise an empty subvolume is
 *                  created (create_subvol()).
 * @async_transid, @readonly, @inherit: passed through to the helpers.
 *
 * Returns 0 or a negative errno.
 */
static noinline int btrfs_mksubvol(const struct path *parent,
				   const char *name, int namelen,
				   struct btrfs_root *snap_src,
				   u64 *async_transid, bool readonly,
				   struct btrfs_qgroup_inherit *inherit)
{
	struct inode *dir = d_inode(parent->dentry);
	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
	struct dentry *dentry;
	int error;

	/* Take the parent directory's rwsem for writing; give up if killed. */
	error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
	if (error == -EINTR)
		return error;

	dentry = lookup_one_len(name, parent->dentry, namelen);
	error = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto out_unlock;

	error = btrfs_may_create(dir, dentry);
	if (error)
		goto out_dput;

	/*
	 * even if this name doesn't exist, we may get hash collisions.
	 * check for them now when we can safely fail
	 */
	error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
					       dir->i_ino, name,
					       namelen);
	if (error)
		goto out_dput;

	down_read(&fs_info->subvol_sem);

	/*
	 * NOTE(review): error is 0 at this point, so when the parent root has
	 * zero refs this returns success without creating anything - confirm
	 * that this is the intended result for callers.
	 */
	if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
		goto out_up_read;

	if (snap_src) {
		error = create_snapshot(snap_src, dir, dentry,
					async_transid, readonly, inherit);
	} else {
		error = create_subvol(dir, dentry, name, namelen,
				      async_transid, inherit);
	}
	if (!error)
		fsnotify_mkdir(dir, dentry);
out_up_read:
	up_read(&fs_info->subvol_sem);
out_dput:
	dput(dentry);
out_unlock:
	inode_unlock(dir);
	return error;
}
				880
				881	/*
				882	* When we're defragging a range, we don't want to kick it off again
				883	* if it is really just waiting for delalloc to send it down.
				884	* If we find a nice big extent or delalloc range for the bytes in the
				885	* file you want to defrag, we return 0 to let you know to skip this
				886	* part of the file
				887	*/
				888	static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh)
				889	{
				890	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
				891	struct extent_map *em = NULL;
				892	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
				893	u64 end;
				894
				895	read_lock(&em_tree->lock);
				896	em = lookup_extent_mapping(em_tree, offset, PAGE_SIZE);
				897	read_unlock(&em_tree->lock);
				898
				899	if (em) {
				900	end = extent_map_end(em);
				901	free_extent_map(em);
				902	if (end - offset > thresh)
				903	return 0;
				904	}
				905	/* if we already have a nice delalloc here, just stop */
				906	thresh /= 2;
				907	end = count_range_bits(io_tree, &offset, offset + thresh,
				908	thresh, EXTENT_DELALLOC, 1);
				909	if (end >= thresh)
				910	return 0;
				911	return 1;
				912	}
				913
/*
 * helper function to walk through a file and find extents
 * newer than a specific transid, and smaller than thresh.
 *
 * This is used by the defragging code to find new and small
 * extents
 *
 * @root:       tree to search.
 * @inode:      file whose extent items are examined.
 * @newer_than: passed to btrfs_search_forward() as the minimum transid.
 * @off:        in: file offset to start from; out: offset of the extent
 *              that was found (only written on success).
 * @thresh:     size limit; only regular extents shorter than this qualify.
 *
 * Returns 0 when a matching extent was found, -ENOMEM if the path could
 * not be allocated, and -ENOENT when nothing matched.
 */
static int find_new_extents(struct btrfs_root *root,
			    struct inode *inode, u64 newer_than,
			    u64 *off, u32 thresh)
{
	struct btrfs_path *path;
	struct btrfs_key min_key;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *extent;
	int type;
	int ret;
	u64 ino = btrfs_ino(BTRFS_I(inode));

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	min_key.objectid = ino;
	min_key.type = BTRFS_EXTENT_DATA_KEY;
	min_key.offset = *off;

	while (1) {
		ret = btrfs_search_forward(root, &min_key, path, newer_than);
		if (ret != 0)
			goto none;
process_slot:
		/* Stop once the key is no longer an extent item of this inode. */
		if (min_key.objectid != ino)
			goto none;
		if (min_key.type != BTRFS_EXTENT_DATA_KEY)
			goto none;

		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_file_extent_item);

		type = btrfs_file_extent_type(leaf, extent);
		if (type == BTRFS_FILE_EXTENT_REG &&
		    btrfs_file_extent_num_bytes(leaf, extent) < thresh &&
		    check_defrag_in_cache(inode, min_key.offset, thresh)) {
			*off = min_key.offset;
			btrfs_free_path(path);
			return 0;
		}

		/* Try the next slot of the same leaf before searching again. */
		path->slots[0]++;
		if (path->slots[0] < btrfs_header_nritems(leaf)) {
			btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
			goto process_slot;
		}

		/* Leaf exhausted: continue the search just past the last key. */
		if (min_key.offset == (u64)-1)
			goto none;

		min_key.offset++;
		btrfs_release_path(path);
	}
none:
	btrfs_free_path(path);
	return -ENOENT;
}
				980
				981	static struct extent_map defrag_lookup_extent(struct inode inode, u64 start)
				982	{
				983	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
				984	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
				985	struct extent_map *em;
				986	u64 len = PAGE_SIZE;
				987
				988	/*
				989	* hopefully we have this extent in the tree already, try without
				990	* the full extent lock
				991	*/
				992	read_lock(&em_tree->lock);
				993	em = lookup_extent_mapping(em_tree, start, len);
				994	read_unlock(&em_tree->lock);
				995
				996	if (!em) {
				997	struct extent_state *cached = NULL;
				998	u64 end = start + len - 1;
				999
				1000	/* get the big lock and read metadata off disk */
				1001	lock_extent_bits(io_tree, start, end, &cached);
				1002	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
				1003	unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
				1004
				1005	if (IS_ERR(em))
				1006	return NULL;
				1007	}
				1008
				1009	return em;
				1010	}
				1011
				1012	static bool defrag_check_next_extent(struct inode inode, struct extent_map em)
				1013	{
				1014	struct extent_map *next;
				1015	bool ret = true;
				1016
				1017	/* this is the last extent */
				1018	if (em->start + em->len >= i_size_read(inode))
				1019	return false;
				1020
				1021	next = defrag_lookup_extent(inode, em->start + em->len);
				1022	if (!next \|\| next->block_start >= EXTENT_MAP_LAST_BYTE)
				1023	ret = false;
				1024	else if ((em->block_start + em->block_len == next->block_start) &&
				1025	(em->block_len > SZ_128K && next->block_len > SZ_128K))
				1026	ret = false;
				1027
				1028	free_extent_map(next);
				1029	return ret;
				1030	}
				1031
				1032	static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
				1033	u64 last_len, u64 skip, u64 *defrag_end,
				1034	int compress)
				1035	{
				1036	struct extent_map *em;
				1037	int ret = 1;
				1038	bool next_mergeable = true;
				1039	bool prev_mergeable = true;
				1040
				1041	/*
				1042	* make sure that once we start defragging an extent, we keep on
				1043	* defragging it
				1044	*/
				1045	if (start < *defrag_end)
				1046	return 1;
				1047
				1048	*skip = 0;
				1049
				1050	em = defrag_lookup_extent(inode, start);
				1051	if (!em)
				1052	return 0;
				1053
				1054	/* this will cover holes, and inline extents */
				1055	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
				1056	ret = 0;
				1057	goto out;
				1058	}
				1059
				1060	if (!*defrag_end)
				1061	prev_mergeable = false;
				1062
				1063	next_mergeable = defrag_check_next_extent(inode, em);
				1064	/*
				1065	* we hit a real extent, if it is big or the next extent is not a
				1066	* real extent, don't bother defragging it
				1067	*/
				1068	if (!compress && (last_len == 0 \|\| last_len >= thresh) &&
				1069	(em->len >= thresh \|\| (!next_mergeable && !prev_mergeable)))
				1070	ret = 0;
				1071	out:
				1072	/*
				1073	* last_len ends up being a counter of how many bytes we've defragged.
				1074	* every time we choose not to defrag an extent, we reset *last_len
				1075	* so that the next tiny extent will force a defrag.
				1076	*
				1077	* The end result of this is that tiny extents before a single big
				1078	* extent will force at least part of that big extent to be defragged.
				1079	*/
				1080	if (ret) {
				1081	*defrag_end = extent_map_end(em);
				1082	} else {
				1083	*last_len = 0;
				1084	*skip = extent_map_end(em);
				1085	*defrag_end = 0;
				1086	}
				1087
				1088	free_extent_map(em);
				1089	return ret;
				1090	}
				1091
				1092	/*
				1093	* it doesn't do much good to defrag one or two pages
				1094	* at a time. This pulls in a nice chunk of pages
				1095	* to COW and defrag.
				1096	*
				1097	* It also makes sure the delalloc code has enough
				1098	* dirty data to avoid making new small extents as part
				1099	* of the defrag
				1100	*
				1101	* It's a good idea to start RA on this range
				1102	* before calling this.
				1103	*/
				1104	static int cluster_pages_for_defrag(struct inode *inode,
				1105	struct page **pages,
				1106	unsigned long start_index,
				1107	unsigned long num_pages)
				1108	{
				1109	unsigned long file_end;
				1110	u64 isize = i_size_read(inode);
				1111	u64 page_start;
				1112	u64 page_end;
				1113	u64 page_cnt;
				1114	int ret;
				1115	int i;
				1116	int i_done;
				1117	struct btrfs_ordered_extent *ordered;
				1118	struct extent_state *cached_state = NULL;
				1119	struct extent_io_tree *tree;
				1120	struct extent_changeset *data_reserved = NULL;
				1121	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
				1122
				1123	file_end = (isize - 1) >> PAGE_SHIFT;
				1124	if (!isize \|\| start_index > file_end)
				1125	return 0;
				1126
				1127	page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
				1128
				1129	ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
				1130	start_index << PAGE_SHIFT,
				1131	page_cnt << PAGE_SHIFT);
				1132	if (ret)
				1133	return ret;
				1134	i_done = 0;
				1135	tree = &BTRFS_I(inode)->io_tree;
				1136
				1137	/* step one, lock all the pages */
				1138	for (i = 0; i < page_cnt; i++) {
				1139	struct page *page;
				1140	again:
				1141	page = find_or_create_page(inode->i_mapping,
				1142	start_index + i, mask);
				1143	if (!page)
				1144	break;
				1145
				1146	page_start = page_offset(page);
				1147	page_end = page_start + PAGE_SIZE - 1;
				1148	while (1) {
				1149	lock_extent_bits(tree, page_start, page_end,
				1150	&cached_state);
				1151	ordered = btrfs_lookup_ordered_extent(inode,
				1152	page_start);
				1153	unlock_extent_cached(tree, page_start, page_end,
				1154	&cached_state, GFP_NOFS);
				1155	if (!ordered)
				1156	break;
				1157
				1158	unlock_page(page);
				1159	btrfs_start_ordered_extent(inode, ordered, 1);
				1160	btrfs_put_ordered_extent(ordered);
				1161	lock_page(page);
				1162	/*
				1163	* we unlocked the page above, so we need check if
				1164	* it was released or not.
				1165	*/
				1166	if (page->mapping != inode->i_mapping) {
				1167	unlock_page(page);
				1168	put_page(page);
				1169	goto again;
				1170	}
				1171	}
				1172
				1173	if (!PageUptodate(page)) {
				1174	btrfs_readpage(NULL, page);
				1175	lock_page(page);
				1176	if (!PageUptodate(page)) {
				1177	unlock_page(page);
				1178	put_page(page);
				1179	ret = -EIO;
				1180	break;
				1181	}
				1182	}
				1183
				1184	if (page->mapping != inode->i_mapping) {
				1185	unlock_page(page);
				1186	put_page(page);
				1187	goto again;
				1188	}
				1189
				1190	pages[i] = page;
				1191	i_done++;
				1192	}
				1193	if (!i_done \|\| ret)
				1194	goto out;
				1195
				1196	if (!(inode->i_sb->s_flags & MS_ACTIVE))
				1197	goto out;
				1198
				1199	/*
				1200	* so now we have a nice long stream of locked
				1201	* and up to date pages, lets wait on them
				1202	*/
				1203	for (i = 0; i < i_done; i++)
				1204	wait_on_page_writeback(pages[i]);
				1205
				1206	page_start = page_offset(pages[0]);
				1207	page_end = page_offset(pages[i_done - 1]) + PAGE_SIZE;
				1208
				1209	lock_extent_bits(&BTRFS_I(inode)->io_tree,
				1210	page_start, page_end - 1, &cached_state);
				1211	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
				1212	page_end - 1, EXTENT_DIRTY \| EXTENT_DELALLOC \|
				1213	EXTENT_DO_ACCOUNTING \| EXTENT_DEFRAG, 0, 0,
				1214	&cached_state, GFP_NOFS);
				1215
				1216	if (i_done != page_cnt) {
				1217	spin_lock(&BTRFS_I(inode)->lock);
				1218	BTRFS_I(inode)->outstanding_extents++;
				1219	spin_unlock(&BTRFS_I(inode)->lock);
				1220	btrfs_delalloc_release_space(inode, data_reserved,
				1221	start_index << PAGE_SHIFT,
				1222	(page_cnt - i_done) << PAGE_SHIFT);
				1223	}
				1224
				1225
				1226	set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
				1227	&cached_state);
				1228
				1229	unlock_extent_cached(&BTRFS_I(inode)->io_tree,
				1230	page_start, page_end - 1, &cached_state,
				1231	GFP_NOFS);
				1232
				1233	for (i = 0; i < i_done; i++) {
				1234	clear_page_dirty_for_io(pages[i]);
				1235	ClearPageChecked(pages[i]);
				1236	set_page_extent_mapped(pages[i]);
				1237	set_page_dirty(pages[i]);
				1238	unlock_page(pages[i]);
				1239	put_page(pages[i]);
				1240	}
				1241	extent_changeset_free(data_reserved);
				1242	return i_done;
				1243	out:
				1244	for (i = 0; i < i_done; i++) {
				1245	unlock_page(pages[i]);
				1246	put_page(pages[i]);
				1247	}
				1248	btrfs_delalloc_release_space(inode, data_reserved,
				1249	start_index << PAGE_SHIFT,
				1250	page_cnt << PAGE_SHIFT);
				1251	extent_changeset_free(data_reserved);
				1252	return ret;
				1253
				1254	}
				1255
				1256	int btrfs_defrag_file(struct inode inode, struct file file,
				1257	struct btrfs_ioctl_defrag_range_args *range,
				1258	u64 newer_than, unsigned long max_to_defrag)
				1259	{
				1260	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				1261	struct btrfs_root *root = BTRFS_I(inode)->root;
				1262	struct file_ra_state *ra = NULL;
				1263	unsigned long last_index;
				1264	u64 isize = i_size_read(inode);
				1265	u64 last_len = 0;
				1266	u64 skip = 0;
				1267	u64 defrag_end = 0;
				1268	u64 newer_off = range->start;
				1269	unsigned long i;
				1270	unsigned long ra_index = 0;
				1271	int ret;
				1272	int defrag_count = 0;
				1273	int compress_type = BTRFS_COMPRESS_ZLIB;
				1274	u32 extent_thresh = range->extent_thresh;
				1275	unsigned long max_cluster = SZ_256K >> PAGE_SHIFT;
				1276	unsigned long cluster = max_cluster;
				1277	u64 new_align = ~((u64)SZ_128K - 1);
				1278	struct page **pages = NULL;
				1279	bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
				1280
				1281	if (isize == 0)
				1282	return 0;
				1283
				1284	if (range->start >= isize)
				1285	return -EINVAL;
				1286
				1287	if (do_compress) {
				1288	if (range->compress_type > BTRFS_COMPRESS_TYPES)
				1289	return -EINVAL;
				1290	if (range->compress_type)
				1291	compress_type = range->compress_type;
				1292	}
				1293
				1294	if (extent_thresh == 0)
				1295	extent_thresh = SZ_256K;
				1296
				1297	/*
				1298	* If we were not given a file, allocate a readahead context. As
				1299	* readahead is just an optimization, defrag will work without it so
				1300	* we don't error out.
				1301	*/
				1302	if (!file) {
				1303	ra = kzalloc(sizeof(*ra), GFP_KERNEL);
				1304	if (ra)
				1305	file_ra_state_init(ra, inode->i_mapping);
				1306	} else {
				1307	ra = &file->f_ra;
				1308	}
				1309
				1310	pages = kmalloc_array(max_cluster, sizeof(struct page *), GFP_KERNEL);
				1311	if (!pages) {
				1312	ret = -ENOMEM;
				1313	goto out_ra;
				1314	}
				1315
				1316	/* find the last page to defrag */
				1317	if (range->start + range->len > range->start) {
				1318	last_index = min_t(u64, isize - 1,
				1319	range->start + range->len - 1) >> PAGE_SHIFT;
				1320	} else {
				1321	last_index = (isize - 1) >> PAGE_SHIFT;
				1322	}
				1323
				1324	if (newer_than) {
				1325	ret = find_new_extents(root, inode, newer_than,
				1326	&newer_off, SZ_64K);
				1327	if (!ret) {
				1328	range->start = newer_off;
				1329	/*
				1330	* we always align our defrag to help keep
				1331	* the extents in the file evenly spaced
				1332	*/
				1333	i = (newer_off & new_align) >> PAGE_SHIFT;
				1334	} else
				1335	goto out_ra;
				1336	} else {
				1337	i = range->start >> PAGE_SHIFT;
				1338	}
				1339	if (!max_to_defrag)
				1340	max_to_defrag = last_index - i + 1;
				1341
				1342	/*
				1343	* make writeback starts from i, so the defrag range can be
				1344	* written sequentially.
				1345	*/
				1346	if (i < inode->i_mapping->writeback_index)
				1347	inode->i_mapping->writeback_index = i;
				1348
				1349	while (i <= last_index && defrag_count < max_to_defrag &&
				1350	(i < DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) {
				1351	/*
				1352	* make sure we stop running if someone unmounts
				1353	* the FS
				1354	*/
				1355	if (!(inode->i_sb->s_flags & MS_ACTIVE))
				1356	break;
				1357
				1358	if (btrfs_defrag_cancelled(fs_info)) {
				1359	btrfs_debug(fs_info, "defrag_file cancelled");
				1360	ret = -EAGAIN;
				1361	break;
				1362	}
				1363
				1364	if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT,
				1365	extent_thresh, &last_len, &skip,
				1366	&defrag_end, do_compress)){
				1367	unsigned long next;
				1368	/*
				1369	* the should_defrag function tells us how much to skip
				1370	* bump our counter by the suggested amount
				1371	*/
				1372	next = DIV_ROUND_UP(skip, PAGE_SIZE);
				1373	i = max(i + 1, next);
				1374	continue;
				1375	}
				1376
				1377	if (!newer_than) {
				1378	cluster = (PAGE_ALIGN(defrag_end) >>
				1379	PAGE_SHIFT) - i;
				1380	cluster = min(cluster, max_cluster);
				1381	} else {
				1382	cluster = max_cluster;
				1383	}
				1384
				1385	if (i + cluster > ra_index) {
				1386	ra_index = max(i, ra_index);
				1387	if (ra)
				1388	page_cache_sync_readahead(inode->i_mapping, ra,
				1389	file, ra_index, cluster);
				1390	ra_index += cluster;
				1391	}
				1392
				1393	inode_lock(inode);
				1394	if (do_compress)
				1395	BTRFS_I(inode)->defrag_compress = compress_type;
				1396	ret = cluster_pages_for_defrag(inode, pages, i, cluster);
				1397	if (ret < 0) {
				1398	inode_unlock(inode);
				1399	goto out_ra;
				1400	}
				1401
				1402	defrag_count += ret;
				1403	balance_dirty_pages_ratelimited(inode->i_mapping);
				1404	inode_unlock(inode);
				1405
				1406	if (newer_than) {
				1407	if (newer_off == (u64)-1)
				1408	break;
				1409
				1410	if (ret > 0)
				1411	i += ret;
				1412
				1413	newer_off = max(newer_off + 1,
				1414	(u64)i << PAGE_SHIFT);
				1415
				1416	ret = find_new_extents(root, inode, newer_than,
				1417	&newer_off, SZ_64K);
				1418	if (!ret) {
				1419	range->start = newer_off;
				1420	i = (newer_off & new_align) >> PAGE_SHIFT;
				1421	} else {
				1422	break;
				1423	}
				1424	} else {
				1425	if (ret > 0) {
				1426	i += ret;
				1427	last_len += ret << PAGE_SHIFT;
				1428	} else {
				1429	i++;
				1430	last_len = 0;
				1431	}
				1432	}
				1433	}
				1434
				1435	if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
				1436	filemap_flush(inode->i_mapping);
				1437	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
				1438	&BTRFS_I(inode)->runtime_flags))
				1439	filemap_flush(inode->i_mapping);
				1440	}
				1441
				1442	if (do_compress) {
				1443	/* the filemap_flush will queue IO into the worker threads, but
				1444	* we have to make sure the IO is actually started and that
				1445	* ordered extents get created before we return
				1446	*/
				1447	atomic_inc(&fs_info->async_submit_draining);
				1448	while (atomic_read(&fs_info->nr_async_submits) \|\|
				1449	atomic_read(&fs_info->async_delalloc_pages)) {
				1450	wait_event(fs_info->async_submit_wait,
				1451	(atomic_read(&fs_info->nr_async_submits) == 0 &&
				1452	atomic_read(&fs_info->async_delalloc_pages) == 0));
				1453	}
				1454	atomic_dec(&fs_info->async_submit_draining);
				1455	}
				1456
				1457	if (range->compress_type == BTRFS_COMPRESS_LZO) {
				1458	btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
				1459	} else if (range->compress_type == BTRFS_COMPRESS_ZSTD) {
				1460	btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
				1461	}
				1462
				1463	ret = defrag_count;
				1464
				1465	out_ra:
				1466	if (do_compress) {
				1467	inode_lock(inode);
				1468	BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
				1469	inode_unlock(inode);
				1470	}
				1471	if (!file)
				1472	kfree(ra);
				1473	kfree(pages);
				1474	return ret;
				1475	}
				1476
				1477	static noinline int btrfs_ioctl_resize(struct file *file,
				1478	void __user *arg)
				1479	{
				1480	struct inode *inode = file_inode(file);
				1481	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				1482	u64 new_size;
				1483	u64 old_size;
				1484	u64 devid = 1;
				1485	struct btrfs_root *root = BTRFS_I(inode)->root;
				1486	struct btrfs_ioctl_vol_args *vol_args;
				1487	struct btrfs_trans_handle *trans;
				1488	struct btrfs_device *device = NULL;
				1489	char *sizestr;
				1490	char *retptr;
				1491	char *devstr = NULL;
				1492	int ret = 0;
				1493	int mod = 0;
				1494
				1495	if (!capable(CAP_SYS_ADMIN))
				1496	return -EPERM;
				1497
				1498	ret = mnt_want_write_file(file);
				1499	if (ret)
				1500	return ret;
				1501
				1502	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
				1503	mnt_drop_write_file(file);
				1504	return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
				1505	}
				1506
				1507	mutex_lock(&fs_info->volume_mutex);
				1508	vol_args = memdup_user(arg, sizeof(*vol_args));
				1509	if (IS_ERR(vol_args)) {
				1510	ret = PTR_ERR(vol_args);
				1511	goto out;
				1512	}
				1513
				1514	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
				1515
				1516	sizestr = vol_args->name;
				1517	devstr = strchr(sizestr, ':');
				1518	if (devstr) {
				1519	sizestr = devstr + 1;
				1520	*devstr = '\0';
				1521	devstr = vol_args->name;
				1522	ret = kstrtoull(devstr, 10, &devid);
				1523	if (ret)
				1524	goto out_free;
				1525	if (!devid) {
				1526	ret = -EINVAL;
				1527	goto out_free;
				1528	}
				1529	btrfs_info(fs_info, "resizing devid %llu", devid);
				1530	}
				1531
				1532	device = btrfs_find_device(fs_info, devid, NULL, NULL);
				1533	if (!device) {
				1534	btrfs_info(fs_info, "resizer unable to find device %llu",
				1535	devid);
				1536	ret = -ENODEV;
				1537	goto out_free;
				1538	}
				1539
				1540	if (!device->writeable) {
				1541	btrfs_info(fs_info,
				1542	"resizer unable to apply on readonly device %llu",
				1543	devid);
				1544	ret = -EPERM;
				1545	goto out_free;
				1546	}
				1547
				1548	if (!strcmp(sizestr, "max"))
				1549	new_size = device->bdev->bd_inode->i_size;
				1550	else {
				1551	if (sizestr[0] == '-') {
				1552	mod = -1;
				1553	sizestr++;
				1554	} else if (sizestr[0] == '+') {
				1555	mod = 1;
				1556	sizestr++;
				1557	}
				1558	new_size = memparse(sizestr, &retptr);
				1559	if (*retptr != '\0' \|\| new_size == 0) {
				1560	ret = -EINVAL;
				1561	goto out_free;
				1562	}
				1563	}
				1564
				1565	if (device->is_tgtdev_for_dev_replace) {
				1566	ret = -EPERM;
				1567	goto out_free;
				1568	}
				1569
				1570	old_size = btrfs_device_get_total_bytes(device);
				1571
				1572	if (mod < 0) {
				1573	if (new_size > old_size) {
				1574	ret = -EINVAL;
				1575	goto out_free;
				1576	}
				1577	new_size = old_size - new_size;
				1578	} else if (mod > 0) {
				1579	if (new_size > ULLONG_MAX - old_size) {
				1580	ret = -ERANGE;
				1581	goto out_free;
				1582	}
				1583	new_size = old_size + new_size;
				1584	}
				1585
				1586	if (new_size < SZ_256M) {
				1587	ret = -EINVAL;
				1588	goto out_free;
				1589	}
				1590	if (new_size > device->bdev->bd_inode->i_size) {
				1591	ret = -EFBIG;
				1592	goto out_free;
				1593	}
				1594
				1595	new_size = round_down(new_size, fs_info->sectorsize);
				1596
				1597	btrfs_info_in_rcu(fs_info, "new size for %s is %llu",
				1598	rcu_str_deref(device->name), new_size);
				1599
				1600	if (new_size > old_size) {
				1601	trans = btrfs_start_transaction(root, 0);
				1602	if (IS_ERR(trans)) {
				1603	ret = PTR_ERR(trans);
				1604	goto out_free;
				1605	}
				1606	ret = btrfs_grow_device(trans, device, new_size);
				1607	btrfs_commit_transaction(trans);
				1608	} else if (new_size < old_size) {
				1609	ret = btrfs_shrink_device(device, new_size);
				1610	} /* equal, nothing need to do */
				1611
				1612	out_free:
				1613	kfree(vol_args);
				1614	out:
				1615	mutex_unlock(&fs_info->volume_mutex);
				1616	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
				1617	mnt_drop_write_file(file);
				1618	return ret;
				1619	}
				1620
				1621	static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
				1622	const char *name, unsigned long fd, int subvol,
				1623	u64 *transid, bool readonly,
				1624	struct btrfs_qgroup_inherit *inherit)
				1625	{
				1626	int namelen;
				1627	int ret = 0;
				1628
				1629	if (!S_ISDIR(file_inode(file)->i_mode))
				1630	return -ENOTDIR;
				1631
				1632	ret = mnt_want_write_file(file);
				1633	if (ret)
				1634	goto out;
				1635
				1636	namelen = strlen(name);
				1637	if (strchr(name, '/')) {
				1638	ret = -EINVAL;
				1639	goto out_drop_write;
				1640	}
				1641
				1642	if (name[0] == '.' &&
				1643	(namelen == 1 \|\| (name[1] == '.' && namelen == 2))) {
				1644	ret = -EEXIST;
				1645	goto out_drop_write;
				1646	}
				1647
				1648	if (subvol) {
				1649	ret = btrfs_mksubvol(&file->f_path, name, namelen,
				1650	NULL, transid, readonly, inherit);
				1651	} else {
				1652	struct fd src = fdget(fd);
				1653	struct inode *src_inode;
				1654	if (!src.file) {
				1655	ret = -EINVAL;
				1656	goto out_drop_write;
				1657	}
				1658
				1659	src_inode = file_inode(src.file);
				1660	if (src_inode->i_sb != file_inode(file)->i_sb) {
				1661	btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
				1662	"Snapshot src from another FS");
				1663	ret = -EXDEV;
				1664	} else if (!inode_owner_or_capable(src_inode)) {
				1665	/*
				1666	* Subvolume creation is not restricted, but snapshots
				1667	* are limited to own subvolumes only
				1668	*/
				1669	ret = -EPERM;
				1670	} else {
				1671	ret = btrfs_mksubvol(&file->f_path, name, namelen,
				1672	BTRFS_I(src_inode)->root,
				1673	transid, readonly, inherit);
				1674	}
				1675	fdput(src);
				1676	}
				1677	out_drop_write:
				1678	mnt_drop_write_file(file);
				1679	out:
				1680	return ret;
				1681	}
				1682
				1683	static noinline int btrfs_ioctl_snap_create(struct file *file,
				1684	void __user *arg, int subvol)
				1685	{
				1686	struct btrfs_ioctl_vol_args *vol_args;
				1687	int ret;
				1688
				1689	if (!S_ISDIR(file_inode(file)->i_mode))
				1690	return -ENOTDIR;
				1691
				1692	vol_args = memdup_user(arg, sizeof(*vol_args));
				1693	if (IS_ERR(vol_args))
				1694	return PTR_ERR(vol_args);
				1695	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
				1696
				1697	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
				1698	vol_args->fd, subvol,
				1699	NULL, false, NULL);
				1700
				1701	kfree(vol_args);
				1702	return ret;
				1703	}
				1704
				1705	static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
				1706	void __user *arg, int subvol)
				1707	{
				1708	struct btrfs_ioctl_vol_args_v2 *vol_args;
				1709	int ret;
				1710	u64 transid = 0;
				1711	u64 *ptr = NULL;
				1712	bool readonly = false;
				1713	struct btrfs_qgroup_inherit *inherit = NULL;
				1714
				1715	if (!S_ISDIR(file_inode(file)->i_mode))
				1716	return -ENOTDIR;
				1717
				1718	vol_args = memdup_user(arg, sizeof(*vol_args));
				1719	if (IS_ERR(vol_args))
				1720	return PTR_ERR(vol_args);
				1721	vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
				1722
				1723	if (vol_args->flags &
				1724	~(BTRFS_SUBVOL_CREATE_ASYNC \| BTRFS_SUBVOL_RDONLY \|
				1725	BTRFS_SUBVOL_QGROUP_INHERIT)) {
				1726	ret = -EOPNOTSUPP;
				1727	goto free_args;
				1728	}
				1729
				1730	if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
				1731	ptr = &transid;
				1732	if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
				1733	readonly = true;
				1734	if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
				1735	if (vol_args->size > PAGE_SIZE) {
				1736	ret = -EINVAL;
				1737	goto free_args;
				1738	}
				1739	inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
				1740	if (IS_ERR(inherit)) {
				1741	ret = PTR_ERR(inherit);
				1742	goto free_args;
				1743	}
				1744	}
				1745
				1746	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
				1747	vol_args->fd, subvol, ptr,
				1748	readonly, inherit);
				1749	if (ret)
				1750	goto free_inherit;
				1751
				1752	if (ptr && copy_to_user(arg +
				1753	offsetof(struct btrfs_ioctl_vol_args_v2,
				1754	transid),
				1755	ptr, sizeof(*ptr)))
				1756	ret = -EFAULT;
				1757
				1758	free_inherit:
				1759	kfree(inherit);
				1760	free_args:
				1761	kfree(vol_args);
				1762	return ret;
				1763	}
				1764
				1765	static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
				1766	void __user *arg)
				1767	{
				1768	struct inode *inode = file_inode(file);
				1769	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				1770	struct btrfs_root *root = BTRFS_I(inode)->root;
				1771	int ret = 0;
				1772	u64 flags = 0;
				1773
				1774	if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID)
				1775	return -EINVAL;
				1776
				1777	down_read(&fs_info->subvol_sem);
				1778	if (btrfs_root_readonly(root))
				1779	flags \|= BTRFS_SUBVOL_RDONLY;
				1780	up_read(&fs_info->subvol_sem);
				1781
				1782	if (copy_to_user(arg, &flags, sizeof(flags)))
				1783	ret = -EFAULT;
				1784
				1785	return ret;
				1786	}
				1787
				1788	static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
				1789	void __user *arg)
				1790	{
				1791	struct inode *inode = file_inode(file);
				1792	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				1793	struct btrfs_root *root = BTRFS_I(inode)->root;
				1794	struct btrfs_trans_handle *trans;
				1795	u64 root_flags;
				1796	u64 flags;
				1797	int ret = 0;
				1798
				1799	if (!inode_owner_or_capable(inode))
				1800	return -EPERM;
				1801
				1802	ret = mnt_want_write_file(file);
				1803	if (ret)
				1804	goto out;
				1805
				1806	if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
				1807	ret = -EINVAL;
				1808	goto out_drop_write;
				1809	}
				1810
				1811	if (copy_from_user(&flags, arg, sizeof(flags))) {
				1812	ret = -EFAULT;
				1813	goto out_drop_write;
				1814	}
				1815
				1816	if (flags & BTRFS_SUBVOL_CREATE_ASYNC) {
				1817	ret = -EINVAL;
				1818	goto out_drop_write;
				1819	}
				1820
				1821	if (flags & ~BTRFS_SUBVOL_RDONLY) {
				1822	ret = -EOPNOTSUPP;
				1823	goto out_drop_write;
				1824	}
				1825
				1826	down_write(&fs_info->subvol_sem);
				1827
				1828	/* nothing to do */
				1829	if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
				1830	goto out_drop_sem;
				1831
				1832	root_flags = btrfs_root_flags(&root->root_item);
				1833	if (flags & BTRFS_SUBVOL_RDONLY) {
				1834	btrfs_set_root_flags(&root->root_item,
				1835	root_flags \| BTRFS_ROOT_SUBVOL_RDONLY);
				1836	} else {
				1837	/*
				1838	* Block RO -> RW transition if this subvolume is involved in
				1839	* send
				1840	*/
				1841	spin_lock(&root->root_item_lock);
				1842	if (root->send_in_progress == 0) {
				1843	btrfs_set_root_flags(&root->root_item,
				1844	root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
				1845	spin_unlock(&root->root_item_lock);
				1846	} else {
				1847	spin_unlock(&root->root_item_lock);
				1848	btrfs_warn(fs_info,
				1849	"Attempt to set subvolume %llu read-write during send",
				1850	root->root_key.objectid);
				1851	ret = -EPERM;
				1852	goto out_drop_sem;
				1853	}
				1854	}
				1855
				1856	trans = btrfs_start_transaction(root, 1);
				1857	if (IS_ERR(trans)) {
				1858	ret = PTR_ERR(trans);
				1859	goto out_reset;
				1860	}
				1861
				1862	ret = btrfs_update_root(trans, fs_info->tree_root,
				1863	&root->root_key, &root->root_item);
				1864	if (ret < 0) {
				1865	btrfs_end_transaction(trans);
				1866	goto out_reset;
				1867	}
				1868
				1869	ret = btrfs_commit_transaction(trans);
				1870
				1871	out_reset:
				1872	if (ret)
				1873	btrfs_set_root_flags(&root->root_item, root_flags);
				1874	out_drop_sem:
				1875	up_write(&fs_info->subvol_sem);
				1876	out_drop_write:
				1877	mnt_drop_write_file(file);
				1878	out:
				1879	return ret;
				1880	}
				1881
				1882	/*
				1883	* helper to check if the subvolume references other subvolumes
				1884	*/
				1885	static noinline int may_destroy_subvol(struct btrfs_root *root)
				1886	{
				1887	struct btrfs_fs_info *fs_info = root->fs_info;
				1888	struct btrfs_path *path;
				1889	struct btrfs_dir_item *di;
				1890	struct btrfs_key key;
				1891	u64 dir_id;
				1892	int ret;
				1893
				1894	path = btrfs_alloc_path();
				1895	if (!path)
				1896	return -ENOMEM;
				1897
				1898	/* Make sure this root isn't set as the default subvol */
				1899	dir_id = btrfs_super_root_dir(fs_info->super_copy);
				1900	di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
				1901	dir_id, "default", 7, 0);
				1902	if (di && !IS_ERR(di)) {
				1903	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
				1904	if (key.objectid == root->root_key.objectid) {
				1905	ret = -EPERM;
				1906	btrfs_err(fs_info,
				1907	"deleting default subvolume %llu is not allowed",
				1908	key.objectid);
				1909	goto out;
				1910	}
				1911	btrfs_release_path(path);
				1912	}
				1913
				1914	key.objectid = root->root_key.objectid;
				1915	key.type = BTRFS_ROOT_REF_KEY;
				1916	key.offset = (u64)-1;
				1917
				1918	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
				1919	if (ret < 0)
				1920	goto out;
				1921	BUG_ON(ret == 0);
				1922
				1923	ret = 0;
				1924	if (path->slots[0] > 0) {
				1925	path->slots[0]--;
				1926	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				1927	if (key.objectid == root->root_key.objectid &&
				1928	key.type == BTRFS_ROOT_REF_KEY)
				1929	ret = -ENOTEMPTY;
				1930	}
				1931	out:
				1932	btrfs_free_path(path);
				1933	return ret;
				1934	}
				1935
				1936	static noinline int key_in_sk(struct btrfs_key *key,
				1937	struct btrfs_ioctl_search_key *sk)
				1938	{
				1939	struct btrfs_key test;
				1940	int ret;
				1941
				1942	test.objectid = sk->min_objectid;
				1943	test.type = sk->min_type;
				1944	test.offset = sk->min_offset;
				1945
				1946	ret = btrfs_comp_cpu_keys(key, &test);
				1947	if (ret < 0)
				1948	return 0;
				1949
				1950	test.objectid = sk->max_objectid;
				1951	test.type = sk->max_type;
				1952	test.offset = sk->max_offset;
				1953
				1954	ret = btrfs_comp_cpu_keys(key, &test);
				1955	if (ret > 0)
				1956	return 0;
				1957	return 1;
				1958	}
				1959
				1960	static noinline int copy_to_sk(struct btrfs_path *path,
				1961	struct btrfs_key *key,
				1962	struct btrfs_ioctl_search_key *sk,
				1963	size_t *buf_size,
				1964	char __user *ubuf,
				1965	unsigned long *sk_offset,
				1966	int *num_found)
				1967	{
				1968	u64 found_transid;
				1969	struct extent_buffer *leaf;
				1970	struct btrfs_ioctl_search_header sh;
				1971	struct btrfs_key test;
				1972	unsigned long item_off;
				1973	unsigned long item_len;
				1974	int nritems;
				1975	int i;
				1976	int slot;
				1977	int ret = 0;
				1978
				1979	leaf = path->nodes[0];
				1980	slot = path->slots[0];
				1981	nritems = btrfs_header_nritems(leaf);
				1982
				1983	if (btrfs_header_generation(leaf) > sk->max_transid) {
				1984	i = nritems;
				1985	goto advance_key;
				1986	}
				1987	found_transid = btrfs_header_generation(leaf);
				1988
				1989	for (i = slot; i < nritems; i++) {
				1990	item_off = btrfs_item_ptr_offset(leaf, i);
				1991	item_len = btrfs_item_size_nr(leaf, i);
				1992
				1993	btrfs_item_key_to_cpu(leaf, key, i);
				1994	if (!key_in_sk(key, sk))
				1995	continue;
				1996
				1997	if (sizeof(sh) + item_len > *buf_size) {
				1998	if (*num_found) {
				1999	ret = 1;
				2000	goto out;
				2001	}
				2002
				2003	/*
				2004	* return one empty item back for v1, which does not
				2005	* handle -EOVERFLOW
				2006	*/
				2007
				2008	*buf_size = sizeof(sh) + item_len;
				2009	item_len = 0;
				2010	ret = -EOVERFLOW;
				2011	}
				2012
				2013	if (sizeof(sh) + item_len + sk_offset > buf_size) {
				2014	ret = 1;
				2015	goto out;
				2016	}
				2017
				2018	sh.objectid = key->objectid;
				2019	sh.offset = key->offset;
				2020	sh.type = key->type;
				2021	sh.len = item_len;
				2022	sh.transid = found_transid;
				2023
				2024	/*
				2025	* Copy search result header. If we fault then loop again so we
				2026	* can fault in the pages and -EFAULT there if there's a
				2027	* problem. Otherwise we'll fault and then copy the buffer in
				2028	* properly this next time through
				2029	*/
				2030	if (probe_user_write(ubuf + *sk_offset, &sh, sizeof(sh))) {
				2031	ret = 0;
				2032	goto out;
				2033	}
				2034
				2035	*sk_offset += sizeof(sh);
				2036
				2037	if (item_len) {
				2038	char __user up = ubuf + sk_offset;
				2039	/*
				2040	* Copy the item, same behavior as above, but reset the
				2041	* * sk_offset so we copy the full thing again.
				2042	*/
				2043	if (read_extent_buffer_to_user_nofault(leaf, up,
				2044	item_off, item_len)) {
				2045	ret = 0;
				2046	*sk_offset -= sizeof(sh);
				2047	goto out;
				2048	}
				2049
				2050	*sk_offset += item_len;
				2051	}
				2052	(*num_found)++;
				2053
				2054	if (ret) /* -EOVERFLOW from above */
				2055	goto out;
				2056
				2057	if (*num_found >= sk->nr_items) {
				2058	ret = 1;
				2059	goto out;
				2060	}
				2061	}
				2062	advance_key:
				2063	ret = 0;
				2064	test.objectid = sk->max_objectid;
				2065	test.type = sk->max_type;
				2066	test.offset = sk->max_offset;
				2067	if (btrfs_comp_cpu_keys(key, &test) >= 0)
				2068	ret = 1;
				2069	else if (key->offset < (u64)-1)
				2070	key->offset++;
				2071	else if (key->type < (u8)-1) {
				2072	key->offset = 0;
				2073	key->type++;
				2074	} else if (key->objectid < (u64)-1) {
				2075	key->offset = 0;
				2076	key->type = 0;
				2077	key->objectid++;
				2078	} else
				2079	ret = 1;
				2080	out:
				2081	/*
				2082	* 0: all items from this leaf copied, continue with next
				2083	* 1: * more items can be copied, but unused buffer is too small
				2084	* * all items were found
				2085	* Either way, it will stops the loop which iterates to the next
				2086	* leaf
				2087	* -EOVERFLOW: item was to large for buffer
				2088	* -EFAULT: could not copy extent buffer back to userspace
				2089	*/
				2090	return ret;
				2091	}
				2092
				2093	static noinline int search_ioctl(struct inode *inode,
				2094	struct btrfs_ioctl_search_key *sk,
				2095	size_t *buf_size,
				2096	char __user *ubuf)
				2097	{
				2098	struct btrfs_fs_info *info = btrfs_sb(inode->i_sb);
				2099	struct btrfs_root *root;
				2100	struct btrfs_key key;
				2101	struct btrfs_path *path;
				2102	int ret;
				2103	int num_found = 0;
				2104	unsigned long sk_offset = 0;
				2105
				2106	if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) {
				2107	*buf_size = sizeof(struct btrfs_ioctl_search_header);
				2108	return -EOVERFLOW;
				2109	}
				2110
				2111	path = btrfs_alloc_path();
				2112	if (!path)
				2113	return -ENOMEM;
				2114
				2115	if (sk->tree_id == 0) {
				2116	/* search the root of the inode that was passed */
				2117	root = BTRFS_I(inode)->root;
				2118	} else {
				2119	key.objectid = sk->tree_id;
				2120	key.type = BTRFS_ROOT_ITEM_KEY;
				2121	key.offset = (u64)-1;
				2122	root = btrfs_read_fs_root_no_name(info, &key);
				2123	if (IS_ERR(root)) {
				2124	btrfs_free_path(path);
				2125	return -ENOENT;
				2126	}
				2127	}
				2128
				2129	key.objectid = sk->min_objectid;
				2130	key.type = sk->min_type;
				2131	key.offset = sk->min_offset;
				2132
				2133	while (1) {
				2134	ret = fault_in_pages_writeable(ubuf + sk_offset,
				2135	*buf_size - sk_offset);
				2136	if (ret)
				2137	break;
				2138
				2139	ret = btrfs_search_forward(root, &key, path, sk->min_transid);
				2140	if (ret != 0) {
				2141	if (ret > 0)
				2142	ret = 0;
				2143	goto err;
				2144	}
				2145	ret = copy_to_sk(path, &key, sk, buf_size, ubuf,
				2146	&sk_offset, &num_found);
				2147	btrfs_release_path(path);
				2148	if (ret)
				2149	break;
				2150
				2151	}
				2152	if (ret > 0)
				2153	ret = 0;
				2154	err:
				2155	sk->nr_items = num_found;
				2156	btrfs_free_path(path);
				2157	return ret;
				2158	}
				2159
				2160	static noinline int btrfs_ioctl_tree_search(struct file *file,
				2161	void __user *argp)
				2162	{
				2163	struct btrfs_ioctl_search_args __user *uargs;
				2164	struct btrfs_ioctl_search_key sk;
				2165	struct inode *inode;
				2166	int ret;
				2167	size_t buf_size;
				2168
				2169	if (!capable(CAP_SYS_ADMIN))
				2170	return -EPERM;
				2171
				2172	uargs = (struct btrfs_ioctl_search_args __user *)argp;
				2173
				2174	if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
				2175	return -EFAULT;
				2176
				2177	buf_size = sizeof(uargs->buf);
				2178
				2179	inode = file_inode(file);
				2180	ret = search_ioctl(inode, &sk, &buf_size, uargs->buf);
				2181
				2182	/*
				2183	* In the origin implementation an overflow is handled by returning a
				2184	* search header with a len of zero, so reset ret.
				2185	*/
				2186	if (ret == -EOVERFLOW)
				2187	ret = 0;
				2188
				2189	if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk)))
				2190	ret = -EFAULT;
				2191	return ret;
				2192	}
				2193
				2194	static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
				2195	void __user *argp)
				2196	{
				2197	struct btrfs_ioctl_search_args_v2 __user *uarg;
				2198	struct btrfs_ioctl_search_args_v2 args;
				2199	struct inode *inode;
				2200	int ret;
				2201	size_t buf_size;
				2202	const size_t buf_limit = SZ_16M;
				2203
				2204	if (!capable(CAP_SYS_ADMIN))
				2205	return -EPERM;
				2206
				2207	/* copy search header and buffer size */
				2208	uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp;
				2209	if (copy_from_user(&args, uarg, sizeof(args)))
				2210	return -EFAULT;
				2211
				2212	buf_size = args.buf_size;
				2213
				2214	/* limit result size to 16MB */
				2215	if (buf_size > buf_limit)
				2216	buf_size = buf_limit;
				2217
				2218	inode = file_inode(file);
				2219	ret = search_ioctl(inode, &args.key, &buf_size,
				2220	(char *)(&uarg->buf[0]));
				2221	if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
				2222	ret = -EFAULT;
				2223	else if (ret == -EOVERFLOW &&
				2224	copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size)))
				2225	ret = -EFAULT;
				2226
				2227	return ret;
				2228	}
				2229
				2230	/*
				2231	* Search INODE_REFs to identify path name of 'dirid' directory
				2232	* in a 'tree_id' tree. and sets path name to 'name'.
				2233	*/
				2234	static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
				2235	u64 tree_id, u64 dirid, char *name)
				2236	{
				2237	struct btrfs_root *root;
				2238	struct btrfs_key key;
				2239	char *ptr;
				2240	int ret = -1;
				2241	int slot;
				2242	int len;
				2243	int total_len = 0;
				2244	struct btrfs_inode_ref *iref;
				2245	struct extent_buffer *l;
				2246	struct btrfs_path *path;
				2247
				2248	if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
				2249	name[0]='\0';
				2250	return 0;
				2251	}
				2252
				2253	path = btrfs_alloc_path();
				2254	if (!path)
				2255	return -ENOMEM;
				2256
				2257	ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX - 1];
				2258
				2259	key.objectid = tree_id;
				2260	key.type = BTRFS_ROOT_ITEM_KEY;
				2261	key.offset = (u64)-1;
				2262	root = btrfs_read_fs_root_no_name(info, &key);
				2263	if (IS_ERR(root)) {
				2264	btrfs_err(info, "could not find root %llu", tree_id);
				2265	ret = -ENOENT;
				2266	goto out;
				2267	}
				2268
				2269	key.objectid = dirid;
				2270	key.type = BTRFS_INODE_REF_KEY;
				2271	key.offset = (u64)-1;
				2272
				2273	while (1) {
				2274	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				2275	if (ret < 0)
				2276	goto out;
				2277	else if (ret > 0) {
				2278	ret = btrfs_previous_item(root, path, dirid,
				2279	BTRFS_INODE_REF_KEY);
				2280	if (ret < 0)
				2281	goto out;
				2282	else if (ret > 0) {
				2283	ret = -ENOENT;
				2284	goto out;
				2285	}
				2286	}
				2287
				2288	l = path->nodes[0];
				2289	slot = path->slots[0];
				2290	btrfs_item_key_to_cpu(l, &key, slot);
				2291
				2292	iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
				2293	len = btrfs_inode_ref_name_len(l, iref);
				2294	ptr -= len + 1;
				2295	total_len += len + 1;
				2296	if (ptr < name) {
				2297	ret = -ENAMETOOLONG;
				2298	goto out;
				2299	}
				2300
				2301	*(ptr + len) = '/';
				2302	read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len);
				2303
				2304	if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
				2305	break;
				2306
				2307	btrfs_release_path(path);
				2308	key.objectid = key.offset;
				2309	key.offset = (u64)-1;
				2310	dirid = key.objectid;
				2311	}
				2312	memmove(name, ptr, total_len);
				2313	name[total_len] = '\0';
				2314	ret = 0;
				2315	out:
				2316	btrfs_free_path(path);
				2317	return ret;
				2318	}
				2319
				2320	static noinline int btrfs_ioctl_ino_lookup(struct file *file,
				2321	void __user *argp)
				2322	{
				2323	struct btrfs_ioctl_ino_lookup_args *args;
				2324	struct inode *inode;
				2325	int ret = 0;
				2326
				2327	args = memdup_user(argp, sizeof(*args));
				2328	if (IS_ERR(args))
				2329	return PTR_ERR(args);
				2330
				2331	inode = file_inode(file);
				2332
				2333	/*
				2334	* Unprivileged query to obtain the containing subvolume root id. The
				2335	* path is reset so it's consistent with btrfs_search_path_in_tree.
				2336	*/
				2337	if (args->treeid == 0)
				2338	args->treeid = BTRFS_I(inode)->root->root_key.objectid;
				2339
				2340	if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
				2341	args->name[0] = 0;
				2342	goto out;
				2343	}
				2344
				2345	if (!capable(CAP_SYS_ADMIN)) {
				2346	ret = -EPERM;
				2347	goto out;
				2348	}
				2349
				2350	ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
				2351	args->treeid, args->objectid,
				2352	args->name);
				2353
				2354	out:
				2355	if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
				2356	ret = -EFAULT;
				2357
				2358	kfree(args);
				2359	return ret;
				2360	}
				2361
				2362	static noinline int btrfs_ioctl_snap_destroy(struct file *file,
				2363	void __user *arg)
				2364	{
				2365	struct dentry *parent = file->f_path.dentry;
				2366	struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb);
				2367	struct dentry *dentry;
				2368	struct inode *dir = d_inode(parent);
				2369	struct inode *inode;
				2370	struct btrfs_root *root = BTRFS_I(dir)->root;
				2371	struct btrfs_root *dest = NULL;
				2372	struct btrfs_ioctl_vol_args *vol_args;
				2373	struct btrfs_trans_handle *trans;
				2374	struct btrfs_block_rsv block_rsv;
				2375	u64 root_flags;
				2376	u64 qgroup_reserved;
				2377	int namelen;
				2378	int ret;
				2379	int err = 0;
				2380
				2381	if (!S_ISDIR(dir->i_mode))
				2382	return -ENOTDIR;
				2383
				2384	vol_args = memdup_user(arg, sizeof(*vol_args));
				2385	if (IS_ERR(vol_args))
				2386	return PTR_ERR(vol_args);
				2387
				2388	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
				2389	namelen = strlen(vol_args->name);
				2390	if (strchr(vol_args->name, '/') \|\|
				2391	strncmp(vol_args->name, "..", namelen) == 0) {
				2392	err = -EINVAL;
				2393	goto out;
				2394	}
				2395
				2396	err = mnt_want_write_file(file);
				2397	if (err)
				2398	goto out;
				2399
				2400
				2401	err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
				2402	if (err == -EINTR)
				2403	goto out_drop_write;
				2404	dentry = lookup_one_len(vol_args->name, parent, namelen);
				2405	if (IS_ERR(dentry)) {
				2406	err = PTR_ERR(dentry);
				2407	goto out_unlock_dir;
				2408	}
				2409
				2410	if (d_really_is_negative(dentry)) {
				2411	err = -ENOENT;
				2412	goto out_dput;
				2413	}
				2414
				2415	inode = d_inode(dentry);
				2416	dest = BTRFS_I(inode)->root;
				2417	if (!capable(CAP_SYS_ADMIN)) {
				2418	/*
				2419	* Regular user. Only allow this with a special mount
				2420	* option, when the user has write+exec access to the
				2421	* subvol root, and when rmdir(2) would have been
				2422	* allowed.
				2423	*
				2424	* Note that this is _not_ check that the subvol is
				2425	* empty or doesn't contain data that we wouldn't
				2426	* otherwise be able to delete.
				2427	*
				2428	* Users who want to delete empty subvols should try
				2429	* rmdir(2).
				2430	*/
				2431	err = -EPERM;
				2432	if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED))
				2433	goto out_dput;
				2434
				2435	/*
				2436	* Do not allow deletion if the parent dir is the same
				2437	* as the dir to be deleted. That means the ioctl
				2438	* must be called on the dentry referencing the root
				2439	* of the subvol, not a random directory contained
				2440	* within it.
				2441	*/
				2442	err = -EINVAL;
				2443	if (root == dest)
				2444	goto out_dput;
				2445
				2446	err = inode_permission(inode, MAY_WRITE \| MAY_EXEC);
				2447	if (err)
				2448	goto out_dput;
				2449	}
				2450
				2451	/* check if subvolume may be deleted by a user */
				2452	err = btrfs_may_delete(dir, dentry, 1);
				2453	if (err)
				2454	goto out_dput;
				2455
				2456	if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
				2457	err = -EINVAL;
				2458	goto out_dput;
				2459	}
				2460
				2461	inode_lock(inode);
				2462
				2463	/*
				2464	* Don't allow to delete a subvolume with send in progress. This is
				2465	* inside the i_mutex so the error handling that has to drop the bit
				2466	* again is not run concurrently.
				2467	*/
				2468	spin_lock(&dest->root_item_lock);
				2469	root_flags = btrfs_root_flags(&dest->root_item);
				2470	if (dest->send_in_progress == 0) {
				2471	btrfs_set_root_flags(&dest->root_item,
				2472	root_flags \| BTRFS_ROOT_SUBVOL_DEAD);
				2473	spin_unlock(&dest->root_item_lock);
				2474	} else {
				2475	spin_unlock(&dest->root_item_lock);
				2476	btrfs_warn(fs_info,
				2477	"Attempt to delete subvolume %llu during send",
				2478	dest->root_key.objectid);
				2479	err = -EPERM;
				2480	goto out_unlock_inode;
				2481	}
				2482
				2483	down_write(&fs_info->subvol_sem);
				2484
				2485	err = may_destroy_subvol(dest);
				2486	if (err)
				2487	goto out_up_write;
				2488
				2489	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
				2490	/*
				2491	* One for dir inode, two for dir entries, two for root
				2492	* ref/backref.
				2493	*/
				2494	err = btrfs_subvolume_reserve_metadata(root, &block_rsv,
				2495	5, &qgroup_reserved, true);
				2496	if (err)
				2497	goto out_up_write;
				2498
				2499	trans = btrfs_start_transaction(root, 0);
				2500	if (IS_ERR(trans)) {
				2501	err = PTR_ERR(trans);
				2502	goto out_release;
				2503	}
				2504	trans->block_rsv = &block_rsv;
				2505	trans->bytes_reserved = block_rsv.size;
				2506
				2507	btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
				2508
				2509	ret = btrfs_unlink_subvol(trans, root, dir,
				2510	dest->root_key.objectid,
				2511	dentry->d_name.name,
				2512	dentry->d_name.len);
				2513	if (ret) {
				2514	err = ret;
				2515	btrfs_abort_transaction(trans, ret);
				2516	goto out_end_trans;
				2517	}
				2518
				2519	btrfs_record_root_in_trans(trans, dest);
				2520
				2521	memset(&dest->root_item.drop_progress, 0,
				2522	sizeof(dest->root_item.drop_progress));
				2523	dest->root_item.drop_level = 0;
				2524	btrfs_set_root_refs(&dest->root_item, 0);
				2525
				2526	if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
				2527	ret = btrfs_insert_orphan_item(trans,
				2528	fs_info->tree_root,
				2529	dest->root_key.objectid);
				2530	if (ret) {
				2531	btrfs_abort_transaction(trans, ret);
				2532	err = ret;
				2533	goto out_end_trans;
				2534	}
				2535	}
				2536
				2537	ret = btrfs_uuid_tree_rem(trans, fs_info, dest->root_item.uuid,
				2538	BTRFS_UUID_KEY_SUBVOL,
				2539	dest->root_key.objectid);
				2540	if (ret && ret != -ENOENT) {
				2541	btrfs_abort_transaction(trans, ret);
				2542	err = ret;
				2543	goto out_end_trans;
				2544	}
				2545	if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
				2546	ret = btrfs_uuid_tree_rem(trans, fs_info,
				2547	dest->root_item.received_uuid,
				2548	BTRFS_UUID_KEY_RECEIVED_SUBVOL,
				2549	dest->root_key.objectid);
				2550	if (ret && ret != -ENOENT) {
				2551	btrfs_abort_transaction(trans, ret);
				2552	err = ret;
				2553	goto out_end_trans;
				2554	}
				2555	}
				2556
				2557	out_end_trans:
				2558	trans->block_rsv = NULL;
				2559	trans->bytes_reserved = 0;
				2560	ret = btrfs_end_transaction(trans);
				2561	if (ret && !err)
				2562	err = ret;
				2563	inode->i_flags \|= S_DEAD;
				2564	out_release:
				2565	btrfs_subvolume_release_metadata(fs_info, &block_rsv);
				2566	out_up_write:
				2567	up_write(&fs_info->subvol_sem);
				2568	if (err) {
				2569	spin_lock(&dest->root_item_lock);
				2570	root_flags = btrfs_root_flags(&dest->root_item);
				2571	btrfs_set_root_flags(&dest->root_item,
				2572	root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
				2573	spin_unlock(&dest->root_item_lock);
				2574	}
				2575	out_unlock_inode:
				2576	inode_unlock(inode);
				2577	if (!err) {
				2578	d_invalidate(dentry);
				2579	btrfs_invalidate_inodes(dest);
				2580	d_delete(dentry);
				2581	ASSERT(dest->send_in_progress == 0);
				2582
				2583	/* the last ref */
				2584	if (dest->ino_cache_inode) {
				2585	iput(dest->ino_cache_inode);
				2586	dest->ino_cache_inode = NULL;
				2587	}
				2588	}
				2589	out_dput:
				2590	dput(dentry);
				2591	out_unlock_dir:
				2592	inode_unlock(dir);
				2593	out_drop_write:
				2594	mnt_drop_write_file(file);
				2595	out:
				2596	kfree(vol_args);
				2597	return err;
				2598	}
				2599
				2600	static int btrfs_ioctl_defrag(struct file file, void __user argp)
				2601	{
				2602	struct inode *inode = file_inode(file);
				2603	struct btrfs_root *root = BTRFS_I(inode)->root;
				2604	struct btrfs_ioctl_defrag_range_args *range;
				2605	int ret;
				2606
				2607	ret = mnt_want_write_file(file);
				2608	if (ret)
				2609	return ret;
				2610
				2611	if (btrfs_root_readonly(root)) {
				2612	ret = -EROFS;
				2613	goto out;
				2614	}
				2615
				2616	switch (inode->i_mode & S_IFMT) {
				2617	case S_IFDIR:
				2618	if (!capable(CAP_SYS_ADMIN)) {
				2619	ret = -EPERM;
				2620	goto out;
				2621	}
				2622	ret = btrfs_defrag_root(root);
				2623	break;
				2624	case S_IFREG:
				2625	if (!(file->f_mode & FMODE_WRITE)) {
				2626	ret = -EINVAL;
				2627	goto out;
				2628	}
				2629
				2630	range = kzalloc(sizeof(*range), GFP_KERNEL);
				2631	if (!range) {
				2632	ret = -ENOMEM;
				2633	goto out;
				2634	}
				2635
				2636	if (argp) {
				2637	if (copy_from_user(range, argp,
				2638	sizeof(*range))) {
				2639	ret = -EFAULT;
				2640	kfree(range);
				2641	goto out;
				2642	}
				2643	/* compression requires us to start the IO */
				2644	if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
				2645	range->flags \|= BTRFS_DEFRAG_RANGE_START_IO;
				2646	range->extent_thresh = (u32)-1;
				2647	}
				2648	} else {
				2649	/* the rest are all set to zero by kzalloc */
				2650	range->len = (u64)-1;
				2651	}
				2652	ret = btrfs_defrag_file(file_inode(file), file,
				2653	range, 0, 0);
				2654	if (ret > 0)
				2655	ret = 0;
				2656	kfree(range);
				2657	break;
				2658	default:
				2659	ret = -EINVAL;
				2660	}
				2661	out:
				2662	mnt_drop_write_file(file);
				2663	return ret;
				2664	}
				2665
				2666	static long btrfs_ioctl_add_dev(struct btrfs_fs_info fs_info, void __user arg)
				2667	{
				2668	struct btrfs_ioctl_vol_args *vol_args;
				2669	int ret;
				2670
				2671	if (!capable(CAP_SYS_ADMIN))
				2672	return -EPERM;
				2673
				2674	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
				2675	return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
				2676
				2677	mutex_lock(&fs_info->volume_mutex);
				2678	vol_args = memdup_user(arg, sizeof(*vol_args));
				2679	if (IS_ERR(vol_args)) {
				2680	ret = PTR_ERR(vol_args);
				2681	goto out;
				2682	}
				2683
				2684	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
				2685	ret = btrfs_init_new_device(fs_info, vol_args->name);
				2686
				2687	if (!ret)
				2688	btrfs_info(fs_info, "disk added %s", vol_args->name);
				2689
				2690	kfree(vol_args);
				2691	out:
				2692	mutex_unlock(&fs_info->volume_mutex);
				2693	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
				2694	return ret;
				2695	}
				2696
				2697	static long btrfs_ioctl_rm_dev_v2(struct file file, void __user arg)
				2698	{
				2699	struct inode *inode = file_inode(file);
				2700	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				2701	struct btrfs_ioctl_vol_args_v2 *vol_args;
				2702	int ret;
				2703
				2704	if (!capable(CAP_SYS_ADMIN))
				2705	return -EPERM;
				2706
				2707	ret = mnt_want_write_file(file);
				2708	if (ret)
				2709	return ret;
				2710
				2711	vol_args = memdup_user(arg, sizeof(*vol_args));
				2712	if (IS_ERR(vol_args)) {
				2713	ret = PTR_ERR(vol_args);
				2714	goto err_drop;
				2715	}
				2716
				2717	/* Check for compatibility reject unknown flags */
				2718	if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) {
				2719	ret = -EOPNOTSUPP;
				2720	goto out;
				2721	}
				2722
				2723	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
				2724	ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
				2725	goto out;
				2726	}
				2727
				2728	mutex_lock(&fs_info->volume_mutex);
				2729	if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
				2730	ret = btrfs_rm_device(fs_info, NULL, vol_args->devid);
				2731	} else {
				2732	vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
				2733	ret = btrfs_rm_device(fs_info, vol_args->name, 0);
				2734	}
				2735	mutex_unlock(&fs_info->volume_mutex);
				2736	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
				2737
				2738	if (!ret) {
				2739	if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
				2740	btrfs_info(fs_info, "device deleted: id %llu",
				2741	vol_args->devid);
				2742	else
				2743	btrfs_info(fs_info, "device deleted: %s",
				2744	vol_args->name);
				2745	}
				2746	out:
				2747	kfree(vol_args);
				2748	err_drop:
				2749	mnt_drop_write_file(file);
				2750	return ret;
				2751	}
				2752
				2753	static long btrfs_ioctl_rm_dev(struct file file, void __user arg)
				2754	{
				2755	struct inode *inode = file_inode(file);
				2756	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				2757	struct btrfs_ioctl_vol_args *vol_args;
				2758	int ret;
				2759
				2760	if (!capable(CAP_SYS_ADMIN))
				2761	return -EPERM;
				2762
				2763	ret = mnt_want_write_file(file);
				2764	if (ret)
				2765	return ret;
				2766
				2767	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
				2768	ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
				2769	goto out_drop_write;
				2770	}
				2771
				2772	vol_args = memdup_user(arg, sizeof(*vol_args));
				2773	if (IS_ERR(vol_args)) {
				2774	ret = PTR_ERR(vol_args);
				2775	goto out;
				2776	}
				2777
				2778	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
				2779	mutex_lock(&fs_info->volume_mutex);
				2780	ret = btrfs_rm_device(fs_info, vol_args->name, 0);
				2781	mutex_unlock(&fs_info->volume_mutex);
				2782
				2783	if (!ret)
				2784	btrfs_info(fs_info, "disk deleted %s", vol_args->name);
				2785	kfree(vol_args);
				2786	out:
				2787	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
				2788	out_drop_write:
				2789	mnt_drop_write_file(file);
				2790
				2791	return ret;
				2792	}
				2793
				2794	static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
				2795	void __user *arg)
				2796	{
				2797	struct btrfs_ioctl_fs_info_args *fi_args;
				2798	struct btrfs_device *device;
				2799	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				2800	int ret = 0;
				2801
				2802	fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL);
				2803	if (!fi_args)
				2804	return -ENOMEM;
				2805
				2806	mutex_lock(&fs_devices->device_list_mutex);
				2807	fi_args->num_devices = fs_devices->num_devices;
				2808	memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid));
				2809
				2810	list_for_each_entry(device, &fs_devices->devices, dev_list) {
				2811	if (device->devid > fi_args->max_id)
				2812	fi_args->max_id = device->devid;
				2813	}
				2814	mutex_unlock(&fs_devices->device_list_mutex);
				2815
				2816	fi_args->nodesize = fs_info->nodesize;
				2817	fi_args->sectorsize = fs_info->sectorsize;
				2818	fi_args->clone_alignment = fs_info->sectorsize;
				2819
				2820	if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
				2821	ret = -EFAULT;
				2822
				2823	kfree(fi_args);
				2824	return ret;
				2825	}
				2826
				2827	static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
				2828	void __user *arg)
				2829	{
				2830	struct btrfs_ioctl_dev_info_args *di_args;
				2831	struct btrfs_device *dev;
				2832	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				2833	int ret = 0;
				2834	char *s_uuid = NULL;
				2835
				2836	di_args = memdup_user(arg, sizeof(*di_args));
				2837	if (IS_ERR(di_args))
				2838	return PTR_ERR(di_args);
				2839
				2840	if (!btrfs_is_empty_uuid(di_args->uuid))
				2841	s_uuid = di_args->uuid;
				2842
				2843	mutex_lock(&fs_devices->device_list_mutex);
				2844	dev = btrfs_find_device(fs_info, di_args->devid, s_uuid, NULL);
				2845
				2846	if (!dev) {
				2847	ret = -ENODEV;
				2848	goto out;
				2849	}
				2850
				2851	di_args->devid = dev->devid;
				2852	di_args->bytes_used = btrfs_device_get_bytes_used(dev);
				2853	di_args->total_bytes = btrfs_device_get_total_bytes(dev);
				2854	memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
				2855	if (dev->name) {
				2856	struct rcu_string *name;
				2857
				2858	rcu_read_lock();
				2859	name = rcu_dereference(dev->name);
				2860	strncpy(di_args->path, name->str, sizeof(di_args->path));
				2861	rcu_read_unlock();
				2862	di_args->path[sizeof(di_args->path) - 1] = 0;
				2863	} else {
				2864	di_args->path[0] = '\0';
				2865	}
				2866
				2867	out:
				2868	mutex_unlock(&fs_devices->device_list_mutex);
				2869	if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
				2870	ret = -EFAULT;
				2871
				2872	kfree(di_args);
				2873	return ret;
				2874	}
				2875
				2876	static struct page extent_same_get_page(struct inode inode, pgoff_t index)
				2877	{
				2878	struct page *page;
				2879
				2880	page = grab_cache_page(inode->i_mapping, index);
				2881	if (!page)
				2882	return ERR_PTR(-ENOMEM);
				2883
				2884	if (!PageUptodate(page)) {
				2885	int ret;
				2886
				2887	ret = btrfs_readpage(NULL, page);
				2888	if (ret)
				2889	return ERR_PTR(ret);
				2890	lock_page(page);
				2891	if (!PageUptodate(page)) {
				2892	unlock_page(page);
				2893	put_page(page);
				2894	return ERR_PTR(-EIO);
				2895	}
				2896	if (page->mapping != inode->i_mapping) {
				2897	unlock_page(page);
				2898	put_page(page);
				2899	return ERR_PTR(-EAGAIN);
				2900	}
				2901	}
				2902
				2903	return page;
				2904	}
				2905
				2906	static int gather_extent_pages(struct inode inode, struct page *pages,
				2907	int num_pages, u64 off)
				2908	{
				2909	int i;
				2910	pgoff_t index = off >> PAGE_SHIFT;
				2911
				2912	for (i = 0; i < num_pages; i++) {
				2913	again:
				2914	pages[i] = extent_same_get_page(inode, index + i);
				2915	if (IS_ERR(pages[i])) {
				2916	int err = PTR_ERR(pages[i]);
				2917
				2918	if (err == -EAGAIN)
				2919	goto again;
				2920	pages[i] = NULL;
				2921	return err;
				2922	}
				2923	}
				2924	return 0;
				2925	}
				2926
				2927	static int lock_extent_range(struct inode *inode, u64 off, u64 len,
				2928	bool retry_range_locking)
				2929	{
				2930	/*
				2931	* Do any pending delalloc/csum calculations on inode, one way or
				2932	* another, and lock file content.
				2933	* The locking order is:
				2934	*
				2935	* 1) pages
				2936	* 2) range in the inode's io tree
				2937	*/
				2938	while (1) {
				2939	struct btrfs_ordered_extent *ordered;
				2940	lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
				2941	ordered = btrfs_lookup_first_ordered_extent(inode,
				2942	off + len - 1);
				2943	if ((!ordered \|\|
				2944	ordered->file_offset + ordered->len <= off \|\|
				2945	ordered->file_offset >= off + len) &&
				2946	!test_range_bit(&BTRFS_I(inode)->io_tree, off,
				2947	off + len - 1, EXTENT_DELALLOC, 0, NULL)) {
				2948	if (ordered)
				2949	btrfs_put_ordered_extent(ordered);
				2950	break;
				2951	}
				2952	unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
				2953	if (ordered)
				2954	btrfs_put_ordered_extent(ordered);
				2955	if (!retry_range_locking)
				2956	return -EAGAIN;
				2957	btrfs_wait_ordered_range(inode, off, len);
				2958	}
				2959	return 0;
				2960	}
				2961
				2962	static void btrfs_double_inode_unlock(struct inode inode1, struct inode inode2)
				2963	{
				2964	inode_unlock(inode1);
				2965	inode_unlock(inode2);
				2966	}
				2967
				2968	static void btrfs_double_inode_lock(struct inode inode1, struct inode inode2)
				2969	{
				2970	if (inode1 < inode2)
				2971	swap(inode1, inode2);
				2972
				2973	inode_lock_nested(inode1, I_MUTEX_PARENT);
				2974	inode_lock_nested(inode2, I_MUTEX_CHILD);
				2975	}
				2976
				2977	static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
				2978	struct inode *inode2, u64 loff2, u64 len)
				2979	{
				2980	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
				2981	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
				2982	}
				2983
				2984	static int btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
				2985	struct inode *inode2, u64 loff2, u64 len,
				2986	bool retry_range_locking)
				2987	{
				2988	int ret;
				2989
				2990	if (inode1 < inode2) {
				2991	swap(inode1, inode2);
				2992	swap(loff1, loff2);
				2993	}
				2994	ret = lock_extent_range(inode1, loff1, len, retry_range_locking);
				2995	if (ret)
				2996	return ret;
				2997	ret = lock_extent_range(inode2, loff2, len, retry_range_locking);
				2998	if (ret)
				2999	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1,
				3000	loff1 + len - 1);
				3001	return ret;
				3002	}
				3003
				3004	struct cmp_pages {
				3005	int num_pages;
				3006	struct page **src_pages;
				3007	struct page **dst_pages;
				3008	};
				3009
				3010	static void btrfs_cmp_data_free(struct cmp_pages *cmp)
				3011	{
				3012	int i;
				3013	struct page *pg;
				3014
				3015	for (i = 0; i < cmp->num_pages; i++) {
				3016	pg = cmp->src_pages[i];
				3017	if (pg) {
				3018	unlock_page(pg);
				3019	put_page(pg);
				3020	}
				3021	pg = cmp->dst_pages[i];
				3022	if (pg) {
				3023	unlock_page(pg);
				3024	put_page(pg);
				3025	}
				3026	}
				3027	kfree(cmp->src_pages);
				3028	kfree(cmp->dst_pages);
				3029	}
				3030
				3031	static int btrfs_cmp_data_prepare(struct inode *src, u64 loff,
				3032	struct inode *dst, u64 dst_loff,
				3033	u64 len, struct cmp_pages *cmp)
				3034	{
				3035	int ret;
				3036	int num_pages = PAGE_ALIGN(len) >> PAGE_SHIFT;
				3037	struct page src_pgarr, dst_pgarr;
				3038
				3039	/*
				3040	* We must gather up all the pages before we initiate our
				3041	* extent locking. We use an array for the page pointers. Size
				3042	* of the array is bounded by len, which is in turn bounded by
				3043	* BTRFS_MAX_DEDUPE_LEN.
				3044	*/
				3045	src_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
				3046	dst_pgarr = kcalloc(num_pages, sizeof(struct page *), GFP_KERNEL);
				3047	if (!src_pgarr \|\| !dst_pgarr) {
				3048	kfree(src_pgarr);
				3049	kfree(dst_pgarr);
				3050	return -ENOMEM;
				3051	}
				3052	cmp->num_pages = num_pages;
				3053	cmp->src_pages = src_pgarr;
				3054	cmp->dst_pages = dst_pgarr;
				3055
				3056	/*
				3057	* If deduping ranges in the same inode, locking rules make it mandatory
				3058	* to always lock pages in ascending order to avoid deadlocks with
				3059	* concurrent tasks (such as starting writeback/delalloc).
				3060	*/
				3061	if (src == dst && dst_loff < loff) {
				3062	swap(src_pgarr, dst_pgarr);
				3063	swap(loff, dst_loff);
				3064	}
				3065
				3066	ret = gather_extent_pages(src, src_pgarr, cmp->num_pages, loff);
				3067	if (ret)
				3068	goto out;
				3069
				3070	ret = gather_extent_pages(dst, dst_pgarr, cmp->num_pages, dst_loff);
				3071
				3072	out:
				3073	if (ret)
				3074	btrfs_cmp_data_free(cmp);
				3075	return ret;
				3076	}
				3077
				3078	static int btrfs_cmp_data(u64 len, struct cmp_pages *cmp)
				3079	{
				3080	int ret = 0;
				3081	int i;
				3082	struct page src_page, dst_page;
				3083	unsigned int cmp_len = PAGE_SIZE;
				3084	void addr, dst_addr;
				3085
				3086	i = 0;
				3087	while (len) {
				3088	if (len < PAGE_SIZE)
				3089	cmp_len = len;
				3090
				3091	BUG_ON(i >= cmp->num_pages);
				3092
				3093	src_page = cmp->src_pages[i];
				3094	dst_page = cmp->dst_pages[i];
				3095	ASSERT(PageLocked(src_page));
				3096	ASSERT(PageLocked(dst_page));
				3097
				3098	addr = kmap_atomic(src_page);
				3099	dst_addr = kmap_atomic(dst_page);
				3100
				3101	flush_dcache_page(src_page);
				3102	flush_dcache_page(dst_page);
				3103
				3104	if (memcmp(addr, dst_addr, cmp_len))
				3105	ret = -EBADE;
				3106
				3107	kunmap_atomic(addr);
				3108	kunmap_atomic(dst_addr);
				3109
				3110	if (ret)
				3111	break;
				3112
				3113	len -= cmp_len;
				3114	i++;
				3115	}
				3116
				3117	return ret;
				3118	}
				3119
				3120	static int extent_same_check_offsets(struct inode inode, u64 off, u64 plen,
				3121	u64 olen)
				3122	{
				3123	u64 len = *plen;
				3124	u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize;
				3125
				3126	if (off + olen > inode->i_size \|\| off + olen < off)
				3127	return -EINVAL;
				3128
				3129	/* if we extend to eof, continue to block boundary */
				3130	if (off + len == inode->i_size)
				3131	*plen = len = ALIGN(inode->i_size, bs) - off;
				3132
				3133	/* Check that we are block aligned - btrfs_clone() requires this */
				3134	if (!IS_ALIGNED(off, bs) \|\| !IS_ALIGNED(off + len, bs))
				3135	return -EINVAL;
				3136
				3137	return 0;
				3138	}
				3139
				3140	static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
				3141	struct inode *dst, u64 dst_loff)
				3142	{
				3143	int ret;
				3144	u64 len = olen;
				3145	struct cmp_pages cmp;
				3146	bool same_inode = (src == dst);
				3147	u64 same_lock_start = 0;
				3148	u64 same_lock_len = 0;
				3149
				3150	if (len == 0)
				3151	return 0;
				3152
				3153	if (same_inode)
				3154	inode_lock(src);
				3155	else
				3156	btrfs_double_inode_lock(src, dst);
				3157
				3158	ret = extent_same_check_offsets(src, loff, &len, olen);
				3159	if (ret)
				3160	goto out_unlock;
				3161
				3162	ret = extent_same_check_offsets(dst, dst_loff, &len, olen);
				3163	if (ret)
				3164	goto out_unlock;
				3165
				3166	if (same_inode) {
				3167	/*
				3168	* Single inode case wants the same checks, except we
				3169	* don't want our length pushed out past i_size as
				3170	* comparing that data range makes no sense.
				3171	*
				3172	* extent_same_check_offsets() will do this for an
				3173	* unaligned length at i_size, so catch it here and
				3174	* reject the request.
				3175	*
				3176	* This effectively means we require aligned extents
				3177	* for the single-inode case, whereas the other cases
				3178	* allow an unaligned length so long as it ends at
				3179	* i_size.
				3180	*/
				3181	if (len != olen) {
				3182	ret = -EINVAL;
				3183	goto out_unlock;
				3184	}
				3185
				3186	/* Check for overlapping ranges */
				3187	if (dst_loff + len > loff && dst_loff < loff + len) {
				3188	ret = -EINVAL;
				3189	goto out_unlock;
				3190	}
				3191
				3192	same_lock_start = min_t(u64, loff, dst_loff);
				3193	same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start;
				3194	} else {
				3195	/*
				3196	* If the source and destination inodes are different, the
				3197	* source's range end offset matches the source's i_size, that
				3198	* i_size is not a multiple of the sector size, and the
				3199	* destination range does not go past the destination's i_size,
				3200	* we must round down the length to the nearest sector size
				3201	* multiple. If we don't do this adjustment we end replacing
				3202	* with zeroes the bytes in the range that starts at the
				3203	* deduplication range's end offset and ends at the next sector
				3204	* size multiple.
				3205	*/
				3206	if (loff + olen == i_size_read(src) &&
				3207	dst_loff + len < i_size_read(dst)) {
				3208	const u64 sz = BTRFS_I(src)->root->fs_info->sectorsize;
				3209
				3210	len = round_down(i_size_read(src), sz) - loff;
				3211	if (len == 0)
				3212	return 0;
				3213	olen = len;
				3214	}
				3215	}
				3216
				3217	/* don't make the dst file partly checksummed */
				3218	if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
				3219	(BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) {
				3220	ret = -EINVAL;
				3221	goto out_unlock;
				3222	}
				3223
				3224	again:
				3225	ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp);
				3226	if (ret)
				3227	goto out_unlock;
				3228
				3229	if (same_inode)
				3230	ret = lock_extent_range(src, same_lock_start, same_lock_len,
				3231	false);
				3232	else
				3233	ret = btrfs_double_extent_lock(src, loff, dst, dst_loff, len,
				3234	false);
				3235	/*
				3236	* If one of the inodes has dirty pages in the respective range or
				3237	* ordered extents, we need to flush dellaloc and wait for all ordered
				3238	* extents in the range. We must unlock the pages and the ranges in the
				3239	* io trees to avoid deadlocks when flushing delalloc (requires locking
				3240	* pages) and when waiting for ordered extents to complete (they require
				3241	* range locking).
				3242	*/
				3243	if (ret == -EAGAIN) {
				3244	/*
				3245	* Ranges in the io trees already unlocked. Now unlock all
				3246	* pages before waiting for all IO to complete.
				3247	*/
				3248	btrfs_cmp_data_free(&cmp);
				3249	if (same_inode) {
				3250	btrfs_wait_ordered_range(src, same_lock_start,
				3251	same_lock_len);
				3252	} else {
				3253	btrfs_wait_ordered_range(src, loff, len);
				3254	btrfs_wait_ordered_range(dst, dst_loff, len);
				3255	}
				3256	goto again;
				3257	}
				3258	ASSERT(ret == 0);
				3259	if (WARN_ON(ret)) {
				3260	/* ranges in the io trees already unlocked */
				3261	btrfs_cmp_data_free(&cmp);
				3262	return ret;
				3263	}
				3264
				3265	/* pass original length for comparison so we stay within i_size */
				3266	ret = btrfs_cmp_data(olen, &cmp);
				3267	if (ret == 0)
				3268	ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
				3269
				3270	if (same_inode)
				3271	unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start,
				3272	same_lock_start + same_lock_len - 1);
				3273	else
				3274	btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
				3275
				3276	btrfs_cmp_data_free(&cmp);
				3277	out_unlock:
				3278	if (same_inode)
				3279	inode_unlock(src);
				3280	else
				3281	btrfs_double_inode_unlock(src, dst);
				3282
				3283	return ret;
				3284	}
				3285
				3286	#define BTRFS_MAX_DEDUPE_LEN SZ_16M
				3287
				3288	ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
				3289	struct file *dst_file, u64 dst_loff)
				3290	{
				3291	struct inode *src = file_inode(src_file);
				3292	struct inode *dst = file_inode(dst_file);
				3293	u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
				3294	ssize_t res;
				3295
				3296	if (olen > BTRFS_MAX_DEDUPE_LEN)
				3297	olen = BTRFS_MAX_DEDUPE_LEN;
				3298
				3299	if (WARN_ON_ONCE(bs < PAGE_SIZE)) {
				3300	/*
				3301	* Btrfs does not support blocksize < page_size. As a
				3302	* result, btrfs_cmp_data() won't correctly handle
				3303	* this situation without an update.
				3304	*/
				3305	return -EINVAL;
				3306	}
				3307
				3308	res = btrfs_extent_same(src, loff, olen, dst, dst_loff);
				3309	if (res)
				3310	return res;
				3311	return olen;
				3312	}
				3313
				3314	static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
				3315	struct inode *inode,
				3316	u64 endoff,
				3317	const u64 destoff,
				3318	const u64 olen,
				3319	int no_time_update)
				3320	{
				3321	struct btrfs_root *root = BTRFS_I(inode)->root;
				3322	int ret;
				3323
				3324	inode_inc_iversion(inode);
				3325	if (!no_time_update)
				3326	inode->i_mtime = inode->i_ctime = current_time(inode);
				3327	/*
				3328	* We round up to the block size at eof when determining which
				3329	* extents to clone above, but shouldn't round up the file size.
				3330	*/
				3331	if (endoff > destoff + olen)
				3332	endoff = destoff + olen;
				3333	if (endoff > inode->i_size)
				3334	btrfs_i_size_write(BTRFS_I(inode), endoff);
				3335
				3336	ret = btrfs_update_inode(trans, root, inode);
				3337	if (ret) {
				3338	btrfs_abort_transaction(trans, ret);
				3339	btrfs_end_transaction(trans);
				3340	goto out;
				3341	}
				3342	ret = btrfs_end_transaction(trans);
				3343	out:
				3344	return ret;
				3345	}
				3346
				3347	static void clone_update_extent_map(struct btrfs_inode *inode,
				3348	const struct btrfs_trans_handle *trans,
				3349	const struct btrfs_path *path,
				3350	const u64 hole_offset,
				3351	const u64 hole_len)
				3352	{
				3353	struct extent_map_tree *em_tree = &inode->extent_tree;
				3354	struct extent_map *em;
				3355	int ret;
				3356
				3357	em = alloc_extent_map();
				3358	if (!em) {
				3359	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
				3360	return;
				3361	}
				3362
				3363	if (path) {
				3364	struct btrfs_file_extent_item *fi;
				3365
				3366	fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
				3367	struct btrfs_file_extent_item);
				3368	btrfs_extent_item_to_extent_map(inode, path, fi, false, em);
				3369	em->generation = -1;
				3370	if (btrfs_file_extent_type(path->nodes[0], fi) ==
				3371	BTRFS_FILE_EXTENT_INLINE)
				3372	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
				3373	&inode->runtime_flags);
				3374	} else {
				3375	em->start = hole_offset;
				3376	em->len = hole_len;
				3377	em->ram_bytes = em->len;
				3378	em->orig_start = hole_offset;
				3379	em->block_start = EXTENT_MAP_HOLE;
				3380	em->block_len = 0;
				3381	em->orig_block_len = 0;
				3382	em->compress_type = BTRFS_COMPRESS_NONE;
				3383	em->generation = trans->transid;
				3384	}
				3385
				3386	while (1) {
				3387	write_lock(&em_tree->lock);
				3388	ret = add_extent_mapping(em_tree, em, 1);
				3389	write_unlock(&em_tree->lock);
				3390	if (ret != -EEXIST) {
				3391	free_extent_map(em);
				3392	break;
				3393	}
				3394	btrfs_drop_extent_cache(inode, em->start,
				3395	em->start + em->len - 1, 0);
				3396	}
				3397
				3398	if (ret)
				3399	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
				3400	}
				3401
				3402	/*
				3403	* Make sure we do not end up inserting an inline extent into a file that has
				3404	* already other (non-inline) extents. If a file has an inline extent it can
				3405	* not have any other extents and the (single) inline extent must start at the
				3406	* file offset 0. Failing to respect these rules will lead to file corruption,
				3407	* resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc
				3408	*
				3409	* We can have extents that have been already written to disk or we can have
				3410	* dirty ranges still in delalloc, in which case the extent maps and items are
				3411	* created only when we run delalloc, and the delalloc ranges might fall outside
				3412	* the range we are currently locking in the inode's io tree. So we check the
				3413	* inode's i_size because of that (i_size updates are done while holding the
				3414	* i_mutex, which we are holding here).
				3415	* We also check to see if the inode has a size not greater than "datal" but has
				3416	* extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are
				3417	* protected against such concurrent fallocate calls by the i_mutex).
				3418	*
				3419	* If the file has no extents but a size greater than datal, do not allow the
				3420	* copy because we would need to turn the inline extent into a non-inline one (even
				3421	* with NO_HOLES enabled). If we find our destination inode only has one inline
				3422	* extent, just overwrite it with the source inline extent if its size is less
				3423	* than the source extent's size, or we could copy the source inline extent's
				3424	* data into the destination inode's inline extent if the latter is greater than
				3425	* the former.
				3426	*/
				3427	static int clone_copy_inline_extent(struct inode *dst,
				3428	struct btrfs_trans_handle *trans,
				3429	struct btrfs_path *path,
				3430	struct btrfs_key *new_key,
				3431	const u64 drop_start,
				3432	const u64 datal,
				3433	const u64 skip,
				3434	const u64 size,
				3435	char *inline_data)
				3436	{
					/*
					 * Copy an inline extent item into @dst.
					 *
					 * @new_key:     key of the item to insert; only offset 0 is accepted
					 * @drop_start:  start of the range handed to btrfs_drop_extents()
					 * @datal:       data length; used for the drop range end and for the
					 *               byte accounting of @dst
					 * @skip:        if non-zero, number of bytes by which the payload in
					 *               @inline_data is shifted towards its start
					 * @size:        size of the item inserted and written from @inline_data
					 * @inline_data: buffer holding the item contents to write
					 *
					 * Returns 0 on success, -EOPNOTSUPP for every layout this function
					 * refuses to handle, or the error from the tree search, extent drop or
					 * item insertion.
					 */
				3437	struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
				3438	struct btrfs_root *root = BTRFS_I(dst)->root;
				3439	const u64 aligned_end = ALIGN(new_key->offset + datal,
				3440	fs_info->sectorsize);
				3441	int ret;
				3442	struct btrfs_key key;
				3443
					/* Only an inline extent placed at file offset 0 is supported. */
				3444	if (new_key->offset > 0)
				3445	return -EOPNOTSUPP;
				3446
					/* Look for an extent item of the destination at offset 0. */
				3447	key.objectid = btrfs_ino(BTRFS_I(dst));
				3448	key.type = BTRFS_EXTENT_DATA_KEY;
				3449	key.offset = 0;
				3450	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				3451	if (ret < 0) {
				3452	return ret;
				3453	} else if (ret > 0) {
					/*
					 * No item at offset 0. If the slot is past the end of the leaf,
					 * move to the next leaf; no next leaf means no further items.
					 */
				3454	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
				3455	ret = btrfs_next_leaf(root, path);
				3456	if (ret < 0)
				3457	return ret;
				3458	else if (ret > 0)
				3459	goto copy_inline_extent;
				3460	}
					/* An extent item of this inode at a later offset rules the copy out. */
				3461	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				3462	if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
				3463	key.type == BTRFS_EXTENT_DATA_KEY) {
				3464	ASSERT(key.offset > 0);
				3465	return -EOPNOTSUPP;
				3466	}
				3467	} else if (i_size_read(dst) <= datal) {
				3468	struct btrfs_file_extent_item *ei;
				3469	u64 ext_len;
				3470
				3471	/*
				3472	* If the file size is <= datal, make sure there are no other
				3473	* extents following (can happen due to an fallocate call with
				3474	* the flag FALLOC_FL_KEEP_SIZE).
				3475	*/
				3476	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
				3477	struct btrfs_file_extent_item);
				3478	/*
				3479	* If it's an inline extent, it can not have other extents
				3480	* following it.
				3481	*/
				3482	if (btrfs_file_extent_type(path->nodes[0], ei) ==
				3483	BTRFS_FILE_EXTENT_INLINE)
				3484	goto copy_inline_extent;
				3485
					/* The existing extent must not reach past the range we will drop. */
				3486	ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
				3487	if (ext_len > aligned_end)
				3488	return -EOPNOTSUPP;
				3489
					/* Any further extent item of this inode also rules the copy out. */
				3490	ret = btrfs_next_item(root, path);
				3491	if (ret < 0) {
				3492	return ret;
				3493	} else if (ret == 0) {
				3494	btrfs_item_key_to_cpu(path->nodes[0], &key,
				3495	path->slots[0]);
				3496	if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
				3497	key.type == BTRFS_EXTENT_DATA_KEY)
				3498	return -EOPNOTSUPP;
				3499	}
				3500	}
				3501
				3502	copy_inline_extent:
				3503	/*
				3504	* We have no extent items, or we have an extent at offset 0 which may
				3505	* or may not be inlined. All these cases are dealt the same way.
				3506	*/
				3507	if (i_size_read(dst) > datal) {
				3508	/*
				3509	* If the destination inode has an inline extent...
				3510	* This would require copying the data from the source inline
				3511	* extent into the beginning of the destination's inline extent.
				3512	* But this is really complex, both extents can be compressed
				3513	* or just one of them, which would require decompressing and
				3514	* re-compressing data (which could increase the new compressed
				3515	* size, not allowing the compressed data to fit anymore in an
				3516	* inline extent).
				3517	* So just don't support this case for now (it should be rare,
				3518	* we are not really saving space when cloning inline extents).
				3519	*/
				3520	return -EOPNOTSUPP;
				3521	}
				3522
					/* Drop whatever the destination has in the range, then insert the new item. */
				3523	btrfs_release_path(path);
				3524	ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
				3525	if (ret)
				3526	return ret;
				3527	ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
				3528	if (ret)
				3529	return ret;
				3530
				3531	if (skip) {
					/* Offset of the inline payload within the item (size of an item with 0 data bytes). */
				3532	const u32 start = btrfs_file_extent_calc_inline_size(0);
				3533
					/* Shift the payload down by @skip bytes; the regions may overlap. */
				3534	memmove(inline_data + start, inline_data + start + skip, datal);
				3535	}
				3536
				3537	write_extent_buffer(path->nodes[0], inline_data,
				3538	btrfs_item_ptr_offset(path->nodes[0],
				3539	path->slots[0]),
				3540	size);
				3541	inode_add_bytes(dst, datal);
				3542
				3543	return 0;
				3544	}
				3545
				3546	/**
				3547	* btrfs_clone() - clone a range from inode file to another
				3548	*
				3549	* @src: Inode to clone from
				3550	* @inode: Inode to clone to
				3551	* @off: Offset within source to start clone from
				3552	* @olen: Original length, passed by user, of range to clone
				3553	* @olen_aligned: Block-aligned value of olen
				3554	* @destoff: Offset within @inode to start clone
				3555	* @no_time_update: Whether to update mtime/ctime on the target inode
				3556	*/
				3557	static int btrfs_clone(struct inode src, struct inode inode,
				3558	const u64 off, const u64 olen, const u64 olen_aligned,
				3559	const u64 destoff, int no_time_update)
				3560	{
				3561	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				3562	struct btrfs_root *root = BTRFS_I(inode)->root;
				3563	struct btrfs_path *path = NULL;
				3564	struct extent_buffer *leaf;
				3565	struct btrfs_trans_handle *trans;
				3566	char *buf = NULL;
				3567	struct btrfs_key key;
				3568	u32 nritems;
				3569	int slot;
				3570	int ret;
				3571	const u64 len = olen_aligned;
				3572	u64 last_dest_end = destoff;
				3573
				3574	ret = -ENOMEM;
				3575	buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
				3576	if (!buf)
				3577	return ret;
				3578
				3579	path = btrfs_alloc_path();
				3580	if (!path) {
				3581	kvfree(buf);
				3582	return ret;
				3583	}
				3584
				3585	path->reada = READA_FORWARD;
				3586	/* clone data */
				3587	key.objectid = btrfs_ino(BTRFS_I(src));
				3588	key.type = BTRFS_EXTENT_DATA_KEY;
				3589	key.offset = off;
				3590
				3591	while (1) {
				3592	u64 next_key_min_offset = key.offset + 1;
				3593
				3594	/*
				3595	* note the key will change type as we walk through the
				3596	* tree.
				3597	*/
				3598	path->leave_spinning = 1;
				3599	ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
				3600	0, 0);
				3601	if (ret < 0)
				3602	goto out;
				3603	/*
				3604	* First search, if no extent item that starts at offset off was
				3605	* found but the previous item is an extent item, it's possible
				3606	* it might overlap our target range, therefore process it.
				3607	*/
				3608	if (key.offset == off && ret > 0 && path->slots[0] > 0) {
				3609	btrfs_item_key_to_cpu(path->nodes[0], &key,
				3610	path->slots[0] - 1);
				3611	if (key.type == BTRFS_EXTENT_DATA_KEY)
				3612	path->slots[0]--;
				3613	}
				3614
				3615	nritems = btrfs_header_nritems(path->nodes[0]);
				3616	process_slot:
				3617	if (path->slots[0] >= nritems) {
				3618	ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
				3619	if (ret < 0)
				3620	goto out;
				3621	if (ret > 0)
				3622	break;
				3623	nritems = btrfs_header_nritems(path->nodes[0]);
				3624	}
				3625	leaf = path->nodes[0];
				3626	slot = path->slots[0];
				3627
				3628	btrfs_item_key_to_cpu(leaf, &key, slot);
				3629	if (key.type > BTRFS_EXTENT_DATA_KEY \|\|
				3630	key.objectid != btrfs_ino(BTRFS_I(src)))
				3631	break;
				3632
				3633	if (key.type == BTRFS_EXTENT_DATA_KEY) {
				3634	struct btrfs_file_extent_item *extent;
				3635	int type;
				3636	u32 size;
				3637	struct btrfs_key new_key;
				3638	u64 disko = 0, diskl = 0;
				3639	u64 datao = 0, datal = 0;
				3640	u8 comp;
				3641	u64 drop_start;
				3642
				3643	extent = btrfs_item_ptr(leaf, slot,
				3644	struct btrfs_file_extent_item);
				3645	comp = btrfs_file_extent_compression(leaf, extent);
				3646	type = btrfs_file_extent_type(leaf, extent);
				3647	if (type == BTRFS_FILE_EXTENT_REG \|\|
				3648	type == BTRFS_FILE_EXTENT_PREALLOC) {
				3649	disko = btrfs_file_extent_disk_bytenr(leaf,
				3650	extent);
				3651	diskl = btrfs_file_extent_disk_num_bytes(leaf,
				3652	extent);
				3653	datao = btrfs_file_extent_offset(leaf, extent);
				3654	datal = btrfs_file_extent_num_bytes(leaf,
				3655	extent);
				3656	} else if (type == BTRFS_FILE_EXTENT_INLINE) {
				3657	/* take upper bound, may be compressed */
				3658	datal = btrfs_file_extent_ram_bytes(leaf,
				3659	extent);
				3660	}
				3661
				3662	/*
				3663	* The first search might have left us at an extent
				3664	* item that ends before our target range's start, can
				3665	* happen if we have holes and NO_HOLES feature enabled.
				3666	*/
				3667	if (key.offset + datal <= off) {
				3668	path->slots[0]++;
				3669	goto process_slot;
				3670	} else if (key.offset >= off + len) {
				3671	break;
				3672	}
				3673	next_key_min_offset = key.offset + datal;
				3674	size = btrfs_item_size_nr(leaf, slot);
				3675	read_extent_buffer(leaf, buf,
				3676	btrfs_item_ptr_offset(leaf, slot),
				3677	size);
				3678
				3679	btrfs_release_path(path);
				3680	path->leave_spinning = 0;
				3681
				3682	memcpy(&new_key, &key, sizeof(new_key));
				3683	new_key.objectid = btrfs_ino(BTRFS_I(inode));
				3684	if (off <= key.offset)
				3685	new_key.offset = key.offset + destoff - off;
				3686	else
				3687	new_key.offset = destoff;
				3688
				3689	/*
				3690	* Deal with a hole that doesn't have an extent item
				3691	* that represents it (NO_HOLES feature enabled).
				3692	* This hole is either in the middle of the cloning
				3693	* range or at the beginning (fully overlaps it or
				3694	* partially overlaps it).
				3695	*/
				3696	if (new_key.offset != last_dest_end)
				3697	drop_start = last_dest_end;
				3698	else
				3699	drop_start = new_key.offset;
				3700
				3701	/*
				3702	* 1 - adjusting old extent (we may have to split it)
				3703	* 1 - add new extent
				3704	* 1 - inode update
				3705	*/
				3706	trans = btrfs_start_transaction(root, 3);
				3707	if (IS_ERR(trans)) {
				3708	ret = PTR_ERR(trans);
				3709	goto out;
				3710	}
				3711
				3712	if (type == BTRFS_FILE_EXTENT_REG \|\|
				3713	type == BTRFS_FILE_EXTENT_PREALLOC) {
				3714	/*
				3715	* a \| --- range to clone ---\| b
				3716	* \| ------------- extent ------------- \|
				3717	*/
				3718
				3719	/* subtract range b */
				3720	if (key.offset + datal > off + len)
				3721	datal = off + len - key.offset;
				3722
				3723	/* subtract range a */
				3724	if (off > key.offset) {
				3725	datao += off - key.offset;
				3726	datal -= off - key.offset;
				3727	}
				3728
				3729	ret = btrfs_drop_extents(trans, root, inode,
				3730	drop_start,
				3731	new_key.offset + datal,
				3732	1);
				3733	if (ret) {
				3734	if (ret != -EOPNOTSUPP)
				3735	btrfs_abort_transaction(trans,
				3736	ret);
				3737	btrfs_end_transaction(trans);
				3738	goto out;
				3739	}
				3740
				3741	ret = btrfs_insert_empty_item(trans, root, path,
				3742	&new_key, size);
				3743	if (ret) {
				3744	btrfs_abort_transaction(trans, ret);
				3745	btrfs_end_transaction(trans);
				3746	goto out;
				3747	}
				3748
				3749	leaf = path->nodes[0];
				3750	slot = path->slots[0];
				3751	write_extent_buffer(leaf, buf,
				3752	btrfs_item_ptr_offset(leaf, slot),
				3753	size);
				3754
				3755	extent = btrfs_item_ptr(leaf, slot,
				3756	struct btrfs_file_extent_item);
				3757
				3758	/* disko == 0 means it's a hole */
				3759	if (!disko)
				3760	datao = 0;
				3761
				3762	btrfs_set_file_extent_offset(leaf, extent,
				3763	datao);
				3764	btrfs_set_file_extent_num_bytes(leaf, extent,
				3765	datal);
				3766
				3767	if (disko) {
				3768	inode_add_bytes(inode, datal);
				3769	ret = btrfs_inc_extent_ref(trans,
				3770	fs_info,
				3771	disko, diskl, 0,
				3772	root->root_key.objectid,
				3773	btrfs_ino(BTRFS_I(inode)),
				3774	new_key.offset - datao);
				3775	if (ret) {
				3776	btrfs_abort_transaction(trans,
				3777	ret);
				3778	btrfs_end_transaction(trans);
				3779	goto out;
				3780
				3781	}
				3782	}
				3783	} else if (type == BTRFS_FILE_EXTENT_INLINE) {
				3784	u64 skip = 0;
				3785	u64 trim = 0;
				3786
				3787	if (off > key.offset) {
				3788	skip = off - key.offset;
				3789	new_key.offset += skip;
				3790	}
				3791
				3792	if (key.offset + datal > off + len)
				3793	trim = key.offset + datal - (off + len);
				3794
				3795	if (comp && (skip \|\| trim)) {
				3796	ret = -EINVAL;
				3797	btrfs_end_transaction(trans);
				3798	goto out;
				3799	}
				3800	size -= skip + trim;
				3801	datal -= skip + trim;
				3802
				3803	ret = clone_copy_inline_extent(inode,
				3804	trans, path,
				3805	&new_key,
				3806	drop_start,
				3807	datal,
				3808	skip, size, buf);
				3809	if (ret) {
				3810	if (ret != -EOPNOTSUPP)
				3811	btrfs_abort_transaction(trans,
				3812	ret);
				3813	btrfs_end_transaction(trans);
				3814	goto out;
				3815	}
				3816	leaf = path->nodes[0];
				3817	slot = path->slots[0];
				3818	}
				3819
				3820	/* If we have an implicit hole (NO_HOLES feature). */
				3821	if (drop_start < new_key.offset)
				3822	clone_update_extent_map(BTRFS_I(inode), trans,
				3823	NULL, drop_start,
				3824	new_key.offset - drop_start);
				3825
				3826	clone_update_extent_map(BTRFS_I(inode), trans,
				3827	path, 0, 0);
				3828
				3829	btrfs_mark_buffer_dirty(leaf);
				3830	btrfs_release_path(path);
				3831
				3832	last_dest_end = ALIGN(new_key.offset + datal,
				3833	fs_info->sectorsize);
				3834	ret = clone_finish_inode_update(trans, inode,
				3835	last_dest_end,
				3836	destoff, olen,
				3837	no_time_update);
				3838	if (ret)
				3839	goto out;
				3840	if (new_key.offset + datal >= destoff + len)
				3841	break;
				3842	}
				3843	btrfs_release_path(path);
				3844	key.offset = next_key_min_offset;
				3845
				3846	if (fatal_signal_pending(current)) {
				3847	ret = -EINTR;
				3848	goto out;
				3849	}
				3850	}
				3851	ret = 0;
				3852
				3853	if (last_dest_end < destoff + len) {
				3854	/*
				3855	* We have an implicit hole (NO_HOLES feature is enabled) that
				3856	* fully or partially overlaps our cloning range at its end.
				3857	*/
				3858	btrfs_release_path(path);
				3859
				3860	/*
				3861	* 1 - remove extent(s)
				3862	* 1 - inode update
				3863	*/
				3864	trans = btrfs_start_transaction(root, 2);
				3865	if (IS_ERR(trans)) {
				3866	ret = PTR_ERR(trans);
				3867	goto out;
				3868	}
				3869	ret = btrfs_drop_extents(trans, root, inode,
				3870	last_dest_end, destoff + len, 1);
				3871	if (ret) {
				3872	if (ret != -EOPNOTSUPP)
				3873	btrfs_abort_transaction(trans, ret);
				3874	btrfs_end_transaction(trans);
				3875	goto out;
				3876	}
				3877	clone_update_extent_map(BTRFS_I(inode), trans, NULL,
				3878	last_dest_end,
				3879	destoff + len - last_dest_end);
				3880	ret = clone_finish_inode_update(trans, inode, destoff + len,
				3881	destoff, olen, no_time_update);
				3882	}
				3883
				3884	out:
				3885	btrfs_free_path(path);
				3886	kvfree(buf);
				3887	return ret;
				3888	}
				3889
				3890	static noinline int btrfs_clone_files(struct file file, struct file file_src,
				3891	u64 off, u64 olen, u64 destoff)
				3892	{
				3893	struct inode *inode = file_inode(file);
				3894	struct inode *src = file_inode(file_src);
				3895	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				3896	struct btrfs_root *root = BTRFS_I(inode)->root;
				3897	int ret;
				3898	u64 len = olen;
				3899	u64 bs = fs_info->sb->s_blocksize;
				3900	int same_inode = src == inode;
				3901
				3902	/*
				3903	* TODO:
				3904	* - split compressed inline extents. annoying: we need to
				3905	* decompress into destination's address_space (the file offset
				3906	* may change, so source mapping won't do), then recompress (or
				3907	* otherwise reinsert) a subrange.
				3908	*
				3909	* - split destination inode's inline extents. The inline extents can
				3910	* be either compressed or non-compressed.
				3911	*/
				3912
				3913	if (btrfs_root_readonly(root))
				3914	return -EROFS;
				3915
				3916	if (file_src->f_path.mnt != file->f_path.mnt \|\|
				3917	src->i_sb != inode->i_sb)
				3918	return -EXDEV;
				3919
				3920	if (S_ISDIR(src->i_mode) \|\| S_ISDIR(inode->i_mode))
				3921	return -EISDIR;
				3922
				3923	if (!same_inode) {
				3924	btrfs_double_inode_lock(src, inode);
				3925	} else {
				3926	inode_lock(src);
				3927	}
				3928
				3929	/* don't make the dst file partly checksummed */
				3930	if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
				3931	(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
				3932	ret = -EINVAL;
				3933	goto out_unlock;
				3934	}
				3935
				3936	/* determine range to clone */
				3937	ret = -EINVAL;
				3938	if (off + len > src->i_size \|\| off + len < off)
				3939	goto out_unlock;
				3940	if (len == 0)
				3941	olen = len = src->i_size - off;
				3942	/*
				3943	* If we extend to eof, continue to block boundary if and only if the
				3944	* destination end offset matches the destination file's size, otherwise
				3945	* we would be corrupting data by placing the eof block into the middle
				3946	* of a file.
				3947	*/
				3948	if (off + len == src->i_size) {
				3949	if (!IS_ALIGNED(len, bs) && destoff + len < inode->i_size)
				3950	goto out_unlock;
				3951	len = ALIGN(src->i_size, bs) - off;
				3952	}
				3953
				3954	if (len == 0) {
				3955	ret = 0;
				3956	goto out_unlock;
				3957	}
				3958
				3959	/* verify the end result is block aligned */
				3960	if (!IS_ALIGNED(off, bs) \|\| !IS_ALIGNED(off + len, bs) \|\|
				3961	!IS_ALIGNED(destoff, bs))
				3962	goto out_unlock;
				3963
				3964	/* verify if ranges are overlapped within the same file */
				3965	if (same_inode) {
				3966	if (destoff + len > off && destoff < off + len)
				3967	goto out_unlock;
				3968	}
				3969
				3970	if (destoff > inode->i_size) {
				3971	ret = btrfs_cont_expand(inode, inode->i_size, destoff);
				3972	if (ret)
				3973	goto out_unlock;
				3974	}
				3975
				3976	/*
				3977	* Lock the target range too. Right after we replace the file extent
				3978	* items in the fs tree (which now point to the cloned data), we might
				3979	* have a worker replace them with extent items relative to a write
				3980	* operation that was issued before this clone operation (i.e. confront
				3981	* with inode.c:btrfs_finish_ordered_io).
				3982	*/
				3983	if (same_inode) {
				3984	u64 lock_start = min_t(u64, off, destoff);
				3985	u64 lock_len = max_t(u64, off, destoff) + len - lock_start;
				3986
				3987	ret = lock_extent_range(src, lock_start, lock_len, true);
				3988	} else {
				3989	ret = btrfs_double_extent_lock(src, off, inode, destoff, len,
				3990	true);
				3991	}
				3992	ASSERT(ret == 0);
				3993	if (WARN_ON(ret)) {
				3994	/* ranges in the io trees already unlocked */
				3995	goto out_unlock;
				3996	}
				3997
				3998	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
				3999
				4000	if (same_inode) {
				4001	u64 lock_start = min_t(u64, off, destoff);
				4002	u64 lock_end = max_t(u64, off, destoff) + len - 1;
				4003
				4004	unlock_extent(&BTRFS_I(src)->io_tree, lock_start, lock_end);
				4005	} else {
				4006	btrfs_double_extent_unlock(src, off, inode, destoff, len);
				4007	}
				4008	/*
				4009	* Truncate page cache pages so that future reads will see the cloned
				4010	* data immediately and not the previous data.
				4011	*/
				4012	truncate_inode_pages_range(&inode->i_data,
				4013	round_down(destoff, PAGE_SIZE),
				4014	round_up(destoff + len, PAGE_SIZE) - 1);
				4015	out_unlock:
				4016	if (!same_inode)
				4017	btrfs_double_inode_unlock(src, inode);
				4018	else
				4019	inode_unlock(src);
				4020	return ret;
				4021	}
				4022
				4023	int btrfs_clone_file_range(struct file *src_file, loff_t off,
				4024	struct file *dst_file, loff_t destoff, u64 len)
				4025	{
				4026	return btrfs_clone_files(dst_file, src_file, off, len, destoff);
				4027	}
				4028
				4029	/*
				4030	* there are many ways the trans_start and trans_end ioctls can lead
				4031	* to deadlocks. They should only be used by applications that
				4032	* basically own the machine, and have a very in depth understanding
				4033	* of all the possible deadlocks and enospc problems.
				4034	*/
				4035	static long btrfs_ioctl_trans_start(struct file *file)
				4036	{
				4037	struct inode *inode = file_inode(file);
				4038	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				4039	struct btrfs_root *root = BTRFS_I(inode)->root;
				4040	struct btrfs_trans_handle *trans;
				4041	struct btrfs_file_private *private;
				4042	int ret;
				4043	static bool warned = false;
				4044
				4045	ret = -EPERM;
				4046	if (!capable(CAP_SYS_ADMIN))
				4047	goto out;
				4048
				4049	if (!warned) {
				4050	btrfs_warn(fs_info,
				4051	"Userspace transaction mechanism is considered "
				4052	"deprecated and slated to be removed in 4.17. "
				4053	"If you have a valid use case please "
				4054	"speak up on the mailing list");
				4055	WARN_ON(1);
				4056	warned = true;
				4057	}
				4058
				4059	ret = -EINPROGRESS;
				4060	private = file->private_data;
				4061	if (private && private->trans)
				4062	goto out;
				4063	if (!private) {
				4064	private = kzalloc(sizeof(struct btrfs_file_private),
				4065	GFP_KERNEL);
				4066	if (!private)
				4067	return -ENOMEM;
				4068	file->private_data = private;
				4069	}
				4070
				4071	ret = -EROFS;
				4072	if (btrfs_root_readonly(root))
				4073	goto out;
				4074
				4075	ret = mnt_want_write_file(file);
				4076	if (ret)
				4077	goto out;
				4078
				4079	atomic_inc(&fs_info->open_ioctl_trans);
				4080
				4081	ret = -ENOMEM;
				4082	trans = btrfs_start_ioctl_transaction(root);
				4083	if (IS_ERR(trans))
				4084	goto out_drop;
				4085
				4086	private->trans = trans;
				4087	return 0;
				4088
				4089	out_drop:
				4090	atomic_dec(&fs_info->open_ioctl_trans);
				4091	mnt_drop_write_file(file);
				4092	out:
				4093	return ret;
				4094	}
				4095
				4096	static long btrfs_ioctl_default_subvol(struct file file, void __user argp)
				4097	{
				4098	struct inode *inode = file_inode(file);
				4099	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				4100	struct btrfs_root *root = BTRFS_I(inode)->root;
				4101	struct btrfs_root *new_root;
				4102	struct btrfs_dir_item *di;
				4103	struct btrfs_trans_handle *trans;
				4104	struct btrfs_path *path;
				4105	struct btrfs_key location;
				4106	struct btrfs_disk_key disk_key;
				4107	u64 objectid = 0;
				4108	u64 dir_id;
				4109	int ret;
				4110
				4111	if (!capable(CAP_SYS_ADMIN))
				4112	return -EPERM;
				4113
				4114	ret = mnt_want_write_file(file);
				4115	if (ret)
				4116	return ret;
				4117
				4118	if (copy_from_user(&objectid, argp, sizeof(objectid))) {
				4119	ret = -EFAULT;
				4120	goto out;
				4121	}
				4122
				4123	if (!objectid)
				4124	objectid = BTRFS_FS_TREE_OBJECTID;
				4125
				4126	location.objectid = objectid;
				4127	location.type = BTRFS_ROOT_ITEM_KEY;
				4128	location.offset = (u64)-1;
				4129
				4130	new_root = btrfs_read_fs_root_no_name(fs_info, &location);
				4131	if (IS_ERR(new_root)) {
				4132	ret = PTR_ERR(new_root);
				4133	goto out;
				4134	}
				4135	if (!is_fstree(new_root->objectid)) {
				4136	ret = -ENOENT;
				4137	goto out;
				4138	}
				4139
				4140	path = btrfs_alloc_path();
				4141	if (!path) {
				4142	ret = -ENOMEM;
				4143	goto out;
				4144	}
				4145	path->leave_spinning = 1;
				4146
				4147	trans = btrfs_start_transaction(root, 1);
				4148	if (IS_ERR(trans)) {
				4149	btrfs_free_path(path);
				4150	ret = PTR_ERR(trans);
				4151	goto out;
				4152	}
				4153
				4154	dir_id = btrfs_super_root_dir(fs_info->super_copy);
				4155	di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path,
				4156	dir_id, "default", 7, 1);
				4157	if (IS_ERR_OR_NULL(di)) {
				4158	btrfs_free_path(path);
				4159	btrfs_end_transaction(trans);
				4160	btrfs_err(fs_info,
				4161	"Umm, you don't have the default diritem, this isn't going to work");
				4162	ret = -ENOENT;
				4163	goto out;
				4164	}
				4165
				4166	btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
				4167	btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
				4168	btrfs_mark_buffer_dirty(path->nodes[0]);
				4169	btrfs_free_path(path);
				4170
				4171	btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL);
				4172	btrfs_end_transaction(trans);
				4173	out:
				4174	mnt_drop_write_file(file);
				4175	return ret;
				4176	}
				4177
				4178	void btrfs_get_block_group_info(struct list_head *groups_list,
				4179	struct btrfs_ioctl_space_info *space)
				4180	{
				4181	struct btrfs_block_group_cache *block_group;
				4182
				4183	space->total_bytes = 0;
				4184	space->used_bytes = 0;
				4185	space->flags = 0;
				4186	list_for_each_entry(block_group, groups_list, list) {
				4187	space->flags = block_group->flags;
				4188	space->total_bytes += block_group->key.offset;
				4189	space->used_bytes +=
				4190	btrfs_block_group_used(&block_group->item);
				4191	}
				4192	}
				4193
				4194	static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
				4195	void __user *arg)
				4196	{
				4197	struct btrfs_ioctl_space_args space_args;
				4198	struct btrfs_ioctl_space_info space;
				4199	struct btrfs_ioctl_space_info *dest;
				4200	struct btrfs_ioctl_space_info *dest_orig;
				4201	struct btrfs_ioctl_space_info __user *user_dest;
				4202	struct btrfs_space_info *info;
				4203	u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
				4204	BTRFS_BLOCK_GROUP_SYSTEM,
				4205	BTRFS_BLOCK_GROUP_METADATA,
				4206	BTRFS_BLOCK_GROUP_DATA \| BTRFS_BLOCK_GROUP_METADATA};
				4207	int num_types = 4;
				4208	int alloc_size;
				4209	int ret = 0;
				4210	u64 slot_count = 0;
				4211	int i, c;
				4212
				4213	if (copy_from_user(&space_args,
				4214	(struct btrfs_ioctl_space_args __user *)arg,
				4215	sizeof(space_args)))
				4216	return -EFAULT;
				4217
				4218	for (i = 0; i < num_types; i++) {
				4219	struct btrfs_space_info *tmp;
				4220
				4221	info = NULL;
				4222	rcu_read_lock();
				4223	list_for_each_entry_rcu(tmp, &fs_info->space_info,
				4224	list) {
				4225	if (tmp->flags == types[i]) {
				4226	info = tmp;
				4227	break;
				4228	}
				4229	}
				4230	rcu_read_unlock();
				4231
				4232	if (!info)
				4233	continue;
				4234
				4235	down_read(&info->groups_sem);
				4236	for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
				4237	if (!list_empty(&info->block_groups[c]))
				4238	slot_count++;
				4239	}
				4240	up_read(&info->groups_sem);
				4241	}
				4242
				4243	/*
				4244	* Global block reserve, exported as a space_info
				4245	*/
				4246	slot_count++;
				4247
				4248	/* space_slots == 0 means they are asking for a count */
				4249	if (space_args.space_slots == 0) {
				4250	space_args.total_spaces = slot_count;
				4251	goto out;
				4252	}
				4253
				4254	slot_count = min_t(u64, space_args.space_slots, slot_count);
				4255
				4256	alloc_size = sizeof(dest) slot_count;
				4257
				4258	/* we generally have at most 6 or so space infos, one for each raid
				4259	* level. So, a whole page should be more than enough for everyone
				4260	*/
				4261	if (alloc_size > PAGE_SIZE)
				4262	return -ENOMEM;
				4263
				4264	space_args.total_spaces = 0;
				4265	dest = kmalloc(alloc_size, GFP_KERNEL);
				4266	if (!dest)
				4267	return -ENOMEM;
				4268	dest_orig = dest;
				4269
				4270	/* now we have a buffer to copy into */
				4271	for (i = 0; i < num_types; i++) {
				4272	struct btrfs_space_info *tmp;
				4273
				4274	if (!slot_count)
				4275	break;
				4276
				4277	info = NULL;
				4278	rcu_read_lock();
				4279	list_for_each_entry_rcu(tmp, &fs_info->space_info,
				4280	list) {
				4281	if (tmp->flags == types[i]) {
				4282	info = tmp;
				4283	break;
				4284	}
				4285	}
				4286	rcu_read_unlock();
				4287
				4288	if (!info)
				4289	continue;
				4290	down_read(&info->groups_sem);
				4291	for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
				4292	if (!list_empty(&info->block_groups[c])) {
				4293	btrfs_get_block_group_info(
				4294	&info->block_groups[c], &space);
				4295	memcpy(dest, &space, sizeof(space));
				4296	dest++;
				4297	space_args.total_spaces++;
				4298	slot_count--;
				4299	}
				4300	if (!slot_count)
				4301	break;
				4302	}
				4303	up_read(&info->groups_sem);
				4304	}
				4305
				4306	/*
				4307	* Add global block reserve
				4308	*/
				4309	if (slot_count) {
				4310	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
				4311
				4312	spin_lock(&block_rsv->lock);
				4313	space.total_bytes = block_rsv->size;
				4314	space.used_bytes = block_rsv->size - block_rsv->reserved;
				4315	spin_unlock(&block_rsv->lock);
				4316	space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV;
				4317	memcpy(dest, &space, sizeof(space));
				4318	space_args.total_spaces++;
				4319	}
				4320
				4321	user_dest = (struct btrfs_ioctl_space_info __user *)
				4322	(arg + sizeof(struct btrfs_ioctl_space_args));
				4323
				4324	if (copy_to_user(user_dest, dest_orig, alloc_size))
				4325	ret = -EFAULT;
				4326
				4327	kfree(dest_orig);
				4328	out:
				4329	if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args)))
				4330	ret = -EFAULT;
				4331
				4332	return ret;
				4333	}
				4334
				4335	/*
				4336	* there are many ways the trans_start and trans_end ioctls can lead
				4337	* to deadlocks. They should only be used by applications that
				4338	* basically own the machine, and have a very in depth understanding
				4339	* of all the possible deadlocks and enospc problems.
				4340	*/
				4341	long btrfs_ioctl_trans_end(struct file *file)
				4342	{
				4343	struct inode *inode = file_inode(file);
				4344	struct btrfs_root *root = BTRFS_I(inode)->root;
				4345	struct btrfs_file_private *private = file->private_data;
				4346
				4347	if (!private \|\| !private->trans)
				4348	return -EINVAL;
				4349
				4350	btrfs_end_transaction(private->trans);
				4351	private->trans = NULL;
				4352
				4353	atomic_dec(&root->fs_info->open_ioctl_trans);
				4354
				4355	mnt_drop_write_file(file);
				4356	return 0;
				4357	}
				4358
				4359	static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
				4360	void __user *argp)
				4361	{
				4362	struct btrfs_trans_handle *trans;
				4363	u64 transid;
				4364	int ret;
				4365
				4366	trans = btrfs_attach_transaction_barrier(root);
				4367	if (IS_ERR(trans)) {
				4368	if (PTR_ERR(trans) != -ENOENT)
				4369	return PTR_ERR(trans);
				4370
				4371	/* No running transaction, don't bother */
				4372	transid = root->fs_info->last_trans_committed;
				4373	goto out;
				4374	}
				4375	transid = trans->transid;
				4376	ret = btrfs_commit_transaction_async(trans, 0);
				4377	if (ret) {
				4378	btrfs_end_transaction(trans);
				4379	return ret;
				4380	}
				4381	out:
				4382	if (argp)
				4383	if (copy_to_user(argp, &transid, sizeof(transid)))
				4384	return -EFAULT;
				4385	return 0;
				4386	}
				4387
				4388	static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info,
				4389	void __user *argp)
				4390	{
				4391	u64 transid;
				4392
				4393	if (argp) {
				4394	if (copy_from_user(&transid, argp, sizeof(transid)))
				4395	return -EFAULT;
				4396	} else {
				4397	transid = 0; /* current trans */
				4398	}
				4399	return btrfs_wait_for_commit(fs_info, transid);
				4400	}
				4401
				4402	static long btrfs_ioctl_scrub(struct file file, void __user arg)
				4403	{
				4404	struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
				4405	struct btrfs_ioctl_scrub_args *sa;
				4406	int ret;
				4407
				4408	if (!capable(CAP_SYS_ADMIN))
				4409	return -EPERM;
				4410
				4411	sa = memdup_user(arg, sizeof(*sa));
				4412	if (IS_ERR(sa))
				4413	return PTR_ERR(sa);
				4414
				4415	if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
				4416	ret = mnt_want_write_file(file);
				4417	if (ret)
				4418	goto out;
				4419	}
				4420
				4421	ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end,
				4422	&sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
				4423	0);
				4424
				4425	if (copy_to_user(arg, sa, sizeof(*sa)))
				4426	ret = -EFAULT;
				4427
				4428	if (!(sa->flags & BTRFS_SCRUB_READONLY))
				4429	mnt_drop_write_file(file);
				4430	out:
				4431	kfree(sa);
				4432	return ret;
				4433	}
				4434
				4435	static long btrfs_ioctl_scrub_cancel(struct btrfs_fs_info *fs_info)
				4436	{
				4437	if (!capable(CAP_SYS_ADMIN))
				4438	return -EPERM;
				4439
				4440	return btrfs_scrub_cancel(fs_info);
				4441	}
				4442
				4443	static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info,
				4444	void __user *arg)
				4445	{
				4446	struct btrfs_ioctl_scrub_args *sa;
				4447	int ret;
				4448
				4449	if (!capable(CAP_SYS_ADMIN))
				4450	return -EPERM;
				4451
				4452	sa = memdup_user(arg, sizeof(*sa));
				4453	if (IS_ERR(sa))
				4454	return PTR_ERR(sa);
				4455
				4456	ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress);
				4457
				4458	if (copy_to_user(arg, sa, sizeof(*sa)))
				4459	ret = -EFAULT;
				4460
				4461	kfree(sa);
				4462	return ret;
				4463	}
				4464
				4465	static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info,
				4466	void __user *arg)
				4467	{
				4468	struct btrfs_ioctl_get_dev_stats *sa;
				4469	int ret;
				4470
				4471	sa = memdup_user(arg, sizeof(*sa));
				4472	if (IS_ERR(sa))
				4473	return PTR_ERR(sa);
				4474
				4475	if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) {
				4476	kfree(sa);
				4477	return -EPERM;
				4478	}
				4479
				4480	ret = btrfs_get_dev_stats(fs_info, sa);
				4481
				4482	if (copy_to_user(arg, sa, sizeof(*sa)))
				4483	ret = -EFAULT;
				4484
				4485	kfree(sa);
				4486	return ret;
				4487	}
				4488
				4489	static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
				4490	void __user *arg)
				4491	{
				4492	struct btrfs_ioctl_dev_replace_args *p;
				4493	int ret;
				4494
				4495	if (!capable(CAP_SYS_ADMIN))
				4496	return -EPERM;
				4497
				4498	p = memdup_user(arg, sizeof(*p));
				4499	if (IS_ERR(p))
				4500	return PTR_ERR(p);
				4501
				4502	switch (p->cmd) {
				4503	case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
				4504	if (sb_rdonly(fs_info->sb)) {
				4505	ret = -EROFS;
				4506	goto out;
				4507	}
				4508	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
				4509	ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
				4510	} else {
				4511	ret = btrfs_dev_replace_by_ioctl(fs_info, p);
				4512	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
				4513	}
				4514	break;
				4515	case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
				4516	btrfs_dev_replace_status(fs_info, p);
				4517	ret = 0;
				4518	break;
				4519	case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
				4520	ret = btrfs_dev_replace_cancel(fs_info, p);
				4521	break;
				4522	default:
				4523	ret = -EINVAL;
				4524	break;
				4525	}
				4526
				4527	if (copy_to_user(arg, p, sizeof(*p)))
				4528	ret = -EFAULT;
				4529	out:
				4530	kfree(p);
				4531	return ret;
				4532	}
				4533
				4534	static long btrfs_ioctl_ino_to_path(struct btrfs_root root, void __user arg)
				4535	{
				4536	int ret = 0;
				4537	int i;
				4538	u64 rel_ptr;
				4539	int size;
				4540	struct btrfs_ioctl_ino_path_args *ipa = NULL;
				4541	struct inode_fs_paths *ipath = NULL;
				4542	struct btrfs_path *path;
				4543
				4544	if (!capable(CAP_DAC_READ_SEARCH))
				4545	return -EPERM;
				4546
				4547	path = btrfs_alloc_path();
				4548	if (!path) {
				4549	ret = -ENOMEM;
				4550	goto out;
				4551	}
				4552
				4553	ipa = memdup_user(arg, sizeof(*ipa));
				4554	if (IS_ERR(ipa)) {
				4555	ret = PTR_ERR(ipa);
				4556	ipa = NULL;
				4557	goto out;
				4558	}
				4559
				4560	size = min_t(u32, ipa->size, 4096);
				4561	ipath = init_ipath(size, root, path);
				4562	if (IS_ERR(ipath)) {
				4563	ret = PTR_ERR(ipath);
				4564	ipath = NULL;
				4565	goto out;
				4566	}
				4567
				4568	ret = paths_from_inode(ipa->inum, ipath);
				4569	if (ret < 0)
				4570	goto out;
				4571
				4572	for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
				4573	rel_ptr = ipath->fspath->val[i] -
				4574	(u64)(unsigned long)ipath->fspath->val;
				4575	ipath->fspath->val[i] = rel_ptr;
				4576	}
				4577
				4578	ret = copy_to_user((void *)(unsigned long)ipa->fspath,
				4579	(void *)(unsigned long)ipath->fspath, size);
				4580	if (ret) {
				4581	ret = -EFAULT;
				4582	goto out;
				4583	}
				4584
				4585	out:
				4586	btrfs_free_path(path);
				4587	free_ipath(ipath);
				4588	kfree(ipa);
				4589
				4590	return ret;
				4591	}
				4592
				4593	static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
				4594	{
				4595	struct btrfs_data_container *inodes = ctx;
				4596	const size_t c = 3 * sizeof(u64);
				4597
				4598	if (inodes->bytes_left >= c) {
				4599	inodes->bytes_left -= c;
				4600	inodes->val[inodes->elem_cnt] = inum;
				4601	inodes->val[inodes->elem_cnt + 1] = offset;
				4602	inodes->val[inodes->elem_cnt + 2] = root;
				4603	inodes->elem_cnt += 3;
				4604	} else {
				4605	inodes->bytes_missing += c - inodes->bytes_left;
				4606	inodes->bytes_left = 0;
				4607	inodes->elem_missed += 3;
				4608	}
				4609
				4610	return 0;
				4611	}
				4612
				4613	static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
				4614	void __user *arg)
				4615	{
				4616	int ret = 0;
				4617	int size;
				4618	struct btrfs_ioctl_logical_ino_args *loi;
				4619	struct btrfs_data_container *inodes = NULL;
				4620	struct btrfs_path *path = NULL;
				4621
				4622	if (!capable(CAP_SYS_ADMIN))
				4623	return -EPERM;
				4624
				4625	loi = memdup_user(arg, sizeof(*loi));
				4626	if (IS_ERR(loi))
				4627	return PTR_ERR(loi);
				4628
				4629	path = btrfs_alloc_path();
				4630	if (!path) {
				4631	ret = -ENOMEM;
				4632	goto out;
				4633	}
				4634
				4635	size = min_t(u32, loi->size, SZ_64K);
				4636	inodes = init_data_container(size);
				4637	if (IS_ERR(inodes)) {
				4638	ret = PTR_ERR(inodes);
				4639	inodes = NULL;
				4640	goto out;
				4641	}
				4642
				4643	ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
				4644	build_ino_list, inodes);
				4645	if (ret == -EINVAL)
				4646	ret = -ENOENT;
				4647	if (ret < 0)
				4648	goto out;
				4649
				4650	ret = copy_to_user((void *)(unsigned long)loi->inodes,
				4651	(void *)(unsigned long)inodes, size);
				4652	if (ret)
				4653	ret = -EFAULT;
				4654
				4655	out:
				4656	btrfs_free_path(path);
				4657	kvfree(inodes);
				4658	kfree(loi);
				4659
				4660	return ret;
				4661	}
				4662
				4663	void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
				4664	struct btrfs_ioctl_balance_args *bargs)
				4665	{
				4666	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
				4667
				4668	bargs->flags = bctl->flags;
				4669
				4670	if (atomic_read(&fs_info->balance_running))
				4671	bargs->state \|= BTRFS_BALANCE_STATE_RUNNING;
				4672	if (atomic_read(&fs_info->balance_pause_req))
				4673	bargs->state \|= BTRFS_BALANCE_STATE_PAUSE_REQ;
				4674	if (atomic_read(&fs_info->balance_cancel_req))
				4675	bargs->state \|= BTRFS_BALANCE_STATE_CANCEL_REQ;
				4676
				4677	memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
				4678	memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
				4679	memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
				4680
				4681	if (lock) {
				4682	spin_lock(&fs_info->balance_lock);
				4683	memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
				4684	spin_unlock(&fs_info->balance_lock);
				4685	} else {
				4686	memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
				4687	}
				4688	}
				4689
				4690	static long btrfs_ioctl_balance(struct file file, void __user arg)
				4691	{
				4692	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
				4693	struct btrfs_fs_info *fs_info = root->fs_info;
				4694	struct btrfs_ioctl_balance_args *bargs;
				4695	struct btrfs_balance_control *bctl;
				4696	bool need_unlock; /* for mut. excl. ops lock */
				4697	int ret;
				4698
				4699	if (!capable(CAP_SYS_ADMIN))
				4700	return -EPERM;
				4701
				4702	ret = mnt_want_write_file(file);
				4703	if (ret)
				4704	return ret;
				4705
				4706	again:
				4707	if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
				4708	mutex_lock(&fs_info->volume_mutex);
				4709	mutex_lock(&fs_info->balance_mutex);
				4710	need_unlock = true;
				4711	goto locked;
				4712	}
				4713
				4714	/*
				4715	* mut. excl. ops lock is locked. Three possibilities:
				4716	* (1) some other op is running
				4717	* (2) balance is running
				4718	* (3) balance is paused -- special case (think resume)
				4719	*/
				4720	mutex_lock(&fs_info->balance_mutex);
				4721	if (fs_info->balance_ctl) {
				4722	/* this is either (2) or (3) */
				4723	if (!atomic_read(&fs_info->balance_running)) {
				4724	mutex_unlock(&fs_info->balance_mutex);
				4725	if (!mutex_trylock(&fs_info->volume_mutex))
				4726	goto again;
				4727	mutex_lock(&fs_info->balance_mutex);
				4728
				4729	if (fs_info->balance_ctl &&
				4730	!atomic_read(&fs_info->balance_running)) {
				4731	/* this is (3) */
				4732	need_unlock = false;
				4733	goto locked;
				4734	}
				4735
				4736	mutex_unlock(&fs_info->balance_mutex);
				4737	mutex_unlock(&fs_info->volume_mutex);
				4738	goto again;
				4739	} else {
				4740	/* this is (2) */
				4741	mutex_unlock(&fs_info->balance_mutex);
				4742	ret = -EINPROGRESS;
				4743	goto out;
				4744	}
				4745	} else {
				4746	/* this is (1) */
				4747	mutex_unlock(&fs_info->balance_mutex);
				4748	ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
				4749	goto out;
				4750	}
				4751
				4752	locked:
				4753	BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
				4754
				4755	if (arg) {
				4756	bargs = memdup_user(arg, sizeof(*bargs));
				4757	if (IS_ERR(bargs)) {
				4758	ret = PTR_ERR(bargs);
				4759	goto out_unlock;
				4760	}
				4761
				4762	if (bargs->flags & BTRFS_BALANCE_RESUME) {
				4763	if (!fs_info->balance_ctl) {
				4764	ret = -ENOTCONN;
				4765	goto out_bargs;
				4766	}
				4767
				4768	bctl = fs_info->balance_ctl;
				4769	spin_lock(&fs_info->balance_lock);
				4770	bctl->flags \|= BTRFS_BALANCE_RESUME;
				4771	spin_unlock(&fs_info->balance_lock);
				4772
				4773	goto do_balance;
				4774	}
				4775	} else {
				4776	bargs = NULL;
				4777	}
				4778
				4779	if (fs_info->balance_ctl) {
				4780	ret = -EINPROGRESS;
				4781	goto out_bargs;
				4782	}
				4783
				4784	bctl = kzalloc(sizeof(*bctl), GFP_KERNEL);
				4785	if (!bctl) {
				4786	ret = -ENOMEM;
				4787	goto out_bargs;
				4788	}
				4789
				4790	bctl->fs_info = fs_info;
				4791	if (arg) {
				4792	memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
				4793	memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
				4794	memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
				4795
				4796	bctl->flags = bargs->flags;
				4797	} else {
				4798	/* balance everything - no filters */
				4799	bctl->flags \|= BTRFS_BALANCE_TYPE_MASK;
				4800	}
				4801
				4802	if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK \| BTRFS_BALANCE_TYPE_MASK)) {
				4803	ret = -EINVAL;
				4804	goto out_bctl;
				4805	}
				4806
				4807	do_balance:
				4808	/*
				4809	* Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP
				4810	* goes to to btrfs_balance. bctl is freed in __cancel_balance,
				4811	* or, if restriper was paused all the way until unmount, in
				4812	* free_fs_info. The flag is cleared in __cancel_balance.
				4813	*/
				4814	need_unlock = false;
				4815
				4816	ret = btrfs_balance(bctl, bargs);
				4817	bctl = NULL;
				4818
				4819	if (arg) {
				4820	if (copy_to_user(arg, bargs, sizeof(*bargs)))
				4821	ret = -EFAULT;
				4822	}
				4823
				4824	out_bctl:
				4825	kfree(bctl);
				4826	out_bargs:
				4827	kfree(bargs);
				4828	out_unlock:
				4829	mutex_unlock(&fs_info->balance_mutex);
				4830	mutex_unlock(&fs_info->volume_mutex);
				4831	if (need_unlock)
				4832	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
				4833	out:
				4834	mnt_drop_write_file(file);
				4835	return ret;
				4836	}
				4837
				4838	static long btrfs_ioctl_balance_ctl(struct btrfs_fs_info *fs_info, int cmd)
				4839	{
				4840	if (!capable(CAP_SYS_ADMIN))
				4841	return -EPERM;
				4842
				4843	switch (cmd) {
				4844	case BTRFS_BALANCE_CTL_PAUSE:
				4845	return btrfs_pause_balance(fs_info);
				4846	case BTRFS_BALANCE_CTL_CANCEL:
				4847	return btrfs_cancel_balance(fs_info);
				4848	}
				4849
				4850	return -EINVAL;
				4851	}
				4852
				4853	static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info,
				4854	void __user *arg)
				4855	{
				4856	struct btrfs_ioctl_balance_args *bargs;
				4857	int ret = 0;
				4858
				4859	if (!capable(CAP_SYS_ADMIN))
				4860	return -EPERM;
				4861
				4862	mutex_lock(&fs_info->balance_mutex);
				4863	if (!fs_info->balance_ctl) {
				4864	ret = -ENOTCONN;
				4865	goto out;
				4866	}
				4867
				4868	bargs = kzalloc(sizeof(*bargs), GFP_KERNEL);
				4869	if (!bargs) {
				4870	ret = -ENOMEM;
				4871	goto out;
				4872	}
				4873
				4874	update_ioctl_balance_args(fs_info, 1, bargs);
				4875
				4876	if (copy_to_user(arg, bargs, sizeof(*bargs)))
				4877	ret = -EFAULT;
				4878
				4879	kfree(bargs);
				4880	out:
				4881	mutex_unlock(&fs_info->balance_mutex);
				4882	return ret;
				4883	}
				4884
				4885	static long btrfs_ioctl_quota_ctl(struct file file, void __user arg)
				4886	{
				4887	struct inode *inode = file_inode(file);
				4888	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				4889	struct btrfs_ioctl_quota_ctl_args *sa;
				4890	struct btrfs_trans_handle *trans = NULL;
				4891	int ret;
				4892	int err;
				4893
				4894	if (!capable(CAP_SYS_ADMIN))
				4895	return -EPERM;
				4896
				4897	ret = mnt_want_write_file(file);
				4898	if (ret)
				4899	return ret;
				4900
				4901	sa = memdup_user(arg, sizeof(*sa));
				4902	if (IS_ERR(sa)) {
				4903	ret = PTR_ERR(sa);
				4904	goto drop_write;
				4905	}
				4906
				4907	down_write(&fs_info->subvol_sem);
				4908	trans = btrfs_start_transaction(fs_info->tree_root, 2);
				4909	if (IS_ERR(trans)) {
				4910	ret = PTR_ERR(trans);
				4911	goto out;
				4912	}
				4913
				4914	switch (sa->cmd) {
				4915	case BTRFS_QUOTA_CTL_ENABLE:
				4916	ret = btrfs_quota_enable(trans, fs_info);
				4917	break;
				4918	case BTRFS_QUOTA_CTL_DISABLE:
				4919	ret = btrfs_quota_disable(trans, fs_info);
				4920	break;
				4921	default:
				4922	ret = -EINVAL;
				4923	break;
				4924	}
				4925
				4926	err = btrfs_commit_transaction(trans);
				4927	if (err && !ret)
				4928	ret = err;
				4929	out:
				4930	kfree(sa);
				4931	up_write(&fs_info->subvol_sem);
				4932	drop_write:
				4933	mnt_drop_write_file(file);
				4934	return ret;
				4935	}
				4936
				4937	static long btrfs_ioctl_qgroup_assign(struct file file, void __user arg)
				4938	{
				4939	struct inode *inode = file_inode(file);
				4940	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				4941	struct btrfs_root *root = BTRFS_I(inode)->root;
				4942	struct btrfs_ioctl_qgroup_assign_args *sa;
				4943	struct btrfs_trans_handle *trans;
				4944	int ret;
				4945	int err;
				4946
				4947	if (!capable(CAP_SYS_ADMIN))
				4948	return -EPERM;
				4949
				4950	ret = mnt_want_write_file(file);
				4951	if (ret)
				4952	return ret;
				4953
				4954	sa = memdup_user(arg, sizeof(*sa));
				4955	if (IS_ERR(sa)) {
				4956	ret = PTR_ERR(sa);
				4957	goto drop_write;
				4958	}
				4959
				4960	trans = btrfs_join_transaction(root);
				4961	if (IS_ERR(trans)) {
				4962	ret = PTR_ERR(trans);
				4963	goto out;
				4964	}
				4965
				4966	if (sa->assign) {
				4967	ret = btrfs_add_qgroup_relation(trans, fs_info,
				4968	sa->src, sa->dst);
				4969	} else {
				4970	ret = btrfs_del_qgroup_relation(trans, fs_info,
				4971	sa->src, sa->dst);
				4972	}
				4973
				4974	/* update qgroup status and info */
				4975	err = btrfs_run_qgroups(trans, fs_info);
				4976	if (err < 0)
				4977	btrfs_handle_fs_error(fs_info, err,
				4978	"failed to update qgroup status and info");
				4979	err = btrfs_end_transaction(trans);
				4980	if (err && !ret)
				4981	ret = err;
				4982
				4983	out:
				4984	kfree(sa);
				4985	drop_write:
				4986	mnt_drop_write_file(file);
				4987	return ret;
				4988	}
				4989
				4990	static long btrfs_ioctl_qgroup_create(struct file file, void __user arg)
				4991	{
				4992	struct inode *inode = file_inode(file);
				4993	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				4994	struct btrfs_root *root = BTRFS_I(inode)->root;
				4995	struct btrfs_ioctl_qgroup_create_args *sa;
				4996	struct btrfs_trans_handle *trans;
				4997	int ret;
				4998	int err;
				4999
				5000	if (!capable(CAP_SYS_ADMIN))
				5001	return -EPERM;
				5002
				5003	ret = mnt_want_write_file(file);
				5004	if (ret)
				5005	return ret;
				5006
				5007	sa = memdup_user(arg, sizeof(*sa));
				5008	if (IS_ERR(sa)) {
				5009	ret = PTR_ERR(sa);
				5010	goto drop_write;
				5011	}
				5012
				5013	if (!sa->qgroupid) {
				5014	ret = -EINVAL;
				5015	goto out;
				5016	}
				5017
				5018	trans = btrfs_join_transaction(root);
				5019	if (IS_ERR(trans)) {
				5020	ret = PTR_ERR(trans);
				5021	goto out;
				5022	}
				5023
				5024	if (sa->create) {
				5025	ret = btrfs_create_qgroup(trans, fs_info, sa->qgroupid);
				5026	} else {
				5027	ret = btrfs_remove_qgroup(trans, fs_info, sa->qgroupid);
				5028	}
				5029
				5030	err = btrfs_end_transaction(trans);
				5031	if (err && !ret)
				5032	ret = err;
				5033
				5034	out:
				5035	kfree(sa);
				5036	drop_write:
				5037	mnt_drop_write_file(file);
				5038	return ret;
				5039	}
				5040
				5041	static long btrfs_ioctl_qgroup_limit(struct file file, void __user arg)
				5042	{
				5043	struct inode *inode = file_inode(file);
				5044	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				5045	struct btrfs_root *root = BTRFS_I(inode)->root;
				5046	struct btrfs_ioctl_qgroup_limit_args *sa;
				5047	struct btrfs_trans_handle *trans;
				5048	int ret;
				5049	int err;
				5050	u64 qgroupid;
				5051
				5052	if (!capable(CAP_SYS_ADMIN))
				5053	return -EPERM;
				5054
				5055	ret = mnt_want_write_file(file);
				5056	if (ret)
				5057	return ret;
				5058
				5059	sa = memdup_user(arg, sizeof(*sa));
				5060	if (IS_ERR(sa)) {
				5061	ret = PTR_ERR(sa);
				5062	goto drop_write;
				5063	}
				5064
				5065	trans = btrfs_join_transaction(root);
				5066	if (IS_ERR(trans)) {
				5067	ret = PTR_ERR(trans);
				5068	goto out;
				5069	}
				5070
				5071	qgroupid = sa->qgroupid;
				5072	if (!qgroupid) {
				5073	/* take the current subvol as qgroup */
				5074	qgroupid = root->root_key.objectid;
				5075	}
				5076
				5077	ret = btrfs_limit_qgroup(trans, fs_info, qgroupid, &sa->lim);
				5078
				5079	err = btrfs_end_transaction(trans);
				5080	if (err && !ret)
				5081	ret = err;
				5082
				5083	out:
				5084	kfree(sa);
				5085	drop_write:
				5086	mnt_drop_write_file(file);
				5087	return ret;
				5088	}
				5089
				5090	static long btrfs_ioctl_quota_rescan(struct file file, void __user arg)
				5091	{
				5092	struct inode *inode = file_inode(file);
				5093	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				5094	struct btrfs_ioctl_quota_rescan_args *qsa;
				5095	int ret;
				5096
				5097	if (!capable(CAP_SYS_ADMIN))
				5098	return -EPERM;
				5099
				5100	ret = mnt_want_write_file(file);
				5101	if (ret)
				5102	return ret;
				5103
				5104	qsa = memdup_user(arg, sizeof(*qsa));
				5105	if (IS_ERR(qsa)) {
				5106	ret = PTR_ERR(qsa);
				5107	goto drop_write;
				5108	}
				5109
				5110	if (qsa->flags) {
				5111	ret = -EINVAL;
				5112	goto out;
				5113	}
				5114
				5115	ret = btrfs_qgroup_rescan(fs_info);
				5116
				5117	out:
				5118	kfree(qsa);
				5119	drop_write:
				5120	mnt_drop_write_file(file);
				5121	return ret;
				5122	}
				5123
				5124	static long btrfs_ioctl_quota_rescan_status(struct file file, void __user arg)
				5125	{
				5126	struct inode *inode = file_inode(file);
				5127	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				5128	struct btrfs_ioctl_quota_rescan_args *qsa;
				5129	int ret = 0;
				5130
				5131	if (!capable(CAP_SYS_ADMIN))
				5132	return -EPERM;
				5133
				5134	qsa = kzalloc(sizeof(*qsa), GFP_KERNEL);
				5135	if (!qsa)
				5136	return -ENOMEM;
				5137
				5138	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
				5139	qsa->flags = 1;
				5140	qsa->progress = fs_info->qgroup_rescan_progress.objectid;
				5141	}
				5142
				5143	if (copy_to_user(arg, qsa, sizeof(*qsa)))
				5144	ret = -EFAULT;
				5145
				5146	kfree(qsa);
				5147	return ret;
				5148	}
				5149
				5150	static long btrfs_ioctl_quota_rescan_wait(struct file file, void __user arg)
				5151	{
				5152	struct inode *inode = file_inode(file);
				5153	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				5154
				5155	if (!capable(CAP_SYS_ADMIN))
				5156	return -EPERM;
				5157
				5158	return btrfs_qgroup_wait_for_completion(fs_info, true);
				5159	}
				5160
				5161	static long _btrfs_ioctl_set_received_subvol(struct file *file,
				5162	struct btrfs_ioctl_received_subvol_args *sa)
				5163	{
				5164	struct inode *inode = file_inode(file);
				5165	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				5166	struct btrfs_root *root = BTRFS_I(inode)->root;
				5167	struct btrfs_root_item *root_item = &root->root_item;
				5168	struct btrfs_trans_handle *trans;
				5169	struct timespec ct = current_time(inode);
				5170	int ret = 0;
				5171	int received_uuid_changed;
				5172
				5173	if (!inode_owner_or_capable(inode))
				5174	return -EPERM;
				5175
				5176	ret = mnt_want_write_file(file);
				5177	if (ret < 0)
				5178	return ret;
				5179
				5180	down_write(&fs_info->subvol_sem);
				5181
				5182	if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
				5183	ret = -EINVAL;
				5184	goto out;
				5185	}
				5186
				5187	if (btrfs_root_readonly(root)) {
				5188	ret = -EROFS;
				5189	goto out;
				5190	}
				5191
				5192	/*
				5193	* 1 - root item
				5194	* 2 - uuid items (received uuid + subvol uuid)
				5195	*/
				5196	trans = btrfs_start_transaction(root, 3);
				5197	if (IS_ERR(trans)) {
				5198	ret = PTR_ERR(trans);
				5199	trans = NULL;
				5200	goto out;
				5201	}
				5202
				5203	sa->rtransid = trans->transid;
				5204	sa->rtime.sec = ct.tv_sec;
				5205	sa->rtime.nsec = ct.tv_nsec;
				5206
				5207	received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid,
				5208	BTRFS_UUID_SIZE);
				5209	if (received_uuid_changed &&
				5210	!btrfs_is_empty_uuid(root_item->received_uuid))
				5211	btrfs_uuid_tree_rem(trans, fs_info, root_item->received_uuid,
				5212	BTRFS_UUID_KEY_RECEIVED_SUBVOL,
				5213	root->root_key.objectid);
				5214	memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE);
				5215	btrfs_set_root_stransid(root_item, sa->stransid);
				5216	btrfs_set_root_rtransid(root_item, sa->rtransid);
				5217	btrfs_set_stack_timespec_sec(&root_item->stime, sa->stime.sec);
				5218	btrfs_set_stack_timespec_nsec(&root_item->stime, sa->stime.nsec);
				5219	btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec);
				5220	btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec);
				5221
				5222	ret = btrfs_update_root(trans, fs_info->tree_root,
				5223	&root->root_key, &root->root_item);
				5224	if (ret < 0) {
				5225	btrfs_end_transaction(trans);
				5226	goto out;
				5227	}
				5228	if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) {
				5229	ret = btrfs_uuid_tree_add(trans, fs_info, sa->uuid,
				5230	BTRFS_UUID_KEY_RECEIVED_SUBVOL,
				5231	root->root_key.objectid);
				5232	if (ret < 0 && ret != -EEXIST) {
				5233	btrfs_abort_transaction(trans, ret);
				5234	goto out;
				5235	}
				5236	}
				5237	ret = btrfs_commit_transaction(trans);
				5238	if (ret < 0) {
				5239	btrfs_abort_transaction(trans, ret);
				5240	goto out;
				5241	}
				5242
				5243	out:
				5244	up_write(&fs_info->subvol_sem);
				5245	mnt_drop_write_file(file);
				5246	return ret;
				5247	}
				5248
				5249	#ifdef CONFIG_64BIT
				5250	static long btrfs_ioctl_set_received_subvol_32(struct file *file,
				5251	void __user *arg)
				5252	{
				5253	struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL;
				5254	struct btrfs_ioctl_received_subvol_args *args64 = NULL;
				5255	int ret = 0;
				5256
				5257	args32 = memdup_user(arg, sizeof(*args32));
				5258	if (IS_ERR(args32))
				5259	return PTR_ERR(args32);
				5260
				5261	args64 = kmalloc(sizeof(*args64), GFP_KERNEL);
				5262	if (!args64) {
				5263	ret = -ENOMEM;
				5264	goto out;
				5265	}
				5266
				5267	memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE);
				5268	args64->stransid = args32->stransid;
				5269	args64->rtransid = args32->rtransid;
				5270	args64->stime.sec = args32->stime.sec;
				5271	args64->stime.nsec = args32->stime.nsec;
				5272	args64->rtime.sec = args32->rtime.sec;
				5273	args64->rtime.nsec = args32->rtime.nsec;
				5274	args64->flags = args32->flags;
				5275
				5276	ret = _btrfs_ioctl_set_received_subvol(file, args64);
				5277	if (ret)
				5278	goto out;
				5279
				5280	memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE);
				5281	args32->stransid = args64->stransid;
				5282	args32->rtransid = args64->rtransid;
				5283	args32->stime.sec = args64->stime.sec;
				5284	args32->stime.nsec = args64->stime.nsec;
				5285	args32->rtime.sec = args64->rtime.sec;
				5286	args32->rtime.nsec = args64->rtime.nsec;
				5287	args32->flags = args64->flags;
				5288
				5289	ret = copy_to_user(arg, args32, sizeof(*args32));
				5290	if (ret)
				5291	ret = -EFAULT;
				5292
				5293	out:
				5294	kfree(args32);
				5295	kfree(args64);
				5296	return ret;
				5297	}
				5298	#endif
				5299
				5300	static long btrfs_ioctl_set_received_subvol(struct file *file,
				5301	void __user *arg)
				5302	{
				5303	struct btrfs_ioctl_received_subvol_args *sa = NULL;
				5304	int ret = 0;
				5305
				5306	sa = memdup_user(arg, sizeof(*sa));
				5307	if (IS_ERR(sa))
				5308	return PTR_ERR(sa);
				5309
				5310	ret = _btrfs_ioctl_set_received_subvol(file, sa);
				5311
				5312	if (ret)
				5313	goto out;
				5314
				5315	ret = copy_to_user(arg, sa, sizeof(*sa));
				5316	if (ret)
				5317	ret = -EFAULT;
				5318
				5319	out:
				5320	kfree(sa);
				5321	return ret;
				5322	}
				5323
				5324	static int btrfs_ioctl_get_fslabel(struct file file, void __user arg)
				5325	{
				5326	struct inode *inode = file_inode(file);
				5327	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				5328	size_t len;
				5329	int ret;
				5330	char label[BTRFS_LABEL_SIZE];
				5331
				5332	spin_lock(&fs_info->super_lock);
				5333	memcpy(label, fs_info->super_copy->label, BTRFS_LABEL_SIZE);
				5334	spin_unlock(&fs_info->super_lock);
				5335
				5336	len = strnlen(label, BTRFS_LABEL_SIZE);
				5337
				5338	if (len == BTRFS_LABEL_SIZE) {
				5339	btrfs_warn(fs_info,
				5340	"label is too long, return the first %zu bytes",
				5341	--len);
				5342	}
				5343
				5344	ret = copy_to_user(arg, label, len);
				5345
				5346	return ret ? -EFAULT : 0;
				5347	}
				5348
				5349	static int btrfs_ioctl_set_fslabel(struct file file, void __user arg)
				5350	{
				5351	struct inode *inode = file_inode(file);
				5352	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				5353	struct btrfs_root *root = BTRFS_I(inode)->root;
				5354	struct btrfs_super_block *super_block = fs_info->super_copy;
				5355	struct btrfs_trans_handle *trans;
				5356	char label[BTRFS_LABEL_SIZE];
				5357	int ret;
				5358
				5359	if (!capable(CAP_SYS_ADMIN))
				5360	return -EPERM;
				5361
				5362	if (copy_from_user(label, arg, sizeof(label)))
				5363	return -EFAULT;
				5364
				5365	if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
				5366	btrfs_err(fs_info,
				5367	"unable to set label with more than %d bytes",
				5368	BTRFS_LABEL_SIZE - 1);
				5369	return -EINVAL;
				5370	}
				5371
				5372	ret = mnt_want_write_file(file);
				5373	if (ret)
				5374	return ret;
				5375
				5376	trans = btrfs_start_transaction(root, 0);
				5377	if (IS_ERR(trans)) {
				5378	ret = PTR_ERR(trans);
				5379	goto out_unlock;
				5380	}
				5381
				5382	spin_lock(&fs_info->super_lock);
				5383	strcpy(super_block->label, label);
				5384	spin_unlock(&fs_info->super_lock);
				5385	ret = btrfs_commit_transaction(trans);
				5386
				5387	out_unlock:
				5388	mnt_drop_write_file(file);
				5389	return ret;
				5390	}
				5391
				5392	#define INIT_FEATURE_FLAGS(suffix) \
				5393	{ .compat_flags = BTRFS_FEATURE_COMPAT_##suffix, \
				5394	.compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \
				5395	.incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix }
				5396
				5397	int btrfs_ioctl_get_supported_features(void __user *arg)
				5398	{
				5399	static const struct btrfs_ioctl_feature_flags features[3] = {
				5400	INIT_FEATURE_FLAGS(SUPP),
				5401	INIT_FEATURE_FLAGS(SAFE_SET),
				5402	INIT_FEATURE_FLAGS(SAFE_CLEAR)
				5403	};
				5404
				5405	if (copy_to_user(arg, &features, sizeof(features)))
				5406	return -EFAULT;
				5407
				5408	return 0;
				5409	}
				5410
				5411	static int btrfs_ioctl_get_features(struct file file, void __user arg)
				5412	{
				5413	struct inode *inode = file_inode(file);
				5414	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				5415	struct btrfs_super_block *super_block = fs_info->super_copy;
				5416	struct btrfs_ioctl_feature_flags features;
				5417
				5418	features.compat_flags = btrfs_super_compat_flags(super_block);
				5419	features.compat_ro_flags = btrfs_super_compat_ro_flags(super_block);
				5420	features.incompat_flags = btrfs_super_incompat_flags(super_block);
				5421
				5422	if (copy_to_user(arg, &features, sizeof(features)))
				5423	return -EFAULT;
				5424
				5425	return 0;
				5426	}
				5427
				5428	static int check_feature_bits(struct btrfs_fs_info *fs_info,
				5429	enum btrfs_feature_set set,
				5430	u64 change_mask, u64 flags, u64 supported_flags,
				5431	u64 safe_set, u64 safe_clear)
				5432	{
				5433	const char *type = btrfs_feature_set_names[set];
				5434	char *names;
				5435	u64 disallowed, unsupported;
				5436	u64 set_mask = flags & change_mask;
				5437	u64 clear_mask = ~flags & change_mask;
				5438
				5439	unsupported = set_mask & ~supported_flags;
				5440	if (unsupported) {
				5441	names = btrfs_printable_features(set, unsupported);
				5442	if (names) {
				5443	btrfs_warn(fs_info,
				5444	"this kernel does not support the %s feature bit%s",
				5445	names, strchr(names, ',') ? "s" : "");
				5446	kfree(names);
				5447	} else
				5448	btrfs_warn(fs_info,
				5449	"this kernel does not support %s bits 0x%llx",
				5450	type, unsupported);
				5451	return -EOPNOTSUPP;
				5452	}
				5453
				5454	disallowed = set_mask & ~safe_set;
				5455	if (disallowed) {
				5456	names = btrfs_printable_features(set, disallowed);
				5457	if (names) {
				5458	btrfs_warn(fs_info,
				5459	"can't set the %s feature bit%s while mounted",
				5460	names, strchr(names, ',') ? "s" : "");
				5461	kfree(names);
				5462	} else
				5463	btrfs_warn(fs_info,
				5464	"can't set %s bits 0x%llx while mounted",
				5465	type, disallowed);
				5466	return -EPERM;
				5467	}
				5468
				5469	disallowed = clear_mask & ~safe_clear;
				5470	if (disallowed) {
				5471	names = btrfs_printable_features(set, disallowed);
				5472	if (names) {
				5473	btrfs_warn(fs_info,
				5474	"can't clear the %s feature bit%s while mounted",
				5475	names, strchr(names, ',') ? "s" : "");
				5476	kfree(names);
				5477	} else
				5478	btrfs_warn(fs_info,
				5479	"can't clear %s bits 0x%llx while mounted",
				5480	type, disallowed);
				5481	return -EPERM;
				5482	}
				5483
				5484	return 0;
				5485	}
				5486
				5487	#define check_feature(fs_info, change_mask, flags, mask_base) \
				5488	check_feature_bits(fs_info, FEAT_##mask_base, change_mask, flags, \
				5489	BTRFS_FEATURE_ ## mask_base ## _SUPP, \
				5490	BTRFS_FEATURE_ ## mask_base ## _SAFE_SET, \
				5491	BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR)
				5492
				5493	static int btrfs_ioctl_set_features(struct file file, void __user arg)
				5494	{
				5495	struct inode *inode = file_inode(file);
				5496	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				5497	struct btrfs_root *root = BTRFS_I(inode)->root;
				5498	struct btrfs_super_block *super_block = fs_info->super_copy;
				5499	struct btrfs_ioctl_feature_flags flags[2];
				5500	struct btrfs_trans_handle *trans;
				5501	u64 newflags;
				5502	int ret;
				5503
				5504	if (!capable(CAP_SYS_ADMIN))
				5505	return -EPERM;
				5506
				5507	if (copy_from_user(flags, arg, sizeof(flags)))
				5508	return -EFAULT;
				5509
				5510	/* Nothing to do */
				5511	if (!flags[0].compat_flags && !flags[0].compat_ro_flags &&
				5512	!flags[0].incompat_flags)
				5513	return 0;
				5514
				5515	ret = check_feature(fs_info, flags[0].compat_flags,
				5516	flags[1].compat_flags, COMPAT);
				5517	if (ret)
				5518	return ret;
				5519
				5520	ret = check_feature(fs_info, flags[0].compat_ro_flags,
				5521	flags[1].compat_ro_flags, COMPAT_RO);
				5522	if (ret)
				5523	return ret;
				5524
				5525	ret = check_feature(fs_info, flags[0].incompat_flags,
				5526	flags[1].incompat_flags, INCOMPAT);
				5527	if (ret)
				5528	return ret;
				5529
				5530	ret = mnt_want_write_file(file);
				5531	if (ret)
				5532	return ret;
				5533
				5534	trans = btrfs_start_transaction(root, 0);
				5535	if (IS_ERR(trans)) {
				5536	ret = PTR_ERR(trans);
				5537	goto out_drop_write;
				5538	}
				5539
				5540	spin_lock(&fs_info->super_lock);
				5541	newflags = btrfs_super_compat_flags(super_block);
				5542	newflags \|= flags[0].compat_flags & flags[1].compat_flags;
				5543	newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags);
				5544	btrfs_set_super_compat_flags(super_block, newflags);
				5545
				5546	newflags = btrfs_super_compat_ro_flags(super_block);
				5547	newflags \|= flags[0].compat_ro_flags & flags[1].compat_ro_flags;
				5548	newflags &= ~(flags[0].compat_ro_flags & ~flags[1].compat_ro_flags);
				5549	btrfs_set_super_compat_ro_flags(super_block, newflags);
				5550
				5551	newflags = btrfs_super_incompat_flags(super_block);
				5552	newflags \|= flags[0].incompat_flags & flags[1].incompat_flags;
				5553	newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags);
				5554	btrfs_set_super_incompat_flags(super_block, newflags);
				5555	spin_unlock(&fs_info->super_lock);
				5556
				5557	ret = btrfs_commit_transaction(trans);
				5558	out_drop_write:
				5559	mnt_drop_write_file(file);
				5560
				5561	return ret;
				5562	}
				5563
				5564	long btrfs_ioctl(struct file *file, unsigned int
				5565	cmd, unsigned long arg)
				5566	{
				5567	struct inode *inode = file_inode(file);
				5568	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				5569	struct btrfs_root *root = BTRFS_I(inode)->root;
				5570	void __user argp = (void __user )arg;
				5571
				5572	switch (cmd) {
				5573	case FS_IOC_GETFLAGS:
				5574	return btrfs_ioctl_getflags(file, argp);
				5575	case FS_IOC_SETFLAGS:
				5576	return btrfs_ioctl_setflags(file, argp);
				5577	case FS_IOC_GETVERSION:
				5578	return btrfs_ioctl_getversion(file, argp);
				5579	case FITRIM:
				5580	return btrfs_ioctl_fitrim(file, argp);
				5581	case BTRFS_IOC_SNAP_CREATE:
				5582	return btrfs_ioctl_snap_create(file, argp, 0);
				5583	case BTRFS_IOC_SNAP_CREATE_V2:
				5584	return btrfs_ioctl_snap_create_v2(file, argp, 0);
				5585	case BTRFS_IOC_SUBVOL_CREATE:
				5586	return btrfs_ioctl_snap_create(file, argp, 1);
				5587	case BTRFS_IOC_SUBVOL_CREATE_V2:
				5588	return btrfs_ioctl_snap_create_v2(file, argp, 1);
				5589	case BTRFS_IOC_SNAP_DESTROY:
				5590	return btrfs_ioctl_snap_destroy(file, argp);
				5591	case BTRFS_IOC_SUBVOL_GETFLAGS:
				5592	return btrfs_ioctl_subvol_getflags(file, argp);
				5593	case BTRFS_IOC_SUBVOL_SETFLAGS:
				5594	return btrfs_ioctl_subvol_setflags(file, argp);
				5595	case BTRFS_IOC_DEFAULT_SUBVOL:
				5596	return btrfs_ioctl_default_subvol(file, argp);
				5597	case BTRFS_IOC_DEFRAG:
				5598	return btrfs_ioctl_defrag(file, NULL);
				5599	case BTRFS_IOC_DEFRAG_RANGE:
				5600	return btrfs_ioctl_defrag(file, argp);
				5601	case BTRFS_IOC_RESIZE:
				5602	return btrfs_ioctl_resize(file, argp);
				5603	case BTRFS_IOC_ADD_DEV:
				5604	return btrfs_ioctl_add_dev(fs_info, argp);
				5605	case BTRFS_IOC_RM_DEV:
				5606	return btrfs_ioctl_rm_dev(file, argp);
				5607	case BTRFS_IOC_RM_DEV_V2:
				5608	return btrfs_ioctl_rm_dev_v2(file, argp);
				5609	case BTRFS_IOC_FS_INFO:
				5610	return btrfs_ioctl_fs_info(fs_info, argp);
				5611	case BTRFS_IOC_DEV_INFO:
				5612	return btrfs_ioctl_dev_info(fs_info, argp);
				5613	case BTRFS_IOC_BALANCE:
				5614	return btrfs_ioctl_balance(file, NULL);
				5615	case BTRFS_IOC_TRANS_START:
				5616	return btrfs_ioctl_trans_start(file);
				5617	case BTRFS_IOC_TRANS_END:
				5618	return btrfs_ioctl_trans_end(file);
				5619	case BTRFS_IOC_TREE_SEARCH:
				5620	return btrfs_ioctl_tree_search(file, argp);
				5621	case BTRFS_IOC_TREE_SEARCH_V2:
				5622	return btrfs_ioctl_tree_search_v2(file, argp);
				5623	case BTRFS_IOC_INO_LOOKUP:
				5624	return btrfs_ioctl_ino_lookup(file, argp);
				5625	case BTRFS_IOC_INO_PATHS:
				5626	return btrfs_ioctl_ino_to_path(root, argp);
				5627	case BTRFS_IOC_LOGICAL_INO:
				5628	return btrfs_ioctl_logical_to_ino(fs_info, argp);
				5629	case BTRFS_IOC_SPACE_INFO:
				5630	return btrfs_ioctl_space_info(fs_info, argp);
				5631	case BTRFS_IOC_SYNC: {
				5632	int ret;
				5633
				5634	ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
				5635	if (ret)
				5636	return ret;
				5637	ret = btrfs_sync_fs(inode->i_sb, 1);
				5638	/*
				5639	* The transaction thread may want to do more work,
				5640	* namely it pokes the cleaner kthread that will start
				5641	* processing uncleaned subvols.
				5642	*/
				5643	wake_up_process(fs_info->transaction_kthread);
				5644	return ret;
				5645	}
				5646	case BTRFS_IOC_START_SYNC:
				5647	return btrfs_ioctl_start_sync(root, argp);
				5648	case BTRFS_IOC_WAIT_SYNC:
				5649	return btrfs_ioctl_wait_sync(fs_info, argp);
				5650	case BTRFS_IOC_SCRUB:
				5651	return btrfs_ioctl_scrub(file, argp);
				5652	case BTRFS_IOC_SCRUB_CANCEL:
				5653	return btrfs_ioctl_scrub_cancel(fs_info);
				5654	case BTRFS_IOC_SCRUB_PROGRESS:
				5655	return btrfs_ioctl_scrub_progress(fs_info, argp);
				5656	case BTRFS_IOC_BALANCE_V2:
				5657	return btrfs_ioctl_balance(file, argp);
				5658	case BTRFS_IOC_BALANCE_CTL:
				5659	return btrfs_ioctl_balance_ctl(fs_info, arg);
				5660	case BTRFS_IOC_BALANCE_PROGRESS:
				5661	return btrfs_ioctl_balance_progress(fs_info, argp);
				5662	case BTRFS_IOC_SET_RECEIVED_SUBVOL:
				5663	return btrfs_ioctl_set_received_subvol(file, argp);
				5664	#ifdef CONFIG_64BIT
				5665	case BTRFS_IOC_SET_RECEIVED_SUBVOL_32:
				5666	return btrfs_ioctl_set_received_subvol_32(file, argp);
				5667	#endif
				5668	case BTRFS_IOC_SEND:
				5669	return btrfs_ioctl_send(file, argp);
				5670	case BTRFS_IOC_GET_DEV_STATS:
				5671	return btrfs_ioctl_get_dev_stats(fs_info, argp);
				5672	case BTRFS_IOC_QUOTA_CTL:
				5673	return btrfs_ioctl_quota_ctl(file, argp);
				5674	case BTRFS_IOC_QGROUP_ASSIGN:
				5675	return btrfs_ioctl_qgroup_assign(file, argp);
				5676	case BTRFS_IOC_QGROUP_CREATE:
				5677	return btrfs_ioctl_qgroup_create(file, argp);
				5678	case BTRFS_IOC_QGROUP_LIMIT:
				5679	return btrfs_ioctl_qgroup_limit(file, argp);
				5680	case BTRFS_IOC_QUOTA_RESCAN:
				5681	return btrfs_ioctl_quota_rescan(file, argp);
				5682	case BTRFS_IOC_QUOTA_RESCAN_STATUS:
				5683	return btrfs_ioctl_quota_rescan_status(file, argp);
				5684	case BTRFS_IOC_QUOTA_RESCAN_WAIT:
				5685	return btrfs_ioctl_quota_rescan_wait(file, argp);
				5686	case BTRFS_IOC_DEV_REPLACE:
				5687	return btrfs_ioctl_dev_replace(fs_info, argp);
				5688	case BTRFS_IOC_GET_FSLABEL:
				5689	return btrfs_ioctl_get_fslabel(file, argp);
				5690	case BTRFS_IOC_SET_FSLABEL:
				5691	return btrfs_ioctl_set_fslabel(file, argp);
				5692	case BTRFS_IOC_GET_SUPPORTED_FEATURES:
				5693	return btrfs_ioctl_get_supported_features(argp);
				5694	case BTRFS_IOC_GET_FEATURES:
				5695	return btrfs_ioctl_get_features(file, argp);
				5696	case BTRFS_IOC_SET_FEATURES:
				5697	return btrfs_ioctl_set_features(file, argp);
				5698	}
				5699
				5700	return -ENOTTY;
				5701	}
				5702
				5703	#ifdef CONFIG_COMPAT
				5704	long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
				5705	{
				5706	/*
				5707	* These all access 32-bit values anyway so no further
				5708	* handling is necessary.
				5709	*/
				5710	switch (cmd) {
				5711	case FS_IOC32_GETFLAGS:
				5712	cmd = FS_IOC_GETFLAGS;
				5713	break;
				5714	case FS_IOC32_SETFLAGS:
				5715	cmd = FS_IOC_SETFLAGS;
				5716	break;
				5717	case FS_IOC32_GETVERSION:
				5718	cmd = FS_IOC_GETVERSION;
				5719	break;
				5720	}
				5721
				5722	return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
				5723	}
				5724	#endif