Blame - src/kernel/linux/v4.14/fs/btrfs/volumes.c - T103

blob: f3cb042a28d5ce9207c16dd70020d95f93ec5b4e [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* Copyright (C) 2007 Oracle. All rights reserved.
				3	*
				4	* This program is free software; you can redistribute it and/or
				5	* modify it under the terms of the GNU General Public
				6	* License v2 as published by the Free Software Foundation.
				7	*
				8	* This program is distributed in the hope that it will be useful,
				9	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				11	* General Public License for more details.
				12	*
				13	* You should have received a copy of the GNU General Public
				14	* License along with this program; if not, write to the
				15	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				16	* Boston, MA 02111-1307, USA.
				17	*/
				18	#include <linux/sched.h>
				19	#include <linux/sched/mm.h>
				20	#include <linux/bio.h>
				21	#include <linux/slab.h>
				22	#include <linux/buffer_head.h>
				23	#include <linux/blkdev.h>
				24	#include <linux/iocontext.h>
				25	#include <linux/capability.h>
				26	#include <linux/ratelimit.h>
				27	#include <linux/kthread.h>
				28	#include <linux/raid/pq.h>
				29	#include <linux/semaphore.h>
				30	#include <linux/uuid.h>
				31	#include <asm/div64.h>
				32	#include "ctree.h"
				33	#include "extent_map.h"
				34	#include "disk-io.h"
				35	#include "transaction.h"
				36	#include "print-tree.h"
				37	#include "volumes.h"
				38	#include "raid56.h"
				39	#include "async-thread.h"
				40	#include "check-integrity.h"
				41	#include "rcu-string.h"
				42	#include "math.h"
				43	#include "dev-replace.h"
				44	#include "sysfs.h"
				45
				46	const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
				47	[BTRFS_RAID_RAID10] = {
				48	.sub_stripes = 2,
				49	.dev_stripes = 1,
				50	.devs_max = 0, /* 0 == as many as possible */
				51	.devs_min = 4,
				52	.tolerated_failures = 1,
				53	.devs_increment = 2,
				54	.ncopies = 2,
				55	},
				56	[BTRFS_RAID_RAID1] = {
				57	.sub_stripes = 1,
				58	.dev_stripes = 1,
				59	.devs_max = 2,
				60	.devs_min = 2,
				61	.tolerated_failures = 1,
				62	.devs_increment = 2,
				63	.ncopies = 2,
				64	},
				65	[BTRFS_RAID_DUP] = {
				66	.sub_stripes = 1,
				67	.dev_stripes = 2,
				68	.devs_max = 1,
				69	.devs_min = 1,
				70	.tolerated_failures = 0,
				71	.devs_increment = 1,
				72	.ncopies = 2,
				73	},
				74	[BTRFS_RAID_RAID0] = {
				75	.sub_stripes = 1,
				76	.dev_stripes = 1,
				77	.devs_max = 0,
				78	.devs_min = 2,
				79	.tolerated_failures = 0,
				80	.devs_increment = 1,
				81	.ncopies = 1,
				82	},
				83	[BTRFS_RAID_SINGLE] = {
				84	.sub_stripes = 1,
				85	.dev_stripes = 1,
				86	.devs_max = 1,
				87	.devs_min = 1,
				88	.tolerated_failures = 0,
				89	.devs_increment = 1,
				90	.ncopies = 1,
				91	},
				92	[BTRFS_RAID_RAID5] = {
				93	.sub_stripes = 1,
				94	.dev_stripes = 1,
				95	.devs_max = 0,
				96	.devs_min = 2,
				97	.tolerated_failures = 1,
				98	.devs_increment = 1,
				99	.ncopies = 2,
				100	},
				101	[BTRFS_RAID_RAID6] = {
				102	.sub_stripes = 1,
				103	.dev_stripes = 1,
				104	.devs_max = 0,
				105	.devs_min = 3,
				106	.tolerated_failures = 2,
				107	.devs_increment = 1,
				108	.ncopies = 3,
				109	},
				110	};
				111
				112	const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
				113	[BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
				114	[BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1,
				115	[BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP,
				116	[BTRFS_RAID_RAID0] = BTRFS_BLOCK_GROUP_RAID0,
				117	[BTRFS_RAID_SINGLE] = 0,
				118	[BTRFS_RAID_RAID5] = BTRFS_BLOCK_GROUP_RAID5,
				119	[BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6,
				120	};
				121
				122	/*
				123	* Table to convert BTRFS_RAID_* to the error code if minimum number of devices
				124	* condition is not met. Zero means there's no corresponding
				125	* BTRFS_ERROR_DEV_*_NOT_MET value.
				126	*/
				127	const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
				128	[BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
				129	[BTRFS_RAID_RAID1] = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
				130	[BTRFS_RAID_DUP] = 0,
				131	[BTRFS_RAID_RAID0] = 0,
				132	[BTRFS_RAID_SINGLE] = 0,
				133	[BTRFS_RAID_RAID5] = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
				134	[BTRFS_RAID_RAID6] = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
				135	};
				136
				137	static int init_first_rw_device(struct btrfs_trans_handle *trans,
				138	struct btrfs_fs_info *fs_info);
				139	static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
				140	static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
				141	static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
				142	static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
				143	static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
				144	enum btrfs_map_op op,
				145	u64 logical, u64 *length,
				146	struct btrfs_bio **bbio_ret,
				147	int mirror_num, int need_raid_map);
				148
				149	DEFINE_MUTEX(uuid_mutex);
				150	static LIST_HEAD(fs_uuids);
				151	struct list_head *btrfs_get_fs_uuids(void)
				152	{
				153	return &fs_uuids;
				154	}
				155
				156	/*
				157	* alloc_fs_devices - allocate struct btrfs_fs_devices
				158	* @fsid: if not NULL, copy the uuid to fs_devices::fsid
				159	*
				160	* Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
				161	* The returned struct is not linked onto any lists and can be destroyed with
				162	* kfree() right away.
				163	*/
				164	static struct btrfs_fs_devices alloc_fs_devices(const u8 fsid)
				165	{
				166	struct btrfs_fs_devices *fs_devs;
				167
				168	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
				169	if (!fs_devs)
				170	return ERR_PTR(-ENOMEM);
				171
				172	mutex_init(&fs_devs->device_list_mutex);
				173
				174	INIT_LIST_HEAD(&fs_devs->devices);
				175	INIT_LIST_HEAD(&fs_devs->resized_devices);
				176	INIT_LIST_HEAD(&fs_devs->alloc_list);
				177	INIT_LIST_HEAD(&fs_devs->list);
				178	if (fsid)
				179	memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
				180
				181	return fs_devs;
				182	}
				183
				184	static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
				185	{
				186	struct btrfs_device *device;
				187	WARN_ON(fs_devices->opened);
				188	while (!list_empty(&fs_devices->devices)) {
				189	device = list_entry(fs_devices->devices.next,
				190	struct btrfs_device, dev_list);
				191	list_del(&device->dev_list);
				192	rcu_string_free(device->name);
				193	kfree(device);
				194	}
				195	kfree(fs_devices);
				196	}
				197
				198	static void btrfs_kobject_uevent(struct block_device *bdev,
				199	enum kobject_action action)
				200	{
				201	int ret;
				202
				203	ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
				204	if (ret)
				205	pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
				206	action,
				207	kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
				208	&disk_to_dev(bdev->bd_disk)->kobj);
				209	}
				210
				211	void btrfs_cleanup_fs_uuids(void)
				212	{
				213	struct btrfs_fs_devices *fs_devices;
				214
				215	while (!list_empty(&fs_uuids)) {
				216	fs_devices = list_entry(fs_uuids.next,
				217	struct btrfs_fs_devices, list);
				218	list_del(&fs_devices->list);
				219	free_fs_devices(fs_devices);
				220	}
				221	}
				222
				223	static struct btrfs_device *__alloc_device(void)
				224	{
				225	struct btrfs_device *dev;
				226
				227	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
				228	if (!dev)
				229	return ERR_PTR(-ENOMEM);
				230
				231	/*
				232	* Preallocate a bio that's always going to be used for flushing device
				233	* barriers and matches the device lifespan
				234	*/
				235	dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
				236	if (!dev->flush_bio) {
				237	kfree(dev);
				238	return ERR_PTR(-ENOMEM);
				239	}
				240
				241	INIT_LIST_HEAD(&dev->dev_list);
				242	INIT_LIST_HEAD(&dev->dev_alloc_list);
				243	INIT_LIST_HEAD(&dev->resized_list);
				244
				245	spin_lock_init(&dev->io_lock);
				246
				247	spin_lock_init(&dev->reada_lock);
				248	atomic_set(&dev->reada_in_flight, 0);
				249	atomic_set(&dev->dev_stats_ccnt, 0);
				250	btrfs_device_data_ordered_init(dev);
				251	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
				252	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
				253
				254	return dev;
				255	}
				256
				257	/*
				258	* Find a device specified by @devid or @uuid in the list of @fs_devices, or
				259	* return NULL.
				260	*
				261	* If devid and uuid are both specified, the match must be exact, otherwise
				262	* only devid is used.
				263	*/
				264	static struct btrfs_device find_device(struct btrfs_fs_devices fs_devices,
				265	u64 devid, const u8 *uuid)
				266	{
				267	struct list_head *head = &fs_devices->devices;
				268	struct btrfs_device *dev;
				269
				270	list_for_each_entry(dev, head, dev_list) {
				271	if (dev->devid == devid &&
				272	(!uuid \|\| !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
				273	return dev;
				274	}
				275	}
				276	return NULL;
				277	}
				278
				279	static noinline struct btrfs_fs_devices find_fsid(u8 fsid)
				280	{
				281	struct btrfs_fs_devices *fs_devices;
				282
				283	list_for_each_entry(fs_devices, &fs_uuids, list) {
				284	if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
				285	return fs_devices;
				286	}
				287	return NULL;
				288	}
				289
				290	static int
				291	btrfs_get_bdev_and_sb(const char device_path, fmode_t flags, void holder,
				292	int flush, struct block_device **bdev,
				293	struct buffer_head **bh)
				294	{
				295	int ret;
				296
				297	*bdev = blkdev_get_by_path(device_path, flags, holder);
				298
				299	if (IS_ERR(*bdev)) {
				300	ret = PTR_ERR(*bdev);
				301	goto error;
				302	}
				303
				304	if (flush)
				305	filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
				306	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
				307	if (ret) {
				308	blkdev_put(*bdev, flags);
				309	goto error;
				310	}
				311	invalidate_bdev(*bdev);
				312	bh = btrfs_read_dev_super(bdev);
				313	if (IS_ERR(*bh)) {
				314	ret = PTR_ERR(*bh);
				315	blkdev_put(*bdev, flags);
				316	goto error;
				317	}
				318
				319	return 0;
				320
				321	error:
				322	*bdev = NULL;
				323	*bh = NULL;
				324	return ret;
				325	}
				326
				327	static void requeue_list(struct btrfs_pending_bios *pending_bios,
				328	struct bio head, struct bio tail)
				329	{
				330
				331	struct bio *old_head;
				332
				333	old_head = pending_bios->head;
				334	pending_bios->head = head;
				335	if (pending_bios->tail)
				336	tail->bi_next = old_head;
				337	else
				338	pending_bios->tail = tail;
				339	}
				340
				341	/*
				342	* we try to collect pending bios for a device so we don't get a large
				343	* number of procs sending bios down to the same device. This greatly
				344	* improves the schedulers ability to collect and merge the bios.
				345	*
				346	* But, it also turns into a long list of bios to process and that is sure
				347	* to eventually make the worker thread block. The solution here is to
				348	* make some progress and then put this work struct back at the end of
				349	* the list if the block device is congested. This way, multiple devices
				350	* can make progress from a single worker thread.
				351	*/
				352	static noinline void run_scheduled_bios(struct btrfs_device *device)
				353	{
				354	struct btrfs_fs_info *fs_info = device->fs_info;
				355	struct bio *pending;
				356	struct backing_dev_info *bdi;
				357	struct btrfs_pending_bios *pending_bios;
				358	struct bio *tail;
				359	struct bio *cur;
				360	int again = 0;
				361	unsigned long num_run;
				362	unsigned long batch_run = 0;
				363	unsigned long limit;
				364	unsigned long last_waited = 0;
				365	int force_reg = 0;
				366	int sync_pending = 0;
				367	struct blk_plug plug;
				368
				369	/*
				370	* this function runs all the bios we've collected for
				371	* a particular device. We don't want to wander off to
				372	* another device without first sending all of these down.
				373	* So, setup a plug here and finish it off before we return
				374	*/
				375	blk_start_plug(&plug);
				376
				377	bdi = device->bdev->bd_bdi;
				378	limit = btrfs_async_submit_limit(fs_info);
				379	limit = limit * 2 / 3;
				380
				381	loop:
				382	spin_lock(&device->io_lock);
				383
				384	loop_lock:
				385	num_run = 0;
				386
				387	/* take all the bios off the list at once and process them
				388	* later on (without the lock held). But, remember the
				389	* tail and other pointers so the bios can be properly reinserted
				390	* into the list if we hit congestion
				391	*/
				392	if (!force_reg && device->pending_sync_bios.head) {
				393	pending_bios = &device->pending_sync_bios;
				394	force_reg = 1;
				395	} else {
				396	pending_bios = &device->pending_bios;
				397	force_reg = 0;
				398	}
				399
				400	pending = pending_bios->head;
				401	tail = pending_bios->tail;
				402	WARN_ON(pending && !tail);
				403
				404	/*
				405	* if pending was null this time around, no bios need processing
				406	* at all and we can stop. Otherwise it'll loop back up again
				407	* and do an additional check so no bios are missed.
				408	*
				409	* device->running_pending is used to synchronize with the
				410	* schedule_bio code.
				411	*/
				412	if (device->pending_sync_bios.head == NULL &&
				413	device->pending_bios.head == NULL) {
				414	again = 0;
				415	device->running_pending = 0;
				416	} else {
				417	again = 1;
				418	device->running_pending = 1;
				419	}
				420
				421	pending_bios->head = NULL;
				422	pending_bios->tail = NULL;
				423
				424	spin_unlock(&device->io_lock);
				425
				426	while (pending) {
				427
				428	rmb();
				429	/* we want to work on both lists, but do more bios on the
				430	* sync list than the regular list
				431	*/
				432	if ((num_run > 32 &&
				433	pending_bios != &device->pending_sync_bios &&
				434	device->pending_sync_bios.head) \|\|
				435	(num_run > 64 && pending_bios == &device->pending_sync_bios &&
				436	device->pending_bios.head)) {
				437	spin_lock(&device->io_lock);
				438	requeue_list(pending_bios, pending, tail);
				439	goto loop_lock;
				440	}
				441
				442	cur = pending;
				443	pending = pending->bi_next;
				444	cur->bi_next = NULL;
				445
				446	/*
				447	* atomic_dec_return implies a barrier for waitqueue_active
				448	*/
				449	if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
				450	waitqueue_active(&fs_info->async_submit_wait))
				451	wake_up(&fs_info->async_submit_wait);
				452
				453	BUG_ON(atomic_read(&cur->__bi_cnt) == 0);
				454
				455	/*
				456	* if we're doing the sync list, record that our
				457	* plug has some sync requests on it
				458	*
				459	* If we're doing the regular list and there are
				460	* sync requests sitting around, unplug before
				461	* we add more
				462	*/
				463	if (pending_bios == &device->pending_sync_bios) {
				464	sync_pending = 1;
				465	} else if (sync_pending) {
				466	blk_finish_plug(&plug);
				467	blk_start_plug(&plug);
				468	sync_pending = 0;
				469	}
				470
				471	btrfsic_submit_bio(cur);
				472	num_run++;
				473	batch_run++;
				474
				475	cond_resched();
				476
				477	/*
				478	* we made progress, there is more work to do and the bdi
				479	* is now congested. Back off and let other work structs
				480	* run instead
				481	*/
				482	if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
				483	fs_info->fs_devices->open_devices > 1) {
				484	struct io_context *ioc;
				485
				486	ioc = current->io_context;
				487
				488	/*
				489	* the main goal here is that we don't want to
				490	* block if we're going to be able to submit
				491	* more requests without blocking.
				492	*
				493	* This code does two great things, it pokes into
				494	* the elevator code from a filesystem _and_
				495	* it makes assumptions about how batching works.
				496	*/
				497	if (ioc && ioc->nr_batch_requests > 0 &&
				498	time_before(jiffies, ioc->last_waited + HZ/50UL) &&
				499	(last_waited == 0 \|\|
				500	ioc->last_waited == last_waited)) {
				501	/*
				502	* we want to go through our batch of
				503	* requests and stop. So, we copy out
				504	* the ioc->last_waited time and test
				505	* against it before looping
				506	*/
				507	last_waited = ioc->last_waited;
				508	cond_resched();
				509	continue;
				510	}
				511	spin_lock(&device->io_lock);
				512	requeue_list(pending_bios, pending, tail);
				513	device->running_pending = 1;
				514
				515	spin_unlock(&device->io_lock);
				516	btrfs_queue_work(fs_info->submit_workers,
				517	&device->work);
				518	goto done;
				519	}
				520	/* unplug every 64 requests just for good measure */
				521	if (batch_run % 64 == 0) {
				522	blk_finish_plug(&plug);
				523	blk_start_plug(&plug);
				524	sync_pending = 0;
				525	}
				526	}
				527
				528	cond_resched();
				529	if (again)
				530	goto loop;
				531
				532	spin_lock(&device->io_lock);
				533	if (device->pending_bios.head \|\| device->pending_sync_bios.head)
				534	goto loop_lock;
				535	spin_unlock(&device->io_lock);
				536
				537	done:
				538	blk_finish_plug(&plug);
				539	}
				540
				541	static void pending_bios_fn(struct btrfs_work *work)
				542	{
				543	struct btrfs_device *device;
				544
				545	device = container_of(work, struct btrfs_device, work);
				546	run_scheduled_bios(device);
				547	}
				548
				549
				550	void btrfs_free_stale_device(struct btrfs_device *cur_dev)
				551	{
				552	struct btrfs_fs_devices *fs_devs;
				553	struct btrfs_device *dev;
				554
				555	if (!cur_dev->name)
				556	return;
				557
				558	list_for_each_entry(fs_devs, &fs_uuids, list) {
				559	int del = 1;
				560
				561	if (fs_devs->opened)
				562	continue;
				563	if (fs_devs->seeding)
				564	continue;
				565
				566	list_for_each_entry(dev, &fs_devs->devices, dev_list) {
				567
				568	if (dev == cur_dev)
				569	continue;
				570	if (!dev->name)
				571	continue;
				572
				573	/*
				574	* Todo: This won't be enough. What if the same device
				575	* comes back (with new uuid and) with its mapper path?
				576	* But for now, this does help as mostly an admin will
				577	* either use mapper or non mapper path throughout.
				578	*/
				579	rcu_read_lock();
				580	del = strcmp(rcu_str_deref(dev->name),
				581	rcu_str_deref(cur_dev->name));
				582	rcu_read_unlock();
				583	if (!del)
				584	break;
				585	}
				586
				587	if (!del) {
				588	/* delete the stale device */
				589	if (fs_devs->num_devices == 1) {
				590	btrfs_sysfs_remove_fsid(fs_devs);
				591	list_del(&fs_devs->list);
				592	free_fs_devices(fs_devs);
				593	break;
				594	} else {
				595	fs_devs->num_devices--;
				596	list_del(&dev->dev_list);
				597	rcu_string_free(dev->name);
				598	kfree(dev);
				599	}
				600	break;
				601	}
				602	}
				603	}
				604
				605	/*
				606	* Add new device to list of registered devices
				607	*
				608	* Returns:
				609	* 1 - first time device is seen
				610	* 0 - device already known
				611	* < 0 - error
				612	*/
				613	static noinline int device_list_add(const char *path,
				614	struct btrfs_super_block *disk_super,
				615	u64 devid, struct btrfs_fs_devices **fs_devices_ret)
				616	{
				617	struct btrfs_device *device;
				618	struct btrfs_fs_devices *fs_devices;
				619	struct rcu_string *name;
				620	int ret = 0;
				621	u64 found_transid = btrfs_super_generation(disk_super);
				622
				623	fs_devices = find_fsid(disk_super->fsid);
				624	if (!fs_devices) {
				625	fs_devices = alloc_fs_devices(disk_super->fsid);
				626	if (IS_ERR(fs_devices))
				627	return PTR_ERR(fs_devices);
				628
				629	list_add(&fs_devices->list, &fs_uuids);
				630
				631	device = NULL;
				632	} else {
				633	device = find_device(fs_devices, devid,
				634	disk_super->dev_item.uuid);
				635	}
				636
				637	if (!device) {
				638	if (fs_devices->opened)
				639	return -EBUSY;
				640
				641	device = btrfs_alloc_device(NULL, &devid,
				642	disk_super->dev_item.uuid);
				643	if (IS_ERR(device)) {
				644	/* we can safely leave the fs_devices entry around */
				645	return PTR_ERR(device);
				646	}
				647
				648	name = rcu_string_strdup(path, GFP_NOFS);
				649	if (!name) {
				650	kfree(device);
				651	return -ENOMEM;
				652	}
				653	rcu_assign_pointer(device->name, name);
				654
				655	mutex_lock(&fs_devices->device_list_mutex);
				656	list_add_rcu(&device->dev_list, &fs_devices->devices);
				657	fs_devices->num_devices++;
				658	mutex_unlock(&fs_devices->device_list_mutex);
				659
				660	ret = 1;
				661	device->fs_devices = fs_devices;
				662	} else if (!device->name \|\| strcmp(device->name->str, path)) {
				663	/*
				664	* When FS is already mounted.
				665	* 1. If you are here and if the device->name is NULL that
				666	* means this device was missing at time of FS mount.
				667	* 2. If you are here and if the device->name is different
				668	* from 'path' that means either
				669	* a. The same device disappeared and reappeared with
				670	* different name. or
				671	* b. The missing-disk-which-was-replaced, has
				672	* reappeared now.
				673	*
				674	* We must allow 1 and 2a above. But 2b would be a spurious
				675	* and unintentional.
				676	*
				677	* Further in case of 1 and 2a above, the disk at 'path'
				678	* would have missed some transaction when it was away and
				679	* in case of 2a the stale bdev has to be updated as well.
				680	* 2b must not be allowed at all time.
				681	*/
				682
				683	/*
				684	* For now, we do allow update to btrfs_fs_device through the
				685	* btrfs dev scan cli after FS has been mounted. We're still
				686	* tracking a problem where systems fail mount by subvolume id
				687	* when we reject replacement on a mounted FS.
				688	*/
				689	if (!fs_devices->opened && found_transid < device->generation) {
				690	/*
				691	* That is if the FS is _not_ mounted and if you
				692	* are here, that means there is more than one
				693	* disk with same uuid and devid.We keep the one
				694	* with larger generation number or the last-in if
				695	* generation are equal.
				696	*/
				697	return -EEXIST;
				698	}
				699
				700	name = rcu_string_strdup(path, GFP_NOFS);
				701	if (!name)
				702	return -ENOMEM;
				703	rcu_string_free(device->name);
				704	rcu_assign_pointer(device->name, name);
				705	if (device->missing) {
				706	fs_devices->missing_devices--;
				707	device->missing = 0;
				708	}
				709	}
				710
				711	/*
				712	* Unmount does not free the btrfs_device struct but would zero
				713	* generation along with most of the other members. So just update
				714	* it back. We need it to pick the disk with largest generation
				715	* (as above).
				716	*/
				717	if (!fs_devices->opened)
				718	device->generation = found_transid;
				719
				720	/*
				721	* if there is new btrfs on an already registered device,
				722	* then remove the stale device entry.
				723	*/
				724	if (ret > 0)
				725	btrfs_free_stale_device(device);
				726
				727	*fs_devices_ret = fs_devices;
				728
				729	return ret;
				730	}
				731
				732	static struct btrfs_fs_devices clone_fs_devices(struct btrfs_fs_devices orig)
				733	{
				734	struct btrfs_fs_devices *fs_devices;
				735	struct btrfs_device *device;
				736	struct btrfs_device *orig_dev;
				737
				738	fs_devices = alloc_fs_devices(orig->fsid);
				739	if (IS_ERR(fs_devices))
				740	return fs_devices;
				741
				742	mutex_lock(&orig->device_list_mutex);
				743	fs_devices->total_devices = orig->total_devices;
				744
				745	/* We have held the volume lock, it is safe to get the devices. */
				746	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
				747	struct rcu_string *name;
				748
				749	device = btrfs_alloc_device(NULL, &orig_dev->devid,
				750	orig_dev->uuid);
				751	if (IS_ERR(device))
				752	goto error;
				753
				754	/*
				755	* This is ok to do without rcu read locked because we hold the
				756	* uuid mutex so nothing we touch in here is going to disappear.
				757	*/
				758	if (orig_dev->name) {
				759	name = rcu_string_strdup(orig_dev->name->str,
				760	GFP_KERNEL);
				761	if (!name) {
				762	kfree(device);
				763	goto error;
				764	}
				765	rcu_assign_pointer(device->name, name);
				766	}
				767
				768	list_add(&device->dev_list, &fs_devices->devices);
				769	device->fs_devices = fs_devices;
				770	fs_devices->num_devices++;
				771	}
				772	mutex_unlock(&orig->device_list_mutex);
				773	return fs_devices;
				774	error:
				775	mutex_unlock(&orig->device_list_mutex);
				776	free_fs_devices(fs_devices);
				777	return ERR_PTR(-ENOMEM);
				778	}
				779
				780	void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step)
				781	{
				782	struct btrfs_device device, next;
				783	struct btrfs_device *latest_dev = NULL;
				784
				785	mutex_lock(&uuid_mutex);
				786	again:
				787	/* This is the initialized path, it is safe to release the devices. */
				788	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
				789	if (device->in_fs_metadata) {
				790	if (!device->is_tgtdev_for_dev_replace &&
				791	(!latest_dev \|\|
				792	device->generation > latest_dev->generation)) {
				793	latest_dev = device;
				794	}
				795	continue;
				796	}
				797
				798	if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
				799	/*
				800	* In the first step, keep the device which has
				801	* the correct fsid and the devid that is used
				802	* for the dev_replace procedure.
				803	* In the second step, the dev_replace state is
				804	* read from the device tree and it is known
				805	* whether the procedure is really active or
				806	* not, which means whether this device is
				807	* used or whether it should be removed.
				808	*/
				809	if (step == 0 \|\| device->is_tgtdev_for_dev_replace) {
				810	continue;
				811	}
				812	}
				813	if (device->bdev) {
				814	blkdev_put(device->bdev, device->mode);
				815	device->bdev = NULL;
				816	fs_devices->open_devices--;
				817	}
				818	if (device->writeable) {
				819	list_del_init(&device->dev_alloc_list);
				820	device->writeable = 0;
				821	if (!device->is_tgtdev_for_dev_replace)
				822	fs_devices->rw_devices--;
				823	}
				824	list_del_init(&device->dev_list);
				825	fs_devices->num_devices--;
				826	rcu_string_free(device->name);
				827	kfree(device);
				828	}
				829
				830	if (fs_devices->seed) {
				831	fs_devices = fs_devices->seed;
				832	goto again;
				833	}
				834
				835	fs_devices->latest_bdev = latest_dev->bdev;
				836
				837	mutex_unlock(&uuid_mutex);
				838	}
				839
				840	static void __free_device(struct work_struct *work)
				841	{
				842	struct btrfs_device *device;
				843
				844	device = container_of(work, struct btrfs_device, rcu_work);
				845	rcu_string_free(device->name);
				846	bio_put(device->flush_bio);
				847	kfree(device);
				848	}
				849
				850	static void free_device(struct rcu_head *head)
				851	{
				852	struct btrfs_device *device;
				853
				854	device = container_of(head, struct btrfs_device, rcu);
				855
				856	INIT_WORK(&device->rcu_work, __free_device);
				857	schedule_work(&device->rcu_work);
				858	}
				859
				860	static void btrfs_close_bdev(struct btrfs_device *device)
				861	{
				862	if (device->bdev && device->writeable) {
				863	sync_blockdev(device->bdev);
				864	invalidate_bdev(device->bdev);
				865	}
				866
				867	if (device->bdev)
				868	blkdev_put(device->bdev, device->mode);
				869	}
				870
				871	static void btrfs_prepare_close_one_device(struct btrfs_device *device)
				872	{
				873	struct btrfs_fs_devices *fs_devices = device->fs_devices;
				874	struct btrfs_device *new_device;
				875	struct rcu_string *name;
				876
				877	if (device->bdev)
				878	fs_devices->open_devices--;
				879
				880	if (device->writeable &&
				881	device->devid != BTRFS_DEV_REPLACE_DEVID) {
				882	list_del_init(&device->dev_alloc_list);
				883	fs_devices->rw_devices--;
				884	}
				885
				886	if (device->missing)
				887	fs_devices->missing_devices--;
				888
				889	new_device = btrfs_alloc_device(NULL, &device->devid,
				890	device->uuid);
				891	BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
				892
				893	/* Safe because we are under uuid_mutex */
				894	if (device->name) {
				895	name = rcu_string_strdup(device->name->str, GFP_NOFS);
				896	BUG_ON(!name); /* -ENOMEM */
				897	rcu_assign_pointer(new_device->name, name);
				898	}
				899
				900	list_replace_rcu(&device->dev_list, &new_device->dev_list);
				901	new_device->fs_devices = device->fs_devices;
				902	}
				903
				904	static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
				905	{
				906	struct btrfs_device device, tmp;
				907	struct list_head pending_put;
				908
				909	INIT_LIST_HEAD(&pending_put);
				910
				911	if (--fs_devices->opened > 0)
				912	return 0;
				913
				914	mutex_lock(&fs_devices->device_list_mutex);
				915	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
				916	btrfs_prepare_close_one_device(device);
				917	list_add(&device->dev_list, &pending_put);
				918	}
				919	mutex_unlock(&fs_devices->device_list_mutex);
				920
				921	/*
				922	* btrfs_show_devname() is using the device_list_mutex,
				923	* sometimes call to blkdev_put() leads vfs calling
				924	* into this func. So do put outside of device_list_mutex,
				925	* as of now.
				926	*/
				927	while (!list_empty(&pending_put)) {
				928	device = list_first_entry(&pending_put,
				929	struct btrfs_device, dev_list);
				930	list_del(&device->dev_list);
				931	btrfs_close_bdev(device);
				932	call_rcu(&device->rcu, free_device);
				933	}
				934
				935	WARN_ON(fs_devices->open_devices);
				936	WARN_ON(fs_devices->rw_devices);
				937	fs_devices->opened = 0;
				938	fs_devices->seeding = 0;
				939
				940	return 0;
				941	}
				942
				943	int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
				944	{
				945	struct btrfs_fs_devices *seed_devices = NULL;
				946	int ret;
				947
				948	mutex_lock(&uuid_mutex);
				949	ret = __btrfs_close_devices(fs_devices);
				950	if (!fs_devices->opened) {
				951	seed_devices = fs_devices->seed;
				952	fs_devices->seed = NULL;
				953	}
				954	mutex_unlock(&uuid_mutex);
				955
				956	while (seed_devices) {
				957	fs_devices = seed_devices;
				958	seed_devices = fs_devices->seed;
				959	__btrfs_close_devices(fs_devices);
				960	free_fs_devices(fs_devices);
				961	}
				962	/*
				963	* Wait for rcu kworkers under __btrfs_close_devices
				964	* to finish all blkdev_puts so device is really
				965	* free when umount is done.
				966	*/
				967	rcu_barrier();
				968	return ret;
				969	}
				970
				971	static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
				972	fmode_t flags, void *holder)
				973	{
				974	struct request_queue *q;
				975	struct block_device *bdev;
				976	struct list_head *head = &fs_devices->devices;
				977	struct btrfs_device *device;
				978	struct btrfs_device *latest_dev = NULL;
				979	struct buffer_head *bh;
				980	struct btrfs_super_block *disk_super;
				981	u64 devid;
				982	int seeding = 1;
				983	int ret = 0;
				984
				985	flags \|= FMODE_EXCL;
				986
				987	list_for_each_entry(device, head, dev_list) {
				988	if (device->bdev)
				989	continue;
				990	if (!device->name)
				991	continue;
				992
				993	/* Just open everything we can; ignore failures here */
				994	if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				995	&bdev, &bh))
				996	continue;
				997
				998	disk_super = (struct btrfs_super_block *)bh->b_data;
				999	devid = btrfs_stack_device_id(&disk_super->dev_item);
				1000	if (devid != device->devid)
				1001	goto error_brelse;
				1002
				1003	if (memcmp(device->uuid, disk_super->dev_item.uuid,
				1004	BTRFS_UUID_SIZE))
				1005	goto error_brelse;
				1006
				1007	device->generation = btrfs_super_generation(disk_super);
				1008	if (!latest_dev \|\|
				1009	device->generation > latest_dev->generation)
				1010	latest_dev = device;
				1011
				1012	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
				1013	device->writeable = 0;
				1014	} else {
				1015	device->writeable = !bdev_read_only(bdev);
				1016	seeding = 0;
				1017	}
				1018
				1019	q = bdev_get_queue(bdev);
				1020	if (blk_queue_discard(q))
				1021	device->can_discard = 1;
				1022	if (!blk_queue_nonrot(q))
				1023	fs_devices->rotating = 1;
				1024
				1025	device->bdev = bdev;
				1026	device->in_fs_metadata = 0;
				1027	device->mode = flags;
				1028
				1029	fs_devices->open_devices++;
				1030	if (device->writeable &&
				1031	device->devid != BTRFS_DEV_REPLACE_DEVID) {
				1032	fs_devices->rw_devices++;
				1033	list_add(&device->dev_alloc_list,
				1034	&fs_devices->alloc_list);
				1035	}
				1036	brelse(bh);
				1037	continue;
				1038
				1039	error_brelse:
				1040	brelse(bh);
				1041	blkdev_put(bdev, flags);
				1042	continue;
				1043	}
				1044	if (fs_devices->open_devices == 0) {
				1045	ret = -EINVAL;
				1046	goto out;
				1047	}
				1048	fs_devices->seeding = seeding;
				1049	fs_devices->opened = 1;
				1050	fs_devices->latest_bdev = latest_dev->bdev;
				1051	fs_devices->total_rw_bytes = 0;
				1052	out:
				1053	return ret;
				1054	}
				1055
				1056	int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
				1057	fmode_t flags, void *holder)
				1058	{
				1059	int ret;
				1060
				1061	mutex_lock(&uuid_mutex);
				1062	if (fs_devices->opened) {
				1063	fs_devices->opened++;
				1064	ret = 0;
				1065	} else {
				1066	ret = __btrfs_open_devices(fs_devices, flags, holder);
				1067	}
				1068	mutex_unlock(&uuid_mutex);
				1069	return ret;
				1070	}
				1071
				1072	void btrfs_release_disk_super(struct page *page)
				1073	{
				1074	kunmap(page);
				1075	put_page(page);
				1076	}
				1077
				1078	int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
				1079	struct page page, struct btrfs_super_block disk_super)
				1080	{
				1081	void *p;
				1082	pgoff_t index;
				1083
				1084	/* make sure our super fits in the device */
				1085	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
				1086	return 1;
				1087
				1088	/* make sure our super fits in the page */
				1089	if (sizeof(**disk_super) > PAGE_SIZE)
				1090	return 1;
				1091
				1092	/* make sure our super doesn't straddle pages on disk */
				1093	index = bytenr >> PAGE_SHIFT;
				1094	if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
				1095	return 1;
				1096
				1097	/* pull in the page with our super */
				1098	*page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
				1099	index, GFP_KERNEL);
				1100
				1101	if (IS_ERR_OR_NULL(*page))
				1102	return 1;
				1103
				1104	p = kmap(*page);
				1105
				1106	/* align our pointer to the offset of the super block */
				1107	*disk_super = p + (bytenr & ~PAGE_MASK);
				1108
				1109	if (btrfs_super_bytenr(*disk_super) != bytenr \|\|
				1110	btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
				1111	btrfs_release_disk_super(*page);
				1112	return 1;
				1113	}
				1114
				1115	if ((*disk_super)->label[0] &&
				1116	(*disk_super)->label[BTRFS_LABEL_SIZE - 1])
				1117	(*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';
				1118
				1119	return 0;
				1120	}
				1121
				1122	/*
				1123	* Look for a btrfs signature on a device. This may be called out of the mount path
				1124	* and we are not allowed to call set_blocksize during the scan. The superblock
				1125	* is read via pagecache
				1126	*/
				1127	int btrfs_scan_one_device(const char path, fmode_t flags, void holder,
				1128	struct btrfs_fs_devices **fs_devices_ret)
				1129	{
				1130	struct btrfs_super_block *disk_super;
				1131	struct block_device *bdev;
				1132	struct page *page;
				1133	int ret = -EINVAL;
				1134	u64 devid;
				1135	u64 transid;
				1136	u64 total_devices;
				1137	u64 bytenr;
				1138
				1139	/*
				1140	* we would like to check all the supers, but that would make
				1141	* a btrfs mount succeed after a mkfs from a different FS.
				1142	* So, we need to add a special mount option to scan for
				1143	* later supers, using BTRFS_SUPER_MIRROR_MAX instead
				1144	*/
				1145	bytenr = btrfs_sb_offset(0);
				1146	flags \|= FMODE_EXCL;
				1147	mutex_lock(&uuid_mutex);
				1148
				1149	bdev = blkdev_get_by_path(path, flags, holder);
				1150	if (IS_ERR(bdev)) {
				1151	ret = PTR_ERR(bdev);
				1152	goto error;
				1153	}
				1154
				1155	if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super))
				1156	goto error_bdev_put;
				1157
				1158	devid = btrfs_stack_device_id(&disk_super->dev_item);
				1159	transid = btrfs_super_generation(disk_super);
				1160	total_devices = btrfs_super_num_devices(disk_super);
				1161
				1162	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
				1163	if (ret > 0) {
				1164	if (disk_super->label[0]) {
				1165	pr_info("BTRFS: device label %s ", disk_super->label);
				1166	} else {
				1167	pr_info("BTRFS: device fsid %pU ", disk_super->fsid);
				1168	}
				1169
				1170	pr_cont("devid %llu transid %llu %s\n", devid, transid, path);
				1171	ret = 0;
				1172	}
				1173	if (!ret && fs_devices_ret)
				1174	(*fs_devices_ret)->total_devices = total_devices;
				1175
				1176	btrfs_release_disk_super(page);
				1177
				1178	error_bdev_put:
				1179	blkdev_put(bdev, flags);
				1180	error:
				1181	mutex_unlock(&uuid_mutex);
				1182	return ret;
				1183	}
				1184
				1185	/* helper to account the used device space in the range */
				1186	int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
				1187	u64 end, u64 *length)
				1188	{
				1189	struct btrfs_key key;
				1190	struct btrfs_root *root = device->fs_info->dev_root;
				1191	struct btrfs_dev_extent *dev_extent;
				1192	struct btrfs_path *path;
				1193	u64 extent_end;
				1194	int ret;
				1195	int slot;
				1196	struct extent_buffer *l;
				1197
				1198	*length = 0;
				1199
				1200	if (start >= device->total_bytes \|\| device->is_tgtdev_for_dev_replace)
				1201	return 0;
				1202
				1203	path = btrfs_alloc_path();
				1204	if (!path)
				1205	return -ENOMEM;
				1206	path->reada = READA_FORWARD;
				1207
				1208	key.objectid = device->devid;
				1209	key.offset = start;
				1210	key.type = BTRFS_DEV_EXTENT_KEY;
				1211
				1212	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				1213	if (ret < 0)
				1214	goto out;
				1215	if (ret > 0) {
				1216	ret = btrfs_previous_item(root, path, key.objectid, key.type);
				1217	if (ret < 0)
				1218	goto out;
				1219	}
				1220
				1221	while (1) {
				1222	l = path->nodes[0];
				1223	slot = path->slots[0];
				1224	if (slot >= btrfs_header_nritems(l)) {
				1225	ret = btrfs_next_leaf(root, path);
				1226	if (ret == 0)
				1227	continue;
				1228	if (ret < 0)
				1229	goto out;
				1230
				1231	break;
				1232	}
				1233	btrfs_item_key_to_cpu(l, &key, slot);
				1234
				1235	if (key.objectid < device->devid)
				1236	goto next;
				1237
				1238	if (key.objectid > device->devid)
				1239	break;
				1240
				1241	if (key.type != BTRFS_DEV_EXTENT_KEY)
				1242	goto next;
				1243
				1244	dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
				1245	extent_end = key.offset + btrfs_dev_extent_length(l,
				1246	dev_extent);
				1247	if (key.offset <= start && extent_end > end) {
				1248	*length = end - start + 1;
				1249	break;
				1250	} else if (key.offset <= start && extent_end > start)
				1251	*length += extent_end - start;
				1252	else if (key.offset > start && extent_end <= end)
				1253	*length += extent_end - key.offset;
				1254	else if (key.offset > start && key.offset <= end) {
				1255	*length += end - key.offset + 1;
				1256	break;
				1257	} else if (key.offset > end)
				1258	break;
				1259
				1260	next:
				1261	path->slots[0]++;
				1262	}
				1263	ret = 0;
				1264	out:
				1265	btrfs_free_path(path);
				1266	return ret;
				1267	}
				1268
				1269	static int contains_pending_extent(struct btrfs_transaction *transaction,
				1270	struct btrfs_device *device,
				1271	u64 *start, u64 len)
				1272	{
				1273	struct btrfs_fs_info *fs_info = device->fs_info;
				1274	struct extent_map *em;
				1275	struct list_head *search_list = &fs_info->pinned_chunks;
				1276	int ret = 0;
				1277	u64 physical_start = *start;
				1278
				1279	if (transaction)
				1280	search_list = &transaction->pending_chunks;
				1281	again:
				1282	list_for_each_entry(em, search_list, list) {
				1283	struct map_lookup *map;
				1284	int i;
				1285
				1286	map = em->map_lookup;
				1287	for (i = 0; i < map->num_stripes; i++) {
				1288	u64 end;
				1289
				1290	if (map->stripes[i].dev != device)
				1291	continue;
				1292	if (map->stripes[i].physical >= physical_start + len \|\|
				1293	map->stripes[i].physical + em->orig_block_len <=
				1294	physical_start)
				1295	continue;
				1296	/*
				1297	* Make sure that while processing the pinned list we do
				1298	* not override our *start with a lower value, because
				1299	* we can have pinned chunks that fall within this
				1300	* device hole and that have lower physical addresses
				1301	* than the pending chunks we processed before. If we
				1302	* do not take this special care we can end up getting
				1303	* 2 pending chunks that start at the same physical
				1304	* device offsets because the end offset of a pinned
				1305	* chunk can be equal to the start offset of some
				1306	* pending chunk.
				1307	*/
				1308	end = map->stripes[i].physical + em->orig_block_len;
				1309	if (end > *start) {
				1310	*start = end;
				1311	ret = 1;
				1312	}
				1313	}
				1314	}
				1315	if (search_list != &fs_info->pinned_chunks) {
				1316	search_list = &fs_info->pinned_chunks;
				1317	goto again;
				1318	}
				1319
				1320	return ret;
				1321	}
				1322
				1323
				1324	/*
				1325	* find_free_dev_extent_start - find free space in the specified device
				1326	* @device: the device which we search the free space in
				1327	* @num_bytes: the size of the free space that we need
				1328	* @search_start: the position from which to begin the search
				1329	* @start: store the start of the free space.
				1330	* @len: the size of the free space. that we find, or the size
				1331	* of the max free space if we don't find suitable free space
				1332	*
				1333	* this uses a pretty simple search, the expectation is that it is
				1334	* called very infrequently and that a given device has a small number
				1335	* of extents
				1336	*
				1337	* @start is used to store the start of the free space if we find. But if we
				1338	* don't find suitable free space, it will be used to store the start position
				1339	* of the max free space.
				1340	*
				1341	* @len is used to store the size of the free space that we find.
				1342	* But if we don't find suitable free space, it is used to store the size of
				1343	* the max free space.
				1344	*/
				1345	int find_free_dev_extent_start(struct btrfs_transaction *transaction,
				1346	struct btrfs_device *device, u64 num_bytes,
				1347	u64 search_start, u64 start, u64 len)
				1348	{
				1349	struct btrfs_fs_info *fs_info = device->fs_info;
				1350	struct btrfs_root *root = fs_info->dev_root;
				1351	struct btrfs_key key;
				1352	struct btrfs_dev_extent *dev_extent;
				1353	struct btrfs_path *path;
				1354	u64 hole_size;
				1355	u64 max_hole_start;
				1356	u64 max_hole_size;
				1357	u64 extent_end;
				1358	u64 search_end = device->total_bytes;
				1359	int ret;
				1360	int slot;
				1361	struct extent_buffer *l;
				1362
				1363	/*
				1364	* We don't want to overwrite the superblock on the drive nor any area
				1365	* used by the boot loader (grub for example), so we make sure to start
				1366	* at an offset of at least 1MB.
				1367	*/
				1368	search_start = max_t(u64, search_start, SZ_1M);
				1369
				1370	path = btrfs_alloc_path();
				1371	if (!path)
				1372	return -ENOMEM;
				1373
				1374	max_hole_start = search_start;
				1375	max_hole_size = 0;
				1376
				1377	again:
				1378	if (search_start >= search_end \|\| device->is_tgtdev_for_dev_replace) {
				1379	ret = -ENOSPC;
				1380	goto out;
				1381	}
				1382
				1383	path->reada = READA_FORWARD;
				1384	path->search_commit_root = 1;
				1385	path->skip_locking = 1;
				1386
				1387	key.objectid = device->devid;
				1388	key.offset = search_start;
				1389	key.type = BTRFS_DEV_EXTENT_KEY;
				1390
				1391	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				1392	if (ret < 0)
				1393	goto out;
				1394	if (ret > 0) {
				1395	ret = btrfs_previous_item(root, path, key.objectid, key.type);
				1396	if (ret < 0)
				1397	goto out;
				1398	}
				1399
				1400	while (1) {
				1401	l = path->nodes[0];
				1402	slot = path->slots[0];
				1403	if (slot >= btrfs_header_nritems(l)) {
				1404	ret = btrfs_next_leaf(root, path);
				1405	if (ret == 0)
				1406	continue;
				1407	if (ret < 0)
				1408	goto out;
				1409
				1410	break;
				1411	}
				1412	btrfs_item_key_to_cpu(l, &key, slot);
				1413
				1414	if (key.objectid < device->devid)
				1415	goto next;
				1416
				1417	if (key.objectid > device->devid)
				1418	break;
				1419
				1420	if (key.type != BTRFS_DEV_EXTENT_KEY)
				1421	goto next;
				1422
				1423	if (key.offset > search_start) {
				1424	hole_size = key.offset - search_start;
				1425
				1426	/*
				1427	* Have to check before we set max_hole_start, otherwise
				1428	* we could end up sending back this offset anyway.
				1429	*/
				1430	if (contains_pending_extent(transaction, device,
				1431	&search_start,
				1432	hole_size)) {
				1433	if (key.offset >= search_start) {
				1434	hole_size = key.offset - search_start;
				1435	} else {
				1436	WARN_ON_ONCE(1);
				1437	hole_size = 0;
				1438	}
				1439	}
				1440
				1441	if (hole_size > max_hole_size) {
				1442	max_hole_start = search_start;
				1443	max_hole_size = hole_size;
				1444	}
				1445
				1446	/*
				1447	* If this free space is greater than which we need,
				1448	* it must be the max free space that we have found
				1449	* until now, so max_hole_start must point to the start
				1450	* of this free space and the length of this free space
				1451	* is stored in max_hole_size. Thus, we return
				1452	* max_hole_start and max_hole_size and go back to the
				1453	* caller.
				1454	*/
				1455	if (hole_size >= num_bytes) {
				1456	ret = 0;
				1457	goto out;
				1458	}
				1459	}
				1460
				1461	dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
				1462	extent_end = key.offset + btrfs_dev_extent_length(l,
				1463	dev_extent);
				1464	if (extent_end > search_start)
				1465	search_start = extent_end;
				1466	next:
				1467	path->slots[0]++;
				1468	cond_resched();
				1469	}
				1470
				1471	/*
				1472	* At this point, search_start should be the end of
				1473	* allocated dev extents, and when shrinking the device,
				1474	* search_end may be smaller than search_start.
				1475	*/
				1476	if (search_end > search_start) {
				1477	hole_size = search_end - search_start;
				1478
				1479	if (contains_pending_extent(transaction, device, &search_start,
				1480	hole_size)) {
				1481	btrfs_release_path(path);
				1482	goto again;
				1483	}
				1484
				1485	if (hole_size > max_hole_size) {
				1486	max_hole_start = search_start;
				1487	max_hole_size = hole_size;
				1488	}
				1489	}
				1490
				1491	/* See above. */
				1492	if (max_hole_size < num_bytes)
				1493	ret = -ENOSPC;
				1494	else
				1495	ret = 0;
				1496
				1497	out:
				1498	btrfs_free_path(path);
				1499	*start = max_hole_start;
				1500	if (len)
				1501	*len = max_hole_size;
				1502	return ret;
				1503	}
				1504
				1505	int find_free_dev_extent(struct btrfs_trans_handle *trans,
				1506	struct btrfs_device *device, u64 num_bytes,
				1507	u64 start, u64 len)
				1508	{
				1509	/* FIXME use last free of some kind */
				1510	return find_free_dev_extent_start(trans->transaction, device,
				1511	num_bytes, 0, start, len);
				1512	}
				1513
				1514	static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
				1515	struct btrfs_device *device,
				1516	u64 start, u64 *dev_extent_len)
				1517	{
				1518	struct btrfs_fs_info *fs_info = device->fs_info;
				1519	struct btrfs_root *root = fs_info->dev_root;
				1520	int ret;
				1521	struct btrfs_path *path;
				1522	struct btrfs_key key;
				1523	struct btrfs_key found_key;
				1524	struct extent_buffer *leaf = NULL;
				1525	struct btrfs_dev_extent *extent = NULL;
				1526
				1527	path = btrfs_alloc_path();
				1528	if (!path)
				1529	return -ENOMEM;
				1530
				1531	key.objectid = device->devid;
				1532	key.offset = start;
				1533	key.type = BTRFS_DEV_EXTENT_KEY;
				1534	again:
				1535	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				1536	if (ret > 0) {
				1537	ret = btrfs_previous_item(root, path, key.objectid,
				1538	BTRFS_DEV_EXTENT_KEY);
				1539	if (ret)
				1540	goto out;
				1541	leaf = path->nodes[0];
				1542	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
				1543	extent = btrfs_item_ptr(leaf, path->slots[0],
				1544	struct btrfs_dev_extent);
				1545	BUG_ON(found_key.offset > start \|\| found_key.offset +
				1546	btrfs_dev_extent_length(leaf, extent) < start);
				1547	key = found_key;
				1548	btrfs_release_path(path);
				1549	goto again;
				1550	} else if (ret == 0) {
				1551	leaf = path->nodes[0];
				1552	extent = btrfs_item_ptr(leaf, path->slots[0],
				1553	struct btrfs_dev_extent);
				1554	} else {
				1555	btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
				1556	goto out;
				1557	}
				1558
				1559	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);
				1560
				1561	ret = btrfs_del_item(trans, root, path);
				1562	if (ret) {
				1563	btrfs_handle_fs_error(fs_info, ret,
				1564	"Failed to remove dev extent item");
				1565	} else {
				1566	set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
				1567	}
				1568	out:
				1569	btrfs_free_path(path);
				1570	return ret;
				1571	}
				1572
				1573	static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
				1574	struct btrfs_device *device,
				1575	u64 chunk_offset, u64 start, u64 num_bytes)
				1576	{
				1577	int ret;
				1578	struct btrfs_path *path;
				1579	struct btrfs_fs_info *fs_info = device->fs_info;
				1580	struct btrfs_root *root = fs_info->dev_root;
				1581	struct btrfs_dev_extent *extent;
				1582	struct extent_buffer *leaf;
				1583	struct btrfs_key key;
				1584
				1585	WARN_ON(!device->in_fs_metadata);
				1586	WARN_ON(device->is_tgtdev_for_dev_replace);
				1587	path = btrfs_alloc_path();
				1588	if (!path)
				1589	return -ENOMEM;
				1590
				1591	key.objectid = device->devid;
				1592	key.offset = start;
				1593	key.type = BTRFS_DEV_EXTENT_KEY;
				1594	ret = btrfs_insert_empty_item(trans, root, path, &key,
				1595	sizeof(*extent));
				1596	if (ret)
				1597	goto out;
				1598
				1599	leaf = path->nodes[0];
				1600	extent = btrfs_item_ptr(leaf, path->slots[0],
				1601	struct btrfs_dev_extent);
				1602	btrfs_set_dev_extent_chunk_tree(leaf, extent,
				1603	BTRFS_CHUNK_TREE_OBJECTID);
				1604	btrfs_set_dev_extent_chunk_objectid(leaf, extent,
				1605	BTRFS_FIRST_CHUNK_TREE_OBJECTID);
				1606	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
				1607
				1608	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
				1609	btrfs_mark_buffer_dirty(leaf);
				1610	out:
				1611	btrfs_free_path(path);
				1612	return ret;
				1613	}
				1614
				1615	static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
				1616	{
				1617	struct extent_map_tree *em_tree;
				1618	struct extent_map *em;
				1619	struct rb_node *n;
				1620	u64 ret = 0;
				1621
				1622	em_tree = &fs_info->mapping_tree.map_tree;
				1623	read_lock(&em_tree->lock);
				1624	n = rb_last(&em_tree->map);
				1625	if (n) {
				1626	em = rb_entry(n, struct extent_map, rb_node);
				1627	ret = em->start + em->len;
				1628	}
				1629	read_unlock(&em_tree->lock);
				1630
				1631	return ret;
				1632	}
				1633
				1634	static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
				1635	u64 *devid_ret)
				1636	{
				1637	int ret;
				1638	struct btrfs_key key;
				1639	struct btrfs_key found_key;
				1640	struct btrfs_path *path;
				1641
				1642	path = btrfs_alloc_path();
				1643	if (!path)
				1644	return -ENOMEM;
				1645
				1646	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
				1647	key.type = BTRFS_DEV_ITEM_KEY;
				1648	key.offset = (u64)-1;
				1649
				1650	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
				1651	if (ret < 0)
				1652	goto error;
				1653
				1654	BUG_ON(ret == 0); /* Corruption */
				1655
				1656	ret = btrfs_previous_item(fs_info->chunk_root, path,
				1657	BTRFS_DEV_ITEMS_OBJECTID,
				1658	BTRFS_DEV_ITEM_KEY);
				1659	if (ret) {
				1660	*devid_ret = 1;
				1661	} else {
				1662	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				1663	path->slots[0]);
				1664	*devid_ret = found_key.offset + 1;
				1665	}
				1666	ret = 0;
				1667	error:
				1668	btrfs_free_path(path);
				1669	return ret;
				1670	}
				1671
				1672	/*
				1673	* the device information is stored in the chunk root
				1674	* the btrfs_device struct should be fully filled in
				1675	*/
				1676	static int btrfs_add_device(struct btrfs_trans_handle *trans,
				1677	struct btrfs_fs_info *fs_info,
				1678	struct btrfs_device *device)
				1679	{
				1680	struct btrfs_root *root = fs_info->chunk_root;
				1681	int ret;
				1682	struct btrfs_path *path;
				1683	struct btrfs_dev_item *dev_item;
				1684	struct extent_buffer *leaf;
				1685	struct btrfs_key key;
				1686	unsigned long ptr;
				1687
				1688	path = btrfs_alloc_path();
				1689	if (!path)
				1690	return -ENOMEM;
				1691
				1692	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
				1693	key.type = BTRFS_DEV_ITEM_KEY;
				1694	key.offset = device->devid;
				1695
				1696	ret = btrfs_insert_empty_item(trans, root, path, &key,
				1697	sizeof(*dev_item));
				1698	if (ret)
				1699	goto out;
				1700
				1701	leaf = path->nodes[0];
				1702	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
				1703
				1704	btrfs_set_device_id(leaf, dev_item, device->devid);
				1705	btrfs_set_device_generation(leaf, dev_item, 0);
				1706	btrfs_set_device_type(leaf, dev_item, device->type);
				1707	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
				1708	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
				1709	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
				1710	btrfs_set_device_total_bytes(leaf, dev_item,
				1711	btrfs_device_get_disk_total_bytes(device));
				1712	btrfs_set_device_bytes_used(leaf, dev_item,
				1713	btrfs_device_get_bytes_used(device));
				1714	btrfs_set_device_group(leaf, dev_item, 0);
				1715	btrfs_set_device_seek_speed(leaf, dev_item, 0);
				1716	btrfs_set_device_bandwidth(leaf, dev_item, 0);
				1717	btrfs_set_device_start_offset(leaf, dev_item, 0);
				1718
				1719	ptr = btrfs_device_uuid(dev_item);
				1720	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
				1721	ptr = btrfs_device_fsid(dev_item);
				1722	write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_FSID_SIZE);
				1723	btrfs_mark_buffer_dirty(leaf);
				1724
				1725	ret = 0;
				1726	out:
				1727	btrfs_free_path(path);
				1728	return ret;
				1729	}
				1730
				1731	/*
				1732	* Function to update ctime/mtime for a given device path.
				1733	* Mainly used for ctime/mtime based probe like libblkid.
				1734	*/
				1735	static void update_dev_time(const char *path_name)
				1736	{
				1737	struct file *filp;
				1738
				1739	filp = filp_open(path_name, O_RDWR, 0);
				1740	if (IS_ERR(filp))
				1741	return;
				1742	file_update_time(filp);
				1743	filp_close(filp, NULL);
				1744	}
				1745
				1746	static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
				1747	struct btrfs_device *device)
				1748	{
				1749	struct btrfs_root *root = fs_info->chunk_root;
				1750	int ret;
				1751	struct btrfs_path *path;
				1752	struct btrfs_key key;
				1753	struct btrfs_trans_handle *trans;
				1754
				1755	path = btrfs_alloc_path();
				1756	if (!path)
				1757	return -ENOMEM;
				1758
				1759	trans = btrfs_start_transaction(root, 0);
				1760	if (IS_ERR(trans)) {
				1761	btrfs_free_path(path);
				1762	return PTR_ERR(trans);
				1763	}
				1764	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
				1765	key.type = BTRFS_DEV_ITEM_KEY;
				1766	key.offset = device->devid;
				1767
				1768	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				1769	if (ret) {
				1770	if (ret > 0)
				1771	ret = -ENOENT;
				1772	btrfs_abort_transaction(trans, ret);
				1773	btrfs_end_transaction(trans);
				1774	goto out;
				1775	}
				1776
				1777	ret = btrfs_del_item(trans, root, path);
				1778	if (ret) {
				1779	btrfs_abort_transaction(trans, ret);
				1780	btrfs_end_transaction(trans);
				1781	}
				1782
				1783	out:
				1784	btrfs_free_path(path);
				1785	if (!ret)
				1786	ret = btrfs_commit_transaction(trans);
				1787	return ret;
				1788	}
				1789
				1790	/*
				1791	* Verify that @num_devices satisfies the RAID profile constraints in the whole
				1792	* filesystem. It's up to the caller to adjust that number regarding eg. device
				1793	* replace.
				1794	*/
				1795	static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
				1796	u64 num_devices)
				1797	{
				1798	u64 all_avail;
				1799	unsigned seq;
				1800	int i;
				1801
				1802	do {
				1803	seq = read_seqbegin(&fs_info->profiles_lock);
				1804
				1805	all_avail = fs_info->avail_data_alloc_bits \|
				1806	fs_info->avail_system_alloc_bits \|
				1807	fs_info->avail_metadata_alloc_bits;
				1808	} while (read_seqretry(&fs_info->profiles_lock, seq));
				1809
				1810	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
				1811	if (!(all_avail & btrfs_raid_group[i]))
				1812	continue;
				1813
				1814	if (num_devices < btrfs_raid_array[i].devs_min) {
				1815	int ret = btrfs_raid_mindev_error[i];
				1816
				1817	if (ret)
				1818	return ret;
				1819	}
				1820	}
				1821
				1822	return 0;
				1823	}
				1824
				1825	struct btrfs_device btrfs_find_next_active_device(struct btrfs_fs_devices fs_devs,
				1826	struct btrfs_device *device)
				1827	{
				1828	struct btrfs_device *next_device;
				1829
				1830	list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
				1831	if (next_device != device &&
				1832	!next_device->missing && next_device->bdev)
				1833	return next_device;
				1834	}
				1835
				1836	return NULL;
				1837	}
				1838
				1839	/*
				1840	* Helper function to check if the given device is part of s_bdev / latest_bdev
				1841	* and replace it with the provided or the next active device, in the context
				1842	* where this function called, there should be always be another device (or
				1843	* this_dev) which is active.
				1844	*/
				1845	void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
				1846	struct btrfs_device device, struct btrfs_device this_dev)
				1847	{
				1848	struct btrfs_device *next_device;
				1849
				1850	if (this_dev)
				1851	next_device = this_dev;
				1852	else
				1853	next_device = btrfs_find_next_active_device(fs_info->fs_devices,
				1854	device);
				1855	ASSERT(next_device);
				1856
				1857	if (fs_info->sb->s_bdev &&
				1858	(fs_info->sb->s_bdev == device->bdev))
				1859	fs_info->sb->s_bdev = next_device->bdev;
				1860
				1861	if (fs_info->fs_devices->latest_bdev == device->bdev)
				1862	fs_info->fs_devices->latest_bdev = next_device->bdev;
				1863	}
				1864
				1865	int btrfs_rm_device(struct btrfs_fs_info fs_info, const char device_path,
				1866	u64 devid)
				1867	{
				1868	struct btrfs_device *device;
				1869	struct btrfs_fs_devices *cur_devices;
				1870	u64 num_devices;
				1871	int ret = 0;
				1872
				1873	mutex_lock(&uuid_mutex);
				1874
				1875	num_devices = fs_info->fs_devices->num_devices;
				1876	btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
				1877	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
				1878	WARN_ON(num_devices < 1);
				1879	num_devices--;
				1880	}
				1881	btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
				1882
				1883	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
				1884	if (ret)
				1885	goto out;
				1886
				1887	ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
				1888	&device);
				1889	if (ret)
				1890	goto out;
				1891
				1892	if (device->is_tgtdev_for_dev_replace) {
				1893	ret = BTRFS_ERROR_DEV_TGT_REPLACE;
				1894	goto out;
				1895	}
				1896
				1897	if (device->writeable && fs_info->fs_devices->rw_devices == 1) {
				1898	ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
				1899	goto out;
				1900	}
				1901
				1902	if (device->writeable) {
				1903	mutex_lock(&fs_info->chunk_mutex);
				1904	list_del_init(&device->dev_alloc_list);
				1905	device->fs_devices->rw_devices--;
				1906	mutex_unlock(&fs_info->chunk_mutex);
				1907	}
				1908
				1909	mutex_unlock(&uuid_mutex);
				1910	ret = btrfs_shrink_device(device, 0);
				1911	mutex_lock(&uuid_mutex);
				1912	if (ret)
				1913	goto error_undo;
				1914
				1915	/*
				1916	* TODO: the superblock still includes this device in its num_devices
				1917	* counter although write_all_supers() is not locked out. This
				1918	* could give a filesystem state which requires a degraded mount.
				1919	*/
				1920	ret = btrfs_rm_dev_item(fs_info, device);
				1921	if (ret)
				1922	goto error_undo;
				1923
				1924	device->in_fs_metadata = 0;
				1925	btrfs_scrub_cancel_dev(fs_info, device);
				1926
				1927	/*
				1928	* the device list mutex makes sure that we don't change
				1929	* the device list while someone else is writing out all
				1930	* the device supers. Whoever is writing all supers, should
				1931	* lock the device list mutex before getting the number of
				1932	* devices in the super block (super_copy). Conversely,
				1933	* whoever updates the number of devices in the super block
				1934	* (super_copy) should hold the device list mutex.
				1935	*/
				1936
				1937	cur_devices = device->fs_devices;
				1938	mutex_lock(&fs_info->fs_devices->device_list_mutex);
				1939	list_del_rcu(&device->dev_list);
				1940
				1941	device->fs_devices->num_devices--;
				1942	device->fs_devices->total_devices--;
				1943
				1944	if (device->missing)
				1945	device->fs_devices->missing_devices--;
				1946
				1947	btrfs_assign_next_active_device(fs_info, device, NULL);
				1948
				1949	if (device->bdev) {
				1950	device->fs_devices->open_devices--;
				1951	/* remove sysfs entry */
				1952	btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
				1953	}
				1954
				1955	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
				1956	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
				1957	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				1958
				1959	/*
				1960	* at this point, the device is zero sized and detached from
				1961	* the devices list. All that's left is to zero out the old
				1962	* supers and free the device.
				1963	*/
				1964	if (device->writeable)
				1965	btrfs_scratch_superblocks(device->bdev, device->name->str);
				1966
				1967	btrfs_close_bdev(device);
				1968	call_rcu(&device->rcu, free_device);
				1969
				1970	if (cur_devices->open_devices == 0) {
				1971	struct btrfs_fs_devices *fs_devices;
				1972	fs_devices = fs_info->fs_devices;
				1973	while (fs_devices) {
				1974	if (fs_devices->seed == cur_devices) {
				1975	fs_devices->seed = cur_devices->seed;
				1976	break;
				1977	}
				1978	fs_devices = fs_devices->seed;
				1979	}
				1980	cur_devices->seed = NULL;
				1981	__btrfs_close_devices(cur_devices);
				1982	free_fs_devices(cur_devices);
				1983	}
				1984
				1985	out:
				1986	mutex_unlock(&uuid_mutex);
				1987	return ret;
				1988
				1989	error_undo:
				1990	if (device->writeable) {
				1991	mutex_lock(&fs_info->chunk_mutex);
				1992	list_add(&device->dev_alloc_list,
				1993	&fs_info->fs_devices->alloc_list);
				1994	device->fs_devices->rw_devices++;
				1995	mutex_unlock(&fs_info->chunk_mutex);
				1996	}
				1997	goto out;
				1998	}
				1999
				2000	void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
				2001	struct btrfs_device *srcdev)
				2002	{
				2003	struct btrfs_fs_devices *fs_devices;
				2004
				2005	WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
				2006
				2007	/*
				2008	* in case of fs with no seed, srcdev->fs_devices will point
				2009	* to fs_devices of fs_info. However when the dev being replaced is
				2010	* a seed dev it will point to the seed's local fs_devices. In short
				2011	* srcdev will have its correct fs_devices in both the cases.
				2012	*/
				2013	fs_devices = srcdev->fs_devices;
				2014
				2015	list_del_rcu(&srcdev->dev_list);
				2016	list_del_rcu(&srcdev->dev_alloc_list);
				2017	fs_devices->num_devices--;
				2018	if (srcdev->missing)
				2019	fs_devices->missing_devices--;
				2020
				2021	if (srcdev->writeable)
				2022	fs_devices->rw_devices--;
				2023
				2024	if (srcdev->bdev)
				2025	fs_devices->open_devices--;
				2026	}
				2027
				2028	void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
				2029	struct btrfs_device *srcdev)
				2030	{
				2031	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
				2032
				2033	if (srcdev->writeable) {
				2034	/* zero out the old super if it is writable */
				2035	btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
				2036	}
				2037
				2038	btrfs_close_bdev(srcdev);
				2039
				2040	call_rcu(&srcdev->rcu, free_device);
				2041
				2042	/*
				2043	* unless fs_devices is seed fs, num_devices shouldn't go
				2044	* zero
				2045	*/
				2046	BUG_ON(!fs_devices->num_devices && !fs_devices->seeding);
				2047
				2048	/* if this is no devs we rather delete the fs_devices */
				2049	if (!fs_devices->num_devices) {
				2050	struct btrfs_fs_devices *tmp_fs_devices;
				2051
				2052	tmp_fs_devices = fs_info->fs_devices;
				2053	while (tmp_fs_devices) {
				2054	if (tmp_fs_devices->seed == fs_devices) {
				2055	tmp_fs_devices->seed = fs_devices->seed;
				2056	break;
				2057	}
				2058	tmp_fs_devices = tmp_fs_devices->seed;
				2059	}
				2060	fs_devices->seed = NULL;
				2061	__btrfs_close_devices(fs_devices);
				2062	free_fs_devices(fs_devices);
				2063	}
				2064	}
				2065
				2066	void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
				2067	struct btrfs_device *tgtdev)
				2068	{
				2069	mutex_lock(&uuid_mutex);
				2070	WARN_ON(!tgtdev);
				2071	mutex_lock(&fs_info->fs_devices->device_list_mutex);
				2072
				2073	btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
				2074
				2075	if (tgtdev->bdev)
				2076	fs_info->fs_devices->open_devices--;
				2077
				2078	fs_info->fs_devices->num_devices--;
				2079
				2080	btrfs_assign_next_active_device(fs_info, tgtdev, NULL);
				2081
				2082	list_del_rcu(&tgtdev->dev_list);
				2083
				2084	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				2085	mutex_unlock(&uuid_mutex);
				2086
				2087	/*
				2088	* The update_dev_time() with in btrfs_scratch_superblocks()
				2089	* may lead to a call to btrfs_show_devname() which will try
				2090	* to hold device_list_mutex. And here this device
				2091	* is already out of device list, so we don't have to hold
				2092	* the device_list_mutex lock.
				2093	*/
				2094	btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
				2095
				2096	btrfs_close_bdev(tgtdev);
				2097	call_rcu(&tgtdev->rcu, free_device);
				2098	}
				2099
				2100	static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
				2101	const char *device_path,
				2102	struct btrfs_device **device)
				2103	{
				2104	int ret = 0;
				2105	struct btrfs_super_block *disk_super;
				2106	u64 devid;
				2107	u8 *dev_uuid;
				2108	struct block_device *bdev;
				2109	struct buffer_head *bh;
				2110
				2111	*device = NULL;
				2112	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
				2113	fs_info->bdev_holder, 0, &bdev, &bh);
				2114	if (ret)
				2115	return ret;
				2116	disk_super = (struct btrfs_super_block *)bh->b_data;
				2117	devid = btrfs_stack_device_id(&disk_super->dev_item);
				2118	dev_uuid = disk_super->dev_item.uuid;
				2119	*device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
				2120	brelse(bh);
				2121	if (!*device)
				2122	ret = -ENOENT;
				2123	blkdev_put(bdev, FMODE_READ);
				2124	return ret;
				2125	}
				2126
				2127	int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
				2128	const char *device_path,
				2129	struct btrfs_device **device)
				2130	{
				2131	*device = NULL;
				2132	if (strcmp(device_path, "missing") == 0) {
				2133	struct list_head *devices;
				2134	struct btrfs_device *tmp;
				2135
				2136	devices = &fs_info->fs_devices->devices;
				2137	/*
				2138	* It is safe to read the devices since the volume_mutex
				2139	* is held by the caller.
				2140	*/
				2141	list_for_each_entry(tmp, devices, dev_list) {
				2142	if (tmp->in_fs_metadata && !tmp->bdev) {
				2143	*device = tmp;
				2144	break;
				2145	}
				2146	}
				2147
				2148	if (!*device)
				2149	return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
				2150
				2151	return 0;
				2152	} else {
				2153	return btrfs_find_device_by_path(fs_info, device_path, device);
				2154	}
				2155	}
				2156
				2157	/*
				2158	* Lookup a device given by device id, or the path if the id is 0.
				2159	*/
				2160	int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
				2161	const char *devpath,
				2162	struct btrfs_device **device)
				2163	{
				2164	int ret;
				2165
				2166	if (devid) {
				2167	ret = 0;
				2168	*device = btrfs_find_device(fs_info, devid, NULL, NULL);
				2169	if (!*device)
				2170	ret = -ENOENT;
				2171	} else {
				2172	if (!devpath \|\| !devpath[0])
				2173	return -EINVAL;
				2174
				2175	ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
				2176	device);
				2177	}
				2178	return ret;
				2179	}
				2180
				2181	/*
				2182	* does all the dirty work required for changing file system's UUID.
				2183	*/
				2184	static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
				2185	{
				2186	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				2187	struct btrfs_fs_devices *old_devices;
				2188	struct btrfs_fs_devices *seed_devices;
				2189	struct btrfs_super_block *disk_super = fs_info->super_copy;
				2190	struct btrfs_device *device;
				2191	u64 super_flags;
				2192
				2193	BUG_ON(!mutex_is_locked(&uuid_mutex));
				2194	if (!fs_devices->seeding)
				2195	return -EINVAL;
				2196
				2197	seed_devices = alloc_fs_devices(NULL);
				2198	if (IS_ERR(seed_devices))
				2199	return PTR_ERR(seed_devices);
				2200
				2201	old_devices = clone_fs_devices(fs_devices);
				2202	if (IS_ERR(old_devices)) {
				2203	kfree(seed_devices);
				2204	return PTR_ERR(old_devices);
				2205	}
				2206
				2207	list_add(&old_devices->list, &fs_uuids);
				2208
				2209	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
				2210	seed_devices->opened = 1;
				2211	INIT_LIST_HEAD(&seed_devices->devices);
				2212	INIT_LIST_HEAD(&seed_devices->alloc_list);
				2213	mutex_init(&seed_devices->device_list_mutex);
				2214
				2215	mutex_lock(&fs_info->fs_devices->device_list_mutex);
				2216	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
				2217	synchronize_rcu);
				2218	list_for_each_entry(device, &seed_devices->devices, dev_list)
				2219	device->fs_devices = seed_devices;
				2220
				2221	mutex_lock(&fs_info->chunk_mutex);
				2222	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
				2223	mutex_unlock(&fs_info->chunk_mutex);
				2224
				2225	fs_devices->seeding = 0;
				2226	fs_devices->num_devices = 0;
				2227	fs_devices->open_devices = 0;
				2228	fs_devices->missing_devices = 0;
				2229	fs_devices->rotating = 0;
				2230	fs_devices->seed = seed_devices;
				2231
				2232	generate_random_uuid(fs_devices->fsid);
				2233	memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
				2234	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
				2235	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				2236
				2237	super_flags = btrfs_super_flags(disk_super) &
				2238	~BTRFS_SUPER_FLAG_SEEDING;
				2239	btrfs_set_super_flags(disk_super, super_flags);
				2240
				2241	return 0;
				2242	}
				2243
				2244	/*
				2245	* Store the expected generation for seed devices in device items.
				2246	*/
				2247	static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
				2248	struct btrfs_fs_info *fs_info)
				2249	{
				2250	struct btrfs_root *root = fs_info->chunk_root;
				2251	struct btrfs_path *path;
				2252	struct extent_buffer *leaf;
				2253	struct btrfs_dev_item *dev_item;
				2254	struct btrfs_device *device;
				2255	struct btrfs_key key;
				2256	u8 fs_uuid[BTRFS_FSID_SIZE];
				2257	u8 dev_uuid[BTRFS_UUID_SIZE];
				2258	u64 devid;
				2259	int ret;
				2260
				2261	path = btrfs_alloc_path();
				2262	if (!path)
				2263	return -ENOMEM;
				2264
				2265	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
				2266	key.offset = 0;
				2267	key.type = BTRFS_DEV_ITEM_KEY;
				2268
				2269	while (1) {
				2270	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
				2271	if (ret < 0)
				2272	goto error;
				2273
				2274	leaf = path->nodes[0];
				2275	next_slot:
				2276	if (path->slots[0] >= btrfs_header_nritems(leaf)) {
				2277	ret = btrfs_next_leaf(root, path);
				2278	if (ret > 0)
				2279	break;
				2280	if (ret < 0)
				2281	goto error;
				2282	leaf = path->nodes[0];
				2283	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				2284	btrfs_release_path(path);
				2285	continue;
				2286	}
				2287
				2288	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				2289	if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID \|\|
				2290	key.type != BTRFS_DEV_ITEM_KEY)
				2291	break;
				2292
				2293	dev_item = btrfs_item_ptr(leaf, path->slots[0],
				2294	struct btrfs_dev_item);
				2295	devid = btrfs_device_id(leaf, dev_item);
				2296	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
				2297	BTRFS_UUID_SIZE);
				2298	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
				2299	BTRFS_FSID_SIZE);
				2300	device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
				2301	BUG_ON(!device); /* Logic error */
				2302
				2303	if (device->fs_devices->seeding) {
				2304	btrfs_set_device_generation(leaf, dev_item,
				2305	device->generation);
				2306	btrfs_mark_buffer_dirty(leaf);
				2307	}
				2308
				2309	path->slots[0]++;
				2310	goto next_slot;
				2311	}
				2312	ret = 0;
				2313	error:
				2314	btrfs_free_path(path);
				2315	return ret;
				2316	}
				2317
				2318	int btrfs_init_new_device(struct btrfs_fs_info fs_info, const char device_path)
				2319	{
				2320	struct btrfs_root *root = fs_info->dev_root;
				2321	struct request_queue *q;
				2322	struct btrfs_trans_handle *trans;
				2323	struct btrfs_device *device;
				2324	struct block_device *bdev;
				2325	struct list_head *devices;
				2326	struct super_block *sb = fs_info->sb;
				2327	struct rcu_string *name;
				2328	u64 tmp;
				2329	int seeding_dev = 0;
				2330	int ret = 0;
				2331
				2332	if (sb_rdonly(sb) && !fs_info->fs_devices->seeding)
				2333	return -EROFS;
				2334
				2335	bdev = blkdev_get_by_path(device_path, FMODE_WRITE \| FMODE_EXCL,
				2336	fs_info->bdev_holder);
				2337	if (IS_ERR(bdev))
				2338	return PTR_ERR(bdev);
				2339
				2340	if (fs_info->fs_devices->seeding) {
				2341	seeding_dev = 1;
				2342	down_write(&sb->s_umount);
				2343	mutex_lock(&uuid_mutex);
				2344	}
				2345
				2346	filemap_write_and_wait(bdev->bd_inode->i_mapping);
				2347
				2348	devices = &fs_info->fs_devices->devices;
				2349
				2350	mutex_lock(&fs_info->fs_devices->device_list_mutex);
				2351	list_for_each_entry(device, devices, dev_list) {
				2352	if (device->bdev == bdev) {
				2353	ret = -EEXIST;
				2354	mutex_unlock(
				2355	&fs_info->fs_devices->device_list_mutex);
				2356	goto error;
				2357	}
				2358	}
				2359	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				2360
				2361	device = btrfs_alloc_device(fs_info, NULL, NULL);
				2362	if (IS_ERR(device)) {
				2363	/* we can safely leave the fs_devices entry around */
				2364	ret = PTR_ERR(device);
				2365	goto error;
				2366	}
				2367
				2368	name = rcu_string_strdup(device_path, GFP_KERNEL);
				2369	if (!name) {
				2370	kfree(device);
				2371	ret = -ENOMEM;
				2372	goto error;
				2373	}
				2374	rcu_assign_pointer(device->name, name);
				2375
				2376	trans = btrfs_start_transaction(root, 0);
				2377	if (IS_ERR(trans)) {
				2378	rcu_string_free(device->name);
				2379	kfree(device);
				2380	ret = PTR_ERR(trans);
				2381	goto error;
				2382	}
				2383
				2384	q = bdev_get_queue(bdev);
				2385	if (blk_queue_discard(q))
				2386	device->can_discard = 1;
				2387	device->writeable = 1;
				2388	device->generation = trans->transid;
				2389	device->io_width = fs_info->sectorsize;
				2390	device->io_align = fs_info->sectorsize;
				2391	device->sector_size = fs_info->sectorsize;
				2392	device->total_bytes = round_down(i_size_read(bdev->bd_inode),
				2393	fs_info->sectorsize);
				2394	device->disk_total_bytes = device->total_bytes;
				2395	device->commit_total_bytes = device->total_bytes;
				2396	device->fs_info = fs_info;
				2397	device->bdev = bdev;
				2398	device->in_fs_metadata = 1;
				2399	device->is_tgtdev_for_dev_replace = 0;
				2400	device->mode = FMODE_EXCL;
				2401	device->dev_stats_valid = 1;
				2402	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
				2403
				2404	if (seeding_dev) {
				2405	sb->s_flags &= ~MS_RDONLY;
				2406	ret = btrfs_prepare_sprout(fs_info);
				2407	BUG_ON(ret); /* -ENOMEM */
				2408	}
				2409
				2410	device->fs_devices = fs_info->fs_devices;
				2411
				2412	mutex_lock(&fs_info->fs_devices->device_list_mutex);
				2413	mutex_lock(&fs_info->chunk_mutex);
				2414	list_add_rcu(&device->dev_list, &fs_info->fs_devices->devices);
				2415	list_add(&device->dev_alloc_list,
				2416	&fs_info->fs_devices->alloc_list);
				2417	fs_info->fs_devices->num_devices++;
				2418	fs_info->fs_devices->open_devices++;
				2419	fs_info->fs_devices->rw_devices++;
				2420	fs_info->fs_devices->total_devices++;
				2421	fs_info->fs_devices->total_rw_bytes += device->total_bytes;
				2422
				2423	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
				2424
				2425	if (!blk_queue_nonrot(q))
				2426	fs_info->fs_devices->rotating = 1;
				2427
				2428	tmp = btrfs_super_total_bytes(fs_info->super_copy);
				2429	btrfs_set_super_total_bytes(fs_info->super_copy,
				2430	round_down(tmp + device->total_bytes, fs_info->sectorsize));
				2431
				2432	tmp = btrfs_super_num_devices(fs_info->super_copy);
				2433	btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1);
				2434
				2435	/* add sysfs device entry */
				2436	btrfs_sysfs_add_device_link(fs_info->fs_devices, device);
				2437
				2438	/*
				2439	* we've got more storage, clear any full flags on the space
				2440	* infos
				2441	*/
				2442	btrfs_clear_space_info_full(fs_info);
				2443
				2444	mutex_unlock(&fs_info->chunk_mutex);
				2445	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				2446
				2447	if (seeding_dev) {
				2448	mutex_lock(&fs_info->chunk_mutex);
				2449	ret = init_first_rw_device(trans, fs_info);
				2450	mutex_unlock(&fs_info->chunk_mutex);
				2451	if (ret) {
				2452	btrfs_abort_transaction(trans, ret);
				2453	goto error_trans;
				2454	}
				2455	}
				2456
				2457	ret = btrfs_add_device(trans, fs_info, device);
				2458	if (ret) {
				2459	btrfs_abort_transaction(trans, ret);
				2460	goto error_trans;
				2461	}
				2462
				2463	if (seeding_dev) {
				2464	char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
				2465
				2466	ret = btrfs_finish_sprout(trans, fs_info);
				2467	if (ret) {
				2468	btrfs_abort_transaction(trans, ret);
				2469	goto error_trans;
				2470	}
				2471
				2472	/* Sprouting would change fsid of the mounted root,
				2473	* so rename the fsid on the sysfs
				2474	*/
				2475	snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
				2476	fs_info->fsid);
				2477	if (kobject_rename(&fs_info->fs_devices->fsid_kobj, fsid_buf))
				2478	btrfs_warn(fs_info,
				2479	"sysfs: failed to create fsid for sprout");
				2480	}
				2481
				2482	ret = btrfs_commit_transaction(trans);
				2483
				2484	if (seeding_dev) {
				2485	mutex_unlock(&uuid_mutex);
				2486	up_write(&sb->s_umount);
				2487
				2488	if (ret) /* transaction commit */
				2489	return ret;
				2490
				2491	ret = btrfs_relocate_sys_chunks(fs_info);
				2492	if (ret < 0)
				2493	btrfs_handle_fs_error(fs_info, ret,
				2494	"Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
				2495	trans = btrfs_attach_transaction(root);
				2496	if (IS_ERR(trans)) {
				2497	if (PTR_ERR(trans) == -ENOENT)
				2498	return 0;
				2499	return PTR_ERR(trans);
				2500	}
				2501	ret = btrfs_commit_transaction(trans);
				2502	}
				2503
				2504	/* Update ctime/mtime for libblkid */
				2505	update_dev_time(device_path);
				2506	return ret;
				2507
				2508	error_trans:
				2509	if (seeding_dev)
				2510	sb->s_flags \|= MS_RDONLY;
				2511	btrfs_end_transaction(trans);
				2512	rcu_string_free(device->name);
				2513	btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
				2514	kfree(device);
				2515	error:
				2516	blkdev_put(bdev, FMODE_EXCL);
				2517	if (seeding_dev) {
				2518	mutex_unlock(&uuid_mutex);
				2519	up_write(&sb->s_umount);
				2520	}
				2521	return ret;
				2522	}
				2523
				2524	int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
				2525	const char *device_path,
				2526	struct btrfs_device *srcdev,
				2527	struct btrfs_device **device_out)
				2528	{
				2529	struct request_queue *q;
				2530	struct btrfs_device *device;
				2531	struct block_device *bdev;
				2532	struct list_head *devices;
				2533	struct rcu_string *name;
				2534	u64 devid = BTRFS_DEV_REPLACE_DEVID;
				2535	int ret = 0;
				2536
				2537	*device_out = NULL;
				2538	if (fs_info->fs_devices->seeding) {
				2539	btrfs_err(fs_info, "the filesystem is a seed filesystem!");
				2540	return -EINVAL;
				2541	}
				2542
				2543	bdev = blkdev_get_by_path(device_path, FMODE_WRITE \| FMODE_EXCL,
				2544	fs_info->bdev_holder);
				2545	if (IS_ERR(bdev)) {
				2546	btrfs_err(fs_info, "target device %s is invalid!", device_path);
				2547	return PTR_ERR(bdev);
				2548	}
				2549
				2550	filemap_write_and_wait(bdev->bd_inode->i_mapping);
				2551
				2552	devices = &fs_info->fs_devices->devices;
				2553	list_for_each_entry(device, devices, dev_list) {
				2554	if (device->bdev == bdev) {
				2555	btrfs_err(fs_info,
				2556	"target device is in the filesystem!");
				2557	ret = -EEXIST;
				2558	goto error;
				2559	}
				2560	}
				2561
				2562
				2563	if (i_size_read(bdev->bd_inode) <
				2564	btrfs_device_get_total_bytes(srcdev)) {
				2565	btrfs_err(fs_info,
				2566	"target device is smaller than source device!");
				2567	ret = -EINVAL;
				2568	goto error;
				2569	}
				2570
				2571
				2572	device = btrfs_alloc_device(NULL, &devid, NULL);
				2573	if (IS_ERR(device)) {
				2574	ret = PTR_ERR(device);
				2575	goto error;
				2576	}
				2577
				2578	name = rcu_string_strdup(device_path, GFP_KERNEL);
				2579	if (!name) {
				2580	kfree(device);
				2581	ret = -ENOMEM;
				2582	goto error;
				2583	}
				2584	rcu_assign_pointer(device->name, name);
				2585
				2586	q = bdev_get_queue(bdev);
				2587	if (blk_queue_discard(q))
				2588	device->can_discard = 1;
				2589	mutex_lock(&fs_info->fs_devices->device_list_mutex);
				2590	device->writeable = 1;
				2591	device->generation = 0;
				2592	device->io_width = fs_info->sectorsize;
				2593	device->io_align = fs_info->sectorsize;
				2594	device->sector_size = fs_info->sectorsize;
				2595	device->total_bytes = btrfs_device_get_total_bytes(srcdev);
				2596	device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
				2597	device->bytes_used = btrfs_device_get_bytes_used(srcdev);
				2598	ASSERT(list_empty(&srcdev->resized_list));
				2599	device->commit_total_bytes = srcdev->commit_total_bytes;
				2600	device->commit_bytes_used = device->bytes_used;
				2601	device->fs_info = fs_info;
				2602	device->bdev = bdev;
				2603	device->in_fs_metadata = 1;
				2604	device->is_tgtdev_for_dev_replace = 1;
				2605	device->mode = FMODE_EXCL;
				2606	device->dev_stats_valid = 1;
				2607	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
				2608	device->fs_devices = fs_info->fs_devices;
				2609	list_add(&device->dev_list, &fs_info->fs_devices->devices);
				2610	fs_info->fs_devices->num_devices++;
				2611	fs_info->fs_devices->open_devices++;
				2612	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				2613
				2614	*device_out = device;
				2615	return ret;
				2616
				2617	error:
				2618	blkdev_put(bdev, FMODE_EXCL);
				2619	return ret;
				2620	}
				2621
				2622	void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
				2623	struct btrfs_device *tgtdev)
				2624	{
				2625	u32 sectorsize = fs_info->sectorsize;
				2626
				2627	WARN_ON(fs_info->fs_devices->rw_devices == 0);
				2628	tgtdev->io_width = sectorsize;
				2629	tgtdev->io_align = sectorsize;
				2630	tgtdev->sector_size = sectorsize;
				2631	tgtdev->fs_info = fs_info;
				2632	tgtdev->in_fs_metadata = 1;
				2633	}
				2634
				2635	static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
				2636	struct btrfs_device *device)
				2637	{
				2638	int ret;
				2639	struct btrfs_path *path;
				2640	struct btrfs_root *root = device->fs_info->chunk_root;
				2641	struct btrfs_dev_item *dev_item;
				2642	struct extent_buffer *leaf;
				2643	struct btrfs_key key;
				2644
				2645	path = btrfs_alloc_path();
				2646	if (!path)
				2647	return -ENOMEM;
				2648
				2649	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
				2650	key.type = BTRFS_DEV_ITEM_KEY;
				2651	key.offset = device->devid;
				2652
				2653	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
				2654	if (ret < 0)
				2655	goto out;
				2656
				2657	if (ret > 0) {
				2658	ret = -ENOENT;
				2659	goto out;
				2660	}
				2661
				2662	leaf = path->nodes[0];
				2663	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
				2664
				2665	btrfs_set_device_id(leaf, dev_item, device->devid);
				2666	btrfs_set_device_type(leaf, dev_item, device->type);
				2667	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
				2668	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
				2669	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
				2670	btrfs_set_device_total_bytes(leaf, dev_item,
				2671	btrfs_device_get_disk_total_bytes(device));
				2672	btrfs_set_device_bytes_used(leaf, dev_item,
				2673	btrfs_device_get_bytes_used(device));
				2674	btrfs_mark_buffer_dirty(leaf);
				2675
				2676	out:
				2677	btrfs_free_path(path);
				2678	return ret;
				2679	}
				2680
				2681	int btrfs_grow_device(struct btrfs_trans_handle *trans,
				2682	struct btrfs_device *device, u64 new_size)
				2683	{
				2684	struct btrfs_fs_info *fs_info = device->fs_info;
				2685	struct btrfs_super_block *super_copy = fs_info->super_copy;
				2686	struct btrfs_fs_devices *fs_devices;
				2687	u64 old_total;
				2688	u64 diff;
				2689
				2690	if (!device->writeable)
				2691	return -EACCES;
				2692
				2693	new_size = round_down(new_size, fs_info->sectorsize);
				2694
				2695	mutex_lock(&fs_info->chunk_mutex);
				2696	old_total = btrfs_super_total_bytes(super_copy);
				2697	diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
				2698
				2699	if (new_size <= device->total_bytes \|\|
				2700	device->is_tgtdev_for_dev_replace) {
				2701	mutex_unlock(&fs_info->chunk_mutex);
				2702	return -EINVAL;
				2703	}
				2704
				2705	fs_devices = fs_info->fs_devices;
				2706
				2707	btrfs_set_super_total_bytes(super_copy,
				2708	round_down(old_total + diff, fs_info->sectorsize));
				2709	device->fs_devices->total_rw_bytes += diff;
				2710
				2711	btrfs_device_set_total_bytes(device, new_size);
				2712	btrfs_device_set_disk_total_bytes(device, new_size);
				2713	btrfs_clear_space_info_full(device->fs_info);
				2714	if (list_empty(&device->resized_list))
				2715	list_add_tail(&device->resized_list,
				2716	&fs_devices->resized_devices);
				2717	mutex_unlock(&fs_info->chunk_mutex);
				2718
				2719	return btrfs_update_device(trans, device);
				2720	}
				2721
				2722	static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
				2723	struct btrfs_fs_info *fs_info, u64 chunk_offset)
				2724	{
				2725	struct btrfs_root *root = fs_info->chunk_root;
				2726	int ret;
				2727	struct btrfs_path *path;
				2728	struct btrfs_key key;
				2729
				2730	path = btrfs_alloc_path();
				2731	if (!path)
				2732	return -ENOMEM;
				2733
				2734	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
				2735	key.offset = chunk_offset;
				2736	key.type = BTRFS_CHUNK_ITEM_KEY;
				2737
				2738	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				2739	if (ret < 0)
				2740	goto out;
				2741	else if (ret > 0) { /* Logic error or corruption */
				2742	btrfs_handle_fs_error(fs_info, -ENOENT,
				2743	"Failed lookup while freeing chunk.");
				2744	ret = -ENOENT;
				2745	goto out;
				2746	}
				2747
				2748	ret = btrfs_del_item(trans, root, path);
				2749	if (ret < 0)
				2750	btrfs_handle_fs_error(fs_info, ret,
				2751	"Failed to delete chunk item.");
				2752	out:
				2753	btrfs_free_path(path);
				2754	return ret;
				2755	}
				2756
				2757	static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
				2758	{
				2759	struct btrfs_super_block *super_copy = fs_info->super_copy;
				2760	struct btrfs_disk_key *disk_key;
				2761	struct btrfs_chunk *chunk;
				2762	u8 *ptr;
				2763	int ret = 0;
				2764	u32 num_stripes;
				2765	u32 array_size;
				2766	u32 len = 0;
				2767	u32 cur;
				2768	struct btrfs_key key;
				2769
				2770	mutex_lock(&fs_info->chunk_mutex);
				2771	array_size = btrfs_super_sys_array_size(super_copy);
				2772
				2773	ptr = super_copy->sys_chunk_array;
				2774	cur = 0;
				2775
				2776	while (cur < array_size) {
				2777	disk_key = (struct btrfs_disk_key *)ptr;
				2778	btrfs_disk_key_to_cpu(&key, disk_key);
				2779
				2780	len = sizeof(*disk_key);
				2781
				2782	if (key.type == BTRFS_CHUNK_ITEM_KEY) {
				2783	chunk = (struct btrfs_chunk *)(ptr + len);
				2784	num_stripes = btrfs_stack_chunk_num_stripes(chunk);
				2785	len += btrfs_chunk_item_size(num_stripes);
				2786	} else {
				2787	ret = -EIO;
				2788	break;
				2789	}
				2790	if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
				2791	key.offset == chunk_offset) {
				2792	memmove(ptr, ptr + len, array_size - (cur + len));
				2793	array_size -= len;
				2794	btrfs_set_super_sys_array_size(super_copy, array_size);
				2795	} else {
				2796	ptr += len;
				2797	cur += len;
				2798	}
				2799	}
				2800	mutex_unlock(&fs_info->chunk_mutex);
				2801	return ret;
				2802	}
				2803
				2804	static struct extent_map get_chunk_map(struct btrfs_fs_info fs_info,
				2805	u64 logical, u64 length)
				2806	{
				2807	struct extent_map_tree *em_tree;
				2808	struct extent_map *em;
				2809
				2810	em_tree = &fs_info->mapping_tree.map_tree;
				2811	read_lock(&em_tree->lock);
				2812	em = lookup_extent_mapping(em_tree, logical, length);
				2813	read_unlock(&em_tree->lock);
				2814
				2815	if (!em) {
				2816	btrfs_crit(fs_info, "unable to find logical %llu length %llu",
				2817	logical, length);
				2818	return ERR_PTR(-EINVAL);
				2819	}
				2820
				2821	if (em->start > logical \|\| em->start + em->len < logical) {
				2822	btrfs_crit(fs_info,
				2823	"found a bad mapping, wanted %llu-%llu, found %llu-%llu",
				2824	logical, length, em->start, em->start + em->len);
				2825	free_extent_map(em);
				2826	return ERR_PTR(-EINVAL);
				2827	}
				2828
				2829	/* callers are responsible for dropping em's ref. */
				2830	return em;
				2831	}
				2832
				2833	int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
				2834	struct btrfs_fs_info *fs_info, u64 chunk_offset)
				2835	{
				2836	struct extent_map *em;
				2837	struct map_lookup *map;
				2838	u64 dev_extent_len = 0;
				2839	int i, ret = 0;
				2840	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				2841
				2842	em = get_chunk_map(fs_info, chunk_offset, 1);
				2843	if (IS_ERR(em)) {
				2844	/*
				2845	* This is a logic error, but we don't want to just rely on the
				2846	* user having built with ASSERT enabled, so if ASSERT doesn't
				2847	* do anything we still error out.
				2848	*/
				2849	ASSERT(0);
				2850	return PTR_ERR(em);
				2851	}
				2852	map = em->map_lookup;
				2853	mutex_lock(&fs_info->chunk_mutex);
				2854	check_system_chunk(trans, fs_info, map->type);
				2855	mutex_unlock(&fs_info->chunk_mutex);
				2856
				2857	/*
				2858	* Take the device list mutex to prevent races with the final phase of
				2859	* a device replace operation that replaces the device object associated
				2860	* with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
				2861	*/
				2862	mutex_lock(&fs_devices->device_list_mutex);
				2863	for (i = 0; i < map->num_stripes; i++) {
				2864	struct btrfs_device *device = map->stripes[i].dev;
				2865	ret = btrfs_free_dev_extent(trans, device,
				2866	map->stripes[i].physical,
				2867	&dev_extent_len);
				2868	if (ret) {
				2869	mutex_unlock(&fs_devices->device_list_mutex);
				2870	btrfs_abort_transaction(trans, ret);
				2871	goto out;
				2872	}
				2873
				2874	if (device->bytes_used > 0) {
				2875	mutex_lock(&fs_info->chunk_mutex);
				2876	btrfs_device_set_bytes_used(device,
				2877	device->bytes_used - dev_extent_len);
				2878	atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
				2879	btrfs_clear_space_info_full(fs_info);
				2880	mutex_unlock(&fs_info->chunk_mutex);
				2881	}
				2882
				2883	if (map->stripes[i].dev) {
				2884	ret = btrfs_update_device(trans, map->stripes[i].dev);
				2885	if (ret) {
				2886	mutex_unlock(&fs_devices->device_list_mutex);
				2887	btrfs_abort_transaction(trans, ret);
				2888	goto out;
				2889	}
				2890	}
				2891	}
				2892	mutex_unlock(&fs_devices->device_list_mutex);
				2893
				2894	ret = btrfs_free_chunk(trans, fs_info, chunk_offset);
				2895	if (ret) {
				2896	btrfs_abort_transaction(trans, ret);
				2897	goto out;
				2898	}
				2899
				2900	trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
				2901
				2902	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
				2903	ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
				2904	if (ret) {
				2905	btrfs_abort_transaction(trans, ret);
				2906	goto out;
				2907	}
				2908	}
				2909
				2910	ret = btrfs_remove_block_group(trans, fs_info, chunk_offset, em);
				2911	if (ret) {
				2912	btrfs_abort_transaction(trans, ret);
				2913	goto out;
				2914	}
				2915
				2916	out:
				2917	/* once for us */
				2918	free_extent_map(em);
				2919	return ret;
				2920	}
				2921
				2922	static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
				2923	{
				2924	struct btrfs_root *root = fs_info->chunk_root;
				2925	struct btrfs_trans_handle *trans;
				2926	int ret;
				2927
				2928	/*
				2929	* Prevent races with automatic removal of unused block groups.
				2930	* After we relocate and before we remove the chunk with offset
				2931	* chunk_offset, automatic removal of the block group can kick in,
				2932	* resulting in a failure when calling btrfs_remove_chunk() below.
				2933	*
				2934	* Make sure to acquire this mutex before doing a tree search (dev
				2935	* or chunk trees) to find chunks. Otherwise the cleaner kthread might
				2936	* call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
				2937	* we release the path used to search the chunk/dev tree and before
				2938	* the current task acquires this mutex and calls us.
				2939	*/
				2940	ASSERT(mutex_is_locked(&fs_info->delete_unused_bgs_mutex));
				2941
				2942	ret = btrfs_can_relocate(fs_info, chunk_offset);
				2943	if (ret)
				2944	return -ENOSPC;
				2945
				2946	/* step one, relocate all the extents inside this chunk */
				2947	btrfs_scrub_pause(fs_info);
				2948	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
				2949	btrfs_scrub_continue(fs_info);
				2950	if (ret)
				2951	return ret;
				2952
				2953	trans = btrfs_start_trans_remove_block_group(root->fs_info,
				2954	chunk_offset);
				2955	if (IS_ERR(trans)) {
				2956	ret = PTR_ERR(trans);
				2957	btrfs_handle_fs_error(root->fs_info, ret, NULL);
				2958	return ret;
				2959	}
				2960
				2961	/*
				2962	* step two, delete the device extents and the
				2963	* chunk tree entries
				2964	*/
				2965	ret = btrfs_remove_chunk(trans, fs_info, chunk_offset);
				2966	btrfs_end_transaction(trans);
				2967	return ret;
				2968	}
				2969
				2970	static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
				2971	{
				2972	struct btrfs_root *chunk_root = fs_info->chunk_root;
				2973	struct btrfs_path *path;
				2974	struct extent_buffer *leaf;
				2975	struct btrfs_chunk *chunk;
				2976	struct btrfs_key key;
				2977	struct btrfs_key found_key;
				2978	u64 chunk_type;
				2979	bool retried = false;
				2980	int failed = 0;
				2981	int ret;
				2982
				2983	path = btrfs_alloc_path();
				2984	if (!path)
				2985	return -ENOMEM;
				2986
				2987	again:
				2988	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
				2989	key.offset = (u64)-1;
				2990	key.type = BTRFS_CHUNK_ITEM_KEY;
				2991
				2992	while (1) {
				2993	mutex_lock(&fs_info->delete_unused_bgs_mutex);
				2994	ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
				2995	if (ret < 0) {
				2996	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				2997	goto error;
				2998	}
				2999	BUG_ON(ret == 0); /* Corruption */
				3000
				3001	ret = btrfs_previous_item(chunk_root, path, key.objectid,
				3002	key.type);
				3003	if (ret)
				3004	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3005	if (ret < 0)
				3006	goto error;
				3007	if (ret > 0)
				3008	break;
				3009
				3010	leaf = path->nodes[0];
				3011	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
				3012
				3013	chunk = btrfs_item_ptr(leaf, path->slots[0],
				3014	struct btrfs_chunk);
				3015	chunk_type = btrfs_chunk_type(leaf, chunk);
				3016	btrfs_release_path(path);
				3017
				3018	if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
				3019	ret = btrfs_relocate_chunk(fs_info, found_key.offset);
				3020	if (ret == -ENOSPC)
				3021	failed++;
				3022	else
				3023	BUG_ON(ret);
				3024	}
				3025	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3026
				3027	if (found_key.offset == 0)
				3028	break;
				3029	key.offset = found_key.offset - 1;
				3030	}
				3031	ret = 0;
				3032	if (failed && !retried) {
				3033	failed = 0;
				3034	retried = true;
				3035	goto again;
				3036	} else if (WARN_ON(failed && retried)) {
				3037	ret = -ENOSPC;
				3038	}
				3039	error:
				3040	btrfs_free_path(path);
				3041	return ret;
				3042	}
				3043
				3044	static int insert_balance_item(struct btrfs_fs_info *fs_info,
				3045	struct btrfs_balance_control *bctl)
				3046	{
				3047	struct btrfs_root *root = fs_info->tree_root;
				3048	struct btrfs_trans_handle *trans;
				3049	struct btrfs_balance_item *item;
				3050	struct btrfs_disk_balance_args disk_bargs;
				3051	struct btrfs_path *path;
				3052	struct extent_buffer *leaf;
				3053	struct btrfs_key key;
				3054	int ret, err;
				3055
				3056	path = btrfs_alloc_path();
				3057	if (!path)
				3058	return -ENOMEM;
				3059
				3060	trans = btrfs_start_transaction(root, 0);
				3061	if (IS_ERR(trans)) {
				3062	btrfs_free_path(path);
				3063	return PTR_ERR(trans);
				3064	}
				3065
				3066	key.objectid = BTRFS_BALANCE_OBJECTID;
				3067	key.type = BTRFS_TEMPORARY_ITEM_KEY;
				3068	key.offset = 0;
				3069
				3070	ret = btrfs_insert_empty_item(trans, root, path, &key,
				3071	sizeof(*item));
				3072	if (ret)
				3073	goto out;
				3074
				3075	leaf = path->nodes[0];
				3076	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
				3077
				3078	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
				3079
				3080	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
				3081	btrfs_set_balance_data(leaf, item, &disk_bargs);
				3082	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
				3083	btrfs_set_balance_meta(leaf, item, &disk_bargs);
				3084	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
				3085	btrfs_set_balance_sys(leaf, item, &disk_bargs);
				3086
				3087	btrfs_set_balance_flags(leaf, item, bctl->flags);
				3088
				3089	btrfs_mark_buffer_dirty(leaf);
				3090	out:
				3091	btrfs_free_path(path);
				3092	err = btrfs_commit_transaction(trans);
				3093	if (err && !ret)
				3094	ret = err;
				3095	return ret;
				3096	}
				3097
				3098	static int del_balance_item(struct btrfs_fs_info *fs_info)
				3099	{
				3100	struct btrfs_root *root = fs_info->tree_root;
				3101	struct btrfs_trans_handle *trans;
				3102	struct btrfs_path *path;
				3103	struct btrfs_key key;
				3104	int ret, err;
				3105
				3106	path = btrfs_alloc_path();
				3107	if (!path)
				3108	return -ENOMEM;
				3109
				3110	trans = btrfs_start_transaction(root, 0);
				3111	if (IS_ERR(trans)) {
				3112	btrfs_free_path(path);
				3113	return PTR_ERR(trans);
				3114	}
				3115
				3116	key.objectid = BTRFS_BALANCE_OBJECTID;
				3117	key.type = BTRFS_TEMPORARY_ITEM_KEY;
				3118	key.offset = 0;
				3119
				3120	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				3121	if (ret < 0)
				3122	goto out;
				3123	if (ret > 0) {
				3124	ret = -ENOENT;
				3125	goto out;
				3126	}
				3127
				3128	ret = btrfs_del_item(trans, root, path);
				3129	out:
				3130	btrfs_free_path(path);
				3131	err = btrfs_commit_transaction(trans);
				3132	if (err && !ret)
				3133	ret = err;
				3134	return ret;
				3135	}
				3136
				3137	/*
				3138	* This is a heuristic used to reduce the number of chunks balanced on
				3139	* resume after balance was interrupted.
				3140	*/
				3141	static void update_balance_args(struct btrfs_balance_control *bctl)
				3142	{
				3143	/*
				3144	* Turn on soft mode for chunk types that were being converted.
				3145	*/
				3146	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
				3147	bctl->data.flags \|= BTRFS_BALANCE_ARGS_SOFT;
				3148	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
				3149	bctl->sys.flags \|= BTRFS_BALANCE_ARGS_SOFT;
				3150	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
				3151	bctl->meta.flags \|= BTRFS_BALANCE_ARGS_SOFT;
				3152
				3153	/*
				3154	* Turn on usage filter if is not already used. The idea is
				3155	* that chunks that we have already balanced should be
				3156	* reasonably full. Don't do it for chunks that are being
				3157	* converted - that will keep us from relocating unconverted
				3158	* (albeit full) chunks.
				3159	*/
				3160	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
				3161	!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
				3162	!(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
				3163	bctl->data.flags \|= BTRFS_BALANCE_ARGS_USAGE;
				3164	bctl->data.usage = 90;
				3165	}
				3166	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
				3167	!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
				3168	!(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
				3169	bctl->sys.flags \|= BTRFS_BALANCE_ARGS_USAGE;
				3170	bctl->sys.usage = 90;
				3171	}
				3172	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
				3173	!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
				3174	!(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
				3175	bctl->meta.flags \|= BTRFS_BALANCE_ARGS_USAGE;
				3176	bctl->meta.usage = 90;
				3177	}
				3178	}
				3179
				3180	/*
				3181	* Should be called with both balance and volume mutexes held to
				3182	* serialize other volume operations (add_dev/rm_dev/resize) with
				3183	* restriper. Same goes for unset_balance_control.
				3184	*/
				3185	static void set_balance_control(struct btrfs_balance_control *bctl)
				3186	{
				3187	struct btrfs_fs_info *fs_info = bctl->fs_info;
				3188
				3189	BUG_ON(fs_info->balance_ctl);
				3190
				3191	spin_lock(&fs_info->balance_lock);
				3192	fs_info->balance_ctl = bctl;
				3193	spin_unlock(&fs_info->balance_lock);
				3194	}
				3195
				3196	static void unset_balance_control(struct btrfs_fs_info *fs_info)
				3197	{
				3198	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
				3199
				3200	BUG_ON(!fs_info->balance_ctl);
				3201
				3202	spin_lock(&fs_info->balance_lock);
				3203	fs_info->balance_ctl = NULL;
				3204	spin_unlock(&fs_info->balance_lock);
				3205
				3206	kfree(bctl);
				3207	}
				3208
				3209	/*
				3210	* Balance filters. Return 1 if chunk should be filtered out
				3211	* (should not be balanced).
				3212	*/
				3213	static int chunk_profiles_filter(u64 chunk_type,
				3214	struct btrfs_balance_args *bargs)
				3215	{
				3216	chunk_type = chunk_to_extended(chunk_type) &
				3217	BTRFS_EXTENDED_PROFILE_MASK;
				3218
				3219	if (bargs->profiles & chunk_type)
				3220	return 0;
				3221
				3222	return 1;
				3223	}
				3224
				3225	static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
				3226	struct btrfs_balance_args *bargs)
				3227	{
				3228	struct btrfs_block_group_cache *cache;
				3229	u64 chunk_used;
				3230	u64 user_thresh_min;
				3231	u64 user_thresh_max;
				3232	int ret = 1;
				3233
				3234	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
				3235	chunk_used = btrfs_block_group_used(&cache->item);
				3236
				3237	if (bargs->usage_min == 0)
				3238	user_thresh_min = 0;
				3239	else
				3240	user_thresh_min = div_factor_fine(cache->key.offset,
				3241	bargs->usage_min);
				3242
				3243	if (bargs->usage_max == 0)
				3244	user_thresh_max = 1;
				3245	else if (bargs->usage_max > 100)
				3246	user_thresh_max = cache->key.offset;
				3247	else
				3248	user_thresh_max = div_factor_fine(cache->key.offset,
				3249	bargs->usage_max);
				3250
				3251	if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
				3252	ret = 0;
				3253
				3254	btrfs_put_block_group(cache);
				3255	return ret;
				3256	}
				3257
				3258	static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
				3259	u64 chunk_offset, struct btrfs_balance_args *bargs)
				3260	{
				3261	struct btrfs_block_group_cache *cache;
				3262	u64 chunk_used, user_thresh;
				3263	int ret = 1;
				3264
				3265	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
				3266	chunk_used = btrfs_block_group_used(&cache->item);
				3267
				3268	if (bargs->usage_min == 0)
				3269	user_thresh = 1;
				3270	else if (bargs->usage > 100)
				3271	user_thresh = cache->key.offset;
				3272	else
				3273	user_thresh = div_factor_fine(cache->key.offset,
				3274	bargs->usage);
				3275
				3276	if (chunk_used < user_thresh)
				3277	ret = 0;
				3278
				3279	btrfs_put_block_group(cache);
				3280	return ret;
				3281	}
				3282
				3283	static int chunk_devid_filter(struct extent_buffer *leaf,
				3284	struct btrfs_chunk *chunk,
				3285	struct btrfs_balance_args *bargs)
				3286	{
				3287	struct btrfs_stripe *stripe;
				3288	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
				3289	int i;
				3290
				3291	for (i = 0; i < num_stripes; i++) {
				3292	stripe = btrfs_stripe_nr(chunk, i);
				3293	if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
				3294	return 0;
				3295	}
				3296
				3297	return 1;
				3298	}
				3299
				3300	/* [pstart, pend) */
				3301	static int chunk_drange_filter(struct extent_buffer *leaf,
				3302	struct btrfs_chunk *chunk,
				3303	struct btrfs_balance_args *bargs)
				3304	{
				3305	struct btrfs_stripe *stripe;
				3306	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
				3307	u64 stripe_offset;
				3308	u64 stripe_length;
				3309	int factor;
				3310	int i;
				3311
				3312	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
				3313	return 0;
				3314
				3315	if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP \|
				3316	BTRFS_BLOCK_GROUP_RAID1 \| BTRFS_BLOCK_GROUP_RAID10)) {
				3317	factor = num_stripes / 2;
				3318	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
				3319	factor = num_stripes - 1;
				3320	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
				3321	factor = num_stripes - 2;
				3322	} else {
				3323	factor = num_stripes;
				3324	}
				3325
				3326	for (i = 0; i < num_stripes; i++) {
				3327	stripe = btrfs_stripe_nr(chunk, i);
				3328	if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
				3329	continue;
				3330
				3331	stripe_offset = btrfs_stripe_offset(leaf, stripe);
				3332	stripe_length = btrfs_chunk_length(leaf, chunk);
				3333	stripe_length = div_u64(stripe_length, factor);
				3334
				3335	if (stripe_offset < bargs->pend &&
				3336	stripe_offset + stripe_length > bargs->pstart)
				3337	return 0;
				3338	}
				3339
				3340	return 1;
				3341	}
				3342
				3343	/* [vstart, vend) */
				3344	static int chunk_vrange_filter(struct extent_buffer *leaf,
				3345	struct btrfs_chunk *chunk,
				3346	u64 chunk_offset,
				3347	struct btrfs_balance_args *bargs)
				3348	{
				3349	if (chunk_offset < bargs->vend &&
				3350	chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
				3351	/* at least part of the chunk is inside this vrange */
				3352	return 0;
				3353
				3354	return 1;
				3355	}
				3356
				3357	static int chunk_stripes_range_filter(struct extent_buffer *leaf,
				3358	struct btrfs_chunk *chunk,
				3359	struct btrfs_balance_args *bargs)
				3360	{
				3361	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
				3362
				3363	if (bargs->stripes_min <= num_stripes
				3364	&& num_stripes <= bargs->stripes_max)
				3365	return 0;
				3366
				3367	return 1;
				3368	}
				3369
				3370	static int chunk_soft_convert_filter(u64 chunk_type,
				3371	struct btrfs_balance_args *bargs)
				3372	{
				3373	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
				3374	return 0;
				3375
				3376	chunk_type = chunk_to_extended(chunk_type) &
				3377	BTRFS_EXTENDED_PROFILE_MASK;
				3378
				3379	if (bargs->target == chunk_type)
				3380	return 1;
				3381
				3382	return 0;
				3383	}
				3384
				3385	static int should_balance_chunk(struct btrfs_fs_info *fs_info,
				3386	struct extent_buffer *leaf,
				3387	struct btrfs_chunk *chunk, u64 chunk_offset)
				3388	{
				3389	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
				3390	struct btrfs_balance_args *bargs = NULL;
				3391	u64 chunk_type = btrfs_chunk_type(leaf, chunk);
				3392
				3393	/* type filter */
				3394	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
				3395	(bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
				3396	return 0;
				3397	}
				3398
				3399	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
				3400	bargs = &bctl->data;
				3401	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
				3402	bargs = &bctl->sys;
				3403	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
				3404	bargs = &bctl->meta;
				3405
				3406	/* profiles filter */
				3407	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
				3408	chunk_profiles_filter(chunk_type, bargs)) {
				3409	return 0;
				3410	}
				3411
				3412	/* usage filter */
				3413	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
				3414	chunk_usage_filter(fs_info, chunk_offset, bargs)) {
				3415	return 0;
				3416	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
				3417	chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
				3418	return 0;
				3419	}
				3420
				3421	/* devid filter */
				3422	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
				3423	chunk_devid_filter(leaf, chunk, bargs)) {
				3424	return 0;
				3425	}
				3426
				3427	/* drange filter, makes sense only with devid filter */
				3428	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
				3429	chunk_drange_filter(leaf, chunk, bargs)) {
				3430	return 0;
				3431	}
				3432
				3433	/* vrange filter */
				3434	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
				3435	chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
				3436	return 0;
				3437	}
				3438
				3439	/* stripes filter */
				3440	if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
				3441	chunk_stripes_range_filter(leaf, chunk, bargs)) {
				3442	return 0;
				3443	}
				3444
				3445	/* soft profile changing mode */
				3446	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
				3447	chunk_soft_convert_filter(chunk_type, bargs)) {
				3448	return 0;
				3449	}
				3450
				3451	/*
				3452	* limited by count, must be the last filter
				3453	*/
				3454	if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
				3455	if (bargs->limit == 0)
				3456	return 0;
				3457	else
				3458	bargs->limit--;
				3459	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
				3460	/*
				3461	* Same logic as the 'limit' filter; the minimum cannot be
				3462	* determined here because we do not have the global information
				3463	* about the count of all chunks that satisfy the filters.
				3464	*/
				3465	if (bargs->limit_max == 0)
				3466	return 0;
				3467	else
				3468	bargs->limit_max--;
				3469	}
				3470
				3471	return 1;
				3472	}
				3473
				3474	static int __btrfs_balance(struct btrfs_fs_info *fs_info)
				3475	{
				3476	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
				3477	struct btrfs_root *chunk_root = fs_info->chunk_root;
				3478	struct btrfs_root *dev_root = fs_info->dev_root;
				3479	struct list_head *devices;
				3480	struct btrfs_device *device;
				3481	u64 old_size;
				3482	u64 size_to_free;
				3483	u64 chunk_type;
				3484	struct btrfs_chunk *chunk;
				3485	struct btrfs_path *path = NULL;
				3486	struct btrfs_key key;
				3487	struct btrfs_key found_key;
				3488	struct btrfs_trans_handle *trans;
				3489	struct extent_buffer *leaf;
				3490	int slot;
				3491	int ret;
				3492	int enospc_errors = 0;
				3493	bool counting = true;
				3494	/* The single value limit and min/max limits use the same bytes in the */
				3495	u64 limit_data = bctl->data.limit;
				3496	u64 limit_meta = bctl->meta.limit;
				3497	u64 limit_sys = bctl->sys.limit;
				3498	u32 count_data = 0;
				3499	u32 count_meta = 0;
				3500	u32 count_sys = 0;
				3501	int chunk_reserved = 0;
				3502	u64 bytes_used = 0;
				3503
				3504	/* step one make some room on all the devices */
				3505	devices = &fs_info->fs_devices->devices;
				3506	list_for_each_entry(device, devices, dev_list) {
				3507	old_size = btrfs_device_get_total_bytes(device);
				3508	size_to_free = div_factor(old_size, 1);
				3509	size_to_free = min_t(u64, size_to_free, SZ_1M);
				3510	if (!device->writeable \|\|
				3511	btrfs_device_get_total_bytes(device) -
				3512	btrfs_device_get_bytes_used(device) > size_to_free \|\|
				3513	device->is_tgtdev_for_dev_replace)
				3514	continue;
				3515
				3516	ret = btrfs_shrink_device(device, old_size - size_to_free);
				3517	if (ret == -ENOSPC)
				3518	break;
				3519	if (ret) {
				3520	/* btrfs_shrink_device never returns ret > 0 */
				3521	WARN_ON(ret > 0);
				3522	goto error;
				3523	}
				3524
				3525	trans = btrfs_start_transaction(dev_root, 0);
				3526	if (IS_ERR(trans)) {
				3527	ret = PTR_ERR(trans);
				3528	btrfs_info_in_rcu(fs_info,
				3529	"resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu",
				3530	rcu_str_deref(device->name), ret,
				3531	old_size, old_size - size_to_free);
				3532	goto error;
				3533	}
				3534
				3535	ret = btrfs_grow_device(trans, device, old_size);
				3536	if (ret) {
				3537	btrfs_end_transaction(trans);
				3538	/* btrfs_grow_device never returns ret > 0 */
				3539	WARN_ON(ret > 0);
				3540	btrfs_info_in_rcu(fs_info,
				3541	"resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu",
				3542	rcu_str_deref(device->name), ret,
				3543	old_size, old_size - size_to_free);
				3544	goto error;
				3545	}
				3546
				3547	btrfs_end_transaction(trans);
				3548	}
				3549
				3550	/* step two, relocate all the chunks */
				3551	path = btrfs_alloc_path();
				3552	if (!path) {
				3553	ret = -ENOMEM;
				3554	goto error;
				3555	}
				3556
				3557	/* zero out stat counters */
				3558	spin_lock(&fs_info->balance_lock);
				3559	memset(&bctl->stat, 0, sizeof(bctl->stat));
				3560	spin_unlock(&fs_info->balance_lock);
				3561	again:
				3562	if (!counting) {
				3563	/*
				3564	* The single value limit and min/max limits use the same bytes
				3565	* in the
				3566	*/
				3567	bctl->data.limit = limit_data;
				3568	bctl->meta.limit = limit_meta;
				3569	bctl->sys.limit = limit_sys;
				3570	}
				3571	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
				3572	key.offset = (u64)-1;
				3573	key.type = BTRFS_CHUNK_ITEM_KEY;
				3574
				3575	while (1) {
				3576	if ((!counting && atomic_read(&fs_info->balance_pause_req)) \|\|
				3577	atomic_read(&fs_info->balance_cancel_req)) {
				3578	ret = -ECANCELED;
				3579	goto error;
				3580	}
				3581
				3582	mutex_lock(&fs_info->delete_unused_bgs_mutex);
				3583	ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
				3584	if (ret < 0) {
				3585	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3586	goto error;
				3587	}
				3588
				3589	/*
				3590	* this shouldn't happen, it means the last relocate
				3591	* failed
				3592	*/
				3593	if (ret == 0)
				3594	BUG(); /* FIXME break ? */
				3595
				3596	ret = btrfs_previous_item(chunk_root, path, 0,
				3597	BTRFS_CHUNK_ITEM_KEY);
				3598	if (ret) {
				3599	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3600	ret = 0;
				3601	break;
				3602	}
				3603
				3604	leaf = path->nodes[0];
				3605	slot = path->slots[0];
				3606	btrfs_item_key_to_cpu(leaf, &found_key, slot);
				3607
				3608	if (found_key.objectid != key.objectid) {
				3609	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3610	break;
				3611	}
				3612
				3613	chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
				3614	chunk_type = btrfs_chunk_type(leaf, chunk);
				3615
				3616	if (!counting) {
				3617	spin_lock(&fs_info->balance_lock);
				3618	bctl->stat.considered++;
				3619	spin_unlock(&fs_info->balance_lock);
				3620	}
				3621
				3622	ret = should_balance_chunk(fs_info, leaf, chunk,
				3623	found_key.offset);
				3624
				3625	btrfs_release_path(path);
				3626	if (!ret) {
				3627	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3628	goto loop;
				3629	}
				3630
				3631	if (counting) {
				3632	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3633	spin_lock(&fs_info->balance_lock);
				3634	bctl->stat.expected++;
				3635	spin_unlock(&fs_info->balance_lock);
				3636
				3637	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
				3638	count_data++;
				3639	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
				3640	count_sys++;
				3641	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
				3642	count_meta++;
				3643
				3644	goto loop;
				3645	}
				3646
				3647	/*
				3648	* Apply limit_min filter, no need to check if the LIMITS
				3649	* filter is used, limit_min is 0 by default
				3650	*/
				3651	if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
				3652	count_data < bctl->data.limit_min)
				3653	\|\| ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
				3654	count_meta < bctl->meta.limit_min)
				3655	\|\| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
				3656	count_sys < bctl->sys.limit_min)) {
				3657	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3658	goto loop;
				3659	}
				3660
				3661	ASSERT(fs_info->data_sinfo);
				3662	spin_lock(&fs_info->data_sinfo->lock);
				3663	bytes_used = fs_info->data_sinfo->bytes_used;
				3664	spin_unlock(&fs_info->data_sinfo->lock);
				3665
				3666	if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
				3667	!chunk_reserved && !bytes_used) {
				3668	trans = btrfs_start_transaction(chunk_root, 0);
				3669	if (IS_ERR(trans)) {
				3670	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3671	ret = PTR_ERR(trans);
				3672	goto error;
				3673	}
				3674
				3675	ret = btrfs_force_chunk_alloc(trans, fs_info,
				3676	BTRFS_BLOCK_GROUP_DATA);
				3677	btrfs_end_transaction(trans);
				3678	if (ret < 0) {
				3679	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3680	goto error;
				3681	}
				3682	chunk_reserved = 1;
				3683	}
				3684
				3685	ret = btrfs_relocate_chunk(fs_info, found_key.offset);
				3686	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3687	if (ret && ret != -ENOSPC)
				3688	goto error;
				3689	if (ret == -ENOSPC) {
				3690	enospc_errors++;
				3691	} else {
				3692	spin_lock(&fs_info->balance_lock);
				3693	bctl->stat.completed++;
				3694	spin_unlock(&fs_info->balance_lock);
				3695	}
				3696	loop:
				3697	if (found_key.offset == 0)
				3698	break;
				3699	key.offset = found_key.offset - 1;
				3700	}
				3701
				3702	if (counting) {
				3703	btrfs_release_path(path);
				3704	counting = false;
				3705	goto again;
				3706	}
				3707	error:
				3708	btrfs_free_path(path);
				3709	if (enospc_errors) {
				3710	btrfs_info(fs_info, "%d enospc errors during balance",
				3711	enospc_errors);
				3712	if (!ret)
				3713	ret = -ENOSPC;
				3714	}
				3715
				3716	return ret;
				3717	}
				3718
				3719	/**
				3720	* alloc_profile_is_valid - see if a given profile is valid and reduced
				3721	* @flags: profile to validate
				3722	* @extended: if true @flags is treated as an extended profile
				3723	*/
				3724	static int alloc_profile_is_valid(u64 flags, int extended)
				3725	{
				3726	u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
				3727	BTRFS_BLOCK_GROUP_PROFILE_MASK);
				3728
				3729	flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
				3730
				3731	/* 1) check that all other bits are zeroed */
				3732	if (flags & ~mask)
				3733	return 0;
				3734
				3735	/* 2) see if profile is reduced */
				3736	if (flags == 0)
				3737	return !extended; /* "0" is valid for usual profiles */
				3738
				3739	/* true if exactly one bit set */
				3740	return (flags & (flags - 1)) == 0;
				3741	}
				3742
				3743	static inline int balance_need_close(struct btrfs_fs_info *fs_info)
				3744	{
				3745	/* cancel requested \|\| normal exit path */
				3746	return atomic_read(&fs_info->balance_cancel_req) \|\|
				3747	(atomic_read(&fs_info->balance_pause_req) == 0 &&
				3748	atomic_read(&fs_info->balance_cancel_req) == 0);
				3749	}
				3750
				3751	static void __cancel_balance(struct btrfs_fs_info *fs_info)
				3752	{
				3753	int ret;
				3754
				3755	unset_balance_control(fs_info);
				3756	ret = del_balance_item(fs_info);
				3757	if (ret)
				3758	btrfs_handle_fs_error(fs_info, ret, NULL);
				3759
				3760	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
				3761	}
				3762
				3763	/* Non-zero return value signifies invalidity */
				3764	static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
				3765	u64 allowed)
				3766	{
				3767	return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
				3768	(!alloc_profile_is_valid(bctl_arg->target, 1) \|\|
				3769	(bctl_arg->target & ~allowed)));
				3770	}
				3771
				3772	/*
				3773	* Should be called with both balance and volume mutexes held
				3774	*/
				3775	int btrfs_balance(struct btrfs_balance_control *bctl,
				3776	struct btrfs_ioctl_balance_args *bargs)
				3777	{
				3778	struct btrfs_fs_info *fs_info = bctl->fs_info;
				3779	u64 meta_target, data_target;
				3780	u64 allowed;
				3781	int mixed = 0;
				3782	int ret;
				3783	u64 num_devices;
				3784	unsigned seq;
				3785
				3786	if (btrfs_fs_closing(fs_info) \|\|
				3787	atomic_read(&fs_info->balance_pause_req) \|\|
				3788	atomic_read(&fs_info->balance_cancel_req)) {
				3789	ret = -EINVAL;
				3790	goto out;
				3791	}
				3792
				3793	allowed = btrfs_super_incompat_flags(fs_info->super_copy);
				3794	if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
				3795	mixed = 1;
				3796
				3797	/*
				3798	* In case of mixed groups both data and meta should be picked,
				3799	* and identical options should be given for both of them.
				3800	*/
				3801	allowed = BTRFS_BALANCE_DATA \| BTRFS_BALANCE_METADATA;
				3802	if (mixed && (bctl->flags & allowed)) {
				3803	if (!(bctl->flags & BTRFS_BALANCE_DATA) \|\|
				3804	!(bctl->flags & BTRFS_BALANCE_METADATA) \|\|
				3805	memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
				3806	btrfs_err(fs_info,
				3807	"with mixed groups data and metadata balance options must be the same");
				3808	ret = -EINVAL;
				3809	goto out;
				3810	}
				3811	}
				3812
				3813	num_devices = fs_info->fs_devices->num_devices;
				3814	btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
				3815	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
				3816	BUG_ON(num_devices < 1);
				3817	num_devices--;
				3818	}
				3819	btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
				3820	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE \| BTRFS_BLOCK_GROUP_DUP;
				3821	if (num_devices > 1)
				3822	allowed \|= (BTRFS_BLOCK_GROUP_RAID0 \| BTRFS_BLOCK_GROUP_RAID1);
				3823	if (num_devices > 2)
				3824	allowed \|= BTRFS_BLOCK_GROUP_RAID5;
				3825	if (num_devices > 3)
				3826	allowed \|= (BTRFS_BLOCK_GROUP_RAID10 \|
				3827	BTRFS_BLOCK_GROUP_RAID6);
				3828	if (validate_convert_profile(&bctl->data, allowed)) {
				3829	btrfs_err(fs_info,
				3830	"unable to start balance with target data profile %llu",
				3831	bctl->data.target);
				3832	ret = -EINVAL;
				3833	goto out;
				3834	}
				3835	if (validate_convert_profile(&bctl->meta, allowed)) {
				3836	btrfs_err(fs_info,
				3837	"unable to start balance with target metadata profile %llu",
				3838	bctl->meta.target);
				3839	ret = -EINVAL;
				3840	goto out;
				3841	}
				3842	if (validate_convert_profile(&bctl->sys, allowed)) {
				3843	btrfs_err(fs_info,
				3844	"unable to start balance with target system profile %llu",
				3845	bctl->sys.target);
				3846	ret = -EINVAL;
				3847	goto out;
				3848	}
				3849
				3850	/* allow to reduce meta or sys integrity only if force set */
				3851	allowed = BTRFS_BLOCK_GROUP_DUP \| BTRFS_BLOCK_GROUP_RAID1 \|
				3852	BTRFS_BLOCK_GROUP_RAID10 \|
				3853	BTRFS_BLOCK_GROUP_RAID5 \|
				3854	BTRFS_BLOCK_GROUP_RAID6;
				3855	do {
				3856	seq = read_seqbegin(&fs_info->profiles_lock);
				3857
				3858	if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
				3859	(fs_info->avail_system_alloc_bits & allowed) &&
				3860	!(bctl->sys.target & allowed)) \|\|
				3861	((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
				3862	(fs_info->avail_metadata_alloc_bits & allowed) &&
				3863	!(bctl->meta.target & allowed))) {
				3864	if (bctl->flags & BTRFS_BALANCE_FORCE) {
				3865	btrfs_info(fs_info,
				3866	"force reducing metadata integrity");
				3867	} else {
				3868	btrfs_err(fs_info,
				3869	"balance will reduce metadata integrity, use force if you want this");
				3870	ret = -EINVAL;
				3871	goto out;
				3872	}
				3873	}
				3874	} while (read_seqretry(&fs_info->profiles_lock, seq));
				3875
				3876	/* if we're not converting, the target field is uninitialized */
				3877	meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
				3878	bctl->meta.target : fs_info->avail_metadata_alloc_bits;
				3879	data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
				3880	bctl->data.target : fs_info->avail_data_alloc_bits;
				3881	if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
				3882	btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
				3883	btrfs_warn(fs_info,
				3884	"metadata profile 0x%llx has lower redundancy than data profile 0x%llx",
				3885	meta_target, data_target);
				3886	}
				3887
				3888	ret = insert_balance_item(fs_info, bctl);
				3889	if (ret && ret != -EEXIST)
				3890	goto out;
				3891
				3892	if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
				3893	BUG_ON(ret == -EEXIST);
				3894	set_balance_control(bctl);
				3895	} else {
				3896	BUG_ON(ret != -EEXIST);
				3897	spin_lock(&fs_info->balance_lock);
				3898	update_balance_args(bctl);
				3899	spin_unlock(&fs_info->balance_lock);
				3900	}
				3901
				3902	atomic_inc(&fs_info->balance_running);
				3903	mutex_unlock(&fs_info->balance_mutex);
				3904
				3905	ret = __btrfs_balance(fs_info);
				3906
				3907	mutex_lock(&fs_info->balance_mutex);
				3908	atomic_dec(&fs_info->balance_running);
				3909
				3910	if (bargs) {
				3911	memset(bargs, 0, sizeof(*bargs));
				3912	update_ioctl_balance_args(fs_info, 0, bargs);
				3913	}
				3914
				3915	if ((ret && ret != -ECANCELED && ret != -ENOSPC) \|\|
				3916	balance_need_close(fs_info)) {
				3917	__cancel_balance(fs_info);
				3918	}
				3919
				3920	wake_up(&fs_info->balance_wait_q);
				3921
				3922	return ret;
				3923	out:
				3924	if (bctl->flags & BTRFS_BALANCE_RESUME)
				3925	__cancel_balance(fs_info);
				3926	else {
				3927	kfree(bctl);
				3928	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
				3929	}
				3930	return ret;
				3931	}
				3932
				3933	static int balance_kthread(void *data)
				3934	{
				3935	struct btrfs_fs_info *fs_info = data;
				3936	int ret = 0;
				3937
				3938	mutex_lock(&fs_info->volume_mutex);
				3939	mutex_lock(&fs_info->balance_mutex);
				3940
				3941	if (fs_info->balance_ctl) {
				3942	btrfs_info(fs_info, "continuing balance");
				3943	ret = btrfs_balance(fs_info->balance_ctl, NULL);
				3944	}
				3945
				3946	mutex_unlock(&fs_info->balance_mutex);
				3947	mutex_unlock(&fs_info->volume_mutex);
				3948
				3949	return ret;
				3950	}
				3951
				3952	int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
				3953	{
				3954	struct task_struct *tsk;
				3955
				3956	spin_lock(&fs_info->balance_lock);
				3957	if (!fs_info->balance_ctl) {
				3958	spin_unlock(&fs_info->balance_lock);
				3959	return 0;
				3960	}
				3961	spin_unlock(&fs_info->balance_lock);
				3962
				3963	if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
				3964	btrfs_info(fs_info, "force skipping balance");
				3965	return 0;
				3966	}
				3967
				3968	/*
				3969	* A ro->rw remount sequence should continue with the paused balance
				3970	* regardless of who pauses it, system or the user as of now, so set
				3971	* the resume flag.
				3972	*/
				3973	spin_lock(&fs_info->balance_lock);
				3974	fs_info->balance_ctl->flags \|= BTRFS_BALANCE_RESUME;
				3975	spin_unlock(&fs_info->balance_lock);
				3976
				3977	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
				3978	return PTR_ERR_OR_ZERO(tsk);
				3979	}
				3980
				3981	int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
				3982	{
				3983	struct btrfs_balance_control *bctl;
				3984	struct btrfs_balance_item *item;
				3985	struct btrfs_disk_balance_args disk_bargs;
				3986	struct btrfs_path *path;
				3987	struct extent_buffer *leaf;
				3988	struct btrfs_key key;
				3989	int ret;
				3990
				3991	path = btrfs_alloc_path();
				3992	if (!path)
				3993	return -ENOMEM;
				3994
				3995	key.objectid = BTRFS_BALANCE_OBJECTID;
				3996	key.type = BTRFS_TEMPORARY_ITEM_KEY;
				3997	key.offset = 0;
				3998
				3999	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
				4000	if (ret < 0)
				4001	goto out;
				4002	if (ret > 0) { /* ret = -ENOENT; */
				4003	ret = 0;
				4004	goto out;
				4005	}
				4006
				4007	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
				4008	if (!bctl) {
				4009	ret = -ENOMEM;
				4010	goto out;
				4011	}
				4012
				4013	leaf = path->nodes[0];
				4014	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
				4015
				4016	bctl->fs_info = fs_info;
				4017	bctl->flags = btrfs_balance_flags(leaf, item);
				4018	bctl->flags \|= BTRFS_BALANCE_RESUME;
				4019
				4020	btrfs_balance_data(leaf, item, &disk_bargs);
				4021	btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
				4022	btrfs_balance_meta(leaf, item, &disk_bargs);
				4023	btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
				4024	btrfs_balance_sys(leaf, item, &disk_bargs);
				4025	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
				4026
				4027	WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
				4028
				4029	mutex_lock(&fs_info->volume_mutex);
				4030	mutex_lock(&fs_info->balance_mutex);
				4031
				4032	set_balance_control(bctl);
				4033
				4034	mutex_unlock(&fs_info->balance_mutex);
				4035	mutex_unlock(&fs_info->volume_mutex);
				4036	out:
				4037	btrfs_free_path(path);
				4038	return ret;
				4039	}
				4040
				4041	int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
				4042	{
				4043	int ret = 0;
				4044
				4045	mutex_lock(&fs_info->balance_mutex);
				4046	if (!fs_info->balance_ctl) {
				4047	mutex_unlock(&fs_info->balance_mutex);
				4048	return -ENOTCONN;
				4049	}
				4050
				4051	if (atomic_read(&fs_info->balance_running)) {
				4052	atomic_inc(&fs_info->balance_pause_req);
				4053	mutex_unlock(&fs_info->balance_mutex);
				4054
				4055	wait_event(fs_info->balance_wait_q,
				4056	atomic_read(&fs_info->balance_running) == 0);
				4057
				4058	mutex_lock(&fs_info->balance_mutex);
				4059	/* we are good with balance_ctl ripped off from under us */
				4060	BUG_ON(atomic_read(&fs_info->balance_running));
				4061	atomic_dec(&fs_info->balance_pause_req);
				4062	} else {
				4063	ret = -ENOTCONN;
				4064	}
				4065
				4066	mutex_unlock(&fs_info->balance_mutex);
				4067	return ret;
				4068	}
				4069
				4070	int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
				4071	{
				4072	if (sb_rdonly(fs_info->sb))
				4073	return -EROFS;
				4074
				4075	mutex_lock(&fs_info->balance_mutex);
				4076	if (!fs_info->balance_ctl) {
				4077	mutex_unlock(&fs_info->balance_mutex);
				4078	return -ENOTCONN;
				4079	}
				4080
				4081	atomic_inc(&fs_info->balance_cancel_req);
				4082	/*
				4083	* if we are running just wait and return, balance item is
				4084	* deleted in btrfs_balance in this case
				4085	*/
				4086	if (atomic_read(&fs_info->balance_running)) {
				4087	mutex_unlock(&fs_info->balance_mutex);
				4088	wait_event(fs_info->balance_wait_q,
				4089	atomic_read(&fs_info->balance_running) == 0);
				4090	mutex_lock(&fs_info->balance_mutex);
				4091	} else {
				4092	/* __cancel_balance needs volume_mutex */
				4093	mutex_unlock(&fs_info->balance_mutex);
				4094	mutex_lock(&fs_info->volume_mutex);
				4095	mutex_lock(&fs_info->balance_mutex);
				4096
				4097	if (fs_info->balance_ctl)
				4098	__cancel_balance(fs_info);
				4099
				4100	mutex_unlock(&fs_info->volume_mutex);
				4101	}
				4102
				4103	BUG_ON(fs_info->balance_ctl \|\| atomic_read(&fs_info->balance_running));
				4104	atomic_dec(&fs_info->balance_cancel_req);
				4105	mutex_unlock(&fs_info->balance_mutex);
				4106	return 0;
				4107	}
				4108
				/*
				 * Thread function that fills the UUID tree from the root tree.
				 *
				 * Walks fs_info->tree_root with btrfs_search_forward(), starting at key
				 * (0, BTRFS_ROOT_ITEM_KEY, 0). For each root item that passes the filters
				 * below and has a non-empty uuid and/or received_uuid, the matching
				 * entries are added with btrfs_uuid_tree_add() inside a transaction that
				 * is started on fs_info->uuid_root and ended again before moving on.
				 *
				 * On exit BTRFS_FS_UPDATE_UUID_TREE_GEN is set if no error occurred
				 * (otherwise a warning is logged) and uuid_tree_rescan_sem is released
				 * with up(). The function itself always returns 0.
				 *
				 * @data: the struct btrfs_fs_info to scan
				 */
				4109	static int btrfs_uuid_scan_kthread(void *data)
				4110	{
				4111	struct btrfs_fs_info *fs_info = data;
				4112	struct btrfs_root *root = fs_info->tree_root;
				4113	struct btrfs_key key;
				4114	struct btrfs_path *path = NULL;
				4115	int ret = 0;
				4116	struct extent_buffer *eb;
				4117	int slot;
				4118	struct btrfs_root_item root_item;
				4119	u32 item_size;
				4120	struct btrfs_trans_handle *trans = NULL;
				4121
				4122	path = btrfs_alloc_path();
				4123	if (!path) {
				4124	ret = -ENOMEM;
				4125	goto out;
				4126	}
				4127
				4128	key.objectid = 0;
				4129	key.type = BTRFS_ROOT_ITEM_KEY;
				4130	key.offset = 0;
				4131
				4132	while (1) {
				4133	ret = btrfs_search_forward(root, &key, path, 0);
				/* a positive return ends the scan without error; negative is a failure */
				4134	if (ret) {
				4135	if (ret > 0)
				4136	ret = 0;
				4137	break;
				4138	}
				4139
				/*
				 * Only root items are of interest, and only those whose objectid is
				 * BTRFS_FS_TREE_OBJECTID or lies between BTRFS_FIRST_FREE_OBJECTID and
				 * BTRFS_LAST_FREE_OBJECTID.
				 */
				4140	if (key.type != BTRFS_ROOT_ITEM_KEY \|\|
				4141	(key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
				4142	key.objectid != BTRFS_FS_TREE_OBJECTID) \|\|
				4143	key.objectid > BTRFS_LAST_FREE_OBJECTID)
				4144	goto skip;
				4145
				4146	eb = path->nodes[0];
				4147	slot = path->slots[0];
				4148	item_size = btrfs_item_size_nr(eb, slot);
				/* items shorter than the current root item layout are skipped */
				4149	if (item_size < sizeof(root_item))
				4150	goto skip;
				4151
				4152	read_extent_buffer(eb, &root_item,
				4153	btrfs_item_ptr_offset(eb, slot),
				4154	(int)sizeof(root_item));
				/* roots with a zero reference count are skipped */
				4155	if (btrfs_root_refs(&root_item) == 0)
				4156	goto skip;
				4157
				4158	if (!btrfs_is_empty_uuid(root_item.uuid) \|\|
				4159	!btrfs_is_empty_uuid(root_item.received_uuid)) {
				/* second pass for this key: the transaction already exists */
				4160	if (trans)
				4161	goto update_tree;
				4162
				/*
				 * First pass for this key: release the path, start the
				 * transaction, then "continue" so the same key is searched
				 * again, this time with trans set.
				 */
				4163	btrfs_release_path(path);
				4164	/*
				4165	* 1 - subvol uuid item
				4166	* 1 - received_subvol uuid item
				4167	*/
				4168	trans = btrfs_start_transaction(fs_info->uuid_root, 2);
				4169	if (IS_ERR(trans)) {
				4170	ret = PTR_ERR(trans);
				4171	break;
				4172	}
				4173	continue;
				4174	} else {
				4175	goto skip;
				4176	}
				4177	update_tree:
				4178	btrfs_release_path(path);
				4179	if (!btrfs_is_empty_uuid(root_item.uuid)) {
				4180	ret = btrfs_uuid_tree_add(trans, fs_info,
				4181	root_item.uuid,
				4182	BTRFS_UUID_KEY_SUBVOL,
				4183	key.objectid);
				4184	if (ret < 0) {
				4185	btrfs_warn(fs_info, "uuid_tree_add failed %d",
				4186	ret);
				4187	break;
				4188	}
				4189	}
				4190
				4191	if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
				4192	ret = btrfs_uuid_tree_add(trans, fs_info,
				4193	root_item.received_uuid,
				4194	BTRFS_UUID_KEY_RECEIVED_SUBVOL,
				4195	key.objectid);
				4196	if (ret < 0) {
				4197	btrfs_warn(fs_info, "uuid_tree_add failed %d",
				4198	ret);
				4199	break;
				4200	}
				4201	}
				4202
				4203	skip:
				4204	btrfs_release_path(path);
				/* the transaction, if any, covers a single root item only */
				4205	if (trans) {
				4206	ret = btrfs_end_transaction(trans);
				4207	trans = NULL;
				4208	if (ret)
				4209	break;
				4210	}
				4211
				/*
				 * Advance the search key to the next possible key: bump the offset,
				 * or, when it is already at its maximum, move to the next objectid
				 * with type/offset reset. Stop when the key space is exhausted.
				 */
				4212	if (key.offset < (u64)-1) {
				4213	key.offset++;
				4214	} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
				4215	key.offset = 0;
				4216	key.type = BTRFS_ROOT_ITEM_KEY;
				4217	} else if (key.objectid < (u64)-1) {
				4218	key.offset = 0;
				4219	key.type = BTRFS_ROOT_ITEM_KEY;
				4220	key.objectid++;
				4221	} else {
				4222	break;
				4223	}
				4224	cond_resched();
				4225	}
				4226
				4227	out:
				4228	btrfs_free_path(path);
				/* trans may be NULL, an error pointer, or a live handle left by a break */
				4229	if (trans && !IS_ERR(trans))
				4230	btrfs_end_transaction(trans);
				4231	if (ret)
				4232	btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
				4233	else
				4234	set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
				4235	up(&fs_info->uuid_tree_rescan_sem);
				4236	return 0;
				4237	}
				4238
				4239	/*
				4240	* Callback for btrfs_uuid_tree_iterate().
				4241	* returns:
				4242	* 0 check succeeded, the entry is not outdated.
				4243	* < 0 if an error occurred.
				4244	* > 0 if the check failed, which means the caller shall remove the entry.
				4245	*/
				4246	static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
				4247	u8 *uuid, u8 type, u64 subid)
				4248	{
				4249	struct btrfs_key key;
				4250	int ret = 0;
				4251	struct btrfs_root *subvol_root;
				4252
				4253	if (type != BTRFS_UUID_KEY_SUBVOL &&
				4254	type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
				4255	goto out;
				4256
				4257	key.objectid = subid;
				4258	key.type = BTRFS_ROOT_ITEM_KEY;
				4259	key.offset = (u64)-1;
				4260	subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
				4261	if (IS_ERR(subvol_root)) {
				4262	ret = PTR_ERR(subvol_root);
				4263	if (ret == -ENOENT)
				4264	ret = 1;
				4265	goto out;
				4266	}
				4267
				4268	switch (type) {
				4269	case BTRFS_UUID_KEY_SUBVOL:
				4270	if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
				4271	ret = 1;
				4272	break;
				4273	case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
				4274	if (memcmp(uuid, subvol_root->root_item.received_uuid,
				4275	BTRFS_UUID_SIZE))
				4276	ret = 1;
				4277	break;
				4278	}
				4279
				4280	out:
				4281	return ret;
				4282	}
				4283
				4284	static int btrfs_uuid_rescan_kthread(void *data)
				4285	{
				4286	struct btrfs_fs_info fs_info = (struct btrfs_fs_info )data;
				4287	int ret;
				4288
				4289	/*
				4290	* 1st step is to iterate through the existing UUID tree and
				4291	* to delete all entries that contain outdated data.
				4292	* 2nd step is to add all missing entries to the UUID tree.
				4293	*/
				4294	ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
				4295	if (ret < 0) {
				4296	btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
				4297	up(&fs_info->uuid_tree_rescan_sem);
				4298	return ret;
				4299	}
				4300	return btrfs_uuid_scan_kthread(data);
				4301	}
				4302
				4303	int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
				4304	{
				4305	struct btrfs_trans_handle *trans;
				4306	struct btrfs_root *tree_root = fs_info->tree_root;
				4307	struct btrfs_root *uuid_root;
				4308	struct task_struct *task;
				4309	int ret;
				4310
				4311	/*
				4312	* 1 - root node
				4313	* 1 - root item
				4314	*/
				4315	trans = btrfs_start_transaction(tree_root, 2);
				4316	if (IS_ERR(trans))
				4317	return PTR_ERR(trans);
				4318
				4319	uuid_root = btrfs_create_tree(trans, fs_info,
				4320	BTRFS_UUID_TREE_OBJECTID);
				4321	if (IS_ERR(uuid_root)) {
				4322	ret = PTR_ERR(uuid_root);
				4323	btrfs_abort_transaction(trans, ret);
				4324	btrfs_end_transaction(trans);
				4325	return ret;
				4326	}
				4327
				4328	fs_info->uuid_root = uuid_root;
				4329
				4330	ret = btrfs_commit_transaction(trans);
				4331	if (ret)
				4332	return ret;
				4333
				4334	down(&fs_info->uuid_tree_rescan_sem);
				4335	task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
				4336	if (IS_ERR(task)) {
				4337	/* fs_info->update_uuid_tree_gen remains 0 in all error case */
				4338	btrfs_warn(fs_info, "failed to start uuid_scan task");
				4339	up(&fs_info->uuid_tree_rescan_sem);
				4340	return PTR_ERR(task);
				4341	}
				4342
				4343	return 0;
				4344	}
				4345
				4346	int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
				4347	{
				4348	struct task_struct *task;
				4349
				4350	down(&fs_info->uuid_tree_rescan_sem);
				4351	task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
				4352	if (IS_ERR(task)) {
				4353	/* fs_info->update_uuid_tree_gen remains 0 in all error case */
				4354	btrfs_warn(fs_info, "failed to start uuid_rescan task");
				4355	up(&fs_info->uuid_tree_rescan_sem);
				4356	return PTR_ERR(task);
				4357	}
				4358
				4359	return 0;
				4360	}
				4361
				4362	/*
				4363	* shrinking a device means finding all of the device extents past
				4364	* the new size, and then following the back refs to the chunks.
				4365	* The chunk relocation code actually frees the device extent
				4366	*/
				4367	int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
				4368	{
				4369	struct btrfs_fs_info *fs_info = device->fs_info;
				4370	struct btrfs_root *root = fs_info->dev_root;
				4371	struct btrfs_trans_handle *trans;
				4372	struct btrfs_dev_extent *dev_extent = NULL;
				4373	struct btrfs_path *path;
				4374	u64 length;
				4375	u64 chunk_offset;
				4376	int ret;
				4377	int slot;
				4378	int failed = 0;
				4379	bool retried = false;
				4380	bool checked_pending_chunks = false;
				4381	struct extent_buffer *l;
				4382	struct btrfs_key key;
				4383	struct btrfs_super_block *super_copy = fs_info->super_copy;
				4384	u64 old_total = btrfs_super_total_bytes(super_copy);
				4385	u64 old_size = btrfs_device_get_total_bytes(device);
				4386	u64 diff;
				4387
				4388	new_size = round_down(new_size, fs_info->sectorsize);
				4389	diff = round_down(old_size - new_size, fs_info->sectorsize);
				4390
				4391	if (device->is_tgtdev_for_dev_replace)
				4392	return -EINVAL;
				4393
				4394	path = btrfs_alloc_path();
				4395	if (!path)
				4396	return -ENOMEM;
				4397
				4398	path->reada = READA_FORWARD;
				4399
				4400	mutex_lock(&fs_info->chunk_mutex);
				4401
				4402	btrfs_device_set_total_bytes(device, new_size);
				4403	if (device->writeable) {
				4404	device->fs_devices->total_rw_bytes -= diff;
				4405	atomic64_sub(diff, &fs_info->free_chunk_space);
				4406	}
				4407	mutex_unlock(&fs_info->chunk_mutex);
				4408
				4409	again:
				4410	key.objectid = device->devid;
				4411	key.offset = (u64)-1;
				4412	key.type = BTRFS_DEV_EXTENT_KEY;
				4413
				4414	do {
				4415	mutex_lock(&fs_info->delete_unused_bgs_mutex);
				4416	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				4417	if (ret < 0) {
				4418	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				4419	goto done;
				4420	}
				4421
				4422	ret = btrfs_previous_item(root, path, 0, key.type);
				4423	if (ret)
				4424	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				4425	if (ret < 0)
				4426	goto done;
				4427	if (ret) {
				4428	ret = 0;
				4429	btrfs_release_path(path);
				4430	break;
				4431	}
				4432
				4433	l = path->nodes[0];
				4434	slot = path->slots[0];
				4435	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
				4436
				4437	if (key.objectid != device->devid) {
				4438	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				4439	btrfs_release_path(path);
				4440	break;
				4441	}
				4442
				4443	dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
				4444	length = btrfs_dev_extent_length(l, dev_extent);
				4445
				4446	if (key.offset + length <= new_size) {
				4447	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				4448	btrfs_release_path(path);
				4449	break;
				4450	}
				4451
				4452	chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
				4453	btrfs_release_path(path);
				4454
				4455	ret = btrfs_relocate_chunk(fs_info, chunk_offset);
				4456	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				4457	if (ret && ret != -ENOSPC)
				4458	goto done;
				4459	if (ret == -ENOSPC)
				4460	failed++;
				4461	} while (key.offset-- > 0);
				4462
				4463	if (failed && !retried) {
				4464	failed = 0;
				4465	retried = true;
				4466	goto again;
				4467	} else if (failed && retried) {
				4468	ret = -ENOSPC;
				4469	goto done;
				4470	}
				4471
				4472	/* Shrinking succeeded, else we would be at "done". */
				4473	trans = btrfs_start_transaction(root, 0);
				4474	if (IS_ERR(trans)) {
				4475	ret = PTR_ERR(trans);
				4476	goto done;
				4477	}
				4478
				4479	mutex_lock(&fs_info->chunk_mutex);
				4480
				4481	/*
				4482	* We checked in the above loop all device extents that were already in
				4483	* the device tree. However before we have updated the device's
				4484	* total_bytes to the new size, we might have had chunk allocations that
				4485	* have not complete yet (new block groups attached to transaction
				4486	* handles), and therefore their device extents were not yet in the
				4487	* device tree and we missed them in the loop above. So if we have any
				4488	* pending chunk using a device extent that overlaps the device range
				4489	* that we can not use anymore, commit the current transaction and
				4490	* repeat the search on the device tree - this way we guarantee we will
				4491	* not have chunks using device extents that end beyond 'new_size'.
				4492	*/
				4493	if (!checked_pending_chunks) {
				4494	u64 start = new_size;
				4495	u64 len = old_size - new_size;
				4496
				4497	if (contains_pending_extent(trans->transaction, device,
				4498	&start, len)) {
				4499	mutex_unlock(&fs_info->chunk_mutex);
				4500	checked_pending_chunks = true;
				4501	failed = 0;
				4502	retried = false;
				4503	ret = btrfs_commit_transaction(trans);
				4504	if (ret)
				4505	goto done;
				4506	goto again;
				4507	}
				4508	}
				4509
				4510	btrfs_device_set_disk_total_bytes(device, new_size);
				4511	if (list_empty(&device->resized_list))
				4512	list_add_tail(&device->resized_list,
				4513	&fs_info->fs_devices->resized_devices);
				4514
				4515	WARN_ON(diff > old_total);
				4516	btrfs_set_super_total_bytes(super_copy,
				4517	round_down(old_total - diff, fs_info->sectorsize));
				4518	mutex_unlock(&fs_info->chunk_mutex);
				4519
				4520	/* Now btrfs_update_device() will change the on-disk size. */
				4521	ret = btrfs_update_device(trans, device);
				4522	if (ret < 0) {
				4523	btrfs_abort_transaction(trans, ret);
				4524	btrfs_end_transaction(trans);
				4525	} else {
				4526	ret = btrfs_commit_transaction(trans);
				4527	}
				4528	done:
				4529	btrfs_free_path(path);
				4530	if (ret) {
				4531	mutex_lock(&fs_info->chunk_mutex);
				4532	btrfs_device_set_total_bytes(device, old_size);
				4533	if (device->writeable)
				4534	device->fs_devices->total_rw_bytes += diff;
				4535	atomic64_add(diff, &fs_info->free_chunk_space);
				4536	mutex_unlock(&fs_info->chunk_mutex);
				4537	}
				4538	return ret;
				4539	}
				4540
				4541	static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
				4542	struct btrfs_key *key,
				4543	struct btrfs_chunk *chunk, int item_size)
				4544	{
				4545	struct btrfs_super_block *super_copy = fs_info->super_copy;
				4546	struct btrfs_disk_key disk_key;
				4547	u32 array_size;
				4548	u8 *ptr;
				4549
				4550	mutex_lock(&fs_info->chunk_mutex);
				4551	array_size = btrfs_super_sys_array_size(super_copy);
				4552	if (array_size + item_size + sizeof(disk_key)
				4553	> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
				4554	mutex_unlock(&fs_info->chunk_mutex);
				4555	return -EFBIG;
				4556	}
				4557
				4558	ptr = super_copy->sys_chunk_array + array_size;
				4559	btrfs_cpu_key_to_disk(&disk_key, key);
				4560	memcpy(ptr, &disk_key, sizeof(disk_key));
				4561	ptr += sizeof(disk_key);
				4562	memcpy(ptr, chunk, item_size);
				4563	item_size += sizeof(disk_key);
				4564	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
				4565	mutex_unlock(&fs_info->chunk_mutex);
				4566
				4567	return 0;
				4568	}
				4569
				4570	/*
				4571	* sort the devices in descending order by max_avail, total_avail
				4572	*/
				4573	static int btrfs_cmp_device_info(const void a, const void b)
				4574	{
				4575	const struct btrfs_device_info *di_a = a;
				4576	const struct btrfs_device_info *di_b = b;
				4577
				4578	if (di_a->max_avail > di_b->max_avail)
				4579	return -1;
				4580	if (di_a->max_avail < di_b->max_avail)
				4581	return 1;
				4582	if (di_a->total_avail > di_b->total_avail)
				4583	return -1;
				4584	if (di_a->total_avail < di_b->total_avail)
				4585	return 1;
				4586	return 0;
				4587	}
				4588
				4589	static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
				4590	{
				4591	if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
				4592	return;
				4593
				4594	btrfs_set_fs_incompat(info, RAID56);
				4595	}
				4596
				/*
				 * Device-count limits used by __btrfs_alloc_chunk() as devs_max when the
				 * RAID profile itself does not set one.
				 *
				 * BTRFS_MAX_DEVS(r): number of struct btrfs_stripe entries that fit in the
				 * largest item of root @r after the struct btrfs_chunk header, plus one.
				 * NOTE(review): the "+ 1" presumably accounts for a stripe already
				 * embedded in struct btrfs_chunk - confirm against its definition.
				 *
				 * BTRFS_MAX_DEVS_SYS_CHUNK: the same computation against the super block's
				 * system chunk array, after subtracting room for two disk keys and two
				 * chunk headers.
				 */
				#define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE((r)->fs_info)	\
								- sizeof(struct btrfs_chunk))		\
								/ sizeof(struct btrfs_stripe) + 1)

				#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE	\
								- 2 * sizeof(struct btrfs_disk_key)	\
								- 2 * sizeof(struct btrfs_chunk))	\
								/ sizeof(struct btrfs_stripe) + 1)
				4605
				4606	static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
				4607	u64 start, u64 type)
				4608	{
				4609	struct btrfs_fs_info *info = trans->fs_info;
				4610	struct btrfs_fs_devices *fs_devices = info->fs_devices;
				4611	struct btrfs_device *device;
				4612	struct map_lookup *map = NULL;
				4613	struct extent_map_tree *em_tree;
				4614	struct extent_map *em;
				4615	struct btrfs_device_info *devices_info = NULL;
				4616	u64 total_avail;
				4617	int num_stripes; /* total number of stripes to allocate */
				4618	int data_stripes; /* number of stripes that count for
				4619	block group size */
				4620	int sub_stripes; /* sub_stripes info for map */
				4621	int dev_stripes; /* stripes per dev */
				4622	int devs_max; /* max devs to use */
				4623	int devs_min; /* min devs needed */
				4624	int devs_increment; /* ndevs has to be a multiple of this */
				4625	int ncopies; /* how many copies to data has */
				4626	int ret;
				4627	u64 max_stripe_size;
				4628	u64 max_chunk_size;
				4629	u64 stripe_size;
				4630	u64 num_bytes;
				4631	int ndevs;
				4632	int i;
				4633	int j;
				4634	int index;
				4635
				4636	BUG_ON(!alloc_profile_is_valid(type, 0));
				4637
				4638	if (list_empty(&fs_devices->alloc_list))
				4639	return -ENOSPC;
				4640
				4641	index = __get_raid_index(type);
				4642
				4643	sub_stripes = btrfs_raid_array[index].sub_stripes;
				4644	dev_stripes = btrfs_raid_array[index].dev_stripes;
				4645	devs_max = btrfs_raid_array[index].devs_max;
				4646	devs_min = btrfs_raid_array[index].devs_min;
				4647	devs_increment = btrfs_raid_array[index].devs_increment;
				4648	ncopies = btrfs_raid_array[index].ncopies;
				4649
				4650	if (type & BTRFS_BLOCK_GROUP_DATA) {
				4651	max_stripe_size = SZ_1G;
				4652	max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
				4653	if (!devs_max)
				4654	devs_max = BTRFS_MAX_DEVS(info->chunk_root);
				4655	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
				4656	/* for larger filesystems, use larger metadata chunks */
				4657	if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
				4658	max_stripe_size = SZ_1G;
				4659	else
				4660	max_stripe_size = SZ_256M;
				4661	max_chunk_size = max_stripe_size;
				4662	if (!devs_max)
				4663	devs_max = BTRFS_MAX_DEVS(info->chunk_root);
				4664	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
				4665	max_stripe_size = SZ_32M;
				4666	max_chunk_size = 2 * max_stripe_size;
				4667	if (!devs_max)
				4668	devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
				4669	} else {
				4670	btrfs_err(info, "invalid chunk type 0x%llx requested",
				4671	type);
				4672	BUG_ON(1);
				4673	}
				4674
				4675	/* we don't want a chunk larger than 10% of writeable space */
				4676	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
				4677	max_chunk_size);
				4678
				4679	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
				4680	GFP_NOFS);
				4681	if (!devices_info)
				4682	return -ENOMEM;
				4683
				4684	/*
				4685	* in the first pass through the devices list, we gather information
				4686	* about the available holes on each device.
				4687	*/
				4688	ndevs = 0;
				4689	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
				4690	u64 max_avail;
				4691	u64 dev_offset;
				4692
				4693	if (!device->writeable) {
				4694	WARN(1, KERN_ERR
				4695	"BTRFS: read-only device in alloc_list\n");
				4696	continue;
				4697	}
				4698
				4699	if (!device->in_fs_metadata \|\|
				4700	device->is_tgtdev_for_dev_replace)
				4701	continue;
				4702
				4703	if (device->total_bytes > device->bytes_used)
				4704	total_avail = device->total_bytes - device->bytes_used;
				4705	else
				4706	total_avail = 0;
				4707
				4708	/* If there is no space on this device, skip it. */
				4709	if (total_avail == 0)
				4710	continue;
				4711
				4712	ret = find_free_dev_extent(trans, device,
				4713	max_stripe_size * dev_stripes,
				4714	&dev_offset, &max_avail);
				4715	if (ret && ret != -ENOSPC)
				4716	goto error;
				4717
				4718	if (ret == 0)
				4719	max_avail = max_stripe_size * dev_stripes;
				4720
				4721	if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
				4722	continue;
				4723
				4724	if (ndevs == fs_devices->rw_devices) {
				4725	WARN(1, "%s: found more than %llu devices\n",
				4726	__func__, fs_devices->rw_devices);
				4727	break;
				4728	}
				4729	devices_info[ndevs].dev_offset = dev_offset;
				4730	devices_info[ndevs].max_avail = max_avail;
				4731	devices_info[ndevs].total_avail = total_avail;
				4732	devices_info[ndevs].dev = device;
				4733	++ndevs;
				4734	}
				4735
				4736	/*
				4737	* now sort the devices by hole size / available space
				4738	*/
				4739	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
				4740	btrfs_cmp_device_info, NULL);
				4741
				4742	/* round down to number of usable stripes */
				4743	ndevs = round_down(ndevs, devs_increment);
				4744
				4745	if (ndevs < devs_increment * sub_stripes \|\| ndevs < devs_min) {
				4746	ret = -ENOSPC;
				4747	goto error;
				4748	}
				4749
				4750	ndevs = min(ndevs, devs_max);
				4751
				4752	/*
				4753	* The primary goal is to maximize the number of stripes, so use as
				4754	* many devices as possible, even if the stripes are not maximum sized.
				4755	*
				4756	* The DUP profile stores more than one stripe per device, the
				4757	* max_avail is the total size so we have to adjust.
				4758	*/
				4759	stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes);
				4760	num_stripes = ndevs * dev_stripes;
				4761
				4762	/*
				4763	* this will have to be fixed for RAID1 and RAID10 over
				4764	* more drives
				4765	*/
				4766	data_stripes = num_stripes / ncopies;
				4767
				4768	if (type & BTRFS_BLOCK_GROUP_RAID5)
				4769	data_stripes = num_stripes - 1;
				4770
				4771	if (type & BTRFS_BLOCK_GROUP_RAID6)
				4772	data_stripes = num_stripes - 2;
				4773
				4774	/*
				4775	* Use the number of data stripes to figure out how big this chunk
				4776	* is really going to be in terms of logical address space,
				4777	* and compare that answer with the max chunk size
				4778	*/
				4779	if (stripe_size * data_stripes > max_chunk_size) {
				4780	u64 mask = (1ULL << 24) - 1;
				4781
				4782	stripe_size = div_u64(max_chunk_size, data_stripes);
				4783
				4784	/* bump the answer up to a 16MB boundary */
				4785	stripe_size = (stripe_size + mask) & ~mask;
				4786
				4787	/* but don't go higher than the limits we found
				4788	* while searching for free extents
				4789	*/
				4790	if (stripe_size > devices_info[ndevs-1].max_avail)
				4791	stripe_size = devices_info[ndevs-1].max_avail;
				4792	}
				4793
				4794	/* align to BTRFS_STRIPE_LEN */
				4795	stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);
				4796
				4797	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
				4798	if (!map) {
				4799	ret = -ENOMEM;
				4800	goto error;
				4801	}
				4802	map->num_stripes = num_stripes;
				4803
				4804	for (i = 0; i < ndevs; ++i) {
				4805	for (j = 0; j < dev_stripes; ++j) {
				4806	int s = i * dev_stripes + j;
				4807	map->stripes[s].dev = devices_info[i].dev;
				4808	map->stripes[s].physical = devices_info[i].dev_offset +
				4809	j * stripe_size;
				4810	}
				4811	}
				4812	map->stripe_len = BTRFS_STRIPE_LEN;
				4813	map->io_align = BTRFS_STRIPE_LEN;
				4814	map->io_width = BTRFS_STRIPE_LEN;
				4815	map->type = type;
				4816	map->sub_stripes = sub_stripes;
				4817
				4818	num_bytes = stripe_size * data_stripes;
				4819
				4820	trace_btrfs_chunk_alloc(info, map, start, num_bytes);
				4821
				4822	em = alloc_extent_map();
				4823	if (!em) {
				4824	kfree(map);
				4825	ret = -ENOMEM;
				4826	goto error;
				4827	}
				4828	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
				4829	em->map_lookup = map;
				4830	em->start = start;
				4831	em->len = num_bytes;
				4832	em->block_start = 0;
				4833	em->block_len = em->len;
				4834	em->orig_block_len = stripe_size;
				4835
				4836	em_tree = &info->mapping_tree.map_tree;
				4837	write_lock(&em_tree->lock);
				4838	ret = add_extent_mapping(em_tree, em, 0);
				4839	if (!ret) {
				4840	list_add_tail(&em->list, &trans->transaction->pending_chunks);
				4841	refcount_inc(&em->refs);
				4842	}
				4843	write_unlock(&em_tree->lock);
				4844	if (ret) {
				4845	free_extent_map(em);
				4846	goto error;
				4847	}
				4848
				4849	ret = btrfs_make_block_group(trans, info, 0, type, start, num_bytes);
				4850	if (ret)
				4851	goto error_del_extent;
				4852
				4853	for (i = 0; i < map->num_stripes; i++) {
				4854	num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
				4855	btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
				4856	map->stripes[i].dev->has_pending_chunks = true;
				4857	}
				4858
				4859	atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
				4860
				4861	free_extent_map(em);
				4862	check_raid56_incompat_flag(info, type);
				4863
				4864	kfree(devices_info);
				4865	return 0;
				4866
				4867	error_del_extent:
				4868	write_lock(&em_tree->lock);
				4869	remove_extent_mapping(em_tree, em);
				4870	write_unlock(&em_tree->lock);
				4871
				4872	/* One for our allocation */
				4873	free_extent_map(em);
				4874	/* One for the tree reference */
				4875	free_extent_map(em);
				4876	/* One for the pending_chunks list reference */
				4877	free_extent_map(em);
				4878	error:
				4879	kfree(devices_info);
				4880	return ret;
				4881	}
				4882
				4883	int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
				4884	struct btrfs_fs_info *fs_info,
				4885	u64 chunk_offset, u64 chunk_size)
				4886	{
				4887	struct btrfs_root *extent_root = fs_info->extent_root;
				4888	struct btrfs_root *chunk_root = fs_info->chunk_root;
				4889	struct btrfs_key key;
				4890	struct btrfs_device *device;
				4891	struct btrfs_chunk *chunk;
				4892	struct btrfs_stripe *stripe;
				4893	struct extent_map *em;
				4894	struct map_lookup *map;
				4895	size_t item_size;
				4896	u64 dev_offset;
				4897	u64 stripe_size;
				4898	int i = 0;
				4899	int ret = 0;
				4900
				4901	em = get_chunk_map(fs_info, chunk_offset, chunk_size);
				4902	if (IS_ERR(em))
				4903	return PTR_ERR(em);
				4904
				4905	map = em->map_lookup;
				4906	item_size = btrfs_chunk_item_size(map->num_stripes);
				4907	stripe_size = em->orig_block_len;
				4908
				4909	chunk = kzalloc(item_size, GFP_NOFS);
				4910	if (!chunk) {
				4911	ret = -ENOMEM;
				4912	goto out;
				4913	}
				4914
				4915	/*
				4916	* Take the device list mutex to prevent races with the final phase of
				4917	* a device replace operation that replaces the device object associated
				4918	* with the map's stripes, because the device object's id can change
				4919	* at any time during that final phase of the device replace operation
				4920	* (dev-replace.c:btrfs_dev_replace_finishing()).
				4921	*/
				4922	mutex_lock(&fs_info->fs_devices->device_list_mutex);
				4923	for (i = 0; i < map->num_stripes; i++) {
				4924	device = map->stripes[i].dev;
				4925	dev_offset = map->stripes[i].physical;
				4926
				4927	ret = btrfs_update_device(trans, device);
				4928	if (ret)
				4929	break;
				4930	ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
				4931	dev_offset, stripe_size);
				4932	if (ret)
				4933	break;
				4934	}
				4935	if (ret) {
				4936	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				4937	goto out;
				4938	}
				4939
				4940	stripe = &chunk->stripe;
				4941	for (i = 0; i < map->num_stripes; i++) {
				4942	device = map->stripes[i].dev;
				4943	dev_offset = map->stripes[i].physical;
				4944
				4945	btrfs_set_stack_stripe_devid(stripe, device->devid);
				4946	btrfs_set_stack_stripe_offset(stripe, dev_offset);
				4947	memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
				4948	stripe++;
				4949	}
				4950	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				4951
				4952	btrfs_set_stack_chunk_length(chunk, chunk_size);
				4953	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
				4954	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
				4955	btrfs_set_stack_chunk_type(chunk, map->type);
				4956	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
				4957	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
				4958	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
				4959	btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
				4960	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
				4961
				4962	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
				4963	key.type = BTRFS_CHUNK_ITEM_KEY;
				4964	key.offset = chunk_offset;
				4965
				4966	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
				4967	if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
				4968	/*
				4969	* TODO: Cleanup of inserted chunk root in case of
				4970	* failure.
				4971	*/
				4972	ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
				4973	}
				4974
				4975	out:
				4976	kfree(chunk);
				4977	free_extent_map(em);
				4978	return ret;
				4979	}
				4980
				4981	/*
				4982	* Chunk allocation falls into two parts. The first part does works
				4983	* that make the new allocated chunk useable, but not do any operation
				4984	* that modifies the chunk tree. The second part does the works that
				4985	* require modifying the chunk tree. This division is important for the
				4986	* bootstrap process of adding storage to a seed btrfs.
				4987	*/
				4988	int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
				4989	struct btrfs_fs_info *fs_info, u64 type)
				4990	{
				4991	u64 chunk_offset;
				4992
				4993	ASSERT(mutex_is_locked(&fs_info->chunk_mutex));
				4994	chunk_offset = find_next_chunk(fs_info);
				4995	return __btrfs_alloc_chunk(trans, chunk_offset, type);
				4996	}
				4997
				4998	static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
				4999	struct btrfs_fs_info *fs_info)
				5000	{
				5001	u64 chunk_offset;
				5002	u64 sys_chunk_offset;
				5003	u64 alloc_profile;
				5004	int ret;
				5005
				5006	chunk_offset = find_next_chunk(fs_info);
				5007	alloc_profile = btrfs_metadata_alloc_profile(fs_info);
				5008	ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile);
				5009	if (ret)
				5010	return ret;
				5011
				5012	sys_chunk_offset = find_next_chunk(fs_info);
				5013	alloc_profile = btrfs_system_alloc_profile(fs_info);
				5014	ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile);
				5015	return ret;
				5016	}
				5017
				5018	static inline int btrfs_chunk_max_errors(struct map_lookup *map)
				5019	{
				5020	int max_errors;
				5021
				5022	if (map->type & (BTRFS_BLOCK_GROUP_RAID1 \|
				5023	BTRFS_BLOCK_GROUP_RAID10 \|
				5024	BTRFS_BLOCK_GROUP_RAID5)) {
				5025	max_errors = 1;
				5026	} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
				5027	max_errors = 2;
				5028	} else {
				5029	max_errors = 0;
				5030	}
				5031
				5032	return max_errors;
				5033	}
				5034
				5035	int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
				5036	{
				5037	struct extent_map *em;
				5038	struct map_lookup *map;
				5039	int readonly = 0;
				5040	int miss_ndevs = 0;
				5041	int i;
				5042
				5043	em = get_chunk_map(fs_info, chunk_offset, 1);
				5044	if (IS_ERR(em))
				5045	return 1;
				5046
				5047	map = em->map_lookup;
				5048	for (i = 0; i < map->num_stripes; i++) {
				5049	if (map->stripes[i].dev->missing) {
				5050	miss_ndevs++;
				5051	continue;
				5052	}
				5053
				5054	if (!map->stripes[i].dev->writeable) {
				5055	readonly = 1;
				5056	goto end;
				5057	}
				5058	}
				5059
				5060	/*
				5061	* If the number of missing devices is larger than max errors,
				5062	* we can not write the data into that chunk successfully, so
				5063	* set it readonly.
				5064	*/
				5065	if (miss_ndevs > btrfs_chunk_max_errors(map))
				5066	readonly = 1;
				5067	end:
				5068	free_extent_map(em);
				5069	return readonly;
				5070	}
				5071
				5072	void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
				5073	{
				5074	extent_map_tree_init(&tree->map_tree);
				5075	}
				5076
				5077	void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
				5078	{
				5079	struct extent_map *em;
				5080
				5081	while (1) {
				5082	write_lock(&tree->map_tree.lock);
				5083	em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
				5084	if (em)
				5085	remove_extent_mapping(&tree->map_tree, em);
				5086	write_unlock(&tree->map_tree.lock);
				5087	if (!em)
				5088	break;
				5089	/* once for us */
				5090	free_extent_map(em);
				5091	/* once for the tree */
				5092	free_extent_map(em);
				5093	}
				5094	}
				5095
				5096	int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
				5097	{
				5098	struct extent_map *em;
				5099	struct map_lookup *map;
				5100	int ret;
				5101
				5102	em = get_chunk_map(fs_info, logical, len);
				5103	if (IS_ERR(em))
				5104	/*
				5105	* We could return errors for these cases, but that could get
				5106	* ugly and we'd probably do the same thing which is just not do
				5107	* anything else and exit, so return 1 so the callers don't try
				5108	* to use other copies.
				5109	*/
				5110	return 1;
				5111
				5112	map = em->map_lookup;
				5113	if (map->type & (BTRFS_BLOCK_GROUP_DUP \| BTRFS_BLOCK_GROUP_RAID1))
				5114	ret = map->num_stripes;
				5115	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
				5116	ret = map->sub_stripes;
				5117	else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
				5118	ret = 2;
				5119	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
				5120	/*
				5121	* There could be two corrupted data stripes, we need
				5122	* to loop retry in order to rebuild the correct data.
				5123	*
				5124	* Fail a stripe at a time on every retry except the
				5125	* stripe under reconstruction.
				5126	*/
				5127	ret = map->num_stripes;
				5128	else
				5129	ret = 1;
				5130	free_extent_map(em);
				5131
				5132	btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
				5133	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
				5134	fs_info->dev_replace.tgtdev)
				5135	ret++;
				5136	btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
				5137
				5138	return ret;
				5139	}
				5140
				5141	unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
				5142	u64 logical)
				5143	{
				5144	struct extent_map *em;
				5145	struct map_lookup *map;
				5146	unsigned long len = fs_info->sectorsize;
				5147
				5148	em = get_chunk_map(fs_info, logical, len);
				5149
				5150	if (!WARN_ON(IS_ERR(em))) {
				5151	map = em->map_lookup;
				5152	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
				5153	len = map->stripe_len * nr_data_stripes(map);
				5154	free_extent_map(em);
				5155	}
				5156	return len;
				5157	}
				5158
				5159	int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
				5160	{
				5161	struct extent_map *em;
				5162	struct map_lookup *map;
				5163	int ret = 0;
				5164
				5165	em = get_chunk_map(fs_info, logical, len);
				5166
				5167	if(!WARN_ON(IS_ERR(em))) {
				5168	map = em->map_lookup;
				5169	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
				5170	ret = 1;
				5171	free_extent_map(em);
				5172	}
				5173	return ret;
				5174	}
				5175
				5176	static int find_live_mirror(struct btrfs_fs_info *fs_info,
				5177	struct map_lookup *map, int first, int num,
				5178	int optimal, int dev_replace_is_ongoing)
				5179	{
				5180	int i;
				5181	int tolerance;
				5182	struct btrfs_device *srcdev;
				5183
				5184	if (dev_replace_is_ongoing &&
				5185	fs_info->dev_replace.cont_reading_from_srcdev_mode ==
				5186	BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
				5187	srcdev = fs_info->dev_replace.srcdev;
				5188	else
				5189	srcdev = NULL;
				5190
				5191	/*
				5192	* try to avoid the drive that is the source drive for a
				5193	* dev-replace procedure, only choose it if no other non-missing
				5194	* mirror is available
				5195	*/
				5196	for (tolerance = 0; tolerance < 2; tolerance++) {
				5197	if (map->stripes[optimal].dev->bdev &&
				5198	(tolerance \|\| map->stripes[optimal].dev != srcdev))
				5199	return optimal;
				5200	for (i = first; i < first + num; i++) {
				5201	if (map->stripes[i].dev->bdev &&
				5202	(tolerance \|\| map->stripes[i].dev != srcdev))
				5203	return i;
				5204	}
				5205	}
				5206
				5207	/* we couldn't find one that doesn't fail. Just return something
				5208	* and the io error handling code will clean up eventually
				5209	*/
				5210	return optimal;
				5211	}
				5212
				5213	static inline int parity_smaller(u64 a, u64 b)
				5214	{
				5215	return a > b;
				5216	}
				5217
				5218	/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
				5219	static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
				5220	{
				5221	struct btrfs_bio_stripe s;
				5222	int i;
				5223	u64 l;
				5224	int again = 1;
				5225
				5226	while (again) {
				5227	again = 0;
				5228	for (i = 0; i < num_stripes - 1; i++) {
				5229	if (parity_smaller(bbio->raid_map[i],
				5230	bbio->raid_map[i+1])) {
				5231	s = bbio->stripes[i];
				5232	l = bbio->raid_map[i];
				5233	bbio->stripes[i] = bbio->stripes[i+1];
				5234	bbio->raid_map[i] = bbio->raid_map[i+1];
				5235	bbio->stripes[i+1] = s;
				5236	bbio->raid_map[i+1] = l;
				5237
				5238	again = 1;
				5239	}
				5240	}
				5241	}
				5242	}
				5243
				5244	static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
				5245	{
				5246	struct btrfs_bio *bbio = kzalloc(
				5247	/* the size of the btrfs_bio */
				5248	sizeof(struct btrfs_bio) +
				5249	/* plus the variable array for the stripes */
				5250	sizeof(struct btrfs_bio_stripe) * (total_stripes) +
				5251	/* plus the variable array for the tgt dev */
				5252	sizeof(int) * (real_stripes) +
				5253	/*
				5254	* plus the raid_map, which includes both the tgt dev
				5255	* and the stripes
				5256	*/
				5257	sizeof(u64) * (total_stripes),
				5258	GFP_NOFS\|__GFP_NOFAIL);
				5259
				5260	atomic_set(&bbio->error, 0);
				5261	refcount_set(&bbio->refs, 1);
				5262
				5263	return bbio;
				5264	}
				5265
				5266	void btrfs_get_bbio(struct btrfs_bio *bbio)
				5267	{
				5268	WARN_ON(!refcount_read(&bbio->refs));
				5269	refcount_inc(&bbio->refs);
				5270	}
				5271
				5272	void btrfs_put_bbio(struct btrfs_bio *bbio)
				5273	{
				5274	if (!bbio)
				5275	return;
				5276	if (refcount_dec_and_test(&bbio->refs))
				5277	kfree(bbio);
				5278	}
				5279
				5280	/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
				5281	/*
				5282	* Please note that, discard won't be sent to target device of device
				5283	* replace.
				5284	*/
				5285	static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
				5286	u64 logical, u64 length,
				5287	struct btrfs_bio **bbio_ret)
				5288	{
				5289	struct extent_map *em;
				5290	struct map_lookup *map;
				5291	struct btrfs_bio *bbio;
				5292	u64 offset;
				5293	u64 stripe_nr;
				5294	u64 stripe_nr_end;
				5295	u64 stripe_end_offset;
				5296	u64 stripe_cnt;
				5297	u64 stripe_len;
				5298	u64 stripe_offset;
				5299	u64 num_stripes;
				5300	u32 stripe_index;
				5301	u32 factor = 0;
				5302	u32 sub_stripes = 0;
				5303	u64 stripes_per_dev = 0;
				5304	u32 remaining_stripes = 0;
				5305	u32 last_stripe = 0;
				5306	int ret = 0;
				5307	int i;
				5308
				5309	/* discard always return a bbio */
				5310	ASSERT(bbio_ret);
				5311
				5312	em = get_chunk_map(fs_info, logical, length);
				5313	if (IS_ERR(em))
				5314	return PTR_ERR(em);
				5315
				5316	map = em->map_lookup;
				5317	/* we don't discard raid56 yet */
				5318	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
				5319	ret = -EOPNOTSUPP;
				5320	goto out;
				5321	}
				5322
				5323	offset = logical - em->start;
				5324	length = min_t(u64, em->len - offset, length);
				5325
				5326	stripe_len = map->stripe_len;
				5327	/*
				5328	* stripe_nr counts the total number of stripes we have to stride
				5329	* to get to this block
				5330	*/
				5331	stripe_nr = div64_u64(offset, stripe_len);
				5332
				5333	/* stripe_offset is the offset of this block in its stripe */
				5334	stripe_offset = offset - stripe_nr * stripe_len;
				5335
				5336	stripe_nr_end = round_up(offset + length, map->stripe_len);
				5337	stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
				5338	stripe_cnt = stripe_nr_end - stripe_nr;
				5339	stripe_end_offset = stripe_nr_end * map->stripe_len -
				5340	(offset + length);
				5341	/*
				5342	* after this, stripe_nr is the number of stripes on this
				5343	* device we have to walk to find the data, and stripe_index is
				5344	* the number of our device in the stripe array
				5345	*/
				5346	num_stripes = 1;
				5347	stripe_index = 0;
				5348	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 \|
				5349	BTRFS_BLOCK_GROUP_RAID10)) {
				5350	if (map->type & BTRFS_BLOCK_GROUP_RAID0)
				5351	sub_stripes = 1;
				5352	else
				5353	sub_stripes = map->sub_stripes;
				5354
				5355	factor = map->num_stripes / sub_stripes;
				5356	num_stripes = min_t(u64, map->num_stripes,
				5357	sub_stripes * stripe_cnt);
				5358	stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
				5359	stripe_index *= sub_stripes;
				5360	stripes_per_dev = div_u64_rem(stripe_cnt, factor,
				5361	&remaining_stripes);
				5362	div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
				5363	last_stripe *= sub_stripes;
				5364	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 \|
				5365	BTRFS_BLOCK_GROUP_DUP)) {
				5366	num_stripes = map->num_stripes;
				5367	} else {
				5368	stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
				5369	&stripe_index);
				5370	}
				5371
				5372	bbio = alloc_btrfs_bio(num_stripes, 0);
				5373	if (!bbio) {
				5374	ret = -ENOMEM;
				5375	goto out;
				5376	}
				5377
				5378	for (i = 0; i < num_stripes; i++) {
				5379	bbio->stripes[i].physical =
				5380	map->stripes[stripe_index].physical +
				5381	stripe_offset + stripe_nr * map->stripe_len;
				5382	bbio->stripes[i].dev = map->stripes[stripe_index].dev;
				5383
				5384	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 \|
				5385	BTRFS_BLOCK_GROUP_RAID10)) {
				5386	bbio->stripes[i].length = stripes_per_dev *
				5387	map->stripe_len;
				5388
				5389	if (i / sub_stripes < remaining_stripes)
				5390	bbio->stripes[i].length +=
				5391	map->stripe_len;
				5392
				5393	/*
				5394	* Special for the first stripe and
				5395	* the last stripe:
				5396	*
				5397	* \|-------\|...\|-------\|
				5398	* \|----------\|
				5399	* off end_off
				5400	*/
				5401	if (i < sub_stripes)
				5402	bbio->stripes[i].length -=
				5403	stripe_offset;
				5404
				5405	if (stripe_index >= last_stripe &&
				5406	stripe_index <= (last_stripe +
				5407	sub_stripes - 1))
				5408	bbio->stripes[i].length -=
				5409	stripe_end_offset;
				5410
				5411	if (i == sub_stripes - 1)
				5412	stripe_offset = 0;
				5413	} else {
				5414	bbio->stripes[i].length = length;
				5415	}
				5416
				5417	stripe_index++;
				5418	if (stripe_index == map->num_stripes) {
				5419	stripe_index = 0;
				5420	stripe_nr++;
				5421	}
				5422	}
				5423
				5424	*bbio_ret = bbio;
				5425	bbio->map_type = map->type;
				5426	bbio->num_stripes = num_stripes;
				5427	out:
				5428	free_extent_map(em);
				5429	return ret;
				5430	}
				5431
				5432	/*
				5433	* In dev-replace case, for repair case (that's the only case where the mirror
				5434	* is selected explicitly when calling btrfs_map_block), blocks left of the
				5435	* left cursor can also be read from the target drive.
				5436	*
				5437	* For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
				5438	* array of stripes.
				5439	* For READ, it also needs to be supported using the same mirror number.
				5440	*
				5441	* If the requested block is not left of the left cursor, EIO is returned. This
				5442	* can happen because btrfs_num_copies() returns one more in the dev-replace
				5443	* case.
				5444	*/
				5445	static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
				5446	u64 logical, u64 length,
				5447	u64 srcdev_devid, int *mirror_num,
				5448	u64 *physical)
				5449	{
				5450	struct btrfs_bio *bbio = NULL;
				5451	int num_stripes;
				5452	int index_srcdev = 0;
				5453	int found = 0;
				5454	u64 physical_of_found = 0;
				5455	int i;
				5456	int ret = 0;
				5457
				5458	ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
				5459	logical, &length, &bbio, 0, 0);
				5460	if (ret) {
				5461	ASSERT(bbio == NULL);
				5462	return ret;
				5463	}
				5464
				5465	num_stripes = bbio->num_stripes;
				5466	if (*mirror_num > num_stripes) {
				5467	/*
				5468	* BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
				5469	* that means that the requested area is not left of the left
				5470	* cursor
				5471	*/
				5472	btrfs_put_bbio(bbio);
				5473	return -EIO;
				5474	}
				5475
				5476	/*
				5477	* process the rest of the function using the mirror_num of the source
				5478	* drive. Therefore look it up first. At the end, patch the device
				5479	* pointer to the one of the target drive.
				5480	*/
				5481	for (i = 0; i < num_stripes; i++) {
				5482	if (bbio->stripes[i].dev->devid != srcdev_devid)
				5483	continue;
				5484
				5485	/*
				5486	* In case of DUP, in order to keep it simple, only add the
				5487	* mirror with the lowest physical address
				5488	*/
				5489	if (found &&
				5490	physical_of_found <= bbio->stripes[i].physical)
				5491	continue;
				5492
				5493	index_srcdev = i;
				5494	found = 1;
				5495	physical_of_found = bbio->stripes[i].physical;
				5496	}
				5497
				5498	btrfs_put_bbio(bbio);
				5499
				5500	ASSERT(found);
				5501	if (!found)
				5502	return -EIO;
				5503
				5504	*mirror_num = index_srcdev + 1;
				5505	*physical = physical_of_found;
				5506	return ret;
				5507	}
				5508
				5509	static void handle_ops_on_dev_replace(enum btrfs_map_op op,
				5510	struct btrfs_bio **bbio_ret,
				5511	struct btrfs_dev_replace *dev_replace,
				5512	int num_stripes_ret, int max_errors_ret)
				5513	{
				5514	struct btrfs_bio bbio = bbio_ret;
				5515	u64 srcdev_devid = dev_replace->srcdev->devid;
				5516	int tgtdev_indexes = 0;
				5517	int num_stripes = *num_stripes_ret;
				5518	int max_errors = *max_errors_ret;
				5519	int i;
				5520
				5521	if (op == BTRFS_MAP_WRITE) {
				5522	int index_where_to_add;
				5523
				5524	/*
				5525	* duplicate the write operations while the dev replace
				5526	* procedure is running. Since the copying of the old disk to
				5527	* the new disk takes place at run time while the filesystem is
				5528	* mounted writable, the regular write operations to the old
				5529	* disk have to be duplicated to go to the new disk as well.
				5530	*
				5531	* Note that device->missing is handled by the caller, and that
				5532	* the write to the old disk is already set up in the stripes
				5533	* array.
				5534	*/
				5535	index_where_to_add = num_stripes;
				5536	for (i = 0; i < num_stripes; i++) {
				5537	if (bbio->stripes[i].dev->devid == srcdev_devid) {
				5538	/* write to new disk, too */
				5539	struct btrfs_bio_stripe *new =
				5540	bbio->stripes + index_where_to_add;
				5541	struct btrfs_bio_stripe *old =
				5542	bbio->stripes + i;
				5543
				5544	new->physical = old->physical;
				5545	new->length = old->length;
				5546	new->dev = dev_replace->tgtdev;
				5547	bbio->tgtdev_map[i] = index_where_to_add;
				5548	index_where_to_add++;
				5549	max_errors++;
				5550	tgtdev_indexes++;
				5551	}
				5552	}
				5553	num_stripes = index_where_to_add;
				5554	} else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
				5555	int index_srcdev = 0;
				5556	int found = 0;
				5557	u64 physical_of_found = 0;
				5558
				5559	/*
				5560	* During the dev-replace procedure, the target drive can also
				5561	* be used to read data in case it is needed to repair a corrupt
				5562	* block elsewhere. This is possible if the requested area is
				5563	* left of the left cursor. In this area, the target drive is a
				5564	* full copy of the source drive.
				5565	*/
				5566	for (i = 0; i < num_stripes; i++) {
				5567	if (bbio->stripes[i].dev->devid == srcdev_devid) {
				5568	/*
				5569	* In case of DUP, in order to keep it simple,
				5570	* only add the mirror with the lowest physical
				5571	* address
				5572	*/
				5573	if (found &&
				5574	physical_of_found <=
				5575	bbio->stripes[i].physical)
				5576	continue;
				5577	index_srcdev = i;
				5578	found = 1;
				5579	physical_of_found = bbio->stripes[i].physical;
				5580	}
				5581	}
				5582	if (found) {
				5583	struct btrfs_bio_stripe *tgtdev_stripe =
				5584	bbio->stripes + num_stripes;
				5585
				5586	tgtdev_stripe->physical = physical_of_found;
				5587	tgtdev_stripe->length =
				5588	bbio->stripes[index_srcdev].length;
				5589	tgtdev_stripe->dev = dev_replace->tgtdev;
				5590	bbio->tgtdev_map[index_srcdev] = num_stripes;
				5591
				5592	tgtdev_indexes++;
				5593	num_stripes++;
				5594	}
				5595	}
				5596
				5597	*num_stripes_ret = num_stripes;
				5598	*max_errors_ret = max_errors;
				5599	bbio->num_tgtdevs = tgtdev_indexes;
				5600	*bbio_ret = bbio;
				5601	}
				5602
				5603	static bool need_full_stripe(enum btrfs_map_op op)
				5604	{
				5605	return (op == BTRFS_MAP_WRITE \|\| op == BTRFS_MAP_GET_READ_MIRRORS);
				5606	}
				5607
				5608	static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
				5609	enum btrfs_map_op op,
				5610	u64 logical, u64 *length,
				5611	struct btrfs_bio **bbio_ret,
				5612	int mirror_num, int need_raid_map)
				5613	{
				5614	struct extent_map *em;
				5615	struct map_lookup *map;
				5616	u64 offset;
				5617	u64 stripe_offset;
				5618	u64 stripe_nr;
				5619	u64 stripe_len;
				5620	u32 stripe_index;
				5621	int i;
				5622	int ret = 0;
				5623	int num_stripes;
				5624	int max_errors = 0;
				5625	int tgtdev_indexes = 0;
				5626	struct btrfs_bio *bbio = NULL;
				5627	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
				5628	int dev_replace_is_ongoing = 0;
				5629	int num_alloc_stripes;
				5630	int patch_the_first_stripe_for_dev_replace = 0;
				5631	u64 physical_to_patch_in_first_stripe = 0;
				5632	u64 raid56_full_stripe_start = (u64)-1;
				5633
				5634	if (op == BTRFS_MAP_DISCARD)
				5635	return __btrfs_map_block_for_discard(fs_info, logical,
				5636	*length, bbio_ret);
				5637
				5638	em = get_chunk_map(fs_info, logical, *length);
				5639	if (IS_ERR(em))
				5640	return PTR_ERR(em);
				5641
				5642	map = em->map_lookup;
				5643	offset = logical - em->start;
				5644
				5645	stripe_len = map->stripe_len;
				5646	stripe_nr = offset;
				5647	/*
				5648	* stripe_nr counts the total number of stripes we have to stride
				5649	* to get to this block
				5650	*/
				5651	stripe_nr = div64_u64(stripe_nr, stripe_len);
				5652
				5653	stripe_offset = stripe_nr * stripe_len;
				5654	if (offset < stripe_offset) {
				5655	btrfs_crit(fs_info,
				5656	"stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu",
				5657	stripe_offset, offset, em->start, logical,
				5658	stripe_len);
				5659	free_extent_map(em);
				5660	return -EINVAL;
				5661	}
				5662
				5663	/* stripe_offset is the offset of this block in its stripe*/
				5664	stripe_offset = offset - stripe_offset;
				5665
				5666	/* if we're here for raid56, we need to know the stripe aligned start */
				5667	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
				5668	unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
				5669	raid56_full_stripe_start = offset;
				5670
				5671	/* allow a write of a full stripe, but make sure we don't
				5672	* allow straddling of stripes
				5673	*/
				5674	raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
				5675	full_stripe_len);
				5676	raid56_full_stripe_start *= full_stripe_len;
				5677	}
				5678
				5679	if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
				5680	u64 max_len;
				5681	/* For writes to RAID[56], allow a full stripeset across all disks.
				5682	For other RAID types and for RAID[56] reads, just allow a single
				5683	stripe (on a single disk). */
				5684	if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
				5685	(op == BTRFS_MAP_WRITE)) {
				5686	max_len = stripe_len * nr_data_stripes(map) -
				5687	(offset - raid56_full_stripe_start);
				5688	} else {
				5689	/* we limit the length of each bio to what fits in a stripe */
				5690	max_len = stripe_len - stripe_offset;
				5691	}
				5692	*length = min_t(u64, em->len - offset, max_len);
				5693	} else {
				5694	*length = em->len - offset;
				5695	}
				5696
				5697	/* This is for when we're called from btrfs_merge_bio_hook() and all
				5698	it cares about is the length */
				5699	if (!bbio_ret)
				5700	goto out;
				5701
				5702	btrfs_dev_replace_lock(dev_replace, 0);
				5703	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
				5704	if (!dev_replace_is_ongoing)
				5705	btrfs_dev_replace_unlock(dev_replace, 0);
				5706	else
				5707	btrfs_dev_replace_set_lock_blocking(dev_replace);
				5708
				5709	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
				5710	!need_full_stripe(op) && dev_replace->tgtdev != NULL) {
				5711	ret = get_extra_mirror_from_replace(fs_info, logical, *length,
				5712	dev_replace->srcdev->devid,
				5713	&mirror_num,
				5714	&physical_to_patch_in_first_stripe);
				5715	if (ret)
				5716	goto out;
				5717	else
				5718	patch_the_first_stripe_for_dev_replace = 1;
				5719	} else if (mirror_num > map->num_stripes) {
				5720	mirror_num = 0;
				5721	}
				5722
				5723	num_stripes = 1;
				5724	stripe_index = 0;
				5725	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
				5726	stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
				5727	&stripe_index);
				5728	if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_GET_READ_MIRRORS)
				5729	mirror_num = 1;
				5730	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
				5731	if (op == BTRFS_MAP_WRITE \|\| op == BTRFS_MAP_GET_READ_MIRRORS)
				5732	num_stripes = map->num_stripes;
				5733	else if (mirror_num)
				5734	stripe_index = mirror_num - 1;
				5735	else {
				5736	stripe_index = find_live_mirror(fs_info, map, 0,
				5737	map->num_stripes,
				5738	current->pid % map->num_stripes,
				5739	dev_replace_is_ongoing);
				5740	mirror_num = stripe_index + 1;
				5741	}
				5742
				5743	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
				5744	if (op == BTRFS_MAP_WRITE \|\| op == BTRFS_MAP_GET_READ_MIRRORS) {
				5745	num_stripes = map->num_stripes;
				5746	} else if (mirror_num) {
				5747	stripe_index = mirror_num - 1;
				5748	} else {
				5749	mirror_num = 1;
				5750	}
				5751
				5752	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
				5753	u32 factor = map->num_stripes / map->sub_stripes;
				5754
				5755	stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
				5756	stripe_index *= map->sub_stripes;
				5757
				5758	if (op == BTRFS_MAP_WRITE \|\| op == BTRFS_MAP_GET_READ_MIRRORS)
				5759	num_stripes = map->sub_stripes;
				5760	else if (mirror_num)
				5761	stripe_index += mirror_num - 1;
				5762	else {
				5763	int old_stripe_index = stripe_index;
				5764	stripe_index = find_live_mirror(fs_info, map,
				5765	stripe_index,
				5766	map->sub_stripes, stripe_index +
				5767	current->pid % map->sub_stripes,
				5768	dev_replace_is_ongoing);
				5769	mirror_num = stripe_index - old_stripe_index + 1;
				5770	}
				5771
				5772	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
				5773	if (need_raid_map &&
				5774	(op == BTRFS_MAP_WRITE \|\| op == BTRFS_MAP_GET_READ_MIRRORS \|\|
				5775	mirror_num > 1)) {
				5776	/* push stripe_nr back to the start of the full stripe */
				5777	stripe_nr = div64_u64(raid56_full_stripe_start,
				5778	stripe_len * nr_data_stripes(map));
				5779
				5780	/* RAID[56] write or recovery. Return all stripes */
				5781	num_stripes = map->num_stripes;
				5782	max_errors = nr_parity_stripes(map);
				5783
				5784	*length = map->stripe_len;
				5785	stripe_index = 0;
				5786	stripe_offset = 0;
				5787	} else {
				5788	/*
				5789	* Mirror #0 or #1 means the original data block.
				5790	* Mirror #2 is RAID5 parity block.
				5791	* Mirror #3 is RAID6 Q block.
				5792	*/
				5793	stripe_nr = div_u64_rem(stripe_nr,
				5794	nr_data_stripes(map), &stripe_index);
				5795	if (mirror_num > 1)
				5796	stripe_index = nr_data_stripes(map) +
				5797	mirror_num - 2;
				5798
				5799	/* We distribute the parity blocks across stripes */
				5800	div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
				5801	&stripe_index);
				5802	if ((op != BTRFS_MAP_WRITE &&
				5803	op != BTRFS_MAP_GET_READ_MIRRORS) &&
				5804	mirror_num <= 1)
				5805	mirror_num = 1;
				5806	}
				5807	} else {
				5808	/*
				5809	* after this, stripe_nr is the number of stripes on this
				5810	* device we have to walk to find the data, and stripe_index is
				5811	* the number of our device in the stripe array
				5812	*/
				5813	stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
				5814	&stripe_index);
				5815	mirror_num = stripe_index + 1;
				5816	}
				5817	if (stripe_index >= map->num_stripes) {
				5818	btrfs_crit(fs_info,
				5819	"stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
				5820	stripe_index, map->num_stripes);
				5821	ret = -EINVAL;
				5822	goto out;
				5823	}
				5824
				5825	num_alloc_stripes = num_stripes;
				5826	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
				5827	if (op == BTRFS_MAP_WRITE)
				5828	num_alloc_stripes <<= 1;
				5829	if (op == BTRFS_MAP_GET_READ_MIRRORS)
				5830	num_alloc_stripes++;
				5831	tgtdev_indexes = num_stripes;
				5832	}
				5833
				5834	bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
				5835	if (!bbio) {
				5836	ret = -ENOMEM;
				5837	goto out;
				5838	}
				5839	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
				5840	bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
				5841
				5842	/* build raid_map */
				5843	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
				5844	(need_full_stripe(op) \|\| mirror_num > 1)) {
				5845	u64 tmp;
				5846	unsigned rot;
				5847
				5848	bbio->raid_map = (u64 )((void )bbio->stripes +
				5849	sizeof(struct btrfs_bio_stripe) *
				5850	num_alloc_stripes +
				5851	sizeof(int) * tgtdev_indexes);
				5852
				5853	/* Work out the disk rotation on this stripe-set */
				5854	div_u64_rem(stripe_nr, num_stripes, &rot);
				5855
				5856	/* Fill in the logical address of each stripe */
				5857	tmp = stripe_nr * nr_data_stripes(map);
				5858	for (i = 0; i < nr_data_stripes(map); i++)
				5859	bbio->raid_map[(i+rot) % num_stripes] =
				5860	em->start + (tmp + i) * map->stripe_len;
				5861
				5862	bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
				5863	if (map->type & BTRFS_BLOCK_GROUP_RAID6)
				5864	bbio->raid_map[(i+rot+1) % num_stripes] =
				5865	RAID6_Q_STRIPE;
				5866	}
				5867
				5868
				5869	for (i = 0; i < num_stripes; i++) {
				5870	bbio->stripes[i].physical =
				5871	map->stripes[stripe_index].physical +
				5872	stripe_offset +
				5873	stripe_nr * map->stripe_len;
				5874	bbio->stripes[i].dev =
				5875	map->stripes[stripe_index].dev;
				5876	stripe_index++;
				5877	}
				5878
				5879	if (need_full_stripe(op))
				5880	max_errors = btrfs_chunk_max_errors(map);
				5881
				5882	if (bbio->raid_map)
				5883	sort_parity_stripes(bbio, num_stripes);
				5884
				5885	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
				5886	need_full_stripe(op)) {
				5887	handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
				5888	&max_errors);
				5889	}
				5890
				5891	*bbio_ret = bbio;
				5892	bbio->map_type = map->type;
				5893	bbio->num_stripes = num_stripes;
				5894	bbio->max_errors = max_errors;
				5895	bbio->mirror_num = mirror_num;
				5896
				5897	/*
				5898	* this is the case that REQ_READ && dev_replace_is_ongoing &&
				5899	* mirror_num == num_stripes + 1 && dev_replace target drive is
				5900	* available as a mirror
				5901	*/
				5902	if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
				5903	WARN_ON(num_stripes > 1);
				5904	bbio->stripes[0].dev = dev_replace->tgtdev;
				5905	bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
				5906	bbio->mirror_num = map->num_stripes + 1;
				5907	}
				5908	out:
				5909	if (dev_replace_is_ongoing) {
				5910	btrfs_dev_replace_clear_lock_blocking(dev_replace);
				5911	btrfs_dev_replace_unlock(dev_replace, 0);
				5912	}
				5913	free_extent_map(em);
				5914	return ret;
				5915	}
				5916
				5917	int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
				5918	u64 logical, u64 *length,
				5919	struct btrfs_bio **bbio_ret, int mirror_num)
				5920	{
				5921	return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
				5922	mirror_num, 0);
				5923	}
				5924
				5925	/* For Scrub/replace */
				5926	int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
				5927	u64 logical, u64 *length,
				5928	struct btrfs_bio **bbio_ret)
				5929	{
				5930	return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
				5931	}
				5932
				5933	int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
				5934	u64 chunk_start, u64 physical, u64 devid,
				5935	u64 *logical, int naddrs, int *stripe_len)
				5936	{
				5937	struct extent_map *em;
				5938	struct map_lookup *map;
				5939	u64 *buf;
				5940	u64 bytenr;
				5941	u64 length;
				5942	u64 stripe_nr;
				5943	u64 rmap_len;
				5944	int i, j, nr = 0;
				5945
				5946	em = get_chunk_map(fs_info, chunk_start, 1);
				5947	if (IS_ERR(em))
				5948	return -EIO;
				5949
				5950	map = em->map_lookup;
				5951	length = em->len;
				5952	rmap_len = map->stripe_len;
				5953
				5954	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
				5955	length = div_u64(length, map->num_stripes / map->sub_stripes);
				5956	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
				5957	length = div_u64(length, map->num_stripes);
				5958	else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
				5959	length = div_u64(length, nr_data_stripes(map));
				5960	rmap_len = map->stripe_len * nr_data_stripes(map);
				5961	}
				5962
				5963	buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
				5964	BUG_ON(!buf); /* -ENOMEM */
				5965
				5966	for (i = 0; i < map->num_stripes; i++) {
				5967	if (devid && map->stripes[i].dev->devid != devid)
				5968	continue;
				5969	if (map->stripes[i].physical > physical \|\|
				5970	map->stripes[i].physical + length <= physical)
				5971	continue;
				5972
				5973	stripe_nr = physical - map->stripes[i].physical;
				5974	stripe_nr = div64_u64(stripe_nr, map->stripe_len);
				5975
				5976	if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
				5977	stripe_nr = stripe_nr * map->num_stripes + i;
				5978	stripe_nr = div_u64(stripe_nr, map->sub_stripes);
				5979	} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
				5980	stripe_nr = stripe_nr * map->num_stripes + i;
				5981	} /* else if RAID[56], multiply by nr_data_stripes().
				5982	* Alternatively, just use rmap_len below instead of
				5983	* map->stripe_len */
				5984
				5985	bytenr = chunk_start + stripe_nr * rmap_len;
				5986	WARN_ON(nr >= map->num_stripes);
				5987	for (j = 0; j < nr; j++) {
				5988	if (buf[j] == bytenr)
				5989	break;
				5990	}
				5991	if (j == nr) {
				5992	WARN_ON(nr >= map->num_stripes);
				5993	buf[nr++] = bytenr;
				5994	}
				5995	}
				5996
				5997	*logical = buf;
				5998	*naddrs = nr;
				5999	*stripe_len = rmap_len;
				6000
				6001	free_extent_map(em);
				6002	return 0;
				6003	}
				6004
				6005	static inline void btrfs_end_bbio(struct btrfs_bio bbio, struct bio bio)
				6006	{
				6007	bio->bi_private = bbio->private;
				6008	bio->bi_end_io = bbio->end_io;
				6009	bio_endio(bio);
				6010
				6011	btrfs_put_bbio(bbio);
				6012	}
				6013
				6014	static void btrfs_end_bio(struct bio *bio)
				6015	{
				6016	struct btrfs_bio *bbio = bio->bi_private;
				6017	int is_orig_bio = 0;
				6018
				6019	if (bio->bi_status) {
				6020	atomic_inc(&bbio->error);
				6021	if (bio->bi_status == BLK_STS_IOERR \|\|
				6022	bio->bi_status == BLK_STS_TARGET) {
				6023	unsigned int stripe_index =
				6024	btrfs_io_bio(bio)->stripe_index;
				6025	struct btrfs_device *dev;
				6026
				6027	BUG_ON(stripe_index >= bbio->num_stripes);
				6028	dev = bbio->stripes[stripe_index].dev;
				6029	if (dev->bdev) {
				6030	if (bio_op(bio) == REQ_OP_WRITE)
				6031	btrfs_dev_stat_inc(dev,
				6032	BTRFS_DEV_STAT_WRITE_ERRS);
				6033	else
				6034	btrfs_dev_stat_inc(dev,
				6035	BTRFS_DEV_STAT_READ_ERRS);
				6036	if (bio->bi_opf & REQ_PREFLUSH)
				6037	btrfs_dev_stat_inc(dev,
				6038	BTRFS_DEV_STAT_FLUSH_ERRS);
				6039	btrfs_dev_stat_print_on_error(dev);
				6040	}
				6041	}
				6042	}
				6043
				6044	if (bio == bbio->orig_bio)
				6045	is_orig_bio = 1;
				6046
				6047	btrfs_bio_counter_dec(bbio->fs_info);
				6048
				6049	if (atomic_dec_and_test(&bbio->stripes_pending)) {
				6050	if (!is_orig_bio) {
				6051	bio_put(bio);
				6052	bio = bbio->orig_bio;
				6053	}
				6054
				6055	btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
				6056	/* only send an error to the higher layers if it is
				6057	* beyond the tolerance of the btrfs bio
				6058	*/
				6059	if (atomic_read(&bbio->error) > bbio->max_errors) {
				6060	bio->bi_status = BLK_STS_IOERR;
				6061	} else {
				6062	/*
				6063	* this bio is actually up to date, we didn't
				6064	* go over the max number of errors
				6065	*/
				6066	bio->bi_status = 0;
				6067	}
				6068
				6069	btrfs_end_bbio(bbio, bio);
				6070	} else if (!is_orig_bio) {
				6071	bio_put(bio);
				6072	}
				6073	}
				6074
				6075	/*
				6076	* see run_scheduled_bios for a description of why bios are collected for
				6077	* async submit.
				6078	*
				6079	* This will add one bio to the pending list for a device and make sure
				6080	* the work struct is scheduled.
				6081	*/
				6082	static noinline void btrfs_schedule_bio(struct btrfs_device *device,
				6083	struct bio *bio)
				6084	{
				6085	struct btrfs_fs_info *fs_info = device->fs_info;
				6086	int should_queue = 1;
				6087	struct btrfs_pending_bios *pending_bios;
				6088
				6089	if (device->missing \|\| !device->bdev) {
				6090	bio_io_error(bio);
				6091	return;
				6092	}
				6093
				6094	/* don't bother with additional async steps for reads, right now */
				6095	if (bio_op(bio) == REQ_OP_READ) {
				6096	bio_get(bio);
				6097	btrfsic_submit_bio(bio);
				6098	bio_put(bio);
				6099	return;
				6100	}
				6101
				6102	/*
				6103	* nr_async_bios allows us to reliably return congestion to the
				6104	* higher layers. Otherwise, the async bio makes it appear we have
				6105	* made progress against dirty pages when we've really just put it
				6106	* on a queue for later
				6107	*/
				6108	atomic_inc(&fs_info->nr_async_bios);
				6109	WARN_ON(bio->bi_next);
				6110	bio->bi_next = NULL;
				6111
				6112	spin_lock(&device->io_lock);
				6113	if (op_is_sync(bio->bi_opf))
				6114	pending_bios = &device->pending_sync_bios;
				6115	else
				6116	pending_bios = &device->pending_bios;
				6117
				6118	if (pending_bios->tail)
				6119	pending_bios->tail->bi_next = bio;
				6120
				6121	pending_bios->tail = bio;
				6122	if (!pending_bios->head)
				6123	pending_bios->head = bio;
				6124	if (device->running_pending)
				6125	should_queue = 0;
				6126
				6127	spin_unlock(&device->io_lock);
				6128
				6129	if (should_queue)
				6130	btrfs_queue_work(fs_info->submit_workers, &device->work);
				6131	}
				6132
				6133	static void submit_stripe_bio(struct btrfs_bio bbio, struct bio bio,
				6134	u64 physical, int dev_nr, int async)
				6135	{
				6136	struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
				6137	struct btrfs_fs_info *fs_info = bbio->fs_info;
				6138
				6139	bio->bi_private = bbio;
				6140	btrfs_io_bio(bio)->stripe_index = dev_nr;
				6141	bio->bi_end_io = btrfs_end_bio;
				6142	bio->bi_iter.bi_sector = physical >> 9;
				6143	#ifdef DEBUG
				6144	{
				6145	struct rcu_string *name;
				6146
				6147	rcu_read_lock();
				6148	name = rcu_dereference(dev->name);
				6149	btrfs_debug(fs_info,
				6150	"btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
				6151	bio_op(bio), bio->bi_opf,
				6152	(u64)bio->bi_iter.bi_sector,
				6153	(u_long)dev->bdev->bd_dev, name->str, dev->devid,
				6154	bio->bi_iter.bi_size);
				6155	rcu_read_unlock();
				6156	}
				6157	#endif
				6158	bio_set_dev(bio, dev->bdev);
				6159
				6160	btrfs_bio_counter_inc_noblocked(fs_info);
				6161
				6162	if (async)
				6163	btrfs_schedule_bio(dev, bio);
				6164	else
				6165	btrfsic_submit_bio(bio);
				6166	}
				6167
				6168	static void bbio_error(struct btrfs_bio bbio, struct bio bio, u64 logical)
				6169	{
				6170	atomic_inc(&bbio->error);
				6171	if (atomic_dec_and_test(&bbio->stripes_pending)) {
				6172	/* Should be the original bio. */
				6173	WARN_ON(bio != bbio->orig_bio);
				6174
				6175	btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
				6176	bio->bi_iter.bi_sector = logical >> 9;
				6177	if (atomic_read(&bbio->error) > bbio->max_errors)
				6178	bio->bi_status = BLK_STS_IOERR;
				6179	else
				6180	bio->bi_status = BLK_STS_OK;
				6181	btrfs_end_bbio(bbio, bio);
				6182	}
				6183	}
				6184
				6185	blk_status_t btrfs_map_bio(struct btrfs_fs_info fs_info, struct bio bio,
				6186	int mirror_num, int async_submit)
				6187	{
				6188	struct btrfs_device *dev;
				6189	struct bio *first_bio = bio;
				6190	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
				6191	u64 length = 0;
				6192	u64 map_length;
				6193	int ret;
				6194	int dev_nr;
				6195	int total_devs;
				6196	struct btrfs_bio *bbio = NULL;
				6197
				6198	length = bio->bi_iter.bi_size;
				6199	map_length = length;
				6200
				6201	btrfs_bio_counter_inc_blocked(fs_info);
				6202	ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
				6203	&map_length, &bbio, mirror_num, 1);
				6204	if (ret) {
				6205	btrfs_bio_counter_dec(fs_info);
				6206	return errno_to_blk_status(ret);
				6207	}
				6208
				6209	total_devs = bbio->num_stripes;
				6210	bbio->orig_bio = first_bio;
				6211	bbio->private = first_bio->bi_private;
				6212	bbio->end_io = first_bio->bi_end_io;
				6213	bbio->fs_info = fs_info;
				6214	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
				6215
				6216	if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
				6217	((bio_op(bio) == REQ_OP_WRITE) \|\| (mirror_num > 1))) {
				6218	/* In this case, map_length has been set to the length of
				6219	a single stripe; not the whole write */
				6220	if (bio_op(bio) == REQ_OP_WRITE) {
				6221	ret = raid56_parity_write(fs_info, bio, bbio,
				6222	map_length);
				6223	} else {
				6224	ret = raid56_parity_recover(fs_info, bio, bbio,
				6225	map_length, mirror_num, 1);
				6226	}
				6227
				6228	btrfs_bio_counter_dec(fs_info);
				6229	return errno_to_blk_status(ret);
				6230	}
				6231
				6232	if (map_length < length) {
				6233	btrfs_crit(fs_info,
				6234	"mapping failed logical %llu bio len %llu len %llu",
				6235	logical, length, map_length);
				6236	BUG();
				6237	}
				6238
				6239	for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
				6240	dev = bbio->stripes[dev_nr].dev;
				6241	if (!dev \|\| !dev->bdev \|\|
				6242	(bio_op(first_bio) == REQ_OP_WRITE && !dev->writeable)) {
				6243	bbio_error(bbio, first_bio, logical);
				6244	continue;
				6245	}
				6246
				6247	if (dev_nr < total_devs - 1)
				6248	bio = btrfs_bio_clone(first_bio);
				6249	else
				6250	bio = first_bio;
				6251
				6252	submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
				6253	dev_nr, async_submit);
				6254	}
				6255	btrfs_bio_counter_dec(fs_info);
				6256	return BLK_STS_OK;
				6257	}
				6258
				6259	struct btrfs_device btrfs_find_device(struct btrfs_fs_info fs_info, u64 devid,
				6260	u8 uuid, u8 fsid)
				6261	{
				6262	struct btrfs_device *device;
				6263	struct btrfs_fs_devices *cur_devices;
				6264
				6265	cur_devices = fs_info->fs_devices;
				6266	while (cur_devices) {
				6267	if (!fsid \|\|
				6268	!memcmp(cur_devices->fsid, fsid, BTRFS_FSID_SIZE)) {
				6269	device = find_device(cur_devices, devid, uuid);
				6270	if (device)
				6271	return device;
				6272	}
				6273	cur_devices = cur_devices->seed;
				6274	}
				6275	return NULL;
				6276	}
				6277
				6278	static struct btrfs_device add_missing_dev(struct btrfs_fs_devices fs_devices,
				6279	u64 devid, u8 *dev_uuid)
				6280	{
				6281	struct btrfs_device *device;
				6282	unsigned int nofs_flag;
				6283
				6284	/*
				6285	* We call this under the chunk_mutex, so we want to use NOFS for this
				6286	* allocation, however we don't want to change btrfs_alloc_device() to
				6287	* always do NOFS because we use it in a lot of other GFP_KERNEL safe
				6288	* places.
				6289	*/
				6290	nofs_flag = memalloc_nofs_save();
				6291	device = btrfs_alloc_device(NULL, &devid, dev_uuid);
				6292	memalloc_nofs_restore(nofs_flag);
				6293	if (IS_ERR(device))
				6294	return NULL;
				6295
				6296	list_add(&device->dev_list, &fs_devices->devices);
				6297	device->fs_devices = fs_devices;
				6298	fs_devices->num_devices++;
				6299
				6300	device->missing = 1;
				6301	fs_devices->missing_devices++;
				6302
				6303	return device;
				6304	}
				6305
				6306	/**
				6307	* btrfs_alloc_device - allocate struct btrfs_device
				6308	* @fs_info: used only for generating a new devid, can be NULL if
				6309	* devid is provided (i.e. @devid != NULL).
				6310	* @devid: a pointer to devid for this device. If NULL a new devid
				6311	* is generated.
				6312	* @uuid: a pointer to UUID for this device. If NULL a new UUID
				6313	* is generated.
				6314	*
				6315	* Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
				6316	* on error. Returned struct is not linked onto any lists and can be
				6317	* destroyed with kfree() right away.
				6318	*/
				6319	struct btrfs_device btrfs_alloc_device(struct btrfs_fs_info fs_info,
				6320	const u64 *devid,
				6321	const u8 *uuid)
				6322	{
				6323	struct btrfs_device *dev;
				6324	u64 tmp;
				6325
				6326	if (WARN_ON(!devid && !fs_info))
				6327	return ERR_PTR(-EINVAL);
				6328
				6329	dev = __alloc_device();
				6330	if (IS_ERR(dev))
				6331	return dev;
				6332
				6333	if (devid)
				6334	tmp = *devid;
				6335	else {
				6336	int ret;
				6337
				6338	ret = find_next_devid(fs_info, &tmp);
				6339	if (ret) {
				6340	kfree(dev);
				6341	return ERR_PTR(ret);
				6342	}
				6343	}
				6344	dev->devid = tmp;
				6345
				6346	if (uuid)
				6347	memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
				6348	else
				6349	generate_random_uuid(dev->uuid);
				6350
				6351	btrfs_init_work(&dev->work, btrfs_submit_helper,
				6352	pending_bios_fn, NULL, NULL);
				6353
				6354	return dev;
				6355	}
				6356
				6357	/* Return -EIO if any error, otherwise return 0. */
				6358	static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
				6359	struct extent_buffer *leaf,
				6360	struct btrfs_chunk *chunk, u64 logical)
				6361	{
				6362	u64 length;
				6363	u64 stripe_len;
				6364	u16 num_stripes;
				6365	u16 sub_stripes;
				6366	u64 type;
				6367	u64 features;
				6368	bool mixed = false;
				6369
				6370	length = btrfs_chunk_length(leaf, chunk);
				6371	stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
				6372	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
				6373	sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
				6374	type = btrfs_chunk_type(leaf, chunk);
				6375
				6376	if (!num_stripes) {
				6377	btrfs_err(fs_info, "invalid chunk num_stripes: %u",
				6378	num_stripes);
				6379	return -EIO;
				6380	}
				6381	if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
				6382	btrfs_err(fs_info, "invalid chunk logical %llu", logical);
				6383	return -EIO;
				6384	}
				6385	if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) {
				6386	btrfs_err(fs_info, "invalid chunk sectorsize %u",
				6387	btrfs_chunk_sector_size(leaf, chunk));
				6388	return -EIO;
				6389	}
				6390	if (!length \|\| !IS_ALIGNED(length, fs_info->sectorsize)) {
				6391	btrfs_err(fs_info, "invalid chunk length %llu", length);
				6392	return -EIO;
				6393	}
				6394	if (!is_power_of_2(stripe_len) \|\| stripe_len != BTRFS_STRIPE_LEN) {
				6395	btrfs_err(fs_info, "invalid chunk stripe length: %llu",
				6396	stripe_len);
				6397	return -EIO;
				6398	}
				6399	if (~(BTRFS_BLOCK_GROUP_TYPE_MASK \| BTRFS_BLOCK_GROUP_PROFILE_MASK) &
				6400	type) {
				6401	btrfs_err(fs_info, "unrecognized chunk type: %llu",
				6402	~(BTRFS_BLOCK_GROUP_TYPE_MASK \|
				6403	BTRFS_BLOCK_GROUP_PROFILE_MASK) &
				6404	btrfs_chunk_type(leaf, chunk));
				6405	return -EIO;
				6406	}
				6407
				6408	if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) {
				6409	btrfs_err(fs_info, "missing chunk type flag: 0x%llx", type);
				6410	return -EIO;
				6411	}
				6412
				6413	if ((type & BTRFS_BLOCK_GROUP_SYSTEM) &&
				6414	(type & (BTRFS_BLOCK_GROUP_METADATA \| BTRFS_BLOCK_GROUP_DATA))) {
				6415	btrfs_err(fs_info,
				6416	"system chunk with data or metadata type: 0x%llx", type);
				6417	return -EIO;
				6418	}
				6419
				6420	features = btrfs_super_incompat_flags(fs_info->super_copy);
				6421	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
				6422	mixed = true;
				6423
				6424	if (!mixed) {
				6425	if ((type & BTRFS_BLOCK_GROUP_METADATA) &&
				6426	(type & BTRFS_BLOCK_GROUP_DATA)) {
				6427	btrfs_err(fs_info,
				6428	"mixed chunk type in non-mixed mode: 0x%llx", type);
				6429	return -EIO;
				6430	}
				6431	}
				6432
				6433	if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) \|\|
				6434	(type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) \|\|
				6435	(type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) \|\|
				6436	(type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) \|\|
				6437	(type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) \|\|
				6438	((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
				6439	num_stripes != 1)) {
				6440	btrfs_err(fs_info,
				6441	"invalid num_stripes:sub_stripes %u:%u for profile %llu",
				6442	num_stripes, sub_stripes,
				6443	type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
				6444	return -EIO;
				6445	}
				6446
				6447	return 0;
				6448	}
				6449
				6450	static int read_one_chunk(struct btrfs_fs_info fs_info, struct btrfs_key key,
				6451	struct extent_buffer *leaf,
				6452	struct btrfs_chunk *chunk)
				6453	{
				6454	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
				6455	struct map_lookup *map;
				6456	struct extent_map *em;
				6457	u64 logical;
				6458	u64 length;
				6459	u64 devid;
				6460	u8 uuid[BTRFS_UUID_SIZE];
				6461	int num_stripes;
				6462	int ret;
				6463	int i;
				6464
				6465	logical = key->offset;
				6466	length = btrfs_chunk_length(leaf, chunk);
				6467	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
				6468
				6469	ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical);
				6470	if (ret)
				6471	return ret;
				6472
				6473	read_lock(&map_tree->map_tree.lock);
				6474	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
				6475	read_unlock(&map_tree->map_tree.lock);
				6476
				6477	/* already mapped? */
				6478	if (em && em->start <= logical && em->start + em->len > logical) {
				6479	free_extent_map(em);
				6480	return 0;
				6481	} else if (em) {
				6482	free_extent_map(em);
				6483	}
				6484
				6485	em = alloc_extent_map();
				6486	if (!em)
				6487	return -ENOMEM;
				6488	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
				6489	if (!map) {
				6490	free_extent_map(em);
				6491	return -ENOMEM;
				6492	}
				6493
				6494	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
				6495	em->map_lookup = map;
				6496	em->start = logical;
				6497	em->len = length;
				6498	em->orig_start = 0;
				6499	em->block_start = 0;
				6500	em->block_len = em->len;
				6501
				6502	map->num_stripes = num_stripes;
				6503	map->io_width = btrfs_chunk_io_width(leaf, chunk);
				6504	map->io_align = btrfs_chunk_io_align(leaf, chunk);
				6505	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
				6506	map->type = btrfs_chunk_type(leaf, chunk);
				6507	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
				6508	for (i = 0; i < num_stripes; i++) {
				6509	map->stripes[i].physical =
				6510	btrfs_stripe_offset_nr(leaf, chunk, i);
				6511	devid = btrfs_stripe_devid_nr(leaf, chunk, i);
				6512	read_extent_buffer(leaf, uuid, (unsigned long)
				6513	btrfs_stripe_dev_uuid_nr(chunk, i),
				6514	BTRFS_UUID_SIZE);
				6515	map->stripes[i].dev = btrfs_find_device(fs_info, devid,
				6516	uuid, NULL);
				6517	if (!map->stripes[i].dev &&
				6518	!btrfs_test_opt(fs_info, DEGRADED)) {
				6519	free_extent_map(em);
				6520	btrfs_report_missing_device(fs_info, devid, uuid);
				6521	return -EIO;
				6522	}
				6523	if (!map->stripes[i].dev) {
				6524	map->stripes[i].dev =
				6525	add_missing_dev(fs_info->fs_devices, devid,
				6526	uuid);
				6527	if (!map->stripes[i].dev) {
				6528	free_extent_map(em);
				6529	return -EIO;
				6530	}
				6531	btrfs_report_missing_device(fs_info, devid, uuid);
				6532	}
				6533	map->stripes[i].dev->in_fs_metadata = 1;
				6534	}
				6535
				6536	write_lock(&map_tree->map_tree.lock);
				6537	ret = add_extent_mapping(&map_tree->map_tree, em, 0);
				6538	write_unlock(&map_tree->map_tree.lock);
				6539	if (ret < 0) {
				6540	btrfs_err(fs_info,
				6541	"failed to add chunk map, start=%llu len=%llu: %d",
				6542	em->start, em->len, ret);
				6543	}
				6544	free_extent_map(em);
				6545
				6546	return ret;
				6547	}
				6548
				6549	static void fill_device_from_item(struct extent_buffer *leaf,
				6550	struct btrfs_dev_item *dev_item,
				6551	struct btrfs_device *device)
				6552	{
				6553	unsigned long ptr;
				6554
				6555	device->devid = btrfs_device_id(leaf, dev_item);
				6556	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
				6557	device->total_bytes = device->disk_total_bytes;
				6558	device->commit_total_bytes = device->disk_total_bytes;
				6559	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
				6560	device->commit_bytes_used = device->bytes_used;
				6561	device->type = btrfs_device_type(leaf, dev_item);
				6562	device->io_align = btrfs_device_io_align(leaf, dev_item);
				6563	device->io_width = btrfs_device_io_width(leaf, dev_item);
				6564	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
				6565	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
				6566	device->is_tgtdev_for_dev_replace = 0;
				6567
				6568	ptr = btrfs_device_uuid(dev_item);
				6569	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
				6570	}
				6571
				6572	static struct btrfs_fs_devices open_seed_devices(struct btrfs_fs_info fs_info,
				6573	u8 *fsid)
				6574	{
				6575	struct btrfs_fs_devices *fs_devices;
				6576	int ret;
				6577
				6578	BUG_ON(!mutex_is_locked(&uuid_mutex));
				6579	ASSERT(fsid);
				6580
				6581	fs_devices = fs_info->fs_devices->seed;
				6582	while (fs_devices) {
				6583	if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
				6584	return fs_devices;
				6585
				6586	fs_devices = fs_devices->seed;
				6587	}
				6588
				6589	fs_devices = find_fsid(fsid);
				6590	if (!fs_devices) {
				6591	if (!btrfs_test_opt(fs_info, DEGRADED))
				6592	return ERR_PTR(-ENOENT);
				6593
				6594	fs_devices = alloc_fs_devices(fsid);
				6595	if (IS_ERR(fs_devices))
				6596	return fs_devices;
				6597
				6598	fs_devices->seeding = 1;
				6599	fs_devices->opened = 1;
				6600	return fs_devices;
				6601	}
				6602
				6603	fs_devices = clone_fs_devices(fs_devices);
				6604	if (IS_ERR(fs_devices))
				6605	return fs_devices;
				6606
				6607	ret = __btrfs_open_devices(fs_devices, FMODE_READ,
				6608	fs_info->bdev_holder);
				6609	if (ret) {
				6610	free_fs_devices(fs_devices);
				6611	fs_devices = ERR_PTR(ret);
				6612	goto out;
				6613	}
				6614
				6615	if (!fs_devices->seeding) {
				6616	__btrfs_close_devices(fs_devices);
				6617	free_fs_devices(fs_devices);
				6618	fs_devices = ERR_PTR(-EINVAL);
				6619	goto out;
				6620	}
				6621
				6622	fs_devices->seed = fs_info->fs_devices->seed;
				6623	fs_info->fs_devices->seed = fs_devices;
				6624	out:
				6625	return fs_devices;
				6626	}
				6627
				6628	static int read_one_dev(struct btrfs_fs_info *fs_info,
				6629	struct extent_buffer *leaf,
				6630	struct btrfs_dev_item *dev_item)
				6631	{
				6632	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				6633	struct btrfs_device *device;
				6634	u64 devid;
				6635	int ret;
				6636	u8 fs_uuid[BTRFS_FSID_SIZE];
				6637	u8 dev_uuid[BTRFS_UUID_SIZE];
				6638
				6639	devid = btrfs_device_id(leaf, dev_item);
				6640	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
				6641	BTRFS_UUID_SIZE);
				6642	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
				6643	BTRFS_FSID_SIZE);
				6644
				6645	if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) {
				6646	fs_devices = open_seed_devices(fs_info, fs_uuid);
				6647	if (IS_ERR(fs_devices))
				6648	return PTR_ERR(fs_devices);
				6649	}
				6650
				6651	device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
				6652	if (!device) {
				6653	if (!btrfs_test_opt(fs_info, DEGRADED)) {
				6654	btrfs_report_missing_device(fs_info, devid, dev_uuid);
				6655	return -EIO;
				6656	}
				6657
				6658	device = add_missing_dev(fs_devices, devid, dev_uuid);
				6659	if (!device)
				6660	return -ENOMEM;
				6661	btrfs_report_missing_device(fs_info, devid, dev_uuid);
				6662	} else {
				6663	if (!device->bdev) {
				6664	btrfs_report_missing_device(fs_info, devid, dev_uuid);
				6665	if (!btrfs_test_opt(fs_info, DEGRADED))
				6666	return -EIO;
				6667	}
				6668
				6669	if(!device->bdev && !device->missing) {
				6670	/*
				6671	* this happens when a device that was properly setup
				6672	* in the device info lists suddenly goes bad.
				6673	* device->bdev is NULL, and so we have to set
				6674	* device->missing to one here
				6675	*/
				6676	device->fs_devices->missing_devices++;
				6677	device->missing = 1;
				6678	}
				6679
				6680	/* Move the device to its own fs_devices */
				6681	if (device->fs_devices != fs_devices) {
				6682	ASSERT(device->missing);
				6683
				6684	list_move(&device->dev_list, &fs_devices->devices);
				6685	device->fs_devices->num_devices--;
				6686	fs_devices->num_devices++;
				6687
				6688	device->fs_devices->missing_devices--;
				6689	fs_devices->missing_devices++;
				6690
				6691	device->fs_devices = fs_devices;
				6692	}
				6693	}
				6694
				6695	if (device->fs_devices != fs_info->fs_devices) {
				6696	BUG_ON(device->writeable);
				6697	if (device->generation !=
				6698	btrfs_device_generation(leaf, dev_item))
				6699	return -EINVAL;
				6700	}
				6701
				6702	fill_device_from_item(leaf, dev_item, device);
				6703	device->in_fs_metadata = 1;
				6704	if (device->writeable && !device->is_tgtdev_for_dev_replace) {
				6705	device->fs_devices->total_rw_bytes += device->total_bytes;
				6706	atomic64_add(device->total_bytes - device->bytes_used,
				6707	&fs_info->free_chunk_space);
				6708	}
				6709	ret = 0;
				6710	return ret;
				6711	}
				6712
				6713	int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
				6714	{
				6715	struct btrfs_root *root = fs_info->tree_root;
				6716	struct btrfs_super_block *super_copy = fs_info->super_copy;
				6717	struct extent_buffer *sb;
				6718	struct btrfs_disk_key *disk_key;
				6719	struct btrfs_chunk *chunk;
				6720	u8 *array_ptr;
				6721	unsigned long sb_array_offset;
				6722	int ret = 0;
				6723	u32 num_stripes;
				6724	u32 array_size;
				6725	u32 len = 0;
				6726	u32 cur_offset;
				6727	u64 type;
				6728	struct btrfs_key key;
				6729
				6730	ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
				6731	/*
				6732	* This will create extent buffer of nodesize, superblock size is
				6733	* fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
				6734	* overallocate but we can keep it as-is, only the first page is used.
				6735	*/
				6736	sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
				6737	if (IS_ERR(sb))
				6738	return PTR_ERR(sb);
				6739	set_extent_buffer_uptodate(sb);
				6740	btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
				6741	/*
				6742	* The sb extent buffer is artificial and just used to read the system array.
				6743	* set_extent_buffer_uptodate() call does not properly mark all it's
				6744	* pages up-to-date when the page is larger: extent does not cover the
				6745	* whole page and consequently check_page_uptodate does not find all
				6746	* the page's extents up-to-date (the hole beyond sb),
				6747	* write_extent_buffer then triggers a WARN_ON.
				6748	*
				6749	* Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
				6750	* but sb spans only this function. Add an explicit SetPageUptodate call
				6751	* to silence the warning eg. on PowerPC 64.
				6752	*/
				6753	if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
				6754	SetPageUptodate(sb->pages[0]);
				6755
				6756	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
				6757	array_size = btrfs_super_sys_array_size(super_copy);
				6758
				6759	array_ptr = super_copy->sys_chunk_array;
				6760	sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
				6761	cur_offset = 0;
				6762
				6763	while (cur_offset < array_size) {
				6764	disk_key = (struct btrfs_disk_key *)array_ptr;
				6765	len = sizeof(*disk_key);
				6766	if (cur_offset + len > array_size)
				6767	goto out_short_read;
				6768
				6769	btrfs_disk_key_to_cpu(&key, disk_key);
				6770
				6771	array_ptr += len;
				6772	sb_array_offset += len;
				6773	cur_offset += len;
				6774
				6775	if (key.type == BTRFS_CHUNK_ITEM_KEY) {
				6776	chunk = (struct btrfs_chunk *)sb_array_offset;
				6777	/*
				6778	* At least one btrfs_chunk with one stripe must be
				6779	* present, exact stripe count check comes afterwards
				6780	*/
				6781	len = btrfs_chunk_item_size(1);
				6782	if (cur_offset + len > array_size)
				6783	goto out_short_read;
				6784
				6785	num_stripes = btrfs_chunk_num_stripes(sb, chunk);
				6786	if (!num_stripes) {
				6787	btrfs_err(fs_info,
				6788	"invalid number of stripes %u in sys_array at offset %u",
				6789	num_stripes, cur_offset);
				6790	ret = -EIO;
				6791	break;
				6792	}
				6793
				6794	type = btrfs_chunk_type(sb, chunk);
				6795	if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
				6796	btrfs_err(fs_info,
				6797	"invalid chunk type %llu in sys_array at offset %u",
				6798	type, cur_offset);
				6799	ret = -EIO;
				6800	break;
				6801	}
				6802
				6803	len = btrfs_chunk_item_size(num_stripes);
				6804	if (cur_offset + len > array_size)
				6805	goto out_short_read;
				6806
				6807	ret = read_one_chunk(fs_info, &key, sb, chunk);
				6808	if (ret)
				6809	break;
				6810	} else {
				6811	btrfs_err(fs_info,
				6812	"unexpected item type %u in sys_array at offset %u",
				6813	(u32)key.type, cur_offset);
				6814	ret = -EIO;
				6815	break;
				6816	}
				6817	array_ptr += len;
				6818	sb_array_offset += len;
				6819	cur_offset += len;
				6820	}
				6821	clear_extent_buffer_uptodate(sb);
				6822	free_extent_buffer_stale(sb);
				6823	return ret;
				6824
				6825	out_short_read:
				6826	btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
				6827	len, cur_offset);
				6828	clear_extent_buffer_uptodate(sb);
				6829	free_extent_buffer_stale(sb);
				6830	return -EIO;
				6831	}
				6832
				6833	void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, u64 devid,
				6834	u8 *uuid)
				6835	{
				6836	btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", devid, uuid);
				6837	}
				6838
				6839	/*
				6840	* Check if all chunks in the fs are OK for read-write degraded mount
				6841	*
				6842	* Return true if all chunks meet the minimal RW mount requirements.
				6843	* Return false if any chunk doesn't meet the minimal RW mount requirements.
				6844	*/
				6845	bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info)
				6846	{
				6847	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
				6848	struct extent_map *em;
				6849	u64 next_start = 0;
				6850	bool ret = true;
				6851
				6852	read_lock(&map_tree->map_tree.lock);
				6853	em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1);
				6854	read_unlock(&map_tree->map_tree.lock);
				6855	/* No chunk at all? Return false anyway */
				6856	if (!em) {
				6857	ret = false;
				6858	goto out;
				6859	}
				6860	while (em) {
				6861	struct map_lookup *map;
				6862	int missing = 0;
				6863	int max_tolerated;
				6864	int i;
				6865
				6866	map = em->map_lookup;
				6867	max_tolerated =
				6868	btrfs_get_num_tolerated_disk_barrier_failures(
				6869	map->type);
				6870	for (i = 0; i < map->num_stripes; i++) {
				6871	struct btrfs_device *dev = map->stripes[i].dev;
				6872
				6873	if (!dev \|\| !dev->bdev \|\| dev->missing \|\|
				6874	dev->last_flush_error)
				6875	missing++;
				6876	}
				6877	if (missing > max_tolerated) {
				6878	btrfs_warn(fs_info,
				6879	"chunk %llu missing %d devices, max tolerance is %d for writeable mount",
				6880	em->start, missing, max_tolerated);
				6881	free_extent_map(em);
				6882	ret = false;
				6883	goto out;
				6884	}
				6885	next_start = extent_map_end(em);
				6886	free_extent_map(em);
				6887
				6888	read_lock(&map_tree->map_tree.lock);
				6889	em = lookup_extent_mapping(&map_tree->map_tree, next_start,
				6890	(u64)(-1) - next_start);
				6891	read_unlock(&map_tree->map_tree.lock);
				6892	}
				6893	out:
				6894	return ret;
				6895	}
				6896
				6897	int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
				6898	{
				6899	struct btrfs_root *root = fs_info->chunk_root;
				6900	struct btrfs_path *path;
				6901	struct extent_buffer *leaf;
				6902	struct btrfs_key key;
				6903	struct btrfs_key found_key;
				6904	int ret;
				6905	int slot;
				6906	u64 total_dev = 0;
				6907
				6908	path = btrfs_alloc_path();
				6909	if (!path)
				6910	return -ENOMEM;
				6911
				6912	mutex_lock(&uuid_mutex);
				6913	mutex_lock(&fs_info->chunk_mutex);
				6914
				6915	/*
				6916	* It is possible for mount and umount to race in such a way that
				6917	* we execute this code path, but open_fs_devices failed to clear
				6918	* total_rw_bytes. We certainly want it cleared before reading the
				6919	* device items, so clear it here.
				6920	*/
				6921	fs_info->fs_devices->total_rw_bytes = 0;
				6922
				6923	/*
				6924	* Read all device items, and then all the chunk items. All
				6925	* device items are found before any chunk item (their object id
				6926	* is smaller than the lowest possible object id for a chunk
				6927	* item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
				6928	*/
				6929	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
				6930	key.offset = 0;
				6931	key.type = 0;
				6932	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				6933	if (ret < 0)
				6934	goto error;
				6935	while (1) {
				6936	leaf = path->nodes[0];
				6937	slot = path->slots[0];
				6938	if (slot >= btrfs_header_nritems(leaf)) {
				6939	ret = btrfs_next_leaf(root, path);
				6940	if (ret == 0)
				6941	continue;
				6942	if (ret < 0)
				6943	goto error;
				6944	break;
				6945	}
				6946	btrfs_item_key_to_cpu(leaf, &found_key, slot);
				6947	if (found_key.type == BTRFS_DEV_ITEM_KEY) {
				6948	struct btrfs_dev_item *dev_item;
				6949	dev_item = btrfs_item_ptr(leaf, slot,
				6950	struct btrfs_dev_item);
				6951	ret = read_one_dev(fs_info, leaf, dev_item);
				6952	if (ret)
				6953	goto error;
				6954	total_dev++;
				6955	} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
				6956	struct btrfs_chunk *chunk;
				6957	chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
				6958	ret = read_one_chunk(fs_info, &found_key, leaf, chunk);
				6959	if (ret)
				6960	goto error;
				6961	}
				6962	path->slots[0]++;
				6963	}
				6964
				6965	/*
				6966	* After loading chunk tree, we've got all device information,
				6967	* do another round of validation checks.
				6968	*/
				6969	if (total_dev != fs_info->fs_devices->total_devices) {
				6970	btrfs_err(fs_info,
				6971	"super_num_devices %llu mismatch with num_devices %llu found here",
				6972	btrfs_super_num_devices(fs_info->super_copy),
				6973	total_dev);
				6974	ret = -EINVAL;
				6975	goto error;
				6976	}
				6977	if (btrfs_super_total_bytes(fs_info->super_copy) <
				6978	fs_info->fs_devices->total_rw_bytes) {
				6979	btrfs_err(fs_info,
				6980	"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
				6981	btrfs_super_total_bytes(fs_info->super_copy),
				6982	fs_info->fs_devices->total_rw_bytes);
				6983	ret = -EINVAL;
				6984	goto error;
				6985	}
				6986	ret = 0;
				6987	error:
				6988	mutex_unlock(&fs_info->chunk_mutex);
				6989	mutex_unlock(&uuid_mutex);
				6990
				6991	btrfs_free_path(path);
				6992	return ret;
				6993	}
				6994
				6995	void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
				6996	{
				6997	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				6998	struct btrfs_device *device;
				6999
				7000	while (fs_devices) {
				7001	mutex_lock(&fs_devices->device_list_mutex);
				7002	list_for_each_entry(device, &fs_devices->devices, dev_list)
				7003	device->fs_info = fs_info;
				7004	mutex_unlock(&fs_devices->device_list_mutex);
				7005
				7006	fs_devices = fs_devices->seed;
				7007	}
				7008	}
				7009
				7010	static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
				7011	{
				7012	int i;
				7013
				7014	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
				7015	btrfs_dev_stat_reset(dev, i);
				7016	}
				7017
				7018	int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
				7019	{
				7020	struct btrfs_key key;
				7021	struct btrfs_key found_key;
				7022	struct btrfs_root *dev_root = fs_info->dev_root;
				7023	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				7024	struct extent_buffer *eb;
				7025	int slot;
				7026	int ret = 0;
				7027	struct btrfs_device *device;
				7028	struct btrfs_path *path = NULL;
				7029	int i;
				7030
				7031	path = btrfs_alloc_path();
				7032	if (!path) {
				7033	ret = -ENOMEM;
				7034	goto out;
				7035	}
				7036
				7037	mutex_lock(&fs_devices->device_list_mutex);
				7038	list_for_each_entry(device, &fs_devices->devices, dev_list) {
				7039	int item_size;
				7040	struct btrfs_dev_stats_item *ptr;
				7041
				7042	key.objectid = BTRFS_DEV_STATS_OBJECTID;
				7043	key.type = BTRFS_PERSISTENT_ITEM_KEY;
				7044	key.offset = device->devid;
				7045	ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
				7046	if (ret) {
				7047	__btrfs_reset_dev_stats(device);
				7048	device->dev_stats_valid = 1;
				7049	btrfs_release_path(path);
				7050	continue;
				7051	}
				7052	slot = path->slots[0];
				7053	eb = path->nodes[0];
				7054	btrfs_item_key_to_cpu(eb, &found_key, slot);
				7055	item_size = btrfs_item_size_nr(eb, slot);
				7056
				7057	ptr = btrfs_item_ptr(eb, slot,
				7058	struct btrfs_dev_stats_item);
				7059
				7060	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
				7061	if (item_size >= (1 + i) * sizeof(__le64))
				7062	btrfs_dev_stat_set(device, i,
				7063	btrfs_dev_stats_value(eb, ptr, i));
				7064	else
				7065	btrfs_dev_stat_reset(device, i);
				7066	}
				7067
				7068	device->dev_stats_valid = 1;
				7069	btrfs_dev_stat_print_on_load(device);
				7070	btrfs_release_path(path);
				7071	}
				7072	mutex_unlock(&fs_devices->device_list_mutex);
				7073
				7074	out:
				7075	btrfs_free_path(path);
				7076	return ret < 0 ? ret : 0;
				7077	}
				7078
				7079	static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				7080	struct btrfs_fs_info *fs_info,
				7081	struct btrfs_device *device)
				7082	{
				7083	struct btrfs_root *dev_root = fs_info->dev_root;
				7084	struct btrfs_path *path;
				7085	struct btrfs_key key;
				7086	struct extent_buffer *eb;
				7087	struct btrfs_dev_stats_item *ptr;
				7088	int ret;
				7089	int i;
				7090
				7091	key.objectid = BTRFS_DEV_STATS_OBJECTID;
				7092	key.type = BTRFS_PERSISTENT_ITEM_KEY;
				7093	key.offset = device->devid;
				7094
				7095	path = btrfs_alloc_path();
				7096	if (!path)
				7097	return -ENOMEM;
				7098	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
				7099	if (ret < 0) {
				7100	btrfs_warn_in_rcu(fs_info,
				7101	"error %d while searching for dev_stats item for device %s",
				7102	ret, rcu_str_deref(device->name));
				7103	goto out;
				7104	}
				7105
				7106	if (ret == 0 &&
				7107	btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
				7108	/* need to delete old one and insert a new one */
				7109	ret = btrfs_del_item(trans, dev_root, path);
				7110	if (ret != 0) {
				7111	btrfs_warn_in_rcu(fs_info,
				7112	"delete too small dev_stats item for device %s failed %d",
				7113	rcu_str_deref(device->name), ret);
				7114	goto out;
				7115	}
				7116	ret = 1;
				7117	}
				7118
				7119	if (ret == 1) {
				7120	/* need to insert a new item */
				7121	btrfs_release_path(path);
				7122	ret = btrfs_insert_empty_item(trans, dev_root, path,
				7123	&key, sizeof(*ptr));
				7124	if (ret < 0) {
				7125	btrfs_warn_in_rcu(fs_info,
				7126	"insert dev_stats item for device %s failed %d",
				7127	rcu_str_deref(device->name), ret);
				7128	goto out;
				7129	}
				7130	}
				7131
				7132	eb = path->nodes[0];
				7133	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
				7134	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
				7135	btrfs_set_dev_stats_value(eb, ptr, i,
				7136	btrfs_dev_stat_read(device, i));
				7137	btrfs_mark_buffer_dirty(eb);
				7138
				7139	out:
				7140	btrfs_free_path(path);
				7141	return ret;
				7142	}
				7143
				7144	/*
				7145	* called from commit_transaction. Writes all changed device stats to disk.
				7146	*/
				7147	int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
				7148	struct btrfs_fs_info *fs_info)
				7149	{
				7150	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				7151	struct btrfs_device *device;
				7152	int stats_cnt;
				7153	int ret = 0;
				7154
				7155	mutex_lock(&fs_devices->device_list_mutex);
				7156	list_for_each_entry(device, &fs_devices->devices, dev_list) {
				7157	stats_cnt = atomic_read(&device->dev_stats_ccnt);
				7158	if (!device->dev_stats_valid \|\| stats_cnt == 0)
				7159	continue;
				7160
				7161
				7162	/*
				7163	* There is a LOAD-LOAD control dependency between the value of
				7164	* dev_stats_ccnt and updating the on-disk values which requires
				7165	* reading the in-memory counters. Such control dependencies
				7166	* require explicit read memory barriers.
				7167	*
				7168	* This memory barriers pairs with smp_mb__before_atomic in
				7169	* btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
				7170	* barrier implied by atomic_xchg in
				7171	* btrfs_dev_stats_read_and_reset
				7172	*/
				7173	smp_rmb();
				7174
				7175	ret = update_dev_stat_item(trans, fs_info, device);
				7176	if (!ret)
				7177	atomic_sub(stats_cnt, &device->dev_stats_ccnt);
				7178	}
				7179	mutex_unlock(&fs_devices->device_list_mutex);
				7180
				7181	return ret;
				7182	}
				7183
				7184	void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
				7185	{
				7186	btrfs_dev_stat_inc(dev, index);
				7187	btrfs_dev_stat_print_on_error(dev);
				7188	}
				7189
				7190	static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
				7191	{
				7192	if (!dev->dev_stats_valid)
				7193	return;
				7194	btrfs_err_rl_in_rcu(dev->fs_info,
				7195	"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
				7196	rcu_str_deref(dev->name),
				7197	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
				7198	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
				7199	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
				7200	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
				7201	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
				7202	}
				7203
				7204	static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
				7205	{
				7206	int i;
				7207
				7208	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
				7209	if (btrfs_dev_stat_read(dev, i) != 0)
				7210	break;
				7211	if (i == BTRFS_DEV_STAT_VALUES_MAX)
				7212	return; /* all values == 0, suppress message */
				7213
				7214	btrfs_info_in_rcu(dev->fs_info,
				7215	"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
				7216	rcu_str_deref(dev->name),
				7217	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
				7218	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
				7219	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
				7220	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
				7221	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
				7222	}
				7223
				7224	int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
				7225	struct btrfs_ioctl_get_dev_stats *stats)
				7226	{
				7227	struct btrfs_device *dev;
				7228	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				7229	int i;
				7230
				7231	mutex_lock(&fs_devices->device_list_mutex);
				7232	dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL);
				7233	mutex_unlock(&fs_devices->device_list_mutex);
				7234
				7235	if (!dev) {
				7236	btrfs_warn(fs_info, "get dev_stats failed, device not found");
				7237	return -ENODEV;
				7238	} else if (!dev->dev_stats_valid) {
				7239	btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
				7240	return -ENODEV;
				7241	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
				7242	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
				7243	if (stats->nr_items > i)
				7244	stats->values[i] =
				7245	btrfs_dev_stat_read_and_reset(dev, i);
				7246	else
				7247	btrfs_dev_stat_reset(dev, i);
				7248	}
				7249	btrfs_info(fs_info, "device stats zeroed by %s (%d)",
				7250	current->comm, task_pid_nr(current));
				7251	} else {
				7252	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
				7253	if (stats->nr_items > i)
				7254	stats->values[i] = btrfs_dev_stat_read(dev, i);
				7255	}
				7256	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
				7257	stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
				7258	return 0;
				7259	}
				7260
				7261	void btrfs_scratch_superblocks(struct block_device bdev, const char device_path)
				7262	{
				7263	struct buffer_head *bh;
				7264	struct btrfs_super_block *disk_super;
				7265	int copy_num;
				7266
				7267	if (!bdev)
				7268	return;
				7269
				7270	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
				7271	copy_num++) {
				7272
				7273	if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
				7274	continue;
				7275
				7276	disk_super = (struct btrfs_super_block *)bh->b_data;
				7277
				7278	memset(&disk_super->magic, 0, sizeof(disk_super->magic));
				7279	set_buffer_dirty(bh);
				7280	sync_dirty_buffer(bh);
				7281	brelse(bh);
				7282	}
				7283
				7284	/* Notify udev that device has changed */
				7285	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
				7286
				7287	/* Update ctime/mtime for device path for libblkid */
				7288	update_dev_time(device_path);
				7289	}
				7290
				7291	/*
				7292	* Update the size of all devices, which is used for writing out the
				7293	* super blocks.
				7294	*/
				7295	void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
				7296	{
				7297	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				7298	struct btrfs_device curr, next;
				7299
				7300	if (list_empty(&fs_devices->resized_devices))
				7301	return;
				7302
				7303	mutex_lock(&fs_devices->device_list_mutex);
				7304	mutex_lock(&fs_info->chunk_mutex);
				7305	list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
				7306	resized_list) {
				7307	list_del_init(&curr->resized_list);
				7308	curr->commit_total_bytes = curr->disk_total_bytes;
				7309	}
				7310	mutex_unlock(&fs_info->chunk_mutex);
				7311	mutex_unlock(&fs_devices->device_list_mutex);
				7312	}
				7313
				7314	/* Must be invoked during the transaction commit */
				7315	void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info,
				7316	struct btrfs_transaction *transaction)
				7317	{
				7318	struct extent_map *em;
				7319	struct map_lookup *map;
				7320	struct btrfs_device *dev;
				7321	int i;
				7322
				7323	if (list_empty(&transaction->pending_chunks))
				7324	return;
				7325
				7326	/* In order to kick the device replace finish process */
				7327	mutex_lock(&fs_info->chunk_mutex);
				7328	list_for_each_entry(em, &transaction->pending_chunks, list) {
				7329	map = em->map_lookup;
				7330
				7331	for (i = 0; i < map->num_stripes; i++) {
				7332	dev = map->stripes[i].dev;
				7333	dev->commit_bytes_used = dev->bytes_used;
				7334	dev->has_pending_chunks = false;
				7335	}
				7336	}
				7337	mutex_unlock(&fs_info->chunk_mutex);
				7338	}
				7339
				7340	void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
				7341	{
				7342	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				7343	while (fs_devices) {
				7344	fs_devices->fs_info = fs_info;
				7345	fs_devices = fs_devices->seed;
				7346	}
				7347	}
				7348
				7349	void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
				7350	{
				7351	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				7352	while (fs_devices) {
				7353	fs_devices->fs_info = NULL;
				7354	fs_devices = fs_devices->seed;
				7355	}
				7356	}