Blame - src/kernel/linux/v4.19/fs/btrfs/volumes.c - T800

blob: 5bbcdcff68a9eb26c9e016bb3de2f75856d1d858 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Copyright (C) 2007 Oracle. All rights reserved.
				4	*/
				5
				6	#include <linux/sched.h>
				7	#include <linux/bio.h>
				8	#include <linux/slab.h>
				9	#include <linux/buffer_head.h>
				10	#include <linux/blkdev.h>
				11	#include <linux/ratelimit.h>
				12	#include <linux/kthread.h>
				13	#include <linux/raid/pq.h>
				14	#include <linux/semaphore.h>
				15	#include <linux/uuid.h>
				16	#include <linux/list_sort.h>
				17	#include "ctree.h"
				18	#include "extent_map.h"
				19	#include "disk-io.h"
				20	#include "transaction.h"
				21	#include "print-tree.h"
				22	#include "volumes.h"
				23	#include "raid56.h"
				24	#include "async-thread.h"
				25	#include "check-integrity.h"
				26	#include "rcu-string.h"
				27	#include "math.h"
				28	#include "dev-replace.h"
				29	#include "sysfs.h"
				30
				31	const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
				32	[BTRFS_RAID_RAID10] = {
				33	.sub_stripes = 2,
				34	.dev_stripes = 1,
				35	.devs_max = 0, /* 0 == as many as possible */
				36	.devs_min = 4,
				37	.tolerated_failures = 1,
				38	.devs_increment = 2,
				39	.ncopies = 2,
				40	.raid_name = "raid10",
				41	.bg_flag = BTRFS_BLOCK_GROUP_RAID10,
				42	.mindev_error = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
				43	},
				44	[BTRFS_RAID_RAID1] = {
				45	.sub_stripes = 1,
				46	.dev_stripes = 1,
				47	.devs_max = 2,
				48	.devs_min = 2,
				49	.tolerated_failures = 1,
				50	.devs_increment = 2,
				51	.ncopies = 2,
				52	.raid_name = "raid1",
				53	.bg_flag = BTRFS_BLOCK_GROUP_RAID1,
				54	.mindev_error = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
				55	},
				56	[BTRFS_RAID_DUP] = {
				57	.sub_stripes = 1,
				58	.dev_stripes = 2,
				59	.devs_max = 1,
				60	.devs_min = 1,
				61	.tolerated_failures = 0,
				62	.devs_increment = 1,
				63	.ncopies = 2,
				64	.raid_name = "dup",
				65	.bg_flag = BTRFS_BLOCK_GROUP_DUP,
				66	.mindev_error = 0,
				67	},
				68	[BTRFS_RAID_RAID0] = {
				69	.sub_stripes = 1,
				70	.dev_stripes = 1,
				71	.devs_max = 0,
				72	.devs_min = 2,
				73	.tolerated_failures = 0,
				74	.devs_increment = 1,
				75	.ncopies = 1,
				76	.raid_name = "raid0",
				77	.bg_flag = BTRFS_BLOCK_GROUP_RAID0,
				78	.mindev_error = 0,
				79	},
				80	[BTRFS_RAID_SINGLE] = {
				81	.sub_stripes = 1,
				82	.dev_stripes = 1,
				83	.devs_max = 1,
				84	.devs_min = 1,
				85	.tolerated_failures = 0,
				86	.devs_increment = 1,
				87	.ncopies = 1,
				88	.raid_name = "single",
				89	.bg_flag = 0,
				90	.mindev_error = 0,
				91	},
				92	[BTRFS_RAID_RAID5] = {
				93	.sub_stripes = 1,
				94	.dev_stripes = 1,
				95	.devs_max = 0,
				96	.devs_min = 2,
				97	.tolerated_failures = 1,
				98	.devs_increment = 1,
				99	.ncopies = 1,
				100	.raid_name = "raid5",
				101	.bg_flag = BTRFS_BLOCK_GROUP_RAID5,
				102	.mindev_error = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
				103	},
				104	[BTRFS_RAID_RAID6] = {
				105	.sub_stripes = 1,
				106	.dev_stripes = 1,
				107	.devs_max = 0,
				108	.devs_min = 3,
				109	.tolerated_failures = 2,
				110	.devs_increment = 1,
				111	.ncopies = 1,
				112	.raid_name = "raid6",
				113	.bg_flag = BTRFS_BLOCK_GROUP_RAID6,
				114	.mindev_error = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
				115	},
				116	};
				117
				118	const char *get_raid_name(enum btrfs_raid_types type)
				119	{
				120	if (type >= BTRFS_NR_RAID_TYPES)
				121	return NULL;
				122
				123	return btrfs_raid_array[type].raid_name;
				124	}
				125
				126	static int init_first_rw_device(struct btrfs_trans_handle *trans,
				127	struct btrfs_fs_info *fs_info);
				128	static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
				129	static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
				130	static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
				131	static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
				132	static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
				133	enum btrfs_map_op op,
				134	u64 logical, u64 *length,
				135	struct btrfs_bio **bbio_ret,
				136	int mirror_num, int need_raid_map);
				137
				138	/*
				139	* Device locking
				140	* ==============
				141	*
				142	* There are several mutexes that protect manipulation of devices and low-level
				143	* structures like chunks but not block groups, extents or files
				144	*
				145	* uuid_mutex (global lock)
				146	* ------------------------
				147	* protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
				148	* the SCAN_DEV ioctl registration or from mount either implicitly (the first
				149	* device) or requested by the device= mount option
				150	*
				151	* the mutex can be very coarse and can cover long-running operations
				152	*
				153	* protects: updates to fs_devices counters like missing devices, rw devices,
				154	* seeding, structure cloning, opening/closing devices at mount/umount time
				155	*
				156	* global::fs_devs - add, remove, updates to the global list
				157	*
				158	* does not protect: manipulation of the fs_devices::devices list!
				159	*
				160	* btrfs_device::name - renames (write side), read is RCU
				161	*
				162	* fs_devices::device_list_mutex (per-fs, with RCU)
				163	* ------------------------------------------------
				164	* protects updates to fs_devices::devices, ie. adding and deleting
				165	*
				166	* simple list traversal with read-only actions can be done with RCU protection
				167	*
				168	* may be used to exclude some operations from running concurrently without any
				169	* modifications to the list (see write_all_supers)
				170	*
				171	* balance_mutex
				172	* -------------
				173	* protects balance structures (status, state) and context accessed from
				174	* several places (internally, ioctl)
				175	*
				176	* chunk_mutex
				177	* -----------
				178	* protects chunks, adding or removing during allocation, trim or when a new
				179	* device is added/removed
				180	*
				181	* cleaner_mutex
				182	* -------------
				183	* a big lock that is held by the cleaner thread and prevents running subvolume
				184	* cleaning together with relocation or delayed iputs
				185	*
				186	*
				187	* Lock nesting
				188	* ============
				189	*
				190	* uuid_mutex
				191	* volume_mutex
				192	* device_list_mutex
				193	* chunk_mutex
				194	* balance_mutex
				195	*
				196	*
				197	* Exclusive operations, BTRFS_FS_EXCL_OP
				198	* ======================================
				199	*
				200	* Maintains the exclusivity of the following operations that apply to the
				201	* whole filesystem and cannot run in parallel.
				202	*
				203	* - Balance (*)
				204	* - Device add
				205	* - Device remove
				206	* - Device replace (*)
				207	* - Resize
				208	*
				209	* The device operations (as above) can be in one of the following states:
				210	*
				211	* - Running state
				212	* - Paused state
				213	* - Completed state
				214	*
				215	* Only device operations marked with (*) can go into the Paused state for the
				216	* following reasons:
				217	*
				218	* - ioctl (only Balance can be Paused through ioctl)
				219	* - filesystem remounted as read-only
				220	* - filesystem unmounted and mounted as read-only
				221	* - system power-cycle and filesystem mounted as read-only
				222	* - filesystem or device errors leading to forced read-only
				223	*
				224	* BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
				225	* During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
				226	* A device operation in Paused or Running state can be canceled or resumed
				227	* either by ioctl (Balance only) or when remounted as read-write.
				228	* BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
				229	* completed.
				230	*/
				231
				232	DEFINE_MUTEX(uuid_mutex);
				233	static LIST_HEAD(fs_uuids);
				234	struct list_head *btrfs_get_fs_uuids(void)
				235	{
				236	return &fs_uuids;
				237	}
				238
				239	/*
				240	* alloc_fs_devices - allocate struct btrfs_fs_devices
				241	* @fsid: if not NULL, copy the uuid to fs_devices::fsid
				242	*
				243	* Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
				244	* The returned struct is not linked onto any lists and can be destroyed with
				245	* kfree() right away.
				246	*/
				247	static struct btrfs_fs_devices alloc_fs_devices(const u8 fsid)
				248	{
				249	struct btrfs_fs_devices *fs_devs;
				250
				251	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
				252	if (!fs_devs)
				253	return ERR_PTR(-ENOMEM);
				254
				255	mutex_init(&fs_devs->device_list_mutex);
				256
				257	INIT_LIST_HEAD(&fs_devs->devices);
				258	INIT_LIST_HEAD(&fs_devs->resized_devices);
				259	INIT_LIST_HEAD(&fs_devs->alloc_list);
				260	INIT_LIST_HEAD(&fs_devs->fs_list);
				261	if (fsid)
				262	memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
				263
				264	return fs_devs;
				265	}
				266
				267	void btrfs_free_device(struct btrfs_device *device)
				268	{
				269	rcu_string_free(device->name);
				270	bio_put(device->flush_bio);
				271	kfree(device);
				272	}
				273
				274	static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
				275	{
				276	struct btrfs_device *device;
				277	WARN_ON(fs_devices->opened);
				278	while (!list_empty(&fs_devices->devices)) {
				279	device = list_entry(fs_devices->devices.next,
				280	struct btrfs_device, dev_list);
				281	list_del(&device->dev_list);
				282	btrfs_free_device(device);
				283	}
				284	kfree(fs_devices);
				285	}
				286
				287	static void btrfs_kobject_uevent(struct block_device *bdev,
				288	enum kobject_action action)
				289	{
				290	int ret;
				291
				292	ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
				293	if (ret)
				294	pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
				295	action,
				296	kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
				297	&disk_to_dev(bdev->bd_disk)->kobj);
				298	}
				299
				300	void __exit btrfs_cleanup_fs_uuids(void)
				301	{
				302	struct btrfs_fs_devices *fs_devices;
				303
				304	while (!list_empty(&fs_uuids)) {
				305	fs_devices = list_entry(fs_uuids.next,
				306	struct btrfs_fs_devices, fs_list);
				307	list_del(&fs_devices->fs_list);
				308	free_fs_devices(fs_devices);
				309	}
				310	}
				311
				312	/*
				313	* Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
				314	* Returned struct is not linked onto any lists and must be destroyed using
				315	* btrfs_free_device.
				316	*/
				317	static struct btrfs_device *__alloc_device(void)
				318	{
				319	struct btrfs_device *dev;
				320
				321	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
				322	if (!dev)
				323	return ERR_PTR(-ENOMEM);
				324
				325	/*
				326	* Preallocate a bio that's always going to be used for flushing device
				327	* barriers and matches the device lifespan
				328	*/
				329	dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
				330	if (!dev->flush_bio) {
				331	kfree(dev);
				332	return ERR_PTR(-ENOMEM);
				333	}
				334
				335	INIT_LIST_HEAD(&dev->dev_list);
				336	INIT_LIST_HEAD(&dev->dev_alloc_list);
				337	INIT_LIST_HEAD(&dev->resized_list);
				338
				339	spin_lock_init(&dev->io_lock);
				340
				341	atomic_set(&dev->reada_in_flight, 0);
				342	atomic_set(&dev->dev_stats_ccnt, 0);
				343	btrfs_device_data_ordered_init(dev);
				344	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
				345	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
				346
				347	return dev;
				348	}
				349
				350	/*
				351	* Find a device specified by @devid or @uuid in the list of @fs_devices, or
				352	* return NULL.
				353	*
				354	* If devid and uuid are both specified, the match must be exact, otherwise
				355	* only devid is used.
				356	*/
				357	static struct btrfs_device find_device(struct btrfs_fs_devices fs_devices,
				358	u64 devid, const u8 *uuid)
				359	{
				360	struct btrfs_device *dev;
				361
				362	list_for_each_entry(dev, &fs_devices->devices, dev_list) {
				363	if (dev->devid == devid &&
				364	(!uuid \|\| !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
				365	return dev;
				366	}
				367	}
				368	return NULL;
				369	}
				370
				371	static noinline struct btrfs_fs_devices find_fsid(u8 fsid)
				372	{
				373	struct btrfs_fs_devices *fs_devices;
				374
				375	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
				376	if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
				377	return fs_devices;
				378	}
				379	return NULL;
				380	}
				381
				382	static int
				383	btrfs_get_bdev_and_sb(const char device_path, fmode_t flags, void holder,
				384	int flush, struct block_device **bdev,
				385	struct buffer_head **bh)
				386	{
				387	int ret;
				388
				389	*bdev = blkdev_get_by_path(device_path, flags, holder);
				390
				391	if (IS_ERR(*bdev)) {
				392	ret = PTR_ERR(*bdev);
				393	goto error;
				394	}
				395
				396	if (flush)
				397	filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
				398	ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
				399	if (ret) {
				400	blkdev_put(*bdev, flags);
				401	goto error;
				402	}
				403	invalidate_bdev(*bdev);
				404	bh = btrfs_read_dev_super(bdev);
				405	if (IS_ERR(*bh)) {
				406	ret = PTR_ERR(*bh);
				407	blkdev_put(*bdev, flags);
				408	goto error;
				409	}
				410
				411	return 0;
				412
				413	error:
				414	*bdev = NULL;
				415	*bh = NULL;
				416	return ret;
				417	}
				418
				419	static void requeue_list(struct btrfs_pending_bios *pending_bios,
				420	struct bio head, struct bio tail)
				421	{
				422
				423	struct bio *old_head;
				424
				425	old_head = pending_bios->head;
				426	pending_bios->head = head;
				427	if (pending_bios->tail)
				428	tail->bi_next = old_head;
				429	else
				430	pending_bios->tail = tail;
				431	}
				432
				433	/*
				434	* we try to collect pending bios for a device so we don't get a large
				435	* number of procs sending bios down to the same device. This greatly
				436	* improves the schedulers ability to collect and merge the bios.
				437	*
				438	* But, it also turns into a long list of bios to process and that is sure
				439	* to eventually make the worker thread block. The solution here is to
				440	* make some progress and then put this work struct back at the end of
				441	* the list if the block device is congested. This way, multiple devices
				442	* can make progress from a single worker thread.
				443	*/
				444	static noinline void run_scheduled_bios(struct btrfs_device *device)
				445	{
				446	struct btrfs_fs_info *fs_info = device->fs_info;
				447	struct bio *pending;
				448	struct backing_dev_info *bdi;
				449	struct btrfs_pending_bios *pending_bios;
				450	struct bio *tail;
				451	struct bio *cur;
				452	int again = 0;
				453	unsigned long num_run;
				454	unsigned long batch_run = 0;
				455	unsigned long last_waited = 0;
				456	int force_reg = 0;
				457	int sync_pending = 0;
				458	struct blk_plug plug;
				459
				460	/*
				461	* this function runs all the bios we've collected for
				462	* a particular device. We don't want to wander off to
				463	* another device without first sending all of these down.
				464	* So, setup a plug here and finish it off before we return
				465	*/
				466	blk_start_plug(&plug);
				467
				468	bdi = device->bdev->bd_bdi;
				469
				470	loop:
				471	spin_lock(&device->io_lock);
				472
				473	loop_lock:
				474	num_run = 0;
				475
				476	/* take all the bios off the list at once and process them
				477	* later on (without the lock held). But, remember the
				478	* tail and other pointers so the bios can be properly reinserted
				479	* into the list if we hit congestion
				480	*/
				481	if (!force_reg && device->pending_sync_bios.head) {
				482	pending_bios = &device->pending_sync_bios;
				483	force_reg = 1;
				484	} else {
				485	pending_bios = &device->pending_bios;
				486	force_reg = 0;
				487	}
				488
				489	pending = pending_bios->head;
				490	tail = pending_bios->tail;
				491	WARN_ON(pending && !tail);
				492
				493	/*
				494	* if pending was null this time around, no bios need processing
				495	* at all and we can stop. Otherwise it'll loop back up again
				496	* and do an additional check so no bios are missed.
				497	*
				498	* device->running_pending is used to synchronize with the
				499	* schedule_bio code.
				500	*/
				501	if (device->pending_sync_bios.head == NULL &&
				502	device->pending_bios.head == NULL) {
				503	again = 0;
				504	device->running_pending = 0;
				505	} else {
				506	again = 1;
				507	device->running_pending = 1;
				508	}
				509
				510	pending_bios->head = NULL;
				511	pending_bios->tail = NULL;
				512
				513	spin_unlock(&device->io_lock);
				514
				515	while (pending) {
				516
				517	rmb();
				518	/* we want to work on both lists, but do more bios on the
				519	* sync list than the regular list
				520	*/
				521	if ((num_run > 32 &&
				522	pending_bios != &device->pending_sync_bios &&
				523	device->pending_sync_bios.head) \|\|
				524	(num_run > 64 && pending_bios == &device->pending_sync_bios &&
				525	device->pending_bios.head)) {
				526	spin_lock(&device->io_lock);
				527	requeue_list(pending_bios, pending, tail);
				528	goto loop_lock;
				529	}
				530
				531	cur = pending;
				532	pending = pending->bi_next;
				533	cur->bi_next = NULL;
				534
				535	BUG_ON(atomic_read(&cur->__bi_cnt) == 0);
				536
				537	/*
				538	* if we're doing the sync list, record that our
				539	* plug has some sync requests on it
				540	*
				541	* If we're doing the regular list and there are
				542	* sync requests sitting around, unplug before
				543	* we add more
				544	*/
				545	if (pending_bios == &device->pending_sync_bios) {
				546	sync_pending = 1;
				547	} else if (sync_pending) {
				548	blk_finish_plug(&plug);
				549	blk_start_plug(&plug);
				550	sync_pending = 0;
				551	}
				552
				553	btrfsic_submit_bio(cur);
				554	num_run++;
				555	batch_run++;
				556
				557	cond_resched();
				558
				559	/*
				560	* we made progress, there is more work to do and the bdi
				561	* is now congested. Back off and let other work structs
				562	* run instead
				563	*/
				564	if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
				565	fs_info->fs_devices->open_devices > 1) {
				566	struct io_context *ioc;
				567
				568	ioc = current->io_context;
				569
				570	/*
				571	* the main goal here is that we don't want to
				572	* block if we're going to be able to submit
				573	* more requests without blocking.
				574	*
				575	* This code does two great things, it pokes into
				576	* the elevator code from a filesystem _and_
				577	* it makes assumptions about how batching works.
				578	*/
				579	if (ioc && ioc->nr_batch_requests > 0 &&
				580	time_before(jiffies, ioc->last_waited + HZ/50UL) &&
				581	(last_waited == 0 \|\|
				582	ioc->last_waited == last_waited)) {
				583	/*
				584	* we want to go through our batch of
				585	* requests and stop. So, we copy out
				586	* the ioc->last_waited time and test
				587	* against it before looping
				588	*/
				589	last_waited = ioc->last_waited;
				590	cond_resched();
				591	continue;
				592	}
				593	spin_lock(&device->io_lock);
				594	requeue_list(pending_bios, pending, tail);
				595	device->running_pending = 1;
				596
				597	spin_unlock(&device->io_lock);
				598	btrfs_queue_work(fs_info->submit_workers,
				599	&device->work);
				600	goto done;
				601	}
				602	}
				603
				604	cond_resched();
				605	if (again)
				606	goto loop;
				607
				608	spin_lock(&device->io_lock);
				609	if (device->pending_bios.head \|\| device->pending_sync_bios.head)
				610	goto loop_lock;
				611	spin_unlock(&device->io_lock);
				612
				613	done:
				614	blk_finish_plug(&plug);
				615	}
				616
				617	static void pending_bios_fn(struct btrfs_work *work)
				618	{
				619	struct btrfs_device *device;
				620
				621	device = container_of(work, struct btrfs_device, work);
				622	run_scheduled_bios(device);
				623	}
				624
				625	/*
				626	* Search and remove all stale (devices which are not mounted) devices.
				627	* When both inputs are NULL, it will search and release all stale devices.
				628	* path: Optional. When provided will it release all unmounted devices
				629	* matching this path only.
				630	* skip_dev: Optional. Will skip this device when searching for the stale
				631	* devices.
				632	*/
				633	static void btrfs_free_stale_devices(const char *path,
				634	struct btrfs_device *skip_device)
				635	{
				636	struct btrfs_fs_devices fs_devices, tmp_fs_devices;
				637	struct btrfs_device device, tmp_device;
				638
				639	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
				640	mutex_lock(&fs_devices->device_list_mutex);
				641	if (fs_devices->opened) {
				642	mutex_unlock(&fs_devices->device_list_mutex);
				643	continue;
				644	}
				645
				646	list_for_each_entry_safe(device, tmp_device,
				647	&fs_devices->devices, dev_list) {
				648	int not_found = 0;
				649
				650	if (skip_device && skip_device == device)
				651	continue;
				652	if (path && !device->name)
				653	continue;
				654
				655	rcu_read_lock();
				656	if (path)
				657	not_found = strcmp(rcu_str_deref(device->name),
				658	path);
				659	rcu_read_unlock();
				660	if (not_found)
				661	continue;
				662
				663	/* delete the stale device */
				664	fs_devices->num_devices--;
				665	list_del(&device->dev_list);
				666	btrfs_free_device(device);
				667
				668	if (fs_devices->num_devices == 0)
				669	break;
				670	}
				671	mutex_unlock(&fs_devices->device_list_mutex);
				672	if (fs_devices->num_devices == 0) {
				673	btrfs_sysfs_remove_fsid(fs_devices);
				674	list_del(&fs_devices->fs_list);
				675	free_fs_devices(fs_devices);
				676	}
				677	}
				678	}
				679
				680	static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
				681	struct btrfs_device *device, fmode_t flags,
				682	void *holder)
				683	{
				684	struct request_queue *q;
				685	struct block_device *bdev;
				686	struct buffer_head *bh;
				687	struct btrfs_super_block *disk_super;
				688	u64 devid;
				689	int ret;
				690
				691	if (device->bdev)
				692	return -EINVAL;
				693	if (!device->name)
				694	return -EINVAL;
				695
				696	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				697	&bdev, &bh);
				698	if (ret)
				699	return ret;
				700
				701	disk_super = (struct btrfs_super_block *)bh->b_data;
				702	devid = btrfs_stack_device_id(&disk_super->dev_item);
				703	if (devid != device->devid)
				704	goto error_brelse;
				705
				706	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
				707	goto error_brelse;
				708
				709	device->generation = btrfs_super_generation(disk_super);
				710
				711	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
				712	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
				713	fs_devices->seeding = 1;
				714	} else {
				715	if (bdev_read_only(bdev))
				716	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
				717	else
				718	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
				719	}
				720
				721	q = bdev_get_queue(bdev);
				722	if (!blk_queue_nonrot(q))
				723	fs_devices->rotating = 1;
				724
				725	device->bdev = bdev;
				726	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
				727	device->mode = flags;
				728
				729	fs_devices->open_devices++;
				730	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
				731	device->devid != BTRFS_DEV_REPLACE_DEVID) {
				732	fs_devices->rw_devices++;
				733	list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
				734	}
				735	brelse(bh);
				736
				737	return 0;
				738
				739	error_brelse:
				740	brelse(bh);
				741	blkdev_put(bdev, flags);
				742
				743	return -EINVAL;
				744	}
				745
				746	/*
				747	* Add new device to list of registered devices
				748	*
				749	* Returns:
				750	* device pointer which was just added or updated when successful
				751	* error pointer when failed
				752	*/
				753	static noinline struct btrfs_device device_list_add(const char path,
				754	struct btrfs_super_block *disk_super,
				755	bool *new_device_added)
				756	{
				757	struct btrfs_device *device;
				758	struct btrfs_fs_devices *fs_devices;
				759	struct rcu_string *name;
				760	u64 found_transid = btrfs_super_generation(disk_super);
				761	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
				762
				763	fs_devices = find_fsid(disk_super->fsid);
				764	if (!fs_devices) {
				765	fs_devices = alloc_fs_devices(disk_super->fsid);
				766	if (IS_ERR(fs_devices))
				767	return ERR_CAST(fs_devices);
				768
				769	mutex_lock(&fs_devices->device_list_mutex);
				770	list_add(&fs_devices->fs_list, &fs_uuids);
				771
				772	device = NULL;
				773	} else {
				774	mutex_lock(&fs_devices->device_list_mutex);
				775	device = find_device(fs_devices, devid,
				776	disk_super->dev_item.uuid);
				777	}
				778
				779	if (!device) {
				780	if (fs_devices->opened) {
				781	mutex_unlock(&fs_devices->device_list_mutex);
				782	return ERR_PTR(-EBUSY);
				783	}
				784
				785	device = btrfs_alloc_device(NULL, &devid,
				786	disk_super->dev_item.uuid);
				787	if (IS_ERR(device)) {
				788	mutex_unlock(&fs_devices->device_list_mutex);
				789	/* we can safely leave the fs_devices entry around */
				790	return device;
				791	}
				792
				793	name = rcu_string_strdup(path, GFP_NOFS);
				794	if (!name) {
				795	btrfs_free_device(device);
				796	mutex_unlock(&fs_devices->device_list_mutex);
				797	return ERR_PTR(-ENOMEM);
				798	}
				799	rcu_assign_pointer(device->name, name);
				800
				801	list_add_rcu(&device->dev_list, &fs_devices->devices);
				802	fs_devices->num_devices++;
				803
				804	device->fs_devices = fs_devices;
				805	*new_device_added = true;
				806
				807	if (disk_super->label[0])
				808	pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
				809	disk_super->label, devid, found_transid, path);
				810	else
				811	pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
				812	disk_super->fsid, devid, found_transid, path);
				813
				814	} else if (!device->name \|\| strcmp(device->name->str, path)) {
				815	/*
				816	* When FS is already mounted.
				817	* 1. If you are here and if the device->name is NULL that
				818	* means this device was missing at time of FS mount.
				819	* 2. If you are here and if the device->name is different
				820	* from 'path' that means either
				821	* a. The same device disappeared and reappeared with
				822	* different name. or
				823	* b. The missing-disk-which-was-replaced, has
				824	* reappeared now.
				825	*
				826	* We must allow 1 and 2a above. But 2b would be a spurious
				827	* and unintentional.
				828	*
				829	* Further in case of 1 and 2a above, the disk at 'path'
				830	* would have missed some transaction when it was away and
				831	* in case of 2a the stale bdev has to be updated as well.
				832	* 2b must not be allowed at all time.
				833	*/
				834
				835	/*
				836	* For now, we do allow update to btrfs_fs_device through the
				837	* btrfs dev scan cli after FS has been mounted. We're still
				838	* tracking a problem where systems fail mount by subvolume id
				839	* when we reject replacement on a mounted FS.
				840	*/
				841	if (!fs_devices->opened && found_transid < device->generation) {
				842	/*
				843	* That is if the FS is _not_ mounted and if you
				844	* are here, that means there is more than one
				845	* disk with same uuid and devid.We keep the one
				846	* with larger generation number or the last-in if
				847	* generation are equal.
				848	*/
				849	mutex_unlock(&fs_devices->device_list_mutex);
				850	return ERR_PTR(-EEXIST);
				851	}
				852
				853	/*
				854	* We are going to replace the device path for a given devid,
				855	* make sure it's the same device if the device is mounted
				856	*/
				857	if (device->bdev) {
				858	struct block_device *path_bdev;
				859
				860	path_bdev = lookup_bdev(path);
				861	if (IS_ERR(path_bdev)) {
				862	mutex_unlock(&fs_devices->device_list_mutex);
				863	return ERR_CAST(path_bdev);
				864	}
				865
				866	if (device->bdev != path_bdev) {
				867	bdput(path_bdev);
				868	mutex_unlock(&fs_devices->device_list_mutex);
				869	btrfs_warn_in_rcu(device->fs_info,
				870	"duplicate device fsid:devid for %pU:%llu old:%s new:%s",
				871	disk_super->fsid, devid,
				872	rcu_str_deref(device->name), path);
				873	return ERR_PTR(-EEXIST);
				874	}
				875	bdput(path_bdev);
				876	btrfs_info_in_rcu(device->fs_info,
				877	"device fsid %pU devid %llu moved old:%s new:%s",
				878	disk_super->fsid, devid,
				879	rcu_str_deref(device->name), path);
				880	}
				881
				882	name = rcu_string_strdup(path, GFP_NOFS);
				883	if (!name) {
				884	mutex_unlock(&fs_devices->device_list_mutex);
				885	return ERR_PTR(-ENOMEM);
				886	}
				887	rcu_string_free(device->name);
				888	rcu_assign_pointer(device->name, name);
				889	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
				890	fs_devices->missing_devices--;
				891	clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
				892	}
				893	}
				894
				895	/*
				896	* Unmount does not free the btrfs_device struct but would zero
				897	* generation along with most of the other members. So just update
				898	* it back. We need it to pick the disk with largest generation
				899	* (as above).
				900	*/
				901	if (!fs_devices->opened)
				902	device->generation = found_transid;
				903
				904	fs_devices->total_devices = btrfs_super_num_devices(disk_super);
				905
				906	mutex_unlock(&fs_devices->device_list_mutex);
				907	return device;
				908	}
				909
				910	static struct btrfs_fs_devices clone_fs_devices(struct btrfs_fs_devices orig)
				911	{
				912	struct btrfs_fs_devices *fs_devices;
				913	struct btrfs_device *device;
				914	struct btrfs_device *orig_dev;
				915
				916	fs_devices = alloc_fs_devices(orig->fsid);
				917	if (IS_ERR(fs_devices))
				918	return fs_devices;
				919
				920	mutex_lock(&orig->device_list_mutex);
				921	fs_devices->total_devices = orig->total_devices;
				922
				923	/* We have held the volume lock, it is safe to get the devices. */
				924	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
				925	struct rcu_string *name;
				926
				927	device = btrfs_alloc_device(NULL, &orig_dev->devid,
				928	orig_dev->uuid);
				929	if (IS_ERR(device))
				930	goto error;
				931
				932	/*
				933	* This is ok to do without rcu read locked because we hold the
				934	* uuid mutex so nothing we touch in here is going to disappear.
				935	*/
				936	if (orig_dev->name) {
				937	name = rcu_string_strdup(orig_dev->name->str,
				938	GFP_KERNEL);
				939	if (!name) {
				940	btrfs_free_device(device);
				941	goto error;
				942	}
				943	rcu_assign_pointer(device->name, name);
				944	}
				945
				946	list_add(&device->dev_list, &fs_devices->devices);
				947	device->fs_devices = fs_devices;
				948	fs_devices->num_devices++;
				949	}
				950	mutex_unlock(&orig->device_list_mutex);
				951	return fs_devices;
				952	error:
				953	mutex_unlock(&orig->device_list_mutex);
				954	free_fs_devices(fs_devices);
				955	return ERR_PTR(-ENOMEM);
				956	}
				957
				958	/*
				959	* After we have read the system tree and know devids belonging to
				960	* this filesystem, remove the device which does not belong there.
				961	*/
				962	void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
				963	{
				964	struct btrfs_device device, next;
				965	struct btrfs_device *latest_dev = NULL;
				966
				967	mutex_lock(&uuid_mutex);
				968	again:
				969	/* This is the initialized path, it is safe to release the devices. */
				970	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
				971	if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
				972	&device->dev_state)) {
				973	if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				974	&device->dev_state) &&
				975	(!latest_dev \|\|
				976	device->generation > latest_dev->generation)) {
				977	latest_dev = device;
				978	}
				979	continue;
				980	}
				981
				982	if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
				983	/*
				984	* In the first step, keep the device which has
				985	* the correct fsid and the devid that is used
				986	* for the dev_replace procedure.
				987	* In the second step, the dev_replace state is
				988	* read from the device tree and it is known
				989	* whether the procedure is really active or
				990	* not, which means whether this device is
				991	* used or whether it should be removed.
				992	*/
				993	if (step == 0 \|\| test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				994	&device->dev_state)) {
				995	continue;
				996	}
				997	}
				998	if (device->bdev) {
				999	blkdev_put(device->bdev, device->mode);
				1000	device->bdev = NULL;
				1001	fs_devices->open_devices--;
				1002	}
				1003	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
				1004	list_del_init(&device->dev_alloc_list);
				1005	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
				1006	if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				1007	&device->dev_state))
				1008	fs_devices->rw_devices--;
				1009	}
				1010	list_del_init(&device->dev_list);
				1011	fs_devices->num_devices--;
				1012	btrfs_free_device(device);
				1013	}
				1014
				1015	if (fs_devices->seed) {
				1016	fs_devices = fs_devices->seed;
				1017	goto again;
				1018	}
				1019
				1020	fs_devices->latest_bdev = latest_dev->bdev;
				1021
				1022	mutex_unlock(&uuid_mutex);
				1023	}
				1024
				1025	static void free_device_rcu(struct rcu_head *head)
				1026	{
				1027	struct btrfs_device *device;
				1028
				1029	device = container_of(head, struct btrfs_device, rcu);
				1030	btrfs_free_device(device);
				1031	}
				1032
				1033	static void btrfs_close_bdev(struct btrfs_device *device)
				1034	{
				1035	if (!device->bdev)
				1036	return;
				1037
				1038	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
				1039	sync_blockdev(device->bdev);
				1040	invalidate_bdev(device->bdev);
				1041	}
				1042
				1043	blkdev_put(device->bdev, device->mode);
				1044	}
				1045
				1046	static void btrfs_close_one_device(struct btrfs_device *device)
				1047	{
				1048	struct btrfs_fs_devices *fs_devices = device->fs_devices;
				1049	struct btrfs_device *new_device;
				1050	struct rcu_string *name;
				1051
				1052	if (device->bdev)
				1053	fs_devices->open_devices--;
				1054
				1055	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
				1056	device->devid != BTRFS_DEV_REPLACE_DEVID) {
				1057	list_del_init(&device->dev_alloc_list);
				1058	fs_devices->rw_devices--;
				1059	}
				1060
				1061	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
				1062	fs_devices->missing_devices--;
				1063
				1064	btrfs_close_bdev(device);
				1065
				1066	new_device = btrfs_alloc_device(NULL, &device->devid,
				1067	device->uuid);
				1068	BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
				1069
				1070	/* Safe because we are under uuid_mutex */
				1071	if (device->name) {
				1072	name = rcu_string_strdup(device->name->str, GFP_NOFS);
				1073	BUG_ON(!name); /* -ENOMEM */
				1074	rcu_assign_pointer(new_device->name, name);
				1075	}
				1076
				1077	list_replace_rcu(&device->dev_list, &new_device->dev_list);
				1078	new_device->fs_devices = device->fs_devices;
				1079
				1080	call_rcu(&device->rcu, free_device_rcu);
				1081	}
				1082
				1083	static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
				1084	{
				1085	struct btrfs_device device, tmp;
				1086
				1087	if (--fs_devices->opened > 0)
				1088	return 0;
				1089
				1090	mutex_lock(&fs_devices->device_list_mutex);
				1091	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
				1092	btrfs_close_one_device(device);
				1093	}
				1094	mutex_unlock(&fs_devices->device_list_mutex);
				1095
				1096	WARN_ON(fs_devices->open_devices);
				1097	WARN_ON(fs_devices->rw_devices);
				1098	fs_devices->opened = 0;
				1099	fs_devices->seeding = 0;
				1100
				1101	return 0;
				1102	}
				1103
				1104	int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
				1105	{
				1106	struct btrfs_fs_devices *seed_devices = NULL;
				1107	int ret;
				1108
				1109	mutex_lock(&uuid_mutex);
				1110	ret = close_fs_devices(fs_devices);
				1111	if (!fs_devices->opened) {
				1112	seed_devices = fs_devices->seed;
				1113	fs_devices->seed = NULL;
				1114	}
				1115	mutex_unlock(&uuid_mutex);
				1116
				1117	while (seed_devices) {
				1118	fs_devices = seed_devices;
				1119	seed_devices = fs_devices->seed;
				1120	close_fs_devices(fs_devices);
				1121	free_fs_devices(fs_devices);
				1122	}
				1123	return ret;
				1124	}
				1125
				1126	static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
				1127	fmode_t flags, void *holder)
				1128	{
				1129	struct btrfs_device *device;
				1130	struct btrfs_device *latest_dev = NULL;
				1131	int ret = 0;
				1132
				1133	flags \|= FMODE_EXCL;
				1134
				1135	list_for_each_entry(device, &fs_devices->devices, dev_list) {
				1136	/* Just open everything we can; ignore failures here */
				1137	if (btrfs_open_one_device(fs_devices, device, flags, holder))
				1138	continue;
				1139
				1140	if (!latest_dev \|\|
				1141	device->generation > latest_dev->generation)
				1142	latest_dev = device;
				1143	}
				1144	if (fs_devices->open_devices == 0) {
				1145	ret = -EINVAL;
				1146	goto out;
				1147	}
				1148	fs_devices->opened = 1;
				1149	fs_devices->latest_bdev = latest_dev->bdev;
				1150	fs_devices->total_rw_bytes = 0;
				1151	out:
				1152	return ret;
				1153	}
				1154
				1155	static int devid_cmp(void priv, struct list_head a, struct list_head *b)
				1156	{
				1157	struct btrfs_device dev1, dev2;
				1158
				1159	dev1 = list_entry(a, struct btrfs_device, dev_list);
				1160	dev2 = list_entry(b, struct btrfs_device, dev_list);
				1161
				1162	if (dev1->devid < dev2->devid)
				1163	return -1;
				1164	else if (dev1->devid > dev2->devid)
				1165	return 1;
				1166	return 0;
				1167	}
				1168
				1169	int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
				1170	fmode_t flags, void *holder)
				1171	{
				1172	int ret;
				1173
				1174	lockdep_assert_held(&uuid_mutex);
				1175
				1176	mutex_lock(&fs_devices->device_list_mutex);
				1177	if (fs_devices->opened) {
				1178	fs_devices->opened++;
				1179	ret = 0;
				1180	} else {
				1181	list_sort(NULL, &fs_devices->devices, devid_cmp);
				1182	ret = open_fs_devices(fs_devices, flags, holder);
				1183	}
				1184	mutex_unlock(&fs_devices->device_list_mutex);
				1185
				1186	return ret;
				1187	}
				1188
				/*
				 * Undo a successful btrfs_read_disk_super(): drop the kmap() of @page and
				 * then the page reference.
				 */
				static void btrfs_release_disk_super(struct page *page)
				{
					kunmap(page);
					put_page(page);
				}
				1194
				1195	static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
				1196	struct page **page,
				1197	struct btrfs_super_block **disk_super)
				1198	{
				1199	void *p;
				1200	pgoff_t index;
				1201
				1202	/* make sure our super fits in the device */
				1203	if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
				1204	return 1;
				1205
				1206	/* make sure our super fits in the page */
				1207	if (sizeof(**disk_super) > PAGE_SIZE)
				1208	return 1;
				1209
				1210	/* make sure our super doesn't straddle pages on disk */
				1211	index = bytenr >> PAGE_SHIFT;
				1212	if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
				1213	return 1;
				1214
				1215	/* pull in the page with our super */
				1216	*page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
				1217	index, GFP_KERNEL);
				1218
				1219	if (IS_ERR_OR_NULL(*page))
				1220	return 1;
				1221
				1222	p = kmap(*page);
				1223
				1224	/* align our pointer to the offset of the super block */
				1225	*disk_super = p + (bytenr & ~PAGE_MASK);
				1226
				1227	if (btrfs_super_bytenr(*disk_super) != bytenr \|\|
				1228	btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
				1229	btrfs_release_disk_super(*page);
				1230	return 1;
				1231	}
				1232
				1233	if ((*disk_super)->label[0] &&
				1234	(*disk_super)->label[BTRFS_LABEL_SIZE - 1])
				1235	(*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';
				1236
				1237	return 0;
				1238	}
				1239
				1240	/*
				1241	* Look for a btrfs signature on a device. This may be called out of the mount path
				1242	* and we are not allowed to call set_blocksize during the scan. The superblock
				1243	* is read via pagecache
				1244	*/
				1245	struct btrfs_device btrfs_scan_one_device(const char path, fmode_t flags,
				1246	void *holder)
				1247	{
				1248	struct btrfs_super_block *disk_super;
				1249	bool new_device_added = false;
				1250	struct btrfs_device *device = NULL;
				1251	struct block_device *bdev;
				1252	struct page *page;
				1253	u64 bytenr;
				1254
				1255	lockdep_assert_held(&uuid_mutex);
				1256
				1257	/*
				1258	* we would like to check all the supers, but that would make
				1259	* a btrfs mount succeed after a mkfs from a different FS.
				1260	* So, we need to add a special mount option to scan for
				1261	* later supers, using BTRFS_SUPER_MIRROR_MAX instead
				1262	*/
				1263	bytenr = btrfs_sb_offset(0);
				1264	flags \|= FMODE_EXCL;
				1265
				1266	bdev = blkdev_get_by_path(path, flags, holder);
				1267	if (IS_ERR(bdev))
				1268	return ERR_CAST(bdev);
				1269
				1270	if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
				1271	device = ERR_PTR(-EINVAL);
				1272	goto error_bdev_put;
				1273	}
				1274
				1275	device = device_list_add(path, disk_super, &new_device_added);
				1276	if (!IS_ERR(device)) {
				1277	if (new_device_added)
				1278	btrfs_free_stale_devices(path, device);
				1279	}
				1280
				1281	btrfs_release_disk_super(page);
				1282
				1283	error_bdev_put:
				1284	blkdev_put(bdev, flags);
				1285
				1286	return device;
				1287	}
				1288
				1289	static int contains_pending_extent(struct btrfs_transaction *transaction,
				1290	struct btrfs_device *device,
				1291	u64 *start, u64 len)
				1292	{
				1293	struct btrfs_fs_info *fs_info = device->fs_info;
				1294	struct extent_map *em;
				1295	struct list_head *search_list = &fs_info->pinned_chunks;
				1296	int ret = 0;
				1297	u64 physical_start = *start;
				1298
				1299	if (transaction)
				1300	search_list = &transaction->pending_chunks;
				1301	again:
				1302	list_for_each_entry(em, search_list, list) {
				1303	struct map_lookup *map;
				1304	int i;
				1305
				1306	map = em->map_lookup;
				1307	for (i = 0; i < map->num_stripes; i++) {
				1308	u64 end;
				1309
				1310	if (map->stripes[i].dev != device)
				1311	continue;
				1312	if (map->stripes[i].physical >= physical_start + len \|\|
				1313	map->stripes[i].physical + em->orig_block_len <=
				1314	physical_start)
				1315	continue;
				1316	/*
				1317	* Make sure that while processing the pinned list we do
				1318	* not override our *start with a lower value, because
				1319	* we can have pinned chunks that fall within this
				1320	* device hole and that have lower physical addresses
				1321	* than the pending chunks we processed before. If we
				1322	* do not take this special care we can end up getting
				1323	* 2 pending chunks that start at the same physical
				1324	* device offsets because the end offset of a pinned
				1325	* chunk can be equal to the start offset of some
				1326	* pending chunk.
				1327	*/
				1328	end = map->stripes[i].physical + em->orig_block_len;
				1329	if (end > *start) {
				1330	*start = end;
				1331	ret = 1;
				1332	}
				1333	}
				1334	}
				1335	if (search_list != &fs_info->pinned_chunks) {
				1336	search_list = &fs_info->pinned_chunks;
				1337	goto again;
				1338	}
				1339
				1340	return ret;
				1341	}
				1342
				1343
				1344	/*
				1345	* find_free_dev_extent_start - find free space in the specified device
				1346	* @device: the device which we search the free space in
				1347	* @num_bytes: the size of the free space that we need
				1348	* @search_start: the position from which to begin the search
				1349	* @start: store the start of the free space.
				1350	* @len: the size of the free space. that we find, or the size
				1351	* of the max free space if we don't find suitable free space
				1352	*
				1353	* this uses a pretty simple search, the expectation is that it is
				1354	* called very infrequently and that a given device has a small number
				1355	* of extents
				1356	*
				1357	* @start is used to store the start of the free space if we find. But if we
				1358	* don't find suitable free space, it will be used to store the start position
				1359	* of the max free space.
				1360	*
				1361	* @len is used to store the size of the free space that we find.
				1362	* But if we don't find suitable free space, it is used to store the size of
				1363	* the max free space.
				1364	*/
				1365	int find_free_dev_extent_start(struct btrfs_transaction *transaction,
				1366	struct btrfs_device *device, u64 num_bytes,
				1367	u64 search_start, u64 start, u64 len)
				1368	{
				1369	struct btrfs_fs_info *fs_info = device->fs_info;
				1370	struct btrfs_root *root = fs_info->dev_root;
				1371	struct btrfs_key key;
				1372	struct btrfs_dev_extent *dev_extent;
				1373	struct btrfs_path *path;
				1374	u64 hole_size;
				1375	u64 max_hole_start;
				1376	u64 max_hole_size;
				1377	u64 extent_end;
				1378	u64 search_end = device->total_bytes;
				1379	int ret;
				1380	int slot;
				1381	struct extent_buffer *l;
				1382
				1383	/*
				1384	* We don't want to overwrite the superblock on the drive nor any area
				1385	* used by the boot loader (grub for example), so we make sure to start
				1386	* at an offset of at least 1MB.
				1387	*/
				1388	search_start = max_t(u64, search_start, SZ_1M);
				1389
				1390	path = btrfs_alloc_path();
				1391	if (!path)
				1392	return -ENOMEM;
				1393
				1394	max_hole_start = search_start;
				1395	max_hole_size = 0;
				1396
				1397	again:
				1398	if (search_start >= search_end \|\|
				1399	test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
				1400	ret = -ENOSPC;
				1401	goto out;
				1402	}
				1403
				1404	path->reada = READA_FORWARD;
				1405	path->search_commit_root = 1;
				1406	path->skip_locking = 1;
				1407
				1408	key.objectid = device->devid;
				1409	key.offset = search_start;
				1410	key.type = BTRFS_DEV_EXTENT_KEY;
				1411
				1412	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				1413	if (ret < 0)
				1414	goto out;
				1415	if (ret > 0) {
				1416	ret = btrfs_previous_item(root, path, key.objectid, key.type);
				1417	if (ret < 0)
				1418	goto out;
				1419	}
				1420
				1421	while (1) {
				1422	l = path->nodes[0];
				1423	slot = path->slots[0];
				1424	if (slot >= btrfs_header_nritems(l)) {
				1425	ret = btrfs_next_leaf(root, path);
				1426	if (ret == 0)
				1427	continue;
				1428	if (ret < 0)
				1429	goto out;
				1430
				1431	break;
				1432	}
				1433	btrfs_item_key_to_cpu(l, &key, slot);
				1434
				1435	if (key.objectid < device->devid)
				1436	goto next;
				1437
				1438	if (key.objectid > device->devid)
				1439	break;
				1440
				1441	if (key.type != BTRFS_DEV_EXTENT_KEY)
				1442	goto next;
				1443
				1444	if (key.offset > search_start) {
				1445	hole_size = key.offset - search_start;
				1446
				1447	/*
				1448	* Have to check before we set max_hole_start, otherwise
				1449	* we could end up sending back this offset anyway.
				1450	*/
				1451	if (contains_pending_extent(transaction, device,
				1452	&search_start,
				1453	hole_size)) {
				1454	if (key.offset >= search_start) {
				1455	hole_size = key.offset - search_start;
				1456	} else {
				1457	WARN_ON_ONCE(1);
				1458	hole_size = 0;
				1459	}
				1460	}
				1461
				1462	if (hole_size > max_hole_size) {
				1463	max_hole_start = search_start;
				1464	max_hole_size = hole_size;
				1465	}
				1466
				1467	/*
				1468	* If this free space is greater than which we need,
				1469	* it must be the max free space that we have found
				1470	* until now, so max_hole_start must point to the start
				1471	* of this free space and the length of this free space
				1472	* is stored in max_hole_size. Thus, we return
				1473	* max_hole_start and max_hole_size and go back to the
				1474	* caller.
				1475	*/
				1476	if (hole_size >= num_bytes) {
				1477	ret = 0;
				1478	goto out;
				1479	}
				1480	}
				1481
				1482	dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
				1483	extent_end = key.offset + btrfs_dev_extent_length(l,
				1484	dev_extent);
				1485	if (extent_end > search_start)
				1486	search_start = extent_end;
				1487	next:
				1488	path->slots[0]++;
				1489	cond_resched();
				1490	}
				1491
				1492	/*
				1493	* At this point, search_start should be the end of
				1494	* allocated dev extents, and when shrinking the device,
				1495	* search_end may be smaller than search_start.
				1496	*/
				1497	if (search_end > search_start) {
				1498	hole_size = search_end - search_start;
				1499
				1500	if (contains_pending_extent(transaction, device, &search_start,
				1501	hole_size)) {
				1502	btrfs_release_path(path);
				1503	goto again;
				1504	}
				1505
				1506	if (hole_size > max_hole_size) {
				1507	max_hole_start = search_start;
				1508	max_hole_size = hole_size;
				1509	}
				1510	}
				1511
				1512	/* See above. */
				1513	if (max_hole_size < num_bytes)
				1514	ret = -ENOSPC;
				1515	else
				1516	ret = 0;
				1517
				1518	out:
				1519	btrfs_free_path(path);
				1520	*start = max_hole_start;
				1521	if (len)
				1522	*len = max_hole_size;
				1523	return ret;
				1524	}
				1525
				1526	int find_free_dev_extent(struct btrfs_trans_handle *trans,
				1527	struct btrfs_device *device, u64 num_bytes,
				1528	u64 start, u64 len)
				1529	{
				1530	/* FIXME use last free of some kind */
				1531	return find_free_dev_extent_start(trans->transaction, device,
				1532	num_bytes, 0, start, len);
				1533	}
				1534
				1535	static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
				1536	struct btrfs_device *device,
				1537	u64 start, u64 *dev_extent_len)
				1538	{
				1539	struct btrfs_fs_info *fs_info = device->fs_info;
				1540	struct btrfs_root *root = fs_info->dev_root;
				1541	int ret;
				1542	struct btrfs_path *path;
				1543	struct btrfs_key key;
				1544	struct btrfs_key found_key;
				1545	struct extent_buffer *leaf = NULL;
				1546	struct btrfs_dev_extent *extent = NULL;
				1547
				1548	path = btrfs_alloc_path();
				1549	if (!path)
				1550	return -ENOMEM;
				1551
				1552	key.objectid = device->devid;
				1553	key.offset = start;
				1554	key.type = BTRFS_DEV_EXTENT_KEY;
				1555	again:
				1556	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				1557	if (ret > 0) {
				1558	ret = btrfs_previous_item(root, path, key.objectid,
				1559	BTRFS_DEV_EXTENT_KEY);
				1560	if (ret)
				1561	goto out;
				1562	leaf = path->nodes[0];
				1563	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
				1564	extent = btrfs_item_ptr(leaf, path->slots[0],
				1565	struct btrfs_dev_extent);
				1566	BUG_ON(found_key.offset > start \|\| found_key.offset +
				1567	btrfs_dev_extent_length(leaf, extent) < start);
				1568	key = found_key;
				1569	btrfs_release_path(path);
				1570	goto again;
				1571	} else if (ret == 0) {
				1572	leaf = path->nodes[0];
				1573	extent = btrfs_item_ptr(leaf, path->slots[0],
				1574	struct btrfs_dev_extent);
				1575	} else {
				1576	btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
				1577	goto out;
				1578	}
				1579
				1580	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);
				1581
				1582	ret = btrfs_del_item(trans, root, path);
				1583	if (ret) {
				1584	btrfs_handle_fs_error(fs_info, ret,
				1585	"Failed to remove dev extent item");
				1586	} else {
				1587	set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
				1588	}
				1589	out:
				1590	btrfs_free_path(path);
				1591	return ret;
				1592	}
				1593
				1594	static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
				1595	struct btrfs_device *device,
				1596	u64 chunk_offset, u64 start, u64 num_bytes)
				1597	{
				1598	int ret;
				1599	struct btrfs_path *path;
				1600	struct btrfs_fs_info *fs_info = device->fs_info;
				1601	struct btrfs_root *root = fs_info->dev_root;
				1602	struct btrfs_dev_extent *extent;
				1603	struct extent_buffer *leaf;
				1604	struct btrfs_key key;
				1605
				1606	WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
				1607	WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
				1608	path = btrfs_alloc_path();
				1609	if (!path)
				1610	return -ENOMEM;
				1611
				1612	key.objectid = device->devid;
				1613	key.offset = start;
				1614	key.type = BTRFS_DEV_EXTENT_KEY;
				1615	ret = btrfs_insert_empty_item(trans, root, path, &key,
				1616	sizeof(*extent));
				1617	if (ret)
				1618	goto out;
				1619
				1620	leaf = path->nodes[0];
				1621	extent = btrfs_item_ptr(leaf, path->slots[0],
				1622	struct btrfs_dev_extent);
				1623	btrfs_set_dev_extent_chunk_tree(leaf, extent,
				1624	BTRFS_CHUNK_TREE_OBJECTID);
				1625	btrfs_set_dev_extent_chunk_objectid(leaf, extent,
				1626	BTRFS_FIRST_CHUNK_TREE_OBJECTID);
				1627	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
				1628
				1629	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
				1630	btrfs_mark_buffer_dirty(leaf);
				1631	out:
				1632	btrfs_free_path(path);
				1633	return ret;
				1634	}
				1635
				1636	static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
				1637	{
				1638	struct extent_map_tree *em_tree;
				1639	struct extent_map *em;
				1640	struct rb_node *n;
				1641	u64 ret = 0;
				1642
				1643	em_tree = &fs_info->mapping_tree.map_tree;
				1644	read_lock(&em_tree->lock);
				1645	n = rb_last(&em_tree->map);
				1646	if (n) {
				1647	em = rb_entry(n, struct extent_map, rb_node);
				1648	ret = em->start + em->len;
				1649	}
				1650	read_unlock(&em_tree->lock);
				1651
				1652	return ret;
				1653	}
				1654
				1655	static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
				1656	u64 *devid_ret)
				1657	{
				1658	int ret;
				1659	struct btrfs_key key;
				1660	struct btrfs_key found_key;
				1661	struct btrfs_path *path;
				1662
				1663	path = btrfs_alloc_path();
				1664	if (!path)
				1665	return -ENOMEM;
				1666
				1667	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
				1668	key.type = BTRFS_DEV_ITEM_KEY;
				1669	key.offset = (u64)-1;
				1670
				1671	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
				1672	if (ret < 0)
				1673	goto error;
				1674
				1675	BUG_ON(ret == 0); /* Corruption */
				1676
				1677	ret = btrfs_previous_item(fs_info->chunk_root, path,
				1678	BTRFS_DEV_ITEMS_OBJECTID,
				1679	BTRFS_DEV_ITEM_KEY);
				1680	if (ret) {
				1681	*devid_ret = 1;
				1682	} else {
				1683	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				1684	path->slots[0]);
				1685	*devid_ret = found_key.offset + 1;
				1686	}
				1687	ret = 0;
				1688	error:
				1689	btrfs_free_path(path);
				1690	return ret;
				1691	}
				1692
				1693	/*
				1694	* the device information is stored in the chunk root
				1695	* the btrfs_device struct should be fully filled in
				1696	*/
				1697	static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
				1698	struct btrfs_device *device)
				1699	{
				1700	int ret;
				1701	struct btrfs_path *path;
				1702	struct btrfs_dev_item *dev_item;
				1703	struct extent_buffer *leaf;
				1704	struct btrfs_key key;
				1705	unsigned long ptr;
				1706
				1707	path = btrfs_alloc_path();
				1708	if (!path)
				1709	return -ENOMEM;
				1710
				1711	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
				1712	key.type = BTRFS_DEV_ITEM_KEY;
				1713	key.offset = device->devid;
				1714
				1715	ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
				1716	&key, sizeof(*dev_item));
				1717	if (ret)
				1718	goto out;
				1719
				1720	leaf = path->nodes[0];
				1721	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
				1722
				1723	btrfs_set_device_id(leaf, dev_item, device->devid);
				1724	btrfs_set_device_generation(leaf, dev_item, 0);
				1725	btrfs_set_device_type(leaf, dev_item, device->type);
				1726	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
				1727	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
				1728	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
				1729	btrfs_set_device_total_bytes(leaf, dev_item,
				1730	btrfs_device_get_disk_total_bytes(device));
				1731	btrfs_set_device_bytes_used(leaf, dev_item,
				1732	btrfs_device_get_bytes_used(device));
				1733	btrfs_set_device_group(leaf, dev_item, 0);
				1734	btrfs_set_device_seek_speed(leaf, dev_item, 0);
				1735	btrfs_set_device_bandwidth(leaf, dev_item, 0);
				1736	btrfs_set_device_start_offset(leaf, dev_item, 0);
				1737
				1738	ptr = btrfs_device_uuid(dev_item);
				1739	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
				1740	ptr = btrfs_device_fsid(dev_item);
				1741	write_extent_buffer(leaf, trans->fs_info->fsid, ptr, BTRFS_FSID_SIZE);
				1742	btrfs_mark_buffer_dirty(leaf);
				1743
				1744	ret = 0;
				1745	out:
				1746	btrfs_free_path(path);
				1747	return ret;
				1748	}
				1749
				1750	/*
				1751	* Function to update ctime/mtime for a given device path.
				1752	* Mainly used for ctime/mtime based probe like libblkid.
				1753	*/
				1754	static void update_dev_time(const char *path_name)
				1755	{
				1756	struct file *filp;
				1757
				1758	filp = filp_open(path_name, O_RDWR, 0);
				1759	if (IS_ERR(filp))
				1760	return;
				1761	file_update_time(filp);
				1762	filp_close(filp, NULL);
				1763	}
				1764
				1765	static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
				1766	struct btrfs_device *device)
				1767	{
				1768	struct btrfs_root *root = fs_info->chunk_root;
				1769	int ret;
				1770	struct btrfs_path *path;
				1771	struct btrfs_key key;
				1772	struct btrfs_trans_handle *trans;
				1773
				1774	path = btrfs_alloc_path();
				1775	if (!path)
				1776	return -ENOMEM;
				1777
				1778	trans = btrfs_start_transaction(root, 0);
				1779	if (IS_ERR(trans)) {
				1780	btrfs_free_path(path);
				1781	return PTR_ERR(trans);
				1782	}
				1783	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
				1784	key.type = BTRFS_DEV_ITEM_KEY;
				1785	key.offset = device->devid;
				1786
				1787	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				1788	if (ret) {
				1789	if (ret > 0)
				1790	ret = -ENOENT;
				1791	btrfs_abort_transaction(trans, ret);
				1792	btrfs_end_transaction(trans);
				1793	goto out;
				1794	}
				1795
				1796	ret = btrfs_del_item(trans, root, path);
				1797	if (ret) {
				1798	btrfs_abort_transaction(trans, ret);
				1799	btrfs_end_transaction(trans);
				1800	}
				1801
				1802	out:
				1803	btrfs_free_path(path);
				1804	if (!ret)
				1805	ret = btrfs_commit_transaction(trans);
				1806	return ret;
				1807	}
				1808
				1809	/*
				1810	* Verify that @num_devices satisfies the RAID profile constraints in the whole
				1811	* filesystem. It's up to the caller to adjust that number regarding eg. device
				1812	* replace.
				1813	*/
				1814	static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
				1815	u64 num_devices)
				1816	{
				1817	u64 all_avail;
				1818	unsigned seq;
				1819	int i;
				1820
				1821	do {
				1822	seq = read_seqbegin(&fs_info->profiles_lock);
				1823
				1824	all_avail = fs_info->avail_data_alloc_bits \|
				1825	fs_info->avail_system_alloc_bits \|
				1826	fs_info->avail_metadata_alloc_bits;
				1827	} while (read_seqretry(&fs_info->profiles_lock, seq));
				1828
				1829	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
				1830	if (!(all_avail & btrfs_raid_array[i].bg_flag))
				1831	continue;
				1832
				1833	if (num_devices < btrfs_raid_array[i].devs_min) {
				1834	int ret = btrfs_raid_array[i].mindev_error;
				1835
				1836	if (ret)
				1837	return ret;
				1838	}
				1839	}
				1840
				1841	return 0;
				1842	}
				1843
				1844	static struct btrfs_device * btrfs_find_next_active_device(
				1845	struct btrfs_fs_devices fs_devs, struct btrfs_device device)
				1846	{
				1847	struct btrfs_device *next_device;
				1848
				1849	list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
				1850	if (next_device != device &&
				1851	!test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
				1852	&& next_device->bdev)
				1853	return next_device;
				1854	}
				1855
				1856	return NULL;
				1857	}
				1858
				1859	/*
				1860	* Helper function to check if the given device is part of s_bdev / latest_bdev
				1861	* and replace it with the provided or the next active device, in the context
				1862	* where this function called, there should be always be another device (or
				1863	* this_dev) which is active.
				1864	*/
				1865	void btrfs_assign_next_active_device(struct btrfs_device *device,
				1866	struct btrfs_device *this_dev)
				1867	{
				1868	struct btrfs_fs_info *fs_info = device->fs_info;
				1869	struct btrfs_device *next_device;
				1870
				1871	if (this_dev)
				1872	next_device = this_dev;
				1873	else
				1874	next_device = btrfs_find_next_active_device(fs_info->fs_devices,
				1875	device);
				1876	ASSERT(next_device);
				1877
				1878	if (fs_info->sb->s_bdev &&
				1879	(fs_info->sb->s_bdev == device->bdev))
				1880	fs_info->sb->s_bdev = next_device->bdev;
				1881
				1882	if (fs_info->fs_devices->latest_bdev == device->bdev)
				1883	fs_info->fs_devices->latest_bdev = next_device->bdev;
				1884	}
				1885
				1886	int btrfs_rm_device(struct btrfs_fs_info fs_info, const char device_path,
				1887	u64 devid)
				1888	{
				1889	struct btrfs_device *device;
				1890	struct btrfs_fs_devices *cur_devices;
				1891	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				1892	u64 num_devices;
				1893	int ret = 0;
				1894
				1895	mutex_lock(&uuid_mutex);
				1896
				1897	num_devices = fs_devices->num_devices;
				1898	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
				1899	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
				1900	WARN_ON(num_devices < 1);
				1901	num_devices--;
				1902	}
				1903	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
				1904
				1905	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
				1906	if (ret)
				1907	goto out;
				1908
				1909	ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
				1910	&device);
				1911	if (ret)
				1912	goto out;
				1913
				1914	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
				1915	ret = BTRFS_ERROR_DEV_TGT_REPLACE;
				1916	goto out;
				1917	}
				1918
				1919	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
				1920	fs_info->fs_devices->rw_devices == 1) {
				1921	ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
				1922	goto out;
				1923	}
				1924
				1925	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
				1926	mutex_lock(&fs_info->chunk_mutex);
				1927	list_del_init(&device->dev_alloc_list);
				1928	device->fs_devices->rw_devices--;
				1929	mutex_unlock(&fs_info->chunk_mutex);
				1930	}
				1931
				1932	mutex_unlock(&uuid_mutex);
				1933	ret = btrfs_shrink_device(device, 0);
				1934	mutex_lock(&uuid_mutex);
				1935	if (ret)
				1936	goto error_undo;
				1937
				1938	/*
				1939	* TODO: the superblock still includes this device in its num_devices
				1940	* counter although write_all_supers() is not locked out. This
				1941	* could give a filesystem state which requires a degraded mount.
				1942	*/
				1943	ret = btrfs_rm_dev_item(fs_info, device);
				1944	if (ret)
				1945	goto error_undo;
				1946
				1947	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
				1948	btrfs_scrub_cancel_dev(fs_info, device);
				1949
				1950	/*
				1951	* the device list mutex makes sure that we don't change
				1952	* the device list while someone else is writing out all
				1953	* the device supers. Whoever is writing all supers, should
				1954	* lock the device list mutex before getting the number of
				1955	* devices in the super block (super_copy). Conversely,
				1956	* whoever updates the number of devices in the super block
				1957	* (super_copy) should hold the device list mutex.
				1958	*/
				1959
				1960	/*
				1961	* In normal cases the cur_devices == fs_devices. But in case
				1962	* of deleting a seed device, the cur_devices should point to
				1963	* its own fs_devices listed under the fs_devices->seed.
				1964	*/
				1965	cur_devices = device->fs_devices;
				1966	mutex_lock(&fs_devices->device_list_mutex);
				1967	list_del_rcu(&device->dev_list);
				1968
				1969	cur_devices->num_devices--;
				1970	cur_devices->total_devices--;
				1971	/* Update total_devices of the parent fs_devices if it's seed */
				1972	if (cur_devices != fs_devices)
				1973	fs_devices->total_devices--;
				1974
				1975	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
				1976	cur_devices->missing_devices--;
				1977
				1978	btrfs_assign_next_active_device(device, NULL);
				1979
				1980	if (device->bdev) {
				1981	cur_devices->open_devices--;
				1982	/* remove sysfs entry */
				1983	btrfs_sysfs_rm_device_link(fs_devices, device);
				1984	}
				1985
				1986	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
				1987	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
				1988	mutex_unlock(&fs_devices->device_list_mutex);
				1989
				1990	/*
				1991	* at this point, the device is zero sized and detached from
				1992	* the devices list. All that's left is to zero out the old
				1993	* supers and free the device.
				1994	*/
				1995	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
				1996	btrfs_scratch_superblocks(device->bdev, device->name->str);
				1997
				1998	btrfs_close_bdev(device);
				1999	call_rcu(&device->rcu, free_device_rcu);
				2000
				2001	if (cur_devices->open_devices == 0) {
				2002	while (fs_devices) {
				2003	if (fs_devices->seed == cur_devices) {
				2004	fs_devices->seed = cur_devices->seed;
				2005	break;
				2006	}
				2007	fs_devices = fs_devices->seed;
				2008	}
				2009	cur_devices->seed = NULL;
				2010	close_fs_devices(cur_devices);
				2011	free_fs_devices(cur_devices);
				2012	}
				2013
				2014	out:
				2015	mutex_unlock(&uuid_mutex);
				2016	return ret;
				2017
				2018	error_undo:
				2019	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
				2020	mutex_lock(&fs_info->chunk_mutex);
				2021	list_add(&device->dev_alloc_list,
				2022	&fs_devices->alloc_list);
				2023	device->fs_devices->rw_devices++;
				2024	mutex_unlock(&fs_info->chunk_mutex);
				2025	}
				2026	goto out;
				2027	}
				2028
				2029	void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
				2030	{
				2031	struct btrfs_fs_devices *fs_devices;
				2032
				2033	lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
				2034
				2035	/*
				2036	* in case of fs with no seed, srcdev->fs_devices will point
				2037	* to fs_devices of fs_info. However when the dev being replaced is
				2038	* a seed dev it will point to the seed's local fs_devices. In short
				2039	* srcdev will have its correct fs_devices in both the cases.
				2040	*/
				2041	fs_devices = srcdev->fs_devices;
				2042
				2043	list_del_rcu(&srcdev->dev_list);
				2044	list_del(&srcdev->dev_alloc_list);
				2045	fs_devices->num_devices--;
				2046	if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
				2047	fs_devices->missing_devices--;
				2048
				2049	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
				2050	fs_devices->rw_devices--;
				2051
				2052	if (srcdev->bdev)
				2053	fs_devices->open_devices--;
				2054	}
				2055
				2056	void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
				2057	struct btrfs_device *srcdev)
				2058	{
				2059	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
				2060
				2061	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
				2062	/* zero out the old super if it is writable */
				2063	btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
				2064	}
				2065
				2066	btrfs_close_bdev(srcdev);
				2067	call_rcu(&srcdev->rcu, free_device_rcu);
				2068
				2069	/* if this is no devs we rather delete the fs_devices */
				2070	if (!fs_devices->num_devices) {
				2071	struct btrfs_fs_devices *tmp_fs_devices;
				2072
				2073	/*
				2074	* On a mounted FS, num_devices can't be zero unless it's a
				2075	* seed. In case of a seed device being replaced, the replace
				2076	* target added to the sprout FS, so there will be no more
				2077	* device left under the seed FS.
				2078	*/
				2079	ASSERT(fs_devices->seeding);
				2080
				2081	tmp_fs_devices = fs_info->fs_devices;
				2082	while (tmp_fs_devices) {
				2083	if (tmp_fs_devices->seed == fs_devices) {
				2084	tmp_fs_devices->seed = fs_devices->seed;
				2085	break;
				2086	}
				2087	tmp_fs_devices = tmp_fs_devices->seed;
				2088	}
				2089	fs_devices->seed = NULL;
				2090	close_fs_devices(fs_devices);
				2091	free_fs_devices(fs_devices);
				2092	}
				2093	}
				2094
				2095	void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
				2096	{
				2097	struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
				2098
				2099	WARN_ON(!tgtdev);
				2100	mutex_lock(&fs_devices->device_list_mutex);
				2101
				2102	btrfs_sysfs_rm_device_link(fs_devices, tgtdev);
				2103
				2104	if (tgtdev->bdev)
				2105	fs_devices->open_devices--;
				2106
				2107	fs_devices->num_devices--;
				2108
				2109	btrfs_assign_next_active_device(tgtdev, NULL);
				2110
				2111	list_del_rcu(&tgtdev->dev_list);
				2112
				2113	mutex_unlock(&fs_devices->device_list_mutex);
				2114
				2115	/*
				2116	* The update_dev_time() with in btrfs_scratch_superblocks()
				2117	* may lead to a call to btrfs_show_devname() which will try
				2118	* to hold device_list_mutex. And here this device
				2119	* is already out of device list, so we don't have to hold
				2120	* the device_list_mutex lock.
				2121	*/
				2122	btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
				2123
				2124	btrfs_close_bdev(tgtdev);
				2125	call_rcu(&tgtdev->rcu, free_device_rcu);
				2126	}
				2127
				2128	static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
				2129	const char *device_path,
				2130	struct btrfs_device **device)
				2131	{
				2132	int ret = 0;
				2133	struct btrfs_super_block *disk_super;
				2134	u64 devid;
				2135	u8 *dev_uuid;
				2136	struct block_device *bdev;
				2137	struct buffer_head *bh;
				2138
				2139	*device = NULL;
				2140	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
				2141	fs_info->bdev_holder, 0, &bdev, &bh);
				2142	if (ret)
				2143	return ret;
				2144	disk_super = (struct btrfs_super_block *)bh->b_data;
				2145	devid = btrfs_stack_device_id(&disk_super->dev_item);
				2146	dev_uuid = disk_super->dev_item.uuid;
				2147	*device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
				2148	brelse(bh);
				2149	if (!*device)
				2150	ret = -ENOENT;
				2151	blkdev_put(bdev, FMODE_READ);
				2152	return ret;
				2153	}
				2154
				2155	int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
				2156	const char *device_path,
				2157	struct btrfs_device **device)
				2158	{
				2159	*device = NULL;
				2160	if (strcmp(device_path, "missing") == 0) {
				2161	struct list_head *devices;
				2162	struct btrfs_device *tmp;
				2163
				2164	devices = &fs_info->fs_devices->devices;
				2165	list_for_each_entry(tmp, devices, dev_list) {
				2166	if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
				2167	&tmp->dev_state) && !tmp->bdev) {
				2168	*device = tmp;
				2169	break;
				2170	}
				2171	}
				2172
				2173	if (!*device)
				2174	return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
				2175
				2176	return 0;
				2177	} else {
				2178	return btrfs_find_device_by_path(fs_info, device_path, device);
				2179	}
				2180	}
				2181
				2182	/*
				2183	* Lookup a device given by device id, or the path if the id is 0.
				2184	*/
				2185	int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
				2186	const char *devpath,
				2187	struct btrfs_device **device)
				2188	{
				2189	int ret;
				2190
				2191	if (devid) {
				2192	ret = 0;
				2193	*device = btrfs_find_device(fs_info, devid, NULL, NULL);
				2194	if (!*device)
				2195	ret = -ENOENT;
				2196	} else {
				2197	if (!devpath \|\| !devpath[0])
				2198	return -EINVAL;
				2199
				2200	ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
				2201	device);
				2202	}
				2203	return ret;
				2204	}
				2205
				2206	/*
				2207	* does all the dirty work required for changing file system's UUID.
				2208	*/
				2209	static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
				2210	{
				2211	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				2212	struct btrfs_fs_devices *old_devices;
				2213	struct btrfs_fs_devices *seed_devices;
				2214	struct btrfs_super_block *disk_super = fs_info->super_copy;
				2215	struct btrfs_device *device;
				2216	u64 super_flags;
				2217
				2218	lockdep_assert_held(&uuid_mutex);
				2219	if (!fs_devices->seeding)
				2220	return -EINVAL;
				2221
				2222	seed_devices = alloc_fs_devices(NULL);
				2223	if (IS_ERR(seed_devices))
				2224	return PTR_ERR(seed_devices);
				2225
				2226	old_devices = clone_fs_devices(fs_devices);
				2227	if (IS_ERR(old_devices)) {
				2228	kfree(seed_devices);
				2229	return PTR_ERR(old_devices);
				2230	}
				2231
				2232	list_add(&old_devices->fs_list, &fs_uuids);
				2233
				2234	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
				2235	seed_devices->opened = 1;
				2236	INIT_LIST_HEAD(&seed_devices->devices);
				2237	INIT_LIST_HEAD(&seed_devices->alloc_list);
				2238	mutex_init(&seed_devices->device_list_mutex);
				2239
				2240	mutex_lock(&fs_devices->device_list_mutex);
				2241	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
				2242	synchronize_rcu);
				2243	list_for_each_entry(device, &seed_devices->devices, dev_list)
				2244	device->fs_devices = seed_devices;
				2245
				2246	mutex_lock(&fs_info->chunk_mutex);
				2247	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
				2248	mutex_unlock(&fs_info->chunk_mutex);
				2249
				2250	fs_devices->seeding = 0;
				2251	fs_devices->num_devices = 0;
				2252	fs_devices->open_devices = 0;
				2253	fs_devices->missing_devices = 0;
				2254	fs_devices->rotating = 0;
				2255	fs_devices->seed = seed_devices;
				2256
				2257	generate_random_uuid(fs_devices->fsid);
				2258	memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
				2259	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
				2260	mutex_unlock(&fs_devices->device_list_mutex);
				2261
				2262	super_flags = btrfs_super_flags(disk_super) &
				2263	~BTRFS_SUPER_FLAG_SEEDING;
				2264	btrfs_set_super_flags(disk_super, super_flags);
				2265
				2266	return 0;
				2267	}
				2268
				2269	/*
				2270	* Store the expected generation for seed devices in device items.
				2271	*/
				2272	static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
				2273	struct btrfs_fs_info *fs_info)
				2274	{
				2275	struct btrfs_root *root = fs_info->chunk_root;
				2276	struct btrfs_path *path;
				2277	struct extent_buffer *leaf;
				2278	struct btrfs_dev_item *dev_item;
				2279	struct btrfs_device *device;
				2280	struct btrfs_key key;
				2281	u8 fs_uuid[BTRFS_FSID_SIZE];
				2282	u8 dev_uuid[BTRFS_UUID_SIZE];
				2283	u64 devid;
				2284	int ret;
				2285
				2286	path = btrfs_alloc_path();
				2287	if (!path)
				2288	return -ENOMEM;
				2289
				2290	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
				2291	key.offset = 0;
				2292	key.type = BTRFS_DEV_ITEM_KEY;
				2293
				2294	while (1) {
				2295	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
				2296	if (ret < 0)
				2297	goto error;
				2298
				2299	leaf = path->nodes[0];
				2300	next_slot:
				2301	if (path->slots[0] >= btrfs_header_nritems(leaf)) {
				2302	ret = btrfs_next_leaf(root, path);
				2303	if (ret > 0)
				2304	break;
				2305	if (ret < 0)
				2306	goto error;
				2307	leaf = path->nodes[0];
				2308	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				2309	btrfs_release_path(path);
				2310	continue;
				2311	}
				2312
				2313	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				2314	if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID \|\|
				2315	key.type != BTRFS_DEV_ITEM_KEY)
				2316	break;
				2317
				2318	dev_item = btrfs_item_ptr(leaf, path->slots[0],
				2319	struct btrfs_dev_item);
				2320	devid = btrfs_device_id(leaf, dev_item);
				2321	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
				2322	BTRFS_UUID_SIZE);
				2323	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
				2324	BTRFS_FSID_SIZE);
				2325	device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
				2326	BUG_ON(!device); /* Logic error */
				2327
				2328	if (device->fs_devices->seeding) {
				2329	btrfs_set_device_generation(leaf, dev_item,
				2330	device->generation);
				2331	btrfs_mark_buffer_dirty(leaf);
				2332	}
				2333
				2334	path->slots[0]++;
				2335	goto next_slot;
				2336	}
				2337	ret = 0;
				2338	error:
				2339	btrfs_free_path(path);
				2340	return ret;
				2341	}
				2342
				2343	int btrfs_init_new_device(struct btrfs_fs_info fs_info, const char device_path)
				2344	{
				2345	struct btrfs_root *root = fs_info->dev_root;
				2346	struct request_queue *q;
				2347	struct btrfs_trans_handle *trans;
				2348	struct btrfs_device *device;
				2349	struct block_device *bdev;
				2350	struct super_block *sb = fs_info->sb;
				2351	struct rcu_string *name;
				2352	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				2353	u64 orig_super_total_bytes;
				2354	u64 orig_super_num_devices;
				2355	int seeding_dev = 0;
				2356	int ret = 0;
				2357	bool unlocked = false;
				2358
				2359	if (sb_rdonly(sb) && !fs_devices->seeding)
				2360	return -EROFS;
				2361
				2362	bdev = blkdev_get_by_path(device_path, FMODE_WRITE \| FMODE_EXCL,
				2363	fs_info->bdev_holder);
				2364	if (IS_ERR(bdev))
				2365	return PTR_ERR(bdev);
				2366
				2367	if (fs_devices->seeding) {
				2368	seeding_dev = 1;
				2369	down_write(&sb->s_umount);
				2370	mutex_lock(&uuid_mutex);
				2371	}
				2372
				2373	filemap_write_and_wait(bdev->bd_inode->i_mapping);
				2374
				2375	mutex_lock(&fs_devices->device_list_mutex);
				2376	list_for_each_entry(device, &fs_devices->devices, dev_list) {
				2377	if (device->bdev == bdev) {
				2378	ret = -EEXIST;
				2379	mutex_unlock(
				2380	&fs_devices->device_list_mutex);
				2381	goto error;
				2382	}
				2383	}
				2384	mutex_unlock(&fs_devices->device_list_mutex);
				2385
				2386	device = btrfs_alloc_device(fs_info, NULL, NULL);
				2387	if (IS_ERR(device)) {
				2388	/* we can safely leave the fs_devices entry around */
				2389	ret = PTR_ERR(device);
				2390	goto error;
				2391	}
				2392
				2393	name = rcu_string_strdup(device_path, GFP_KERNEL);
				2394	if (!name) {
				2395	ret = -ENOMEM;
				2396	goto error_free_device;
				2397	}
				2398	rcu_assign_pointer(device->name, name);
				2399
				2400	trans = btrfs_start_transaction(root, 0);
				2401	if (IS_ERR(trans)) {
				2402	ret = PTR_ERR(trans);
				2403	goto error_free_device;
				2404	}
				2405
				2406	q = bdev_get_queue(bdev);
				2407	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
				2408	device->generation = trans->transid;
				2409	device->io_width = fs_info->sectorsize;
				2410	device->io_align = fs_info->sectorsize;
				2411	device->sector_size = fs_info->sectorsize;
				2412	device->total_bytes = round_down(i_size_read(bdev->bd_inode),
				2413	fs_info->sectorsize);
				2414	device->disk_total_bytes = device->total_bytes;
				2415	device->commit_total_bytes = device->total_bytes;
				2416	device->fs_info = fs_info;
				2417	device->bdev = bdev;
				2418	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
				2419	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
				2420	device->mode = FMODE_EXCL;
				2421	device->dev_stats_valid = 1;
				2422	set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
				2423
				2424	if (seeding_dev) {
				2425	sb->s_flags &= ~SB_RDONLY;
				2426	ret = btrfs_prepare_sprout(fs_info);
				2427	if (ret) {
				2428	btrfs_abort_transaction(trans, ret);
				2429	goto error_trans;
				2430	}
				2431	}
				2432
				2433	device->fs_devices = fs_devices;
				2434
				2435	mutex_lock(&fs_devices->device_list_mutex);
				2436	mutex_lock(&fs_info->chunk_mutex);
				2437	list_add_rcu(&device->dev_list, &fs_devices->devices);
				2438	list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
				2439	fs_devices->num_devices++;
				2440	fs_devices->open_devices++;
				2441	fs_devices->rw_devices++;
				2442	fs_devices->total_devices++;
				2443	fs_devices->total_rw_bytes += device->total_bytes;
				2444
				2445	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
				2446
				2447	if (!blk_queue_nonrot(q))
				2448	fs_devices->rotating = 1;
				2449
				2450	orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
				2451	btrfs_set_super_total_bytes(fs_info->super_copy,
				2452	round_down(orig_super_total_bytes + device->total_bytes,
				2453	fs_info->sectorsize));
				2454
				2455	orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
				2456	btrfs_set_super_num_devices(fs_info->super_copy,
				2457	orig_super_num_devices + 1);
				2458
				2459	/* add sysfs device entry */
				2460	btrfs_sysfs_add_device_link(fs_devices, device);
				2461
				2462	/*
				2463	* we've got more storage, clear any full flags on the space
				2464	* infos
				2465	*/
				2466	btrfs_clear_space_info_full(fs_info);
				2467
				2468	mutex_unlock(&fs_info->chunk_mutex);
				2469	mutex_unlock(&fs_devices->device_list_mutex);
				2470
				2471	if (seeding_dev) {
				2472	mutex_lock(&fs_info->chunk_mutex);
				2473	ret = init_first_rw_device(trans, fs_info);
				2474	mutex_unlock(&fs_info->chunk_mutex);
				2475	if (ret) {
				2476	btrfs_abort_transaction(trans, ret);
				2477	goto error_sysfs;
				2478	}
				2479	}
				2480
				2481	ret = btrfs_add_dev_item(trans, device);
				2482	if (ret) {
				2483	btrfs_abort_transaction(trans, ret);
				2484	goto error_sysfs;
				2485	}
				2486
				2487	if (seeding_dev) {
				2488	char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
				2489
				2490	ret = btrfs_finish_sprout(trans, fs_info);
				2491	if (ret) {
				2492	btrfs_abort_transaction(trans, ret);
				2493	goto error_sysfs;
				2494	}
				2495
				2496	/* Sprouting would change fsid of the mounted root,
				2497	* so rename the fsid on the sysfs
				2498	*/
				2499	snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
				2500	fs_info->fsid);
				2501	if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))
				2502	btrfs_warn(fs_info,
				2503	"sysfs: failed to create fsid for sprout");
				2504	}
				2505
				2506	ret = btrfs_commit_transaction(trans);
				2507
				2508	if (seeding_dev) {
				2509	mutex_unlock(&uuid_mutex);
				2510	up_write(&sb->s_umount);
				2511	unlocked = true;
				2512
				2513	if (ret) /* transaction commit */
				2514	return ret;
				2515
				2516	ret = btrfs_relocate_sys_chunks(fs_info);
				2517	if (ret < 0)
				2518	btrfs_handle_fs_error(fs_info, ret,
				2519	"Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
				2520	trans = btrfs_attach_transaction(root);
				2521	if (IS_ERR(trans)) {
				2522	if (PTR_ERR(trans) == -ENOENT)
				2523	return 0;
				2524	ret = PTR_ERR(trans);
				2525	trans = NULL;
				2526	goto error_sysfs;
				2527	}
				2528	ret = btrfs_commit_transaction(trans);
				2529	}
				2530
				2531	/* Update ctime/mtime for libblkid */
				2532	update_dev_time(device_path);
				2533	return ret;
				2534
				2535	error_sysfs:
				2536	btrfs_sysfs_rm_device_link(fs_devices, device);
				2537	mutex_lock(&fs_info->fs_devices->device_list_mutex);
				2538	mutex_lock(&fs_info->chunk_mutex);
				2539	list_del_rcu(&device->dev_list);
				2540	list_del(&device->dev_alloc_list);
				2541	fs_info->fs_devices->num_devices--;
				2542	fs_info->fs_devices->open_devices--;
				2543	fs_info->fs_devices->rw_devices--;
				2544	fs_info->fs_devices->total_devices--;
				2545	fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
				2546	atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
				2547	btrfs_set_super_total_bytes(fs_info->super_copy,
				2548	orig_super_total_bytes);
				2549	btrfs_set_super_num_devices(fs_info->super_copy,
				2550	orig_super_num_devices);
				2551	mutex_unlock(&fs_info->chunk_mutex);
				2552	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				2553	error_trans:
				2554	if (seeding_dev)
				2555	sb->s_flags \|= SB_RDONLY;
				2556	if (trans)
				2557	btrfs_end_transaction(trans);
				2558	error_free_device:
				2559	btrfs_free_device(device);
				2560	error:
				2561	blkdev_put(bdev, FMODE_EXCL);
				2562	if (seeding_dev && !unlocked) {
				2563	mutex_unlock(&uuid_mutex);
				2564	up_write(&sb->s_umount);
				2565	}
				2566	return ret;
				2567	}
				2568
				2569	static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
				2570	struct btrfs_device *device)
				2571	{
				2572	int ret;
				2573	struct btrfs_path *path;
				2574	struct btrfs_root *root = device->fs_info->chunk_root;
				2575	struct btrfs_dev_item *dev_item;
				2576	struct extent_buffer *leaf;
				2577	struct btrfs_key key;
				2578
				2579	path = btrfs_alloc_path();
				2580	if (!path)
				2581	return -ENOMEM;
				2582
				2583	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
				2584	key.type = BTRFS_DEV_ITEM_KEY;
				2585	key.offset = device->devid;
				2586
				2587	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
				2588	if (ret < 0)
				2589	goto out;
				2590
				2591	if (ret > 0) {
				2592	ret = -ENOENT;
				2593	goto out;
				2594	}
				2595
				2596	leaf = path->nodes[0];
				2597	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
				2598
				2599	btrfs_set_device_id(leaf, dev_item, device->devid);
				2600	btrfs_set_device_type(leaf, dev_item, device->type);
				2601	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
				2602	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
				2603	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
				2604	btrfs_set_device_total_bytes(leaf, dev_item,
				2605	btrfs_device_get_disk_total_bytes(device));
				2606	btrfs_set_device_bytes_used(leaf, dev_item,
				2607	btrfs_device_get_bytes_used(device));
				2608	btrfs_mark_buffer_dirty(leaf);
				2609
				2610	out:
				2611	btrfs_free_path(path);
				2612	return ret;
				2613	}
				2614
				2615	int btrfs_grow_device(struct btrfs_trans_handle *trans,
				2616	struct btrfs_device *device, u64 new_size)
				2617	{
				2618	struct btrfs_fs_info *fs_info = device->fs_info;
				2619	struct btrfs_super_block *super_copy = fs_info->super_copy;
				2620	struct btrfs_fs_devices *fs_devices;
				2621	u64 old_total;
				2622	u64 diff;
				2623
				2624	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
				2625	return -EACCES;
				2626
				2627	new_size = round_down(new_size, fs_info->sectorsize);
				2628
				2629	mutex_lock(&fs_info->chunk_mutex);
				2630	old_total = btrfs_super_total_bytes(super_copy);
				2631	diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
				2632
				2633	if (new_size <= device->total_bytes \|\|
				2634	test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
				2635	mutex_unlock(&fs_info->chunk_mutex);
				2636	return -EINVAL;
				2637	}
				2638
				2639	fs_devices = fs_info->fs_devices;
				2640
				2641	btrfs_set_super_total_bytes(super_copy,
				2642	round_down(old_total + diff, fs_info->sectorsize));
				2643	device->fs_devices->total_rw_bytes += diff;
				2644
				2645	btrfs_device_set_total_bytes(device, new_size);
				2646	btrfs_device_set_disk_total_bytes(device, new_size);
				2647	btrfs_clear_space_info_full(device->fs_info);
				2648	if (list_empty(&device->resized_list))
				2649	list_add_tail(&device->resized_list,
				2650	&fs_devices->resized_devices);
				2651	mutex_unlock(&fs_info->chunk_mutex);
				2652
				2653	return btrfs_update_device(trans, device);
				2654	}
				2655
				2656	static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
				2657	{
				2658	struct btrfs_fs_info *fs_info = trans->fs_info;
				2659	struct btrfs_root *root = fs_info->chunk_root;
				2660	int ret;
				2661	struct btrfs_path *path;
				2662	struct btrfs_key key;
				2663
				2664	path = btrfs_alloc_path();
				2665	if (!path)
				2666	return -ENOMEM;
				2667
				2668	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
				2669	key.offset = chunk_offset;
				2670	key.type = BTRFS_CHUNK_ITEM_KEY;
				2671
				2672	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				2673	if (ret < 0)
				2674	goto out;
				2675	else if (ret > 0) { /* Logic error or corruption */
				2676	btrfs_handle_fs_error(fs_info, -ENOENT,
				2677	"Failed lookup while freeing chunk.");
				2678	ret = -ENOENT;
				2679	goto out;
				2680	}
				2681
				2682	ret = btrfs_del_item(trans, root, path);
				2683	if (ret < 0)
				2684	btrfs_handle_fs_error(fs_info, ret,
				2685	"Failed to delete chunk item.");
				2686	out:
				2687	btrfs_free_path(path);
				2688	return ret;
				2689	}
				2690
				2691	static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
				2692	{
				2693	struct btrfs_super_block *super_copy = fs_info->super_copy;
				2694	struct btrfs_disk_key *disk_key;
				2695	struct btrfs_chunk *chunk;
				2696	u8 *ptr;
				2697	int ret = 0;
				2698	u32 num_stripes;
				2699	u32 array_size;
				2700	u32 len = 0;
				2701	u32 cur;
				2702	struct btrfs_key key;
				2703
				2704	mutex_lock(&fs_info->chunk_mutex);
				2705	array_size = btrfs_super_sys_array_size(super_copy);
				2706
				2707	ptr = super_copy->sys_chunk_array;
				2708	cur = 0;
				2709
				2710	while (cur < array_size) {
				2711	disk_key = (struct btrfs_disk_key *)ptr;
				2712	btrfs_disk_key_to_cpu(&key, disk_key);
				2713
				2714	len = sizeof(*disk_key);
				2715
				2716	if (key.type == BTRFS_CHUNK_ITEM_KEY) {
				2717	chunk = (struct btrfs_chunk *)(ptr + len);
				2718	num_stripes = btrfs_stack_chunk_num_stripes(chunk);
				2719	len += btrfs_chunk_item_size(num_stripes);
				2720	} else {
				2721	ret = -EIO;
				2722	break;
				2723	}
				2724	if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
				2725	key.offset == chunk_offset) {
				2726	memmove(ptr, ptr + len, array_size - (cur + len));
				2727	array_size -= len;
				2728	btrfs_set_super_sys_array_size(super_copy, array_size);
				2729	} else {
				2730	ptr += len;
				2731	cur += len;
				2732	}
				2733	}
				2734	mutex_unlock(&fs_info->chunk_mutex);
				2735	return ret;
				2736	}
				2737
				2738	static struct extent_map get_chunk_map(struct btrfs_fs_info fs_info,
				2739	u64 logical, u64 length)
				2740	{
				2741	struct extent_map_tree *em_tree;
				2742	struct extent_map *em;
				2743
				2744	em_tree = &fs_info->mapping_tree.map_tree;
				2745	read_lock(&em_tree->lock);
				2746	em = lookup_extent_mapping(em_tree, logical, length);
				2747	read_unlock(&em_tree->lock);
				2748
				2749	if (!em) {
				2750	btrfs_crit(fs_info, "unable to find logical %llu length %llu",
				2751	logical, length);
				2752	return ERR_PTR(-EINVAL);
				2753	}
				2754
				2755	if (em->start > logical \|\| em->start + em->len < logical) {
				2756	btrfs_crit(fs_info,
				2757	"found a bad mapping, wanted %llu-%llu, found %llu-%llu",
				2758	logical, length, em->start, em->start + em->len);
				2759	free_extent_map(em);
				2760	return ERR_PTR(-EINVAL);
				2761	}
				2762
				2763	/* callers are responsible for dropping em's ref. */
				2764	return em;
				2765	}
				2766
				2767	int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
				2768	{
				2769	struct btrfs_fs_info *fs_info = trans->fs_info;
				2770	struct extent_map *em;
				2771	struct map_lookup *map;
				2772	u64 dev_extent_len = 0;
				2773	int i, ret = 0;
				2774	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				2775
				2776	em = get_chunk_map(fs_info, chunk_offset, 1);
				2777	if (IS_ERR(em)) {
				2778	/*
				2779	* This is a logic error, but we don't want to just rely on the
				2780	* user having built with ASSERT enabled, so if ASSERT doesn't
				2781	* do anything we still error out.
				2782	*/
				2783	ASSERT(0);
				2784	return PTR_ERR(em);
				2785	}
				2786	map = em->map_lookup;
				2787	mutex_lock(&fs_info->chunk_mutex);
				2788	check_system_chunk(trans, map->type);
				2789	mutex_unlock(&fs_info->chunk_mutex);
				2790
				2791	/*
				2792	* Take the device list mutex to prevent races with the final phase of
				2793	* a device replace operation that replaces the device object associated
				2794	* with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
				2795	*/
				2796	mutex_lock(&fs_devices->device_list_mutex);
				2797	for (i = 0; i < map->num_stripes; i++) {
				2798	struct btrfs_device *device = map->stripes[i].dev;
				2799	ret = btrfs_free_dev_extent(trans, device,
				2800	map->stripes[i].physical,
				2801	&dev_extent_len);
				2802	if (ret) {
				2803	mutex_unlock(&fs_devices->device_list_mutex);
				2804	btrfs_abort_transaction(trans, ret);
				2805	goto out;
				2806	}
				2807
				2808	if (device->bytes_used > 0) {
				2809	mutex_lock(&fs_info->chunk_mutex);
				2810	btrfs_device_set_bytes_used(device,
				2811	device->bytes_used - dev_extent_len);
				2812	atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
				2813	btrfs_clear_space_info_full(fs_info);
				2814	mutex_unlock(&fs_info->chunk_mutex);
				2815	}
				2816
				2817	if (map->stripes[i].dev) {
				2818	ret = btrfs_update_device(trans, map->stripes[i].dev);
				2819	if (ret) {
				2820	mutex_unlock(&fs_devices->device_list_mutex);
				2821	btrfs_abort_transaction(trans, ret);
				2822	goto out;
				2823	}
				2824	}
				2825	}
				2826	mutex_unlock(&fs_devices->device_list_mutex);
				2827
				2828	ret = btrfs_free_chunk(trans, chunk_offset);
				2829	if (ret) {
				2830	btrfs_abort_transaction(trans, ret);
				2831	goto out;
				2832	}
				2833
				2834	trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
				2835
				2836	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
				2837	ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
				2838	if (ret) {
				2839	btrfs_abort_transaction(trans, ret);
				2840	goto out;
				2841	}
				2842	}
				2843
				2844	ret = btrfs_remove_block_group(trans, chunk_offset, em);
				2845	if (ret) {
				2846	btrfs_abort_transaction(trans, ret);
				2847	goto out;
				2848	}
				2849
				2850	out:
				2851	/* once for us */
				2852	free_extent_map(em);
				2853	return ret;
				2854	}
				2855
				2856	static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
				2857	{
				2858	struct btrfs_root *root = fs_info->chunk_root;
				2859	struct btrfs_trans_handle *trans;
				2860	int ret;
				2861
				2862	/*
				2863	* Prevent races with automatic removal of unused block groups.
				2864	* After we relocate and before we remove the chunk with offset
				2865	* chunk_offset, automatic removal of the block group can kick in,
				2866	* resulting in a failure when calling btrfs_remove_chunk() below.
				2867	*
				2868	* Make sure to acquire this mutex before doing a tree search (dev
				2869	* or chunk trees) to find chunks. Otherwise the cleaner kthread might
				2870	* call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
				2871	* we release the path used to search the chunk/dev tree and before
				2872	* the current task acquires this mutex and calls us.
				2873	*/
				2874	lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
				2875
				2876	ret = btrfs_can_relocate(fs_info, chunk_offset);
				2877	if (ret)
				2878	return -ENOSPC;
				2879
				2880	/* step one, relocate all the extents inside this chunk */
				2881	btrfs_scrub_pause(fs_info);
				2882	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
				2883	btrfs_scrub_continue(fs_info);
				2884	if (ret)
				2885	return ret;
				2886
				2887	/*
				2888	* We add the kobjects here (and after forcing data chunk creation)
				2889	* since relocation is the only place we'll create chunks of a new
				2890	* type at runtime. The only place where we'll remove the last
				2891	* chunk of a type is the call immediately below this one. Even
				2892	* so, we're protected against races with the cleaner thread since
				2893	* we're covered by the delete_unused_bgs_mutex.
				2894	*/
				2895	btrfs_add_raid_kobjects(fs_info);
				2896
				2897	trans = btrfs_start_trans_remove_block_group(root->fs_info,
				2898	chunk_offset);
				2899	if (IS_ERR(trans)) {
				2900	ret = PTR_ERR(trans);
				2901	btrfs_handle_fs_error(root->fs_info, ret, NULL);
				2902	return ret;
				2903	}
				2904
				2905	/*
				2906	* step two, delete the device extents and the
				2907	* chunk tree entries
				2908	*/
				2909	ret = btrfs_remove_chunk(trans, chunk_offset);
				2910	btrfs_end_transaction(trans);
				2911	return ret;
				2912	}
				2913
				2914	static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
				2915	{
				2916	struct btrfs_root *chunk_root = fs_info->chunk_root;
				2917	struct btrfs_path *path;
				2918	struct extent_buffer *leaf;
				2919	struct btrfs_chunk *chunk;
				2920	struct btrfs_key key;
				2921	struct btrfs_key found_key;
				2922	u64 chunk_type;
				2923	bool retried = false;
				2924	int failed = 0;
				2925	int ret;
				2926
				2927	path = btrfs_alloc_path();
				2928	if (!path)
				2929	return -ENOMEM;
				2930
				2931	again:
				2932	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
				2933	key.offset = (u64)-1;
				2934	key.type = BTRFS_CHUNK_ITEM_KEY;
				2935
				2936	while (1) {
				2937	mutex_lock(&fs_info->delete_unused_bgs_mutex);
				2938	ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
				2939	if (ret < 0) {
				2940	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				2941	goto error;
				2942	}
				2943	BUG_ON(ret == 0); /* Corruption */
				2944
				2945	ret = btrfs_previous_item(chunk_root, path, key.objectid,
				2946	key.type);
				2947	if (ret)
				2948	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				2949	if (ret < 0)
				2950	goto error;
				2951	if (ret > 0)
				2952	break;
				2953
				2954	leaf = path->nodes[0];
				2955	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
				2956
				2957	chunk = btrfs_item_ptr(leaf, path->slots[0],
				2958	struct btrfs_chunk);
				2959	chunk_type = btrfs_chunk_type(leaf, chunk);
				2960	btrfs_release_path(path);
				2961
				2962	if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
				2963	ret = btrfs_relocate_chunk(fs_info, found_key.offset);
				2964	if (ret == -ENOSPC)
				2965	failed++;
				2966	else
				2967	BUG_ON(ret);
				2968	}
				2969	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				2970
				2971	if (found_key.offset == 0)
				2972	break;
				2973	key.offset = found_key.offset - 1;
				2974	}
				2975	ret = 0;
				2976	if (failed && !retried) {
				2977	failed = 0;
				2978	retried = true;
				2979	goto again;
				2980	} else if (WARN_ON(failed && retried)) {
				2981	ret = -ENOSPC;
				2982	}
				2983	error:
				2984	btrfs_free_path(path);
				2985	return ret;
				2986	}
				2987
				2988	/*
				2989	* return 1 : allocate a data chunk successfully,
				2990	* return <0: errors during allocating a data chunk,
				2991	* return 0 : no need to allocate a data chunk.
				2992	*/
				2993	static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
				2994	u64 chunk_offset)
				2995	{
				2996	struct btrfs_block_group_cache *cache;
				2997	u64 bytes_used;
				2998	u64 chunk_type;
				2999
				3000	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
				3001	ASSERT(cache);
				3002	chunk_type = cache->flags;
				3003	btrfs_put_block_group(cache);
				3004
				3005	if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
				3006	spin_lock(&fs_info->data_sinfo->lock);
				3007	bytes_used = fs_info->data_sinfo->bytes_used;
				3008	spin_unlock(&fs_info->data_sinfo->lock);
				3009
				3010	if (!bytes_used) {
				3011	struct btrfs_trans_handle *trans;
				3012	int ret;
				3013
				3014	trans = btrfs_join_transaction(fs_info->tree_root);
				3015	if (IS_ERR(trans))
				3016	return PTR_ERR(trans);
				3017
				3018	ret = btrfs_force_chunk_alloc(trans,
				3019	BTRFS_BLOCK_GROUP_DATA);
				3020	btrfs_end_transaction(trans);
				3021	if (ret < 0)
				3022	return ret;
				3023
				3024	btrfs_add_raid_kobjects(fs_info);
				3025
				3026	return 1;
				3027	}
				3028	}
				3029	return 0;
				3030	}
				3031
				3032	static int insert_balance_item(struct btrfs_fs_info *fs_info,
				3033	struct btrfs_balance_control *bctl)
				3034	{
				3035	struct btrfs_root *root = fs_info->tree_root;
				3036	struct btrfs_trans_handle *trans;
				3037	struct btrfs_balance_item *item;
				3038	struct btrfs_disk_balance_args disk_bargs;
				3039	struct btrfs_path *path;
				3040	struct extent_buffer *leaf;
				3041	struct btrfs_key key;
				3042	int ret, err;
				3043
				3044	path = btrfs_alloc_path();
				3045	if (!path)
				3046	return -ENOMEM;
				3047
				3048	trans = btrfs_start_transaction(root, 0);
				3049	if (IS_ERR(trans)) {
				3050	btrfs_free_path(path);
				3051	return PTR_ERR(trans);
				3052	}
				3053
				3054	key.objectid = BTRFS_BALANCE_OBJECTID;
				3055	key.type = BTRFS_TEMPORARY_ITEM_KEY;
				3056	key.offset = 0;
				3057
				3058	ret = btrfs_insert_empty_item(trans, root, path, &key,
				3059	sizeof(*item));
				3060	if (ret)
				3061	goto out;
				3062
				3063	leaf = path->nodes[0];
				3064	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
				3065
				3066	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
				3067
				3068	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
				3069	btrfs_set_balance_data(leaf, item, &disk_bargs);
				3070	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
				3071	btrfs_set_balance_meta(leaf, item, &disk_bargs);
				3072	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
				3073	btrfs_set_balance_sys(leaf, item, &disk_bargs);
				3074
				3075	btrfs_set_balance_flags(leaf, item, bctl->flags);
				3076
				3077	btrfs_mark_buffer_dirty(leaf);
				3078	out:
				3079	btrfs_free_path(path);
				3080	err = btrfs_commit_transaction(trans);
				3081	if (err && !ret)
				3082	ret = err;
				3083	return ret;
				3084	}
				3085
				3086	static int del_balance_item(struct btrfs_fs_info *fs_info)
				3087	{
				3088	struct btrfs_root *root = fs_info->tree_root;
				3089	struct btrfs_trans_handle *trans;
				3090	struct btrfs_path *path;
				3091	struct btrfs_key key;
				3092	int ret, err;
				3093
				3094	path = btrfs_alloc_path();
				3095	if (!path)
				3096	return -ENOMEM;
				3097
				3098	trans = btrfs_start_transaction(root, 0);
				3099	if (IS_ERR(trans)) {
				3100	btrfs_free_path(path);
				3101	return PTR_ERR(trans);
				3102	}
				3103
				3104	key.objectid = BTRFS_BALANCE_OBJECTID;
				3105	key.type = BTRFS_TEMPORARY_ITEM_KEY;
				3106	key.offset = 0;
				3107
				3108	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				3109	if (ret < 0)
				3110	goto out;
				3111	if (ret > 0) {
				3112	ret = -ENOENT;
				3113	goto out;
				3114	}
				3115
				3116	ret = btrfs_del_item(trans, root, path);
				3117	out:
				3118	btrfs_free_path(path);
				3119	err = btrfs_commit_transaction(trans);
				3120	if (err && !ret)
				3121	ret = err;
				3122	return ret;
				3123	}
				3124
				3125	/*
				3126	* This is a heuristic used to reduce the number of chunks balanced on
				3127	* resume after balance was interrupted.
				3128	*/
				3129	static void update_balance_args(struct btrfs_balance_control *bctl)
				3130	{
				3131	/*
				3132	* Turn on soft mode for chunk types that were being converted.
				3133	*/
				3134	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
				3135	bctl->data.flags \|= BTRFS_BALANCE_ARGS_SOFT;
				3136	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
				3137	bctl->sys.flags \|= BTRFS_BALANCE_ARGS_SOFT;
				3138	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
				3139	bctl->meta.flags \|= BTRFS_BALANCE_ARGS_SOFT;
				3140
				3141	/*
				3142	* Turn on usage filter if is not already used. The idea is
				3143	* that chunks that we have already balanced should be
				3144	* reasonably full. Don't do it for chunks that are being
				3145	* converted - that will keep us from relocating unconverted
				3146	* (albeit full) chunks.
				3147	*/
				3148	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
				3149	!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
				3150	!(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
				3151	bctl->data.flags \|= BTRFS_BALANCE_ARGS_USAGE;
				3152	bctl->data.usage = 90;
				3153	}
				3154	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
				3155	!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
				3156	!(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
				3157	bctl->sys.flags \|= BTRFS_BALANCE_ARGS_USAGE;
				3158	bctl->sys.usage = 90;
				3159	}
				3160	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
				3161	!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
				3162	!(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
				3163	bctl->meta.flags \|= BTRFS_BALANCE_ARGS_USAGE;
				3164	bctl->meta.usage = 90;
				3165	}
				3166	}
				3167
				3168	/*
				3169	* Clear the balance status in fs_info and delete the balance item from disk.
				3170	*/
				3171	static void reset_balance_state(struct btrfs_fs_info *fs_info)
				3172	{
				3173	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
				3174	int ret;
				3175
				3176	BUG_ON(!fs_info->balance_ctl);
				3177
				3178	spin_lock(&fs_info->balance_lock);
				3179	fs_info->balance_ctl = NULL;
				3180	spin_unlock(&fs_info->balance_lock);
				3181
				3182	kfree(bctl);
				3183	ret = del_balance_item(fs_info);
				3184	if (ret)
				3185	btrfs_handle_fs_error(fs_info, ret, NULL);
				3186	}
				3187
				3188	/*
				3189	* Balance filters. Return 1 if chunk should be filtered out
				3190	* (should not be balanced).
				3191	*/
				3192	static int chunk_profiles_filter(u64 chunk_type,
				3193	struct btrfs_balance_args *bargs)
				3194	{
				3195	chunk_type = chunk_to_extended(chunk_type) &
				3196	BTRFS_EXTENDED_PROFILE_MASK;
				3197
				3198	if (bargs->profiles & chunk_type)
				3199	return 0;
				3200
				3201	return 1;
				3202	}
				3203
				3204	static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
				3205	struct btrfs_balance_args *bargs)
				3206	{
				3207	struct btrfs_block_group_cache *cache;
				3208	u64 chunk_used;
				3209	u64 user_thresh_min;
				3210	u64 user_thresh_max;
				3211	int ret = 1;
				3212
				3213	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
				3214	chunk_used = btrfs_block_group_used(&cache->item);
				3215
				3216	if (bargs->usage_min == 0)
				3217	user_thresh_min = 0;
				3218	else
				3219	user_thresh_min = div_factor_fine(cache->key.offset,
				3220	bargs->usage_min);
				3221
				3222	if (bargs->usage_max == 0)
				3223	user_thresh_max = 1;
				3224	else if (bargs->usage_max > 100)
				3225	user_thresh_max = cache->key.offset;
				3226	else
				3227	user_thresh_max = div_factor_fine(cache->key.offset,
				3228	bargs->usage_max);
				3229
				3230	if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
				3231	ret = 0;
				3232
				3233	btrfs_put_block_group(cache);
				3234	return ret;
				3235	}
				3236
				3237	static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
				3238	u64 chunk_offset, struct btrfs_balance_args *bargs)
				3239	{
				3240	struct btrfs_block_group_cache *cache;
				3241	u64 chunk_used, user_thresh;
				3242	int ret = 1;
				3243
				3244	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
				3245	chunk_used = btrfs_block_group_used(&cache->item);
				3246
				3247	if (bargs->usage_min == 0)
				3248	user_thresh = 1;
				3249	else if (bargs->usage > 100)
				3250	user_thresh = cache->key.offset;
				3251	else
				3252	user_thresh = div_factor_fine(cache->key.offset,
				3253	bargs->usage);
				3254
				3255	if (chunk_used < user_thresh)
				3256	ret = 0;
				3257
				3258	btrfs_put_block_group(cache);
				3259	return ret;
				3260	}
				3261
				3262	static int chunk_devid_filter(struct extent_buffer *leaf,
				3263	struct btrfs_chunk *chunk,
				3264	struct btrfs_balance_args *bargs)
				3265	{
				3266	struct btrfs_stripe *stripe;
				3267	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
				3268	int i;
				3269
				3270	for (i = 0; i < num_stripes; i++) {
				3271	stripe = btrfs_stripe_nr(chunk, i);
				3272	if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
				3273	return 0;
				3274	}
				3275
				3276	return 1;
				3277	}
				3278
				3279	/* [pstart, pend) */
				3280	static int chunk_drange_filter(struct extent_buffer *leaf,
				3281	struct btrfs_chunk *chunk,
				3282	struct btrfs_balance_args *bargs)
				3283	{
				3284	struct btrfs_stripe *stripe;
				3285	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
				3286	u64 stripe_offset;
				3287	u64 stripe_length;
				3288	int factor;
				3289	int i;
				3290
				3291	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
				3292	return 0;
				3293
				3294	if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP \|
				3295	BTRFS_BLOCK_GROUP_RAID1 \| BTRFS_BLOCK_GROUP_RAID10)) {
				3296	factor = num_stripes / 2;
				3297	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
				3298	factor = num_stripes - 1;
				3299	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
				3300	factor = num_stripes - 2;
				3301	} else {
				3302	factor = num_stripes;
				3303	}
				3304
				3305	for (i = 0; i < num_stripes; i++) {
				3306	stripe = btrfs_stripe_nr(chunk, i);
				3307	if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
				3308	continue;
				3309
				3310	stripe_offset = btrfs_stripe_offset(leaf, stripe);
				3311	stripe_length = btrfs_chunk_length(leaf, chunk);
				3312	stripe_length = div_u64(stripe_length, factor);
				3313
				3314	if (stripe_offset < bargs->pend &&
				3315	stripe_offset + stripe_length > bargs->pstart)
				3316	return 0;
				3317	}
				3318
				3319	return 1;
				3320	}
				3321
				3322	/* [vstart, vend) */
				3323	static int chunk_vrange_filter(struct extent_buffer *leaf,
				3324	struct btrfs_chunk *chunk,
				3325	u64 chunk_offset,
				3326	struct btrfs_balance_args *bargs)
				3327	{
				3328	if (chunk_offset < bargs->vend &&
				3329	chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
				3330	/* at least part of the chunk is inside this vrange */
				3331	return 0;
				3332
				3333	return 1;
				3334	}
				3335
				3336	static int chunk_stripes_range_filter(struct extent_buffer *leaf,
				3337	struct btrfs_chunk *chunk,
				3338	struct btrfs_balance_args *bargs)
				3339	{
				3340	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
				3341
				3342	if (bargs->stripes_min <= num_stripes
				3343	&& num_stripes <= bargs->stripes_max)
				3344	return 0;
				3345
				3346	return 1;
				3347	}
				3348
				3349	static int chunk_soft_convert_filter(u64 chunk_type,
				3350	struct btrfs_balance_args *bargs)
				3351	{
				3352	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
				3353	return 0;
				3354
				3355	chunk_type = chunk_to_extended(chunk_type) &
				3356	BTRFS_EXTENDED_PROFILE_MASK;
				3357
				3358	if (bargs->target == chunk_type)
				3359	return 1;
				3360
				3361	return 0;
				3362	}
				3363
				3364	static int should_balance_chunk(struct btrfs_fs_info *fs_info,
				3365	struct extent_buffer *leaf,
				3366	struct btrfs_chunk *chunk, u64 chunk_offset)
				3367	{
				3368	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
				3369	struct btrfs_balance_args *bargs = NULL;
				3370	u64 chunk_type = btrfs_chunk_type(leaf, chunk);
				3371
				3372	/* type filter */
				3373	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
				3374	(bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
				3375	return 0;
				3376	}
				3377
				3378	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
				3379	bargs = &bctl->data;
				3380	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
				3381	bargs = &bctl->sys;
				3382	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
				3383	bargs = &bctl->meta;
				3384
				3385	/* profiles filter */
				3386	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
				3387	chunk_profiles_filter(chunk_type, bargs)) {
				3388	return 0;
				3389	}
				3390
				3391	/* usage filter */
				3392	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
				3393	chunk_usage_filter(fs_info, chunk_offset, bargs)) {
				3394	return 0;
				3395	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
				3396	chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
				3397	return 0;
				3398	}
				3399
				3400	/* devid filter */
				3401	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
				3402	chunk_devid_filter(leaf, chunk, bargs)) {
				3403	return 0;
				3404	}
				3405
				3406	/* drange filter, makes sense only with devid filter */
				3407	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
				3408	chunk_drange_filter(leaf, chunk, bargs)) {
				3409	return 0;
				3410	}
				3411
				3412	/* vrange filter */
				3413	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
				3414	chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
				3415	return 0;
				3416	}
				3417
				3418	/* stripes filter */
				3419	if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
				3420	chunk_stripes_range_filter(leaf, chunk, bargs)) {
				3421	return 0;
				3422	}
				3423
				3424	/* soft profile changing mode */
				3425	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
				3426	chunk_soft_convert_filter(chunk_type, bargs)) {
				3427	return 0;
				3428	}
				3429
				3430	/*
				3431	* limited by count, must be the last filter
				3432	*/
				3433	if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
				3434	if (bargs->limit == 0)
				3435	return 0;
				3436	else
				3437	bargs->limit--;
				3438	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
				3439	/*
				3440	* Same logic as the 'limit' filter; the minimum cannot be
				3441	* determined here because we do not have the global information
				3442	* about the count of all chunks that satisfy the filters.
				3443	*/
				3444	if (bargs->limit_max == 0)
				3445	return 0;
				3446	else
				3447	bargs->limit_max--;
				3448	}
				3449
				3450	return 1;
				3451	}
				3452
					/*
					 * Run the balance described by fs_info->balance_ctl.
					 *
					 * Step one walks fs_info->fs_devices->devices and, for every writeable,
					 * non-replace-target device whose free space (total bytes minus bytes used)
					 * is not larger than size_to_free, shrinks the device by size_to_free and
					 * grows it back to its old size.  -ENOSPC from the shrink ends this step
					 * without failing the balance.
					 *
					 * Step two walks the chunk tree from the highest offset downwards, twice:
					 * the first ("counting") pass only counts the chunks accepted by
					 * should_balance_chunk(), the second pass relocates them.
					 *
					 * Returns 0 on success, -ECANCELED when a cancel request (or, during the
					 * second pass, a pause request) is seen, -ENOMEM if no path could be
					 * allocated, -ENOSPC if the run otherwise succeeded but at least one chunk
					 * relocation failed with -ENOSPC, or another negative error.
					 */
				3453	static int __btrfs_balance(struct btrfs_fs_info *fs_info)
				3454	{
				3455	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
				3456	struct btrfs_root *chunk_root = fs_info->chunk_root;
				3457	struct btrfs_root *dev_root = fs_info->dev_root;
				3458	struct list_head *devices;
				3459	struct btrfs_device *device;
				3460	u64 old_size;
				3461	u64 size_to_free;
				3462	u64 chunk_type;
				3463	struct btrfs_chunk *chunk;
				3464	struct btrfs_path *path = NULL;
				3465	struct btrfs_key key;
				3466	struct btrfs_key found_key;
				3467	struct btrfs_trans_handle *trans;
				3468	struct extent_buffer *leaf;
				3469	int slot;
				3470	int ret;
				3471	int enospc_errors = 0;
					/* true during the first (counting) pass, false during the relocating pass */
				3472	bool counting = true;
				3473	/*
					 * The single value limit and min/max limits use the same bytes in the
					 * balance args (presumably a union - confirm against struct
					 * btrfs_balance_args), so saving ->limit here is meant to cover both.
					 */
				3474	u64 limit_data = bctl->data.limit;
				3475	u64 limit_meta = bctl->meta.limit;
				3476	u64 limit_sys = bctl->sys.limit;
				3477	u32 count_data = 0;
				3478	u32 count_meta = 0;
				3479	u32 count_sys = 0;
				3480	int chunk_reserved = 0;
				3481
				3482	/* step one make some room on all the devices */
				3483	devices = &fs_info->fs_devices->devices;
				3484	list_for_each_entry(device, devices, dev_list) {
				3485	old_size = btrfs_device_get_total_bytes(device);
				3486	size_to_free = div_factor(old_size, 1);
				3487	size_to_free = min_t(u64, size_to_free, SZ_1M);
					/* skip read-only devices, devices with enough free space and replace targets */
				3488	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) \|\|
				3489	btrfs_device_get_total_bytes(device) -
				3490	btrfs_device_get_bytes_used(device) > size_to_free \|\|
				3491	test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
				3492	continue;
				3493
				3494	ret = btrfs_shrink_device(device, old_size - size_to_free);
				3495	if (ret == -ENOSPC)
				3496	break;
				3497	if (ret) {
				3498	/* btrfs_shrink_device never returns ret > 0 */
				3499	WARN_ON(ret > 0);
				3500	goto error;
				3501	}
				3502
				3503	trans = btrfs_start_transaction(dev_root, 0);
				3504	if (IS_ERR(trans)) {
				3505	ret = PTR_ERR(trans);
				3506	btrfs_info_in_rcu(fs_info,
				3507	"resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu",
				3508	rcu_str_deref(device->name), ret,
				3509	old_size, old_size - size_to_free);
				3510	goto error;
				3511	}
				3512
					/* restore the original device size */
				3513	ret = btrfs_grow_device(trans, device, old_size);
				3514	if (ret) {
				3515	btrfs_end_transaction(trans);
				3516	/* btrfs_grow_device never returns ret > 0 */
				3517	WARN_ON(ret > 0);
				3518	btrfs_info_in_rcu(fs_info,
				3519	"resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu",
				3520	rcu_str_deref(device->name), ret,
				3521	old_size, old_size - size_to_free);
				3522	goto error;
				3523	}
				3524
				3525	btrfs_end_transaction(trans);
				3526	}
				3527
				3528	/* step two, relocate all the chunks */
				3529	path = btrfs_alloc_path();
				3530	if (!path) {
				3531	ret = -ENOMEM;
				3532	goto error;
				3533	}
				3534
				3535	/* zero out stat counters */
				3536	spin_lock(&fs_info->balance_lock);
				3537	memset(&bctl->stat, 0, sizeof(bctl->stat));
				3538	spin_unlock(&fs_info->balance_lock);
				3539	again:
				3540	if (!counting) {
				3541	/*
				3542	* The single value limit and min/max limits use the same bytes
				3543	* in the balance args (presumably a union - confirm).  The
					* counting pass decremented them in should_balance_chunk(), so
					* restore the saved values before the relocating pass.
				3544	*/
				3545	bctl->data.limit = limit_data;
				3546	bctl->meta.limit = limit_meta;
				3547	bctl->sys.limit = limit_sys;
				3548	}
				3549	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
				3550	key.offset = (u64)-1;
				3551	key.type = BTRFS_CHUNK_ITEM_KEY;
				3552
				3553	while (1) {
					/* a pause request is only honoured in the relocating pass, a cancel request always */
				3554	if ((!counting && atomic_read(&fs_info->balance_pause_req)) \|\|
				3555	atomic_read(&fs_info->balance_cancel_req)) {
				3556	ret = -ECANCELED;
				3557	goto error;
				3558	}
				3559
					/* taken before the tree search, as btrfs_relocate_chunk() requires */
				3560	mutex_lock(&fs_info->delete_unused_bgs_mutex);
				3561	ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
				3562	if (ret < 0) {
				3563	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3564	goto error;
				3565	}
				3566
				3567	/*
				3568	* this shouldn't happen, it means the last relocate
				3569	* failed
				3570	*/
				3571	if (ret == 0)
				3572	BUG(); /* FIXME break ? */
				3573
				3574	ret = btrfs_previous_item(chunk_root, path, 0,
				3575	BTRFS_CHUNK_ITEM_KEY);
				3576	if (ret) {
					/* any non-zero result ends the walk and is not reported as an error */
				3577	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3578	ret = 0;
				3579	break;
				3580	}
				3581
				3582	leaf = path->nodes[0];
				3583	slot = path->slots[0];
				3584	btrfs_item_key_to_cpu(leaf, &found_key, slot);
				3585
				3586	if (found_key.objectid != key.objectid) {
				3587	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3588	break;
				3589	}
				3590
				3591	chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
				3592	chunk_type = btrfs_chunk_type(leaf, chunk);
				3593
				3594	if (!counting) {
				3595	spin_lock(&fs_info->balance_lock);
				3596	bctl->stat.considered++;
				3597	spin_unlock(&fs_info->balance_lock);
				3598	}
				3599
				3600	ret = should_balance_chunk(fs_info, leaf, chunk,
				3601	found_key.offset);
				3602
				3603	btrfs_release_path(path);
				3604	if (!ret) {
				3605	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3606	goto loop;
				3607	}
				3608
					/* counting pass: record the accepted chunk per type and move on */
				3609	if (counting) {
				3610	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3611	spin_lock(&fs_info->balance_lock);
				3612	bctl->stat.expected++;
				3613	spin_unlock(&fs_info->balance_lock);
				3614
				3615	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
				3616	count_data++;
				3617	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
				3618	count_sys++;
				3619	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
				3620	count_meta++;
				3621
				3622	goto loop;
				3623	}
				3624
				3625	/*
				3626	* Apply limit_min filter, no need to check if the LIMITS
				3627	* filter is used, limit_min is 0 by default
				3628	*/
				3629	if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
				3630	count_data < bctl->data.limit_min)
				3631	\|\| ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
				3632	count_meta < bctl->meta.limit_min)
				3633	\|\| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
				3634	count_sys < bctl->sys.limit_min)) {
				3635	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3636	goto loop;
				3637	}
				3638
				3639	if (!chunk_reserved) {
				3640	/*
				3641	* We may be relocating the only data chunk we have,
				3642	* which could potentially end up with losing data's
				3643	* raid profile, so lets allocate an empty one in
				3644	* advance.
				3645	*/
				3646	ret = btrfs_may_alloc_data_chunk(fs_info,
				3647	found_key.offset);
				3648	if (ret < 0) {
				3649	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				3650	goto error;
				3651	} else if (ret == 1) {
				3652	chunk_reserved = 1;
				3653	}
				3654	}
				3655
				3656	ret = btrfs_relocate_chunk(fs_info, found_key.offset);
				3657	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
					/* -ENOSPC is only counted here and reported after the walk; other errors abort */
				3658	if (ret && ret != -ENOSPC)
				3659	goto error;
				3660	if (ret == -ENOSPC) {
				3661	enospc_errors++;
				3662	} else {
				3663	spin_lock(&fs_info->balance_lock);
				3664	bctl->stat.completed++;
				3665	spin_unlock(&fs_info->balance_lock);
				3666	}
				3667	loop:
				3668	if (found_key.offset == 0)
				3669	break;
					/* continue with the chunk just below the one handled */
				3670	key.offset = found_key.offset - 1;
				3671	}
				3672
					/* counting pass finished: walk the chunk tree again, relocating this time */
				3673	if (counting) {
				3674	btrfs_release_path(path);
				3675	counting = false;
				3676	goto again;
				3677	}
				3678	error:
				3679	btrfs_free_path(path);
				3680	if (enospc_errors) {
				3681	btrfs_info(fs_info, "%d enospc errors during balance",
				3682	enospc_errors);
					/* an earlier error takes precedence over the accumulated -ENOSPC */
				3683	if (!ret)
				3684	ret = -ENOSPC;
				3685	}
				3686
				3687	return ret;
				3688	}
				3689
				3690	/**
				3691	* alloc_profile_is_valid - see if a given profile is valid and reduced
				3692	* @flags: profile to validate
				3693	* @extended: if true @flags is treated as an extended profile
				3694	*/
				3695	static int alloc_profile_is_valid(u64 flags, int extended)
				3696	{
				3697	u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
				3698	BTRFS_BLOCK_GROUP_PROFILE_MASK);
				3699
				3700	flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
				3701
				3702	/* 1) check that all other bits are zeroed */
				3703	if (flags & ~mask)
				3704	return 0;
				3705
				3706	/* 2) see if profile is reduced */
				3707	if (flags == 0)
				3708	return !extended; /* "0" is valid for usual profiles */
				3709
				3710	/* true if exactly one bit set */
				3711	return (flags & (flags - 1)) == 0;
				3712	}
				3713
				3714	static inline int balance_need_close(struct btrfs_fs_info *fs_info)
				3715	{
				3716	/* cancel requested \|\| normal exit path */
				3717	return atomic_read(&fs_info->balance_cancel_req) \|\|
				3718	(atomic_read(&fs_info->balance_pause_req) == 0 &&
				3719	atomic_read(&fs_info->balance_cancel_req) == 0);
				3720	}
				3721
				3722	/* Non-zero return value signifies invalidity */
				3723	static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
				3724	u64 allowed)
				3725	{
				3726	return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
				3727	(!alloc_profile_is_valid(bctl_arg->target, 1) \|\|
				3728	(bctl_arg->target & ~allowed)));
				3729	}
				3730
				3731	/*
				3732	* Should be called with balance mutex held
				3733	*/
				3734	int btrfs_balance(struct btrfs_fs_info *fs_info,
				3735	struct btrfs_balance_control *bctl,
				3736	struct btrfs_ioctl_balance_args *bargs)
				3737	{
				3738	u64 meta_target, data_target;
				3739	u64 allowed;
				3740	int mixed = 0;
				3741	int ret;
				3742	u64 num_devices;
				3743	unsigned seq;
				3744	bool reducing_integrity;
				3745
				3746	if (btrfs_fs_closing(fs_info) \|\|
				3747	atomic_read(&fs_info->balance_pause_req) \|\|
				3748	atomic_read(&fs_info->balance_cancel_req)) {
				3749	ret = -EINVAL;
				3750	goto out;
				3751	}
				3752
				3753	allowed = btrfs_super_incompat_flags(fs_info->super_copy);
				3754	if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
				3755	mixed = 1;
				3756
				3757	/*
				3758	* In case of mixed groups both data and meta should be picked,
				3759	* and identical options should be given for both of them.
				3760	*/
				3761	allowed = BTRFS_BALANCE_DATA \| BTRFS_BALANCE_METADATA;
				3762	if (mixed && (bctl->flags & allowed)) {
				3763	if (!(bctl->flags & BTRFS_BALANCE_DATA) \|\|
				3764	!(bctl->flags & BTRFS_BALANCE_METADATA) \|\|
				3765	memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
				3766	btrfs_err(fs_info,
				3767	"balance: mixed groups data and metadata options must be the same");
				3768	ret = -EINVAL;
				3769	goto out;
				3770	}
				3771	}
				3772
				3773	num_devices = fs_info->fs_devices->num_devices;
				3774	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
				3775	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
				3776	BUG_ON(num_devices < 1);
				3777	num_devices--;
				3778	}
				3779	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
				3780	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE \| BTRFS_BLOCK_GROUP_DUP;
				3781	if (num_devices > 1)
				3782	allowed \|= (BTRFS_BLOCK_GROUP_RAID0 \| BTRFS_BLOCK_GROUP_RAID1);
				3783	if (num_devices > 2)
				3784	allowed \|= BTRFS_BLOCK_GROUP_RAID5;
				3785	if (num_devices > 3)
				3786	allowed \|= (BTRFS_BLOCK_GROUP_RAID10 \|
				3787	BTRFS_BLOCK_GROUP_RAID6);
				3788	if (validate_convert_profile(&bctl->data, allowed)) {
				3789	int index = btrfs_bg_flags_to_raid_index(bctl->data.target);
				3790
				3791	btrfs_err(fs_info,
				3792	"balance: invalid convert data profile %s",
				3793	get_raid_name(index));
				3794	ret = -EINVAL;
				3795	goto out;
				3796	}
				3797	if (validate_convert_profile(&bctl->meta, allowed)) {
				3798	int index = btrfs_bg_flags_to_raid_index(bctl->meta.target);
				3799
				3800	btrfs_err(fs_info,
				3801	"balance: invalid convert metadata profile %s",
				3802	get_raid_name(index));
				3803	ret = -EINVAL;
				3804	goto out;
				3805	}
				3806	if (validate_convert_profile(&bctl->sys, allowed)) {
				3807	int index = btrfs_bg_flags_to_raid_index(bctl->sys.target);
				3808
				3809	btrfs_err(fs_info,
				3810	"balance: invalid convert system profile %s",
				3811	get_raid_name(index));
				3812	ret = -EINVAL;
				3813	goto out;
				3814	}
				3815
				3816	/* allow to reduce meta or sys integrity only if force set */
				3817	allowed = BTRFS_BLOCK_GROUP_DUP \| BTRFS_BLOCK_GROUP_RAID1 \|
				3818	BTRFS_BLOCK_GROUP_RAID10 \|
				3819	BTRFS_BLOCK_GROUP_RAID5 \|
				3820	BTRFS_BLOCK_GROUP_RAID6;
				3821	do {
				3822	seq = read_seqbegin(&fs_info->profiles_lock);
				3823
				3824	if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
				3825	(fs_info->avail_system_alloc_bits & allowed) &&
				3826	!(bctl->sys.target & allowed)) \|\|
				3827	((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
				3828	(fs_info->avail_metadata_alloc_bits & allowed) &&
				3829	!(bctl->meta.target & allowed)))
				3830	reducing_integrity = true;
				3831	else
				3832	reducing_integrity = false;
				3833
				3834	/* if we're not converting, the target field is uninitialized */
				3835	meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
				3836	bctl->meta.target : fs_info->avail_metadata_alloc_bits;
				3837	data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
				3838	bctl->data.target : fs_info->avail_data_alloc_bits;
				3839	} while (read_seqretry(&fs_info->profiles_lock, seq));
				3840
				3841	if (reducing_integrity) {
				3842	if (bctl->flags & BTRFS_BALANCE_FORCE) {
				3843	btrfs_info(fs_info,
				3844	"balance: force reducing metadata integrity");
				3845	} else {
				3846	btrfs_err(fs_info,
				3847	"balance: reduces metadata integrity, use --force if you want this");
				3848	ret = -EINVAL;
				3849	goto out;
				3850	}
				3851	}
				3852
				3853	if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
				3854	btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
				3855	int meta_index = btrfs_bg_flags_to_raid_index(meta_target);
				3856	int data_index = btrfs_bg_flags_to_raid_index(data_target);
				3857
				3858	btrfs_warn(fs_info,
				3859	"balance: metadata profile %s has lower redundancy than data profile %s",
				3860	get_raid_name(meta_index), get_raid_name(data_index));
				3861	}
				3862
				3863	ret = insert_balance_item(fs_info, bctl);
				3864	if (ret && ret != -EEXIST)
				3865	goto out;
				3866
				3867	if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
				3868	BUG_ON(ret == -EEXIST);
				3869	BUG_ON(fs_info->balance_ctl);
				3870	spin_lock(&fs_info->balance_lock);
				3871	fs_info->balance_ctl = bctl;
				3872	spin_unlock(&fs_info->balance_lock);
				3873	} else {
				3874	BUG_ON(ret != -EEXIST);
				3875	spin_lock(&fs_info->balance_lock);
				3876	update_balance_args(bctl);
				3877	spin_unlock(&fs_info->balance_lock);
				3878	}
				3879
				3880	ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
				3881	set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
				3882	mutex_unlock(&fs_info->balance_mutex);
				3883
				3884	ret = __btrfs_balance(fs_info);
				3885
				3886	mutex_lock(&fs_info->balance_mutex);
				3887	clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
				3888
				3889	if (bargs) {
				3890	memset(bargs, 0, sizeof(*bargs));
				3891	btrfs_update_ioctl_balance_args(fs_info, bargs);
				3892	}
				3893
				3894	if ((ret && ret != -ECANCELED && ret != -ENOSPC) \|\|
				3895	balance_need_close(fs_info)) {
				3896	reset_balance_state(fs_info);
				3897	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
				3898	}
				3899
				3900	wake_up(&fs_info->balance_wait_q);
				3901
				3902	return ret;
				3903	out:
				3904	if (bctl->flags & BTRFS_BALANCE_RESUME)
				3905	reset_balance_state(fs_info);
				3906	else
				3907	kfree(bctl);
				3908	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
				3909
				3910	return ret;
				3911	}
				3912
				3913	static int balance_kthread(void *data)
				3914	{
				3915	struct btrfs_fs_info *fs_info = data;
				3916	int ret = 0;
				3917
				3918	mutex_lock(&fs_info->balance_mutex);
				3919	if (fs_info->balance_ctl) {
				3920	btrfs_info(fs_info, "balance: resuming");
				3921	ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
				3922	}
				3923	mutex_unlock(&fs_info->balance_mutex);
				3924
				3925	return ret;
				3926	}
				3927
				3928	int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
				3929	{
				3930	struct task_struct *tsk;
				3931
				3932	mutex_lock(&fs_info->balance_mutex);
				3933	if (!fs_info->balance_ctl) {
				3934	mutex_unlock(&fs_info->balance_mutex);
				3935	return 0;
				3936	}
				3937	mutex_unlock(&fs_info->balance_mutex);
				3938
				3939	if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
				3940	btrfs_info(fs_info, "balance: resume skipped");
				3941	return 0;
				3942	}
				3943
				3944	/*
				3945	* A ro->rw remount sequence should continue with the paused balance
				3946	* regardless of who pauses it, system or the user as of now, so set
				3947	* the resume flag.
				3948	*/
				3949	spin_lock(&fs_info->balance_lock);
				3950	fs_info->balance_ctl->flags \|= BTRFS_BALANCE_RESUME;
				3951	spin_unlock(&fs_info->balance_lock);
				3952
				3953	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
				3954	return PTR_ERR_OR_ZERO(tsk);
				3955	}
				3956
				3957	int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
				3958	{
				3959	struct btrfs_balance_control *bctl;
				3960	struct btrfs_balance_item *item;
				3961	struct btrfs_disk_balance_args disk_bargs;
				3962	struct btrfs_path *path;
				3963	struct extent_buffer *leaf;
				3964	struct btrfs_key key;
				3965	int ret;
				3966
				3967	path = btrfs_alloc_path();
				3968	if (!path)
				3969	return -ENOMEM;
				3970
				3971	key.objectid = BTRFS_BALANCE_OBJECTID;
				3972	key.type = BTRFS_TEMPORARY_ITEM_KEY;
				3973	key.offset = 0;
				3974
				3975	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
				3976	if (ret < 0)
				3977	goto out;
				3978	if (ret > 0) { /* ret = -ENOENT; */
				3979	ret = 0;
				3980	goto out;
				3981	}
				3982
				3983	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
				3984	if (!bctl) {
				3985	ret = -ENOMEM;
				3986	goto out;
				3987	}
				3988
				3989	leaf = path->nodes[0];
				3990	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
				3991
				3992	bctl->flags = btrfs_balance_flags(leaf, item);
				3993	bctl->flags \|= BTRFS_BALANCE_RESUME;
				3994
				3995	btrfs_balance_data(leaf, item, &disk_bargs);
				3996	btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
				3997	btrfs_balance_meta(leaf, item, &disk_bargs);
				3998	btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
				3999	btrfs_balance_sys(leaf, item, &disk_bargs);
				4000	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
				4001
				4002	/*
				4003	* This should never happen, as the paused balance state is recovered
				4004	* during mount without any chance of other exclusive ops to collide.
				4005	*
				4006	* This gives the exclusive op status to balance and keeps in paused
				4007	* state until user intervention (cancel or umount). If the ownership
				4008	* cannot be assigned, show a message but do not fail. The balance
				4009	* is in a paused state and must have fs_info::balance_ctl properly
				4010	* set up.
				4011	*/
				4012	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
				4013	btrfs_warn(fs_info,
				4014	"balance: cannot set exclusive op status, resume manually");
				4015
				4016	mutex_lock(&fs_info->balance_mutex);
				4017	BUG_ON(fs_info->balance_ctl);
				4018	spin_lock(&fs_info->balance_lock);
				4019	fs_info->balance_ctl = bctl;
				4020	spin_unlock(&fs_info->balance_lock);
				4021	mutex_unlock(&fs_info->balance_mutex);
				4022	out:
				4023	btrfs_free_path(path);
				4024	return ret;
				4025	}
				4026
				4027	int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
				4028	{
				4029	int ret = 0;
				4030
				4031	mutex_lock(&fs_info->balance_mutex);
				4032	if (!fs_info->balance_ctl) {
				4033	mutex_unlock(&fs_info->balance_mutex);
				4034	return -ENOTCONN;
				4035	}
				4036
				4037	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
				4038	atomic_inc(&fs_info->balance_pause_req);
				4039	mutex_unlock(&fs_info->balance_mutex);
				4040
				4041	wait_event(fs_info->balance_wait_q,
				4042	!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
				4043
				4044	mutex_lock(&fs_info->balance_mutex);
				4045	/* we are good with balance_ctl ripped off from under us */
				4046	BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
				4047	atomic_dec(&fs_info->balance_pause_req);
				4048	} else {
				4049	ret = -ENOTCONN;
				4050	}
				4051
				4052	mutex_unlock(&fs_info->balance_mutex);
				4053	return ret;
				4054	}
				4055
				4056	int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
				4057	{
				4058	mutex_lock(&fs_info->balance_mutex);
				4059	if (!fs_info->balance_ctl) {
				4060	mutex_unlock(&fs_info->balance_mutex);
				4061	return -ENOTCONN;
				4062	}
				4063
				4064	/*
				4065	* A paused balance with the item stored on disk can be resumed at
				4066	* mount time if the mount is read-write. Otherwise it's still paused
				4067	* and we must not allow cancelling as it deletes the item.
				4068	*/
				4069	if (sb_rdonly(fs_info->sb)) {
				4070	mutex_unlock(&fs_info->balance_mutex);
				4071	return -EROFS;
				4072	}
				4073
				4074	atomic_inc(&fs_info->balance_cancel_req);
				4075	/*
				4076	* if we are running just wait and return, balance item is
				4077	* deleted in btrfs_balance in this case
				4078	*/
				4079	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
				4080	mutex_unlock(&fs_info->balance_mutex);
				4081	wait_event(fs_info->balance_wait_q,
				4082	!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
				4083	mutex_lock(&fs_info->balance_mutex);
				4084	} else {
				4085	mutex_unlock(&fs_info->balance_mutex);
				4086	/*
				4087	* Lock released to allow other waiters to continue, we'll
				4088	* reexamine the status again.
				4089	*/
				4090	mutex_lock(&fs_info->balance_mutex);
				4091
				4092	if (fs_info->balance_ctl) {
				4093	reset_balance_state(fs_info);
				4094	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
				4095	btrfs_info(fs_info, "balance: canceled");
				4096	}
				4097	}
				4098
				4099	BUG_ON(fs_info->balance_ctl \|\|
				4100	test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
				4101	atomic_dec(&fs_info->balance_cancel_req);
				4102	mutex_unlock(&fs_info->balance_mutex);
				4103	return 0;
				4104	}
				4105
				4106	static int btrfs_uuid_scan_kthread(void *data)
				4107	{
				4108	struct btrfs_fs_info *fs_info = data;
				4109	struct btrfs_root *root = fs_info->tree_root;
				4110	struct btrfs_key key;
				4111	struct btrfs_path *path = NULL;
				4112	int ret = 0;
				4113	struct extent_buffer *eb;
				4114	int slot;
				4115	struct btrfs_root_item root_item;
				4116	u32 item_size;
				4117	struct btrfs_trans_handle *trans = NULL;
				4118
				4119	path = btrfs_alloc_path();
				4120	if (!path) {
				4121	ret = -ENOMEM;
				4122	goto out;
				4123	}
				4124
				4125	key.objectid = 0;
				4126	key.type = BTRFS_ROOT_ITEM_KEY;
				4127	key.offset = 0;
				4128
				4129	while (1) {
				4130	ret = btrfs_search_forward(root, &key, path,
				4131	BTRFS_OLDEST_GENERATION);
				4132	if (ret) {
				4133	if (ret > 0)
				4134	ret = 0;
				4135	break;
				4136	}
				4137
				4138	if (key.type != BTRFS_ROOT_ITEM_KEY \|\|
				4139	(key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
				4140	key.objectid != BTRFS_FS_TREE_OBJECTID) \|\|
				4141	key.objectid > BTRFS_LAST_FREE_OBJECTID)
				4142	goto skip;
				4143
				4144	eb = path->nodes[0];
				4145	slot = path->slots[0];
				4146	item_size = btrfs_item_size_nr(eb, slot);
				4147	if (item_size < sizeof(root_item))
				4148	goto skip;
				4149
				4150	read_extent_buffer(eb, &root_item,
				4151	btrfs_item_ptr_offset(eb, slot),
				4152	(int)sizeof(root_item));
				4153	if (btrfs_root_refs(&root_item) == 0)
				4154	goto skip;
				4155
				4156	if (!btrfs_is_empty_uuid(root_item.uuid) \|\|
				4157	!btrfs_is_empty_uuid(root_item.received_uuid)) {
				4158	if (trans)
				4159	goto update_tree;
				4160
				4161	btrfs_release_path(path);
				4162	/*
				4163	* 1 - subvol uuid item
				4164	* 1 - received_subvol uuid item
				4165	*/
				4166	trans = btrfs_start_transaction(fs_info->uuid_root, 2);
				4167	if (IS_ERR(trans)) {
				4168	ret = PTR_ERR(trans);
				4169	break;
				4170	}
				4171	continue;
				4172	} else {
				4173	goto skip;
				4174	}
				4175	update_tree:
				4176	if (!btrfs_is_empty_uuid(root_item.uuid)) {
				4177	ret = btrfs_uuid_tree_add(trans, root_item.uuid,
				4178	BTRFS_UUID_KEY_SUBVOL,
				4179	key.objectid);
				4180	if (ret < 0) {
				4181	btrfs_warn(fs_info, "uuid_tree_add failed %d",
				4182	ret);
				4183	break;
				4184	}
				4185	}
				4186
				4187	if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
				4188	ret = btrfs_uuid_tree_add(trans,
				4189	root_item.received_uuid,
				4190	BTRFS_UUID_KEY_RECEIVED_SUBVOL,
				4191	key.objectid);
				4192	if (ret < 0) {
				4193	btrfs_warn(fs_info, "uuid_tree_add failed %d",
				4194	ret);
				4195	break;
				4196	}
				4197	}
				4198
				4199	skip:
				4200	if (trans) {
				4201	ret = btrfs_end_transaction(trans);
				4202	trans = NULL;
				4203	if (ret)
				4204	break;
				4205	}
				4206
				4207	btrfs_release_path(path);
				4208	if (key.offset < (u64)-1) {
				4209	key.offset++;
				4210	} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
				4211	key.offset = 0;
				4212	key.type = BTRFS_ROOT_ITEM_KEY;
				4213	} else if (key.objectid < (u64)-1) {
				4214	key.offset = 0;
				4215	key.type = BTRFS_ROOT_ITEM_KEY;
				4216	key.objectid++;
				4217	} else {
				4218	break;
				4219	}
				4220	cond_resched();
				4221	}
				4222
				4223	out:
				4224	btrfs_free_path(path);
				4225	if (trans && !IS_ERR(trans))
				4226	btrfs_end_transaction(trans);
				4227	if (ret)
				4228	btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
				4229	else
				4230	set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
				4231	up(&fs_info->uuid_tree_rescan_sem);
				4232	return 0;
				4233	}
				4234
				4235	/*
				4236	* Callback for btrfs_uuid_tree_iterate().
				4237	* returns:
				4238	* 0 check succeeded, the entry is not outdated.
				4239	* < 0 if an error occurred.
				4240	* > 0 if the check failed, which means the caller shall remove the entry.
				4241	*/
				4242	static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
				4243	u8 *uuid, u8 type, u64 subid)
				4244	{
				4245	struct btrfs_key key;
				4246	int ret = 0;
				4247	struct btrfs_root *subvol_root;
				4248
				4249	if (type != BTRFS_UUID_KEY_SUBVOL &&
				4250	type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
				4251	goto out;
				4252
				4253	key.objectid = subid;
				4254	key.type = BTRFS_ROOT_ITEM_KEY;
				4255	key.offset = (u64)-1;
				4256	subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
				4257	if (IS_ERR(subvol_root)) {
				4258	ret = PTR_ERR(subvol_root);
				4259	if (ret == -ENOENT)
				4260	ret = 1;
				4261	goto out;
				4262	}
				4263
				4264	switch (type) {
				4265	case BTRFS_UUID_KEY_SUBVOL:
				4266	if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
				4267	ret = 1;
				4268	break;
				4269	case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
				4270	if (memcmp(uuid, subvol_root->root_item.received_uuid,
				4271	BTRFS_UUID_SIZE))
				4272	ret = 1;
				4273	break;
				4274	}
				4275
				4276	out:
				4277	return ret;
				4278	}
				4279
				4280	static int btrfs_uuid_rescan_kthread(void *data)
				4281	{
				4282	struct btrfs_fs_info fs_info = (struct btrfs_fs_info )data;
				4283	int ret;
				4284
				4285	/*
				4286	* 1st step is to iterate through the existing UUID tree and
				4287	* to delete all entries that contain outdated data.
				4288	* 2nd step is to add all missing entries to the UUID tree.
				4289	*/
				4290	ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
				4291	if (ret < 0) {
				4292	btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
				4293	up(&fs_info->uuid_tree_rescan_sem);
				4294	return ret;
				4295	}
				4296	return btrfs_uuid_scan_kthread(data);
				4297	}
				4298
				4299	int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
				4300	{
				4301	struct btrfs_trans_handle *trans;
				4302	struct btrfs_root *tree_root = fs_info->tree_root;
				4303	struct btrfs_root *uuid_root;
				4304	struct task_struct *task;
				4305	int ret;
				4306
				4307	/*
				4308	* 1 - root node
				4309	* 1 - root item
				4310	*/
				4311	trans = btrfs_start_transaction(tree_root, 2);
				4312	if (IS_ERR(trans))
				4313	return PTR_ERR(trans);
				4314
				4315	uuid_root = btrfs_create_tree(trans, fs_info,
				4316	BTRFS_UUID_TREE_OBJECTID);
				4317	if (IS_ERR(uuid_root)) {
				4318	ret = PTR_ERR(uuid_root);
				4319	btrfs_abort_transaction(trans, ret);
				4320	btrfs_end_transaction(trans);
				4321	return ret;
				4322	}
				4323
				4324	fs_info->uuid_root = uuid_root;
				4325
				4326	ret = btrfs_commit_transaction(trans);
				4327	if (ret)
				4328	return ret;
				4329
				4330	down(&fs_info->uuid_tree_rescan_sem);
				4331	task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
				4332	if (IS_ERR(task)) {
				4333	/* fs_info->update_uuid_tree_gen remains 0 in all error case */
				4334	btrfs_warn(fs_info, "failed to start uuid_scan task");
				4335	up(&fs_info->uuid_tree_rescan_sem);
				4336	return PTR_ERR(task);
				4337	}
				4338
				4339	return 0;
				4340	}
				4341
				4342	int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
				4343	{
				4344	struct task_struct *task;
				4345
				4346	down(&fs_info->uuid_tree_rescan_sem);
				4347	task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
				4348	if (IS_ERR(task)) {
				4349	/* fs_info->update_uuid_tree_gen remains 0 in all error case */
				4350	btrfs_warn(fs_info, "failed to start uuid_rescan task");
				4351	up(&fs_info->uuid_tree_rescan_sem);
				4352	return PTR_ERR(task);
				4353	}
				4354
				4355	return 0;
				4356	}
				4357
				4358	/*
				4359	* shrinking a device means finding all of the device extents past
				4360	* the new size, and then following the back refs to the chunks.
				4361	* The chunk relocation code actually frees the device extent
				4362	*/
				4363	int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
				4364	{
				4365	struct btrfs_fs_info *fs_info = device->fs_info;
				4366	struct btrfs_root *root = fs_info->dev_root;
				4367	struct btrfs_trans_handle *trans;
				4368	struct btrfs_dev_extent *dev_extent = NULL;
				4369	struct btrfs_path *path;
				4370	u64 length;
				4371	u64 chunk_offset;
				4372	int ret;
				4373	int slot;
				4374	int failed = 0;
				4375	bool retried = false;
				4376	bool checked_pending_chunks = false;
				4377	struct extent_buffer *l;
				4378	struct btrfs_key key;
				4379	struct btrfs_super_block *super_copy = fs_info->super_copy;
				4380	u64 old_total = btrfs_super_total_bytes(super_copy);
				4381	u64 old_size = btrfs_device_get_total_bytes(device);
				4382	u64 diff;
				4383
				4384	new_size = round_down(new_size, fs_info->sectorsize);
				4385	diff = round_down(old_size - new_size, fs_info->sectorsize);
				4386
				4387	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
				4388	return -EINVAL;
				4389
				4390	path = btrfs_alloc_path();
				4391	if (!path)
				4392	return -ENOMEM;
				4393
				4394	path->reada = READA_BACK;
				4395
				4396	mutex_lock(&fs_info->chunk_mutex);
				4397
				4398	btrfs_device_set_total_bytes(device, new_size);
				4399	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
				4400	device->fs_devices->total_rw_bytes -= diff;
				4401	atomic64_sub(diff, &fs_info->free_chunk_space);
				4402	}
				4403	mutex_unlock(&fs_info->chunk_mutex);
				4404
				4405	again:
				4406	key.objectid = device->devid;
				4407	key.offset = (u64)-1;
				4408	key.type = BTRFS_DEV_EXTENT_KEY;
				4409
				4410	do {
				4411	mutex_lock(&fs_info->delete_unused_bgs_mutex);
				4412	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				4413	if (ret < 0) {
				4414	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				4415	goto done;
				4416	}
				4417
				4418	ret = btrfs_previous_item(root, path, 0, key.type);
				4419	if (ret)
				4420	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				4421	if (ret < 0)
				4422	goto done;
				4423	if (ret) {
				4424	ret = 0;
				4425	btrfs_release_path(path);
				4426	break;
				4427	}
				4428
				4429	l = path->nodes[0];
				4430	slot = path->slots[0];
				4431	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
				4432
				4433	if (key.objectid != device->devid) {
				4434	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				4435	btrfs_release_path(path);
				4436	break;
				4437	}
				4438
				4439	dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
				4440	length = btrfs_dev_extent_length(l, dev_extent);
				4441
				4442	if (key.offset + length <= new_size) {
				4443	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				4444	btrfs_release_path(path);
				4445	break;
				4446	}
				4447
				4448	chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
				4449	btrfs_release_path(path);
				4450
				4451	/*
				4452	* We may be relocating the only data chunk we have,
				4453	* which could potentially end up with losing data's
				4454	* raid profile, so lets allocate an empty one in
				4455	* advance.
				4456	*/
				4457	ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
				4458	if (ret < 0) {
				4459	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				4460	goto done;
				4461	}
				4462
				4463	ret = btrfs_relocate_chunk(fs_info, chunk_offset);
				4464	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				4465	if (ret && ret != -ENOSPC)
				4466	goto done;
				4467	if (ret == -ENOSPC)
				4468	failed++;
				4469	} while (key.offset-- > 0);
				4470
				4471	if (failed && !retried) {
				4472	failed = 0;
				4473	retried = true;
				4474	goto again;
				4475	} else if (failed && retried) {
				4476	ret = -ENOSPC;
				4477	goto done;
				4478	}
				4479
				4480	/* Shrinking succeeded, else we would be at "done". */
				4481	trans = btrfs_start_transaction(root, 0);
				4482	if (IS_ERR(trans)) {
				4483	ret = PTR_ERR(trans);
				4484	goto done;
				4485	}
				4486
				4487	mutex_lock(&fs_info->chunk_mutex);
				4488
				4489	/*
				4490	* We checked in the above loop all device extents that were already in
				4491	* the device tree. However before we have updated the device's
				4492	* total_bytes to the new size, we might have had chunk allocations that
				4493	* have not complete yet (new block groups attached to transaction
				4494	* handles), and therefore their device extents were not yet in the
				4495	* device tree and we missed them in the loop above. So if we have any
				4496	* pending chunk using a device extent that overlaps the device range
				4497	* that we can not use anymore, commit the current transaction and
				4498	* repeat the search on the device tree - this way we guarantee we will
				4499	* not have chunks using device extents that end beyond 'new_size'.
				4500	*/
				4501	if (!checked_pending_chunks) {
				4502	u64 start = new_size;
				4503	u64 len = old_size - new_size;
				4504
				4505	if (contains_pending_extent(trans->transaction, device,
				4506	&start, len)) {
				4507	mutex_unlock(&fs_info->chunk_mutex);
				4508	checked_pending_chunks = true;
				4509	failed = 0;
				4510	retried = false;
				4511	ret = btrfs_commit_transaction(trans);
				4512	if (ret)
				4513	goto done;
				4514	goto again;
				4515	}
				4516	}
				4517
				4518	btrfs_device_set_disk_total_bytes(device, new_size);
				4519	if (list_empty(&device->resized_list))
				4520	list_add_tail(&device->resized_list,
				4521	&fs_info->fs_devices->resized_devices);
				4522
				4523	WARN_ON(diff > old_total);
				4524	btrfs_set_super_total_bytes(super_copy,
				4525	round_down(old_total - diff, fs_info->sectorsize));
				4526	mutex_unlock(&fs_info->chunk_mutex);
				4527
				4528	/* Now btrfs_update_device() will change the on-disk size. */
				4529	ret = btrfs_update_device(trans, device);
				4530	if (ret < 0) {
				4531	btrfs_abort_transaction(trans, ret);
				4532	btrfs_end_transaction(trans);
				4533	} else {
				4534	ret = btrfs_commit_transaction(trans);
				4535	}
				4536	done:
				4537	btrfs_free_path(path);
				4538	if (ret) {
				4539	mutex_lock(&fs_info->chunk_mutex);
				4540	btrfs_device_set_total_bytes(device, old_size);
				4541	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
				4542	device->fs_devices->total_rw_bytes += diff;
				4543	atomic64_add(diff, &fs_info->free_chunk_space);
				4544	mutex_unlock(&fs_info->chunk_mutex);
				4545	}
				4546	return ret;
				4547	}
				4548
				4549	static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
				4550	struct btrfs_key *key,
				4551	struct btrfs_chunk *chunk, int item_size)
				4552	{
				4553	struct btrfs_super_block *super_copy = fs_info->super_copy;
				4554	struct btrfs_disk_key disk_key;
				4555	u32 array_size;
				4556	u8 *ptr;
				4557
				4558	mutex_lock(&fs_info->chunk_mutex);
				4559	array_size = btrfs_super_sys_array_size(super_copy);
				4560	if (array_size + item_size + sizeof(disk_key)
				4561	> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
				4562	mutex_unlock(&fs_info->chunk_mutex);
				4563	return -EFBIG;
				4564	}
				4565
				4566	ptr = super_copy->sys_chunk_array + array_size;
				4567	btrfs_cpu_key_to_disk(&disk_key, key);
				4568	memcpy(ptr, &disk_key, sizeof(disk_key));
				4569	ptr += sizeof(disk_key);
				4570	memcpy(ptr, chunk, item_size);
				4571	item_size += sizeof(disk_key);
				4572	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
				4573	mutex_unlock(&fs_info->chunk_mutex);
				4574
				4575	return 0;
				4576	}
				4577
				4578	/*
				4579	* sort the devices in descending order by max_avail, total_avail
				4580	*/
				4581	static int btrfs_cmp_device_info(const void a, const void b)
				4582	{
				4583	const struct btrfs_device_info *di_a = a;
				4584	const struct btrfs_device_info *di_b = b;
				4585
				4586	if (di_a->max_avail > di_b->max_avail)
				4587	return -1;
				4588	if (di_a->max_avail < di_b->max_avail)
				4589	return 1;
				4590	if (di_a->total_avail > di_b->total_avail)
				4591	return -1;
				4592	if (di_a->total_avail < di_b->total_avail)
				4593	return 1;
				4594	return 0;
				4595	}
				4596
				4597	static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
				4598	{
				4599	if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
				4600	return;
				4601
				4602	btrfs_set_fs_incompat(info, RAID56);
				4603	}
				4604
				/*
				 * Number of stripes that fit in one item of maximum size: the space left
				 * after struct btrfs_chunk, divided by the stripe size, plus one.
				 * NOTE(review): the "+ 1" presumably accounts for a stripe already counted
				 * inside struct btrfs_chunk - confirm against the struct definition.
				 */
				#define BTRFS_MAX_DEVS(info)						\
					((BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_chunk))	\
						/ sizeof(struct btrfs_stripe) + 1)

				/*
				 * Same computation for the superblock system chunk array, after setting
				 * aside room for two disk keys and two chunk headers.
				 */
				#define BTRFS_MAX_DEVS_SYS_CHUNK					\
					((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE					\
						- 2 * sizeof(struct btrfs_disk_key)			\
						- 2 * sizeof(struct btrfs_chunk))			\
						/ sizeof(struct btrfs_stripe) + 1)
				4613
				4614	static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
				4615	u64 start, u64 type)
				4616	{
				4617	struct btrfs_fs_info *info = trans->fs_info;
				4618	struct btrfs_fs_devices *fs_devices = info->fs_devices;
				4619	struct btrfs_device *device;
				4620	struct map_lookup *map = NULL;
				4621	struct extent_map_tree *em_tree;
				4622	struct extent_map *em;
				4623	struct btrfs_device_info *devices_info = NULL;
				4624	u64 total_avail;
				4625	int num_stripes; /* total number of stripes to allocate */
				4626	int data_stripes; /* number of stripes that count for
				4627	block group size */
				4628	int sub_stripes; /* sub_stripes info for map */
				4629	int dev_stripes; /* stripes per dev */
				4630	int devs_max; /* max devs to use */
				4631	int devs_min; /* min devs needed */
				4632	int devs_increment; /* ndevs has to be a multiple of this */
				4633	int ncopies; /* how many copies to data has */
				4634	int ret;
				4635	u64 max_stripe_size;
				4636	u64 max_chunk_size;
				4637	u64 stripe_size;
				4638	u64 num_bytes;
				4639	int ndevs;
				4640	int i;
				4641	int j;
				4642	int index;
				4643
				4644	BUG_ON(!alloc_profile_is_valid(type, 0));
				4645
				4646	if (list_empty(&fs_devices->alloc_list)) {
				4647	if (btrfs_test_opt(info, ENOSPC_DEBUG))
				4648	btrfs_debug(info, "%s: no writable device", __func__);
				4649	return -ENOSPC;
				4650	}
				4651
				4652	index = btrfs_bg_flags_to_raid_index(type);
				4653
				4654	sub_stripes = btrfs_raid_array[index].sub_stripes;
				4655	dev_stripes = btrfs_raid_array[index].dev_stripes;
				4656	devs_max = btrfs_raid_array[index].devs_max;
				4657	devs_min = btrfs_raid_array[index].devs_min;
				4658	devs_increment = btrfs_raid_array[index].devs_increment;
				4659	ncopies = btrfs_raid_array[index].ncopies;
				4660
				4661	if (type & BTRFS_BLOCK_GROUP_DATA) {
				4662	max_stripe_size = SZ_1G;
				4663	max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
				4664	if (!devs_max)
				4665	devs_max = BTRFS_MAX_DEVS(info);
				4666	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
				4667	/* for larger filesystems, use larger metadata chunks */
				4668	if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
				4669	max_stripe_size = SZ_1G;
				4670	else
				4671	max_stripe_size = SZ_256M;
				4672	max_chunk_size = max_stripe_size;
				4673	if (!devs_max)
				4674	devs_max = BTRFS_MAX_DEVS(info);
				4675	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
				4676	max_stripe_size = SZ_32M;
				4677	max_chunk_size = 2 * max_stripe_size;
				4678	if (!devs_max)
				4679	devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
				4680	} else {
				4681	btrfs_err(info, "invalid chunk type 0x%llx requested",
				4682	type);
				4683	BUG_ON(1);
				4684	}
				4685
				4686	/* we don't want a chunk larger than 10% of writeable space */
				4687	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
				4688	max_chunk_size);
				4689
				4690	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
				4691	GFP_NOFS);
				4692	if (!devices_info)
				4693	return -ENOMEM;
				4694
				4695	/*
				4696	* in the first pass through the devices list, we gather information
				4697	* about the available holes on each device.
				4698	*/
				4699	ndevs = 0;
				4700	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
				4701	u64 max_avail;
				4702	u64 dev_offset;
				4703
				4704	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
				4705	WARN(1, KERN_ERR
				4706	"BTRFS: read-only device in alloc_list\n");
				4707	continue;
				4708	}
				4709
				4710	if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
				4711	&device->dev_state) \|\|
				4712	test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
				4713	continue;
				4714
				4715	if (device->total_bytes > device->bytes_used)
				4716	total_avail = device->total_bytes - device->bytes_used;
				4717	else
				4718	total_avail = 0;
				4719
				4720	/* If there is no space on this device, skip it. */
				4721	if (total_avail == 0)
				4722	continue;
				4723
				4724	ret = find_free_dev_extent(trans, device,
				4725	max_stripe_size * dev_stripes,
				4726	&dev_offset, &max_avail);
				4727	if (ret && ret != -ENOSPC)
				4728	goto error;
				4729
				4730	if (ret == 0)
				4731	max_avail = max_stripe_size * dev_stripes;
				4732
				4733	if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) {
				4734	if (btrfs_test_opt(info, ENOSPC_DEBUG))
				4735	btrfs_debug(info,
				4736	"%s: devid %llu has no free space, have=%llu want=%u",
				4737	__func__, device->devid, max_avail,
				4738	BTRFS_STRIPE_LEN * dev_stripes);
				4739	continue;
				4740	}
				4741
				4742	if (ndevs == fs_devices->rw_devices) {
				4743	WARN(1, "%s: found more than %llu devices\n",
				4744	__func__, fs_devices->rw_devices);
				4745	break;
				4746	}
				4747	devices_info[ndevs].dev_offset = dev_offset;
				4748	devices_info[ndevs].max_avail = max_avail;
				4749	devices_info[ndevs].total_avail = total_avail;
				4750	devices_info[ndevs].dev = device;
				4751	++ndevs;
				4752	}
				4753
				4754	/*
				4755	* now sort the devices by hole size / available space
				4756	*/
				4757	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
				4758	btrfs_cmp_device_info, NULL);
				4759
				4760	/* round down to number of usable stripes */
				4761	ndevs = round_down(ndevs, devs_increment);
				4762
				4763	if (ndevs < devs_min) {
				4764	ret = -ENOSPC;
				4765	if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
				4766	btrfs_debug(info,
				4767	"%s: not enough devices with free space: have=%d minimum required=%d",
				4768	__func__, ndevs, devs_min);
				4769	}
				4770	goto error;
				4771	}
				4772
				4773	ndevs = min(ndevs, devs_max);
				4774
				4775	/*
				4776	* The primary goal is to maximize the number of stripes, so use as
				4777	* many devices as possible, even if the stripes are not maximum sized.
				4778	*
				4779	* The DUP profile stores more than one stripe per device, the
				4780	* max_avail is the total size so we have to adjust.
				4781	*/
				4782	stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes);
				4783	num_stripes = ndevs * dev_stripes;
				4784
				4785	/*
				4786	* this will have to be fixed for RAID1 and RAID10 over
				4787	* more drives
				4788	*/
				4789	data_stripes = num_stripes / ncopies;
				4790
				4791	if (type & BTRFS_BLOCK_GROUP_RAID5)
				4792	data_stripes = num_stripes - 1;
				4793
				4794	if (type & BTRFS_BLOCK_GROUP_RAID6)
				4795	data_stripes = num_stripes - 2;
				4796
				4797	/*
				4798	* Use the number of data stripes to figure out how big this chunk
				4799	* is really going to be in terms of logical address space,
				4800	* and compare that answer with the max chunk size. If it's higher,
				4801	* we try to reduce stripe_size.
				4802	*/
				4803	if (stripe_size * data_stripes > max_chunk_size) {
				4804	/*
				4805	* Reduce stripe_size, round it up to a 16MB boundary again and
				4806	* then use it, unless it ends up being even bigger than the
				4807	* previous value we had already.
				4808	*/
				4809	stripe_size = min(round_up(div_u64(max_chunk_size,
				4810	data_stripes), SZ_16M),
				4811	stripe_size);
				4812	}
				4813
				4814	/* align to BTRFS_STRIPE_LEN */
				4815	stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);
				4816
				4817	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
				4818	if (!map) {
				4819	ret = -ENOMEM;
				4820	goto error;
				4821	}
				4822	map->num_stripes = num_stripes;
				4823
				4824	for (i = 0; i < ndevs; ++i) {
				4825	for (j = 0; j < dev_stripes; ++j) {
				4826	int s = i * dev_stripes + j;
				4827	map->stripes[s].dev = devices_info[i].dev;
				4828	map->stripes[s].physical = devices_info[i].dev_offset +
				4829	j * stripe_size;
				4830	}
				4831	}
				4832	map->stripe_len = BTRFS_STRIPE_LEN;
				4833	map->io_align = BTRFS_STRIPE_LEN;
				4834	map->io_width = BTRFS_STRIPE_LEN;
				4835	map->type = type;
				4836	map->sub_stripes = sub_stripes;
				4837
				4838	num_bytes = stripe_size * data_stripes;
				4839
				4840	trace_btrfs_chunk_alloc(info, map, start, num_bytes);
				4841
				4842	em = alloc_extent_map();
				4843	if (!em) {
				4844	kfree(map);
				4845	ret = -ENOMEM;
				4846	goto error;
				4847	}
				4848	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
				4849	em->map_lookup = map;
				4850	em->start = start;
				4851	em->len = num_bytes;
				4852	em->block_start = 0;
				4853	em->block_len = em->len;
				4854	em->orig_block_len = stripe_size;
				4855
				4856	em_tree = &info->mapping_tree.map_tree;
				4857	write_lock(&em_tree->lock);
				4858	ret = add_extent_mapping(em_tree, em, 0);
				4859	if (ret) {
				4860	write_unlock(&em_tree->lock);
				4861	free_extent_map(em);
				4862	goto error;
				4863	}
				4864
				4865	list_add_tail(&em->list, &trans->transaction->pending_chunks);
				4866	refcount_inc(&em->refs);
				4867	write_unlock(&em_tree->lock);
				4868
				4869	ret = btrfs_make_block_group(trans, 0, type, start, num_bytes);
				4870	if (ret)
				4871	goto error_del_extent;
				4872
				4873	for (i = 0; i < map->num_stripes; i++) {
				4874	num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
				4875	btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
				4876	map->stripes[i].dev->has_pending_chunks = true;
				4877	}
				4878
				4879	atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
				4880
				4881	free_extent_map(em);
				4882	check_raid56_incompat_flag(info, type);
				4883
				4884	kfree(devices_info);
				4885	return 0;
				4886
				4887	error_del_extent:
				4888	write_lock(&em_tree->lock);
				4889	remove_extent_mapping(em_tree, em);
				4890	write_unlock(&em_tree->lock);
				4891
				4892	/* One for our allocation */
				4893	free_extent_map(em);
				4894	/* One for the tree reference */
				4895	free_extent_map(em);
				4896	/* One for the pending_chunks list reference */
				4897	free_extent_map(em);
				4898	error:
				4899	kfree(devices_info);
				4900	return ret;
				4901	}
				4902
				4903	int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
				4904	u64 chunk_offset, u64 chunk_size)
				4905	{
				4906	struct btrfs_fs_info *fs_info = trans->fs_info;
				4907	struct btrfs_root *extent_root = fs_info->extent_root;
				4908	struct btrfs_root *chunk_root = fs_info->chunk_root;
				4909	struct btrfs_key key;
				4910	struct btrfs_device *device;
				4911	struct btrfs_chunk *chunk;
				4912	struct btrfs_stripe *stripe;
				4913	struct extent_map *em;
				4914	struct map_lookup *map;
				4915	size_t item_size;
				4916	u64 dev_offset;
				4917	u64 stripe_size;
				4918	int i = 0;
				4919	int ret = 0;
				4920
				4921	em = get_chunk_map(fs_info, chunk_offset, chunk_size);
				4922	if (IS_ERR(em))
				4923	return PTR_ERR(em);
				4924
				4925	map = em->map_lookup;
				4926	item_size = btrfs_chunk_item_size(map->num_stripes);
				4927	stripe_size = em->orig_block_len;
				4928
				4929	chunk = kzalloc(item_size, GFP_NOFS);
				4930	if (!chunk) {
				4931	ret = -ENOMEM;
				4932	goto out;
				4933	}
				4934
				4935	/*
				4936	* Take the device list mutex to prevent races with the final phase of
				4937	* a device replace operation that replaces the device object associated
				4938	* with the map's stripes, because the device object's id can change
				4939	* at any time during that final phase of the device replace operation
				4940	* (dev-replace.c:btrfs_dev_replace_finishing()).
				4941	*/
				4942	mutex_lock(&fs_info->fs_devices->device_list_mutex);
				4943	for (i = 0; i < map->num_stripes; i++) {
				4944	device = map->stripes[i].dev;
				4945	dev_offset = map->stripes[i].physical;
				4946
				4947	ret = btrfs_update_device(trans, device);
				4948	if (ret)
				4949	break;
				4950	ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
				4951	dev_offset, stripe_size);
				4952	if (ret)
				4953	break;
				4954	}
				4955	if (ret) {
				4956	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				4957	goto out;
				4958	}
				4959
				4960	stripe = &chunk->stripe;
				4961	for (i = 0; i < map->num_stripes; i++) {
				4962	device = map->stripes[i].dev;
				4963	dev_offset = map->stripes[i].physical;
				4964
				4965	btrfs_set_stack_stripe_devid(stripe, device->devid);
				4966	btrfs_set_stack_stripe_offset(stripe, dev_offset);
				4967	memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
				4968	stripe++;
				4969	}
				4970	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				4971
				4972	btrfs_set_stack_chunk_length(chunk, chunk_size);
				4973	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
				4974	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
				4975	btrfs_set_stack_chunk_type(chunk, map->type);
				4976	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
				4977	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
				4978	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
				4979	btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
				4980	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
				4981
				4982	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
				4983	key.type = BTRFS_CHUNK_ITEM_KEY;
				4984	key.offset = chunk_offset;
				4985
				4986	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
				4987	if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
				4988	/*
				4989	* TODO: Cleanup of inserted chunk root in case of
				4990	* failure.
				4991	*/
				4992	ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
				4993	}
				4994
				4995	out:
				4996	kfree(chunk);
				4997	free_extent_map(em);
				4998	return ret;
				4999	}
				5000
				5001	/*
				5002	* Chunk allocation falls into two parts. The first part does works
				5003	* that make the new allocated chunk useable, but not do any operation
				5004	* that modifies the chunk tree. The second part does the works that
				5005	* require modifying the chunk tree. This division is important for the
				5006	* bootstrap process of adding storage to a seed btrfs.
				5007	*/
				5008	int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
				5009	{
				5010	u64 chunk_offset;
				5011
				5012	lockdep_assert_held(&trans->fs_info->chunk_mutex);
				5013	chunk_offset = find_next_chunk(trans->fs_info);
				5014	return __btrfs_alloc_chunk(trans, chunk_offset, type);
				5015	}
				5016
				5017	static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
				5018	struct btrfs_fs_info *fs_info)
				5019	{
				5020	u64 chunk_offset;
				5021	u64 sys_chunk_offset;
				5022	u64 alloc_profile;
				5023	int ret;
				5024
				5025	chunk_offset = find_next_chunk(fs_info);
				5026	alloc_profile = btrfs_metadata_alloc_profile(fs_info);
				5027	ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile);
				5028	if (ret)
				5029	return ret;
				5030
				5031	sys_chunk_offset = find_next_chunk(fs_info);
				5032	alloc_profile = btrfs_system_alloc_profile(fs_info);
				5033	ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile);
				5034	return ret;
				5035	}
				5036
				5037	static inline int btrfs_chunk_max_errors(struct map_lookup *map)
				5038	{
				5039	int max_errors;
				5040
				5041	if (map->type & (BTRFS_BLOCK_GROUP_RAID1 \|
				5042	BTRFS_BLOCK_GROUP_RAID10 \|
				5043	BTRFS_BLOCK_GROUP_RAID5)) {
				5044	max_errors = 1;
				5045	} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
				5046	max_errors = 2;
				5047	} else {
				5048	max_errors = 0;
				5049	}
				5050
				5051	return max_errors;
				5052	}
				5053
				5054	int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
				5055	{
				5056	struct extent_map *em;
				5057	struct map_lookup *map;
				5058	int readonly = 0;
				5059	int miss_ndevs = 0;
				5060	int i;
				5061
				5062	em = get_chunk_map(fs_info, chunk_offset, 1);
				5063	if (IS_ERR(em))
				5064	return 1;
				5065
				5066	map = em->map_lookup;
				5067	for (i = 0; i < map->num_stripes; i++) {
				5068	if (test_bit(BTRFS_DEV_STATE_MISSING,
				5069	&map->stripes[i].dev->dev_state)) {
				5070	miss_ndevs++;
				5071	continue;
				5072	}
				5073	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
				5074	&map->stripes[i].dev->dev_state)) {
				5075	readonly = 1;
				5076	goto end;
				5077	}
				5078	}
				5079
				5080	/*
				5081	* If the number of missing devices is larger than max errors,
				5082	* we can not write the data into that chunk successfully, so
				5083	* set it readonly.
				5084	*/
				5085	if (miss_ndevs > btrfs_chunk_max_errors(map))
				5086	readonly = 1;
				5087	end:
				5088	free_extent_map(em);
				5089	return readonly;
				5090	}
				5091
				5092	void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
				5093	{
				5094	extent_map_tree_init(&tree->map_tree);
				5095	}
				5096
				5097	void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
				5098	{
				5099	struct extent_map *em;
				5100
				5101	while (1) {
				5102	write_lock(&tree->map_tree.lock);
				5103	em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
				5104	if (em)
				5105	remove_extent_mapping(&tree->map_tree, em);
				5106	write_unlock(&tree->map_tree.lock);
				5107	if (!em)
				5108	break;
				5109	/* once for us */
				5110	free_extent_map(em);
				5111	/* once for the tree */
				5112	free_extent_map(em);
				5113	}
				5114	}
				5115
				5116	int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
				5117	{
				5118	struct extent_map *em;
				5119	struct map_lookup *map;
				5120	int ret;
				5121
				5122	em = get_chunk_map(fs_info, logical, len);
				5123	if (IS_ERR(em))
				5124	/*
				5125	* We could return errors for these cases, but that could get
				5126	* ugly and we'd probably do the same thing which is just not do
				5127	* anything else and exit, so return 1 so the callers don't try
				5128	* to use other copies.
				5129	*/
				5130	return 1;
				5131
				5132	map = em->map_lookup;
				5133	if (map->type & (BTRFS_BLOCK_GROUP_DUP \| BTRFS_BLOCK_GROUP_RAID1))
				5134	ret = map->num_stripes;
				5135	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
				5136	ret = map->sub_stripes;
				5137	else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
				5138	ret = 2;
				5139	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
				5140	/*
				5141	* There could be two corrupted data stripes, we need
				5142	* to loop retry in order to rebuild the correct data.
				5143	*
				5144	* Fail a stripe at a time on every retry except the
				5145	* stripe under reconstruction.
				5146	*/
				5147	ret = map->num_stripes;
				5148	else
				5149	ret = 1;
				5150	free_extent_map(em);
				5151
				5152	btrfs_dev_replace_read_lock(&fs_info->dev_replace);
				5153	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
				5154	fs_info->dev_replace.tgtdev)
				5155	ret++;
				5156	btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
				5157
				5158	return ret;
				5159	}
				5160
				5161	unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
				5162	u64 logical)
				5163	{
				5164	struct extent_map *em;
				5165	struct map_lookup *map;
				5166	unsigned long len = fs_info->sectorsize;
				5167
				5168	em = get_chunk_map(fs_info, logical, len);
				5169
				5170	if (!WARN_ON(IS_ERR(em))) {
				5171	map = em->map_lookup;
				5172	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
				5173	len = map->stripe_len * nr_data_stripes(map);
				5174	free_extent_map(em);
				5175	}
				5176	return len;
				5177	}
				5178
				5179	int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
				5180	{
				5181	struct extent_map *em;
				5182	struct map_lookup *map;
				5183	int ret = 0;
				5184
				5185	em = get_chunk_map(fs_info, logical, len);
				5186
				5187	if(!WARN_ON(IS_ERR(em))) {
				5188	map = em->map_lookup;
				5189	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
				5190	ret = 1;
				5191	free_extent_map(em);
				5192	}
				5193	return ret;
				5194	}
				5195
				5196	static int find_live_mirror(struct btrfs_fs_info *fs_info,
				5197	struct map_lookup *map, int first,
				5198	int dev_replace_is_ongoing)
				5199	{
				5200	int i;
				5201	int num_stripes;
				5202	int preferred_mirror;
				5203	int tolerance;
				5204	struct btrfs_device *srcdev;
				5205
				5206	ASSERT((map->type &
				5207	(BTRFS_BLOCK_GROUP_RAID1 \| BTRFS_BLOCK_GROUP_RAID10)));
				5208
				5209	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
				5210	num_stripes = map->sub_stripes;
				5211	else
				5212	num_stripes = map->num_stripes;
				5213
				5214	preferred_mirror = first + current->pid % num_stripes;
				5215
				5216	if (dev_replace_is_ongoing &&
				5217	fs_info->dev_replace.cont_reading_from_srcdev_mode ==
				5218	BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
				5219	srcdev = fs_info->dev_replace.srcdev;
				5220	else
				5221	srcdev = NULL;
				5222
				5223	/*
				5224	* try to avoid the drive that is the source drive for a
				5225	* dev-replace procedure, only choose it if no other non-missing
				5226	* mirror is available
				5227	*/
				5228	for (tolerance = 0; tolerance < 2; tolerance++) {
				5229	if (map->stripes[preferred_mirror].dev->bdev &&
				5230	(tolerance \|\| map->stripes[preferred_mirror].dev != srcdev))
				5231	return preferred_mirror;
				5232	for (i = first; i < first + num_stripes; i++) {
				5233	if (map->stripes[i].dev->bdev &&
				5234	(tolerance \|\| map->stripes[i].dev != srcdev))
				5235	return i;
				5236	}
				5237	}
				5238
				5239	/* we couldn't find one that doesn't fail. Just return something
				5240	* and the io error handling code will clean up eventually
				5241	*/
				5242	return preferred_mirror;
				5243	}
				5244
				5245	static inline int parity_smaller(u64 a, u64 b)
				5246	{
				5247	return a > b;
				5248	}
				5249
				5250	/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
				5251	static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
				5252	{
				5253	struct btrfs_bio_stripe s;
				5254	int i;
				5255	u64 l;
				5256	int again = 1;
				5257
				5258	while (again) {
				5259	again = 0;
				5260	for (i = 0; i < num_stripes - 1; i++) {
				5261	if (parity_smaller(bbio->raid_map[i],
				5262	bbio->raid_map[i+1])) {
				5263	s = bbio->stripes[i];
				5264	l = bbio->raid_map[i];
				5265	bbio->stripes[i] = bbio->stripes[i+1];
				5266	bbio->raid_map[i] = bbio->raid_map[i+1];
				5267	bbio->stripes[i+1] = s;
				5268	bbio->raid_map[i+1] = l;
				5269
				5270	again = 1;
				5271	}
				5272	}
				5273	}
				5274	}
				5275
				5276	static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
				5277	{
				5278	struct btrfs_bio *bbio = kzalloc(
				5279	/* the size of the btrfs_bio */
				5280	sizeof(struct btrfs_bio) +
				5281	/* plus the variable array for the stripes */
				5282	sizeof(struct btrfs_bio_stripe) * (total_stripes) +
				5283	/* plus the variable array for the tgt dev */
				5284	sizeof(int) * (real_stripes) +
				5285	/*
				5286	* plus the raid_map, which includes both the tgt dev
				5287	* and the stripes
				5288	*/
				5289	sizeof(u64) * (total_stripes),
				5290	GFP_NOFS\|__GFP_NOFAIL);
				5291
				5292	atomic_set(&bbio->error, 0);
				5293	refcount_set(&bbio->refs, 1);
				5294
				5295	return bbio;
				5296	}
				5297
				5298	void btrfs_get_bbio(struct btrfs_bio *bbio)
				5299	{
				5300	WARN_ON(!refcount_read(&bbio->refs));
				5301	refcount_inc(&bbio->refs);
				5302	}
				5303
				5304	void btrfs_put_bbio(struct btrfs_bio *bbio)
				5305	{
				5306	if (!bbio)
				5307	return;
				5308	if (refcount_dec_and_test(&bbio->refs))
				5309	kfree(bbio);
				5310	}
				5311
				5312	/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
				5313	/*
				5314	* Please note that, discard won't be sent to target device of device
				5315	* replace.
				5316	*/
				5317	static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
				5318	u64 logical, u64 length,
				5319	struct btrfs_bio **bbio_ret)
				5320	{
				5321	struct extent_map *em;
				5322	struct map_lookup *map;
				5323	struct btrfs_bio *bbio;
				5324	u64 offset;
				5325	u64 stripe_nr;
				5326	u64 stripe_nr_end;
				5327	u64 stripe_end_offset;
				5328	u64 stripe_cnt;
				5329	u64 stripe_len;
				5330	u64 stripe_offset;
				5331	u64 num_stripes;
				5332	u32 stripe_index;
				5333	u32 factor = 0;
				5334	u32 sub_stripes = 0;
				5335	u64 stripes_per_dev = 0;
				5336	u32 remaining_stripes = 0;
				5337	u32 last_stripe = 0;
				5338	int ret = 0;
				5339	int i;
				5340
				5341	/* discard always return a bbio */
				5342	ASSERT(bbio_ret);
				5343
				5344	em = get_chunk_map(fs_info, logical, length);
				5345	if (IS_ERR(em))
				5346	return PTR_ERR(em);
				5347
				5348	map = em->map_lookup;
				5349	/* we don't discard raid56 yet */
				5350	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
				5351	ret = -EOPNOTSUPP;
				5352	goto out;
				5353	}
				5354
				5355	offset = logical - em->start;
				5356	length = min_t(u64, em->len - offset, length);
				5357
				5358	stripe_len = map->stripe_len;
				5359	/*
				5360	* stripe_nr counts the total number of stripes we have to stride
				5361	* to get to this block
				5362	*/
				5363	stripe_nr = div64_u64(offset, stripe_len);
				5364
				5365	/* stripe_offset is the offset of this block in its stripe */
				5366	stripe_offset = offset - stripe_nr * stripe_len;
				5367
				5368	stripe_nr_end = round_up(offset + length, map->stripe_len);
				5369	stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
				5370	stripe_cnt = stripe_nr_end - stripe_nr;
				5371	stripe_end_offset = stripe_nr_end * map->stripe_len -
				5372	(offset + length);
				5373	/*
				5374	* after this, stripe_nr is the number of stripes on this
				5375	* device we have to walk to find the data, and stripe_index is
				5376	* the number of our device in the stripe array
				5377	*/
				5378	num_stripes = 1;
				5379	stripe_index = 0;
				5380	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 \|
				5381	BTRFS_BLOCK_GROUP_RAID10)) {
				5382	if (map->type & BTRFS_BLOCK_GROUP_RAID0)
				5383	sub_stripes = 1;
				5384	else
				5385	sub_stripes = map->sub_stripes;
				5386
				5387	factor = map->num_stripes / sub_stripes;
				5388	num_stripes = min_t(u64, map->num_stripes,
				5389	sub_stripes * stripe_cnt);
				5390	stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
				5391	stripe_index *= sub_stripes;
				5392	stripes_per_dev = div_u64_rem(stripe_cnt, factor,
				5393	&remaining_stripes);
				5394	div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
				5395	last_stripe *= sub_stripes;
				5396	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 \|
				5397	BTRFS_BLOCK_GROUP_DUP)) {
				5398	num_stripes = map->num_stripes;
				5399	} else {
				5400	stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
				5401	&stripe_index);
				5402	}
				5403
				5404	bbio = alloc_btrfs_bio(num_stripes, 0);
				5405	if (!bbio) {
				5406	ret = -ENOMEM;
				5407	goto out;
				5408	}
				5409
				5410	for (i = 0; i < num_stripes; i++) {
				5411	bbio->stripes[i].physical =
				5412	map->stripes[stripe_index].physical +
				5413	stripe_offset + stripe_nr * map->stripe_len;
				5414	bbio->stripes[i].dev = map->stripes[stripe_index].dev;
				5415
				5416	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 \|
				5417	BTRFS_BLOCK_GROUP_RAID10)) {
				5418	bbio->stripes[i].length = stripes_per_dev *
				5419	map->stripe_len;
				5420
				5421	if (i / sub_stripes < remaining_stripes)
				5422	bbio->stripes[i].length +=
				5423	map->stripe_len;
				5424
				5425	/*
				5426	* Special for the first stripe and
				5427	* the last stripe:
				5428	*
				5429	* \|-------\|...\|-------\|
				5430	* \|----------\|
				5431	* off end_off
				5432	*/
				5433	if (i < sub_stripes)
				5434	bbio->stripes[i].length -=
				5435	stripe_offset;
				5436
				5437	if (stripe_index >= last_stripe &&
				5438	stripe_index <= (last_stripe +
				5439	sub_stripes - 1))
				5440	bbio->stripes[i].length -=
				5441	stripe_end_offset;
				5442
				5443	if (i == sub_stripes - 1)
				5444	stripe_offset = 0;
				5445	} else {
				5446	bbio->stripes[i].length = length;
				5447	}
				5448
				5449	stripe_index++;
				5450	if (stripe_index == map->num_stripes) {
				5451	stripe_index = 0;
				5452	stripe_nr++;
				5453	}
				5454	}
				5455
				5456	*bbio_ret = bbio;
				5457	bbio->map_type = map->type;
				5458	bbio->num_stripes = num_stripes;
				5459	out:
				5460	free_extent_map(em);
				5461	return ret;
				5462	}
				5463
				5464	/*
				5465	* In dev-replace case, for repair case (that's the only case where the mirror
				5466	* is selected explicitly when calling btrfs_map_block), blocks left of the
				5467	* left cursor can also be read from the target drive.
				5468	*
				5469	* For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
				5470	* array of stripes.
				5471	* For READ, it also needs to be supported using the same mirror number.
				5472	*
				5473	* If the requested block is not left of the left cursor, EIO is returned. This
				5474	* can happen because btrfs_num_copies() returns one more in the dev-replace
				5475	* case.
				5476	*/
				5477	static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
				5478	u64 logical, u64 length,
				5479	u64 srcdev_devid, int *mirror_num,
				5480	u64 *physical)
				5481	{
				5482	struct btrfs_bio *bbio = NULL;
				5483	int num_stripes;
				5484	int index_srcdev = 0;
				5485	int found = 0;
				5486	u64 physical_of_found = 0;
				5487	int i;
				5488	int ret = 0;
				5489
				5490	ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
				5491	logical, &length, &bbio, 0, 0);
				5492	if (ret) {
				5493	ASSERT(bbio == NULL);
				5494	return ret;
				5495	}
				5496
				5497	num_stripes = bbio->num_stripes;
				5498	if (*mirror_num > num_stripes) {
				5499	/*
				5500	* BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
				5501	* that means that the requested area is not left of the left
				5502	* cursor
				5503	*/
				5504	btrfs_put_bbio(bbio);
				5505	return -EIO;
				5506	}
				5507
				5508	/*
				5509	* process the rest of the function using the mirror_num of the source
				5510	* drive. Therefore look it up first. At the end, patch the device
				5511	* pointer to the one of the target drive.
				5512	*/
				5513	for (i = 0; i < num_stripes; i++) {
				5514	if (bbio->stripes[i].dev->devid != srcdev_devid)
				5515	continue;
				5516
				5517	/*
				5518	* In case of DUP, in order to keep it simple, only add the
				5519	* mirror with the lowest physical address
				5520	*/
				5521	if (found &&
				5522	physical_of_found <= bbio->stripes[i].physical)
				5523	continue;
				5524
				5525	index_srcdev = i;
				5526	found = 1;
				5527	physical_of_found = bbio->stripes[i].physical;
				5528	}
				5529
				5530	btrfs_put_bbio(bbio);
				5531
				5532	ASSERT(found);
				5533	if (!found)
				5534	return -EIO;
				5535
				5536	*mirror_num = index_srcdev + 1;
				5537	*physical = physical_of_found;
				5538	return ret;
				5539	}
				5540
				5541	static void handle_ops_on_dev_replace(enum btrfs_map_op op,
				5542	struct btrfs_bio **bbio_ret,
				5543	struct btrfs_dev_replace *dev_replace,
				5544	int num_stripes_ret, int max_errors_ret)
				5545	{
				5546	struct btrfs_bio bbio = bbio_ret;
				5547	u64 srcdev_devid = dev_replace->srcdev->devid;
				5548	int tgtdev_indexes = 0;
				5549	int num_stripes = *num_stripes_ret;
				5550	int max_errors = *max_errors_ret;
				5551	int i;
				5552
				5553	if (op == BTRFS_MAP_WRITE) {
				5554	int index_where_to_add;
				5555
				5556	/*
				5557	* duplicate the write operations while the dev replace
				5558	* procedure is running. Since the copying of the old disk to
				5559	* the new disk takes place at run time while the filesystem is
				5560	* mounted writable, the regular write operations to the old
				5561	* disk have to be duplicated to go to the new disk as well.
				5562	*
				5563	* Note that device->missing is handled by the caller, and that
				5564	* the write to the old disk is already set up in the stripes
				5565	* array.
				5566	*/
				5567	index_where_to_add = num_stripes;
				5568	for (i = 0; i < num_stripes; i++) {
				5569	if (bbio->stripes[i].dev->devid == srcdev_devid) {
				5570	/* write to new disk, too */
				5571	struct btrfs_bio_stripe *new =
				5572	bbio->stripes + index_where_to_add;
				5573	struct btrfs_bio_stripe *old =
				5574	bbio->stripes + i;
				5575
				5576	new->physical = old->physical;
				5577	new->length = old->length;
				5578	new->dev = dev_replace->tgtdev;
				5579	bbio->tgtdev_map[i] = index_where_to_add;
				5580	index_where_to_add++;
				5581	max_errors++;
				5582	tgtdev_indexes++;
				5583	}
				5584	}
				5585	num_stripes = index_where_to_add;
				5586	} else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
				5587	int index_srcdev = 0;
				5588	int found = 0;
				5589	u64 physical_of_found = 0;
				5590
				5591	/*
				5592	* During the dev-replace procedure, the target drive can also
				5593	* be used to read data in case it is needed to repair a corrupt
				5594	* block elsewhere. This is possible if the requested area is
				5595	* left of the left cursor. In this area, the target drive is a
				5596	* full copy of the source drive.
				5597	*/
				5598	for (i = 0; i < num_stripes; i++) {
				5599	if (bbio->stripes[i].dev->devid == srcdev_devid) {
				5600	/*
				5601	* In case of DUP, in order to keep it simple,
				5602	* only add the mirror with the lowest physical
				5603	* address
				5604	*/
				5605	if (found &&
				5606	physical_of_found <=
				5607	bbio->stripes[i].physical)
				5608	continue;
				5609	index_srcdev = i;
				5610	found = 1;
				5611	physical_of_found = bbio->stripes[i].physical;
				5612	}
				5613	}
				5614	if (found) {
				5615	struct btrfs_bio_stripe *tgtdev_stripe =
				5616	bbio->stripes + num_stripes;
				5617
				5618	tgtdev_stripe->physical = physical_of_found;
				5619	tgtdev_stripe->length =
				5620	bbio->stripes[index_srcdev].length;
				5621	tgtdev_stripe->dev = dev_replace->tgtdev;
				5622	bbio->tgtdev_map[index_srcdev] = num_stripes;
				5623
				5624	tgtdev_indexes++;
				5625	num_stripes++;
				5626	}
				5627	}
				5628
				5629	*num_stripes_ret = num_stripes;
				5630	*max_errors_ret = max_errors;
				5631	bbio->num_tgtdevs = tgtdev_indexes;
				5632	*bbio_ret = bbio;
				5633	}
				5634
				5635	static bool need_full_stripe(enum btrfs_map_op op)
				5636	{
				5637	return (op == BTRFS_MAP_WRITE \|\| op == BTRFS_MAP_GET_READ_MIRRORS);
				5638	}
				5639
				5640	static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
				5641	enum btrfs_map_op op,
				5642	u64 logical, u64 *length,
				5643	struct btrfs_bio **bbio_ret,
				5644	int mirror_num, int need_raid_map)
				5645	{
				5646	struct extent_map *em;
				5647	struct map_lookup *map;
				5648	u64 offset;
				5649	u64 stripe_offset;
				5650	u64 stripe_nr;
				5651	u64 stripe_len;
				5652	u32 stripe_index;
				5653	int i;
				5654	int ret = 0;
				5655	int num_stripes;
				5656	int max_errors = 0;
				5657	int tgtdev_indexes = 0;
				5658	struct btrfs_bio *bbio = NULL;
				5659	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
				5660	int dev_replace_is_ongoing = 0;
				5661	int num_alloc_stripes;
				5662	int patch_the_first_stripe_for_dev_replace = 0;
				5663	u64 physical_to_patch_in_first_stripe = 0;
				5664	u64 raid56_full_stripe_start = (u64)-1;
				5665
				5666	if (op == BTRFS_MAP_DISCARD)
				5667	return __btrfs_map_block_for_discard(fs_info, logical,
				5668	*length, bbio_ret);
				5669
				5670	em = get_chunk_map(fs_info, logical, *length);
				5671	if (IS_ERR(em))
				5672	return PTR_ERR(em);
				5673
				5674	map = em->map_lookup;
				5675	offset = logical - em->start;
				5676
				5677	stripe_len = map->stripe_len;
				5678	stripe_nr = offset;
				5679	/*
				5680	* stripe_nr counts the total number of stripes we have to stride
				5681	* to get to this block
				5682	*/
				5683	stripe_nr = div64_u64(stripe_nr, stripe_len);
				5684
				5685	stripe_offset = stripe_nr * stripe_len;
				5686	if (offset < stripe_offset) {
				5687	btrfs_crit(fs_info,
				5688	"stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu",
				5689	stripe_offset, offset, em->start, logical,
				5690	stripe_len);
				5691	free_extent_map(em);
				5692	return -EINVAL;
				5693	}
				5694
				5695	/* stripe_offset is the offset of this block in its stripe*/
				5696	stripe_offset = offset - stripe_offset;
				5697
				5698	/* if we're here for raid56, we need to know the stripe aligned start */
				5699	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
				5700	unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
				5701	raid56_full_stripe_start = offset;
				5702
				5703	/* allow a write of a full stripe, but make sure we don't
				5704	* allow straddling of stripes
				5705	*/
				5706	raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
				5707	full_stripe_len);
				5708	raid56_full_stripe_start *= full_stripe_len;
				5709	}
				5710
				5711	if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
				5712	u64 max_len;
				5713	/* For writes to RAID[56], allow a full stripeset across all disks.
				5714	For other RAID types and for RAID[56] reads, just allow a single
				5715	stripe (on a single disk). */
				5716	if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
				5717	(op == BTRFS_MAP_WRITE)) {
				5718	max_len = stripe_len * nr_data_stripes(map) -
				5719	(offset - raid56_full_stripe_start);
				5720	} else {
				5721	/* we limit the length of each bio to what fits in a stripe */
				5722	max_len = stripe_len - stripe_offset;
				5723	}
				5724	*length = min_t(u64, em->len - offset, max_len);
				5725	} else {
				5726	*length = em->len - offset;
				5727	}
				5728
				5729	/* This is for when we're called from btrfs_merge_bio_hook() and all
				5730	it cares about is the length */
				5731	if (!bbio_ret)
				5732	goto out;
				5733
				5734	btrfs_dev_replace_read_lock(dev_replace);
				5735	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
				5736	if (!dev_replace_is_ongoing)
				5737	btrfs_dev_replace_read_unlock(dev_replace);
				5738	else
				5739	btrfs_dev_replace_set_lock_blocking(dev_replace);
				5740
				5741	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
				5742	!need_full_stripe(op) && dev_replace->tgtdev != NULL) {
				5743	ret = get_extra_mirror_from_replace(fs_info, logical, *length,
				5744	dev_replace->srcdev->devid,
				5745	&mirror_num,
				5746	&physical_to_patch_in_first_stripe);
				5747	if (ret)
				5748	goto out;
				5749	else
				5750	patch_the_first_stripe_for_dev_replace = 1;
				5751	} else if (mirror_num > map->num_stripes) {
				5752	mirror_num = 0;
				5753	}
				5754
				5755	num_stripes = 1;
				5756	stripe_index = 0;
				5757	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
				5758	stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
				5759	&stripe_index);
				5760	if (!need_full_stripe(op))
				5761	mirror_num = 1;
				5762	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
				5763	if (need_full_stripe(op))
				5764	num_stripes = map->num_stripes;
				5765	else if (mirror_num)
				5766	stripe_index = mirror_num - 1;
				5767	else {
				5768	stripe_index = find_live_mirror(fs_info, map, 0,
				5769	dev_replace_is_ongoing);
				5770	mirror_num = stripe_index + 1;
				5771	}
				5772
				5773	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
				5774	if (need_full_stripe(op)) {
				5775	num_stripes = map->num_stripes;
				5776	} else if (mirror_num) {
				5777	stripe_index = mirror_num - 1;
				5778	} else {
				5779	mirror_num = 1;
				5780	}
				5781
				5782	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
				5783	u32 factor = map->num_stripes / map->sub_stripes;
				5784
				5785	stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
				5786	stripe_index *= map->sub_stripes;
				5787
				5788	if (need_full_stripe(op))
				5789	num_stripes = map->sub_stripes;
				5790	else if (mirror_num)
				5791	stripe_index += mirror_num - 1;
				5792	else {
				5793	int old_stripe_index = stripe_index;
				5794	stripe_index = find_live_mirror(fs_info, map,
				5795	stripe_index,
				5796	dev_replace_is_ongoing);
				5797	mirror_num = stripe_index - old_stripe_index + 1;
				5798	}
				5799
				5800	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
				5801	if (need_raid_map && (need_full_stripe(op) \|\| mirror_num > 1)) {
				5802	/* push stripe_nr back to the start of the full stripe */
				5803	stripe_nr = div64_u64(raid56_full_stripe_start,
				5804	stripe_len * nr_data_stripes(map));
				5805
				5806	/* RAID[56] write or recovery. Return all stripes */
				5807	num_stripes = map->num_stripes;
				5808	max_errors = nr_parity_stripes(map);
				5809
				5810	*length = map->stripe_len;
				5811	stripe_index = 0;
				5812	stripe_offset = 0;
				5813	} else {
				5814	/*
				5815	* Mirror #0 or #1 means the original data block.
				5816	* Mirror #2 is RAID5 parity block.
				5817	* Mirror #3 is RAID6 Q block.
				5818	*/
				5819	stripe_nr = div_u64_rem(stripe_nr,
				5820	nr_data_stripes(map), &stripe_index);
				5821	if (mirror_num > 1)
				5822	stripe_index = nr_data_stripes(map) +
				5823	mirror_num - 2;
				5824
				5825	/* We distribute the parity blocks across stripes */
				5826	div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
				5827	&stripe_index);
				5828	if (!need_full_stripe(op) && mirror_num <= 1)
				5829	mirror_num = 1;
				5830	}
				5831	} else {
				5832	/*
				5833	* after this, stripe_nr is the number of stripes on this
				5834	* device we have to walk to find the data, and stripe_index is
				5835	* the number of our device in the stripe array
				5836	*/
				5837	stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
				5838	&stripe_index);
				5839	mirror_num = stripe_index + 1;
				5840	}
				5841	if (stripe_index >= map->num_stripes) {
				5842	btrfs_crit(fs_info,
				5843	"stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
				5844	stripe_index, map->num_stripes);
				5845	ret = -EINVAL;
				5846	goto out;
				5847	}
				5848
				5849	num_alloc_stripes = num_stripes;
				5850	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
				5851	if (op == BTRFS_MAP_WRITE)
				5852	num_alloc_stripes <<= 1;
				5853	if (op == BTRFS_MAP_GET_READ_MIRRORS)
				5854	num_alloc_stripes++;
				5855	tgtdev_indexes = num_stripes;
				5856	}
				5857
				5858	bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
				5859	if (!bbio) {
				5860	ret = -ENOMEM;
				5861	goto out;
				5862	}
				5863	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
				5864	bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
				5865
				5866	/* build raid_map */
				5867	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
				5868	(need_full_stripe(op) \|\| mirror_num > 1)) {
				5869	u64 tmp;
				5870	unsigned rot;
				5871
				5872	bbio->raid_map = (u64 )((void )bbio->stripes +
				5873	sizeof(struct btrfs_bio_stripe) *
				5874	num_alloc_stripes +
				5875	sizeof(int) * tgtdev_indexes);
				5876
				5877	/* Work out the disk rotation on this stripe-set */
				5878	div_u64_rem(stripe_nr, num_stripes, &rot);
				5879
				5880	/* Fill in the logical address of each stripe */
				5881	tmp = stripe_nr * nr_data_stripes(map);
				5882	for (i = 0; i < nr_data_stripes(map); i++)
				5883	bbio->raid_map[(i+rot) % num_stripes] =
				5884	em->start + (tmp + i) * map->stripe_len;
				5885
				5886	bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
				5887	if (map->type & BTRFS_BLOCK_GROUP_RAID6)
				5888	bbio->raid_map[(i+rot+1) % num_stripes] =
				5889	RAID6_Q_STRIPE;
				5890	}
				5891
				5892
				5893	for (i = 0; i < num_stripes; i++) {
				5894	bbio->stripes[i].physical =
				5895	map->stripes[stripe_index].physical +
				5896	stripe_offset +
				5897	stripe_nr * map->stripe_len;
				5898	bbio->stripes[i].dev =
				5899	map->stripes[stripe_index].dev;
				5900	stripe_index++;
				5901	}
				5902
				5903	if (need_full_stripe(op))
				5904	max_errors = btrfs_chunk_max_errors(map);
				5905
				5906	if (bbio->raid_map)
				5907	sort_parity_stripes(bbio, num_stripes);
				5908
				5909	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
				5910	need_full_stripe(op)) {
				5911	handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
				5912	&max_errors);
				5913	}
				5914
				5915	*bbio_ret = bbio;
				5916	bbio->map_type = map->type;
				5917	bbio->num_stripes = num_stripes;
				5918	bbio->max_errors = max_errors;
				5919	bbio->mirror_num = mirror_num;
				5920
				5921	/*
				5922	* this is the case that REQ_READ && dev_replace_is_ongoing &&
				5923	* mirror_num == num_stripes + 1 && dev_replace target drive is
				5924	* available as a mirror
				5925	*/
				5926	if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
				5927	WARN_ON(num_stripes > 1);
				5928	bbio->stripes[0].dev = dev_replace->tgtdev;
				5929	bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
				5930	bbio->mirror_num = map->num_stripes + 1;
				5931	}
				5932	out:
				5933	if (dev_replace_is_ongoing) {
				5934	btrfs_dev_replace_clear_lock_blocking(dev_replace);
				5935	btrfs_dev_replace_read_unlock(dev_replace);
				5936	}
				5937	free_extent_map(em);
				5938	return ret;
				5939	}
				5940
				5941	int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
				5942	u64 logical, u64 *length,
				5943	struct btrfs_bio **bbio_ret, int mirror_num)
				5944	{
				5945	return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
				5946	mirror_num, 0);
				5947	}
				5948
				5949	/* For Scrub/replace */
				5950	int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
				5951	u64 logical, u64 *length,
				5952	struct btrfs_bio **bbio_ret)
				5953	{
				5954	return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
				5955	}
				5956
				5957	int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
				5958	u64 physical, u64 *logical, int naddrs, int *stripe_len)
				5959	{
				5960	struct extent_map *em;
				5961	struct map_lookup *map;
				5962	u64 *buf;
				5963	u64 bytenr;
				5964	u64 length;
				5965	u64 stripe_nr;
				5966	u64 rmap_len;
				5967	int i, j, nr = 0;
				5968
				5969	em = get_chunk_map(fs_info, chunk_start, 1);
				5970	if (IS_ERR(em))
				5971	return -EIO;
				5972
				5973	map = em->map_lookup;
				5974	length = em->len;
				5975	rmap_len = map->stripe_len;
				5976
				5977	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
				5978	length = div_u64(length, map->num_stripes / map->sub_stripes);
				5979	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
				5980	length = div_u64(length, map->num_stripes);
				5981	else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
				5982	length = div_u64(length, nr_data_stripes(map));
				5983	rmap_len = map->stripe_len * nr_data_stripes(map);
				5984	}
				5985
				5986	buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
				5987	BUG_ON(!buf); /* -ENOMEM */
				5988
				5989	for (i = 0; i < map->num_stripes; i++) {
				5990	if (map->stripes[i].physical > physical \|\|
				5991	map->stripes[i].physical + length <= physical)
				5992	continue;
				5993
				5994	stripe_nr = physical - map->stripes[i].physical;
				5995	stripe_nr = div64_u64(stripe_nr, map->stripe_len);
				5996
				5997	if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
				5998	stripe_nr = stripe_nr * map->num_stripes + i;
				5999	stripe_nr = div_u64(stripe_nr, map->sub_stripes);
				6000	} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
				6001	stripe_nr = stripe_nr * map->num_stripes + i;
				6002	} /* else if RAID[56], multiply by nr_data_stripes().
				6003	* Alternatively, just use rmap_len below instead of
				6004	* map->stripe_len */
				6005
				6006	bytenr = chunk_start + stripe_nr * rmap_len;
				6007	WARN_ON(nr >= map->num_stripes);
				6008	for (j = 0; j < nr; j++) {
				6009	if (buf[j] == bytenr)
				6010	break;
				6011	}
				6012	if (j == nr) {
				6013	WARN_ON(nr >= map->num_stripes);
				6014	buf[nr++] = bytenr;
				6015	}
				6016	}
				6017
				6018	*logical = buf;
				6019	*naddrs = nr;
				6020	*stripe_len = rmap_len;
				6021
				6022	free_extent_map(em);
				6023	return 0;
				6024	}
				6025
				6026	static inline void btrfs_end_bbio(struct btrfs_bio bbio, struct bio bio)
				6027	{
				6028	bio->bi_private = bbio->private;
				6029	bio->bi_end_io = bbio->end_io;
				6030	bio_endio(bio);
				6031
				6032	btrfs_put_bbio(bbio);
				6033	}
				6034
				6035	static void btrfs_end_bio(struct bio *bio)
				6036	{
				6037	struct btrfs_bio *bbio = bio->bi_private;
				6038	int is_orig_bio = 0;
				6039
				6040	if (bio->bi_status) {
				6041	atomic_inc(&bbio->error);
				6042	if (bio->bi_status == BLK_STS_IOERR \|\|
				6043	bio->bi_status == BLK_STS_TARGET) {
				6044	unsigned int stripe_index =
				6045	btrfs_io_bio(bio)->stripe_index;
				6046	struct btrfs_device *dev;
				6047
				6048	BUG_ON(stripe_index >= bbio->num_stripes);
				6049	dev = bbio->stripes[stripe_index].dev;
				6050	if (dev->bdev) {
				6051	if (bio_op(bio) == REQ_OP_WRITE)
				6052	btrfs_dev_stat_inc_and_print(dev,
				6053	BTRFS_DEV_STAT_WRITE_ERRS);
				6054	else if (!(bio->bi_opf & REQ_RAHEAD))
				6055	btrfs_dev_stat_inc_and_print(dev,
				6056	BTRFS_DEV_STAT_READ_ERRS);
				6057	if (bio->bi_opf & REQ_PREFLUSH)
				6058	btrfs_dev_stat_inc_and_print(dev,
				6059	BTRFS_DEV_STAT_FLUSH_ERRS);
				6060	}
				6061	}
				6062	}
				6063
				6064	if (bio == bbio->orig_bio)
				6065	is_orig_bio = 1;
				6066
				6067	btrfs_bio_counter_dec(bbio->fs_info);
				6068
				6069	if (atomic_dec_and_test(&bbio->stripes_pending)) {
				6070	if (!is_orig_bio) {
				6071	bio_put(bio);
				6072	bio = bbio->orig_bio;
				6073	}
				6074
				6075	btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
				6076	/* only send an error to the higher layers if it is
				6077	* beyond the tolerance of the btrfs bio
				6078	*/
				6079	if (atomic_read(&bbio->error) > bbio->max_errors) {
				6080	bio->bi_status = BLK_STS_IOERR;
				6081	} else {
				6082	/*
				6083	* this bio is actually up to date, we didn't
				6084	* go over the max number of errors
				6085	*/
				6086	bio->bi_status = BLK_STS_OK;
				6087	}
				6088
				6089	btrfs_end_bbio(bbio, bio);
				6090	} else if (!is_orig_bio) {
				6091	bio_put(bio);
				6092	}
				6093	}
				6094
				6095	/*
				6096	* see run_scheduled_bios for a description of why bios are collected for
				6097	* async submit.
				6098	*
				6099	* This will add one bio to the pending list for a device and make sure
				6100	* the work struct is scheduled.
				6101	*/
				6102	static noinline void btrfs_schedule_bio(struct btrfs_device *device,
				6103	struct bio *bio)
				6104	{
				6105	struct btrfs_fs_info *fs_info = device->fs_info;
				6106	int should_queue = 1;
				6107	struct btrfs_pending_bios *pending_bios;
				6108
				6109	/* don't bother with additional async steps for reads, right now */
				6110	if (bio_op(bio) == REQ_OP_READ) {
				6111	btrfsic_submit_bio(bio);
				6112	return;
				6113	}
				6114
				6115	WARN_ON(bio->bi_next);
				6116	bio->bi_next = NULL;
				6117
				6118	spin_lock(&device->io_lock);
				6119	if (op_is_sync(bio->bi_opf))
				6120	pending_bios = &device->pending_sync_bios;
				6121	else
				6122	pending_bios = &device->pending_bios;
				6123
				6124	if (pending_bios->tail)
				6125	pending_bios->tail->bi_next = bio;
				6126
				6127	pending_bios->tail = bio;
				6128	if (!pending_bios->head)
				6129	pending_bios->head = bio;
				6130	if (device->running_pending)
				6131	should_queue = 0;
				6132
				6133	spin_unlock(&device->io_lock);
				6134
				6135	if (should_queue)
				6136	btrfs_queue_work(fs_info->submit_workers, &device->work);
				6137	}
				6138
				6139	static void submit_stripe_bio(struct btrfs_bio bbio, struct bio bio,
				6140	u64 physical, int dev_nr, int async)
				6141	{
				6142	struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
				6143	struct btrfs_fs_info *fs_info = bbio->fs_info;
				6144
				6145	bio->bi_private = bbio;
				6146	btrfs_io_bio(bio)->stripe_index = dev_nr;
				6147	bio->bi_end_io = btrfs_end_bio;
				6148	bio->bi_iter.bi_sector = physical >> 9;
				6149	btrfs_debug_in_rcu(fs_info,
				6150	"btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
				6151	bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector,
				6152	(u_long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid,
				6153	bio->bi_iter.bi_size);
				6154	bio_set_dev(bio, dev->bdev);
				6155
				6156	btrfs_bio_counter_inc_noblocked(fs_info);
				6157
				6158	if (async)
				6159	btrfs_schedule_bio(dev, bio);
				6160	else
				6161	btrfsic_submit_bio(bio);
				6162	}
				6163
				6164	static void bbio_error(struct btrfs_bio bbio, struct bio bio, u64 logical)
				6165	{
				6166	atomic_inc(&bbio->error);
				6167	if (atomic_dec_and_test(&bbio->stripes_pending)) {
				6168	/* Should be the original bio. */
				6169	WARN_ON(bio != bbio->orig_bio);
				6170
				6171	btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
				6172	bio->bi_iter.bi_sector = logical >> 9;
				6173	if (atomic_read(&bbio->error) > bbio->max_errors)
				6174	bio->bi_status = BLK_STS_IOERR;
				6175	else
				6176	bio->bi_status = BLK_STS_OK;
				6177	btrfs_end_bbio(bbio, bio);
				6178	}
				6179	}
				6180
				6181	blk_status_t btrfs_map_bio(struct btrfs_fs_info fs_info, struct bio bio,
				6182	int mirror_num, int async_submit)
				6183	{
				6184	struct btrfs_device *dev;
				6185	struct bio *first_bio = bio;
				6186	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
				6187	u64 length = 0;
				6188	u64 map_length;
				6189	int ret;
				6190	int dev_nr;
				6191	int total_devs;
				6192	struct btrfs_bio *bbio = NULL;
				6193
				6194	length = bio->bi_iter.bi_size;
				6195	map_length = length;
				6196
				6197	btrfs_bio_counter_inc_blocked(fs_info);
				6198	ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
				6199	&map_length, &bbio, mirror_num, 1);
				6200	if (ret) {
				6201	btrfs_bio_counter_dec(fs_info);
				6202	return errno_to_blk_status(ret);
				6203	}
				6204
				6205	total_devs = bbio->num_stripes;
				6206	bbio->orig_bio = first_bio;
				6207	bbio->private = first_bio->bi_private;
				6208	bbio->end_io = first_bio->bi_end_io;
				6209	bbio->fs_info = fs_info;
				6210	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
				6211
				6212	if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
				6213	((bio_op(bio) == REQ_OP_WRITE) \|\| (mirror_num > 1))) {
				6214	/* In this case, map_length has been set to the length of
				6215	a single stripe; not the whole write */
				6216	if (bio_op(bio) == REQ_OP_WRITE) {
				6217	ret = raid56_parity_write(fs_info, bio, bbio,
				6218	map_length);
				6219	} else {
				6220	ret = raid56_parity_recover(fs_info, bio, bbio,
				6221	map_length, mirror_num, 1);
				6222	}
				6223
				6224	btrfs_bio_counter_dec(fs_info);
				6225	return errno_to_blk_status(ret);
				6226	}
				6227
				6228	if (map_length < length) {
				6229	btrfs_crit(fs_info,
				6230	"mapping failed logical %llu bio len %llu len %llu",
				6231	logical, length, map_length);
				6232	BUG();
				6233	}
				6234
				6235	for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
				6236	dev = bbio->stripes[dev_nr].dev;
				6237	if (!dev \|\| !dev->bdev \|\| test_bit(BTRFS_DEV_STATE_MISSING,
				6238	&dev->dev_state) \|\|
				6239	(bio_op(first_bio) == REQ_OP_WRITE &&
				6240	!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
				6241	bbio_error(bbio, first_bio, logical);
				6242	continue;
				6243	}
				6244
				6245	if (dev_nr < total_devs - 1)
				6246	bio = btrfs_bio_clone(first_bio);
				6247	else
				6248	bio = first_bio;
				6249
				6250	submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
				6251	dev_nr, async_submit);
				6252	}
				6253	btrfs_bio_counter_dec(fs_info);
				6254	return BLK_STS_OK;
				6255	}
				6256
				6257	struct btrfs_device btrfs_find_device(struct btrfs_fs_info fs_info, u64 devid,
				6258	u8 uuid, u8 fsid)
				6259	{
				6260	struct btrfs_device *device;
				6261	struct btrfs_fs_devices *cur_devices;
				6262
				6263	cur_devices = fs_info->fs_devices;
				6264	while (cur_devices) {
				6265	if (!fsid \|\|
				6266	!memcmp(cur_devices->fsid, fsid, BTRFS_FSID_SIZE)) {
				6267	device = find_device(cur_devices, devid, uuid);
				6268	if (device)
				6269	return device;
				6270	}
				6271	cur_devices = cur_devices->seed;
				6272	}
				6273	return NULL;
				6274	}
				6275
				6276	static struct btrfs_device add_missing_dev(struct btrfs_fs_devices fs_devices,
				6277	u64 devid, u8 *dev_uuid)
				6278	{
				6279	struct btrfs_device *device;
				6280
				6281	device = btrfs_alloc_device(NULL, &devid, dev_uuid);
				6282	if (IS_ERR(device))
				6283	return device;
				6284
				6285	list_add(&device->dev_list, &fs_devices->devices);
				6286	device->fs_devices = fs_devices;
				6287	fs_devices->num_devices++;
				6288
				6289	set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
				6290	fs_devices->missing_devices++;
				6291
				6292	return device;
				6293	}
				6294
				6295	/**
				6296	* btrfs_alloc_device - allocate struct btrfs_device
				6297	* @fs_info: used only for generating a new devid, can be NULL if
				6298	* devid is provided (i.e. @devid != NULL).
				6299	* @devid: a pointer to devid for this device. If NULL a new devid
				6300	* is generated.
				6301	* @uuid: a pointer to UUID for this device. If NULL a new UUID
				6302	* is generated.
				6303	*
				6304	* Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
				6305	* on error. Returned struct is not linked onto any lists and must be
				6306	* destroyed with btrfs_free_device.
				6307	*/
				6308	struct btrfs_device btrfs_alloc_device(struct btrfs_fs_info fs_info,
				6309	const u64 *devid,
				6310	const u8 *uuid)
				6311	{
				6312	struct btrfs_device *dev;
				6313	u64 tmp;
				6314
				6315	if (WARN_ON(!devid && !fs_info))
				6316	return ERR_PTR(-EINVAL);
				6317
				6318	dev = __alloc_device();
				6319	if (IS_ERR(dev))
				6320	return dev;
				6321
				6322	if (devid)
				6323	tmp = *devid;
				6324	else {
				6325	int ret;
				6326
				6327	ret = find_next_devid(fs_info, &tmp);
				6328	if (ret) {
				6329	btrfs_free_device(dev);
				6330	return ERR_PTR(ret);
				6331	}
				6332	}
				6333	dev->devid = tmp;
				6334
				6335	if (uuid)
				6336	memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
				6337	else
				6338	generate_random_uuid(dev->uuid);
				6339
				6340	btrfs_init_work(&dev->work, btrfs_submit_helper,
				6341	pending_bios_fn, NULL, NULL);
				6342
				6343	return dev;
				6344	}
				6345
				6346	/* Return -EIO if any error, otherwise return 0. */
				6347	static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
				6348	struct extent_buffer *leaf,
				6349	struct btrfs_chunk *chunk, u64 logical)
				6350	{
				6351	u64 length;
				6352	u64 stripe_len;
				6353	u16 num_stripes;
				6354	u16 sub_stripes;
				6355	u64 type;
				6356	u64 features;
				6357	bool mixed = false;
				6358
				6359	length = btrfs_chunk_length(leaf, chunk);
				6360	stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
				6361	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
				6362	sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
				6363	type = btrfs_chunk_type(leaf, chunk);
				6364
				6365	if (!num_stripes) {
				6366	btrfs_err(fs_info, "invalid chunk num_stripes: %u",
				6367	num_stripes);
				6368	return -EIO;
				6369	}
				6370	if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
				6371	btrfs_err(fs_info, "invalid chunk logical %llu", logical);
				6372	return -EIO;
				6373	}
				6374	if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) {
				6375	btrfs_err(fs_info, "invalid chunk sectorsize %u",
				6376	btrfs_chunk_sector_size(leaf, chunk));
				6377	return -EIO;
				6378	}
				6379	if (!length \|\| !IS_ALIGNED(length, fs_info->sectorsize)) {
				6380	btrfs_err(fs_info, "invalid chunk length %llu", length);
				6381	return -EIO;
				6382	}
				6383	if (!is_power_of_2(stripe_len) \|\| stripe_len != BTRFS_STRIPE_LEN) {
				6384	btrfs_err(fs_info, "invalid chunk stripe length: %llu",
				6385	stripe_len);
				6386	return -EIO;
				6387	}
				6388	if (~(BTRFS_BLOCK_GROUP_TYPE_MASK \| BTRFS_BLOCK_GROUP_PROFILE_MASK) &
				6389	type) {
				6390	btrfs_err(fs_info, "unrecognized chunk type: %llu",
				6391	~(BTRFS_BLOCK_GROUP_TYPE_MASK \|
				6392	BTRFS_BLOCK_GROUP_PROFILE_MASK) &
				6393	btrfs_chunk_type(leaf, chunk));
				6394	return -EIO;
				6395	}
				6396
				6397	if ((type & BTRFS_BLOCK_GROUP_TYPE_MASK) == 0) {
				6398	btrfs_err(fs_info, "missing chunk type flag: 0x%llx", type);
				6399	return -EIO;
				6400	}
				6401
				6402	if ((type & BTRFS_BLOCK_GROUP_SYSTEM) &&
				6403	(type & (BTRFS_BLOCK_GROUP_METADATA \| BTRFS_BLOCK_GROUP_DATA))) {
				6404	btrfs_err(fs_info,
				6405	"system chunk with data or metadata type: 0x%llx", type);
				6406	return -EIO;
				6407	}
				6408
				6409	features = btrfs_super_incompat_flags(fs_info->super_copy);
				6410	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
				6411	mixed = true;
				6412
				6413	if (!mixed) {
				6414	if ((type & BTRFS_BLOCK_GROUP_METADATA) &&
				6415	(type & BTRFS_BLOCK_GROUP_DATA)) {
				6416	btrfs_err(fs_info,
				6417	"mixed chunk type in non-mixed mode: 0x%llx", type);
				6418	return -EIO;
				6419	}
				6420	}
				6421
				6422	if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) \|\|
				6423	(type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) \|\|
				6424	(type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) \|\|
				6425	(type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) \|\|
				6426	(type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) \|\|
				6427	((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
				6428	num_stripes != 1)) {
				6429	btrfs_err(fs_info,
				6430	"invalid num_stripes:sub_stripes %u:%u for profile %llu",
				6431	num_stripes, sub_stripes,
				6432	type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
				6433	return -EIO;
				6434	}
				6435
				6436	return 0;
				6437	}
				6438
				6439	static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
				6440	u64 devid, u8 *uuid, bool error)
				6441	{
				6442	if (error)
				6443	btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
				6444	devid, uuid);
				6445	else
				6446	btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
				6447	devid, uuid);
				6448	}
				6449
				6450	static int read_one_chunk(struct btrfs_fs_info fs_info, struct btrfs_key key,
				6451	struct extent_buffer *leaf,
				6452	struct btrfs_chunk *chunk)
				6453	{
				6454	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
				6455	struct map_lookup *map;
				6456	struct extent_map *em;
				6457	u64 logical;
				6458	u64 length;
				6459	u64 devid;
				6460	u8 uuid[BTRFS_UUID_SIZE];
				6461	int num_stripes;
				6462	int ret;
				6463	int i;
				6464
				6465	logical = key->offset;
				6466	length = btrfs_chunk_length(leaf, chunk);
				6467	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
				6468
				6469	ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical);
				6470	if (ret)
				6471	return ret;
				6472
				6473	read_lock(&map_tree->map_tree.lock);
				6474	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
				6475	read_unlock(&map_tree->map_tree.lock);
				6476
				6477	/* already mapped? */
				6478	if (em && em->start <= logical && em->start + em->len > logical) {
				6479	free_extent_map(em);
				6480	return 0;
				6481	} else if (em) {
				6482	free_extent_map(em);
				6483	}
				6484
				6485	em = alloc_extent_map();
				6486	if (!em)
				6487	return -ENOMEM;
				6488	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
				6489	if (!map) {
				6490	free_extent_map(em);
				6491	return -ENOMEM;
				6492	}
				6493
				6494	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
				6495	em->map_lookup = map;
				6496	em->start = logical;
				6497	em->len = length;
				6498	em->orig_start = 0;
				6499	em->block_start = 0;
				6500	em->block_len = em->len;
				6501
				6502	map->num_stripes = num_stripes;
				6503	map->io_width = btrfs_chunk_io_width(leaf, chunk);
				6504	map->io_align = btrfs_chunk_io_align(leaf, chunk);
				6505	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
				6506	map->type = btrfs_chunk_type(leaf, chunk);
				6507	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
				6508	map->verified_stripes = 0;
				6509	for (i = 0; i < num_stripes; i++) {
				6510	map->stripes[i].physical =
				6511	btrfs_stripe_offset_nr(leaf, chunk, i);
				6512	devid = btrfs_stripe_devid_nr(leaf, chunk, i);
				6513	read_extent_buffer(leaf, uuid, (unsigned long)
				6514	btrfs_stripe_dev_uuid_nr(chunk, i),
				6515	BTRFS_UUID_SIZE);
				6516	map->stripes[i].dev = btrfs_find_device(fs_info, devid,
				6517	uuid, NULL);
				6518	if (!map->stripes[i].dev &&
				6519	!btrfs_test_opt(fs_info, DEGRADED)) {
				6520	free_extent_map(em);
				6521	btrfs_report_missing_device(fs_info, devid, uuid, true);
				6522	return -ENOENT;
				6523	}
				6524	if (!map->stripes[i].dev) {
				6525	map->stripes[i].dev =
				6526	add_missing_dev(fs_info->fs_devices, devid,
				6527	uuid);
				6528	if (IS_ERR(map->stripes[i].dev)) {
				6529	free_extent_map(em);
				6530	btrfs_err(fs_info,
				6531	"failed to init missing dev %llu: %ld",
				6532	devid, PTR_ERR(map->stripes[i].dev));
				6533	return PTR_ERR(map->stripes[i].dev);
				6534	}
				6535	btrfs_report_missing_device(fs_info, devid, uuid, false);
				6536	}
				6537	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
				6538	&(map->stripes[i].dev->dev_state));
				6539
				6540	}
				6541
				6542	write_lock(&map_tree->map_tree.lock);
				6543	ret = add_extent_mapping(&map_tree->map_tree, em, 0);
				6544	write_unlock(&map_tree->map_tree.lock);
				6545	if (ret < 0) {
				6546	btrfs_err(fs_info,
				6547	"failed to add chunk map, start=%llu len=%llu: %d",
				6548	em->start, em->len, ret);
				6549	}
				6550	free_extent_map(em);
				6551
				6552	return ret;
				6553	}
				6554
				6555	static void fill_device_from_item(struct extent_buffer *leaf,
				6556	struct btrfs_dev_item *dev_item,
				6557	struct btrfs_device *device)
				6558	{
				6559	unsigned long ptr;
				6560
				6561	device->devid = btrfs_device_id(leaf, dev_item);
				6562	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
				6563	device->total_bytes = device->disk_total_bytes;
				6564	device->commit_total_bytes = device->disk_total_bytes;
				6565	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
				6566	device->commit_bytes_used = device->bytes_used;
				6567	device->type = btrfs_device_type(leaf, dev_item);
				6568	device->io_align = btrfs_device_io_align(leaf, dev_item);
				6569	device->io_width = btrfs_device_io_width(leaf, dev_item);
				6570	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
				6571	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
				6572	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
				6573
				6574	ptr = btrfs_device_uuid(dev_item);
				6575	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
				6576	}
				6577
				6578	static struct btrfs_fs_devices open_seed_devices(struct btrfs_fs_info fs_info,
				6579	u8 *fsid)
				6580	{
				6581	struct btrfs_fs_devices *fs_devices;
				6582	int ret;
				6583
				6584	lockdep_assert_held(&uuid_mutex);
				6585	ASSERT(fsid);
				6586
				6587	fs_devices = fs_info->fs_devices->seed;
				6588	while (fs_devices) {
				6589	if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
				6590	return fs_devices;
				6591
				6592	fs_devices = fs_devices->seed;
				6593	}
				6594
				6595	fs_devices = find_fsid(fsid);
				6596	if (!fs_devices) {
				6597	if (!btrfs_test_opt(fs_info, DEGRADED))
				6598	return ERR_PTR(-ENOENT);
				6599
				6600	fs_devices = alloc_fs_devices(fsid);
				6601	if (IS_ERR(fs_devices))
				6602	return fs_devices;
				6603
				6604	fs_devices->seeding = 1;
				6605	fs_devices->opened = 1;
				6606	return fs_devices;
				6607	}
				6608
				6609	fs_devices = clone_fs_devices(fs_devices);
				6610	if (IS_ERR(fs_devices))
				6611	return fs_devices;
				6612
				6613	ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
				6614	if (ret) {
				6615	free_fs_devices(fs_devices);
				6616	fs_devices = ERR_PTR(ret);
				6617	goto out;
				6618	}
				6619
				6620	if (!fs_devices->seeding) {
				6621	close_fs_devices(fs_devices);
				6622	free_fs_devices(fs_devices);
				6623	fs_devices = ERR_PTR(-EINVAL);
				6624	goto out;
				6625	}
				6626
				6627	fs_devices->seed = fs_info->fs_devices->seed;
				6628	fs_info->fs_devices->seed = fs_devices;
				6629	out:
				6630	return fs_devices;
				6631	}
				6632
				6633	static int read_one_dev(struct btrfs_fs_info *fs_info,
				6634	struct extent_buffer *leaf,
				6635	struct btrfs_dev_item *dev_item)
				6636	{
				6637	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				6638	struct btrfs_device *device;
				6639	u64 devid;
				6640	int ret;
				6641	u8 fs_uuid[BTRFS_FSID_SIZE];
				6642	u8 dev_uuid[BTRFS_UUID_SIZE];
				6643
				6644	devid = btrfs_device_id(leaf, dev_item);
				6645	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
				6646	BTRFS_UUID_SIZE);
				6647	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
				6648	BTRFS_FSID_SIZE);
				6649
				6650	if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) {
				6651	fs_devices = open_seed_devices(fs_info, fs_uuid);
				6652	if (IS_ERR(fs_devices))
				6653	return PTR_ERR(fs_devices);
				6654	}
				6655
				6656	device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
				6657	if (!device) {
				6658	if (!btrfs_test_opt(fs_info, DEGRADED)) {
				6659	btrfs_report_missing_device(fs_info, devid,
				6660	dev_uuid, true);
				6661	return -ENOENT;
				6662	}
				6663
				6664	device = add_missing_dev(fs_devices, devid, dev_uuid);
				6665	if (IS_ERR(device)) {
				6666	btrfs_err(fs_info,
				6667	"failed to add missing dev %llu: %ld",
				6668	devid, PTR_ERR(device));
				6669	return PTR_ERR(device);
				6670	}
				6671	btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
				6672	} else {
				6673	if (!device->bdev) {
				6674	if (!btrfs_test_opt(fs_info, DEGRADED)) {
				6675	btrfs_report_missing_device(fs_info,
				6676	devid, dev_uuid, true);
				6677	return -ENOENT;
				6678	}
				6679	btrfs_report_missing_device(fs_info, devid,
				6680	dev_uuid, false);
				6681	}
				6682
				6683	if (!device->bdev &&
				6684	!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
				6685	/*
				6686	* this happens when a device that was properly setup
				6687	* in the device info lists suddenly goes bad.
				6688	* device->bdev is NULL, and so we have to set
				6689	* device->missing to one here
				6690	*/
				6691	device->fs_devices->missing_devices++;
				6692	set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
				6693	}
				6694
				6695	/* Move the device to its own fs_devices */
				6696	if (device->fs_devices != fs_devices) {
				6697	ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
				6698	&device->dev_state));
				6699
				6700	list_move(&device->dev_list, &fs_devices->devices);
				6701	device->fs_devices->num_devices--;
				6702	fs_devices->num_devices++;
				6703
				6704	device->fs_devices->missing_devices--;
				6705	fs_devices->missing_devices++;
				6706
				6707	device->fs_devices = fs_devices;
				6708	}
				6709	}
				6710
				6711	if (device->fs_devices != fs_info->fs_devices) {
				6712	BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
				6713	if (device->generation !=
				6714	btrfs_device_generation(leaf, dev_item))
				6715	return -EINVAL;
				6716	}
				6717
				6718	fill_device_from_item(leaf, dev_item, device);
				6719	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
				6720	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
				6721	!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
				6722	device->fs_devices->total_rw_bytes += device->total_bytes;
				6723	atomic64_add(device->total_bytes - device->bytes_used,
				6724	&fs_info->free_chunk_space);
				6725	}
				6726	ret = 0;
				6727	return ret;
				6728	}
				6729
				6730	int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
				6731	{
				6732	struct btrfs_root *root = fs_info->tree_root;
				6733	struct btrfs_super_block *super_copy = fs_info->super_copy;
				6734	struct extent_buffer *sb;
				6735	struct btrfs_disk_key *disk_key;
				6736	struct btrfs_chunk *chunk;
				6737	u8 *array_ptr;
				6738	unsigned long sb_array_offset;
				6739	int ret = 0;
				6740	u32 num_stripes;
				6741	u32 array_size;
				6742	u32 len = 0;
				6743	u32 cur_offset;
				6744	u64 type;
				6745	struct btrfs_key key;
				6746
				6747	ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
				6748	/*
				6749	* This will create extent buffer of nodesize, superblock size is
				6750	* fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
				6751	* overallocate but we can keep it as-is, only the first page is used.
				6752	*/
				6753	sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
				6754	if (IS_ERR(sb))
				6755	return PTR_ERR(sb);
				6756	set_extent_buffer_uptodate(sb);
				6757	btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
				6758	/*
				6759	* The sb extent buffer is artificial and just used to read the system array.
				6760	* set_extent_buffer_uptodate() call does not properly mark all it's
				6761	* pages up-to-date when the page is larger: extent does not cover the
				6762	* whole page and consequently check_page_uptodate does not find all
				6763	* the page's extents up-to-date (the hole beyond sb),
				6764	* write_extent_buffer then triggers a WARN_ON.
				6765	*
				6766	* Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
				6767	* but sb spans only this function. Add an explicit SetPageUptodate call
				6768	* to silence the warning eg. on PowerPC 64.
				6769	*/
				6770	if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
				6771	SetPageUptodate(sb->pages[0]);
				6772
				6773	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
				6774	array_size = btrfs_super_sys_array_size(super_copy);
				6775
				6776	array_ptr = super_copy->sys_chunk_array;
				6777	sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
				6778	cur_offset = 0;
				6779
				6780	while (cur_offset < array_size) {
				6781	disk_key = (struct btrfs_disk_key *)array_ptr;
				6782	len = sizeof(*disk_key);
				6783	if (cur_offset + len > array_size)
				6784	goto out_short_read;
				6785
				6786	btrfs_disk_key_to_cpu(&key, disk_key);
				6787
				6788	array_ptr += len;
				6789	sb_array_offset += len;
				6790	cur_offset += len;
				6791
				6792	if (key.type == BTRFS_CHUNK_ITEM_KEY) {
				6793	chunk = (struct btrfs_chunk *)sb_array_offset;
				6794	/*
				6795	* At least one btrfs_chunk with one stripe must be
				6796	* present, exact stripe count check comes afterwards
				6797	*/
				6798	len = btrfs_chunk_item_size(1);
				6799	if (cur_offset + len > array_size)
				6800	goto out_short_read;
				6801
				6802	num_stripes = btrfs_chunk_num_stripes(sb, chunk);
				6803	if (!num_stripes) {
				6804	btrfs_err(fs_info,
				6805	"invalid number of stripes %u in sys_array at offset %u",
				6806	num_stripes, cur_offset);
				6807	ret = -EIO;
				6808	break;
				6809	}
				6810
				6811	type = btrfs_chunk_type(sb, chunk);
				6812	if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
				6813	btrfs_err(fs_info,
				6814	"invalid chunk type %llu in sys_array at offset %u",
				6815	type, cur_offset);
				6816	ret = -EIO;
				6817	break;
				6818	}
				6819
				6820	len = btrfs_chunk_item_size(num_stripes);
				6821	if (cur_offset + len > array_size)
				6822	goto out_short_read;
				6823
				6824	ret = read_one_chunk(fs_info, &key, sb, chunk);
				6825	if (ret)
				6826	break;
				6827	} else {
				6828	btrfs_err(fs_info,
				6829	"unexpected item type %u in sys_array at offset %u",
				6830	(u32)key.type, cur_offset);
				6831	ret = -EIO;
				6832	break;
				6833	}
				6834	array_ptr += len;
				6835	sb_array_offset += len;
				6836	cur_offset += len;
				6837	}
				6838	clear_extent_buffer_uptodate(sb);
				6839	free_extent_buffer_stale(sb);
				6840	return ret;
				6841
				6842	out_short_read:
				6843	btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
				6844	len, cur_offset);
				6845	clear_extent_buffer_uptodate(sb);
				6846	free_extent_buffer_stale(sb);
				6847	return -EIO;
				6848	}
				6849
				6850	/*
				6851	* Check if all chunks in the fs are OK for read-write degraded mount
				6852	*
				6853	* If the @failing_dev is specified, it's accounted as missing.
				6854	*
				6855	* Return true if all chunks meet the minimal RW mount requirements.
				6856	* Return false if any chunk doesn't meet the minimal RW mount requirements.
				6857	*/
				6858	bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
				6859	struct btrfs_device *failing_dev)
				6860	{
				6861	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
				6862	struct extent_map *em;
				6863	u64 next_start = 0;
				6864	bool ret = true;
				6865
				6866	read_lock(&map_tree->map_tree.lock);
				6867	em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1);
				6868	read_unlock(&map_tree->map_tree.lock);
				6869	/* No chunk at all? Return false anyway */
				6870	if (!em) {
				6871	ret = false;
				6872	goto out;
				6873	}
				6874	while (em) {
				6875	struct map_lookup *map;
				6876	int missing = 0;
				6877	int max_tolerated;
				6878	int i;
				6879
				6880	map = em->map_lookup;
				6881	max_tolerated =
				6882	btrfs_get_num_tolerated_disk_barrier_failures(
				6883	map->type);
				6884	for (i = 0; i < map->num_stripes; i++) {
				6885	struct btrfs_device *dev = map->stripes[i].dev;
				6886
				6887	if (!dev \|\| !dev->bdev \|\|
				6888	test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) \|\|
				6889	dev->last_flush_error)
				6890	missing++;
				6891	else if (failing_dev && failing_dev == dev)
				6892	missing++;
				6893	}
				6894	if (missing > max_tolerated) {
				6895	if (!failing_dev)
				6896	btrfs_warn(fs_info,
				6897	"chunk %llu missing %d devices, max tolerance is %d for writeable mount",
				6898	em->start, missing, max_tolerated);
				6899	free_extent_map(em);
				6900	ret = false;
				6901	goto out;
				6902	}
				6903	next_start = extent_map_end(em);
				6904	free_extent_map(em);
				6905
				6906	read_lock(&map_tree->map_tree.lock);
				6907	em = lookup_extent_mapping(&map_tree->map_tree, next_start,
				6908	(u64)(-1) - next_start);
				6909	read_unlock(&map_tree->map_tree.lock);
				6910	}
				6911	out:
				6912	return ret;
				6913	}
				6914
				6915	int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
				6916	{
				6917	struct btrfs_root *root = fs_info->chunk_root;
				6918	struct btrfs_path *path;
				6919	struct extent_buffer *leaf;
				6920	struct btrfs_key key;
				6921	struct btrfs_key found_key;
				6922	int ret;
				6923	int slot;
				6924	u64 total_dev = 0;
				6925
				6926	path = btrfs_alloc_path();
				6927	if (!path)
				6928	return -ENOMEM;
				6929
				6930	/*
				6931	* uuid_mutex is needed only if we are mounting a sprout FS
				6932	* otherwise we don't need it.
				6933	*/
				6934	mutex_lock(&uuid_mutex);
				6935	mutex_lock(&fs_info->chunk_mutex);
				6936
				6937	/*
				6938	* Read all device items, and then all the chunk items. All
				6939	* device items are found before any chunk item (their object id
				6940	* is smaller than the lowest possible object id for a chunk
				6941	* item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
				6942	*/
				6943	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
				6944	key.offset = 0;
				6945	key.type = 0;
				6946	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				6947	if (ret < 0)
				6948	goto error;
				6949	while (1) {
				6950	leaf = path->nodes[0];
				6951	slot = path->slots[0];
				6952	if (slot >= btrfs_header_nritems(leaf)) {
				6953	ret = btrfs_next_leaf(root, path);
				6954	if (ret == 0)
				6955	continue;
				6956	if (ret < 0)
				6957	goto error;
				6958	break;
				6959	}
				6960	btrfs_item_key_to_cpu(leaf, &found_key, slot);
				6961	if (found_key.type == BTRFS_DEV_ITEM_KEY) {
				6962	struct btrfs_dev_item *dev_item;
				6963	dev_item = btrfs_item_ptr(leaf, slot,
				6964	struct btrfs_dev_item);
				6965	ret = read_one_dev(fs_info, leaf, dev_item);
				6966	if (ret)
				6967	goto error;
				6968	total_dev++;
				6969	} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
				6970	struct btrfs_chunk *chunk;
				6971	chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
				6972	ret = read_one_chunk(fs_info, &found_key, leaf, chunk);
				6973	if (ret)
				6974	goto error;
				6975	}
				6976	path->slots[0]++;
				6977	}
				6978
				6979	/*
				6980	* After loading chunk tree, we've got all device information,
				6981	* do another round of validation checks.
				6982	*/
				6983	if (total_dev != fs_info->fs_devices->total_devices) {
				6984	btrfs_err(fs_info,
				6985	"super_num_devices %llu mismatch with num_devices %llu found here",
				6986	btrfs_super_num_devices(fs_info->super_copy),
				6987	total_dev);
				6988	ret = -EINVAL;
				6989	goto error;
				6990	}
				6991	if (btrfs_super_total_bytes(fs_info->super_copy) <
				6992	fs_info->fs_devices->total_rw_bytes) {
				6993	btrfs_err(fs_info,
				6994	"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
				6995	btrfs_super_total_bytes(fs_info->super_copy),
				6996	fs_info->fs_devices->total_rw_bytes);
				6997	ret = -EINVAL;
				6998	goto error;
				6999	}
				7000	ret = 0;
				7001	error:
				7002	mutex_unlock(&fs_info->chunk_mutex);
				7003	mutex_unlock(&uuid_mutex);
				7004
				7005	btrfs_free_path(path);
				7006	return ret;
				7007	}
				7008
				7009	void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
				7010	{
				7011	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				7012	struct btrfs_device *device;
				7013
				7014	while (fs_devices) {
				7015	mutex_lock(&fs_devices->device_list_mutex);
				7016	list_for_each_entry(device, &fs_devices->devices, dev_list)
				7017	device->fs_info = fs_info;
				7018	mutex_unlock(&fs_devices->device_list_mutex);
				7019
				7020	fs_devices = fs_devices->seed;
				7021	}
				7022	}
				7023
				7024	static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
				7025	{
				7026	int i;
				7027
				7028	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
				7029	btrfs_dev_stat_reset(dev, i);
				7030	}
				7031
				7032	int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
				7033	{
				7034	struct btrfs_key key;
				7035	struct btrfs_key found_key;
				7036	struct btrfs_root *dev_root = fs_info->dev_root;
				7037	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				7038	struct extent_buffer *eb;
				7039	int slot;
				7040	int ret = 0;
				7041	struct btrfs_device *device;
				7042	struct btrfs_path *path = NULL;
				7043	int i;
				7044
				7045	path = btrfs_alloc_path();
				7046	if (!path) {
				7047	ret = -ENOMEM;
				7048	goto out;
				7049	}
				7050
				7051	mutex_lock(&fs_devices->device_list_mutex);
				7052	list_for_each_entry(device, &fs_devices->devices, dev_list) {
				7053	int item_size;
				7054	struct btrfs_dev_stats_item *ptr;
				7055
				7056	key.objectid = BTRFS_DEV_STATS_OBJECTID;
				7057	key.type = BTRFS_PERSISTENT_ITEM_KEY;
				7058	key.offset = device->devid;
				7059	ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
				7060	if (ret) {
				7061	__btrfs_reset_dev_stats(device);
				7062	device->dev_stats_valid = 1;
				7063	btrfs_release_path(path);
				7064	continue;
				7065	}
				7066	slot = path->slots[0];
				7067	eb = path->nodes[0];
				7068	btrfs_item_key_to_cpu(eb, &found_key, slot);
				7069	item_size = btrfs_item_size_nr(eb, slot);
				7070
				7071	ptr = btrfs_item_ptr(eb, slot,
				7072	struct btrfs_dev_stats_item);
				7073
				7074	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
				7075	if (item_size >= (1 + i) * sizeof(__le64))
				7076	btrfs_dev_stat_set(device, i,
				7077	btrfs_dev_stats_value(eb, ptr, i));
				7078	else
				7079	btrfs_dev_stat_reset(device, i);
				7080	}
				7081
				7082	device->dev_stats_valid = 1;
				7083	btrfs_dev_stat_print_on_load(device);
				7084	btrfs_release_path(path);
				7085	}
				7086	mutex_unlock(&fs_devices->device_list_mutex);
				7087
				7088	out:
				7089	btrfs_free_path(path);
				7090	return ret < 0 ? ret : 0;
				7091	}
				7092
				7093	static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				7094	struct btrfs_device *device)
				7095	{
				7096	struct btrfs_fs_info *fs_info = trans->fs_info;
				7097	struct btrfs_root *dev_root = fs_info->dev_root;
				7098	struct btrfs_path *path;
				7099	struct btrfs_key key;
				7100	struct extent_buffer *eb;
				7101	struct btrfs_dev_stats_item *ptr;
				7102	int ret;
				7103	int i;
				7104
				7105	key.objectid = BTRFS_DEV_STATS_OBJECTID;
				7106	key.type = BTRFS_PERSISTENT_ITEM_KEY;
				7107	key.offset = device->devid;
				7108
				7109	path = btrfs_alloc_path();
				7110	if (!path)
				7111	return -ENOMEM;
				7112	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
				7113	if (ret < 0) {
				7114	btrfs_warn_in_rcu(fs_info,
				7115	"error %d while searching for dev_stats item for device %s",
				7116	ret, rcu_str_deref(device->name));
				7117	goto out;
				7118	}
				7119
				7120	if (ret == 0 &&
				7121	btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
				7122	/* need to delete old one and insert a new one */
				7123	ret = btrfs_del_item(trans, dev_root, path);
				7124	if (ret != 0) {
				7125	btrfs_warn_in_rcu(fs_info,
				7126	"delete too small dev_stats item for device %s failed %d",
				7127	rcu_str_deref(device->name), ret);
				7128	goto out;
				7129	}
				7130	ret = 1;
				7131	}
				7132
				7133	if (ret == 1) {
				7134	/* need to insert a new item */
				7135	btrfs_release_path(path);
				7136	ret = btrfs_insert_empty_item(trans, dev_root, path,
				7137	&key, sizeof(*ptr));
				7138	if (ret < 0) {
				7139	btrfs_warn_in_rcu(fs_info,
				7140	"insert dev_stats item for device %s failed %d",
				7141	rcu_str_deref(device->name), ret);
				7142	goto out;
				7143	}
				7144	}
				7145
				7146	eb = path->nodes[0];
				7147	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
				7148	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
				7149	btrfs_set_dev_stats_value(eb, ptr, i,
				7150	btrfs_dev_stat_read(device, i));
				7151	btrfs_mark_buffer_dirty(eb);
				7152
				7153	out:
				7154	btrfs_free_path(path);
				7155	return ret;
				7156	}
				7157
				7158	/*
				7159	* called from commit_transaction. Writes all changed device stats to disk.
				7160	*/
				7161	int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
				7162	struct btrfs_fs_info *fs_info)
				7163	{
				7164	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				7165	struct btrfs_device *device;
				7166	int stats_cnt;
				7167	int ret = 0;
				7168
				7169	mutex_lock(&fs_devices->device_list_mutex);
				7170	list_for_each_entry(device, &fs_devices->devices, dev_list) {
				7171	stats_cnt = atomic_read(&device->dev_stats_ccnt);
				7172	if (!device->dev_stats_valid \|\| stats_cnt == 0)
				7173	continue;
				7174
				7175
				7176	/*
				7177	* There is a LOAD-LOAD control dependency between the value of
				7178	* dev_stats_ccnt and updating the on-disk values which requires
				7179	* reading the in-memory counters. Such control dependencies
				7180	* require explicit read memory barriers.
				7181	*
				7182	* This memory barriers pairs with smp_mb__before_atomic in
				7183	* btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
				7184	* barrier implied by atomic_xchg in
				7185	* btrfs_dev_stats_read_and_reset
				7186	*/
				7187	smp_rmb();
				7188
				7189	ret = update_dev_stat_item(trans, device);
				7190	if (!ret)
				7191	atomic_sub(stats_cnt, &device->dev_stats_ccnt);
				7192	}
				7193	mutex_unlock(&fs_devices->device_list_mutex);
				7194
				7195	return ret;
				7196	}
				7197
				/*
				 * Bump statistics counter @index of @dev, then emit the error summary via
				 * btrfs_dev_stat_print_on_error().
				 */
				void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
				{
					btrfs_dev_stat_inc(dev, index);

					btrfs_dev_stat_print_on_error(dev);
				}
				7203
				7204	static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
				7205	{
				7206	if (!dev->dev_stats_valid)
				7207	return;
				7208	btrfs_err_rl_in_rcu(dev->fs_info,
				7209	"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
				7210	rcu_str_deref(dev->name),
				7211	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
				7212	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
				7213	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
				7214	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
				7215	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
				7216	}
				7217
				7218	static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
				7219	{
				7220	int i;
				7221
				7222	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
				7223	if (btrfs_dev_stat_read(dev, i) != 0)
				7224	break;
				7225	if (i == BTRFS_DEV_STAT_VALUES_MAX)
				7226	return; /* all values == 0, suppress message */
				7227
				7228	btrfs_info_in_rcu(dev->fs_info,
				7229	"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
				7230	rcu_str_deref(dev->name),
				7231	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
				7232	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
				7233	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
				7234	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
				7235	btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
				7236	}
				7237
				7238	int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
				7239	struct btrfs_ioctl_get_dev_stats *stats)
				7240	{
				7241	struct btrfs_device *dev;
				7242	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				7243	int i;
				7244
				7245	mutex_lock(&fs_devices->device_list_mutex);
				7246	dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL);
				7247	mutex_unlock(&fs_devices->device_list_mutex);
				7248
				7249	if (!dev) {
				7250	btrfs_warn(fs_info, "get dev_stats failed, device not found");
				7251	return -ENODEV;
				7252	} else if (!dev->dev_stats_valid) {
				7253	btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
				7254	return -ENODEV;
				7255	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
				7256	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
				7257	if (stats->nr_items > i)
				7258	stats->values[i] =
				7259	btrfs_dev_stat_read_and_reset(dev, i);
				7260	else
				7261	btrfs_dev_stat_reset(dev, i);
				7262	}
				7263	} else {
				7264	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
				7265	if (stats->nr_items > i)
				7266	stats->values[i] = btrfs_dev_stat_read(dev, i);
				7267	}
				7268	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
				7269	stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
				7270	return 0;
				7271	}
				7272
				7273	void btrfs_scratch_superblocks(struct block_device bdev, const char device_path)
				7274	{
				7275	struct buffer_head *bh;
				7276	struct btrfs_super_block *disk_super;
				7277	int copy_num;
				7278
				7279	if (!bdev)
				7280	return;
				7281
				7282	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
				7283	copy_num++) {
				7284
				7285	if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
				7286	continue;
				7287
				7288	disk_super = (struct btrfs_super_block *)bh->b_data;
				7289
				7290	memset(&disk_super->magic, 0, sizeof(disk_super->magic));
				7291	set_buffer_dirty(bh);
				7292	sync_dirty_buffer(bh);
				7293	brelse(bh);
				7294	}
				7295
				7296	/* Notify udev that device has changed */
				7297	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
				7298
				7299	/* Update ctime/mtime for device path for libblkid */
				7300	update_dev_time(device_path);
				7301	}
				7302
				7303	/*
				7304	* Update the size of all devices, which is used for writing out the
				7305	* super blocks.
				7306	*/
				7307	void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
				7308	{
				7309	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				7310	struct btrfs_device curr, next;
				7311
				7312	if (list_empty(&fs_devices->resized_devices))
				7313	return;
				7314
				7315	mutex_lock(&fs_devices->device_list_mutex);
				7316	mutex_lock(&fs_info->chunk_mutex);
				7317	list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
				7318	resized_list) {
				7319	list_del_init(&curr->resized_list);
				7320	curr->commit_total_bytes = curr->disk_total_bytes;
				7321	}
				7322	mutex_unlock(&fs_info->chunk_mutex);
				7323	mutex_unlock(&fs_devices->device_list_mutex);
				7324	}
				7325
				7326	/* Must be invoked during the transaction commit */
				7327	void btrfs_update_commit_device_bytes_used(struct btrfs_transaction *trans)
				7328	{
				7329	struct btrfs_fs_info *fs_info = trans->fs_info;
				7330	struct extent_map *em;
				7331	struct map_lookup *map;
				7332	struct btrfs_device *dev;
				7333	int i;
				7334
				7335	if (list_empty(&trans->pending_chunks))
				7336	return;
				7337
				7338	/* In order to kick the device replace finish process */
				7339	mutex_lock(&fs_info->chunk_mutex);
				7340	list_for_each_entry(em, &trans->pending_chunks, list) {
				7341	map = em->map_lookup;
				7342
				7343	for (i = 0; i < map->num_stripes; i++) {
				7344	dev = map->stripes[i].dev;
				7345	dev->commit_bytes_used = dev->bytes_used;
				7346	dev->has_pending_chunks = false;
				7347	}
				7348	}
				7349	mutex_unlock(&fs_info->chunk_mutex);
				7350	}
				7351
				7352	void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
				7353	{
				7354	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				7355	while (fs_devices) {
				7356	fs_devices->fs_info = fs_info;
				7357	fs_devices = fs_devices->seed;
				7358	}
				7359	}
				7360
				7361	void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
				7362	{
				7363	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				7364	while (fs_devices) {
				7365	fs_devices->fs_info = NULL;
				7366	fs_devices = fs_devices->seed;
				7367	}
				7368	}
				7369
				7370	/*
				7371	* Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
				7372	*/
				7373	int btrfs_bg_type_to_factor(u64 flags)
				7374	{
				7375	if (flags & (BTRFS_BLOCK_GROUP_DUP \| BTRFS_BLOCK_GROUP_RAID1 \|
				7376	BTRFS_BLOCK_GROUP_RAID10))
				7377	return 2;
				7378	return 1;
				7379	}
				7380
				7381
				7382	static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
				7383	{
				7384	int index = btrfs_bg_flags_to_raid_index(type);
				7385	int ncopies = btrfs_raid_array[index].ncopies;
				7386	int data_stripes;
				7387
				7388	switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
				7389	case BTRFS_BLOCK_GROUP_RAID5:
				7390	data_stripes = num_stripes - 1;
				7391	break;
				7392	case BTRFS_BLOCK_GROUP_RAID6:
				7393	data_stripes = num_stripes - 2;
				7394	break;
				7395	default:
				7396	data_stripes = num_stripes / ncopies;
				7397	break;
				7398	}
				7399	return div_u64(chunk_len, data_stripes);
				7400	}
				7401
				7402	static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
				7403	u64 chunk_offset, u64 devid,
				7404	u64 physical_offset, u64 physical_len)
				7405	{
				7406	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
				7407	struct extent_map *em;
				7408	struct map_lookup *map;
				7409	struct btrfs_device *dev;
				7410	u64 stripe_len;
				7411	bool found = false;
				7412	int ret = 0;
				7413	int i;
				7414
				7415	read_lock(&em_tree->lock);
				7416	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
				7417	read_unlock(&em_tree->lock);
				7418
				7419	if (!em) {
				7420	btrfs_err(fs_info,
				7421	"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
				7422	physical_offset, devid);
				7423	ret = -EUCLEAN;
				7424	goto out;
				7425	}
				7426
				7427	map = em->map_lookup;
				7428	stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
				7429	if (physical_len != stripe_len) {
				7430	btrfs_err(fs_info,
				7431	"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
				7432	physical_offset, devid, em->start, physical_len,
				7433	stripe_len);
				7434	ret = -EUCLEAN;
				7435	goto out;
				7436	}
				7437
				7438	for (i = 0; i < map->num_stripes; i++) {
				7439	if (map->stripes[i].dev->devid == devid &&
				7440	map->stripes[i].physical == physical_offset) {
				7441	found = true;
				7442	if (map->verified_stripes >= map->num_stripes) {
				7443	btrfs_err(fs_info,
				7444	"too many dev extents for chunk %llu found",
				7445	em->start);
				7446	ret = -EUCLEAN;
				7447	goto out;
				7448	}
				7449	map->verified_stripes++;
				7450	break;
				7451	}
				7452	}
				7453	if (!found) {
				7454	btrfs_err(fs_info,
				7455	"dev extent physical offset %llu devid %llu has no corresponding chunk",
				7456	physical_offset, devid);
				7457	ret = -EUCLEAN;
				7458	}
				7459
				7460	/* Make sure no dev extent is beyond device bondary */
				7461	dev = btrfs_find_device(fs_info, devid, NULL, NULL);
				7462	if (!dev) {
				7463	btrfs_err(fs_info, "failed to find devid %llu", devid);
				7464	ret = -EUCLEAN;
				7465	goto out;
				7466	}
				7467
				7468	/* It's possible this device is a dummy for seed device */
				7469	if (dev->disk_total_bytes == 0) {
				7470	dev = find_device(fs_info->fs_devices->seed, devid, NULL);
				7471	if (!dev) {
				7472	btrfs_err(fs_info, "failed to find seed devid %llu",
				7473	devid);
				7474	ret = -EUCLEAN;
				7475	goto out;
				7476	}
				7477	}
				7478
				7479	if (physical_offset + physical_len > dev->disk_total_bytes) {
				7480	btrfs_err(fs_info,
				7481	"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
				7482	devid, physical_offset, physical_len,
				7483	dev->disk_total_bytes);
				7484	ret = -EUCLEAN;
				7485	goto out;
				7486	}
				7487	out:
				7488	free_extent_map(em);
				7489	return ret;
				7490	}
				7491
				7492	static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
				7493	{
				7494	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
				7495	struct extent_map *em;
				7496	struct rb_node *node;
				7497	int ret = 0;
				7498
				7499	read_lock(&em_tree->lock);
				7500	for (node = rb_first(&em_tree->map); node; node = rb_next(node)) {
				7501	em = rb_entry(node, struct extent_map, rb_node);
				7502	if (em->map_lookup->num_stripes !=
				7503	em->map_lookup->verified_stripes) {
				7504	btrfs_err(fs_info,
				7505	"chunk %llu has missing dev extent, have %d expect %d",
				7506	em->start, em->map_lookup->verified_stripes,
				7507	em->map_lookup->num_stripes);
				7508	ret = -EUCLEAN;
				7509	goto out;
				7510	}
				7511	}
				7512	out:
				7513	read_unlock(&em_tree->lock);
				7514	return ret;
				7515	}
				7516
				7517	/*
				7518	* Ensure that all dev extents are mapped to correct chunk, otherwise
				7519	* later chunk allocation/free would cause unexpected behavior.
				7520	*
				7521	* NOTE: This will iterate through the whole device tree, which should be of
				7522	* the same size level as the chunk tree. This slightly increases mount time.
				7523	*/
				7524	int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
				7525	{
				7526	struct btrfs_path *path;
				7527	struct btrfs_root *root = fs_info->dev_root;
				7528	struct btrfs_key key;
				7529	u64 prev_devid = 0;
				7530	u64 prev_dev_ext_end = 0;
				7531	int ret = 0;
				7532
				7533	key.objectid = 1;
				7534	key.type = BTRFS_DEV_EXTENT_KEY;
				7535	key.offset = 0;
				7536
				7537	path = btrfs_alloc_path();
				7538	if (!path)
				7539	return -ENOMEM;
				7540
				7541	path->reada = READA_FORWARD;
				7542	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				7543	if (ret < 0)
				7544	goto out;
				7545
				7546	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
				7547	ret = btrfs_next_item(root, path);
				7548	if (ret < 0)
				7549	goto out;
				7550	/* No dev extents at all? Not good */
				7551	if (ret > 0) {
				7552	ret = -EUCLEAN;
				7553	goto out;
				7554	}
				7555	}
				7556	while (1) {
				7557	struct extent_buffer *leaf = path->nodes[0];
				7558	struct btrfs_dev_extent *dext;
				7559	int slot = path->slots[0];
				7560	u64 chunk_offset;
				7561	u64 physical_offset;
				7562	u64 physical_len;
				7563	u64 devid;
				7564
				7565	btrfs_item_key_to_cpu(leaf, &key, slot);
				7566	if (key.type != BTRFS_DEV_EXTENT_KEY)
				7567	break;
				7568	devid = key.objectid;
				7569	physical_offset = key.offset;
				7570
				7571	dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
				7572	chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
				7573	physical_len = btrfs_dev_extent_length(leaf, dext);
				7574
				7575	/* Check if this dev extent overlaps with the previous one */
				7576	if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
				7577	btrfs_err(fs_info,
				7578	"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
				7579	devid, physical_offset, prev_dev_ext_end);
				7580	ret = -EUCLEAN;
				7581	goto out;
				7582	}
				7583
				7584	ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
				7585	physical_offset, physical_len);
				7586	if (ret < 0)
				7587	goto out;
				7588	prev_devid = devid;
				7589	prev_dev_ext_end = physical_offset + physical_len;
				7590
				7591	ret = btrfs_next_item(root, path);
				7592	if (ret < 0)
				7593	goto out;
				7594	if (ret > 0) {
				7595	ret = 0;
				7596	break;
				7597	}
				7598	}
				7599
				7600	/* Ensure all chunks have corresponding dev extents */
				7601	ret = verify_chunk_dev_extent_mapping(fs_info);
				7602	out:
				7603	btrfs_free_path(path);
				7604	return ret;
				7605	}