Blame - src/kernel/linux/v4.14/fs/btrfs/disk-io.c - T103

blob: 495430e4f84bebfa30f8c16328d8c355c67eced1 [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* Copyright (C) 2007 Oracle. All rights reserved.
				3	*
				4	* This program is free software; you can redistribute it and/or
				5	* modify it under the terms of the GNU General Public
				6	* License v2 as published by the Free Software Foundation.
				7	*
				8	* This program is distributed in the hope that it will be useful,
				9	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				11	* General Public License for more details.
				12	*
				13	* You should have received a copy of the GNU General Public
				14	* License along with this program; if not, write to the
				15	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				16	* Boston, MA 021110-1307, USA.
				17	*/
				18
				19	#include <linux/fs.h>
				20	#include <linux/blkdev.h>
				21	#include <linux/scatterlist.h>
				22	#include <linux/swap.h>
				23	#include <linux/radix-tree.h>
				24	#include <linux/writeback.h>
				25	#include <linux/buffer_head.h>
				26	#include <linux/workqueue.h>
				27	#include <linux/kthread.h>
				28	#include <linux/slab.h>
				29	#include <linux/migrate.h>
				30	#include <linux/ratelimit.h>
				31	#include <linux/uuid.h>
				32	#include <linux/semaphore.h>
				33	#include <asm/unaligned.h>
				34	#include "ctree.h"
				35	#include "disk-io.h"
				36	#include "hash.h"
				37	#include "transaction.h"
				38	#include "btrfs_inode.h"
				39	#include "volumes.h"
				40	#include "print-tree.h"
				41	#include "locking.h"
				42	#include "tree-log.h"
				43	#include "free-space-cache.h"
				44	#include "free-space-tree.h"
				45	#include "inode-map.h"
				46	#include "check-integrity.h"
				47	#include "rcu-string.h"
				48	#include "dev-replace.h"
				49	#include "raid56.h"
				50	#include "sysfs.h"
				51	#include "qgroup.h"
				52	#include "compression.h"
				53	#include "tree-checker.h"
				54
				55	#ifdef CONFIG_X86
				56	#include <asm/cpufeature.h>
				57	#endif
				58
				59	#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN \|\
				60	BTRFS_HEADER_FLAG_RELOC \|\
				61	BTRFS_SUPER_FLAG_ERROR \|\
				62	BTRFS_SUPER_FLAG_SEEDING \|\
				63	BTRFS_SUPER_FLAG_METADUMP \|\
				64	BTRFS_SUPER_FLAG_METADUMP_V2)
				65
				66	static const struct extent_io_ops btree_extent_io_ops;
				67	static void end_workqueue_fn(struct btrfs_work *work);
				68	static void free_fs_root(struct btrfs_root *root);
				69	static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info);
				70	static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
				71	static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
				72	struct btrfs_fs_info *fs_info);
				73	static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
				74	static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
				75	struct extent_io_tree *dirty_pages,
				76	int mark);
				77	static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
				78	struct extent_io_tree *pinned_extents);
				79	static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
				80	static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);
				81
				82	/*
				83	* btrfs_end_io_wq structs are used to do processing in task context when an IO
				84	* is complete. This is used during reads to verify checksums, and it is used
				85	* by writes to insert metadata for new file extents after IO is complete.
				86	*/
				87	struct btrfs_end_io_wq {
				88	struct bio *bio;
				89	bio_end_io_t *end_io;
				90	void *private;
				91	struct btrfs_fs_info *info;
				92	blk_status_t status;
				93	enum btrfs_wq_endio_type metadata;
				94	struct btrfs_work work;
				95	};
				96
				97	static struct kmem_cache *btrfs_end_io_wq_cache;
				98
				99	int __init btrfs_end_io_wq_init(void)
				100	{
				101	btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
				102	sizeof(struct btrfs_end_io_wq),
				103	0,
				104	SLAB_MEM_SPREAD,
				105	NULL);
				106	if (!btrfs_end_io_wq_cache)
				107	return -ENOMEM;
				108	return 0;
				109	}
				110
				111	void btrfs_end_io_wq_exit(void)
				112	{
				113	kmem_cache_destroy(btrfs_end_io_wq_cache);
				114	}
				115
				116	/*
				117	* async submit bios are used to offload expensive checksumming
				118	* onto the worker threads. They checksum file and metadata bios
				119	* just before they are sent down the IO stack.
				120	*/
				121	struct async_submit_bio {
				122	void *private_data;
				123	struct btrfs_fs_info *fs_info;
				124	struct bio *bio;
				125	extent_submit_bio_hook_t *submit_bio_start;
				126	extent_submit_bio_hook_t *submit_bio_done;
				127	int mirror_num;
				128	unsigned long bio_flags;
				129	/*
				130	* bio_offset is optional, can be used if the pages in the bio
				131	* can't tell us where in the file the bio should go
				132	*/
				133	u64 bio_offset;
				134	struct btrfs_work work;
				135	blk_status_t status;
				136	};
				137
				138	/*
				139	* Lockdep class keys for extent_buffer->lock's in this root. For a given
				140	* eb, the lockdep key is determined by the btrfs_root it belongs to and
				141	* the level the eb occupies in the tree.
				142	*
				143	* Different roots are used for different purposes and may nest inside each
				144	* other and they require separate keysets. As lockdep keys should be
				145	* static, assign keysets according to the purpose of the root as indicated
				146	* by btrfs_root->objectid. This ensures that all special purpose roots
				147	* have separate keysets.
				148	*
				149	* Lock-nesting across peer nodes is always done with the immediate parent
				150	* node locked thus preventing deadlock. As lockdep doesn't know this, use
				151	* subclass to avoid triggering lockdep warning in such cases.
				152	*
				153	* The key is set by the readpage_end_io_hook after the buffer has passed
				154	* csum validation but before the pages are unlocked. It is also set by
				155	* btrfs_init_new_buffer on freshly allocated blocks.
				156	*
				157	* We also add a check to make sure the highest level of the tree is the
				158	* same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code
				159	* needs update as well.
				160	*/
				161	#ifdef CONFIG_DEBUG_LOCK_ALLOC
				162	# if BTRFS_MAX_LEVEL != 8
				163	# error
				164	# endif
				165
				166	static struct btrfs_lockdep_keyset {
				167	u64 id; /* root objectid */
				168	const char name_stem; / lock name stem */
				169	char names[BTRFS_MAX_LEVEL + 1][20];
				170	struct lock_class_key keys[BTRFS_MAX_LEVEL + 1];
				171	} btrfs_lockdep_keysets[] = {
				172	{ .id = BTRFS_ROOT_TREE_OBJECTID, .name_stem = "root" },
				173	{ .id = BTRFS_EXTENT_TREE_OBJECTID, .name_stem = "extent" },
				174	{ .id = BTRFS_CHUNK_TREE_OBJECTID, .name_stem = "chunk" },
				175	{ .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" },
				176	{ .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" },
				177	{ .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" },
				178	{ .id = BTRFS_QUOTA_TREE_OBJECTID, .name_stem = "quota" },
				179	{ .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" },
				180	{ .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" },
				181	{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" },
				182	{ .id = BTRFS_UUID_TREE_OBJECTID, .name_stem = "uuid" },
				183	{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID, .name_stem = "free-space" },
				184	{ .id = 0, .name_stem = "tree" },
				185	};
				186
				187	void __init btrfs_init_lockdep(void)
				188	{
				189	int i, j;
				190
				191	/* initialize lockdep class names */
				192	for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) {
				193	struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i];
				194
				195	for (j = 0; j < ARRAY_SIZE(ks->names); j++)
				196	snprintf(ks->names[j], sizeof(ks->names[j]),
				197	"btrfs-%s-%02d", ks->name_stem, j);
				198	}
				199	}
				200
				201	void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
				202	int level)
				203	{
				204	struct btrfs_lockdep_keyset *ks;
				205
				206	BUG_ON(level >= ARRAY_SIZE(ks->keys));
				207
				208	/* find the matching keyset, id 0 is the default entry */
				209	for (ks = btrfs_lockdep_keysets; ks->id; ks++)
				210	if (ks->id == objectid)
				211	break;
				212
				213	lockdep_set_class_and_name(&eb->lock,
				214	&ks->keys[level], ks->names[level]);
				215	}
				216
				217	#endif
				218
				219	/*
				220	* extents on the btree inode are pretty simple, there's one extent
				221	* that covers the entire device
				222	*/
				223	static struct extent_map btree_get_extent(struct btrfs_inode inode,
				224	struct page *page, size_t pg_offset, u64 start, u64 len,
				225	int create)
				226	{
				227	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
				228	struct extent_map_tree *em_tree = &inode->extent_tree;
				229	struct extent_map *em;
				230	int ret;
				231
				232	read_lock(&em_tree->lock);
				233	em = lookup_extent_mapping(em_tree, start, len);
				234	if (em) {
				235	em->bdev = fs_info->fs_devices->latest_bdev;
				236	read_unlock(&em_tree->lock);
				237	goto out;
				238	}
				239	read_unlock(&em_tree->lock);
				240
				241	em = alloc_extent_map();
				242	if (!em) {
				243	em = ERR_PTR(-ENOMEM);
				244	goto out;
				245	}
				246	em->start = 0;
				247	em->len = (u64)-1;
				248	em->block_len = (u64)-1;
				249	em->block_start = 0;
				250	em->bdev = fs_info->fs_devices->latest_bdev;
				251
				252	write_lock(&em_tree->lock);
				253	ret = add_extent_mapping(em_tree, em, 0);
				254	if (ret == -EEXIST) {
				255	free_extent_map(em);
				256	em = lookup_extent_mapping(em_tree, start, len);
				257	if (!em)
				258	em = ERR_PTR(-EIO);
				259	} else if (ret) {
				260	free_extent_map(em);
				261	em = ERR_PTR(ret);
				262	}
				263	write_unlock(&em_tree->lock);
				264
				265	out:
				266	return em;
				267	}
				268
				269	u32 btrfs_csum_data(const char *data, u32 seed, size_t len)
				270	{
				271	return btrfs_crc32c(seed, data, len);
				272	}
				273
				274	void btrfs_csum_final(u32 crc, u8 *result)
				275	{
				276	put_unaligned_le32(~crc, result);
				277	}
				278
				279	/*
				280	* compute the csum for a btree block, and either verify it or write it
				281	* into the csum field of the block.
				282	*/
				283	static int csum_tree_block(struct btrfs_fs_info *fs_info,
				284	struct extent_buffer *buf,
				285	int verify)
				286	{
				287	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
				288	char *result = NULL;
				289	unsigned long len;
				290	unsigned long cur_len;
				291	unsigned long offset = BTRFS_CSUM_SIZE;
				292	char *kaddr;
				293	unsigned long map_start;
				294	unsigned long map_len;
				295	int err;
				296	u32 crc = ~(u32)0;
				297	unsigned long inline_result;
				298
				299	len = buf->len - offset;
				300	while (len > 0) {
				301	err = map_private_extent_buffer(buf, offset, 32,
				302	&kaddr, &map_start, &map_len);
				303	if (err)
				304	return err;
				305	cur_len = min(len, map_len - (offset - map_start));
				306	crc = btrfs_csum_data(kaddr + offset - map_start,
				307	crc, cur_len);
				308	len -= cur_len;
				309	offset += cur_len;
				310	}
				311	if (csum_size > sizeof(inline_result)) {
				312	result = kzalloc(csum_size, GFP_NOFS);
				313	if (!result)
				314	return -ENOMEM;
				315	} else {
				316	result = (char *)&inline_result;
				317	}
				318
				319	btrfs_csum_final(crc, result);
				320
				321	if (verify) {
				322	if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
				323	u32 val;
				324	u32 found = 0;
				325	memcpy(&found, result, csum_size);
				326
				327	read_extent_buffer(buf, &val, 0, csum_size);
				328	btrfs_warn_rl(fs_info,
				329	"%s checksum verify failed on %llu wanted %X found %X level %d",
				330	fs_info->sb->s_id, buf->start,
				331	val, found, btrfs_header_level(buf));
				332	if (result != (char *)&inline_result)
				333	kfree(result);
				334	return -EUCLEAN;
				335	}
				336	} else {
				337	write_extent_buffer(buf, result, 0, csum_size);
				338	}
				339	if (result != (char *)&inline_result)
				340	kfree(result);
				341	return 0;
				342	}
				343
				344	/*
				345	* we can't consider a given block up to date unless the transid of the
				346	* block matches the transid in the parent node's pointer. This is how we
				347	* detect blocks that either didn't get written at all or got written
				348	* in the wrong place.
				349	*/
				350	static int verify_parent_transid(struct extent_io_tree *io_tree,
				351	struct extent_buffer *eb, u64 parent_transid,
				352	int atomic)
				353	{
				354	struct extent_state *cached_state = NULL;
				355	int ret;
				356	bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);
				357
				358	if (!parent_transid \|\| btrfs_header_generation(eb) == parent_transid)
				359	return 0;
				360
				361	if (atomic)
				362	return -EAGAIN;
				363
				364	if (need_lock) {
				365	btrfs_tree_read_lock(eb);
				366	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
				367	}
				368
				369	lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
				370	&cached_state);
				371	if (extent_buffer_uptodate(eb) &&
				372	btrfs_header_generation(eb) == parent_transid) {
				373	ret = 0;
				374	goto out;
				375	}
				376	btrfs_err_rl(eb->fs_info,
				377	"parent transid verify failed on %llu wanted %llu found %llu",
				378	eb->start,
				379	parent_transid, btrfs_header_generation(eb));
				380	ret = 1;
				381
				382	/*
				383	* Things reading via commit roots that don't have normal protection,
				384	* like send, can have a really old block in cache that may point at a
				385	* block that has been freed and re-allocated. So don't clear uptodate
				386	* if we find an eb that is under IO (dirty/writeback) because we could
				387	* end up reading in the stale data and then writing it back out and
				388	* making everybody very sad.
				389	*/
				390	if (!extent_buffer_under_io(eb))
				391	clear_extent_buffer_uptodate(eb);
				392	out:
				393	unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
				394	&cached_state, GFP_NOFS);
				395	if (need_lock)
				396	btrfs_tree_read_unlock_blocking(eb);
				397	return ret;
				398	}
				399
				400	/*
				401	* Return 0 if the superblock checksum type matches the checksum value of that
				402	* algorithm. Pass the raw disk superblock data.
				403	*/
				404	static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
				405	char *raw_disk_sb)
				406	{
				407	struct btrfs_super_block *disk_sb =
				408	(struct btrfs_super_block *)raw_disk_sb;
				409	u16 csum_type = btrfs_super_csum_type(disk_sb);
				410	int ret = 0;
				411
				412	if (csum_type == BTRFS_CSUM_TYPE_CRC32) {
				413	u32 crc = ~(u32)0;
				414	const int csum_size = sizeof(crc);
				415	char result[csum_size];
				416
				417	/*
				418	* The super_block structure does not span the whole
				419	* BTRFS_SUPER_INFO_SIZE range, we expect that the unused space
				420	* is filled with zeros and is included in the checksum.
				421	*/
				422	crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE,
				423	crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
				424	btrfs_csum_final(crc, result);
				425
				426	if (memcmp(raw_disk_sb, result, csum_size))
				427	ret = 1;
				428	}
				429
				430	if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) {
				431	btrfs_err(fs_info, "unsupported checksum algorithm %u",
				432	csum_type);
				433	ret = 1;
				434	}
				435
				436	return ret;
				437	}
				438
				439	/*
				440	* helper to read a given tree block, doing retries as required when
				441	* the checksums don't match and we have alternate mirrors to try.
				442	*/
				443	static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
				444	struct extent_buffer *eb,
				445	u64 parent_transid)
				446	{
				447	struct extent_io_tree *io_tree;
				448	int failed = 0;
				449	int ret;
				450	int num_copies = 0;
				451	int mirror_num = 0;
				452	int failed_mirror = 0;
				453
				454	io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
				455	while (1) {
				456	clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
				457	ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE,
				458	btree_get_extent, mirror_num);
				459	if (!ret) {
				460	if (!verify_parent_transid(io_tree, eb,
				461	parent_transid, 0))
				462	break;
				463	else
				464	ret = -EIO;
				465	}
				466
				467	num_copies = btrfs_num_copies(fs_info,
				468	eb->start, eb->len);
				469	if (num_copies == 1)
				470	break;
				471
				472	if (!failed_mirror) {
				473	failed = 1;
				474	failed_mirror = eb->read_mirror;
				475	}
				476
				477	mirror_num++;
				478	if (mirror_num == failed_mirror)
				479	mirror_num++;
				480
				481	if (mirror_num > num_copies)
				482	break;
				483	}
				484
				485	if (failed && !ret && failed_mirror)
				486	repair_eb_io_failure(fs_info, eb, failed_mirror);
				487
				488	return ret;
				489	}
				490
				491	/*
				492	* checksum a dirty tree block before IO. This has extra checks to make sure
				493	* we only fill in the checksum field in the first page of a multi-page block
				494	*/
				495
				496	static int csum_dirty_buffer(struct btrfs_fs_info fs_info, struct page page)
				497	{
				498	u64 start = page_offset(page);
				499	u64 found_start;
				500	struct extent_buffer *eb;
				501
				502	eb = (struct extent_buffer *)page->private;
				503	if (page != eb->pages[0])
				504	return 0;
				505
				506	found_start = btrfs_header_bytenr(eb);
				507	/*
				508	* Please do not consolidate these warnings into a single if.
				509	* It is useful to know what went wrong.
				510	*/
				511	if (WARN_ON(found_start != start))
				512	return -EUCLEAN;
				513	if (WARN_ON(!PageUptodate(page)))
				514	return -EUCLEAN;
				515
				516	ASSERT(memcmp_extent_buffer(eb, fs_info->fsid,
				517	btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0);
				518
				519	return csum_tree_block(fs_info, eb, 0);
				520	}
				521
				522	static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
				523	struct extent_buffer *eb)
				524	{
				525	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				526	u8 fsid[BTRFS_FSID_SIZE];
				527	int ret = 1;
				528
				529	read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);
				530	while (fs_devices) {
				531	if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
				532	ret = 0;
				533	break;
				534	}
				535	fs_devices = fs_devices->seed;
				536	}
				537	return ret;
				538	}
				539
				540	static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
				541	u64 phy_offset, struct page *page,
				542	u64 start, u64 end, int mirror)
				543	{
				544	u64 found_start;
				545	int found_level;
				546	struct extent_buffer *eb;
				547	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
				548	struct btrfs_fs_info *fs_info = root->fs_info;
				549	int ret = 0;
				550	int reads_done;
				551
				552	if (!page->private)
				553	goto out;
				554
				555	eb = (struct extent_buffer *)page->private;
				556
				557	/* the pending IO might have been the only thing that kept this buffer
				558	* in memory. Make sure we have a ref for all this other checks
				559	*/
				560	extent_buffer_get(eb);
				561
				562	reads_done = atomic_dec_and_test(&eb->io_pages);
				563	if (!reads_done)
				564	goto err;
				565
				566	eb->read_mirror = mirror;
				567	if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
				568	ret = -EIO;
				569	goto err;
				570	}
				571
				572	found_start = btrfs_header_bytenr(eb);
				573	if (found_start != eb->start) {
				574	btrfs_err_rl(fs_info, "bad tree block start %llu %llu",
				575	found_start, eb->start);
				576	ret = -EIO;
				577	goto err;
				578	}
				579	if (check_tree_block_fsid(fs_info, eb)) {
				580	btrfs_err_rl(fs_info, "bad fsid on block %llu",
				581	eb->start);
				582	ret = -EIO;
				583	goto err;
				584	}
				585	found_level = btrfs_header_level(eb);
				586	if (found_level >= BTRFS_MAX_LEVEL) {
				587	btrfs_err(fs_info, "bad tree block level %d",
				588	(int)btrfs_header_level(eb));
				589	ret = -EIO;
				590	goto err;
				591	}
				592
				593	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
				594	eb, found_level);
				595
				596	ret = csum_tree_block(fs_info, eb, 1);
				597	if (ret)
				598	goto err;
				599
				600	/*
				601	* If this is a leaf block and it is corrupt, set the corrupt bit so
				602	* that we don't try and read the other copies of this block, just
				603	* return -EIO.
				604	*/
				605	if (found_level == 0 && btrfs_check_leaf_full(root, eb)) {
				606	set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
				607	ret = -EIO;
				608	}
				609
				610	if (found_level > 0 && btrfs_check_node(root, eb))
				611	ret = -EIO;
				612
				613	if (!ret)
				614	set_extent_buffer_uptodate(eb);
				615	err:
				616	if (reads_done &&
				617	test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
				618	btree_readahead_hook(eb, ret);
				619
				620	if (ret) {
				621	/*
				622	* our io error hook is going to dec the io pages
				623	* again, we have to make sure it has something
				624	* to decrement
				625	*/
				626	atomic_inc(&eb->io_pages);
				627	clear_extent_buffer_uptodate(eb);
				628	}
				629	free_extent_buffer(eb);
				630	out:
				631	return ret;
				632	}
				633
				634	static int btree_io_failed_hook(struct page *page, int failed_mirror)
				635	{
				636	struct extent_buffer *eb;
				637
				638	eb = (struct extent_buffer *)page->private;
				639	set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
				640	eb->read_mirror = failed_mirror;
				641	atomic_dec(&eb->io_pages);
				642	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
				643	btree_readahead_hook(eb, -EIO);
				644	return -EIO; /* we fixed nothing */
				645	}
				646
				647	static void end_workqueue_bio(struct bio *bio)
				648	{
				649	struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
				650	struct btrfs_fs_info *fs_info;
				651	struct btrfs_workqueue *wq;
				652	btrfs_work_func_t func;
				653
				654	fs_info = end_io_wq->info;
				655	end_io_wq->status = bio->bi_status;
				656
				657	if (bio_op(bio) == REQ_OP_WRITE) {
				658	if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) {
				659	wq = fs_info->endio_meta_write_workers;
				660	func = btrfs_endio_meta_write_helper;
				661	} else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) {
				662	wq = fs_info->endio_freespace_worker;
				663	func = btrfs_freespace_write_helper;
				664	} else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
				665	wq = fs_info->endio_raid56_workers;
				666	func = btrfs_endio_raid56_helper;
				667	} else {
				668	wq = fs_info->endio_write_workers;
				669	func = btrfs_endio_write_helper;
				670	}
				671	} else {
				672	if (unlikely(end_io_wq->metadata ==
				673	BTRFS_WQ_ENDIO_DIO_REPAIR)) {
				674	wq = fs_info->endio_repair_workers;
				675	func = btrfs_endio_repair_helper;
				676	} else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
				677	wq = fs_info->endio_raid56_workers;
				678	func = btrfs_endio_raid56_helper;
				679	} else if (end_io_wq->metadata) {
				680	wq = fs_info->endio_meta_workers;
				681	func = btrfs_endio_meta_helper;
				682	} else {
				683	wq = fs_info->endio_workers;
				684	func = btrfs_endio_helper;
				685	}
				686	}
				687
				688	btrfs_init_work(&end_io_wq->work, func, end_workqueue_fn, NULL, NULL);
				689	btrfs_queue_work(wq, &end_io_wq->work);
				690	}
				691
				692	blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info info, struct bio bio,
				693	enum btrfs_wq_endio_type metadata)
				694	{
				695	struct btrfs_end_io_wq *end_io_wq;
				696
				697	end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
				698	if (!end_io_wq)
				699	return BLK_STS_RESOURCE;
				700
				701	end_io_wq->private = bio->bi_private;
				702	end_io_wq->end_io = bio->bi_end_io;
				703	end_io_wq->info = info;
				704	end_io_wq->status = 0;
				705	end_io_wq->bio = bio;
				706	end_io_wq->metadata = metadata;
				707
				708	bio->bi_private = end_io_wq;
				709	bio->bi_end_io = end_workqueue_bio;
				710	return 0;
				711	}
				712
				713	unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
				714	{
				715	unsigned long limit = min_t(unsigned long,
				716	info->thread_pool_size,
				717	info->fs_devices->open_devices);
				718	return 256 * limit;
				719	}
				720
				721	static void run_one_async_start(struct btrfs_work *work)
				722	{
				723	struct async_submit_bio *async;
				724	blk_status_t ret;
				725
				726	async = container_of(work, struct async_submit_bio, work);
				727	ret = async->submit_bio_start(async->private_data, async->bio,
				728	async->mirror_num, async->bio_flags,
				729	async->bio_offset);
				730	if (ret)
				731	async->status = ret;
				732	}
				733
				734	static void run_one_async_done(struct btrfs_work *work)
				735	{
				736	struct btrfs_fs_info *fs_info;
				737	struct async_submit_bio *async;
				738	int limit;
				739
				740	async = container_of(work, struct async_submit_bio, work);
				741	fs_info = async->fs_info;
				742
				743	limit = btrfs_async_submit_limit(fs_info);
				744	limit = limit * 2 / 3;
				745
				746	/*
				747	* atomic_dec_return implies a barrier for waitqueue_active
				748	*/
				749	if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
				750	waitqueue_active(&fs_info->async_submit_wait))
				751	wake_up(&fs_info->async_submit_wait);
				752
				753	/* If an error occurred we just want to clean up the bio and move on */
				754	if (async->status) {
				755	async->bio->bi_status = async->status;
				756	bio_endio(async->bio);
				757	return;
				758	}
				759
				760	async->submit_bio_done(async->private_data, async->bio, async->mirror_num,
				761	async->bio_flags, async->bio_offset);
				762	}
				763
				764	static void run_one_async_free(struct btrfs_work *work)
				765	{
				766	struct async_submit_bio *async;
				767
				768	async = container_of(work, struct async_submit_bio, work);
				769	kfree(async);
				770	}
				771
				772	blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info fs_info, struct bio bio,
				773	int mirror_num, unsigned long bio_flags,
				774	u64 bio_offset, void *private_data,
				775	extent_submit_bio_hook_t *submit_bio_start,
				776	extent_submit_bio_hook_t *submit_bio_done)
				777	{
				778	struct async_submit_bio *async;
				779
				780	async = kmalloc(sizeof(*async), GFP_NOFS);
				781	if (!async)
				782	return BLK_STS_RESOURCE;
				783
				784	async->private_data = private_data;
				785	async->fs_info = fs_info;
				786	async->bio = bio;
				787	async->mirror_num = mirror_num;
				788	async->submit_bio_start = submit_bio_start;
				789	async->submit_bio_done = submit_bio_done;
				790
				791	btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start,
				792	run_one_async_done, run_one_async_free);
				793
				794	async->bio_flags = bio_flags;
				795	async->bio_offset = bio_offset;
				796
				797	async->status = 0;
				798
				799	atomic_inc(&fs_info->nr_async_submits);
				800
				801	if (op_is_sync(bio->bi_opf))
				802	btrfs_set_work_high_priority(&async->work);
				803
				804	btrfs_queue_work(fs_info->workers, &async->work);
				805
				806	while (atomic_read(&fs_info->async_submit_draining) &&
				807	atomic_read(&fs_info->nr_async_submits)) {
				808	wait_event(fs_info->async_submit_wait,
				809	(atomic_read(&fs_info->nr_async_submits) == 0));
				810	}
				811
				812	return 0;
				813	}
				814
				815	static blk_status_t btree_csum_one_bio(struct bio *bio)
				816	{
				817	struct bio_vec *bvec;
				818	struct btrfs_root *root;
				819	int i, ret = 0;
				820
				821	ASSERT(!bio_flagged(bio, BIO_CLONED));
				822	bio_for_each_segment_all(bvec, bio, i) {
				823	root = BTRFS_I(bvec->bv_page->mapping->host)->root;
				824	ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
				825	if (ret)
				826	break;
				827	}
				828
				829	return errno_to_blk_status(ret);
				830	}
				831
				832	static blk_status_t __btree_submit_bio_start(void private_data, struct bio bio,
				833	int mirror_num, unsigned long bio_flags,
				834	u64 bio_offset)
				835	{
				836	/*
				837	* when we're called for a write, we're already in the async
				838	* submission context. Just jump into btrfs_map_bio
				839	*/
				840	return btree_csum_one_bio(bio);
				841	}
				842
				843	static blk_status_t __btree_submit_bio_done(void private_data, struct bio bio,
				844	int mirror_num, unsigned long bio_flags,
				845	u64 bio_offset)
				846	{
				847	struct inode *inode = private_data;
				848	blk_status_t ret;
				849
				850	/*
				851	* when we're called for a write, we're already in the async
				852	* submission context. Just jump into btrfs_map_bio
				853	*/
				854	ret = btrfs_map_bio(btrfs_sb(inode->i_sb), bio, mirror_num, 1);
				855	if (ret) {
				856	bio->bi_status = ret;
				857	bio_endio(bio);
				858	}
				859	return ret;
				860	}
				861
				862	static int check_async_write(unsigned long bio_flags)
				863	{
				864	if (bio_flags & EXTENT_BIO_TREE_LOG)
				865	return 0;
				866	#ifdef CONFIG_X86
				867	if (static_cpu_has(X86_FEATURE_XMM4_2))
				868	return 0;
				869	#endif
				870	return 1;
				871	}
				872
				873	static blk_status_t btree_submit_bio_hook(void private_data, struct bio bio,
				874	int mirror_num, unsigned long bio_flags,
				875	u64 bio_offset)
				876	{
				877	struct inode *inode = private_data;
				878	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				879	int async = check_async_write(bio_flags);
				880	blk_status_t ret;
				881
				882	if (bio_op(bio) != REQ_OP_WRITE) {
				883	/*
				884	* called for a read, do the setup so that checksum validation
				885	* can happen in the async kernel threads
				886	*/
				887	ret = btrfs_bio_wq_end_io(fs_info, bio,
				888	BTRFS_WQ_ENDIO_METADATA);
				889	if (ret)
				890	goto out_w_error;
				891	ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
				892	} else if (!async) {
				893	ret = btree_csum_one_bio(bio);
				894	if (ret)
				895	goto out_w_error;
				896	ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
				897	} else {
				898	/*
				899	* kthread helpers are used to submit writes so that
				900	* checksumming can happen in parallel across all CPUs
				901	*/
				902	ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0,
				903	bio_offset, private_data,
				904	__btree_submit_bio_start,
				905	__btree_submit_bio_done);
				906	}
				907
				908	if (ret)
				909	goto out_w_error;
				910	return 0;
				911
				912	out_w_error:
				913	bio->bi_status = ret;
				914	bio_endio(bio);
				915	return ret;
				916	}
				917
				918	#ifdef CONFIG_MIGRATION
				919	static int btree_migratepage(struct address_space *mapping,
				920	struct page newpage, struct page page,
				921	enum migrate_mode mode)
				922	{
				923	/*
				924	* we can't safely write a btree page from here,
				925	* we haven't done the locking hook
				926	*/
				927	if (PageDirty(page))
				928	return -EAGAIN;
				929	/*
				930	* Buffers may be managed in a filesystem specific way.
				931	* We must have no buffers or drop them.
				932	*/
				933	if (page_has_private(page) &&
				934	!try_to_release_page(page, GFP_KERNEL))
				935	return -EAGAIN;
				936	return migrate_page(mapping, newpage, page, mode);
				937	}
				938	#endif
				939
				940
				941	static int btree_writepages(struct address_space *mapping,
				942	struct writeback_control *wbc)
				943	{
				944	struct btrfs_fs_info *fs_info;
				945	int ret;
				946
				947	if (wbc->sync_mode == WB_SYNC_NONE) {
				948
				949	if (wbc->for_kupdate)
				950	return 0;
				951
				952	fs_info = BTRFS_I(mapping->host)->root->fs_info;
				953	/* this is a bit racy, but that's ok */
				954	ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
				955	BTRFS_DIRTY_METADATA_THRESH,
				956	fs_info->dirty_metadata_batch);
				957	if (ret < 0)
				958	return 0;
				959	}
				960	return btree_write_cache_pages(mapping, wbc);
				961	}
				962
				963	static int btree_readpage(struct file file, struct page page)
				964	{
				965	struct extent_io_tree *tree;
				966	tree = &BTRFS_I(page->mapping->host)->io_tree;
				967	return extent_read_full_page(tree, page, btree_get_extent, 0);
				968	}
				969
				970	static int btree_releasepage(struct page *page, gfp_t gfp_flags)
				971	{
				972	if (PageWriteback(page) \|\| PageDirty(page))
				973	return 0;
				974
				975	return try_release_extent_buffer(page);
				976	}
				977
				978	static void btree_invalidatepage(struct page *page, unsigned int offset,
				979	unsigned int length)
				980	{
				981	struct extent_io_tree *tree;
				982	tree = &BTRFS_I(page->mapping->host)->io_tree;
				983	extent_invalidatepage(tree, page, offset);
				984	btree_releasepage(page, GFP_NOFS);
				985	if (PagePrivate(page)) {
				986	btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
				987	"page private not zero on page %llu",
				988	(unsigned long long)page_offset(page));
				989	ClearPagePrivate(page);
				990	set_page_private(page, 0);
				991	put_page(page);
				992	}
				993	}
				994
				995	static int btree_set_page_dirty(struct page *page)
				996	{
				997	#ifdef DEBUG
				998	struct extent_buffer *eb;
				999
				1000	BUG_ON(!PagePrivate(page));
				1001	eb = (struct extent_buffer *)page->private;
				1002	BUG_ON(!eb);
				1003	BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
				1004	BUG_ON(!atomic_read(&eb->refs));
				1005	btrfs_assert_tree_locked(eb);
				1006	#endif
				1007	return __set_page_dirty_nobuffers(page);
				1008	}
				1009
				1010	static const struct address_space_operations btree_aops = {
				1011	.readpage = btree_readpage,
				1012	.writepages = btree_writepages,
				1013	.releasepage = btree_releasepage,
				1014	.invalidatepage = btree_invalidatepage,
				1015	#ifdef CONFIG_MIGRATION
				1016	.migratepage = btree_migratepage,
				1017	#endif
				1018	.set_page_dirty = btree_set_page_dirty,
				1019	};
				1020
				1021	void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr)
				1022	{
				1023	struct extent_buffer *buf = NULL;
				1024	struct inode *btree_inode = fs_info->btree_inode;
				1025
				1026	buf = btrfs_find_create_tree_block(fs_info, bytenr);
				1027	if (IS_ERR(buf))
				1028	return;
				1029	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
				1030	buf, WAIT_NONE, btree_get_extent, 0);
				1031	free_extent_buffer(buf);
				1032	}
				1033
				1034	int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
				1035	int mirror_num, struct extent_buffer **eb)
				1036	{
				1037	struct extent_buffer *buf = NULL;
				1038	struct inode *btree_inode = fs_info->btree_inode;
				1039	struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
				1040	int ret;
				1041
				1042	buf = btrfs_find_create_tree_block(fs_info, bytenr);
				1043	if (IS_ERR(buf))
				1044	return 0;
				1045
				1046	set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
				1047
				1048	ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK,
				1049	btree_get_extent, mirror_num);
				1050	if (ret) {
				1051	free_extent_buffer(buf);
				1052	return ret;
				1053	}
				1054
				1055	if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
				1056	free_extent_buffer(buf);
				1057	return -EIO;
				1058	} else if (extent_buffer_uptodate(buf)) {
				1059	*eb = buf;
				1060	} else {
				1061	free_extent_buffer(buf);
				1062	}
				1063	return 0;
				1064	}
				1065
				1066	struct extent_buffer *btrfs_find_create_tree_block(
				1067	struct btrfs_fs_info *fs_info,
				1068	u64 bytenr)
				1069	{
				1070	if (btrfs_is_testing(fs_info))
				1071	return alloc_test_extent_buffer(fs_info, bytenr);
				1072	return alloc_extent_buffer(fs_info, bytenr);
				1073	}
				1074
				1075
				1076	int btrfs_write_tree_block(struct extent_buffer *buf)
				1077	{
				1078	return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
				1079	buf->start + buf->len - 1);
				1080	}
				1081
				1082	void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
				1083	{
				1084	filemap_fdatawait_range(buf->pages[0]->mapping,
				1085	buf->start, buf->start + buf->len - 1);
				1086	}
				1087
				1088	struct extent_buffer read_tree_block(struct btrfs_fs_info fs_info, u64 bytenr,
				1089	u64 parent_transid)
				1090	{
				1091	struct extent_buffer *buf = NULL;
				1092	int ret;
				1093
				1094	buf = btrfs_find_create_tree_block(fs_info, bytenr);
				1095	if (IS_ERR(buf))
				1096	return buf;
				1097
				1098	ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid);
				1099	if (ret) {
				1100	free_extent_buffer(buf);
				1101	return ERR_PTR(ret);
				1102	}
				1103	return buf;
				1104
				1105	}
				1106
				1107	void clean_tree_block(struct btrfs_fs_info *fs_info,
				1108	struct extent_buffer *buf)
				1109	{
				1110	if (btrfs_header_generation(buf) ==
				1111	fs_info->running_transaction->transid) {
				1112	btrfs_assert_tree_locked(buf);
				1113
				1114	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
				1115	percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
				1116	-buf->len,
				1117	fs_info->dirty_metadata_batch);
				1118	/* ugh, clear_extent_buffer_dirty needs to lock the page */
				1119	btrfs_set_lock_blocking(buf);
				1120	clear_extent_buffer_dirty(buf);
				1121	}
				1122	}
				1123	}
				1124
				1125	static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
				1126	{
				1127	struct btrfs_subvolume_writers *writers;
				1128	int ret;
				1129
				1130	writers = kmalloc(sizeof(*writers), GFP_NOFS);
				1131	if (!writers)
				1132	return ERR_PTR(-ENOMEM);
				1133
				1134	ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS);
				1135	if (ret < 0) {
				1136	kfree(writers);
				1137	return ERR_PTR(ret);
				1138	}
				1139
				1140	init_waitqueue_head(&writers->wait);
				1141	return writers;
				1142	}
				1143
				1144	static void
				1145	btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
				1146	{
				1147	percpu_counter_destroy(&writers->counter);
				1148	kfree(writers);
				1149	}
				1150
				1151	static void __setup_root(struct btrfs_root root, struct btrfs_fs_info fs_info,
				1152	u64 objectid)
				1153	{
				1154	bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
				1155	root->node = NULL;
				1156	root->commit_root = NULL;
				1157	root->state = 0;
				1158	root->orphan_cleanup_state = 0;
				1159
				1160	root->objectid = objectid;
				1161	root->last_trans = 0;
				1162	root->highest_objectid = 0;
				1163	root->nr_delalloc_inodes = 0;
				1164	root->nr_ordered_extents = 0;
				1165	root->name = NULL;
				1166	root->inode_tree = RB_ROOT;
				1167	INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
				1168	root->block_rsv = NULL;
				1169	root->orphan_block_rsv = NULL;
				1170
				1171	INIT_LIST_HEAD(&root->dirty_list);
				1172	INIT_LIST_HEAD(&root->root_list);
				1173	INIT_LIST_HEAD(&root->delalloc_inodes);
				1174	INIT_LIST_HEAD(&root->delalloc_root);
				1175	INIT_LIST_HEAD(&root->ordered_extents);
				1176	INIT_LIST_HEAD(&root->ordered_root);
				1177	INIT_LIST_HEAD(&root->logged_list[0]);
				1178	INIT_LIST_HEAD(&root->logged_list[1]);
				1179	spin_lock_init(&root->orphan_lock);
				1180	spin_lock_init(&root->inode_lock);
				1181	spin_lock_init(&root->delalloc_lock);
				1182	spin_lock_init(&root->ordered_extent_lock);
				1183	spin_lock_init(&root->accounting_lock);
				1184	spin_lock_init(&root->log_extents_lock[0]);
				1185	spin_lock_init(&root->log_extents_lock[1]);
				1186	mutex_init(&root->objectid_mutex);
				1187	mutex_init(&root->log_mutex);
				1188	mutex_init(&root->ordered_extent_mutex);
				1189	mutex_init(&root->delalloc_mutex);
				1190	init_waitqueue_head(&root->log_writer_wait);
				1191	init_waitqueue_head(&root->log_commit_wait[0]);
				1192	init_waitqueue_head(&root->log_commit_wait[1]);
				1193	INIT_LIST_HEAD(&root->log_ctxs[0]);
				1194	INIT_LIST_HEAD(&root->log_ctxs[1]);
				1195	atomic_set(&root->log_commit[0], 0);
				1196	atomic_set(&root->log_commit[1], 0);
				1197	atomic_set(&root->log_writers, 0);
				1198	atomic_set(&root->log_batch, 0);
				1199	atomic_set(&root->orphan_inodes, 0);
				1200	refcount_set(&root->refs, 1);
				1201	atomic_set(&root->will_be_snapshotted, 0);
				1202	atomic64_set(&root->qgroup_meta_rsv, 0);
				1203	root->log_transid = 0;
				1204	root->log_transid_committed = -1;
				1205	root->last_log_commit = 0;
				1206	if (!dummy)
				1207	extent_io_tree_init(&root->dirty_log_pages, NULL);
				1208
				1209	memset(&root->root_key, 0, sizeof(root->root_key));
				1210	memset(&root->root_item, 0, sizeof(root->root_item));
				1211	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
				1212	if (!dummy)
				1213	root->defrag_trans_start = fs_info->generation;
				1214	else
				1215	root->defrag_trans_start = 0;
				1216	root->root_key.objectid = objectid;
				1217	root->anon_dev = 0;
				1218
				1219	spin_lock_init(&root->root_item_lock);
				1220	}
				1221
				1222	static struct btrfs_root btrfs_alloc_root(struct btrfs_fs_info fs_info,
				1223	gfp_t flags)
				1224	{
				1225	struct btrfs_root root = kzalloc(sizeof(root), flags);
				1226	if (root)
				1227	root->fs_info = fs_info;
				1228	return root;
				1229	}
				1230
				1231	#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
				1232	/* Should only be used by the testing infrastructure */
				1233	struct btrfs_root btrfs_alloc_dummy_root(struct btrfs_fs_info fs_info)
				1234	{
				1235	struct btrfs_root *root;
				1236
				1237	if (!fs_info)
				1238	return ERR_PTR(-EINVAL);
				1239
				1240	root = btrfs_alloc_root(fs_info, GFP_KERNEL);
				1241	if (!root)
				1242	return ERR_PTR(-ENOMEM);
				1243
				1244	/* We don't use the stripesize in selftest, set it as sectorsize */
				1245	__setup_root(root, fs_info, BTRFS_ROOT_TREE_OBJECTID);
				1246	root->alloc_bytenr = 0;
				1247
				1248	return root;
				1249	}
				1250	#endif
				1251
				1252	struct btrfs_root btrfs_create_tree(struct btrfs_trans_handle trans,
				1253	struct btrfs_fs_info *fs_info,
				1254	u64 objectid)
				1255	{
				1256	struct extent_buffer *leaf;
				1257	struct btrfs_root *tree_root = fs_info->tree_root;
				1258	struct btrfs_root *root;
				1259	struct btrfs_key key;
				1260	int ret = 0;
				1261	uuid_le uuid;
				1262
				1263	root = btrfs_alloc_root(fs_info, GFP_KERNEL);
				1264	if (!root)
				1265	return ERR_PTR(-ENOMEM);
				1266
				1267	__setup_root(root, fs_info, objectid);
				1268	root->root_key.objectid = objectid;
				1269	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
				1270	root->root_key.offset = 0;
				1271
				1272	leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
				1273	if (IS_ERR(leaf)) {
				1274	ret = PTR_ERR(leaf);
				1275	leaf = NULL;
				1276	goto fail;
				1277	}
				1278
				1279	memzero_extent_buffer(leaf, 0, sizeof(struct btrfs_header));
				1280	btrfs_set_header_bytenr(leaf, leaf->start);
				1281	btrfs_set_header_generation(leaf, trans->transid);
				1282	btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
				1283	btrfs_set_header_owner(leaf, objectid);
				1284	root->node = leaf;
				1285
				1286	write_extent_buffer_fsid(leaf, fs_info->fsid);
				1287	write_extent_buffer_chunk_tree_uuid(leaf, fs_info->chunk_tree_uuid);
				1288	btrfs_mark_buffer_dirty(leaf);
				1289
				1290	root->commit_root = btrfs_root_node(root);
				1291	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
				1292
				1293	root->root_item.flags = 0;
				1294	root->root_item.byte_limit = 0;
				1295	btrfs_set_root_bytenr(&root->root_item, leaf->start);
				1296	btrfs_set_root_generation(&root->root_item, trans->transid);
				1297	btrfs_set_root_level(&root->root_item, 0);
				1298	btrfs_set_root_refs(&root->root_item, 1);
				1299	btrfs_set_root_used(&root->root_item, leaf->len);
				1300	btrfs_set_root_last_snapshot(&root->root_item, 0);
				1301	btrfs_set_root_dirid(&root->root_item, 0);
				1302	uuid_le_gen(&uuid);
				1303	memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE);
				1304	root->root_item.drop_level = 0;
				1305
				1306	key.objectid = objectid;
				1307	key.type = BTRFS_ROOT_ITEM_KEY;
				1308	key.offset = 0;
				1309	ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
				1310	if (ret)
				1311	goto fail;
				1312
				1313	btrfs_tree_unlock(leaf);
				1314
				1315	return root;
				1316
				1317	fail:
				1318	if (leaf) {
				1319	btrfs_tree_unlock(leaf);
				1320	free_extent_buffer(root->commit_root);
				1321	free_extent_buffer(leaf);
				1322	}
				1323	kfree(root);
				1324
				1325	return ERR_PTR(ret);
				1326	}
				1327
				1328	static struct btrfs_root alloc_log_tree(struct btrfs_trans_handle trans,
				1329	struct btrfs_fs_info *fs_info)
				1330	{
				1331	struct btrfs_root *root;
				1332	struct extent_buffer *leaf;
				1333
				1334	root = btrfs_alloc_root(fs_info, GFP_NOFS);
				1335	if (!root)
				1336	return ERR_PTR(-ENOMEM);
				1337
				1338	__setup_root(root, fs_info, BTRFS_TREE_LOG_OBJECTID);
				1339
				1340	root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
				1341	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
				1342	root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
				1343
				1344	/*
				1345	* DON'T set REF_COWS for log trees
				1346	*
				1347	* log trees do not get reference counted because they go away
				1348	* before a real commit is actually done. They do store pointers
				1349	* to file data extents, and those reference counts still get
				1350	* updated (along with back refs to the log tree).
				1351	*/
				1352
				1353	leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
				1354	NULL, 0, 0, 0);
				1355	if (IS_ERR(leaf)) {
				1356	kfree(root);
				1357	return ERR_CAST(leaf);
				1358	}
				1359
				1360	memzero_extent_buffer(leaf, 0, sizeof(struct btrfs_header));
				1361	btrfs_set_header_bytenr(leaf, leaf->start);
				1362	btrfs_set_header_generation(leaf, trans->transid);
				1363	btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
				1364	btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
				1365	root->node = leaf;
				1366
				1367	write_extent_buffer_fsid(root->node, fs_info->fsid);
				1368	btrfs_mark_buffer_dirty(root->node);
				1369	btrfs_tree_unlock(root->node);
				1370	return root;
				1371	}
				1372
				1373	int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
				1374	struct btrfs_fs_info *fs_info)
				1375	{
				1376	struct btrfs_root *log_root;
				1377
				1378	log_root = alloc_log_tree(trans, fs_info);
				1379	if (IS_ERR(log_root))
				1380	return PTR_ERR(log_root);
				1381	WARN_ON(fs_info->log_root_tree);
				1382	fs_info->log_root_tree = log_root;
				1383	return 0;
				1384	}
				1385
				1386	int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
				1387	struct btrfs_root *root)
				1388	{
				1389	struct btrfs_fs_info *fs_info = root->fs_info;
				1390	struct btrfs_root *log_root;
				1391	struct btrfs_inode_item *inode_item;
				1392
				1393	log_root = alloc_log_tree(trans, fs_info);
				1394	if (IS_ERR(log_root))
				1395	return PTR_ERR(log_root);
				1396
				1397	log_root->last_trans = trans->transid;
				1398	log_root->root_key.offset = root->root_key.objectid;
				1399
				1400	inode_item = &log_root->root_item.inode;
				1401	btrfs_set_stack_inode_generation(inode_item, 1);
				1402	btrfs_set_stack_inode_size(inode_item, 3);
				1403	btrfs_set_stack_inode_nlink(inode_item, 1);
				1404	btrfs_set_stack_inode_nbytes(inode_item,
				1405	fs_info->nodesize);
				1406	btrfs_set_stack_inode_mode(inode_item, S_IFDIR \| 0755);
				1407
				1408	btrfs_set_root_node(&log_root->root_item, log_root->node);
				1409
				1410	WARN_ON(root->log_root);
				1411	root->log_root = log_root;
				1412	root->log_transid = 0;
				1413	root->log_transid_committed = -1;
				1414	root->last_log_commit = 0;
				1415	return 0;
				1416	}
				1417
				1418	static struct btrfs_root btrfs_read_tree_root(struct btrfs_root tree_root,
				1419	struct btrfs_key *key)
				1420	{
				1421	struct btrfs_root *root;
				1422	struct btrfs_fs_info *fs_info = tree_root->fs_info;
				1423	struct btrfs_path *path;
				1424	u64 generation;
				1425	int ret;
				1426
				1427	path = btrfs_alloc_path();
				1428	if (!path)
				1429	return ERR_PTR(-ENOMEM);
				1430
				1431	root = btrfs_alloc_root(fs_info, GFP_NOFS);
				1432	if (!root) {
				1433	ret = -ENOMEM;
				1434	goto alloc_fail;
				1435	}
				1436
				1437	__setup_root(root, fs_info, key->objectid);
				1438
				1439	ret = btrfs_find_root(tree_root, key, path,
				1440	&root->root_item, &root->root_key);
				1441	if (ret) {
				1442	if (ret > 0)
				1443	ret = -ENOENT;
				1444	goto find_fail;
				1445	}
				1446
				1447	generation = btrfs_root_generation(&root->root_item);
				1448	root->node = read_tree_block(fs_info,
				1449	btrfs_root_bytenr(&root->root_item),
				1450	generation);
				1451	if (IS_ERR(root->node)) {
				1452	ret = PTR_ERR(root->node);
				1453	goto find_fail;
				1454	} else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
				1455	ret = -EIO;
				1456	free_extent_buffer(root->node);
				1457	goto find_fail;
				1458	}
				1459	root->commit_root = btrfs_root_node(root);
				1460	out:
				1461	btrfs_free_path(path);
				1462	return root;
				1463
				1464	find_fail:
				1465	kfree(root);
				1466	alloc_fail:
				1467	root = ERR_PTR(ret);
				1468	goto out;
				1469	}
				1470
				1471	struct btrfs_root btrfs_read_fs_root(struct btrfs_root tree_root,
				1472	struct btrfs_key *location)
				1473	{
				1474	struct btrfs_root *root;
				1475
				1476	root = btrfs_read_tree_root(tree_root, location);
				1477	if (IS_ERR(root))
				1478	return root;
				1479
				1480	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
				1481	set_bit(BTRFS_ROOT_REF_COWS, &root->state);
				1482	btrfs_check_and_init_root_item(&root->root_item);
				1483	}
				1484
				1485	return root;
				1486	}
				1487
				1488	int btrfs_init_fs_root(struct btrfs_root *root)
				1489	{
				1490	int ret;
				1491	struct btrfs_subvolume_writers *writers;
				1492
				1493	root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
				1494	root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
				1495	GFP_NOFS);
				1496	if (!root->free_ino_pinned \|\| !root->free_ino_ctl) {
				1497	ret = -ENOMEM;
				1498	goto fail;
				1499	}
				1500
				1501	writers = btrfs_alloc_subvolume_writers();
				1502	if (IS_ERR(writers)) {
				1503	ret = PTR_ERR(writers);
				1504	goto fail;
				1505	}
				1506	root->subv_writers = writers;
				1507
				1508	btrfs_init_free_ino_ctl(root);
				1509	spin_lock_init(&root->ino_cache_lock);
				1510	init_waitqueue_head(&root->ino_cache_wait);
				1511
				1512	/*
				1513	* Don't assign anonymous block device to roots that are not exposed to
				1514	* userspace, the id pool is limited to 1M
				1515	*/
				1516	if (is_fstree(root->root_key.objectid) &&
				1517	btrfs_root_refs(&root->root_item) > 0) {
				1518	ret = get_anon_bdev(&root->anon_dev);
				1519	if (ret)
				1520	goto fail;
				1521	}
				1522
				1523	mutex_lock(&root->objectid_mutex);
				1524	ret = btrfs_find_highest_objectid(root,
				1525	&root->highest_objectid);
				1526	if (ret) {
				1527	mutex_unlock(&root->objectid_mutex);
				1528	goto fail;
				1529	}
				1530
				1531	ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
				1532
				1533	mutex_unlock(&root->objectid_mutex);
				1534
				1535	return 0;
				1536	fail:
				1537	/* the caller is responsible to call free_fs_root */
				1538	return ret;
				1539	}
				1540
				1541	struct btrfs_root btrfs_lookup_fs_root(struct btrfs_fs_info fs_info,
				1542	u64 root_id)
				1543	{
				1544	struct btrfs_root *root;
				1545
				1546	spin_lock(&fs_info->fs_roots_radix_lock);
				1547	root = radix_tree_lookup(&fs_info->fs_roots_radix,
				1548	(unsigned long)root_id);
				1549	spin_unlock(&fs_info->fs_roots_radix_lock);
				1550	return root;
				1551	}
				1552
				1553	int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
				1554	struct btrfs_root *root)
				1555	{
				1556	int ret;
				1557
				1558	ret = radix_tree_preload(GFP_NOFS);
				1559	if (ret)
				1560	return ret;
				1561
				1562	spin_lock(&fs_info->fs_roots_radix_lock);
				1563	ret = radix_tree_insert(&fs_info->fs_roots_radix,
				1564	(unsigned long)root->root_key.objectid,
				1565	root);
				1566	if (ret == 0)
				1567	set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
				1568	spin_unlock(&fs_info->fs_roots_radix_lock);
				1569	radix_tree_preload_end();
				1570
				1571	return ret;
				1572	}
				1573
				1574	struct btrfs_root btrfs_get_fs_root(struct btrfs_fs_info fs_info,
				1575	struct btrfs_key *location,
				1576	bool check_ref)
				1577	{
				1578	struct btrfs_root *root;
				1579	struct btrfs_path *path;
				1580	struct btrfs_key key;
				1581	int ret;
				1582
				1583	if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
				1584	return fs_info->tree_root;
				1585	if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
				1586	return fs_info->extent_root;
				1587	if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
				1588	return fs_info->chunk_root;
				1589	if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
				1590	return fs_info->dev_root;
				1591	if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
				1592	return fs_info->csum_root;
				1593	if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
				1594	return fs_info->quota_root ? fs_info->quota_root :
				1595	ERR_PTR(-ENOENT);
				1596	if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
				1597	return fs_info->uuid_root ? fs_info->uuid_root :
				1598	ERR_PTR(-ENOENT);
				1599	if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
				1600	return fs_info->free_space_root ? fs_info->free_space_root :
				1601	ERR_PTR(-ENOENT);
				1602	again:
				1603	root = btrfs_lookup_fs_root(fs_info, location->objectid);
				1604	if (root) {
				1605	if (check_ref && btrfs_root_refs(&root->root_item) == 0)
				1606	return ERR_PTR(-ENOENT);
				1607	return root;
				1608	}
				1609
				1610	root = btrfs_read_fs_root(fs_info->tree_root, location);
				1611	if (IS_ERR(root))
				1612	return root;
				1613
				1614	if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
				1615	ret = -ENOENT;
				1616	goto fail;
				1617	}
				1618
				1619	ret = btrfs_init_fs_root(root);
				1620	if (ret)
				1621	goto fail;
				1622
				1623	path = btrfs_alloc_path();
				1624	if (!path) {
				1625	ret = -ENOMEM;
				1626	goto fail;
				1627	}
				1628	key.objectid = BTRFS_ORPHAN_OBJECTID;
				1629	key.type = BTRFS_ORPHAN_ITEM_KEY;
				1630	key.offset = location->objectid;
				1631
				1632	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
				1633	btrfs_free_path(path);
				1634	if (ret < 0)
				1635	goto fail;
				1636	if (ret == 0)
				1637	set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
				1638
				1639	ret = btrfs_insert_fs_root(fs_info, root);
				1640	if (ret) {
				1641	if (ret == -EEXIST) {
				1642	free_fs_root(root);
				1643	goto again;
				1644	}
				1645	goto fail;
				1646	}
				1647	return root;
				1648	fail:
				1649	free_fs_root(root);
				1650	return ERR_PTR(ret);
				1651	}
				1652
				1653	static int btrfs_congested_fn(void *congested_data, int bdi_bits)
				1654	{
				1655	struct btrfs_fs_info info = (struct btrfs_fs_info )congested_data;
				1656	int ret = 0;
				1657	struct btrfs_device *device;
				1658	struct backing_dev_info *bdi;
				1659
				1660	rcu_read_lock();
				1661	list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) {
				1662	if (!device->bdev)
				1663	continue;
				1664	bdi = device->bdev->bd_bdi;
				1665	if (bdi_congested(bdi, bdi_bits)) {
				1666	ret = 1;
				1667	break;
				1668	}
				1669	}
				1670	rcu_read_unlock();
				1671	return ret;
				1672	}
				1673
				1674	/*
				1675	* called by the kthread helper functions to finally call the bio end_io
				1676	* functions. This is where read checksum verification actually happens
				1677	*/
				1678	static void end_workqueue_fn(struct btrfs_work *work)
				1679	{
				1680	struct bio *bio;
				1681	struct btrfs_end_io_wq *end_io_wq;
				1682
				1683	end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
				1684	bio = end_io_wq->bio;
				1685
				1686	bio->bi_status = end_io_wq->status;
				1687	bio->bi_private = end_io_wq->private;
				1688	bio->bi_end_io = end_io_wq->end_io;
				1689	bio_endio(bio);
				1690	kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
				1691	}
				1692
				1693	static int cleaner_kthread(void *arg)
				1694	{
				1695	struct btrfs_root *root = arg;
				1696	struct btrfs_fs_info *fs_info = root->fs_info;
				1697	int again;
				1698
				1699	while (1) {
				1700	again = 0;
				1701
				1702	/* Make the cleaner go to sleep early. */
				1703	if (btrfs_need_cleaner_sleep(fs_info))
				1704	goto sleep;
				1705
				1706	/*
				1707	* Do not do anything if we might cause open_ctree() to block
				1708	* before we have finished mounting the filesystem.
				1709	*/
				1710	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
				1711	goto sleep;
				1712
				1713	if (!mutex_trylock(&fs_info->cleaner_mutex))
				1714	goto sleep;
				1715
				1716	/*
				1717	* Avoid the problem that we change the status of the fs
				1718	* during the above check and trylock.
				1719	*/
				1720	if (btrfs_need_cleaner_sleep(fs_info)) {
				1721	mutex_unlock(&fs_info->cleaner_mutex);
				1722	goto sleep;
				1723	}
				1724
				1725	mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
				1726	btrfs_run_delayed_iputs(fs_info);
				1727	mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
				1728
				1729	again = btrfs_clean_one_deleted_snapshot(root);
				1730	mutex_unlock(&fs_info->cleaner_mutex);
				1731
				1732	/*
				1733	* The defragger has dealt with the R/O remount and umount,
				1734	* needn't do anything special here.
				1735	*/
				1736	btrfs_run_defrag_inodes(fs_info);
				1737
				1738	/*
				1739	* Acquires fs_info->delete_unused_bgs_mutex to avoid racing
				1740	* with relocation (btrfs_relocate_chunk) and relocation
				1741	* acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
				1742	* after acquiring fs_info->delete_unused_bgs_mutex. So we
				1743	* can't hold, nor need to, fs_info->cleaner_mutex when deleting
				1744	* unused block groups.
				1745	*/
				1746	btrfs_delete_unused_bgs(fs_info);
				1747	sleep:
				1748	if (kthread_should_park())
				1749	kthread_parkme();
				1750	if (kthread_should_stop())
				1751	return 0;
				1752	if (!again) {
				1753	set_current_state(TASK_INTERRUPTIBLE);
				1754	schedule();
				1755	__set_current_state(TASK_RUNNING);
				1756	}
				1757	}
				1758	}
				1759
				1760	static int transaction_kthread(void *arg)
				1761	{
				1762	struct btrfs_root *root = arg;
				1763	struct btrfs_fs_info *fs_info = root->fs_info;
				1764	struct btrfs_trans_handle *trans;
				1765	struct btrfs_transaction *cur;
				1766	u64 transid;
				1767	unsigned long now;
				1768	unsigned long delay;
				1769	bool cannot_commit;
				1770
				1771	do {
				1772	cannot_commit = false;
				1773	delay = HZ * fs_info->commit_interval;
				1774	mutex_lock(&fs_info->transaction_kthread_mutex);
				1775
				1776	spin_lock(&fs_info->trans_lock);
				1777	cur = fs_info->running_transaction;
				1778	if (!cur) {
				1779	spin_unlock(&fs_info->trans_lock);
				1780	goto sleep;
				1781	}
				1782
				1783	now = get_seconds();
				1784	if (cur->state < TRANS_STATE_BLOCKED &&
				1785	(now < cur->start_time \|\|
				1786	now - cur->start_time < fs_info->commit_interval)) {
				1787	spin_unlock(&fs_info->trans_lock);
				1788	delay = HZ * 5;
				1789	goto sleep;
				1790	}
				1791	transid = cur->transid;
				1792	spin_unlock(&fs_info->trans_lock);
				1793
				1794	/* If the file system is aborted, this will always fail. */
				1795	trans = btrfs_attach_transaction(root);
				1796	if (IS_ERR(trans)) {
				1797	if (PTR_ERR(trans) != -ENOENT)
				1798	cannot_commit = true;
				1799	goto sleep;
				1800	}
				1801	if (transid == trans->transid) {
				1802	btrfs_commit_transaction(trans);
				1803	} else {
				1804	btrfs_end_transaction(trans);
				1805	}
				1806	sleep:
				1807	wake_up_process(fs_info->cleaner_kthread);
				1808	mutex_unlock(&fs_info->transaction_kthread_mutex);
				1809
				1810	if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
				1811	&fs_info->fs_state)))
				1812	btrfs_cleanup_transaction(fs_info);
				1813	set_current_state(TASK_INTERRUPTIBLE);
				1814	if (!kthread_should_stop() &&
				1815	(!btrfs_transaction_blocked(fs_info) \|\|
				1816	cannot_commit))
				1817	schedule_timeout(delay);
				1818	__set_current_state(TASK_RUNNING);
				1819	} while (!kthread_should_stop());
				1820	return 0;
				1821	}
				1822
				1823	/*
				1824	* this will find the highest generation in the array of
				1825	* root backups. The index of the highest array is returned,
				1826	* or -1 if we can't find anything.
				1827	*
				1828	* We check to make sure the array is valid by comparing the
				1829	* generation of the latest root in the array with the generation
				1830	* in the super block. If they don't match we pitch it.
				1831	*/
				1832	static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
				1833	{
				1834	u64 cur;
				1835	int newest_index = -1;
				1836	struct btrfs_root_backup *root_backup;
				1837	int i;
				1838
				1839	for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
				1840	root_backup = info->super_copy->super_roots + i;
				1841	cur = btrfs_backup_tree_root_gen(root_backup);
				1842	if (cur == newest_gen)
				1843	newest_index = i;
				1844	}
				1845
				1846	/* check to see if we actually wrapped around */
				1847	if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
				1848	root_backup = info->super_copy->super_roots;
				1849	cur = btrfs_backup_tree_root_gen(root_backup);
				1850	if (cur == newest_gen)
				1851	newest_index = 0;
				1852	}
				1853	return newest_index;
				1854	}
				1855
				1856
				1857	/*
				1858	* find the oldest backup so we know where to store new entries
				1859	* in the backup array. This will set the backup_root_index
				1860	* field in the fs_info struct
				1861	*/
				1862	static void find_oldest_super_backup(struct btrfs_fs_info *info,
				1863	u64 newest_gen)
				1864	{
				1865	int newest_index = -1;
				1866
				1867	newest_index = find_newest_super_backup(info, newest_gen);
				1868	/* if there was garbage in there, just move along */
				1869	if (newest_index == -1) {
				1870	info->backup_root_index = 0;
				1871	} else {
				1872	info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
				1873	}
				1874	}
				1875
				1876	/*
				1877	* copy all the root pointers into the super backup array.
				1878	* this will bump the backup pointer by one when it is
				1879	* done
				1880	*/
				1881	static void backup_super_roots(struct btrfs_fs_info *info)
				1882	{
				1883	int next_backup;
				1884	struct btrfs_root_backup *root_backup;
				1885	int last_backup;
				1886
				1887	next_backup = info->backup_root_index;
				1888	last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
				1889	BTRFS_NUM_BACKUP_ROOTS;
				1890
				1891	/*
				1892	* just overwrite the last backup if we're at the same generation
				1893	* this happens only at umount
				1894	*/
				1895	root_backup = info->super_for_commit->super_roots + last_backup;
				1896	if (btrfs_backup_tree_root_gen(root_backup) ==
				1897	btrfs_header_generation(info->tree_root->node))
				1898	next_backup = last_backup;
				1899
				1900	root_backup = info->super_for_commit->super_roots + next_backup;
				1901
				1902	/*
				1903	* make sure all of our padding and empty slots get zero filled
				1904	* regardless of which ones we use today
				1905	*/
				1906	memset(root_backup, 0, sizeof(*root_backup));
				1907
				1908	info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
				1909
				1910	btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
				1911	btrfs_set_backup_tree_root_gen(root_backup,
				1912	btrfs_header_generation(info->tree_root->node));
				1913
				1914	btrfs_set_backup_tree_root_level(root_backup,
				1915	btrfs_header_level(info->tree_root->node));
				1916
				1917	btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
				1918	btrfs_set_backup_chunk_root_gen(root_backup,
				1919	btrfs_header_generation(info->chunk_root->node));
				1920	btrfs_set_backup_chunk_root_level(root_backup,
				1921	btrfs_header_level(info->chunk_root->node));
				1922
				1923	btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
				1924	btrfs_set_backup_extent_root_gen(root_backup,
				1925	btrfs_header_generation(info->extent_root->node));
				1926	btrfs_set_backup_extent_root_level(root_backup,
				1927	btrfs_header_level(info->extent_root->node));
				1928
				1929	/*
				1930	* we might commit during log recovery, which happens before we set
				1931	* the fs_root. Make sure it is valid before we fill it in.
				1932	*/
				1933	if (info->fs_root && info->fs_root->node) {
				1934	btrfs_set_backup_fs_root(root_backup,
				1935	info->fs_root->node->start);
				1936	btrfs_set_backup_fs_root_gen(root_backup,
				1937	btrfs_header_generation(info->fs_root->node));
				1938	btrfs_set_backup_fs_root_level(root_backup,
				1939	btrfs_header_level(info->fs_root->node));
				1940	}
				1941
				1942	btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
				1943	btrfs_set_backup_dev_root_gen(root_backup,
				1944	btrfs_header_generation(info->dev_root->node));
				1945	btrfs_set_backup_dev_root_level(root_backup,
				1946	btrfs_header_level(info->dev_root->node));
				1947
				1948	btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
				1949	btrfs_set_backup_csum_root_gen(root_backup,
				1950	btrfs_header_generation(info->csum_root->node));
				1951	btrfs_set_backup_csum_root_level(root_backup,
				1952	btrfs_header_level(info->csum_root->node));
				1953
				1954	btrfs_set_backup_total_bytes(root_backup,
				1955	btrfs_super_total_bytes(info->super_copy));
				1956	btrfs_set_backup_bytes_used(root_backup,
				1957	btrfs_super_bytes_used(info->super_copy));
				1958	btrfs_set_backup_num_devices(root_backup,
				1959	btrfs_super_num_devices(info->super_copy));
				1960
				1961	/*
				1962	* if we don't copy this out to the super_copy, it won't get remembered
				1963	* for the next commit
				1964	*/
				1965	memcpy(&info->super_copy->super_roots,
				1966	&info->super_for_commit->super_roots,
				1967	sizeof(root_backup) BTRFS_NUM_BACKUP_ROOTS);
				1968	}
				1969
				1970	/*
				1971	* this copies info out of the root backup array and back into
				1972	* the in-memory super block. It is meant to help iterate through
				1973	* the array, so you send it the number of backups you've already
				1974	* tried and the last backup index you used.
				1975	*
				1976	* this returns -1 when it has tried all the backups
				1977	*/
				1978	static noinline int next_root_backup(struct btrfs_fs_info *info,
				1979	struct btrfs_super_block *super,
				1980	int num_backups_tried, int backup_index)
				1981	{
				1982	struct btrfs_root_backup *root_backup;
				1983	int newest = *backup_index;
				1984
				1985	if (*num_backups_tried == 0) {
				1986	u64 gen = btrfs_super_generation(super);
				1987
				1988	newest = find_newest_super_backup(info, gen);
				1989	if (newest == -1)
				1990	return -1;
				1991
				1992	*backup_index = newest;
				1993	*num_backups_tried = 1;
				1994	} else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
				1995	/* we've tried all the backups, all done */
				1996	return -1;
				1997	} else {
				1998	/* jump to the next oldest backup */
				1999	newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
				2000	BTRFS_NUM_BACKUP_ROOTS;
				2001	*backup_index = newest;
				2002	*num_backups_tried += 1;
				2003	}
				2004	root_backup = super->super_roots + newest;
				2005
				2006	btrfs_set_super_generation(super,
				2007	btrfs_backup_tree_root_gen(root_backup));
				2008	btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
				2009	btrfs_set_super_root_level(super,
				2010	btrfs_backup_tree_root_level(root_backup));
				2011	btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
				2012
				2013	/*
				2014	* fixme: the total bytes and num_devices need to match or we should
				2015	* need a fsck
				2016	*/
				2017	btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
				2018	btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
				2019	return 0;
				2020	}
				2021
				2022	/* helper to cleanup workers */
				2023	static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
				2024	{
				2025	btrfs_destroy_workqueue(fs_info->fixup_workers);
				2026	btrfs_destroy_workqueue(fs_info->delalloc_workers);
				2027	btrfs_destroy_workqueue(fs_info->workers);
				2028	btrfs_destroy_workqueue(fs_info->endio_workers);
				2029	btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
				2030	btrfs_destroy_workqueue(fs_info->endio_repair_workers);
				2031	btrfs_destroy_workqueue(fs_info->rmw_workers);
				2032	btrfs_destroy_workqueue(fs_info->endio_write_workers);
				2033	btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
				2034	btrfs_destroy_workqueue(fs_info->submit_workers);
				2035	btrfs_destroy_workqueue(fs_info->delayed_workers);
				2036	btrfs_destroy_workqueue(fs_info->caching_workers);
				2037	btrfs_destroy_workqueue(fs_info->readahead_workers);
				2038	btrfs_destroy_workqueue(fs_info->flush_workers);
				2039	btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
				2040	btrfs_destroy_workqueue(fs_info->extent_workers);
				2041	/*
				2042	* Now that all other work queues are destroyed, we can safely destroy
				2043	* the queues used for metadata I/O, since tasks from those other work
				2044	* queues can do metadata I/O operations.
				2045	*/
				2046	btrfs_destroy_workqueue(fs_info->endio_meta_workers);
				2047	btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
				2048	}
				2049
				2050	static void free_root_extent_buffers(struct btrfs_root *root)
				2051	{
				2052	if (root) {
				2053	free_extent_buffer(root->node);
				2054	free_extent_buffer(root->commit_root);
				2055	root->node = NULL;
				2056	root->commit_root = NULL;
				2057	}
				2058	}
				2059
				2060	/* helper to cleanup tree roots */
				2061	static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
				2062	{
				2063	free_root_extent_buffers(info->tree_root);
				2064
				2065	free_root_extent_buffers(info->dev_root);
				2066	free_root_extent_buffers(info->extent_root);
				2067	free_root_extent_buffers(info->csum_root);
				2068	free_root_extent_buffers(info->quota_root);
				2069	free_root_extent_buffers(info->uuid_root);
				2070	if (free_chunk_root)
				2071	free_root_extent_buffers(info->chunk_root);
				2072	free_root_extent_buffers(info->free_space_root);
				2073	}
				2074
				2075	void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
				2076	{
				2077	int ret;
				2078	struct btrfs_root *gang[8];
				2079	int i;
				2080
				2081	while (!list_empty(&fs_info->dead_roots)) {
				2082	gang[0] = list_entry(fs_info->dead_roots.next,
				2083	struct btrfs_root, root_list);
				2084	list_del(&gang[0]->root_list);
				2085
				2086	if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) {
				2087	btrfs_drop_and_free_fs_root(fs_info, gang[0]);
				2088	} else {
				2089	free_extent_buffer(gang[0]->node);
				2090	free_extent_buffer(gang[0]->commit_root);
				2091	btrfs_put_fs_root(gang[0]);
				2092	}
				2093	}
				2094
				2095	while (1) {
				2096	ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
				2097	(void **)gang, 0,
				2098	ARRAY_SIZE(gang));
				2099	if (!ret)
				2100	break;
				2101	for (i = 0; i < ret; i++)
				2102	btrfs_drop_and_free_fs_root(fs_info, gang[i]);
				2103	}
				2104
				2105	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
				2106	btrfs_free_log_root_tree(NULL, fs_info);
				2107	btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents);
				2108	}
				2109	}
				2110
				2111	static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
				2112	{
				2113	mutex_init(&fs_info->scrub_lock);
				2114	atomic_set(&fs_info->scrubs_running, 0);
				2115	atomic_set(&fs_info->scrub_pause_req, 0);
				2116	atomic_set(&fs_info->scrubs_paused, 0);
				2117	atomic_set(&fs_info->scrub_cancel_req, 0);
				2118	init_waitqueue_head(&fs_info->scrub_pause_wait);
				2119	fs_info->scrub_workers_refcnt = 0;
				2120	}
				2121
				2122	static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
				2123	{
				2124	spin_lock_init(&fs_info->balance_lock);
				2125	mutex_init(&fs_info->balance_mutex);
				2126	atomic_set(&fs_info->balance_running, 0);
				2127	atomic_set(&fs_info->balance_pause_req, 0);
				2128	atomic_set(&fs_info->balance_cancel_req, 0);
				2129	fs_info->balance_ctl = NULL;
				2130	init_waitqueue_head(&fs_info->balance_wait_q);
				2131	}
				2132
				2133	static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
				2134	{
				2135	struct inode *inode = fs_info->btree_inode;
				2136
				2137	inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
				2138	set_nlink(inode, 1);
				2139	/*
				2140	* we set the i_size on the btree inode to the max possible int.
				2141	* the real end of the address space is determined by all of
				2142	* the devices in the system
				2143	*/
				2144	inode->i_size = OFFSET_MAX;
				2145	inode->i_mapping->a_ops = &btree_aops;
				2146
				2147	RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
				2148	extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode);
				2149	BTRFS_I(inode)->io_tree.track_uptodate = 0;
				2150	extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
				2151
				2152	BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops;
				2153
				2154	BTRFS_I(inode)->root = fs_info->tree_root;
				2155	memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key));
				2156	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
				2157	btrfs_insert_inode_hash(inode);
				2158	}
				2159
				2160	static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
				2161	{
				2162	fs_info->dev_replace.lock_owner = 0;
				2163	atomic_set(&fs_info->dev_replace.nesting_level, 0);
				2164	mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
				2165	rwlock_init(&fs_info->dev_replace.lock);
				2166	atomic_set(&fs_info->dev_replace.read_locks, 0);
				2167	atomic_set(&fs_info->dev_replace.blocking_readers, 0);
				2168	init_waitqueue_head(&fs_info->replace_wait);
				2169	init_waitqueue_head(&fs_info->dev_replace.read_lock_wq);
				2170	}
				2171
				2172	static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
				2173	{
				2174	spin_lock_init(&fs_info->qgroup_lock);
				2175	mutex_init(&fs_info->qgroup_ioctl_lock);
				2176	fs_info->qgroup_tree = RB_ROOT;
				2177	fs_info->qgroup_op_tree = RB_ROOT;
				2178	INIT_LIST_HEAD(&fs_info->dirty_qgroups);
				2179	fs_info->qgroup_seq = 1;
				2180	fs_info->qgroup_ulist = NULL;
				2181	fs_info->qgroup_rescan_running = false;
				2182	mutex_init(&fs_info->qgroup_rescan_lock);
				2183	}
				2184
				2185	static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
				2186	struct btrfs_fs_devices *fs_devices)
				2187	{
				2188	int max_active = fs_info->thread_pool_size;
				2189	unsigned int flags = WQ_MEM_RECLAIM \| WQ_FREEZABLE \| WQ_UNBOUND;
				2190
				2191	fs_info->workers =
				2192	btrfs_alloc_workqueue(fs_info, "worker",
				2193	flags \| WQ_HIGHPRI, max_active, 16);
				2194
				2195	fs_info->delalloc_workers =
				2196	btrfs_alloc_workqueue(fs_info, "delalloc",
				2197	flags, max_active, 2);
				2198
				2199	fs_info->flush_workers =
				2200	btrfs_alloc_workqueue(fs_info, "flush_delalloc",
				2201	flags, max_active, 0);
				2202
				2203	fs_info->caching_workers =
				2204	btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
				2205
				2206	/*
				2207	* a higher idle thresh on the submit workers makes it much more
				2208	* likely that bios will be send down in a sane order to the
				2209	* devices
				2210	*/
				2211	fs_info->submit_workers =
				2212	btrfs_alloc_workqueue(fs_info, "submit", flags,
				2213	min_t(u64, fs_devices->num_devices,
				2214	max_active), 64);
				2215
				2216	fs_info->fixup_workers =
				2217	btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
				2218
				2219	/*
				2220	* endios are largely parallel and should have a very
				2221	* low idle thresh
				2222	*/
				2223	fs_info->endio_workers =
				2224	btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4);
				2225	fs_info->endio_meta_workers =
				2226	btrfs_alloc_workqueue(fs_info, "endio-meta", flags,
				2227	max_active, 4);
				2228	fs_info->endio_meta_write_workers =
				2229	btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
				2230	max_active, 2);
				2231	fs_info->endio_raid56_workers =
				2232	btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
				2233	max_active, 4);
				2234	fs_info->endio_repair_workers =
				2235	btrfs_alloc_workqueue(fs_info, "endio-repair", flags, 1, 0);
				2236	fs_info->rmw_workers =
				2237	btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2);
				2238	fs_info->endio_write_workers =
				2239	btrfs_alloc_workqueue(fs_info, "endio-write", flags,
				2240	max_active, 2);
				2241	fs_info->endio_freespace_worker =
				2242	btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
				2243	max_active, 0);
				2244	fs_info->delayed_workers =
				2245	btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
				2246	max_active, 0);
				2247	fs_info->readahead_workers =
				2248	btrfs_alloc_workqueue(fs_info, "readahead", flags,
				2249	max_active, 2);
				2250	fs_info->qgroup_rescan_workers =
				2251	btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
				2252	fs_info->extent_workers =
				2253	btrfs_alloc_workqueue(fs_info, "extent-refs", flags,
				2254	min_t(u64, fs_devices->num_devices,
				2255	max_active), 8);
				2256
				2257	if (!(fs_info->workers && fs_info->delalloc_workers &&
				2258	fs_info->submit_workers && fs_info->flush_workers &&
				2259	fs_info->endio_workers && fs_info->endio_meta_workers &&
				2260	fs_info->endio_meta_write_workers &&
				2261	fs_info->endio_repair_workers &&
				2262	fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
				2263	fs_info->endio_freespace_worker && fs_info->rmw_workers &&
				2264	fs_info->caching_workers && fs_info->readahead_workers &&
				2265	fs_info->fixup_workers && fs_info->delayed_workers &&
				2266	fs_info->extent_workers &&
				2267	fs_info->qgroup_rescan_workers)) {
				2268	return -ENOMEM;
				2269	}
				2270
				2271	return 0;
				2272	}
				2273
				2274	static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
				2275	struct btrfs_fs_devices *fs_devices)
				2276	{
				2277	int ret;
				2278	struct btrfs_root *log_tree_root;
				2279	struct btrfs_super_block *disk_super = fs_info->super_copy;
				2280	u64 bytenr = btrfs_super_log_root(disk_super);
				2281
				2282	if (fs_devices->rw_devices == 0) {
				2283	btrfs_warn(fs_info, "log replay required on RO media");
				2284	return -EIO;
				2285	}
				2286
				2287	log_tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
				2288	if (!log_tree_root)
				2289	return -ENOMEM;
				2290
				2291	__setup_root(log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
				2292
				2293	log_tree_root->node = read_tree_block(fs_info, bytenr,
				2294	fs_info->generation + 1);
				2295	if (IS_ERR(log_tree_root->node)) {
				2296	btrfs_warn(fs_info, "failed to read log tree");
				2297	ret = PTR_ERR(log_tree_root->node);
				2298	kfree(log_tree_root);
				2299	return ret;
				2300	} else if (!extent_buffer_uptodate(log_tree_root->node)) {
				2301	btrfs_err(fs_info, "failed to read log tree");
				2302	free_extent_buffer(log_tree_root->node);
				2303	kfree(log_tree_root);
				2304	return -EIO;
				2305	}
				2306	/* returns with log_tree_root freed on success */
				2307	ret = btrfs_recover_log_trees(log_tree_root);
				2308	if (ret) {
				2309	btrfs_handle_fs_error(fs_info, ret,
				2310	"Failed to recover log tree");
				2311	free_extent_buffer(log_tree_root->node);
				2312	kfree(log_tree_root);
				2313	return ret;
				2314	}
				2315
				2316	if (sb_rdonly(fs_info->sb)) {
				2317	ret = btrfs_commit_super(fs_info);
				2318	if (ret)
				2319	return ret;
				2320	}
				2321
				2322	return 0;
				2323	}
				2324
				2325	static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
				2326	{
				2327	struct btrfs_root *tree_root = fs_info->tree_root;
				2328	struct btrfs_root *root;
				2329	struct btrfs_key location;
				2330	int ret;
				2331
				2332	BUG_ON(!fs_info->tree_root);
				2333
				2334	location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
				2335	location.type = BTRFS_ROOT_ITEM_KEY;
				2336	location.offset = 0;
				2337
				2338	root = btrfs_read_tree_root(tree_root, &location);
				2339	if (IS_ERR(root))
				2340	return PTR_ERR(root);
				2341	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
				2342	fs_info->extent_root = root;
				2343
				2344	location.objectid = BTRFS_DEV_TREE_OBJECTID;
				2345	root = btrfs_read_tree_root(tree_root, &location);
				2346	if (IS_ERR(root))
				2347	return PTR_ERR(root);
				2348	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
				2349	fs_info->dev_root = root;
				2350	btrfs_init_devices_late(fs_info);
				2351
				2352	location.objectid = BTRFS_CSUM_TREE_OBJECTID;
				2353	root = btrfs_read_tree_root(tree_root, &location);
				2354	if (IS_ERR(root))
				2355	return PTR_ERR(root);
				2356	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
				2357	fs_info->csum_root = root;
				2358
				2359	location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
				2360	root = btrfs_read_tree_root(tree_root, &location);
				2361	if (!IS_ERR(root)) {
				2362	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
				2363	set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
				2364	fs_info->quota_root = root;
				2365	}
				2366
				2367	location.objectid = BTRFS_UUID_TREE_OBJECTID;
				2368	root = btrfs_read_tree_root(tree_root, &location);
				2369	if (IS_ERR(root)) {
				2370	ret = PTR_ERR(root);
				2371	if (ret != -ENOENT)
				2372	return ret;
				2373	} else {
				2374	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
				2375	fs_info->uuid_root = root;
				2376	}
				2377
				2378	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
				2379	location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
				2380	root = btrfs_read_tree_root(tree_root, &location);
				2381	if (IS_ERR(root))
				2382	return PTR_ERR(root);
				2383	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
				2384	fs_info->free_space_root = root;
				2385	}
				2386
				2387	return 0;
				2388	}
				2389
				2390	int open_ctree(struct super_block *sb,
				2391	struct btrfs_fs_devices *fs_devices,
				2392	char *options)
				2393	{
				2394	u32 sectorsize;
				2395	u32 nodesize;
				2396	u32 stripesize;
				2397	u64 generation;
				2398	u64 features;
				2399	struct btrfs_key location;
				2400	struct buffer_head *bh;
				2401	struct btrfs_super_block *disk_super;
				2402	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
				2403	struct btrfs_root *tree_root;
				2404	struct btrfs_root *chunk_root;
				2405	int ret;
				2406	int err = -EINVAL;
				2407	int num_backups_tried = 0;
				2408	int backup_index = 0;
				2409	int max_active;
				2410	int clear_free_space_tree = 0;
				2411
				2412	tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
				2413	chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
				2414	if (!tree_root \|\| !chunk_root) {
				2415	err = -ENOMEM;
				2416	goto fail;
				2417	}
				2418
				2419	ret = init_srcu_struct(&fs_info->subvol_srcu);
				2420	if (ret) {
				2421	err = ret;
				2422	goto fail;
				2423	}
				2424
				2425	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
				2426	if (ret) {
				2427	err = ret;
				2428	goto fail_srcu;
				2429	}
				2430	fs_info->dirty_metadata_batch = PAGE_SIZE *
				2431	(1 + ilog2(nr_cpu_ids));
				2432
				2433	ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
				2434	if (ret) {
				2435	err = ret;
				2436	goto fail_dirty_metadata_bytes;
				2437	}
				2438
				2439	ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL);
				2440	if (ret) {
				2441	err = ret;
				2442	goto fail_delalloc_bytes;
				2443	}
				2444
				2445	fs_info->btree_inode = new_inode(sb);
				2446	if (!fs_info->btree_inode) {
				2447	err = -ENOMEM;
				2448	goto fail_bio_counter;
				2449	}
				2450
				2451	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
				2452
				2453	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
				2454	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
				2455	INIT_LIST_HEAD(&fs_info->trans_list);
				2456	INIT_LIST_HEAD(&fs_info->dead_roots);
				2457	INIT_LIST_HEAD(&fs_info->delayed_iputs);
				2458	INIT_LIST_HEAD(&fs_info->delalloc_roots);
				2459	INIT_LIST_HEAD(&fs_info->caching_block_groups);
				2460	spin_lock_init(&fs_info->delalloc_root_lock);
				2461	spin_lock_init(&fs_info->trans_lock);
				2462	spin_lock_init(&fs_info->fs_roots_radix_lock);
				2463	spin_lock_init(&fs_info->delayed_iput_lock);
				2464	spin_lock_init(&fs_info->defrag_inodes_lock);
				2465	spin_lock_init(&fs_info->super_lock);
				2466	spin_lock_init(&fs_info->qgroup_op_lock);
				2467	spin_lock_init(&fs_info->buffer_lock);
				2468	spin_lock_init(&fs_info->unused_bgs_lock);
				2469	rwlock_init(&fs_info->tree_mod_log_lock);
				2470	mutex_init(&fs_info->unused_bg_unpin_mutex);
				2471	mutex_init(&fs_info->delete_unused_bgs_mutex);
				2472	mutex_init(&fs_info->reloc_mutex);
				2473	mutex_init(&fs_info->delalloc_root_mutex);
				2474	mutex_init(&fs_info->cleaner_delayed_iput_mutex);
				2475	seqlock_init(&fs_info->profiles_lock);
				2476
				2477	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
				2478	INIT_LIST_HEAD(&fs_info->space_info);
				2479	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
				2480	INIT_LIST_HEAD(&fs_info->unused_bgs);
				2481	btrfs_mapping_init(&fs_info->mapping_tree);
				2482	btrfs_init_block_rsv(&fs_info->global_block_rsv,
				2483	BTRFS_BLOCK_RSV_GLOBAL);
				2484	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
				2485	BTRFS_BLOCK_RSV_DELALLOC);
				2486	btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
				2487	btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
				2488	btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
				2489	btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
				2490	BTRFS_BLOCK_RSV_DELOPS);
				2491	atomic_set(&fs_info->nr_async_submits, 0);
				2492	atomic_set(&fs_info->async_delalloc_pages, 0);
				2493	atomic_set(&fs_info->async_submit_draining, 0);
				2494	atomic_set(&fs_info->nr_async_bios, 0);
				2495	atomic_set(&fs_info->defrag_running, 0);
				2496	atomic_set(&fs_info->qgroup_op_seq, 0);
				2497	atomic_set(&fs_info->reada_works_cnt, 0);
				2498	atomic64_set(&fs_info->tree_mod_seq, 0);
				2499	fs_info->sb = sb;
				2500	fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
				2501	fs_info->metadata_ratio = 0;
				2502	fs_info->defrag_inodes = RB_ROOT;
				2503	atomic64_set(&fs_info->free_chunk_space, 0);
				2504	fs_info->tree_mod_log = RB_ROOT;
				2505	fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
				2506	fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
				2507	/* readahead state */
				2508	INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
				2509	spin_lock_init(&fs_info->reada_lock);
				2510
				2511	fs_info->thread_pool_size = min_t(unsigned long,
				2512	num_online_cpus() + 2, 8);
				2513
				2514	INIT_LIST_HEAD(&fs_info->ordered_roots);
				2515	spin_lock_init(&fs_info->ordered_root_lock);
				2516	fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
				2517	GFP_KERNEL);
				2518	if (!fs_info->delayed_root) {
				2519	err = -ENOMEM;
				2520	goto fail_iput;
				2521	}
				2522	btrfs_init_delayed_root(fs_info->delayed_root);
				2523
				2524	btrfs_init_scrub(fs_info);
				2525	#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
				2526	fs_info->check_integrity_print_mask = 0;
				2527	#endif
				2528	btrfs_init_balance(fs_info);
				2529	btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
				2530
				2531	sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
				2532	sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
				2533
				2534	btrfs_init_btree_inode(fs_info);
				2535
				2536	spin_lock_init(&fs_info->block_group_cache_lock);
				2537	fs_info->block_group_cache_tree = RB_ROOT;
				2538	fs_info->first_logical_byte = (u64)-1;
				2539
				2540	extent_io_tree_init(&fs_info->freed_extents[0], NULL);
				2541	extent_io_tree_init(&fs_info->freed_extents[1], NULL);
				2542	fs_info->pinned_extents = &fs_info->freed_extents[0];
				2543	set_bit(BTRFS_FS_BARRIER, &fs_info->flags);
				2544
				2545	mutex_init(&fs_info->ordered_operations_mutex);
				2546	mutex_init(&fs_info->tree_log_mutex);
				2547	mutex_init(&fs_info->chunk_mutex);
				2548	mutex_init(&fs_info->transaction_kthread_mutex);
				2549	mutex_init(&fs_info->cleaner_mutex);
				2550	mutex_init(&fs_info->volume_mutex);
				2551	mutex_init(&fs_info->ro_block_group_mutex);
				2552	init_rwsem(&fs_info->commit_root_sem);
				2553	init_rwsem(&fs_info->cleanup_work_sem);
				2554	init_rwsem(&fs_info->subvol_sem);
				2555	sema_init(&fs_info->uuid_tree_rescan_sem, 1);
				2556
				2557	btrfs_init_dev_replace_locks(fs_info);
				2558	btrfs_init_qgroup(fs_info);
				2559
				2560	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
				2561	btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
				2562
				2563	init_waitqueue_head(&fs_info->transaction_throttle);
				2564	init_waitqueue_head(&fs_info->transaction_wait);
				2565	init_waitqueue_head(&fs_info->transaction_blocked_wait);
				2566	init_waitqueue_head(&fs_info->async_submit_wait);
				2567
				2568	INIT_LIST_HEAD(&fs_info->pinned_chunks);
				2569
				2570	/* Usable values until the real ones are cached from the superblock */
				2571	fs_info->nodesize = 4096;
				2572	fs_info->sectorsize = 4096;
				2573	fs_info->stripesize = 4096;
				2574
				2575	ret = btrfs_alloc_stripe_hash_table(fs_info);
				2576	if (ret) {
				2577	err = ret;
				2578	goto fail_alloc;
				2579	}
				2580
				2581	__setup_root(tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID);
				2582
				2583	invalidate_bdev(fs_devices->latest_bdev);
				2584
				2585	/*
				2586	* Read super block and check the signature bytes only
				2587	*/
				2588	bh = btrfs_read_dev_super(fs_devices->latest_bdev);
				2589	if (IS_ERR(bh)) {
				2590	err = PTR_ERR(bh);
				2591	goto fail_alloc;
				2592	}
				2593
				2594	/*
				2595	* We want to check superblock checksum, the type is stored inside.
				2596	* Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
				2597	*/
				2598	if (btrfs_check_super_csum(fs_info, bh->b_data)) {
				2599	btrfs_err(fs_info, "superblock checksum mismatch");
				2600	err = -EINVAL;
				2601	brelse(bh);
				2602	goto fail_alloc;
				2603	}
				2604
				2605	/*
				2606	* super_copy is zeroed at allocation time and we never touch the
				2607	* following bytes up to INFO_SIZE, the checksum is calculated from
				2608	* the whole block of INFO_SIZE
				2609	*/
				2610	memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
				2611	memcpy(fs_info->super_for_commit, fs_info->super_copy,
				2612	sizeof(*fs_info->super_for_commit));
				2613	brelse(bh);
				2614
				2615	memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
				2616
				2617	ret = btrfs_check_super_valid(fs_info);
				2618	if (ret) {
				2619	btrfs_err(fs_info, "superblock contains fatal errors");
				2620	err = -EINVAL;
				2621	goto fail_alloc;
				2622	}
				2623
				2624	disk_super = fs_info->super_copy;
				2625	if (!btrfs_super_root(disk_super))
				2626	goto fail_alloc;
				2627
				2628	/* check FS state, whether FS is broken. */
				2629	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
				2630	set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
				2631
				2632	/*
				2633	* run through our array of backup supers and setup
				2634	* our ring pointer to the oldest one
				2635	*/
				2636	generation = btrfs_super_generation(disk_super);
				2637	find_oldest_super_backup(fs_info, generation);
				2638
				2639	/*
				2640	* In the long term, we'll store the compression type in the super
				2641	* block, and it'll be used for per file compression control.
				2642	*/
				2643	fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
				2644
				2645	ret = btrfs_parse_options(fs_info, options, sb->s_flags);
				2646	if (ret) {
				2647	err = ret;
				2648	goto fail_alloc;
				2649	}
				2650
				2651	features = btrfs_super_incompat_flags(disk_super) &
				2652	~BTRFS_FEATURE_INCOMPAT_SUPP;
				2653	if (features) {
				2654	btrfs_err(fs_info,
				2655	"cannot mount because of unsupported optional features (%llx)",
				2656	features);
				2657	err = -EINVAL;
				2658	goto fail_alloc;
				2659	}
				2660
				2661	features = btrfs_super_incompat_flags(disk_super);
				2662	features \|= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
				2663	if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
				2664	features \|= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
				2665	else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
				2666	features \|= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
				2667
				2668	if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
				2669	btrfs_info(fs_info, "has skinny extents");
				2670
				2671	/*
				2672	* flag our filesystem as having big metadata blocks if
				2673	* they are bigger than the page size
				2674	*/
				2675	if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
				2676	if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
				2677	btrfs_info(fs_info,
				2678	"flagging fs with big metadata feature");
				2679	features \|= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
				2680	}
				2681
				2682	nodesize = btrfs_super_nodesize(disk_super);
				2683	sectorsize = btrfs_super_sectorsize(disk_super);
				2684	stripesize = sectorsize;
				2685	fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
				2686	fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
				2687
				2688	/* Cache block sizes */
				2689	fs_info->nodesize = nodesize;
				2690	fs_info->sectorsize = sectorsize;
				2691	fs_info->stripesize = stripesize;
				2692
				2693	/*
				2694	* mixed block groups end up with duplicate but slightly offset
				2695	* extent buffers for the same range. It leads to corruptions
				2696	*/
				2697	if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
				2698	(sectorsize != nodesize)) {
				2699	btrfs_err(fs_info,
				2700	"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
				2701	nodesize, sectorsize);
				2702	goto fail_alloc;
				2703	}
				2704
				2705	/*
				2706	* Needn't use the lock because there is no other task which will
				2707	* update the flag.
				2708	*/
				2709	btrfs_set_super_incompat_flags(disk_super, features);
				2710
				2711	features = btrfs_super_compat_ro_flags(disk_super) &
				2712	~BTRFS_FEATURE_COMPAT_RO_SUPP;
				2713	if (!sb_rdonly(sb) && features) {
				2714	btrfs_err(fs_info,
				2715	"cannot mount read-write because of unsupported optional features (%llx)",
				2716	features);
				2717	err = -EINVAL;
				2718	goto fail_alloc;
				2719	}
				2720
				2721	max_active = fs_info->thread_pool_size;
				2722
				2723	ret = btrfs_init_workqueues(fs_info, fs_devices);
				2724	if (ret) {
				2725	err = ret;
				2726	goto fail_sb_buffer;
				2727	}
				2728
				2729	sb->s_bdi->congested_fn = btrfs_congested_fn;
				2730	sb->s_bdi->congested_data = fs_info;
				2731	sb->s_bdi->capabilities \|= BDI_CAP_CGROUP_WRITEBACK;
				2732	sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
				2733	sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
				2734	sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
				2735
				2736	sb->s_blocksize = sectorsize;
				2737	sb->s_blocksize_bits = blksize_bits(sectorsize);
				2738
				2739	mutex_lock(&fs_info->chunk_mutex);
				2740	ret = btrfs_read_sys_array(fs_info);
				2741	mutex_unlock(&fs_info->chunk_mutex);
				2742	if (ret) {
				2743	btrfs_err(fs_info, "failed to read the system array: %d", ret);
				2744	goto fail_sb_buffer;
				2745	}
				2746
				2747	generation = btrfs_super_chunk_root_generation(disk_super);
				2748
				2749	__setup_root(chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
				2750
				2751	chunk_root->node = read_tree_block(fs_info,
				2752	btrfs_super_chunk_root(disk_super),
				2753	generation);
				2754	if (IS_ERR(chunk_root->node) \|\|
				2755	!extent_buffer_uptodate(chunk_root->node)) {
				2756	btrfs_err(fs_info, "failed to read chunk root");
				2757	if (!IS_ERR(chunk_root->node))
				2758	free_extent_buffer(chunk_root->node);
				2759	chunk_root->node = NULL;
				2760	goto fail_tree_roots;
				2761	}
				2762	btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
				2763	chunk_root->commit_root = btrfs_root_node(chunk_root);
				2764
				2765	read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
				2766	btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE);
				2767
				2768	ret = btrfs_read_chunk_tree(fs_info);
				2769	if (ret) {
				2770	btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
				2771	goto fail_tree_roots;
				2772	}
				2773
				2774	/*
				2775	* keep the device that is marked to be the target device for the
				2776	* dev_replace procedure
				2777	*/
				2778	btrfs_close_extra_devices(fs_devices, 0);
				2779
				2780	if (!fs_devices->latest_bdev) {
				2781	btrfs_err(fs_info, "failed to read devices");
				2782	goto fail_tree_roots;
				2783	}
				2784
				2785	retry_root_backup:
				2786	generation = btrfs_super_generation(disk_super);
				2787
				2788	tree_root->node = read_tree_block(fs_info,
				2789	btrfs_super_root(disk_super),
				2790	generation);
				2791	if (IS_ERR(tree_root->node) \|\|
				2792	!extent_buffer_uptodate(tree_root->node)) {
				2793	btrfs_warn(fs_info, "failed to read tree root");
				2794	if (!IS_ERR(tree_root->node))
				2795	free_extent_buffer(tree_root->node);
				2796	tree_root->node = NULL;
				2797	goto recovery_tree_root;
				2798	}
				2799
				2800	btrfs_set_root_node(&tree_root->root_item, tree_root->node);
				2801	tree_root->commit_root = btrfs_root_node(tree_root);
				2802	btrfs_set_root_refs(&tree_root->root_item, 1);
				2803
				2804	mutex_lock(&tree_root->objectid_mutex);
				2805	ret = btrfs_find_highest_objectid(tree_root,
				2806	&tree_root->highest_objectid);
				2807	if (ret) {
				2808	mutex_unlock(&tree_root->objectid_mutex);
				2809	goto recovery_tree_root;
				2810	}
				2811
				2812	ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
				2813
				2814	mutex_unlock(&tree_root->objectid_mutex);
				2815
				2816	ret = btrfs_read_roots(fs_info);
				2817	if (ret)
				2818	goto recovery_tree_root;
				2819
				2820	fs_info->generation = generation;
				2821	fs_info->last_trans_committed = generation;
				2822
				2823	ret = btrfs_recover_balance(fs_info);
				2824	if (ret) {
				2825	btrfs_err(fs_info, "failed to recover balance: %d", ret);
				2826	goto fail_block_groups;
				2827	}
				2828
				2829	ret = btrfs_init_dev_stats(fs_info);
				2830	if (ret) {
				2831	btrfs_err(fs_info, "failed to init dev_stats: %d", ret);
				2832	goto fail_block_groups;
				2833	}
				2834
				2835	ret = btrfs_init_dev_replace(fs_info);
				2836	if (ret) {
				2837	btrfs_err(fs_info, "failed to init dev_replace: %d", ret);
				2838	goto fail_block_groups;
				2839	}
				2840
				2841	btrfs_close_extra_devices(fs_devices, 1);
				2842
				2843	ret = btrfs_sysfs_add_fsid(fs_devices, NULL);
				2844	if (ret) {
				2845	btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
				2846	ret);
				2847	goto fail_block_groups;
				2848	}
				2849
				2850	ret = btrfs_sysfs_add_device(fs_devices);
				2851	if (ret) {
				2852	btrfs_err(fs_info, "failed to init sysfs device interface: %d",
				2853	ret);
				2854	goto fail_fsdev_sysfs;
				2855	}
				2856
				2857	ret = btrfs_sysfs_add_mounted(fs_info);
				2858	if (ret) {
				2859	btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
				2860	goto fail_fsdev_sysfs;
				2861	}
				2862
				2863	ret = btrfs_init_space_info(fs_info);
				2864	if (ret) {
				2865	btrfs_err(fs_info, "failed to initialize space info: %d", ret);
				2866	goto fail_sysfs;
				2867	}
				2868
				2869	ret = btrfs_read_block_groups(fs_info);
				2870	if (ret) {
				2871	btrfs_err(fs_info, "failed to read block groups: %d", ret);
				2872	goto fail_sysfs;
				2873	}
				2874
				2875	if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info)) {
				2876	btrfs_warn(fs_info,
				2877	"writeable mount is not allowed due to too many missing devices");
				2878	goto fail_sysfs;
				2879	}
				2880
				2881	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
				2882	"btrfs-cleaner");
				2883	if (IS_ERR(fs_info->cleaner_kthread))
				2884	goto fail_sysfs;
				2885
				2886	fs_info->transaction_kthread = kthread_run(transaction_kthread,
				2887	tree_root,
				2888	"btrfs-transaction");
				2889	if (IS_ERR(fs_info->transaction_kthread))
				2890	goto fail_cleaner;
				2891
				2892	if (!btrfs_test_opt(fs_info, NOSSD) &&
				2893	!fs_info->fs_devices->rotating) {
				2894	btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations");
				2895	}
				2896
				2897	/*
				2898	* Mount does not set all options immediately, we can do it now and do
				2899	* not have to wait for transaction commit
				2900	*/
				2901	btrfs_apply_pending_changes(fs_info);
				2902
				2903	#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
				2904	if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
				2905	ret = btrfsic_mount(fs_info, fs_devices,
				2906	btrfs_test_opt(fs_info,
				2907	CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
				2908	1 : 0,
				2909	fs_info->check_integrity_print_mask);
				2910	if (ret)
				2911	btrfs_warn(fs_info,
				2912	"failed to initialize integrity check module: %d",
				2913	ret);
				2914	}
				2915	#endif
				2916	ret = btrfs_read_qgroup_config(fs_info);
				2917	if (ret)
				2918	goto fail_trans_kthread;
				2919
				2920	/* do not make disk changes in broken FS or nologreplay is given */
				2921	if (btrfs_super_log_root(disk_super) != 0 &&
				2922	!btrfs_test_opt(fs_info, NOLOGREPLAY)) {
				2923	btrfs_info(fs_info, "start tree-log replay");
				2924	ret = btrfs_replay_log(fs_info, fs_devices);
				2925	if (ret) {
				2926	err = ret;
				2927	goto fail_qgroup;
				2928	}
				2929	}
				2930
				2931	ret = btrfs_find_orphan_roots(fs_info);
				2932	if (ret)
				2933	goto fail_qgroup;
				2934
				2935	if (!sb_rdonly(sb)) {
				2936	ret = btrfs_cleanup_fs_roots(fs_info);
				2937	if (ret)
				2938	goto fail_qgroup;
				2939
				2940	mutex_lock(&fs_info->cleaner_mutex);
				2941	ret = btrfs_recover_relocation(tree_root);
				2942	mutex_unlock(&fs_info->cleaner_mutex);
				2943	if (ret < 0) {
				2944	btrfs_warn(fs_info, "failed to recover relocation: %d",
				2945	ret);
				2946	err = -EINVAL;
				2947	goto fail_qgroup;
				2948	}
				2949	}
				2950
				2951	location.objectid = BTRFS_FS_TREE_OBJECTID;
				2952	location.type = BTRFS_ROOT_ITEM_KEY;
				2953	location.offset = 0;
				2954
				2955	fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
				2956	if (IS_ERR(fs_info->fs_root)) {
				2957	err = PTR_ERR(fs_info->fs_root);
				2958	goto fail_qgroup;
				2959	}
				2960
				2961	if (sb_rdonly(sb))
				2962	return 0;
				2963
				2964	if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
				2965	btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
				2966	clear_free_space_tree = 1;
				2967	} else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
				2968	!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
				2969	btrfs_warn(fs_info, "free space tree is invalid");
				2970	clear_free_space_tree = 1;
				2971	}
				2972
				2973	if (clear_free_space_tree) {
				2974	btrfs_info(fs_info, "clearing free space tree");
				2975	ret = btrfs_clear_free_space_tree(fs_info);
				2976	if (ret) {
				2977	btrfs_warn(fs_info,
				2978	"failed to clear free space tree: %d", ret);
				2979	close_ctree(fs_info);
				2980	return ret;
				2981	}
				2982	}
				2983
				2984	if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
				2985	!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
				2986	btrfs_info(fs_info, "creating free space tree");
				2987	ret = btrfs_create_free_space_tree(fs_info);
				2988	if (ret) {
				2989	btrfs_warn(fs_info,
				2990	"failed to create free space tree: %d", ret);
				2991	close_ctree(fs_info);
				2992	return ret;
				2993	}
				2994	}
				2995
				2996	down_read(&fs_info->cleanup_work_sem);
				2997	if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) \|\|
				2998	(ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
				2999	up_read(&fs_info->cleanup_work_sem);
				3000	close_ctree(fs_info);
				3001	return ret;
				3002	}
				3003	up_read(&fs_info->cleanup_work_sem);
				3004
				3005	ret = btrfs_resume_balance_async(fs_info);
				3006	if (ret) {
				3007	btrfs_warn(fs_info, "failed to resume balance: %d", ret);
				3008	close_ctree(fs_info);
				3009	return ret;
				3010	}
				3011
				3012	ret = btrfs_resume_dev_replace_async(fs_info);
				3013	if (ret) {
				3014	btrfs_warn(fs_info, "failed to resume device replace: %d", ret);
				3015	close_ctree(fs_info);
				3016	return ret;
				3017	}
				3018
				3019	btrfs_qgroup_rescan_resume(fs_info);
				3020
				3021	if (!fs_info->uuid_root) {
				3022	btrfs_info(fs_info, "creating UUID tree");
				3023	ret = btrfs_create_uuid_tree(fs_info);
				3024	if (ret) {
				3025	btrfs_warn(fs_info,
				3026	"failed to create the UUID tree: %d", ret);
				3027	close_ctree(fs_info);
				3028	return ret;
				3029	}
				3030	} else if (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) \|\|
				3031	fs_info->generation !=
				3032	btrfs_super_uuid_tree_generation(disk_super)) {
				3033	btrfs_info(fs_info, "checking UUID tree");
				3034	ret = btrfs_check_uuid_tree(fs_info);
				3035	if (ret) {
				3036	btrfs_warn(fs_info,
				3037	"failed to check the UUID tree: %d", ret);
				3038	close_ctree(fs_info);
				3039	return ret;
				3040	}
				3041	} else {
				3042	set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
				3043	}
				3044	set_bit(BTRFS_FS_OPEN, &fs_info->flags);
				3045
				3046	/*
				3047	* backuproot only affect mount behavior, and if open_ctree succeeded,
				3048	* no need to keep the flag
				3049	*/
				3050	btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
				3051
				3052	return 0;
				3053
				3054	fail_qgroup:
				3055	btrfs_free_qgroup_config(fs_info);
				3056	fail_trans_kthread:
				3057	kthread_stop(fs_info->transaction_kthread);
				3058	btrfs_cleanup_transaction(fs_info);
				3059	btrfs_free_fs_roots(fs_info);
				3060	fail_cleaner:
				3061	kthread_stop(fs_info->cleaner_kthread);
				3062
				3063	/*
				3064	* make sure we're done with the btree inode before we stop our
				3065	* kthreads
				3066	*/
				3067	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
				3068
				3069	fail_sysfs:
				3070	btrfs_sysfs_remove_mounted(fs_info);
				3071
				3072	fail_fsdev_sysfs:
				3073	btrfs_sysfs_remove_fsid(fs_info->fs_devices);
				3074
				3075	fail_block_groups:
				3076	btrfs_put_block_group_cache(fs_info);
				3077
				3078	fail_tree_roots:
				3079	free_root_pointers(fs_info, true);
				3080	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
				3081
				3082	fail_sb_buffer:
				3083	btrfs_stop_all_workers(fs_info);
				3084	btrfs_free_block_groups(fs_info);
				3085	fail_alloc:
				3086	fail_iput:
				3087	btrfs_mapping_tree_free(&fs_info->mapping_tree);
				3088
				3089	iput(fs_info->btree_inode);
				3090	fail_bio_counter:
				3091	percpu_counter_destroy(&fs_info->bio_counter);
				3092	fail_delalloc_bytes:
				3093	percpu_counter_destroy(&fs_info->delalloc_bytes);
				3094	fail_dirty_metadata_bytes:
				3095	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
				3096	fail_srcu:
				3097	cleanup_srcu_struct(&fs_info->subvol_srcu);
				3098	fail:
				3099	btrfs_free_stripe_hash_table(fs_info);
				3100	btrfs_close_devices(fs_info->fs_devices);
				3101	return err;
				3102
				3103	recovery_tree_root:
				3104	if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
				3105	goto fail_tree_roots;
				3106
				3107	free_root_pointers(fs_info, false);
				3108
				3109	/* don't use the log in recovery mode, it won't be valid */
				3110	btrfs_set_super_log_root(disk_super, 0);
				3111
				3112	/* we can't trust the free space cache either */
				3113	btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
				3114
				3115	ret = next_root_backup(fs_info, fs_info->super_copy,
				3116	&num_backups_tried, &backup_index);
				3117	if (ret == -1)
				3118	goto fail_block_groups;
				3119	goto retry_root_backup;
				3120	}
				3121
				3122	static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
				3123	{
				3124	if (uptodate) {
				3125	set_buffer_uptodate(bh);
				3126	} else {
				3127	struct btrfs_device device = (struct btrfs_device )
				3128	bh->b_private;
				3129
				3130	btrfs_warn_rl_in_rcu(device->fs_info,
				3131	"lost page write due to IO error on %s",
				3132	rcu_str_deref(device->name));
				3133	/* note, we don't set_buffer_write_io_error because we have
				3134	* our own ways of dealing with the IO errors
				3135	*/
				3136	clear_buffer_uptodate(bh);
				3137	btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
				3138	}
				3139	unlock_buffer(bh);
				3140	put_bh(bh);
				3141	}
				3142
				3143	int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
				3144	struct buffer_head **bh_ret)
				3145	{
				3146	struct buffer_head *bh;
				3147	struct btrfs_super_block *super;
				3148	u64 bytenr;
				3149
				3150	bytenr = btrfs_sb_offset(copy_num);
				3151	if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
				3152	return -EINVAL;
				3153
				3154	bh = __bread(bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE);
				3155	/*
				3156	* If we fail to read from the underlying devices, as of now
				3157	* the best option we have is to mark it EIO.
				3158	*/
				3159	if (!bh)
				3160	return -EIO;
				3161
				3162	super = (struct btrfs_super_block *)bh->b_data;
				3163	if (btrfs_super_bytenr(super) != bytenr \|\|
				3164	btrfs_super_magic(super) != BTRFS_MAGIC) {
				3165	brelse(bh);
				3166	return -EINVAL;
				3167	}
				3168
				3169	*bh_ret = bh;
				3170	return 0;
				3171	}
				3172
				3173
				3174	struct buffer_head btrfs_read_dev_super(struct block_device bdev)
				3175	{
				3176	struct buffer_head *bh;
				3177	struct buffer_head *latest = NULL;
				3178	struct btrfs_super_block *super;
				3179	int i;
				3180	u64 transid = 0;
				3181	int ret = -EINVAL;
				3182
				3183	/* we would like to check all the supers, but that would make
				3184	* a btrfs mount succeed after a mkfs from a different FS.
				3185	* So, we need to add a special mount option to scan for
				3186	* later supers, using BTRFS_SUPER_MIRROR_MAX instead
				3187	*/
				3188	for (i = 0; i < 1; i++) {
				3189	ret = btrfs_read_dev_one_super(bdev, i, &bh);
				3190	if (ret)
				3191	continue;
				3192
				3193	super = (struct btrfs_super_block *)bh->b_data;
				3194
				3195	if (!latest \|\| btrfs_super_generation(super) > transid) {
				3196	brelse(latest);
				3197	latest = bh;
				3198	transid = btrfs_super_generation(super);
				3199	} else {
				3200	brelse(bh);
				3201	}
				3202	}
				3203
				3204	if (!latest)
				3205	return ERR_PTR(ret);
				3206
				3207	return latest;
				3208	}
				3209
				3210	/*
				3211	* Write superblock @sb to the @device. Do not wait for completion, all the
				3212	* buffer heads we write are pinned.
				3213	*
				3214	* Write @max_mirrors copies of the superblock, where 0 means default that fit
				3215	* the expected device size at commit time. Note that max_mirrors must be
				3216	* same for write and wait phases.
				3217	*
				3218	* Return number of errors when buffer head is not found or submission fails.
				3219	*/
				3220	static int write_dev_supers(struct btrfs_device *device,
				3221	struct btrfs_super_block *sb, int max_mirrors)
				3222	{
				3223	struct buffer_head *bh;
				3224	int i;
				3225	int ret;
				3226	int errors = 0;
				3227	u32 crc;
				3228	u64 bytenr;
				3229	int op_flags;
				3230
				3231	if (max_mirrors == 0)
				3232	max_mirrors = BTRFS_SUPER_MIRROR_MAX;
				3233
				3234	for (i = 0; i < max_mirrors; i++) {
				3235	bytenr = btrfs_sb_offset(i);
				3236	if (bytenr + BTRFS_SUPER_INFO_SIZE >=
				3237	device->commit_total_bytes)
				3238	break;
				3239
				3240	btrfs_set_super_bytenr(sb, bytenr);
				3241
				3242	crc = ~(u32)0;
				3243	crc = btrfs_csum_data((const char *)sb + BTRFS_CSUM_SIZE, crc,
				3244	BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
				3245	btrfs_csum_final(crc, sb->csum);
				3246
				3247	/* One reference for us, and we leave it for the caller */
				3248	bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE,
				3249	BTRFS_SUPER_INFO_SIZE);
				3250	if (!bh) {
				3251	btrfs_err(device->fs_info,
				3252	"couldn't get super buffer head for bytenr %llu",
				3253	bytenr);
				3254	errors++;
				3255	continue;
				3256	}
				3257
				3258	memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
				3259
				3260	/* one reference for submit_bh */
				3261	get_bh(bh);
				3262
				3263	set_buffer_uptodate(bh);
				3264	lock_buffer(bh);
				3265	bh->b_end_io = btrfs_end_buffer_write_sync;
				3266	bh->b_private = device;
				3267
				3268	/*
				3269	* we fua the first super. The others we allow
				3270	* to go down lazy.
				3271	*/
				3272	op_flags = REQ_SYNC \| REQ_META \| REQ_PRIO;
				3273	if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
				3274	op_flags \|= REQ_FUA;
				3275	ret = btrfsic_submit_bh(REQ_OP_WRITE, op_flags, bh);
				3276	if (ret)
				3277	errors++;
				3278	}
				3279	return errors < i ? 0 : -1;
				3280	}
				3281
				3282	/*
				3283	* Wait for write completion of superblocks done by write_dev_supers,
				3284	* @max_mirrors same for write and wait phases.
				3285	*
				3286	* Return number of errors when buffer head is not found or not marked up to
				3287	* date.
				3288	*/
				3289	static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
				3290	{
				3291	struct buffer_head *bh;
				3292	int i;
				3293	int errors = 0;
				3294	u64 bytenr;
				3295
				3296	if (max_mirrors == 0)
				3297	max_mirrors = BTRFS_SUPER_MIRROR_MAX;
				3298
				3299	for (i = 0; i < max_mirrors; i++) {
				3300	bytenr = btrfs_sb_offset(i);
				3301	if (bytenr + BTRFS_SUPER_INFO_SIZE >=
				3302	device->commit_total_bytes)
				3303	break;
				3304
				3305	bh = __find_get_block(device->bdev,
				3306	bytenr / BTRFS_BDEV_BLOCKSIZE,
				3307	BTRFS_SUPER_INFO_SIZE);
				3308	if (!bh) {
				3309	errors++;
				3310	continue;
				3311	}
				3312	wait_on_buffer(bh);
				3313	if (!buffer_uptodate(bh))
				3314	errors++;
				3315
				3316	/* drop our reference */
				3317	brelse(bh);
				3318
				3319	/* drop the reference from the writing run */
				3320	brelse(bh);
				3321	}
				3322
				3323	return errors < i ? 0 : -1;
				3324	}
				3325
				3326	/*
				3327	* endio for the write_dev_flush, this will wake anyone waiting
				3328	* for the barrier when it is done
				3329	*/
				3330	static void btrfs_end_empty_barrier(struct bio *bio)
				3331	{
				3332	complete(bio->bi_private);
				3333	}
				3334
				3335	/*
				3336	* Submit a flush request to the device if it supports it. Error handling is
				3337	* done in the waiting counterpart.
				3338	*/
				3339	static void write_dev_flush(struct btrfs_device *device)
				3340	{
				3341	struct request_queue *q = bdev_get_queue(device->bdev);
				3342	struct bio *bio = device->flush_bio;
				3343
				3344	if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
				3345	return;
				3346
				3347	bio_reset(bio);
				3348	bio->bi_end_io = btrfs_end_empty_barrier;
				3349	bio_set_dev(bio, device->bdev);
				3350	bio->bi_opf = REQ_OP_WRITE \| REQ_SYNC \| REQ_PREFLUSH;
				3351	init_completion(&device->flush_wait);
				3352	bio->bi_private = &device->flush_wait;
				3353
				3354	btrfsic_submit_bio(bio);
				3355	device->flush_bio_sent = 1;
				3356	}
				3357
				3358	/*
				3359	* If the flush bio has been submitted by write_dev_flush, wait for it.
				3360	*/
				3361	static blk_status_t wait_dev_flush(struct btrfs_device *device)
				3362	{
				3363	struct bio *bio = device->flush_bio;
				3364
				3365	if (!device->flush_bio_sent)
				3366	return BLK_STS_OK;
				3367
				3368	device->flush_bio_sent = 0;
				3369	wait_for_completion_io(&device->flush_wait);
				3370
				3371	return bio->bi_status;
				3372	}
				3373
				3374	static int check_barrier_error(struct btrfs_fs_info *fs_info)
				3375	{
				3376	if (!btrfs_check_rw_degradable(fs_info))
				3377	return -EIO;
				3378	return 0;
				3379	}
				3380
				3381	/*
				3382	* send an empty flush down to each device in parallel,
				3383	* then wait for them
				3384	*/
				3385	static int barrier_all_devices(struct btrfs_fs_info *info)
				3386	{
				3387	struct list_head *head;
				3388	struct btrfs_device *dev;
				3389	int errors_wait = 0;
				3390	blk_status_t ret;
				3391
				3392	/* send down all the barriers */
				3393	head = &info->fs_devices->devices;
				3394	list_for_each_entry_rcu(dev, head, dev_list) {
				3395	if (dev->missing)
				3396	continue;
				3397	if (!dev->bdev)
				3398	continue;
				3399	if (!dev->in_fs_metadata \|\| !dev->writeable)
				3400	continue;
				3401
				3402	write_dev_flush(dev);
				3403	dev->last_flush_error = BLK_STS_OK;
				3404	}
				3405
				3406	/* wait for all the barriers */
				3407	list_for_each_entry_rcu(dev, head, dev_list) {
				3408	if (dev->missing)
				3409	continue;
				3410	if (!dev->bdev) {
				3411	errors_wait++;
				3412	continue;
				3413	}
				3414	if (!dev->in_fs_metadata \|\| !dev->writeable)
				3415	continue;
				3416
				3417	ret = wait_dev_flush(dev);
				3418	if (ret) {
				3419	dev->last_flush_error = ret;
				3420	btrfs_dev_stat_inc_and_print(dev,
				3421	BTRFS_DEV_STAT_FLUSH_ERRS);
				3422	errors_wait++;
				3423	}
				3424	}
				3425
				3426	if (errors_wait) {
				3427	/*
				3428	* At some point we need the status of all disks
				3429	* to arrive at the volume status. So error checking
				3430	* is being pushed to a separate loop.
				3431	*/
				3432	return check_barrier_error(info);
				3433	}
				3434	return 0;
				3435	}
				3436
				3437	int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
				3438	{
				3439	int raid_type;
				3440	int min_tolerated = INT_MAX;
				3441
				3442	if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 \|\|
				3443	(flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
				3444	min_tolerated = min(min_tolerated,
				3445	btrfs_raid_array[BTRFS_RAID_SINGLE].
				3446	tolerated_failures);
				3447
				3448	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
				3449	if (raid_type == BTRFS_RAID_SINGLE)
				3450	continue;
				3451	if (!(flags & btrfs_raid_group[raid_type]))
				3452	continue;
				3453	min_tolerated = min(min_tolerated,
				3454	btrfs_raid_array[raid_type].
				3455	tolerated_failures);
				3456	}
				3457
				3458	if (min_tolerated == INT_MAX) {
				3459	pr_warn("BTRFS: unknown raid flag: %llu", flags);
				3460	min_tolerated = 0;
				3461	}
				3462
				3463	return min_tolerated;
				3464	}
				3465
				3466	int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
				3467	{
				3468	struct list_head *head;
				3469	struct btrfs_device *dev;
				3470	struct btrfs_super_block *sb;
				3471	struct btrfs_dev_item *dev_item;
				3472	int ret;
				3473	int do_barriers;
				3474	int max_errors;
				3475	int total_errors = 0;
				3476	u64 flags;
				3477
				3478	do_barriers = !btrfs_test_opt(fs_info, NOBARRIER);
				3479
				3480	/*
				3481	* max_mirrors == 0 indicates we're from commit_transaction,
				3482	* not from fsync where the tree roots in fs_info have not
				3483	* been consistent on disk.
				3484	*/
				3485	if (max_mirrors == 0)
				3486	backup_super_roots(fs_info);
				3487
				3488	sb = fs_info->super_for_commit;
				3489	dev_item = &sb->dev_item;
				3490
				3491	mutex_lock(&fs_info->fs_devices->device_list_mutex);
				3492	head = &fs_info->fs_devices->devices;
				3493	max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1;
				3494
				3495	if (do_barriers) {
				3496	ret = barrier_all_devices(fs_info);
				3497	if (ret) {
				3498	mutex_unlock(
				3499	&fs_info->fs_devices->device_list_mutex);
				3500	btrfs_handle_fs_error(fs_info, ret,
				3501	"errors while submitting device barriers.");
				3502	return ret;
				3503	}
				3504	}
				3505
				3506	list_for_each_entry_rcu(dev, head, dev_list) {
				3507	if (!dev->bdev) {
				3508	total_errors++;
				3509	continue;
				3510	}
				3511	if (!dev->in_fs_metadata \|\| !dev->writeable)
				3512	continue;
				3513
				3514	btrfs_set_stack_device_generation(dev_item, 0);
				3515	btrfs_set_stack_device_type(dev_item, dev->type);
				3516	btrfs_set_stack_device_id(dev_item, dev->devid);
				3517	btrfs_set_stack_device_total_bytes(dev_item,
				3518	dev->commit_total_bytes);
				3519	btrfs_set_stack_device_bytes_used(dev_item,
				3520	dev->commit_bytes_used);
				3521	btrfs_set_stack_device_io_align(dev_item, dev->io_align);
				3522	btrfs_set_stack_device_io_width(dev_item, dev->io_width);
				3523	btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
				3524	memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
				3525	memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_FSID_SIZE);
				3526
				3527	flags = btrfs_super_flags(sb);
				3528	btrfs_set_super_flags(sb, flags \| BTRFS_HEADER_FLAG_WRITTEN);
				3529
				3530	ret = write_dev_supers(dev, sb, max_mirrors);
				3531	if (ret)
				3532	total_errors++;
				3533	}
				3534	if (total_errors > max_errors) {
				3535	btrfs_err(fs_info, "%d errors while writing supers",
				3536	total_errors);
				3537	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				3538
				3539	/* FUA is masked off if unsupported and can't be the reason */
				3540	btrfs_handle_fs_error(fs_info, -EIO,
				3541	"%d errors while writing supers",
				3542	total_errors);
				3543	return -EIO;
				3544	}
				3545
				3546	total_errors = 0;
				3547	list_for_each_entry_rcu(dev, head, dev_list) {
				3548	if (!dev->bdev)
				3549	continue;
				3550	if (!dev->in_fs_metadata \|\| !dev->writeable)
				3551	continue;
				3552
				3553	ret = wait_dev_supers(dev, max_mirrors);
				3554	if (ret)
				3555	total_errors++;
				3556	}
				3557	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				3558	if (total_errors > max_errors) {
				3559	btrfs_handle_fs_error(fs_info, -EIO,
				3560	"%d errors while writing supers",
				3561	total_errors);
				3562	return -EIO;
				3563	}
				3564	return 0;
				3565	}
				3566
				3567	/* Drop a fs root from the radix tree and free it. */
				3568	void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
				3569	struct btrfs_root *root)
				3570	{
				3571	spin_lock(&fs_info->fs_roots_radix_lock);
				3572	radix_tree_delete(&fs_info->fs_roots_radix,
				3573	(unsigned long)root->root_key.objectid);
				3574	spin_unlock(&fs_info->fs_roots_radix_lock);
				3575
				3576	if (btrfs_root_refs(&root->root_item) == 0)
				3577	synchronize_srcu(&fs_info->subvol_srcu);
				3578
				3579	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
				3580	btrfs_free_log(NULL, root);
				3581	if (root->reloc_root) {
				3582	free_extent_buffer(root->reloc_root->node);
				3583	free_extent_buffer(root->reloc_root->commit_root);
				3584	btrfs_put_fs_root(root->reloc_root);
				3585	root->reloc_root = NULL;
				3586	}
				3587	}
				3588
				3589	if (root->free_ino_pinned)
				3590	__btrfs_remove_free_space_cache(root->free_ino_pinned);
				3591	if (root->free_ino_ctl)
				3592	__btrfs_remove_free_space_cache(root->free_ino_ctl);
				3593	free_fs_root(root);
				3594	}
				3595
				3596	static void free_fs_root(struct btrfs_root *root)
				3597	{
				3598	iput(root->ino_cache_inode);
				3599	WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
				3600	btrfs_free_block_rsv(root->fs_info, root->orphan_block_rsv);
				3601	root->orphan_block_rsv = NULL;
				3602	if (root->anon_dev)
				3603	free_anon_bdev(root->anon_dev);
				3604	if (root->subv_writers)
				3605	btrfs_free_subvolume_writers(root->subv_writers);
				3606	free_extent_buffer(root->node);
				3607	free_extent_buffer(root->commit_root);
				3608	kfree(root->free_ino_ctl);
				3609	kfree(root->free_ino_pinned);
				3610	kfree(root->name);
				3611	btrfs_put_fs_root(root);
				3612	}
				3613
				/*
				 * Non-static entry point that forwards to free_fs_root(); see that
				 * function for what is released.
				 */
				void btrfs_free_fs_root(struct btrfs_root *root)
				{
					free_fs_root(root);
				}
				3618
				3619	int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
				3620	{
				3621	u64 root_objectid = 0;
				3622	struct btrfs_root *gang[8];
				3623	int i = 0;
				3624	int err = 0;
				3625	unsigned int ret = 0;
				3626	int index;
				3627
				3628	while (1) {
				3629	index = srcu_read_lock(&fs_info->subvol_srcu);
				3630	ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
				3631	(void **)gang, root_objectid,
				3632	ARRAY_SIZE(gang));
				3633	if (!ret) {
				3634	srcu_read_unlock(&fs_info->subvol_srcu, index);
				3635	break;
				3636	}
				3637	root_objectid = gang[ret - 1]->root_key.objectid + 1;
				3638
				3639	for (i = 0; i < ret; i++) {
				3640	/* Avoid to grab roots in dead_roots */
				3641	if (btrfs_root_refs(&gang[i]->root_item) == 0) {
				3642	gang[i] = NULL;
				3643	continue;
				3644	}
				3645	/* grab all the search result for later use */
				3646	gang[i] = btrfs_grab_fs_root(gang[i]);
				3647	}
				3648	srcu_read_unlock(&fs_info->subvol_srcu, index);
				3649
				3650	for (i = 0; i < ret; i++) {
				3651	if (!gang[i])
				3652	continue;
				3653	root_objectid = gang[i]->root_key.objectid;
				3654	err = btrfs_orphan_cleanup(gang[i]);
				3655	if (err)
				3656	break;
				3657	btrfs_put_fs_root(gang[i]);
				3658	}
				3659	root_objectid++;
				3660	}
				3661
				3662	/* release the uncleaned roots due to error */
				3663	for (; i < ret; i++) {
				3664	if (gang[i])
				3665	btrfs_put_fs_root(gang[i]);
				3666	}
				3667	return err;
				3668	}
				3669
				3670	int btrfs_commit_super(struct btrfs_fs_info *fs_info)
				3671	{
				3672	struct btrfs_root *root = fs_info->tree_root;
				3673	struct btrfs_trans_handle *trans;
				3674
				3675	mutex_lock(&fs_info->cleaner_mutex);
				3676	btrfs_run_delayed_iputs(fs_info);
				3677	mutex_unlock(&fs_info->cleaner_mutex);
				3678	wake_up_process(fs_info->cleaner_kthread);
				3679
				3680	/* wait until ongoing cleanup work done */
				3681	down_write(&fs_info->cleanup_work_sem);
				3682	up_write(&fs_info->cleanup_work_sem);
				3683
				3684	trans = btrfs_join_transaction(root);
				3685	if (IS_ERR(trans))
				3686	return PTR_ERR(trans);
				3687	return btrfs_commit_transaction(trans);
				3688	}
				3689
				3690	void close_ctree(struct btrfs_fs_info *fs_info)
				3691	{
				3692	struct btrfs_root *root = fs_info->tree_root;
				3693	int ret;
				3694
				3695	set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
				3696	/*
				3697	* We don't want the cleaner to start new transactions, add more delayed
				3698	* iputs, etc. while we're closing. We can't use kthread_stop() yet
				3699	* because that frees the task_struct, and the transaction kthread might
				3700	* still try to wake up the cleaner.
				3701	*/
				3702	kthread_park(fs_info->cleaner_kthread);
				3703
				3704	/* wait for the qgroup rescan worker to stop */
				3705	btrfs_qgroup_wait_for_completion(fs_info, false);
				3706
				3707	/* wait for the uuid_scan task to finish */
				3708	down(&fs_info->uuid_tree_rescan_sem);
				3709	/* avoid complains from lockdep et al., set sem back to initial state */
				3710	up(&fs_info->uuid_tree_rescan_sem);
				3711
				3712	/* pause restriper - we want to resume on mount */
				3713	btrfs_pause_balance(fs_info);
				3714
				3715	btrfs_dev_replace_suspend_for_unmount(fs_info);
				3716
				3717	btrfs_scrub_cancel(fs_info);
				3718
				3719	/* wait for any defraggers to finish */
				3720	wait_event(fs_info->transaction_wait,
				3721	(atomic_read(&fs_info->defrag_running) == 0));
				3722
				3723	/* clear out the rbtree of defraggable inodes */
				3724	btrfs_cleanup_defrag_inodes(fs_info);
				3725
				3726	cancel_work_sync(&fs_info->async_reclaim_work);
				3727
				3728	if (!sb_rdonly(fs_info->sb)) {
				3729	/*
				3730	* The cleaner kthread is stopped, so do one final pass over
				3731	* unused block groups.
				3732	*/
				3733	btrfs_delete_unused_bgs(fs_info);
				3734
				3735	/*
				3736	* There might be existing delayed inode workers still running
				3737	* and holding an empty delayed inode item. We must wait for
				3738	* them to complete first because they can create a transaction.
				3739	* This happens when someone calls btrfs_balance_delayed_items()
				3740	* and then a transaction commit runs the same delayed nodes
				3741	* before any delayed worker has done something with the nodes.
				3742	* We must wait for any worker here and not at transaction
				3743	* commit time since that could cause a deadlock.
				3744	* This is a very rare case.
				3745	*/
				3746	btrfs_flush_workqueue(fs_info->delayed_workers);
				3747
				3748	ret = btrfs_commit_super(fs_info);
				3749	if (ret)
				3750	btrfs_err(fs_info, "commit super ret %d", ret);
				3751	}
				3752
				3753	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) \|\|
				3754	test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state))
				3755	btrfs_error_commit_super(fs_info);
				3756
				3757	kthread_stop(fs_info->transaction_kthread);
				3758	kthread_stop(fs_info->cleaner_kthread);
				3759
				3760	set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
				3761
				3762	btrfs_free_qgroup_config(fs_info);
				3763	ASSERT(list_empty(&fs_info->delalloc_roots));
				3764
				3765	if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
				3766	btrfs_info(fs_info, "at unmount delalloc count %lld",
				3767	percpu_counter_sum(&fs_info->delalloc_bytes));
				3768	}
				3769
				3770	btrfs_sysfs_remove_mounted(fs_info);
				3771	btrfs_sysfs_remove_fsid(fs_info->fs_devices);
				3772
				3773	btrfs_free_fs_roots(fs_info);
				3774
				3775	btrfs_put_block_group_cache(fs_info);
				3776
				3777	/*
				3778	* we must make sure there is not any read request to
				3779	* submit after we stopping all workers.
				3780	*/
				3781	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
				3782	btrfs_stop_all_workers(fs_info);
				3783
				3784	clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
				3785	free_root_pointers(fs_info, true);
				3786
				3787	/*
				3788	* We must free the block groups after dropping the fs_roots as we could
				3789	* have had an IO error and have left over tree log blocks that aren't
				3790	* cleaned up until the fs roots are freed. This makes the block group
				3791	* accounting appear to be wrong because there's pending reserved bytes,
				3792	* so make sure we do the block group cleanup afterwards.
				3793	*/
				3794	btrfs_free_block_groups(fs_info);
				3795
				3796	iput(fs_info->btree_inode);
				3797
				3798	#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
				3799	if (btrfs_test_opt(fs_info, CHECK_INTEGRITY))
				3800	btrfsic_unmount(fs_info->fs_devices);
				3801	#endif
				3802
				3803	btrfs_close_devices(fs_info->fs_devices);
				3804	btrfs_mapping_tree_free(&fs_info->mapping_tree);
				3805
				3806	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
				3807	percpu_counter_destroy(&fs_info->delalloc_bytes);
				3808	percpu_counter_destroy(&fs_info->bio_counter);
				3809	cleanup_srcu_struct(&fs_info->subvol_srcu);
				3810
				3811	btrfs_free_stripe_hash_table(fs_info);
				3812
				3813	__btrfs_free_block_rsv(root->orphan_block_rsv);
				3814	root->orphan_block_rsv = NULL;
				3815
				3816	while (!list_empty(&fs_info->pinned_chunks)) {
				3817	struct extent_map *em;
				3818
				3819	em = list_first_entry(&fs_info->pinned_chunks,
				3820	struct extent_map, list);
				3821	list_del_init(&em->list);
				3822	free_extent_map(em);
				3823	}
				3824	}
				3825
				3826	int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
				3827	int atomic)
				3828	{
				3829	int ret;
				3830	struct inode *btree_inode = buf->pages[0]->mapping->host;
				3831
				3832	ret = extent_buffer_uptodate(buf);
				3833	if (!ret)
				3834	return ret;
				3835
				3836	ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
				3837	parent_transid, atomic);
				3838	if (ret == -EAGAIN)
				3839	return ret;
				3840	return !ret;
				3841	}
				3842
				3843	void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
				3844	{
				3845	struct btrfs_fs_info *fs_info;
				3846	struct btrfs_root *root;
				3847	u64 transid = btrfs_header_generation(buf);
				3848	int was_dirty;
				3849
				3850	#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
				3851	/*
				3852	* This is a fast path so only do this check if we have sanity tests
				3853	* enabled. Normal people shouldn't be marking dummy buffers as dirty
				3854	* outside of the sanity tests.
				3855	*/
				3856	if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &buf->bflags)))
				3857	return;
				3858	#endif
				3859	root = BTRFS_I(buf->pages[0]->mapping->host)->root;
				3860	fs_info = root->fs_info;
				3861	btrfs_assert_tree_locked(buf);
				3862	if (transid != fs_info->generation)
				3863	WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
				3864	buf->start, transid, fs_info->generation);
				3865	was_dirty = set_extent_buffer_dirty(buf);
				3866	if (!was_dirty)
				3867	percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
				3868	buf->len,
				3869	fs_info->dirty_metadata_batch);
				3870	#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
				3871	/*
				3872	* Since btrfs_mark_buffer_dirty() can be called with item pointer set
				3873	* but item data not updated.
				3874	* So here we should only check item pointers, not item data.
				3875	*/
				3876	if (btrfs_header_level(buf) == 0 &&
				3877	btrfs_check_leaf_relaxed(root, buf)) {
				3878	btrfs_print_leaf(buf);
				3879	ASSERT(0);
				3880	}
				3881	#endif
				3882	}
				3883
				3884	static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
				3885	int flush_delayed)
				3886	{
				3887	/*
				3888	* looks as though older kernels can get into trouble with
				3889	* this code, they end up stuck in balance_dirty_pages forever
				3890	*/
				3891	int ret;
				3892
				3893	if (current->flags & PF_MEMALLOC)
				3894	return;
				3895
				3896	if (flush_delayed)
				3897	btrfs_balance_delayed_items(fs_info);
				3898
				3899	ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
				3900	BTRFS_DIRTY_METADATA_THRESH,
				3901	fs_info->dirty_metadata_batch);
				3902	if (ret > 0) {
				3903	balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping);
				3904	}
				3905	}
				3906
				3907	void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info)
				3908	{
				3909	__btrfs_btree_balance_dirty(fs_info, 1);
				3910	}
				3911
				3912	void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
				3913	{
				3914	__btrfs_btree_balance_dirty(fs_info, 0);
				3915	}
				3916
				3917	int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
				3918	{
				3919	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
				3920	struct btrfs_fs_info *fs_info = root->fs_info;
				3921
				3922	return btree_read_extent_buffer_pages(fs_info, buf, parent_transid);
				3923	}
				3924
				3925	static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info)
				3926	{
				3927	struct btrfs_super_block *sb = fs_info->super_copy;
				3928	u64 nodesize = btrfs_super_nodesize(sb);
				3929	u64 sectorsize = btrfs_super_sectorsize(sb);
				3930	int ret = 0;
				3931
				3932	if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
				3933	btrfs_err(fs_info, "no valid FS found");
				3934	ret = -EINVAL;
				3935	}
				3936	if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
				3937	btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
				3938	btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
				3939	ret = -EINVAL;
				3940	}
				3941	if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
				3942	btrfs_err(fs_info, "tree_root level too big: %d >= %d",
				3943	btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
				3944	ret = -EINVAL;
				3945	}
				3946	if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
				3947	btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
				3948	btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
				3949	ret = -EINVAL;
				3950	}
				3951	if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
				3952	btrfs_err(fs_info, "log_root level too big: %d >= %d",
				3953	btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
				3954	ret = -EINVAL;
				3955	}
				3956
				3957	/*
				3958	* Check sectorsize and nodesize first, other check will need it.
				3959	* Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
				3960	*/
				3961	if (!is_power_of_2(sectorsize) \|\| sectorsize < 4096 \|\|
				3962	sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
				3963	btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
				3964	ret = -EINVAL;
				3965	}
				3966	/* Only PAGE SIZE is supported yet */
				3967	if (sectorsize != PAGE_SIZE) {
				3968	btrfs_err(fs_info,
				3969	"sectorsize %llu not supported yet, only support %lu",
				3970	sectorsize, PAGE_SIZE);
				3971	ret = -EINVAL;
				3972	}
				3973	if (!is_power_of_2(nodesize) \|\| nodesize < sectorsize \|\|
				3974	nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
				3975	btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
				3976	ret = -EINVAL;
				3977	}
				3978	if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
				3979	btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
				3980	le32_to_cpu(sb->__unused_leafsize), nodesize);
				3981	ret = -EINVAL;
				3982	}
				3983
				3984	/* Root alignment check */
				3985	if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
				3986	btrfs_warn(fs_info, "tree_root block unaligned: %llu",
				3987	btrfs_super_root(sb));
				3988	ret = -EINVAL;
				3989	}
				3990	if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
				3991	btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
				3992	btrfs_super_chunk_root(sb));
				3993	ret = -EINVAL;
				3994	}
				3995	if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
				3996	btrfs_warn(fs_info, "log_root block unaligned: %llu",
				3997	btrfs_super_log_root(sb));
				3998	ret = -EINVAL;
				3999	}
				4000
				4001	if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) {
				4002	btrfs_err(fs_info,
				4003	"dev_item UUID does not match fsid: %pU != %pU",
				4004	fs_info->fsid, sb->dev_item.fsid);
				4005	ret = -EINVAL;
				4006	}
				4007
				4008	/*
				4009	* Hint to catch really bogus numbers, bitflips or so, more exact checks are
				4010	* done later
				4011	*/
				4012	if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
				4013	btrfs_err(fs_info, "bytes_used is too small %llu",
				4014	btrfs_super_bytes_used(sb));
				4015	ret = -EINVAL;
				4016	}
				4017	if (!is_power_of_2(btrfs_super_stripesize(sb))) {
				4018	btrfs_err(fs_info, "invalid stripesize %u",
				4019	btrfs_super_stripesize(sb));
				4020	ret = -EINVAL;
				4021	}
				4022	if (btrfs_super_num_devices(sb) > (1UL << 31))
				4023	btrfs_warn(fs_info, "suspicious number of devices: %llu",
				4024	btrfs_super_num_devices(sb));
				4025	if (btrfs_super_num_devices(sb) == 0) {
				4026	btrfs_err(fs_info, "number of devices is 0");
				4027	ret = -EINVAL;
				4028	}
				4029
				4030	if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) {
				4031	btrfs_err(fs_info, "super offset mismatch %llu != %u",
				4032	btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
				4033	ret = -EINVAL;
				4034	}
				4035
				4036	/*
				4037	* Obvious sys_chunk_array corruptions, it must hold at least one key
				4038	* and one chunk
				4039	*/
				4040	if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
				4041	btrfs_err(fs_info, "system chunk array too big %u > %u",
				4042	btrfs_super_sys_array_size(sb),
				4043	BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
				4044	ret = -EINVAL;
				4045	}
				4046	if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
				4047	+ sizeof(struct btrfs_chunk)) {
				4048	btrfs_err(fs_info, "system chunk array too small %u < %zu",
				4049	btrfs_super_sys_array_size(sb),
				4050	sizeof(struct btrfs_disk_key)
				4051	+ sizeof(struct btrfs_chunk));
				4052	ret = -EINVAL;
				4053	}
				4054
				4055	/*
				4056	* The generation is a global counter, we'll trust it more than the others
				4057	* but it's still possible that it's the one that's wrong.
				4058	*/
				4059	if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
				4060	btrfs_warn(fs_info,
				4061	"suspicious: generation < chunk_root_generation: %llu < %llu",
				4062	btrfs_super_generation(sb),
				4063	btrfs_super_chunk_root_generation(sb));
				4064	if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
				4065	&& btrfs_super_cache_generation(sb) != (u64)-1)
				4066	btrfs_warn(fs_info,
				4067	"suspicious: generation < cache_generation: %llu < %llu",
				4068	btrfs_super_generation(sb),
				4069	btrfs_super_cache_generation(sb));
				4070
				4071	return ret;
				4072	}
				4073
				4074	static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
				4075	{
				4076	/* cleanup FS via transaction */
				4077	btrfs_cleanup_transaction(fs_info);
				4078
				4079	mutex_lock(&fs_info->cleaner_mutex);
				4080	btrfs_run_delayed_iputs(fs_info);
				4081	mutex_unlock(&fs_info->cleaner_mutex);
				4082
				4083	down_write(&fs_info->cleanup_work_sem);
				4084	up_write(&fs_info->cleanup_work_sem);
				4085	}
				4086
				4087	static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
				4088	{
				4089	struct btrfs_ordered_extent *ordered;
				4090
				4091	spin_lock(&root->ordered_extent_lock);
				4092	/*
				4093	* This will just short circuit the ordered completion stuff which will
				4094	* make sure the ordered extent gets properly cleaned up.
				4095	*/
				4096	list_for_each_entry(ordered, &root->ordered_extents,
				4097	root_extent_list)
				4098	set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
				4099	spin_unlock(&root->ordered_extent_lock);
				4100	}
				4101
				4102	static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
				4103	{
				4104	struct btrfs_root *root;
				4105	struct list_head splice;
				4106
				4107	INIT_LIST_HEAD(&splice);
				4108
				4109	spin_lock(&fs_info->ordered_root_lock);
				4110	list_splice_init(&fs_info->ordered_roots, &splice);
				4111	while (!list_empty(&splice)) {
				4112	root = list_first_entry(&splice, struct btrfs_root,
				4113	ordered_root);
				4114	list_move_tail(&root->ordered_root,
				4115	&fs_info->ordered_roots);
				4116
				4117	spin_unlock(&fs_info->ordered_root_lock);
				4118	btrfs_destroy_ordered_extents(root);
				4119
				4120	cond_resched();
				4121	spin_lock(&fs_info->ordered_root_lock);
				4122	}
				4123	spin_unlock(&fs_info->ordered_root_lock);
				4124
				4125	/*
				4126	* We need this here because if we've been flipped read-only we won't
				4127	* get sync() from the umount, so we need to make sure any ordered
				4128	* extents that haven't had their dirty pages IO start writeout yet
				4129	* actually get run and error out properly.
				4130	*/
				4131	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
				4132	}
				4133
				4134	static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
				4135	struct btrfs_fs_info *fs_info)
				4136	{
				4137	struct rb_node *node;
				4138	struct btrfs_delayed_ref_root *delayed_refs;
				4139	struct btrfs_delayed_ref_node *ref;
				4140	int ret = 0;
				4141
				4142	delayed_refs = &trans->delayed_refs;
				4143
				4144	spin_lock(&delayed_refs->lock);
				4145	if (atomic_read(&delayed_refs->num_entries) == 0) {
				4146	spin_unlock(&delayed_refs->lock);
				4147	btrfs_info(fs_info, "delayed_refs has NO entry");
				4148	return ret;
				4149	}
				4150
				4151	while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
				4152	struct btrfs_delayed_ref_head *head;
				4153	struct btrfs_delayed_ref_node *tmp;
				4154	bool pin_bytes = false;
				4155
				4156	head = rb_entry(node, struct btrfs_delayed_ref_head,
				4157	href_node);
				4158	if (!mutex_trylock(&head->mutex)) {
				4159	refcount_inc(&head->node.refs);
				4160	spin_unlock(&delayed_refs->lock);
				4161
				4162	mutex_lock(&head->mutex);
				4163	mutex_unlock(&head->mutex);
				4164	btrfs_put_delayed_ref(&head->node);
				4165	spin_lock(&delayed_refs->lock);
				4166	continue;
				4167	}
				4168	spin_lock(&head->lock);
				4169	list_for_each_entry_safe_reverse(ref, tmp, &head->ref_list,
				4170	list) {
				4171	ref->in_tree = 0;
				4172	list_del(&ref->list);
				4173	if (!list_empty(&ref->add_list))
				4174	list_del(&ref->add_list);
				4175	atomic_dec(&delayed_refs->num_entries);
				4176	btrfs_put_delayed_ref(ref);
				4177	}
				4178	if (head->must_insert_reserved)
				4179	pin_bytes = true;
				4180	btrfs_free_delayed_extent_op(head->extent_op);
				4181	delayed_refs->num_heads--;
				4182	if (head->processing == 0)
				4183	delayed_refs->num_heads_ready--;
				4184	atomic_dec(&delayed_refs->num_entries);
				4185	head->node.in_tree = 0;
				4186	rb_erase(&head->href_node, &delayed_refs->href_root);
				4187	spin_unlock(&head->lock);
				4188	spin_unlock(&delayed_refs->lock);
				4189	mutex_unlock(&head->mutex);
				4190
				4191	if (pin_bytes)
				4192	btrfs_pin_extent(fs_info, head->node.bytenr,
				4193	head->node.num_bytes, 1);
				4194	btrfs_put_delayed_ref(&head->node);
				4195	cond_resched();
				4196	spin_lock(&delayed_refs->lock);
				4197	}
				4198
				4199	spin_unlock(&delayed_refs->lock);
				4200
				4201	return ret;
				4202	}
				4203
				4204	static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
				4205	{
				4206	struct btrfs_inode *btrfs_inode;
				4207	struct list_head splice;
				4208
				4209	INIT_LIST_HEAD(&splice);
				4210
				4211	spin_lock(&root->delalloc_lock);
				4212	list_splice_init(&root->delalloc_inodes, &splice);
				4213
				4214	while (!list_empty(&splice)) {
				4215	struct inode *inode = NULL;
				4216	btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
				4217	delalloc_inodes);
				4218	__btrfs_del_delalloc_inode(root, btrfs_inode);
				4219	spin_unlock(&root->delalloc_lock);
				4220
				4221	/*
				4222	* Make sure we get a live inode and that it'll not disappear
				4223	* meanwhile.
				4224	*/
				4225	inode = igrab(&btrfs_inode->vfs_inode);
				4226	if (inode) {
				4227	invalidate_inode_pages2(inode->i_mapping);
				4228	iput(inode);
				4229	}
				4230	spin_lock(&root->delalloc_lock);
				4231	}
				4232	spin_unlock(&root->delalloc_lock);
				4233	}
				4234
				4235	static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
				4236	{
				4237	struct btrfs_root *root;
				4238	struct list_head splice;
				4239
				4240	INIT_LIST_HEAD(&splice);
				4241
				4242	spin_lock(&fs_info->delalloc_root_lock);
				4243	list_splice_init(&fs_info->delalloc_roots, &splice);
				4244	while (!list_empty(&splice)) {
				4245	root = list_first_entry(&splice, struct btrfs_root,
				4246	delalloc_root);
				4247	root = btrfs_grab_fs_root(root);
				4248	BUG_ON(!root);
				4249	spin_unlock(&fs_info->delalloc_root_lock);
				4250
				4251	btrfs_destroy_delalloc_inodes(root);
				4252	btrfs_put_fs_root(root);
				4253
				4254	spin_lock(&fs_info->delalloc_root_lock);
				4255	}
				4256	spin_unlock(&fs_info->delalloc_root_lock);
				4257	}
				4258
				4259	static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
				4260	struct extent_io_tree *dirty_pages,
				4261	int mark)
				4262	{
				4263	int ret;
				4264	struct extent_buffer *eb;
				4265	u64 start = 0;
				4266	u64 end;
				4267
				4268	while (1) {
				4269	ret = find_first_extent_bit(dirty_pages, start, &start, &end,
				4270	mark, NULL);
				4271	if (ret)
				4272	break;
				4273
				4274	clear_extent_bits(dirty_pages, start, end, mark);
				4275	while (start <= end) {
				4276	eb = find_extent_buffer(fs_info, start);
				4277	start += fs_info->nodesize;
				4278	if (!eb)
				4279	continue;
				4280	wait_on_extent_buffer_writeback(eb);
				4281
				4282	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
				4283	&eb->bflags))
				4284	clear_extent_buffer_dirty(eb);
				4285	free_extent_buffer_stale(eb);
				4286	}
				4287	}
				4288
				4289	return ret;
				4290	}
				4291
				4292	static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
				4293	struct extent_io_tree *pinned_extents)
				4294	{
				4295	struct extent_io_tree *unpin;
				4296	u64 start;
				4297	u64 end;
				4298	int ret;
				4299	bool loop = true;
				4300
				4301	unpin = pinned_extents;
				4302	again:
				4303	while (1) {
				4304	/*
				4305	* The btrfs_finish_extent_commit() may get the same range as
				4306	* ours between find_first_extent_bit and clear_extent_dirty.
				4307	* Hence, hold the unused_bg_unpin_mutex to avoid double unpin
				4308	* the same extent range.
				4309	*/
				4310	mutex_lock(&fs_info->unused_bg_unpin_mutex);
				4311	ret = find_first_extent_bit(unpin, 0, &start, &end,
				4312	EXTENT_DIRTY, NULL);
				4313	if (ret) {
				4314	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
				4315	break;
				4316	}
				4317
				4318	clear_extent_dirty(unpin, start, end);
				4319	btrfs_error_unpin_extent_range(fs_info, start, end);
				4320	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
				4321	cond_resched();
				4322	}
				4323
				4324	if (loop) {
				4325	if (unpin == &fs_info->freed_extents[0])
				4326	unpin = &fs_info->freed_extents[1];
				4327	else
				4328	unpin = &fs_info->freed_extents[0];
				4329	loop = false;
				4330	goto again;
				4331	}
				4332
				4333	return 0;
				4334	}
				4335
				4336	static void btrfs_cleanup_bg_io(struct btrfs_block_group_cache *cache)
				4337	{
				4338	struct inode *inode;
				4339
				4340	inode = cache->io_ctl.inode;
				4341	if (inode) {
				4342	invalidate_inode_pages2(inode->i_mapping);
				4343	BTRFS_I(inode)->generation = 0;
				4344	cache->io_ctl.inode = NULL;
				4345	iput(inode);
				4346	}
				4347	ASSERT(cache->io_ctl.pages == NULL);
				4348	btrfs_put_block_group(cache);
				4349	}
				4350
				4351	void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
				4352	struct btrfs_fs_info *fs_info)
				4353	{
				4354	struct btrfs_block_group_cache *cache;
				4355
				4356	spin_lock(&cur_trans->dirty_bgs_lock);
				4357	while (!list_empty(&cur_trans->dirty_bgs)) {
				4358	cache = list_first_entry(&cur_trans->dirty_bgs,
				4359	struct btrfs_block_group_cache,
				4360	dirty_list);
				4361	if (!cache) {
				4362	btrfs_err(fs_info, "orphan block group dirty_bgs list");
				4363	spin_unlock(&cur_trans->dirty_bgs_lock);
				4364	return;
				4365	}
				4366
				4367	if (!list_empty(&cache->io_list)) {
				4368	spin_unlock(&cur_trans->dirty_bgs_lock);
				4369	list_del_init(&cache->io_list);
				4370	btrfs_cleanup_bg_io(cache);
				4371	spin_lock(&cur_trans->dirty_bgs_lock);
				4372	}
				4373
				4374	list_del_init(&cache->dirty_list);
				4375	spin_lock(&cache->lock);
				4376	cache->disk_cache_state = BTRFS_DC_ERROR;
				4377	spin_unlock(&cache->lock);
				4378
				4379	spin_unlock(&cur_trans->dirty_bgs_lock);
				4380	btrfs_put_block_group(cache);
				4381	spin_lock(&cur_trans->dirty_bgs_lock);
				4382	}
				4383	spin_unlock(&cur_trans->dirty_bgs_lock);
				4384
				4385	while (!list_empty(&cur_trans->io_bgs)) {
				4386	cache = list_first_entry(&cur_trans->io_bgs,
				4387	struct btrfs_block_group_cache,
				4388	io_list);
				4389	if (!cache) {
				4390	btrfs_err(fs_info, "orphan block group on io_bgs list");
				4391	return;
				4392	}
				4393
				4394	list_del_init(&cache->io_list);
				4395	spin_lock(&cache->lock);
				4396	cache->disk_cache_state = BTRFS_DC_ERROR;
				4397	spin_unlock(&cache->lock);
				4398	btrfs_cleanup_bg_io(cache);
				4399	}
				4400	}
				4401
				4402	void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
				4403	struct btrfs_fs_info *fs_info)
				4404	{
				4405	btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
				4406	ASSERT(list_empty(&cur_trans->dirty_bgs));
				4407	ASSERT(list_empty(&cur_trans->io_bgs));
				4408
				4409	btrfs_destroy_delayed_refs(cur_trans, fs_info);
				4410
				4411	cur_trans->state = TRANS_STATE_COMMIT_START;
				4412	wake_up(&fs_info->transaction_blocked_wait);
				4413
				4414	cur_trans->state = TRANS_STATE_UNBLOCKED;
				4415	wake_up(&fs_info->transaction_wait);
				4416
				4417	btrfs_destroy_delayed_inodes(fs_info);
				4418
				4419	btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
				4420	EXTENT_DIRTY);
				4421	btrfs_destroy_pinned_extent(fs_info,
				4422	fs_info->pinned_extents);
				4423
				4424	cur_trans->state =TRANS_STATE_COMPLETED;
				4425	wake_up(&cur_trans->commit_wait);
				4426	}
				4427
				4428	static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
				4429	{
				4430	struct btrfs_transaction *t;
				4431
				4432	mutex_lock(&fs_info->transaction_kthread_mutex);
				4433
				4434	spin_lock(&fs_info->trans_lock);
				4435	while (!list_empty(&fs_info->trans_list)) {
				4436	t = list_first_entry(&fs_info->trans_list,
				4437	struct btrfs_transaction, list);
				4438	if (t->state >= TRANS_STATE_COMMIT_START) {
				4439	refcount_inc(&t->use_count);
				4440	spin_unlock(&fs_info->trans_lock);
				4441	btrfs_wait_for_commit(fs_info, t->transid);
				4442	btrfs_put_transaction(t);
				4443	spin_lock(&fs_info->trans_lock);
				4444	continue;
				4445	}
				4446	if (t == fs_info->running_transaction) {
				4447	t->state = TRANS_STATE_COMMIT_DOING;
				4448	spin_unlock(&fs_info->trans_lock);
				4449	/*
				4450	* We wait for 0 num_writers since we don't hold a trans
				4451	* handle open currently for this transaction.
				4452	*/
				4453	wait_event(t->writer_wait,
				4454	atomic_read(&t->num_writers) == 0);
				4455	} else {
				4456	spin_unlock(&fs_info->trans_lock);
				4457	}
				4458	btrfs_cleanup_one_transaction(t, fs_info);
				4459
				4460	spin_lock(&fs_info->trans_lock);
				4461	if (t == fs_info->running_transaction)
				4462	fs_info->running_transaction = NULL;
				4463	list_del_init(&t->list);
				4464	spin_unlock(&fs_info->trans_lock);
				4465
				4466	btrfs_put_transaction(t);
				4467	trace_btrfs_transaction_commit(fs_info->tree_root);
				4468	spin_lock(&fs_info->trans_lock);
				4469	}
				4470	spin_unlock(&fs_info->trans_lock);
				4471	btrfs_destroy_all_ordered_extents(fs_info);
				4472	btrfs_destroy_delayed_inodes(fs_info);
				4473	btrfs_assert_delayed_root_empty(fs_info);
				4474	btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents);
				4475	btrfs_destroy_all_delalloc_inodes(fs_info);
				4476	mutex_unlock(&fs_info->transaction_kthread_mutex);
				4477
				4478	return 0;
				4479	}
				4480
				4481	static struct btrfs_fs_info btree_fs_info(void private_data)
				4482	{
				4483	struct inode *inode = private_data;
				4484	return btrfs_sb(inode->i_sb);
				4485	}
				4486
				4487	static const struct extent_io_ops btree_extent_io_ops = {
				4488	/* mandatory callbacks */
				4489	.submit_bio_hook = btree_submit_bio_hook,
				4490	.readpage_end_io_hook = btree_readpage_end_io_hook,
				4491	/* note we're sharing with inode.c for the merge bio hook */
				4492	.merge_bio_hook = btrfs_merge_bio_hook,
				4493	.readpage_io_failed_hook = btree_io_failed_hook,
				4494	.set_range_writeback = btrfs_set_range_writeback,
				4495	.tree_fs_info = btree_fs_info,
				4496
				4497	/* optional callbacks */
				4498	};