Blame - src/kernel/linux/v4.19/fs/btrfs/disk-io.c - T800

blob: e12c37f457e056aa15b2a5c61d83b1d0f263d4a9 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Copyright (C) 2007 Oracle. All rights reserved.
				4	*/
				5
				6	#include <linux/fs.h>
				7	#include <linux/blkdev.h>
				8	#include <linux/radix-tree.h>
				9	#include <linux/writeback.h>
				10	#include <linux/buffer_head.h>
				11	#include <linux/workqueue.h>
				12	#include <linux/kthread.h>
				13	#include <linux/slab.h>
				14	#include <linux/migrate.h>
				15	#include <linux/ratelimit.h>
				16	#include <linux/uuid.h>
				17	#include <linux/semaphore.h>
				18	#include <linux/error-injection.h>
				19	#include <linux/crc32c.h>
				20	#include <linux/sched/mm.h>
				21	#include <asm/unaligned.h>
				22	#include "ctree.h"
				23	#include "disk-io.h"
				24	#include "transaction.h"
				25	#include "btrfs_inode.h"
				26	#include "volumes.h"
				27	#include "print-tree.h"
				28	#include "locking.h"
				29	#include "tree-log.h"
				30	#include "free-space-cache.h"
				31	#include "free-space-tree.h"
				32	#include "inode-map.h"
				33	#include "check-integrity.h"
				34	#include "rcu-string.h"
				35	#include "dev-replace.h"
				36	#include "raid56.h"
				37	#include "sysfs.h"
				38	#include "qgroup.h"
				39	#include "compression.h"
				40	#include "tree-checker.h"
				41	#include "ref-verify.h"
				42
				43	#ifdef CONFIG_X86
				44	#include <asm/cpufeature.h>
				45	#endif
				46
				47	#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN \|\
				48	BTRFS_HEADER_FLAG_RELOC \|\
				49	BTRFS_SUPER_FLAG_ERROR \|\
				50	BTRFS_SUPER_FLAG_SEEDING \|\
				51	BTRFS_SUPER_FLAG_METADUMP \|\
				52	BTRFS_SUPER_FLAG_METADUMP_V2)
				53
				54	static const struct extent_io_ops btree_extent_io_ops;
				55	static void end_workqueue_fn(struct btrfs_work *work);
				56	static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
				57	static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
				58	struct btrfs_fs_info *fs_info);
				59	static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
				60	static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
				61	struct extent_io_tree *dirty_pages,
				62	int mark);
				63	static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
				64	struct extent_io_tree *pinned_extents);
				65	static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
				66	static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);
				67
/*
 * btrfs_end_io_wq structs are used to do processing in task context when an IO
 * is complete.  This is used during reads to verify checksums, and it is used
 * by writes to insert metadata for new file extents after IO is complete.
 *
 * One instance is allocated per bio by btrfs_bio_wq_end_io(), which saves the
 * bio's original completion callback and private pointer here and redirects
 * the bio to end_workqueue_bio().
 */
struct btrfs_end_io_wq {
	struct bio *bio;		/* the bio being tracked */
	bio_end_io_t *end_io;		/* saved original bio->bi_end_io */
	void *private;			/* saved original bio->bi_private */
	struct btrfs_fs_info *info;	/* filesystem owning the work queues */
	blk_status_t status;		/* bio->bi_status captured at completion */
	enum btrfs_wq_endio_type metadata; /* selects which work queue is used */
	struct btrfs_work work;		/* work item queued by end_workqueue_bio() */
};
				82
				83	static struct kmem_cache *btrfs_end_io_wq_cache;
				84
				85	int __init btrfs_end_io_wq_init(void)
				86	{
				87	btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
				88	sizeof(struct btrfs_end_io_wq),
				89	0,
				90	SLAB_MEM_SPREAD,
				91	NULL);
				92	if (!btrfs_end_io_wq_cache)
				93	return -ENOMEM;
				94	return 0;
				95	}
				96
				97	void __cold btrfs_end_io_wq_exit(void)
				98	{
				99	kmem_cache_destroy(btrfs_end_io_wq_cache);
				100	}
				101
/*
 * async submit bios are used to offload expensive checksumming
 * onto the worker threads.  They checksum file and metadata bios
 * just before they are sent down the IO stack.
 *
 * Instances are allocated and filled in by btrfs_wq_submit_bio() and freed
 * by run_one_async_free().
 */
struct async_submit_bio {
	void *private_data;	/* opaque pointer handed back to the callbacks */
	struct bio *bio;	/* the bio to checksum and submit */
	/* callback invoked from run_one_async_start() */
	extent_submit_bio_start_t *submit_bio_start;
	int mirror_num;		/* forwarded to btrfs_submit_bio_done() */
	/*
	 * bio_offset is optional, can be used if the pages in the bio
	 * can't tell us where in the file the bio should go
	 */
	u64 bio_offset;
	struct btrfs_work work;	/* work item queued on fs_info->workers */
	blk_status_t status;	/* non-zero if submit_bio_start failed */
};
				120
				121	/*
				122	* Lockdep class keys for extent_buffer->lock's in this root. For a given
				123	* eb, the lockdep key is determined by the btrfs_root it belongs to and
				124	* the level the eb occupies in the tree.
				125	*
				126	* Different roots are used for different purposes and may nest inside each
				127	* other and they require separate keysets. As lockdep keys should be
				128	* static, assign keysets according to the purpose of the root as indicated
				129	* by btrfs_root->objectid. This ensures that all special purpose roots
				130	* have separate keysets.
				131	*
				132	* Lock-nesting across peer nodes is always done with the immediate parent
				133	* node locked thus preventing deadlock. As lockdep doesn't know this, use
				134	* subclass to avoid triggering lockdep warning in such cases.
				135	*
				136	* The key is set by the readpage_end_io_hook after the buffer has passed
				137	* csum validation but before the pages are unlocked. It is also set by
				138	* btrfs_init_new_buffer on freshly allocated blocks.
				139	*
				140	* We also add a check to make sure the highest level of the tree is the
				141	* same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code
				142	* needs update as well.
				143	*/
				144	#ifdef CONFIG_DEBUG_LOCK_ALLOC
				145	# if BTRFS_MAX_LEVEL != 8
				146	# error
				147	# endif
				148
				149	static struct btrfs_lockdep_keyset {
				150	u64 id; /* root objectid */
				151	const char name_stem; / lock name stem */
				152	char names[BTRFS_MAX_LEVEL + 1][20];
				153	struct lock_class_key keys[BTRFS_MAX_LEVEL + 1];
				154	} btrfs_lockdep_keysets[] = {
				155	{ .id = BTRFS_ROOT_TREE_OBJECTID, .name_stem = "root" },
				156	{ .id = BTRFS_EXTENT_TREE_OBJECTID, .name_stem = "extent" },
				157	{ .id = BTRFS_CHUNK_TREE_OBJECTID, .name_stem = "chunk" },
				158	{ .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" },
				159	{ .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" },
				160	{ .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" },
				161	{ .id = BTRFS_QUOTA_TREE_OBJECTID, .name_stem = "quota" },
				162	{ .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" },
				163	{ .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" },
				164	{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" },
				165	{ .id = BTRFS_UUID_TREE_OBJECTID, .name_stem = "uuid" },
				166	{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID, .name_stem = "free-space" },
				167	{ .id = 0, .name_stem = "tree" },
				168	};
				169
				170	void __init btrfs_init_lockdep(void)
				171	{
				172	int i, j;
				173
				174	/* initialize lockdep class names */
				175	for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) {
				176	struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i];
				177
				178	for (j = 0; j < ARRAY_SIZE(ks->names); j++)
				179	snprintf(ks->names[j], sizeof(ks->names[j]),
				180	"btrfs-%s-%02d", ks->name_stem, j);
				181	}
				182	}
				183
				184	void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
				185	int level)
				186	{
				187	struct btrfs_lockdep_keyset *ks;
				188
				189	BUG_ON(level >= ARRAY_SIZE(ks->keys));
				190
				191	/* find the matching keyset, id 0 is the default entry */
				192	for (ks = btrfs_lockdep_keysets; ks->id; ks++)
				193	if (ks->id == objectid)
				194	break;
				195
				196	lockdep_set_class_and_name(&eb->lock,
				197	&ks->keys[level], ks->names[level]);
				198	}
				199
				200	#endif
				201
				202	/*
				203	* extents on the btree inode are pretty simple, there's one extent
				204	* that covers the entire device
				205	*/
				206	struct extent_map btree_get_extent(struct btrfs_inode inode,
				207	struct page *page, size_t pg_offset, u64 start, u64 len,
				208	int create)
				209	{
				210	struct btrfs_fs_info *fs_info = inode->root->fs_info;
				211	struct extent_map_tree *em_tree = &inode->extent_tree;
				212	struct extent_map *em;
				213	int ret;
				214
				215	read_lock(&em_tree->lock);
				216	em = lookup_extent_mapping(em_tree, start, len);
				217	if (em) {
				218	em->bdev = fs_info->fs_devices->latest_bdev;
				219	read_unlock(&em_tree->lock);
				220	goto out;
				221	}
				222	read_unlock(&em_tree->lock);
				223
				224	em = alloc_extent_map();
				225	if (!em) {
				226	em = ERR_PTR(-ENOMEM);
				227	goto out;
				228	}
				229	em->start = 0;
				230	em->len = (u64)-1;
				231	em->block_len = (u64)-1;
				232	em->block_start = 0;
				233	em->bdev = fs_info->fs_devices->latest_bdev;
				234
				235	write_lock(&em_tree->lock);
				236	ret = add_extent_mapping(em_tree, em, 0);
				237	if (ret == -EEXIST) {
				238	free_extent_map(em);
				239	em = lookup_extent_mapping(em_tree, start, len);
				240	if (!em)
				241	em = ERR_PTR(-EIO);
				242	} else if (ret) {
				243	free_extent_map(em);
				244	em = ERR_PTR(ret);
				245	}
				246	write_unlock(&em_tree->lock);
				247
				248	out:
				249	return em;
				250	}
				251
				252	u32 btrfs_csum_data(const char *data, u32 seed, size_t len)
				253	{
				254	return crc32c(seed, data, len);
				255	}
				256
				257	void btrfs_csum_final(u32 crc, u8 *result)
				258	{
				259	put_unaligned_le32(~crc, result);
				260	}
				261
				262	/*
				263	* compute the csum for a btree block, and either verify it or write it
				264	* into the csum field of the block.
				265	*/
				266	static int csum_tree_block(struct btrfs_fs_info *fs_info,
				267	struct extent_buffer *buf,
				268	int verify)
				269	{
				270	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
				271	char result[BTRFS_CSUM_SIZE];
				272	unsigned long len;
				273	unsigned long cur_len;
				274	unsigned long offset = BTRFS_CSUM_SIZE;
				275	char *kaddr;
				276	unsigned long map_start;
				277	unsigned long map_len;
				278	int err;
				279	u32 crc = ~(u32)0;
				280
				281	len = buf->len - offset;
				282	while (len > 0) {
				283	err = map_private_extent_buffer(buf, offset, 32,
				284	&kaddr, &map_start, &map_len);
				285	if (err)
				286	return err;
				287	cur_len = min(len, map_len - (offset - map_start));
				288	crc = btrfs_csum_data(kaddr + offset - map_start,
				289	crc, cur_len);
				290	len -= cur_len;
				291	offset += cur_len;
				292	}
				293	memset(result, 0, BTRFS_CSUM_SIZE);
				294
				295	btrfs_csum_final(crc, result);
				296
				297	if (verify) {
				298	if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
				299	u32 val;
				300	u32 found = 0;
				301	memcpy(&found, result, csum_size);
				302
				303	read_extent_buffer(buf, &val, 0, csum_size);
				304	btrfs_warn_rl(fs_info,
				305	"%s checksum verify failed on %llu wanted %X found %X level %d",
				306	fs_info->sb->s_id, buf->start,
				307	val, found, btrfs_header_level(buf));
				308	return -EUCLEAN;
				309	}
				310	} else {
				311	write_extent_buffer(buf, result, 0, csum_size);
				312	}
				313
				314	return 0;
				315	}
				316
				317	/*
				318	* we can't consider a given block up to date unless the transid of the
				319	* block matches the transid in the parent node's pointer. This is how we
				320	* detect blocks that either didn't get written at all or got written
				321	* in the wrong place.
				322	*/
				323	static int verify_parent_transid(struct extent_io_tree *io_tree,
				324	struct extent_buffer *eb, u64 parent_transid,
				325	int atomic)
				326	{
				327	struct extent_state *cached_state = NULL;
				328	int ret;
				329	bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);
				330
				331	if (!parent_transid \|\| btrfs_header_generation(eb) == parent_transid)
				332	return 0;
				333
				334	if (atomic)
				335	return -EAGAIN;
				336
				337	if (need_lock) {
				338	btrfs_tree_read_lock(eb);
				339	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
				340	}
				341
				342	lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
				343	&cached_state);
				344	if (extent_buffer_uptodate(eb) &&
				345	btrfs_header_generation(eb) == parent_transid) {
				346	ret = 0;
				347	goto out;
				348	}
				349	btrfs_err_rl(eb->fs_info,
				350	"parent transid verify failed on %llu wanted %llu found %llu",
				351	eb->start,
				352	parent_transid, btrfs_header_generation(eb));
				353	ret = 1;
				354
				355	/*
				356	* Things reading via commit roots that don't have normal protection,
				357	* like send, can have a really old block in cache that may point at a
				358	* block that has been freed and re-allocated. So don't clear uptodate
				359	* if we find an eb that is under IO (dirty/writeback) because we could
				360	* end up reading in the stale data and then writing it back out and
				361	* making everybody very sad.
				362	*/
				363	if (!extent_buffer_under_io(eb))
				364	clear_extent_buffer_uptodate(eb);
				365	out:
				366	unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
				367	&cached_state);
				368	if (need_lock)
				369	btrfs_tree_read_unlock_blocking(eb);
				370	return ret;
				371	}
				372
				373	/*
				374	* Return 0 if the superblock checksum type matches the checksum value of that
				375	* algorithm. Pass the raw disk superblock data.
				376	*/
				377	static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
				378	char *raw_disk_sb)
				379	{
				380	struct btrfs_super_block *disk_sb =
				381	(struct btrfs_super_block *)raw_disk_sb;
				382	u16 csum_type = btrfs_super_csum_type(disk_sb);
				383	int ret = 0;
				384
				385	if (csum_type == BTRFS_CSUM_TYPE_CRC32) {
				386	u32 crc = ~(u32)0;
				387	char result[sizeof(crc)];
				388
				389	/*
				390	* The super_block structure does not span the whole
				391	* BTRFS_SUPER_INFO_SIZE range, we expect that the unused space
				392	* is filled with zeros and is included in the checksum.
				393	*/
				394	crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE,
				395	crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
				396	btrfs_csum_final(crc, result);
				397
				398	if (memcmp(raw_disk_sb, result, sizeof(result)))
				399	ret = 1;
				400	}
				401
				402	if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) {
				403	btrfs_err(fs_info, "unsupported checksum algorithm %u",
				404	csum_type);
				405	ret = 1;
				406	}
				407
				408	return ret;
				409	}
				410
				411	int btrfs_verify_level_key(struct btrfs_fs_info *fs_info,
				412	struct extent_buffer *eb, int level,
				413	struct btrfs_key *first_key, u64 parent_transid)
				414	{
				415	int found_level;
				416	struct btrfs_key found_key;
				417	int ret;
				418
				419	found_level = btrfs_header_level(eb);
				420	if (found_level != level) {
				421	#ifdef CONFIG_BTRFS_DEBUG
				422	WARN_ON(1);
				423	btrfs_err(fs_info,
				424	"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
				425	eb->start, level, found_level);
				426	#endif
				427	return -EIO;
				428	}
				429
				430	if (!first_key)
				431	return 0;
				432
				433	/*
				434	* For live tree block (new tree blocks in current transaction),
				435	* we need proper lock context to avoid race, which is impossible here.
				436	* So we only checks tree blocks which is read from disk, whose
				437	* generation <= fs_info->last_trans_committed.
				438	*/
				439	if (btrfs_header_generation(eb) > fs_info->last_trans_committed)
				440	return 0;
				441	if (found_level)
				442	btrfs_node_key_to_cpu(eb, &found_key, 0);
				443	else
				444	btrfs_item_key_to_cpu(eb, &found_key, 0);
				445	ret = btrfs_comp_cpu_keys(first_key, &found_key);
				446
				447	#ifdef CONFIG_BTRFS_DEBUG
				448	if (ret) {
				449	WARN_ON(1);
				450	btrfs_err(fs_info,
				451	"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
				452	eb->start, parent_transid, first_key->objectid,
				453	first_key->type, first_key->offset,
				454	found_key.objectid, found_key.type,
				455	found_key.offset);
				456	}
				457	#endif
				458	return ret;
				459	}
				460
				461	/*
				462	* helper to read a given tree block, doing retries as required when
				463	* the checksums don't match and we have alternate mirrors to try.
				464	*
				465	* @parent_transid: expected transid, skip check if 0
				466	* @level: expected level, mandatory check
				467	* @first_key: expected key of first slot, skip check if NULL
				468	*/
				469	static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info,
				470	struct extent_buffer *eb,
				471	u64 parent_transid, int level,
				472	struct btrfs_key *first_key)
				473	{
				474	struct extent_io_tree *io_tree;
				475	int failed = 0;
				476	int ret;
				477	int num_copies = 0;
				478	int mirror_num = 0;
				479	int failed_mirror = 0;
				480
				481	io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
				482	while (1) {
				483	clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
				484	ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE,
				485	mirror_num);
				486	if (!ret) {
				487	if (verify_parent_transid(io_tree, eb,
				488	parent_transid, 0))
				489	ret = -EIO;
				490	else if (btrfs_verify_level_key(fs_info, eb, level,
				491	first_key, parent_transid))
				492	ret = -EUCLEAN;
				493	else
				494	break;
				495	}
				496
				497	num_copies = btrfs_num_copies(fs_info,
				498	eb->start, eb->len);
				499	if (num_copies == 1)
				500	break;
				501
				502	if (!failed_mirror) {
				503	failed = 1;
				504	failed_mirror = eb->read_mirror;
				505	}
				506
				507	mirror_num++;
				508	if (mirror_num == failed_mirror)
				509	mirror_num++;
				510
				511	if (mirror_num > num_copies)
				512	break;
				513	}
				514
				515	if (failed && !ret && failed_mirror)
				516	repair_eb_io_failure(fs_info, eb, failed_mirror);
				517
				518	return ret;
				519	}
				520
				521	/*
				522	* checksum a dirty tree block before IO. This has extra checks to make sure
				523	* we only fill in the checksum field in the first page of a multi-page block
				524	*/
				525
				526	static int csum_dirty_buffer(struct btrfs_fs_info fs_info, struct page page)
				527	{
				528	u64 start = page_offset(page);
				529	u64 found_start;
				530	struct extent_buffer *eb;
				531
				532	eb = (struct extent_buffer *)page->private;
				533	if (page != eb->pages[0])
				534	return 0;
				535
				536	found_start = btrfs_header_bytenr(eb);
				537	/*
				538	* Please do not consolidate these warnings into a single if.
				539	* It is useful to know what went wrong.
				540	*/
				541	if (WARN_ON(found_start != start))
				542	return -EUCLEAN;
				543	if (WARN_ON(!PageUptodate(page)))
				544	return -EUCLEAN;
				545
				546	ASSERT(memcmp_extent_buffer(eb, fs_info->fsid,
				547	btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0);
				548
				549	return csum_tree_block(fs_info, eb, 0);
				550	}
				551
				552	static int check_tree_block_fsid(struct btrfs_fs_info *fs_info,
				553	struct extent_buffer *eb)
				554	{
				555	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				556	u8 fsid[BTRFS_FSID_SIZE];
				557	int ret = 1;
				558
				559	read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);
				560	while (fs_devices) {
				561	if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
				562	ret = 0;
				563	break;
				564	}
				565	fs_devices = fs_devices->seed;
				566	}
				567	return ret;
				568	}
				569
				570	static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
				571	u64 phy_offset, struct page *page,
				572	u64 start, u64 end, int mirror)
				573	{
				574	u64 found_start;
				575	int found_level;
				576	struct extent_buffer *eb;
				577	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
				578	struct btrfs_fs_info *fs_info = root->fs_info;
				579	int ret = 0;
				580	int reads_done;
				581
				582	if (!page->private)
				583	goto out;
				584
				585	eb = (struct extent_buffer *)page->private;
				586
				587	/* the pending IO might have been the only thing that kept this buffer
				588	* in memory. Make sure we have a ref for all this other checks
				589	*/
				590	extent_buffer_get(eb);
				591
				592	reads_done = atomic_dec_and_test(&eb->io_pages);
				593	if (!reads_done)
				594	goto err;
				595
				596	eb->read_mirror = mirror;
				597	if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
				598	ret = -EIO;
				599	goto err;
				600	}
				601
				602	found_start = btrfs_header_bytenr(eb);
				603	if (found_start != eb->start) {
				604	btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu",
				605	eb->start, found_start);
				606	ret = -EIO;
				607	goto err;
				608	}
				609	if (check_tree_block_fsid(fs_info, eb)) {
				610	btrfs_err_rl(fs_info, "bad fsid on block %llu",
				611	eb->start);
				612	ret = -EIO;
				613	goto err;
				614	}
				615	found_level = btrfs_header_level(eb);
				616	if (found_level >= BTRFS_MAX_LEVEL) {
				617	btrfs_err(fs_info, "bad tree block level %d on %llu",
				618	(int)btrfs_header_level(eb), eb->start);
				619	ret = -EIO;
				620	goto err;
				621	}
				622
				623	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
				624	eb, found_level);
				625
				626	ret = csum_tree_block(fs_info, eb, 1);
				627	if (ret)
				628	goto err;
				629
				630	/*
				631	* If this is a leaf block and it is corrupt, set the corrupt bit so
				632	* that we don't try and read the other copies of this block, just
				633	* return -EIO.
				634	*/
				635	if (found_level == 0 && btrfs_check_leaf_full(fs_info, eb)) {
				636	set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
				637	ret = -EIO;
				638	}
				639
				640	if (found_level > 0 && btrfs_check_node(fs_info, eb))
				641	ret = -EIO;
				642
				643	if (!ret)
				644	set_extent_buffer_uptodate(eb);
				645	err:
				646	if (reads_done &&
				647	test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
				648	btree_readahead_hook(eb, ret);
				649
				650	if (ret) {
				651	/*
				652	* our io error hook is going to dec the io pages
				653	* again, we have to make sure it has something
				654	* to decrement
				655	*/
				656	atomic_inc(&eb->io_pages);
				657	clear_extent_buffer_uptodate(eb);
				658	}
				659	free_extent_buffer(eb);
				660	out:
				661	return ret;
				662	}
				663
				664	static int btree_io_failed_hook(struct page *page, int failed_mirror)
				665	{
				666	struct extent_buffer *eb;
				667
				668	eb = (struct extent_buffer *)page->private;
				669	set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
				670	eb->read_mirror = failed_mirror;
				671	atomic_dec(&eb->io_pages);
				672	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
				673	btree_readahead_hook(eb, -EIO);
				674	return -EIO; /* we fixed nothing */
				675	}
				676
				677	static void end_workqueue_bio(struct bio *bio)
				678	{
				679	struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
				680	struct btrfs_fs_info *fs_info;
				681	struct btrfs_workqueue *wq;
				682	btrfs_work_func_t func;
				683
				684	fs_info = end_io_wq->info;
				685	end_io_wq->status = bio->bi_status;
				686
				687	if (bio_op(bio) == REQ_OP_WRITE) {
				688	if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) {
				689	wq = fs_info->endio_meta_write_workers;
				690	func = btrfs_endio_meta_write_helper;
				691	} else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) {
				692	wq = fs_info->endio_freespace_worker;
				693	func = btrfs_freespace_write_helper;
				694	} else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
				695	wq = fs_info->endio_raid56_workers;
				696	func = btrfs_endio_raid56_helper;
				697	} else {
				698	wq = fs_info->endio_write_workers;
				699	func = btrfs_endio_write_helper;
				700	}
				701	} else {
				702	if (unlikely(end_io_wq->metadata ==
				703	BTRFS_WQ_ENDIO_DIO_REPAIR)) {
				704	wq = fs_info->endio_repair_workers;
				705	func = btrfs_endio_repair_helper;
				706	} else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
				707	wq = fs_info->endio_raid56_workers;
				708	func = btrfs_endio_raid56_helper;
				709	} else if (end_io_wq->metadata) {
				710	wq = fs_info->endio_meta_workers;
				711	func = btrfs_endio_meta_helper;
				712	} else {
				713	wq = fs_info->endio_workers;
				714	func = btrfs_endio_helper;
				715	}
				716	}
				717
				718	btrfs_init_work(&end_io_wq->work, func, end_workqueue_fn, NULL, NULL);
				719	btrfs_queue_work(wq, &end_io_wq->work);
				720	}
				721
				722	blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info info, struct bio bio,
				723	enum btrfs_wq_endio_type metadata)
				724	{
				725	struct btrfs_end_io_wq *end_io_wq;
				726
				727	end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
				728	if (!end_io_wq)
				729	return BLK_STS_RESOURCE;
				730
				731	end_io_wq->private = bio->bi_private;
				732	end_io_wq->end_io = bio->bi_end_io;
				733	end_io_wq->info = info;
				734	end_io_wq->status = 0;
				735	end_io_wq->bio = bio;
				736	end_io_wq->metadata = metadata;
				737
				738	bio->bi_private = end_io_wq;
				739	bio->bi_end_io = end_workqueue_bio;
				740	return 0;
				741	}
				742
				743	static void run_one_async_start(struct btrfs_work *work)
				744	{
				745	struct async_submit_bio *async;
				746	blk_status_t ret;
				747
				748	async = container_of(work, struct async_submit_bio, work);
				749	ret = async->submit_bio_start(async->private_data, async->bio,
				750	async->bio_offset);
				751	if (ret)
				752	async->status = ret;
				753	}
				754
				755	static void run_one_async_done(struct btrfs_work *work)
				756	{
				757	struct async_submit_bio *async;
				758
				759	async = container_of(work, struct async_submit_bio, work);
				760
				761	/* If an error occurred we just want to clean up the bio and move on */
				762	if (async->status) {
				763	async->bio->bi_status = async->status;
				764	bio_endio(async->bio);
				765	return;
				766	}
				767
				768	btrfs_submit_bio_done(async->private_data, async->bio, async->mirror_num);
				769	}
				770
				771	static void run_one_async_free(struct btrfs_work *work)
				772	{
				773	struct async_submit_bio *async;
				774
				775	async = container_of(work, struct async_submit_bio, work);
				776	kfree(async);
				777	}
				778
				779	blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info fs_info, struct bio bio,
				780	int mirror_num, unsigned long bio_flags,
				781	u64 bio_offset, void *private_data,
				782	extent_submit_bio_start_t *submit_bio_start)
				783	{
				784	struct async_submit_bio *async;
				785
				786	async = kmalloc(sizeof(*async), GFP_NOFS);
				787	if (!async)
				788	return BLK_STS_RESOURCE;
				789
				790	async->private_data = private_data;
				791	async->bio = bio;
				792	async->mirror_num = mirror_num;
				793	async->submit_bio_start = submit_bio_start;
				794
				795	btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start,
				796	run_one_async_done, run_one_async_free);
				797
				798	async->bio_offset = bio_offset;
				799
				800	async->status = 0;
				801
				802	if (op_is_sync(bio->bi_opf))
				803	btrfs_set_work_high_priority(&async->work);
				804
				805	btrfs_queue_work(fs_info->workers, &async->work);
				806	return 0;
				807	}
				808
				809	static blk_status_t btree_csum_one_bio(struct bio *bio)
				810	{
				811	struct bio_vec *bvec;
				812	struct btrfs_root *root;
				813	int i, ret = 0;
				814
				815	ASSERT(!bio_flagged(bio, BIO_CLONED));
				816	bio_for_each_segment_all(bvec, bio, i) {
				817	root = BTRFS_I(bvec->bv_page->mapping->host)->root;
				818	ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
				819	if (ret)
				820	break;
				821	}
				822
				823	return errno_to_blk_status(ret);
				824	}
				825
				826	static blk_status_t btree_submit_bio_start(void private_data, struct bio bio,
				827	u64 bio_offset)
				828	{
				829	/*
				830	* when we're called for a write, we're already in the async
				831	* submission context. Just jump into btrfs_map_bio
				832	*/
				833	return btree_csum_one_bio(bio);
				834	}
				835
				836	static int check_async_write(struct btrfs_inode *bi)
				837	{
				838	if (atomic_read(&bi->sync_writers))
				839	return 0;
				840	#ifdef CONFIG_X86
				841	if (static_cpu_has(X86_FEATURE_XMM4_2))
				842	return 0;
				843	#endif
				844	return 1;
				845	}
				846
				847	static blk_status_t btree_submit_bio_hook(void private_data, struct bio bio,
				848	int mirror_num, unsigned long bio_flags,
				849	u64 bio_offset)
				850	{
				851	struct inode *inode = private_data;
				852	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				853	int async = check_async_write(BTRFS_I(inode));
				854	blk_status_t ret;
				855
				856	if (bio_op(bio) != REQ_OP_WRITE) {
				857	/*
				858	* called for a read, do the setup so that checksum validation
				859	* can happen in the async kernel threads
				860	*/
				861	ret = btrfs_bio_wq_end_io(fs_info, bio,
				862	BTRFS_WQ_ENDIO_METADATA);
				863	if (ret)
				864	goto out_w_error;
				865	ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
				866	} else if (!async) {
				867	ret = btree_csum_one_bio(bio);
				868	if (ret)
				869	goto out_w_error;
				870	ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
				871	} else {
				872	/*
				873	* kthread helpers are used to submit writes so that
				874	* checksumming can happen in parallel across all CPUs
				875	*/
				876	ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0,
				877	bio_offset, private_data,
				878	btree_submit_bio_start);
				879	}
				880
				881	if (ret)
				882	goto out_w_error;
				883	return 0;
				884
				885	out_w_error:
				886	bio->bi_status = ret;
				887	bio_endio(bio);
				888	return ret;
				889	}
				890
				891	#ifdef CONFIG_MIGRATION
				892	static int btree_migratepage(struct address_space *mapping,
				893	struct page newpage, struct page page,
				894	enum migrate_mode mode)
				895	{
				896	/*
				897	* we can't safely write a btree page from here,
				898	* we haven't done the locking hook
				899	*/
				900	if (PageDirty(page))
				901	return -EAGAIN;
				902	/*
				903	* Buffers may be managed in a filesystem specific way.
				904	* We must have no buffers or drop them.
				905	*/
				906	if (page_has_private(page) &&
				907	!try_to_release_page(page, GFP_KERNEL))
				908	return -EAGAIN;
				909	return migrate_page(mapping, newpage, page, mode);
				910	}
				911	#endif
				912
				913
				914	static int btree_writepages(struct address_space *mapping,
				915	struct writeback_control *wbc)
				916	{
				917	struct btrfs_fs_info *fs_info;
				918	int ret;
				919
				920	if (wbc->sync_mode == WB_SYNC_NONE) {
				921
				922	if (wbc->for_kupdate)
				923	return 0;
				924
				925	fs_info = BTRFS_I(mapping->host)->root->fs_info;
				926	/* this is a bit racy, but that's ok */
				927	ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
				928	BTRFS_DIRTY_METADATA_THRESH,
				929	fs_info->dirty_metadata_batch);
				930	if (ret < 0)
				931	return 0;
				932	}
				933	return btree_write_cache_pages(mapping, wbc);
				934	}
				935
				936	static int btree_readpage(struct file file, struct page page)
				937	{
				938	struct extent_io_tree *tree;
				939	tree = &BTRFS_I(page->mapping->host)->io_tree;
				940	return extent_read_full_page(tree, page, btree_get_extent, 0);
				941	}
				942
				943	static int btree_releasepage(struct page *page, gfp_t gfp_flags)
				944	{
				945	if (PageWriteback(page) \|\| PageDirty(page))
				946	return 0;
				947
				948	return try_release_extent_buffer(page);
				949	}
				950
				951	static void btree_invalidatepage(struct page *page, unsigned int offset,
				952	unsigned int length)
				953	{
				954	struct extent_io_tree *tree;
				955	tree = &BTRFS_I(page->mapping->host)->io_tree;
				956	extent_invalidatepage(tree, page, offset);
				957	btree_releasepage(page, GFP_NOFS);
				958	if (PagePrivate(page)) {
				959	btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
				960	"page private not zero on page %llu",
				961	(unsigned long long)page_offset(page));
				962	ClearPagePrivate(page);
				963	set_page_private(page, 0);
				964	put_page(page);
				965	}
				966	}
				967
				/*
				 * ->set_page_dirty for the btree inode.
				 *
				 * Marks the page dirty via __set_page_dirty_nobuffers().  DEBUG builds
				 * first check that the page carries an extent buffer in page->private, and
				 * that the buffer is flagged dirty, still referenced and tree-locked.
				 */
				static int btree_set_page_dirty(struct page *page)
				{
				#ifdef DEBUG
					struct extent_buffer *buf;

					BUG_ON(!PagePrivate(page));
					buf = (struct extent_buffer *)page->private;
					BUG_ON(buf == NULL);
					BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
					BUG_ON(atomic_read(&buf->refs) == 0);
					btrfs_assert_tree_locked(buf);
				#endif
					return __set_page_dirty_nobuffers(page);
				}
				982
				983	static const struct address_space_operations btree_aops = {
				984	.readpage = btree_readpage,
				985	.writepages = btree_writepages,
				986	.releasepage = btree_releasepage,
				987	.invalidatepage = btree_invalidatepage,
				988	#ifdef CONFIG_MIGRATION
				989	.migratepage = btree_migratepage,
				990	#endif
				991	.set_page_dirty = btree_set_page_dirty,
				992	};
				993
				994	void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr)
				995	{
				996	struct extent_buffer *buf = NULL;
				997	struct inode *btree_inode = fs_info->btree_inode;
				998	int ret;
				999
				1000	buf = btrfs_find_create_tree_block(fs_info, bytenr);
				1001	if (IS_ERR(buf))
				1002	return;
				1003
				1004	ret = read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf,
				1005	WAIT_NONE, 0);
				1006	if (ret < 0)
				1007	free_extent_buffer_stale(buf);
				1008	else
				1009	free_extent_buffer(buf);
				1010	}
				1011
				1012	int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
				1013	int mirror_num, struct extent_buffer **eb)
				1014	{
				1015	struct extent_buffer *buf = NULL;
				1016	struct inode *btree_inode = fs_info->btree_inode;
				1017	struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
				1018	int ret;
				1019
				1020	buf = btrfs_find_create_tree_block(fs_info, bytenr);
				1021	if (IS_ERR(buf))
				1022	return 0;
				1023
				1024	set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
				1025
				1026	ret = read_extent_buffer_pages(io_tree, buf, WAIT_PAGE_LOCK,
				1027	mirror_num);
				1028	if (ret) {
				1029	free_extent_buffer_stale(buf);
				1030	return ret;
				1031	}
				1032
				1033	if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
				1034	free_extent_buffer_stale(buf);
				1035	return -EIO;
				1036	} else if (extent_buffer_uptodate(buf)) {
				1037	*eb = buf;
				1038	} else {
				1039	free_extent_buffer(buf);
				1040	}
				1041	return 0;
				1042	}
				1043
				1044	struct extent_buffer *btrfs_find_create_tree_block(
				1045	struct btrfs_fs_info *fs_info,
				1046	u64 bytenr)
				1047	{
				1048	if (btrfs_is_testing(fs_info))
				1049	return alloc_test_extent_buffer(fs_info, bytenr);
				1050	return alloc_extent_buffer(fs_info, bytenr);
				1051	}
				1052
				1053
				1054	int btrfs_write_tree_block(struct extent_buffer *buf)
				1055	{
				1056	return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
				1057	buf->start + buf->len - 1);
				1058	}
				1059
				1060	void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
				1061	{
				1062	filemap_fdatawait_range(buf->pages[0]->mapping,
				1063	buf->start, buf->start + buf->len - 1);
				1064	}
				1065
				1066	/*
				1067	* Read tree block at logical address @bytenr and do variant basic but critical
				1068	* verification.
				1069	*
				1070	* @parent_transid: expected transid of this tree block, skip check if 0
				1071	* @level: expected level, mandatory check
				1072	* @first_key: expected key in slot 0, skip check if NULL
				1073	*/
				1074	struct extent_buffer read_tree_block(struct btrfs_fs_info fs_info, u64 bytenr,
				1075	u64 parent_transid, int level,
				1076	struct btrfs_key *first_key)
				1077	{
				1078	struct extent_buffer *buf = NULL;
				1079	int ret;
				1080
				1081	buf = btrfs_find_create_tree_block(fs_info, bytenr);
				1082	if (IS_ERR(buf))
				1083	return buf;
				1084
				1085	ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid,
				1086	level, first_key);
				1087	if (ret) {
				1088	free_extent_buffer_stale(buf);
				1089	return ERR_PTR(ret);
				1090	}
				1091	return buf;
				1092
				1093	}
				1094
				1095	void clean_tree_block(struct btrfs_fs_info *fs_info,
				1096	struct extent_buffer *buf)
				1097	{
				1098	if (btrfs_header_generation(buf) ==
				1099	fs_info->running_transaction->transid) {
				1100	btrfs_assert_tree_locked(buf);
				1101
				1102	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
				1103	percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
				1104	-buf->len,
				1105	fs_info->dirty_metadata_batch);
				1106	/* ugh, clear_extent_buffer_dirty needs to lock the page */
				1107	btrfs_set_lock_blocking(buf);
				1108	clear_extent_buffer_dirty(buf);
				1109	}
				1110	}
				1111	}
				1112
				1113	static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
				1114	{
				1115	struct btrfs_subvolume_writers *writers;
				1116	int ret;
				1117
				1118	writers = kmalloc(sizeof(*writers), GFP_NOFS);
				1119	if (!writers)
				1120	return ERR_PTR(-ENOMEM);
				1121
				1122	ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS);
				1123	if (ret < 0) {
				1124	kfree(writers);
				1125	return ERR_PTR(ret);
				1126	}
				1127
				1128	init_waitqueue_head(&writers->wait);
				1129	return writers;
				1130	}
				1131
				1132	static void
				1133	btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
				1134	{
				1135	percpu_counter_destroy(&writers->counter);
				1136	kfree(writers);
				1137	}
				1138
				1139	static void __setup_root(struct btrfs_root root, struct btrfs_fs_info fs_info,
				1140	u64 objectid)
				1141	{
				1142	bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
				1143	root->node = NULL;
				1144	root->commit_root = NULL;
				1145	root->state = 0;
				1146	root->orphan_cleanup_state = 0;
				1147
				1148	root->objectid = objectid;
				1149	root->last_trans = 0;
				1150	root->highest_objectid = 0;
				1151	root->nr_delalloc_inodes = 0;
				1152	root->nr_ordered_extents = 0;
				1153	root->inode_tree = RB_ROOT;
				1154	INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
				1155	root->block_rsv = NULL;
				1156
				1157	INIT_LIST_HEAD(&root->dirty_list);
				1158	INIT_LIST_HEAD(&root->root_list);
				1159	INIT_LIST_HEAD(&root->delalloc_inodes);
				1160	INIT_LIST_HEAD(&root->delalloc_root);
				1161	INIT_LIST_HEAD(&root->ordered_extents);
				1162	INIT_LIST_HEAD(&root->ordered_root);
				1163	INIT_LIST_HEAD(&root->logged_list[0]);
				1164	INIT_LIST_HEAD(&root->logged_list[1]);
				1165	spin_lock_init(&root->inode_lock);
				1166	spin_lock_init(&root->delalloc_lock);
				1167	spin_lock_init(&root->ordered_extent_lock);
				1168	spin_lock_init(&root->accounting_lock);
				1169	spin_lock_init(&root->log_extents_lock[0]);
				1170	spin_lock_init(&root->log_extents_lock[1]);
				1171	spin_lock_init(&root->qgroup_meta_rsv_lock);
				1172	mutex_init(&root->objectid_mutex);
				1173	mutex_init(&root->log_mutex);
				1174	mutex_init(&root->ordered_extent_mutex);
				1175	mutex_init(&root->delalloc_mutex);
				1176	init_waitqueue_head(&root->log_writer_wait);
				1177	init_waitqueue_head(&root->log_commit_wait[0]);
				1178	init_waitqueue_head(&root->log_commit_wait[1]);
				1179	INIT_LIST_HEAD(&root->log_ctxs[0]);
				1180	INIT_LIST_HEAD(&root->log_ctxs[1]);
				1181	atomic_set(&root->log_commit[0], 0);
				1182	atomic_set(&root->log_commit[1], 0);
				1183	atomic_set(&root->log_writers, 0);
				1184	atomic_set(&root->log_batch, 0);
				1185	refcount_set(&root->refs, 1);
				1186	atomic_set(&root->will_be_snapshotted, 0);
				1187	atomic_set(&root->snapshot_force_cow, 0);
				1188	root->log_transid = 0;
				1189	root->log_transid_committed = -1;
				1190	root->last_log_commit = 0;
				1191	if (!dummy)
				1192	extent_io_tree_init(&root->dirty_log_pages, NULL);
				1193
				1194	memset(&root->root_key, 0, sizeof(root->root_key));
				1195	memset(&root->root_item, 0, sizeof(root->root_item));
				1196	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
				1197	if (!dummy)
				1198	root->defrag_trans_start = fs_info->generation;
				1199	else
				1200	root->defrag_trans_start = 0;
				1201	root->root_key.objectid = objectid;
				1202	root->anon_dev = 0;
				1203
				1204	spin_lock_init(&root->root_item_lock);
				1205	}
				1206
				1207	static struct btrfs_root btrfs_alloc_root(struct btrfs_fs_info fs_info,
				1208	gfp_t flags)
				1209	{
				1210	struct btrfs_root root = kzalloc(sizeof(root), flags);
				1211	if (root)
				1212	root->fs_info = fs_info;
				1213	return root;
				1214	}
				1215
				#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
				/*
				 * Should only be used by the testing infrastructure.
				 *
				 * Allocates a root set up with BTRFS_ROOT_TREE_OBJECTID and alloc_bytenr 0.
				 * Returns the root, ERR_PTR(-EINVAL) when @fs_info is NULL, or
				 * ERR_PTR(-ENOMEM) when the allocation fails.
				 */
				struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
				{
					struct btrfs_root *root;

					if (!fs_info)
						return ERR_PTR(-EINVAL);

					root = btrfs_alloc_root(fs_info, GFP_KERNEL);
					if (!root)
						return ERR_PTR(-ENOMEM);

					__setup_root(root, fs_info, BTRFS_ROOT_TREE_OBJECTID);
					root->alloc_bytenr = 0;

					return root;
				}
				#endif
				1236
				1237	struct btrfs_root btrfs_create_tree(struct btrfs_trans_handle trans,
				1238	struct btrfs_fs_info *fs_info,
				1239	u64 objectid)
				1240	{
				1241	struct extent_buffer *leaf;
				1242	struct btrfs_root *tree_root = fs_info->tree_root;
				1243	struct btrfs_root *root;
				1244	struct btrfs_key key;
				1245	unsigned int nofs_flag;
				1246	int ret = 0;
				1247	uuid_le uuid = NULL_UUID_LE;
				1248
				1249	/*
				1250	* We're holding a transaction handle, so use a NOFS memory allocation
				1251	* context to avoid deadlock if reclaim happens.
				1252	*/
				1253	nofs_flag = memalloc_nofs_save();
				1254	root = btrfs_alloc_root(fs_info, GFP_KERNEL);
				1255	memalloc_nofs_restore(nofs_flag);
				1256	if (!root)
				1257	return ERR_PTR(-ENOMEM);
				1258
				1259	__setup_root(root, fs_info, objectid);
				1260	root->root_key.objectid = objectid;
				1261	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
				1262	root->root_key.offset = 0;
				1263
				1264	leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
				1265	if (IS_ERR(leaf)) {
				1266	ret = PTR_ERR(leaf);
				1267	leaf = NULL;
				1268	goto fail;
				1269	}
				1270
				1271	root->node = leaf;
				1272	btrfs_mark_buffer_dirty(leaf);
				1273
				1274	root->commit_root = btrfs_root_node(root);
				1275	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
				1276
				1277	root->root_item.flags = 0;
				1278	root->root_item.byte_limit = 0;
				1279	btrfs_set_root_bytenr(&root->root_item, leaf->start);
				1280	btrfs_set_root_generation(&root->root_item, trans->transid);
				1281	btrfs_set_root_level(&root->root_item, 0);
				1282	btrfs_set_root_refs(&root->root_item, 1);
				1283	btrfs_set_root_used(&root->root_item, leaf->len);
				1284	btrfs_set_root_last_snapshot(&root->root_item, 0);
				1285	btrfs_set_root_dirid(&root->root_item, 0);
				1286	if (is_fstree(objectid))
				1287	uuid_le_gen(&uuid);
				1288	memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE);
				1289	root->root_item.drop_level = 0;
				1290
				1291	key.objectid = objectid;
				1292	key.type = BTRFS_ROOT_ITEM_KEY;
				1293	key.offset = 0;
				1294	ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
				1295	if (ret)
				1296	goto fail;
				1297
				1298	btrfs_tree_unlock(leaf);
				1299
				1300	return root;
				1301
				1302	fail:
				1303	if (leaf) {
				1304	btrfs_tree_unlock(leaf);
				1305	free_extent_buffer(root->commit_root);
				1306	free_extent_buffer(leaf);
				1307	}
				1308	kfree(root);
				1309
				1310	return ERR_PTR(ret);
				1311	}
				1312
				1313	static struct btrfs_root alloc_log_tree(struct btrfs_trans_handle trans,
				1314	struct btrfs_fs_info *fs_info)
				1315	{
				1316	struct btrfs_root *root;
				1317	struct extent_buffer *leaf;
				1318
				1319	root = btrfs_alloc_root(fs_info, GFP_NOFS);
				1320	if (!root)
				1321	return ERR_PTR(-ENOMEM);
				1322
				1323	__setup_root(root, fs_info, BTRFS_TREE_LOG_OBJECTID);
				1324
				1325	root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
				1326	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
				1327	root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
				1328
				1329	/*
				1330	* DON'T set REF_COWS for log trees
				1331	*
				1332	* log trees do not get reference counted because they go away
				1333	* before a real commit is actually done. They do store pointers
				1334	* to file data extents, and those reference counts still get
				1335	* updated (along with back refs to the log tree).
				1336	*/
				1337
				1338	leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
				1339	NULL, 0, 0, 0);
				1340	if (IS_ERR(leaf)) {
				1341	kfree(root);
				1342	return ERR_CAST(leaf);
				1343	}
				1344
				1345	root->node = leaf;
				1346
				1347	btrfs_mark_buffer_dirty(root->node);
				1348	btrfs_tree_unlock(root->node);
				1349	return root;
				1350	}
				1351
				1352	int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
				1353	struct btrfs_fs_info *fs_info)
				1354	{
				1355	struct btrfs_root *log_root;
				1356
				1357	log_root = alloc_log_tree(trans, fs_info);
				1358	if (IS_ERR(log_root))
				1359	return PTR_ERR(log_root);
				1360	WARN_ON(fs_info->log_root_tree);
				1361	fs_info->log_root_tree = log_root;
				1362	return 0;
				1363	}
				1364
				1365	int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
				1366	struct btrfs_root *root)
				1367	{
				1368	struct btrfs_fs_info *fs_info = root->fs_info;
				1369	struct btrfs_root *log_root;
				1370	struct btrfs_inode_item *inode_item;
				1371
				1372	log_root = alloc_log_tree(trans, fs_info);
				1373	if (IS_ERR(log_root))
				1374	return PTR_ERR(log_root);
				1375
				1376	log_root->last_trans = trans->transid;
				1377	log_root->root_key.offset = root->root_key.objectid;
				1378
				1379	inode_item = &log_root->root_item.inode;
				1380	btrfs_set_stack_inode_generation(inode_item, 1);
				1381	btrfs_set_stack_inode_size(inode_item, 3);
				1382	btrfs_set_stack_inode_nlink(inode_item, 1);
				1383	btrfs_set_stack_inode_nbytes(inode_item,
				1384	fs_info->nodesize);
				1385	btrfs_set_stack_inode_mode(inode_item, S_IFDIR \| 0755);
				1386
				1387	btrfs_set_root_node(&log_root->root_item, log_root->node);
				1388
				1389	WARN_ON(root->log_root);
				1390	root->log_root = log_root;
				1391	root->log_transid = 0;
				1392	root->log_transid_committed = -1;
				1393	root->last_log_commit = 0;
				1394	return 0;
				1395	}
				1396
				1397	static struct btrfs_root btrfs_read_tree_root(struct btrfs_root tree_root,
				1398	struct btrfs_key *key)
				1399	{
				1400	struct btrfs_root *root;
				1401	struct btrfs_fs_info *fs_info = tree_root->fs_info;
				1402	struct btrfs_path *path;
				1403	u64 generation;
				1404	int ret;
				1405	int level;
				1406
				1407	path = btrfs_alloc_path();
				1408	if (!path)
				1409	return ERR_PTR(-ENOMEM);
				1410
				1411	root = btrfs_alloc_root(fs_info, GFP_NOFS);
				1412	if (!root) {
				1413	ret = -ENOMEM;
				1414	goto alloc_fail;
				1415	}
				1416
				1417	__setup_root(root, fs_info, key->objectid);
				1418
				1419	ret = btrfs_find_root(tree_root, key, path,
				1420	&root->root_item, &root->root_key);
				1421	if (ret) {
				1422	if (ret > 0)
				1423	ret = -ENOENT;
				1424	goto find_fail;
				1425	}
				1426
				1427	generation = btrfs_root_generation(&root->root_item);
				1428	level = btrfs_root_level(&root->root_item);
				1429	root->node = read_tree_block(fs_info,
				1430	btrfs_root_bytenr(&root->root_item),
				1431	generation, level, NULL);
				1432	if (IS_ERR(root->node)) {
				1433	ret = PTR_ERR(root->node);
				1434	goto find_fail;
				1435	} else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
				1436	ret = -EIO;
				1437	free_extent_buffer(root->node);
				1438	goto find_fail;
				1439	}
				1440	root->commit_root = btrfs_root_node(root);
				1441	out:
				1442	btrfs_free_path(path);
				1443	return root;
				1444
				1445	find_fail:
				1446	kfree(root);
				1447	alloc_fail:
				1448	root = ERR_PTR(ret);
				1449	goto out;
				1450	}
				1451
				1452	struct btrfs_root btrfs_read_fs_root(struct btrfs_root tree_root,
				1453	struct btrfs_key *location)
				1454	{
				1455	struct btrfs_root *root;
				1456
				1457	root = btrfs_read_tree_root(tree_root, location);
				1458	if (IS_ERR(root))
				1459	return root;
				1460
				1461	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
				1462	set_bit(BTRFS_ROOT_REF_COWS, &root->state);
				1463	btrfs_check_and_init_root_item(&root->root_item);
				1464	}
				1465
				1466	return root;
				1467	}
				1468
				1469	int btrfs_init_fs_root(struct btrfs_root *root)
				1470	{
				1471	int ret;
				1472	struct btrfs_subvolume_writers *writers;
				1473
				1474	root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
				1475	root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
				1476	GFP_NOFS);
				1477	if (!root->free_ino_pinned \|\| !root->free_ino_ctl) {
				1478	ret = -ENOMEM;
				1479	goto fail;
				1480	}
				1481
				1482	writers = btrfs_alloc_subvolume_writers();
				1483	if (IS_ERR(writers)) {
				1484	ret = PTR_ERR(writers);
				1485	goto fail;
				1486	}
				1487	root->subv_writers = writers;
				1488
				1489	btrfs_init_free_ino_ctl(root);
				1490	spin_lock_init(&root->ino_cache_lock);
				1491	init_waitqueue_head(&root->ino_cache_wait);
				1492
				1493	ret = get_anon_bdev(&root->anon_dev);
				1494	if (ret)
				1495	goto fail;
				1496
				1497	mutex_lock(&root->objectid_mutex);
				1498	ret = btrfs_find_highest_objectid(root,
				1499	&root->highest_objectid);
				1500	if (ret) {
				1501	mutex_unlock(&root->objectid_mutex);
				1502	goto fail;
				1503	}
				1504
				1505	ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
				1506
				1507	mutex_unlock(&root->objectid_mutex);
				1508
				1509	return 0;
				1510	fail:
				1511	/* The caller is responsible to call btrfs_free_fs_root */
				1512	return ret;
				1513	}
				1514
				1515	struct btrfs_root btrfs_lookup_fs_root(struct btrfs_fs_info fs_info,
				1516	u64 root_id)
				1517	{
				1518	struct btrfs_root *root;
				1519
				1520	spin_lock(&fs_info->fs_roots_radix_lock);
				1521	root = radix_tree_lookup(&fs_info->fs_roots_radix,
				1522	(unsigned long)root_id);
				1523	spin_unlock(&fs_info->fs_roots_radix_lock);
				1524	return root;
				1525	}
				1526
				1527	int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
				1528	struct btrfs_root *root)
				1529	{
				1530	int ret;
				1531
				1532	ret = radix_tree_preload(GFP_NOFS);
				1533	if (ret)
				1534	return ret;
				1535
				1536	spin_lock(&fs_info->fs_roots_radix_lock);
				1537	ret = radix_tree_insert(&fs_info->fs_roots_radix,
				1538	(unsigned long)root->root_key.objectid,
				1539	root);
				1540	if (ret == 0)
				1541	set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
				1542	spin_unlock(&fs_info->fs_roots_radix_lock);
				1543	radix_tree_preload_end();
				1544
				1545	return ret;
				1546	}
				1547
				1548	struct btrfs_root btrfs_get_fs_root(struct btrfs_fs_info fs_info,
				1549	struct btrfs_key *location,
				1550	bool check_ref)
				1551	{
				1552	struct btrfs_root *root;
				1553	struct btrfs_path *path;
				1554	struct btrfs_key key;
				1555	int ret;
				1556
				1557	if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
				1558	return fs_info->tree_root;
				1559	if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
				1560	return fs_info->extent_root;
				1561	if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
				1562	return fs_info->chunk_root;
				1563	if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
				1564	return fs_info->dev_root;
				1565	if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
				1566	return fs_info->csum_root;
				1567	if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
				1568	return fs_info->quota_root ? fs_info->quota_root :
				1569	ERR_PTR(-ENOENT);
				1570	if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
				1571	return fs_info->uuid_root ? fs_info->uuid_root :
				1572	ERR_PTR(-ENOENT);
				1573	if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
				1574	return fs_info->free_space_root ? fs_info->free_space_root :
				1575	ERR_PTR(-ENOENT);
				1576	again:
				1577	root = btrfs_lookup_fs_root(fs_info, location->objectid);
				1578	if (root) {
				1579	if (check_ref && btrfs_root_refs(&root->root_item) == 0)
				1580	return ERR_PTR(-ENOENT);
				1581	return root;
				1582	}
				1583
				1584	root = btrfs_read_fs_root(fs_info->tree_root, location);
				1585	if (IS_ERR(root))
				1586	return root;
				1587
				1588	if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
				1589	ret = -ENOENT;
				1590	goto fail;
				1591	}
				1592
				1593	ret = btrfs_init_fs_root(root);
				1594	if (ret)
				1595	goto fail;
				1596
				1597	path = btrfs_alloc_path();
				1598	if (!path) {
				1599	ret = -ENOMEM;
				1600	goto fail;
				1601	}
				1602	key.objectid = BTRFS_ORPHAN_OBJECTID;
				1603	key.type = BTRFS_ORPHAN_ITEM_KEY;
				1604	key.offset = location->objectid;
				1605
				1606	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
				1607	btrfs_free_path(path);
				1608	if (ret < 0)
				1609	goto fail;
				1610	if (ret == 0)
				1611	set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
				1612
				1613	ret = btrfs_insert_fs_root(fs_info, root);
				1614	if (ret) {
				1615	if (ret == -EEXIST) {
				1616	btrfs_free_fs_root(root);
				1617	goto again;
				1618	}
				1619	goto fail;
				1620	}
				1621	return root;
				1622	fail:
				1623	btrfs_free_fs_root(root);
				1624	return ERR_PTR(ret);
				1625	}
				1626
				1627	static int btrfs_congested_fn(void *congested_data, int bdi_bits)
				1628	{
				1629	struct btrfs_fs_info info = (struct btrfs_fs_info )congested_data;
				1630	int ret = 0;
				1631	struct btrfs_device *device;
				1632	struct backing_dev_info *bdi;
				1633
				1634	rcu_read_lock();
				1635	list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) {
				1636	if (!device->bdev)
				1637	continue;
				1638	bdi = device->bdev->bd_bdi;
				1639	if (bdi_congested(bdi, bdi_bits)) {
				1640	ret = 1;
				1641	break;
				1642	}
				1643	}
				1644	rcu_read_unlock();
				1645	return ret;
				1646	}
				1647
				1648	/*
				1649	* called by the kthread helper functions to finally call the bio end_io
				1650	* functions. This is where read checksum verification actually happens
				1651	*/
				1652	static void end_workqueue_fn(struct btrfs_work *work)
				1653	{
				1654	struct bio *bio;
				1655	struct btrfs_end_io_wq *end_io_wq;
				1656
				1657	end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
				1658	bio = end_io_wq->bio;
				1659
				1660	bio->bi_status = end_io_wq->status;
				1661	bio->bi_private = end_io_wq->private;
				1662	bio->bi_end_io = end_io_wq->end_io;
				1663	bio_endio(bio);
				1664	kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
				1665	}
				1666
				1667	static int cleaner_kthread(void *arg)
				1668	{
				1669	struct btrfs_root *root = arg;
				1670	struct btrfs_fs_info *fs_info = root->fs_info;
				1671	int again;
				1672
				1673	while (1) {
				1674	again = 0;
				1675
				1676	/* Make the cleaner go to sleep early. */
				1677	if (btrfs_need_cleaner_sleep(fs_info))
				1678	goto sleep;
				1679
				1680	/*
				1681	* Do not do anything if we might cause open_ctree() to block
				1682	* before we have finished mounting the filesystem.
				1683	*/
				1684	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
				1685	goto sleep;
				1686
				1687	if (!mutex_trylock(&fs_info->cleaner_mutex))
				1688	goto sleep;
				1689
				1690	/*
				1691	* Avoid the problem that we change the status of the fs
				1692	* during the above check and trylock.
				1693	*/
				1694	if (btrfs_need_cleaner_sleep(fs_info)) {
				1695	mutex_unlock(&fs_info->cleaner_mutex);
				1696	goto sleep;
				1697	}
				1698
				1699	mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
				1700	btrfs_run_delayed_iputs(fs_info);
				1701	mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
				1702
				1703	again = btrfs_clean_one_deleted_snapshot(root);
				1704	mutex_unlock(&fs_info->cleaner_mutex);
				1705
				1706	/*
				1707	* The defragger has dealt with the R/O remount and umount,
				1708	* needn't do anything special here.
				1709	*/
				1710	btrfs_run_defrag_inodes(fs_info);
				1711
				1712	/*
				1713	* Acquires fs_info->delete_unused_bgs_mutex to avoid racing
				1714	* with relocation (btrfs_relocate_chunk) and relocation
				1715	* acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
				1716	* after acquiring fs_info->delete_unused_bgs_mutex. So we
				1717	* can't hold, nor need to, fs_info->cleaner_mutex when deleting
				1718	* unused block groups.
				1719	*/
				1720	btrfs_delete_unused_bgs(fs_info);
				1721	sleep:
				1722	if (kthread_should_park())
				1723	kthread_parkme();
				1724	if (kthread_should_stop())
				1725	return 0;
				1726	if (!again) {
				1727	set_current_state(TASK_INTERRUPTIBLE);
				1728	schedule();
				1729	__set_current_state(TASK_RUNNING);
				1730	}
				1731	}
				1732	}
				1733
				1734	static int transaction_kthread(void *arg)
				1735	{
				1736	struct btrfs_root *root = arg;
				1737	struct btrfs_fs_info *fs_info = root->fs_info;
				1738	struct btrfs_trans_handle *trans;
				1739	struct btrfs_transaction *cur;
				1740	u64 transid;
				1741	time64_t now;
				1742	unsigned long delay;
				1743	bool cannot_commit;
				1744
				1745	do {
				1746	cannot_commit = false;
				1747	delay = HZ * fs_info->commit_interval;
				1748	mutex_lock(&fs_info->transaction_kthread_mutex);
				1749
				1750	spin_lock(&fs_info->trans_lock);
				1751	cur = fs_info->running_transaction;
				1752	if (!cur) {
				1753	spin_unlock(&fs_info->trans_lock);
				1754	goto sleep;
				1755	}
				1756
				1757	now = ktime_get_seconds();
				1758	if (cur->state < TRANS_STATE_BLOCKED &&
				1759	!test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) &&
				1760	(now < cur->start_time \|\|
				1761	now - cur->start_time < fs_info->commit_interval)) {
				1762	spin_unlock(&fs_info->trans_lock);
				1763	delay = HZ * 5;
				1764	goto sleep;
				1765	}
				1766	transid = cur->transid;
				1767	spin_unlock(&fs_info->trans_lock);
				1768
				1769	/* If the file system is aborted, this will always fail. */
				1770	trans = btrfs_attach_transaction(root);
				1771	if (IS_ERR(trans)) {
				1772	if (PTR_ERR(trans) != -ENOENT)
				1773	cannot_commit = true;
				1774	goto sleep;
				1775	}
				1776	if (transid == trans->transid) {
				1777	btrfs_commit_transaction(trans);
				1778	} else {
				1779	btrfs_end_transaction(trans);
				1780	}
				1781	sleep:
				1782	wake_up_process(fs_info->cleaner_kthread);
				1783	mutex_unlock(&fs_info->transaction_kthread_mutex);
				1784
				1785	if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
				1786	&fs_info->fs_state)))
				1787	btrfs_cleanup_transaction(fs_info);
				1788	if (!kthread_should_stop() &&
				1789	(!btrfs_transaction_blocked(fs_info) \|\|
				1790	cannot_commit))
				1791	schedule_timeout_interruptible(delay);
				1792	} while (!kthread_should_stop());
				1793	return 0;
				1794	}
				1795
				1796	/*
				1797	* this will find the highest generation in the array of
				1798	* root backups. The index of the highest array is returned,
				1799	* or -1 if we can't find anything.
				1800	*
				1801	* We check to make sure the array is valid by comparing the
				1802	* generation of the latest root in the array with the generation
				1803	* in the super block. If they don't match we pitch it.
				1804	*/
				1805	static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
				1806	{
				1807	u64 cur;
				1808	int newest_index = -1;
				1809	struct btrfs_root_backup *root_backup;
				1810	int i;
				1811
				1812	for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
				1813	root_backup = info->super_copy->super_roots + i;
				1814	cur = btrfs_backup_tree_root_gen(root_backup);
				1815	if (cur == newest_gen)
				1816	newest_index = i;
				1817	}
				1818
				1819	/* check to see if we actually wrapped around */
				1820	if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
				1821	root_backup = info->super_copy->super_roots;
				1822	cur = btrfs_backup_tree_root_gen(root_backup);
				1823	if (cur == newest_gen)
				1824	newest_index = 0;
				1825	}
				1826	return newest_index;
				1827	}
				1828
				1829
				1830	/*
				1831	* find the oldest backup so we know where to store new entries
				1832	* in the backup array. This will set the backup_root_index
				1833	* field in the fs_info struct
				1834	*/
				1835	static void find_oldest_super_backup(struct btrfs_fs_info *info,
				1836	u64 newest_gen)
				1837	{
				1838	int newest_index = -1;
				1839
				1840	newest_index = find_newest_super_backup(info, newest_gen);
				1841	/* if there was garbage in there, just move along */
				1842	if (newest_index == -1) {
				1843	info->backup_root_index = 0;
				1844	} else {
				1845	info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
				1846	}
				1847	}
				1848
				1849	/*
				1850	* copy all the root pointers into the super backup array.
				1851	* this will bump the backup pointer by one when it is
				1852	* done
				1853	*/
				1854	static void backup_super_roots(struct btrfs_fs_info *info)
				1855	{
				1856	int next_backup;
				1857	struct btrfs_root_backup *root_backup;
				1858	int last_backup;
				1859
				1860	next_backup = info->backup_root_index;
				1861	last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
				1862	BTRFS_NUM_BACKUP_ROOTS;
				1863
				1864	/*
				1865	* just overwrite the last backup if we're at the same generation
				1866	* this happens only at umount
				1867	*/
				1868	root_backup = info->super_for_commit->super_roots + last_backup;
				1869	if (btrfs_backup_tree_root_gen(root_backup) ==
				1870	btrfs_header_generation(info->tree_root->node))
				1871	next_backup = last_backup;
				1872
				1873	root_backup = info->super_for_commit->super_roots + next_backup;
				1874
				1875	/*
				1876	* make sure all of our padding and empty slots get zero filled
				1877	* regardless of which ones we use today
				1878	*/
				1879	memset(root_backup, 0, sizeof(*root_backup));
				1880
				1881	info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
				1882
				1883	btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
				1884	btrfs_set_backup_tree_root_gen(root_backup,
				1885	btrfs_header_generation(info->tree_root->node));
				1886
				1887	btrfs_set_backup_tree_root_level(root_backup,
				1888	btrfs_header_level(info->tree_root->node));
				1889
				1890	btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
				1891	btrfs_set_backup_chunk_root_gen(root_backup,
				1892	btrfs_header_generation(info->chunk_root->node));
				1893	btrfs_set_backup_chunk_root_level(root_backup,
				1894	btrfs_header_level(info->chunk_root->node));
				1895
				1896	btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
				1897	btrfs_set_backup_extent_root_gen(root_backup,
				1898	btrfs_header_generation(info->extent_root->node));
				1899	btrfs_set_backup_extent_root_level(root_backup,
				1900	btrfs_header_level(info->extent_root->node));
				1901
				1902	/*
				1903	* we might commit during log recovery, which happens before we set
				1904	* the fs_root. Make sure it is valid before we fill it in.
				1905	*/
				1906	if (info->fs_root && info->fs_root->node) {
				1907	btrfs_set_backup_fs_root(root_backup,
				1908	info->fs_root->node->start);
				1909	btrfs_set_backup_fs_root_gen(root_backup,
				1910	btrfs_header_generation(info->fs_root->node));
				1911	btrfs_set_backup_fs_root_level(root_backup,
				1912	btrfs_header_level(info->fs_root->node));
				1913	}
				1914
				1915	btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
				1916	btrfs_set_backup_dev_root_gen(root_backup,
				1917	btrfs_header_generation(info->dev_root->node));
				1918	btrfs_set_backup_dev_root_level(root_backup,
				1919	btrfs_header_level(info->dev_root->node));
				1920
				1921	btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
				1922	btrfs_set_backup_csum_root_gen(root_backup,
				1923	btrfs_header_generation(info->csum_root->node));
				1924	btrfs_set_backup_csum_root_level(root_backup,
				1925	btrfs_header_level(info->csum_root->node));
				1926
				1927	btrfs_set_backup_total_bytes(root_backup,
				1928	btrfs_super_total_bytes(info->super_copy));
				1929	btrfs_set_backup_bytes_used(root_backup,
				1930	btrfs_super_bytes_used(info->super_copy));
				1931	btrfs_set_backup_num_devices(root_backup,
				1932	btrfs_super_num_devices(info->super_copy));
				1933
				1934	/*
				1935	* if we don't copy this out to the super_copy, it won't get remembered
				1936	* for the next commit
				1937	*/
				1938	memcpy(&info->super_copy->super_roots,
				1939	&info->super_for_commit->super_roots,
				1940	sizeof(root_backup) BTRFS_NUM_BACKUP_ROOTS);
				1941	}
				1942
				1943	/*
				1944	* this copies info out of the root backup array and back into
				1945	* the in-memory super block. It is meant to help iterate through
				1946	* the array, so you send it the number of backups you've already
				1947	* tried and the last backup index you used.
				1948	*
				1949	* this returns -1 when it has tried all the backups
				1950	*/
				1951	static noinline int next_root_backup(struct btrfs_fs_info *info,
				1952	struct btrfs_super_block *super,
				1953	int num_backups_tried, int backup_index)
				1954	{
				1955	struct btrfs_root_backup *root_backup;
				1956	int newest = *backup_index;
				1957
				1958	if (*num_backups_tried == 0) {
				1959	u64 gen = btrfs_super_generation(super);
				1960
				1961	newest = find_newest_super_backup(info, gen);
				1962	if (newest == -1)
				1963	return -1;
				1964
				1965	*backup_index = newest;
				1966	*num_backups_tried = 1;
				1967	} else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
				1968	/* we've tried all the backups, all done */
				1969	return -1;
				1970	} else {
				1971	/* jump to the next oldest backup */
				1972	newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
				1973	BTRFS_NUM_BACKUP_ROOTS;
				1974	*backup_index = newest;
				1975	*num_backups_tried += 1;
				1976	}
				1977	root_backup = super->super_roots + newest;
				1978
				1979	btrfs_set_super_generation(super,
				1980	btrfs_backup_tree_root_gen(root_backup));
				1981	btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
				1982	btrfs_set_super_root_level(super,
				1983	btrfs_backup_tree_root_level(root_backup));
				1984	btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
				1985
				1986	/*
				1987	* fixme: the total bytes and num_devices need to match or we should
				1988	* need a fsck
				1989	*/
				1990	btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
				1991	btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
				1992	return 0;
				1993	}
				1994
				1995	/* helper to cleanup workers */
				1996	static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
				1997	{
				1998	btrfs_destroy_workqueue(fs_info->fixup_workers);
				1999	btrfs_destroy_workqueue(fs_info->delalloc_workers);
				2000	btrfs_destroy_workqueue(fs_info->workers);
				2001	btrfs_destroy_workqueue(fs_info->endio_workers);
				2002	btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
				2003	btrfs_destroy_workqueue(fs_info->endio_repair_workers);
				2004	btrfs_destroy_workqueue(fs_info->rmw_workers);
				2005	btrfs_destroy_workqueue(fs_info->endio_write_workers);
				2006	btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
				2007	btrfs_destroy_workqueue(fs_info->submit_workers);
				2008	btrfs_destroy_workqueue(fs_info->delayed_workers);
				2009	btrfs_destroy_workqueue(fs_info->caching_workers);
				2010	btrfs_destroy_workqueue(fs_info->readahead_workers);
				2011	btrfs_destroy_workqueue(fs_info->flush_workers);
				2012	btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
				2013	btrfs_destroy_workqueue(fs_info->extent_workers);
				2014	/*
				2015	* Now that all other work queues are destroyed, we can safely destroy
				2016	* the queues used for metadata I/O, since tasks from those other work
				2017	* queues can do metadata I/O operations.
				2018	*/
				2019	btrfs_destroy_workqueue(fs_info->endio_meta_workers);
				2020	btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
				2021	}
				2022
				2023	static void free_root_extent_buffers(struct btrfs_root *root)
				2024	{
				2025	if (root) {
				2026	free_extent_buffer(root->node);
				2027	free_extent_buffer(root->commit_root);
				2028	root->node = NULL;
				2029	root->commit_root = NULL;
				2030	}
				2031	}
				2032
				2033	/* helper to cleanup tree roots */
				2034	static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
				2035	{
				2036	free_root_extent_buffers(info->tree_root);
				2037
				2038	free_root_extent_buffers(info->dev_root);
				2039	free_root_extent_buffers(info->extent_root);
				2040	free_root_extent_buffers(info->csum_root);
				2041	free_root_extent_buffers(info->quota_root);
				2042	free_root_extent_buffers(info->uuid_root);
				2043	if (chunk_root)
				2044	free_root_extent_buffers(info->chunk_root);
				2045	free_root_extent_buffers(info->free_space_root);
				2046	}
				2047
				2048	void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
				2049	{
				2050	int ret;
				2051	struct btrfs_root *gang[8];
				2052	int i;
				2053
				2054	while (!list_empty(&fs_info->dead_roots)) {
				2055	gang[0] = list_entry(fs_info->dead_roots.next,
				2056	struct btrfs_root, root_list);
				2057	list_del(&gang[0]->root_list);
				2058
				2059	if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) {
				2060	btrfs_drop_and_free_fs_root(fs_info, gang[0]);
				2061	} else {
				2062	free_extent_buffer(gang[0]->node);
				2063	free_extent_buffer(gang[0]->commit_root);
				2064	btrfs_put_fs_root(gang[0]);
				2065	}
				2066	}
				2067
				2068	while (1) {
				2069	ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
				2070	(void **)gang, 0,
				2071	ARRAY_SIZE(gang));
				2072	if (!ret)
				2073	break;
				2074	for (i = 0; i < ret; i++)
				2075	btrfs_drop_and_free_fs_root(fs_info, gang[i]);
				2076	}
				2077
				2078	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
				2079	btrfs_free_log_root_tree(NULL, fs_info);
				2080	btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents);
				2081	}
				2082	}
				2083
				2084	static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
				2085	{
				2086	mutex_init(&fs_info->scrub_lock);
				2087	atomic_set(&fs_info->scrubs_running, 0);
				2088	atomic_set(&fs_info->scrub_pause_req, 0);
				2089	atomic_set(&fs_info->scrubs_paused, 0);
				2090	atomic_set(&fs_info->scrub_cancel_req, 0);
				2091	init_waitqueue_head(&fs_info->scrub_pause_wait);
				2092	fs_info->scrub_workers_refcnt = 0;
				2093	}
				2094
				2095	static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
				2096	{
				2097	spin_lock_init(&fs_info->balance_lock);
				2098	mutex_init(&fs_info->balance_mutex);
				2099	atomic_set(&fs_info->balance_pause_req, 0);
				2100	atomic_set(&fs_info->balance_cancel_req, 0);
				2101	fs_info->balance_ctl = NULL;
				2102	init_waitqueue_head(&fs_info->balance_wait_q);
				2103	}
				2104
				2105	static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
				2106	{
				2107	struct inode *inode = fs_info->btree_inode;
				2108
				2109	inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
				2110	set_nlink(inode, 1);
				2111	/*
				2112	* we set the i_size on the btree inode to the max possible int.
				2113	* the real end of the address space is determined by all of
				2114	* the devices in the system
				2115	*/
				2116	inode->i_size = OFFSET_MAX;
				2117	inode->i_mapping->a_ops = &btree_aops;
				2118
				2119	RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
				2120	extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode);
				2121	BTRFS_I(inode)->io_tree.track_uptodate = 0;
				2122	extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
				2123
				2124	BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops;
				2125
				2126	BTRFS_I(inode)->root = fs_info->tree_root;
				2127	memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key));
				2128	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
				2129	btrfs_insert_inode_hash(inode);
				2130	}
				2131
				2132	static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
				2133	{
				2134	mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
				2135	rwlock_init(&fs_info->dev_replace.lock);
				2136	atomic_set(&fs_info->dev_replace.read_locks, 0);
				2137	atomic_set(&fs_info->dev_replace.blocking_readers, 0);
				2138	init_waitqueue_head(&fs_info->replace_wait);
				2139	init_waitqueue_head(&fs_info->dev_replace.read_lock_wq);
				2140	}
				2141
				2142	static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
				2143	{
				2144	spin_lock_init(&fs_info->qgroup_lock);
				2145	mutex_init(&fs_info->qgroup_ioctl_lock);
				2146	fs_info->qgroup_tree = RB_ROOT;
				2147	fs_info->qgroup_op_tree = RB_ROOT;
				2148	INIT_LIST_HEAD(&fs_info->dirty_qgroups);
				2149	fs_info->qgroup_seq = 1;
				2150	fs_info->qgroup_ulist = NULL;
				2151	fs_info->qgroup_rescan_running = false;
				2152	mutex_init(&fs_info->qgroup_rescan_lock);
				2153	}
				2154
				2155	static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
				2156	struct btrfs_fs_devices *fs_devices)
				2157	{
				2158	u32 max_active = fs_info->thread_pool_size;
				2159	unsigned int flags = WQ_MEM_RECLAIM \| WQ_FREEZABLE \| WQ_UNBOUND;
				2160
				2161	fs_info->workers =
				2162	btrfs_alloc_workqueue(fs_info, "worker",
				2163	flags \| WQ_HIGHPRI, max_active, 16);
				2164
				2165	fs_info->delalloc_workers =
				2166	btrfs_alloc_workqueue(fs_info, "delalloc",
				2167	flags, max_active, 2);
				2168
				2169	fs_info->flush_workers =
				2170	btrfs_alloc_workqueue(fs_info, "flush_delalloc",
				2171	flags, max_active, 0);
				2172
				2173	fs_info->caching_workers =
				2174	btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
				2175
				2176	/*
				2177	* a higher idle thresh on the submit workers makes it much more
				2178	* likely that bios will be send down in a sane order to the
				2179	* devices
				2180	*/
				2181	fs_info->submit_workers =
				2182	btrfs_alloc_workqueue(fs_info, "submit", flags,
				2183	min_t(u64, fs_devices->num_devices,
				2184	max_active), 64);
				2185
				2186	fs_info->fixup_workers =
				2187	btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
				2188
				2189	/*
				2190	* endios are largely parallel and should have a very
				2191	* low idle thresh
				2192	*/
				2193	fs_info->endio_workers =
				2194	btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4);
				2195	fs_info->endio_meta_workers =
				2196	btrfs_alloc_workqueue(fs_info, "endio-meta", flags,
				2197	max_active, 4);
				2198	fs_info->endio_meta_write_workers =
				2199	btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
				2200	max_active, 2);
				2201	fs_info->endio_raid56_workers =
				2202	btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
				2203	max_active, 4);
				2204	fs_info->endio_repair_workers =
				2205	btrfs_alloc_workqueue(fs_info, "endio-repair", flags, 1, 0);
				2206	fs_info->rmw_workers =
				2207	btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2);
				2208	fs_info->endio_write_workers =
				2209	btrfs_alloc_workqueue(fs_info, "endio-write", flags,
				2210	max_active, 2);
				2211	fs_info->endio_freespace_worker =
				2212	btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
				2213	max_active, 0);
				2214	fs_info->delayed_workers =
				2215	btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
				2216	max_active, 0);
				2217	fs_info->readahead_workers =
				2218	btrfs_alloc_workqueue(fs_info, "readahead", flags,
				2219	max_active, 2);
				2220	fs_info->qgroup_rescan_workers =
				2221	btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
				2222	fs_info->extent_workers =
				2223	btrfs_alloc_workqueue(fs_info, "extent-refs", flags,
				2224	min_t(u64, fs_devices->num_devices,
				2225	max_active), 8);
				2226
				2227	if (!(fs_info->workers && fs_info->delalloc_workers &&
				2228	fs_info->submit_workers && fs_info->flush_workers &&
				2229	fs_info->endio_workers && fs_info->endio_meta_workers &&
				2230	fs_info->endio_meta_write_workers &&
				2231	fs_info->endio_repair_workers &&
				2232	fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
				2233	fs_info->endio_freespace_worker && fs_info->rmw_workers &&
				2234	fs_info->caching_workers && fs_info->readahead_workers &&
				2235	fs_info->fixup_workers && fs_info->delayed_workers &&
				2236	fs_info->extent_workers &&
				2237	fs_info->qgroup_rescan_workers)) {
				2238	return -ENOMEM;
				2239	}
				2240
				2241	return 0;
				2242	}
				2243
				2244	static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
				2245	struct btrfs_fs_devices *fs_devices)
				2246	{
				2247	int ret;
				2248	struct btrfs_root *log_tree_root;
				2249	struct btrfs_super_block *disk_super = fs_info->super_copy;
				2250	u64 bytenr = btrfs_super_log_root(disk_super);
				2251	int level = btrfs_super_log_root_level(disk_super);
				2252
				2253	if (fs_devices->rw_devices == 0) {
				2254	btrfs_warn(fs_info, "log replay required on RO media");
				2255	return -EIO;
				2256	}
				2257
				2258	log_tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
				2259	if (!log_tree_root)
				2260	return -ENOMEM;
				2261
				2262	__setup_root(log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
				2263
				2264	log_tree_root->node = read_tree_block(fs_info, bytenr,
				2265	fs_info->generation + 1,
				2266	level, NULL);
				2267	if (IS_ERR(log_tree_root->node)) {
				2268	btrfs_warn(fs_info, "failed to read log tree");
				2269	ret = PTR_ERR(log_tree_root->node);
				2270	kfree(log_tree_root);
				2271	return ret;
				2272	} else if (!extent_buffer_uptodate(log_tree_root->node)) {
				2273	btrfs_err(fs_info, "failed to read log tree");
				2274	free_extent_buffer(log_tree_root->node);
				2275	kfree(log_tree_root);
				2276	return -EIO;
				2277	}
				2278	/* returns with log_tree_root freed on success */
				2279	ret = btrfs_recover_log_trees(log_tree_root);
				2280	if (ret) {
				2281	btrfs_handle_fs_error(fs_info, ret,
				2282	"Failed to recover log tree");
				2283	free_extent_buffer(log_tree_root->node);
				2284	kfree(log_tree_root);
				2285	return ret;
				2286	}
				2287
				2288	if (sb_rdonly(fs_info->sb)) {
				2289	ret = btrfs_commit_super(fs_info);
				2290	if (ret)
				2291	return ret;
				2292	}
				2293
				2294	return 0;
				2295	}
				2296
				2297	static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
				2298	{
				2299	struct btrfs_root *tree_root = fs_info->tree_root;
				2300	struct btrfs_root *root;
				2301	struct btrfs_key location;
				2302	int ret;
				2303
				2304	BUG_ON(!fs_info->tree_root);
				2305
				2306	location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
				2307	location.type = BTRFS_ROOT_ITEM_KEY;
				2308	location.offset = 0;
				2309
				2310	root = btrfs_read_tree_root(tree_root, &location);
				2311	if (IS_ERR(root)) {
				2312	ret = PTR_ERR(root);
				2313	goto out;
				2314	}
				2315	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
				2316	fs_info->extent_root = root;
				2317
				2318	location.objectid = BTRFS_DEV_TREE_OBJECTID;
				2319	root = btrfs_read_tree_root(tree_root, &location);
				2320	if (IS_ERR(root)) {
				2321	ret = PTR_ERR(root);
				2322	goto out;
				2323	}
				2324	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
				2325	fs_info->dev_root = root;
				2326	btrfs_init_devices_late(fs_info);
				2327
				2328	location.objectid = BTRFS_CSUM_TREE_OBJECTID;
				2329	root = btrfs_read_tree_root(tree_root, &location);
				2330	if (IS_ERR(root)) {
				2331	ret = PTR_ERR(root);
				2332	goto out;
				2333	}
				2334	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
				2335	fs_info->csum_root = root;
				2336
				2337	location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
				2338	root = btrfs_read_tree_root(tree_root, &location);
				2339	if (!IS_ERR(root)) {
				2340	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
				2341	set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
				2342	fs_info->quota_root = root;
				2343	}
				2344
				2345	location.objectid = BTRFS_UUID_TREE_OBJECTID;
				2346	root = btrfs_read_tree_root(tree_root, &location);
				2347	if (IS_ERR(root)) {
				2348	ret = PTR_ERR(root);
				2349	if (ret != -ENOENT)
				2350	goto out;
				2351	} else {
				2352	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
				2353	fs_info->uuid_root = root;
				2354	}
				2355
				2356	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
				2357	location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
				2358	root = btrfs_read_tree_root(tree_root, &location);
				2359	if (IS_ERR(root)) {
				2360	ret = PTR_ERR(root);
				2361	goto out;
				2362	}
				2363	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
				2364	fs_info->free_space_root = root;
				2365	}
				2366
				2367	return 0;
				2368	out:
				2369	btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
				2370	location.objectid, ret);
				2371	return ret;
				2372	}
				2373
				2374	/*
				2375	* Real super block validation
				2376	* NOTE: super csum type and incompat features will not be checked here.
				2377	*
				2378	* @sb: super block to check
				2379	* @mirror_num: the super block number to check its bytenr:
				2380	* 0 the primary (1st) sb
				2381	* 1, 2 2nd and 3rd backup copy
				2382	* -1 skip bytenr check
				2383	*/
				2384	static int validate_super(struct btrfs_fs_info *fs_info,
				2385	struct btrfs_super_block *sb, int mirror_num)
				2386	{
				2387	u64 nodesize = btrfs_super_nodesize(sb);
				2388	u64 sectorsize = btrfs_super_sectorsize(sb);
				2389	int ret = 0;
				2390
				2391	if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
				2392	btrfs_err(fs_info, "no valid FS found");
				2393	ret = -EINVAL;
				2394	}
				2395	if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
				2396	btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
				2397	btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
				2398	ret = -EINVAL;
				2399	}
				2400	if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
				2401	btrfs_err(fs_info, "tree_root level too big: %d >= %d",
				2402	btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
				2403	ret = -EINVAL;
				2404	}
				2405	if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
				2406	btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
				2407	btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
				2408	ret = -EINVAL;
				2409	}
				2410	if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
				2411	btrfs_err(fs_info, "log_root level too big: %d >= %d",
				2412	btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
				2413	ret = -EINVAL;
				2414	}
				2415
				2416	/*
				2417	* Check sectorsize and nodesize first, other check will need it.
				2418	* Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
				2419	*/
				2420	if (!is_power_of_2(sectorsize) \|\| sectorsize < 4096 \|\|
				2421	sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
				2422	btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
				2423	ret = -EINVAL;
				2424	}
				2425	/* Only PAGE SIZE is supported yet */
				2426	if (sectorsize != PAGE_SIZE) {
				2427	btrfs_err(fs_info,
				2428	"sectorsize %llu not supported yet, only support %lu",
				2429	sectorsize, PAGE_SIZE);
				2430	ret = -EINVAL;
				2431	}
				2432	if (!is_power_of_2(nodesize) \|\| nodesize < sectorsize \|\|
				2433	nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
				2434	btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
				2435	ret = -EINVAL;
				2436	}
				2437	if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
				2438	btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
				2439	le32_to_cpu(sb->__unused_leafsize), nodesize);
				2440	ret = -EINVAL;
				2441	}
				2442
				2443	/* Root alignment check */
				2444	if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
				2445	btrfs_warn(fs_info, "tree_root block unaligned: %llu",
				2446	btrfs_super_root(sb));
				2447	ret = -EINVAL;
				2448	}
				2449	if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
				2450	btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
				2451	btrfs_super_chunk_root(sb));
				2452	ret = -EINVAL;
				2453	}
				2454	if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
				2455	btrfs_warn(fs_info, "log_root block unaligned: %llu",
				2456	btrfs_super_log_root(sb));
				2457	ret = -EINVAL;
				2458	}
				2459
				2460	if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) {
				2461	btrfs_err(fs_info,
				2462	"dev_item UUID does not match fsid: %pU != %pU",
				2463	fs_info->fsid, sb->dev_item.fsid);
				2464	ret = -EINVAL;
				2465	}
				2466
				2467	/*
				2468	* Hint to catch really bogus numbers, bitflips or so, more exact checks are
				2469	* done later
				2470	*/
				2471	if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
				2472	btrfs_err(fs_info, "bytes_used is too small %llu",
				2473	btrfs_super_bytes_used(sb));
				2474	ret = -EINVAL;
				2475	}
				2476	if (!is_power_of_2(btrfs_super_stripesize(sb))) {
				2477	btrfs_err(fs_info, "invalid stripesize %u",
				2478	btrfs_super_stripesize(sb));
				2479	ret = -EINVAL;
				2480	}
				2481	if (btrfs_super_num_devices(sb) > (1UL << 31))
				2482	btrfs_warn(fs_info, "suspicious number of devices: %llu",
				2483	btrfs_super_num_devices(sb));
				2484	if (btrfs_super_num_devices(sb) == 0) {
				2485	btrfs_err(fs_info, "number of devices is 0");
				2486	ret = -EINVAL;
				2487	}
				2488
				2489	if (mirror_num >= 0 &&
				2490	btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) {
				2491	btrfs_err(fs_info, "super offset mismatch %llu != %u",
				2492	btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
				2493	ret = -EINVAL;
				2494	}
				2495
				2496	/*
				2497	* Obvious sys_chunk_array corruptions, it must hold at least one key
				2498	* and one chunk
				2499	*/
				2500	if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
				2501	btrfs_err(fs_info, "system chunk array too big %u > %u",
				2502	btrfs_super_sys_array_size(sb),
				2503	BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
				2504	ret = -EINVAL;
				2505	}
				2506	if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
				2507	+ sizeof(struct btrfs_chunk)) {
				2508	btrfs_err(fs_info, "system chunk array too small %u < %zu",
				2509	btrfs_super_sys_array_size(sb),
				2510	sizeof(struct btrfs_disk_key)
				2511	+ sizeof(struct btrfs_chunk));
				2512	ret = -EINVAL;
				2513	}
				2514
				2515	/*
				2516	* The generation is a global counter, we'll trust it more than the others
				2517	* but it's still possible that it's the one that's wrong.
				2518	*/
				2519	if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
				2520	btrfs_warn(fs_info,
				2521	"suspicious: generation < chunk_root_generation: %llu < %llu",
				2522	btrfs_super_generation(sb),
				2523	btrfs_super_chunk_root_generation(sb));
				2524	if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
				2525	&& btrfs_super_cache_generation(sb) != (u64)-1)
				2526	btrfs_warn(fs_info,
				2527	"suspicious: generation < cache_generation: %llu < %llu",
				2528	btrfs_super_generation(sb),
				2529	btrfs_super_cache_generation(sb));
				2530
				2531	return ret;
				2532	}
				2533
				2534	/*
				2535	* Validation of super block at mount time.
				2536	* Some checks already done early at mount time, like csum type and incompat
				2537	* flags will be skipped.
				2538	*/
				2539	static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
				2540	{
				2541	return validate_super(fs_info, fs_info->super_copy, 0);
				2542	}
				2543
				2544	/*
				2545	* Validation of super block at write time.
				2546	* Some checks like bytenr check will be skipped as their values will be
				2547	* overwritten soon.
				2548	* Extra checks like csum type and incompat flags will be done here.
				2549	*/
				2550	static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
				2551	struct btrfs_super_block *sb)
				2552	{
				2553	int ret;
				2554
				2555	ret = validate_super(fs_info, sb, -1);
				2556	if (ret < 0)
				2557	goto out;
				2558	if (btrfs_super_csum_type(sb) != BTRFS_CSUM_TYPE_CRC32) {
				2559	ret = -EUCLEAN;
				2560	btrfs_err(fs_info, "invalid csum type, has %u want %u",
				2561	btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
				2562	goto out;
				2563	}
				2564	if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
				2565	ret = -EUCLEAN;
				2566	btrfs_err(fs_info,
				2567	"invalid incompat flags, has 0x%llx valid mask 0x%llx",
				2568	btrfs_super_incompat_flags(sb),
				2569	(unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP);
				2570	goto out;
				2571	}
				2572	out:
				2573	if (ret < 0)
				2574	btrfs_err(fs_info,
				2575	"super block corruption detected before writing it to disk");
				2576	return ret;
				2577	}
				2578
				2579	int open_ctree(struct super_block *sb,
				2580	struct btrfs_fs_devices *fs_devices,
				2581	char *options)
				2582	{
				2583	u32 sectorsize;
				2584	u32 nodesize;
				2585	u32 stripesize;
				2586	u64 generation;
				2587	u64 features;
				2588	struct btrfs_key location;
				2589	struct buffer_head *bh;
				2590	struct btrfs_super_block *disk_super;
				2591	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
				2592	struct btrfs_root *tree_root;
				2593	struct btrfs_root *chunk_root;
				2594	int ret;
				2595	int err = -EINVAL;
				2596	int num_backups_tried = 0;
				2597	int backup_index = 0;
				2598	int clear_free_space_tree = 0;
				2599	int level;
				2600
				2601	tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
				2602	chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
				2603	if (!tree_root \|\| !chunk_root) {
				2604	err = -ENOMEM;
				2605	goto fail;
				2606	}
				2607
				2608	ret = init_srcu_struct(&fs_info->subvol_srcu);
				2609	if (ret) {
				2610	err = ret;
				2611	goto fail;
				2612	}
				2613
				2614	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
				2615	if (ret) {
				2616	err = ret;
				2617	goto fail_srcu;
				2618	}
				2619	fs_info->dirty_metadata_batch = PAGE_SIZE *
				2620	(1 + ilog2(nr_cpu_ids));
				2621
				2622	ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
				2623	if (ret) {
				2624	err = ret;
				2625	goto fail_dirty_metadata_bytes;
				2626	}
				2627
				2628	ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL);
				2629	if (ret) {
				2630	err = ret;
				2631	goto fail_delalloc_bytes;
				2632	}
				2633
				2634	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
				2635	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
				2636	INIT_LIST_HEAD(&fs_info->trans_list);
				2637	INIT_LIST_HEAD(&fs_info->dead_roots);
				2638	INIT_LIST_HEAD(&fs_info->delayed_iputs);
				2639	INIT_LIST_HEAD(&fs_info->delalloc_roots);
				2640	INIT_LIST_HEAD(&fs_info->caching_block_groups);
				2641	INIT_LIST_HEAD(&fs_info->pending_raid_kobjs);
				2642	spin_lock_init(&fs_info->pending_raid_kobjs_lock);
				2643	spin_lock_init(&fs_info->delalloc_root_lock);
				2644	spin_lock_init(&fs_info->trans_lock);
				2645	spin_lock_init(&fs_info->fs_roots_radix_lock);
				2646	spin_lock_init(&fs_info->delayed_iput_lock);
				2647	spin_lock_init(&fs_info->defrag_inodes_lock);
				2648	spin_lock_init(&fs_info->tree_mod_seq_lock);
				2649	spin_lock_init(&fs_info->super_lock);
				2650	spin_lock_init(&fs_info->qgroup_op_lock);
				2651	spin_lock_init(&fs_info->buffer_lock);
				2652	spin_lock_init(&fs_info->unused_bgs_lock);
				2653	rwlock_init(&fs_info->tree_mod_log_lock);
				2654	mutex_init(&fs_info->unused_bg_unpin_mutex);
				2655	mutex_init(&fs_info->delete_unused_bgs_mutex);
				2656	mutex_init(&fs_info->reloc_mutex);
				2657	mutex_init(&fs_info->delalloc_root_mutex);
				2658	mutex_init(&fs_info->cleaner_delayed_iput_mutex);
				2659	seqlock_init(&fs_info->profiles_lock);
				2660
				2661	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
				2662	INIT_LIST_HEAD(&fs_info->space_info);
				2663	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
				2664	INIT_LIST_HEAD(&fs_info->unused_bgs);
				2665	btrfs_mapping_init(&fs_info->mapping_tree);
				2666	btrfs_init_block_rsv(&fs_info->global_block_rsv,
				2667	BTRFS_BLOCK_RSV_GLOBAL);
				2668	btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
				2669	btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
				2670	btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
				2671	btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
				2672	BTRFS_BLOCK_RSV_DELOPS);
				2673	atomic_set(&fs_info->async_delalloc_pages, 0);
				2674	atomic_set(&fs_info->defrag_running, 0);
				2675	atomic_set(&fs_info->qgroup_op_seq, 0);
				2676	atomic_set(&fs_info->reada_works_cnt, 0);
				2677	atomic64_set(&fs_info->tree_mod_seq, 0);
				2678	fs_info->sb = sb;
				2679	fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
				2680	fs_info->metadata_ratio = 0;
				2681	fs_info->defrag_inodes = RB_ROOT;
				2682	atomic64_set(&fs_info->free_chunk_space, 0);
				2683	fs_info->tree_mod_log = RB_ROOT;
				2684	fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
				2685	fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
				2686	/* readahead state */
				2687	INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
				2688	spin_lock_init(&fs_info->reada_lock);
				2689	btrfs_init_ref_verify(fs_info);
				2690
				2691	fs_info->thread_pool_size = min_t(unsigned long,
				2692	num_online_cpus() + 2, 8);
				2693
				2694	INIT_LIST_HEAD(&fs_info->ordered_roots);
				2695	spin_lock_init(&fs_info->ordered_root_lock);
				2696
				2697	fs_info->btree_inode = new_inode(sb);
				2698	if (!fs_info->btree_inode) {
				2699	err = -ENOMEM;
				2700	goto fail_bio_counter;
				2701	}
				2702	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
				2703
				2704	fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
				2705	GFP_KERNEL);
				2706	if (!fs_info->delayed_root) {
				2707	err = -ENOMEM;
				2708	goto fail_iput;
				2709	}
				2710	btrfs_init_delayed_root(fs_info->delayed_root);
				2711
				2712	btrfs_init_scrub(fs_info);
				2713	#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
				2714	fs_info->check_integrity_print_mask = 0;
				2715	#endif
				2716	btrfs_init_balance(fs_info);
				2717	btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);
				2718
				2719	sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
				2720	sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);
				2721
				2722	btrfs_init_btree_inode(fs_info);
				2723
				2724	spin_lock_init(&fs_info->block_group_cache_lock);
				2725	fs_info->block_group_cache_tree = RB_ROOT;
				2726	fs_info->first_logical_byte = (u64)-1;
				2727
				2728	extent_io_tree_init(&fs_info->freed_extents[0], NULL);
				2729	extent_io_tree_init(&fs_info->freed_extents[1], NULL);
				2730	fs_info->pinned_extents = &fs_info->freed_extents[0];
				2731	set_bit(BTRFS_FS_BARRIER, &fs_info->flags);
				2732
				2733	mutex_init(&fs_info->ordered_operations_mutex);
				2734	mutex_init(&fs_info->tree_log_mutex);
				2735	mutex_init(&fs_info->chunk_mutex);
				2736	mutex_init(&fs_info->transaction_kthread_mutex);
				2737	mutex_init(&fs_info->cleaner_mutex);
				2738	mutex_init(&fs_info->ro_block_group_mutex);
				2739	init_rwsem(&fs_info->commit_root_sem);
				2740	init_rwsem(&fs_info->cleanup_work_sem);
				2741	init_rwsem(&fs_info->subvol_sem);
				2742	sema_init(&fs_info->uuid_tree_rescan_sem, 1);
				2743
				2744	btrfs_init_dev_replace_locks(fs_info);
				2745	btrfs_init_qgroup(fs_info);
				2746
				2747	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
				2748	btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
				2749
				2750	init_waitqueue_head(&fs_info->transaction_throttle);
				2751	init_waitqueue_head(&fs_info->transaction_wait);
				2752	init_waitqueue_head(&fs_info->transaction_blocked_wait);
				2753	init_waitqueue_head(&fs_info->async_submit_wait);
				2754
				2755	INIT_LIST_HEAD(&fs_info->pinned_chunks);
				2756
				2757	/* Usable values until the real ones are cached from the superblock */
				2758	fs_info->nodesize = 4096;
				2759	fs_info->sectorsize = 4096;
				2760	fs_info->stripesize = 4096;
				2761
				2762	ret = btrfs_alloc_stripe_hash_table(fs_info);
				2763	if (ret) {
				2764	err = ret;
				2765	goto fail_alloc;
				2766	}
				2767
				2768	__setup_root(tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID);
				2769
				2770	invalidate_bdev(fs_devices->latest_bdev);
				2771
				2772	/*
				2773	* Read super block and check the signature bytes only
				2774	*/
				2775	bh = btrfs_read_dev_super(fs_devices->latest_bdev);
				2776	if (IS_ERR(bh)) {
				2777	err = PTR_ERR(bh);
				2778	goto fail_alloc;
				2779	}
				2780
				2781	/*
				2782	* We want to check superblock checksum, the type is stored inside.
				2783	* Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
				2784	*/
				2785	if (btrfs_check_super_csum(fs_info, bh->b_data)) {
				2786	btrfs_err(fs_info, "superblock checksum mismatch");
				2787	err = -EINVAL;
				2788	brelse(bh);
				2789	goto fail_alloc;
				2790	}
				2791
				2792	/*
				2793	* super_copy is zeroed at allocation time and we never touch the
				2794	* following bytes up to INFO_SIZE, the checksum is calculated from
				2795	* the whole block of INFO_SIZE
				2796	*/
				2797	memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
				2798	memcpy(fs_info->super_for_commit, fs_info->super_copy,
				2799	sizeof(*fs_info->super_for_commit));
				2800	brelse(bh);
				2801
				2802	memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
				2803
				2804	ret = btrfs_validate_mount_super(fs_info);
				2805	if (ret) {
				2806	btrfs_err(fs_info, "superblock contains fatal errors");
				2807	err = -EINVAL;
				2808	goto fail_alloc;
				2809	}
				2810
				2811	disk_super = fs_info->super_copy;
				2812	if (!btrfs_super_root(disk_super))
				2813	goto fail_alloc;
				2814
				2815	/* check FS state, whether FS is broken. */
				2816	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
				2817	set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
				2818
				2819	/*
				2820	* run through our array of backup supers and setup
				2821	* our ring pointer to the oldest one
				2822	*/
				2823	generation = btrfs_super_generation(disk_super);
				2824	find_oldest_super_backup(fs_info, generation);
				2825
				2826	/*
				2827	* In the long term, we'll store the compression type in the super
				2828	* block, and it'll be used for per file compression control.
				2829	*/
				2830	fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
				2831
				2832	ret = btrfs_parse_options(fs_info, options, sb->s_flags);
				2833	if (ret) {
				2834	err = ret;
				2835	goto fail_alloc;
				2836	}
				2837
				2838	features = btrfs_super_incompat_flags(disk_super) &
				2839	~BTRFS_FEATURE_INCOMPAT_SUPP;
				2840	if (features) {
				2841	btrfs_err(fs_info,
				2842	"cannot mount because of unsupported optional features (%llx)",
				2843	features);
				2844	err = -EINVAL;
				2845	goto fail_alloc;
				2846	}
				2847
				2848	features = btrfs_super_incompat_flags(disk_super);
				2849	features \|= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
				2850	if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
				2851	features \|= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
				2852	else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
				2853	features \|= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;
				2854
				2855	if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
				2856	btrfs_info(fs_info, "has skinny extents");
				2857
				2858	/*
				2859	* flag our filesystem as having big metadata blocks if
				2860	* they are bigger than the page size
				2861	*/
				2862	if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
				2863	if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
				2864	btrfs_info(fs_info,
				2865	"flagging fs with big metadata feature");
				2866	features \|= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
				2867	}
				2868
				2869	nodesize = btrfs_super_nodesize(disk_super);
				2870	sectorsize = btrfs_super_sectorsize(disk_super);
				2871	stripesize = sectorsize;
				2872	fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
				2873	fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
				2874
				2875	/* Cache block sizes */
				2876	fs_info->nodesize = nodesize;
				2877	fs_info->sectorsize = sectorsize;
				2878	fs_info->stripesize = stripesize;
				2879
				2880	/*
				2881	* mixed block groups end up with duplicate but slightly offset
				2882	* extent buffers for the same range. It leads to corruptions
				2883	*/
				2884	if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
				2885	(sectorsize != nodesize)) {
				2886	btrfs_err(fs_info,
				2887	"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
				2888	nodesize, sectorsize);
				2889	goto fail_alloc;
				2890	}
				2891
				2892	/*
				2893	* Needn't use the lock because there is no other task which will
				2894	* update the flag.
				2895	*/
				2896	btrfs_set_super_incompat_flags(disk_super, features);
				2897
				2898	features = btrfs_super_compat_ro_flags(disk_super) &
				2899	~BTRFS_FEATURE_COMPAT_RO_SUPP;
				2900	if (!sb_rdonly(sb) && features) {
				2901	btrfs_err(fs_info,
				2902	"cannot mount read-write because of unsupported optional features (%llx)",
				2903	features);
				2904	err = -EINVAL;
				2905	goto fail_alloc;
				2906	}
				2907
				2908	ret = btrfs_init_workqueues(fs_info, fs_devices);
				2909	if (ret) {
				2910	err = ret;
				2911	goto fail_sb_buffer;
				2912	}
				2913
				2914	sb->s_bdi->congested_fn = btrfs_congested_fn;
				2915	sb->s_bdi->congested_data = fs_info;
				2916	sb->s_bdi->capabilities \|= BDI_CAP_CGROUP_WRITEBACK;
				2917	sb->s_bdi->ra_pages = VM_MAX_READAHEAD * SZ_1K / PAGE_SIZE;
				2918	sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
				2919	sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
				2920
				2921	sb->s_blocksize = sectorsize;
				2922	sb->s_blocksize_bits = blksize_bits(sectorsize);
				2923	memcpy(&sb->s_uuid, fs_info->fsid, BTRFS_FSID_SIZE);
				2924
				2925	mutex_lock(&fs_info->chunk_mutex);
				2926	ret = btrfs_read_sys_array(fs_info);
				2927	mutex_unlock(&fs_info->chunk_mutex);
				2928	if (ret) {
				2929	btrfs_err(fs_info, "failed to read the system array: %d", ret);
				2930	goto fail_sb_buffer;
				2931	}
				2932
				2933	generation = btrfs_super_chunk_root_generation(disk_super);
				2934	level = btrfs_super_chunk_root_level(disk_super);
				2935
				2936	__setup_root(chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
				2937
				2938	chunk_root->node = read_tree_block(fs_info,
				2939	btrfs_super_chunk_root(disk_super),
				2940	generation, level, NULL);
				2941	if (IS_ERR(chunk_root->node) \|\|
				2942	!extent_buffer_uptodate(chunk_root->node)) {
				2943	btrfs_err(fs_info, "failed to read chunk root");
				2944	if (!IS_ERR(chunk_root->node))
				2945	free_extent_buffer(chunk_root->node);
				2946	chunk_root->node = NULL;
				2947	goto fail_tree_roots;
				2948	}
				2949	btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
				2950	chunk_root->commit_root = btrfs_root_node(chunk_root);
				2951
				2952	read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
				2953	btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE);
				2954
				2955	ret = btrfs_read_chunk_tree(fs_info);
				2956	if (ret) {
				2957	btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
				2958	goto fail_tree_roots;
				2959	}
				2960
				2961	/*
				2962	* Keep the devid that is marked to be the target device for the
				2963	* device replace procedure
				2964	*/
				2965	btrfs_free_extra_devids(fs_devices, 0);
				2966
				2967	if (!fs_devices->latest_bdev) {
				2968	btrfs_err(fs_info, "failed to read devices");
				2969	goto fail_tree_roots;
				2970	}
				2971
				2972	retry_root_backup:
				2973	generation = btrfs_super_generation(disk_super);
				2974	level = btrfs_super_root_level(disk_super);
				2975
				2976	tree_root->node = read_tree_block(fs_info,
				2977	btrfs_super_root(disk_super),
				2978	generation, level, NULL);
				2979	if (IS_ERR(tree_root->node) \|\|
				2980	!extent_buffer_uptodate(tree_root->node)) {
				2981	btrfs_warn(fs_info, "failed to read tree root");
				2982	if (!IS_ERR(tree_root->node))
				2983	free_extent_buffer(tree_root->node);
				2984	tree_root->node = NULL;
				2985	goto recovery_tree_root;
				2986	}
				2987
				2988	btrfs_set_root_node(&tree_root->root_item, tree_root->node);
				2989	tree_root->commit_root = btrfs_root_node(tree_root);
				2990	btrfs_set_root_refs(&tree_root->root_item, 1);
				2991
				2992	mutex_lock(&tree_root->objectid_mutex);
				2993	ret = btrfs_find_highest_objectid(tree_root,
				2994	&tree_root->highest_objectid);
				2995	if (ret) {
				2996	mutex_unlock(&tree_root->objectid_mutex);
				2997	goto recovery_tree_root;
				2998	}
				2999
				3000	ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
				3001
				3002	mutex_unlock(&tree_root->objectid_mutex);
				3003
				3004	ret = btrfs_read_roots(fs_info);
				3005	if (ret)
				3006	goto recovery_tree_root;
				3007
				3008	fs_info->generation = generation;
				3009	fs_info->last_trans_committed = generation;
				3010
				3011	ret = btrfs_verify_dev_extents(fs_info);
				3012	if (ret) {
				3013	btrfs_err(fs_info,
				3014	"failed to verify dev extents against chunks: %d",
				3015	ret);
				3016	goto fail_block_groups;
				3017	}
				3018	ret = btrfs_recover_balance(fs_info);
				3019	if (ret) {
				3020	btrfs_err(fs_info, "failed to recover balance: %d", ret);
				3021	goto fail_block_groups;
				3022	}
				3023
				3024	ret = btrfs_init_dev_stats(fs_info);
				3025	if (ret) {
				3026	btrfs_err(fs_info, "failed to init dev_stats: %d", ret);
				3027	goto fail_block_groups;
				3028	}
				3029
				3030	ret = btrfs_init_dev_replace(fs_info);
				3031	if (ret) {
				3032	btrfs_err(fs_info, "failed to init dev_replace: %d", ret);
				3033	goto fail_block_groups;
				3034	}
				3035
				3036	btrfs_free_extra_devids(fs_devices, 1);
				3037
				3038	ret = btrfs_sysfs_add_fsid(fs_devices, NULL);
				3039	if (ret) {
				3040	btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
				3041	ret);
				3042	goto fail_block_groups;
				3043	}
				3044
				3045	ret = btrfs_sysfs_add_device(fs_devices);
				3046	if (ret) {
				3047	btrfs_err(fs_info, "failed to init sysfs device interface: %d",
				3048	ret);
				3049	goto fail_fsdev_sysfs;
				3050	}
				3051
				3052	ret = btrfs_sysfs_add_mounted(fs_info);
				3053	if (ret) {
				3054	btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
				3055	goto fail_fsdev_sysfs;
				3056	}
				3057
				3058	ret = btrfs_init_space_info(fs_info);
				3059	if (ret) {
				3060	btrfs_err(fs_info, "failed to initialize space info: %d", ret);
				3061	goto fail_sysfs;
				3062	}
				3063
				3064	ret = btrfs_read_block_groups(fs_info);
				3065	if (ret) {
				3066	btrfs_err(fs_info, "failed to read block groups: %d", ret);
				3067	goto fail_sysfs;
				3068	}
				3069
				3070	if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) {
				3071	btrfs_warn(fs_info,
				3072	"writeable mount is not allowed due to too many missing devices");
				3073	goto fail_sysfs;
				3074	}
				3075
				3076	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
				3077	"btrfs-cleaner");
				3078	if (IS_ERR(fs_info->cleaner_kthread))
				3079	goto fail_sysfs;
				3080
				3081	fs_info->transaction_kthread = kthread_run(transaction_kthread,
				3082	tree_root,
				3083	"btrfs-transaction");
				3084	if (IS_ERR(fs_info->transaction_kthread))
				3085	goto fail_cleaner;
				3086
				3087	if (!btrfs_test_opt(fs_info, NOSSD) &&
				3088	!fs_info->fs_devices->rotating) {
				3089	btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations");
				3090	}
				3091
				3092	/*
				3093	* Mount does not set all options immediately, we can do it now and do
				3094	* not have to wait for transaction commit
				3095	*/
				3096	btrfs_apply_pending_changes(fs_info);
				3097
				3098	#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
				3099	if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
				3100	ret = btrfsic_mount(fs_info, fs_devices,
				3101	btrfs_test_opt(fs_info,
				3102	CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
				3103	1 : 0,
				3104	fs_info->check_integrity_print_mask);
				3105	if (ret)
				3106	btrfs_warn(fs_info,
				3107	"failed to initialize integrity check module: %d",
				3108	ret);
				3109	}
				3110	#endif
				3111	ret = btrfs_read_qgroup_config(fs_info);
				3112	if (ret)
				3113	goto fail_trans_kthread;
				3114
				3115	if (btrfs_build_ref_tree(fs_info))
				3116	btrfs_err(fs_info, "couldn't build ref tree");
				3117
				3118	/* do not make disk changes in broken FS or nologreplay is given */
				3119	if (btrfs_super_log_root(disk_super) != 0 &&
				3120	!btrfs_test_opt(fs_info, NOLOGREPLAY)) {
				3121	ret = btrfs_replay_log(fs_info, fs_devices);
				3122	if (ret) {
				3123	err = ret;
				3124	goto fail_qgroup;
				3125	}
				3126	}
				3127
				3128	ret = btrfs_find_orphan_roots(fs_info);
				3129	if (ret)
				3130	goto fail_qgroup;
				3131
				3132	if (!sb_rdonly(sb)) {
				3133	ret = btrfs_cleanup_fs_roots(fs_info);
				3134	if (ret)
				3135	goto fail_qgroup;
				3136
				3137	mutex_lock(&fs_info->cleaner_mutex);
				3138	ret = btrfs_recover_relocation(tree_root);
				3139	mutex_unlock(&fs_info->cleaner_mutex);
				3140	if (ret < 0) {
				3141	btrfs_warn(fs_info, "failed to recover relocation: %d",
				3142	ret);
				3143	err = -EINVAL;
				3144	goto fail_qgroup;
				3145	}
				3146	}
				3147
				3148	location.objectid = BTRFS_FS_TREE_OBJECTID;
				3149	location.type = BTRFS_ROOT_ITEM_KEY;
				3150	location.offset = 0;
				3151
				3152	fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
				3153	if (IS_ERR(fs_info->fs_root)) {
				3154	err = PTR_ERR(fs_info->fs_root);
				3155	btrfs_warn(fs_info, "failed to read fs tree: %d", err);
				3156	goto fail_qgroup;
				3157	}
				3158
				3159	if (sb_rdonly(sb))
				3160	return 0;
				3161
				3162	if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
				3163	btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
				3164	clear_free_space_tree = 1;
				3165	} else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
				3166	!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
				3167	btrfs_warn(fs_info, "free space tree is invalid");
				3168	clear_free_space_tree = 1;
				3169	}
				3170
				3171	if (clear_free_space_tree) {
				3172	btrfs_info(fs_info, "clearing free space tree");
				3173	ret = btrfs_clear_free_space_tree(fs_info);
				3174	if (ret) {
				3175	btrfs_warn(fs_info,
				3176	"failed to clear free space tree: %d", ret);
				3177	close_ctree(fs_info);
				3178	return ret;
				3179	}
				3180	}
				3181
				3182	if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
				3183	!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
				3184	btrfs_info(fs_info, "creating free space tree");
				3185	ret = btrfs_create_free_space_tree(fs_info);
				3186	if (ret) {
				3187	btrfs_warn(fs_info,
				3188	"failed to create free space tree: %d", ret);
				3189	close_ctree(fs_info);
				3190	return ret;
				3191	}
				3192	}
				3193
				3194	down_read(&fs_info->cleanup_work_sem);
				3195	if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) \|\|
				3196	(ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
				3197	up_read(&fs_info->cleanup_work_sem);
				3198	close_ctree(fs_info);
				3199	return ret;
				3200	}
				3201	up_read(&fs_info->cleanup_work_sem);
				3202
				3203	ret = btrfs_resume_balance_async(fs_info);
				3204	if (ret) {
				3205	btrfs_warn(fs_info, "failed to resume balance: %d", ret);
				3206	close_ctree(fs_info);
				3207	return ret;
				3208	}
				3209
				3210	ret = btrfs_resume_dev_replace_async(fs_info);
				3211	if (ret) {
				3212	btrfs_warn(fs_info, "failed to resume device replace: %d", ret);
				3213	close_ctree(fs_info);
				3214	return ret;
				3215	}
				3216
				3217	btrfs_qgroup_rescan_resume(fs_info);
				3218
				3219	if (!fs_info->uuid_root) {
				3220	btrfs_info(fs_info, "creating UUID tree");
				3221	ret = btrfs_create_uuid_tree(fs_info);
				3222	if (ret) {
				3223	btrfs_warn(fs_info,
				3224	"failed to create the UUID tree: %d", ret);
				3225	close_ctree(fs_info);
				3226	return ret;
				3227	}
				3228	} else if (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) \|\|
				3229	fs_info->generation !=
				3230	btrfs_super_uuid_tree_generation(disk_super)) {
				3231	btrfs_info(fs_info, "checking UUID tree");
				3232	ret = btrfs_check_uuid_tree(fs_info);
				3233	if (ret) {
				3234	btrfs_warn(fs_info,
				3235	"failed to check the UUID tree: %d", ret);
				3236	close_ctree(fs_info);
				3237	return ret;
				3238	}
				3239	} else {
				3240	set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
				3241	}
				3242	set_bit(BTRFS_FS_OPEN, &fs_info->flags);
				3243
				3244	/*
				3245	* backuproot only affect mount behavior, and if open_ctree succeeded,
				3246	* no need to keep the flag
				3247	*/
				3248	btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
				3249
				3250	return 0;
				3251
				3252	fail_qgroup:
				3253	btrfs_free_qgroup_config(fs_info);
				3254	fail_trans_kthread:
				3255	kthread_stop(fs_info->transaction_kthread);
				3256	btrfs_cleanup_transaction(fs_info);
				3257	btrfs_free_fs_roots(fs_info);
				3258	fail_cleaner:
				3259	kthread_stop(fs_info->cleaner_kthread);
				3260
				3261	/*
				3262	* make sure we're done with the btree inode before we stop our
				3263	* kthreads
				3264	*/
				3265	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
				3266
				3267	fail_sysfs:
				3268	btrfs_sysfs_remove_mounted(fs_info);
				3269
				3270	fail_fsdev_sysfs:
				3271	btrfs_sysfs_remove_fsid(fs_info->fs_devices);
				3272
				3273	fail_block_groups:
				3274	btrfs_put_block_group_cache(fs_info);
				3275
				3276	fail_tree_roots:
				3277	free_root_pointers(fs_info, 1);
				3278	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
				3279
				3280	fail_sb_buffer:
				3281	btrfs_stop_all_workers(fs_info);
				3282	btrfs_free_block_groups(fs_info);
				3283	fail_alloc:
				3284	fail_iput:
				3285	btrfs_mapping_tree_free(&fs_info->mapping_tree);
				3286
				3287	iput(fs_info->btree_inode);
				3288	fail_bio_counter:
				3289	percpu_counter_destroy(&fs_info->bio_counter);
				3290	fail_delalloc_bytes:
				3291	percpu_counter_destroy(&fs_info->delalloc_bytes);
				3292	fail_dirty_metadata_bytes:
				3293	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
				3294	fail_srcu:
				3295	cleanup_srcu_struct(&fs_info->subvol_srcu);
				3296	fail:
				3297	btrfs_free_stripe_hash_table(fs_info);
				3298	btrfs_close_devices(fs_info->fs_devices);
				3299	return err;
				3300
				3301	recovery_tree_root:
				3302	if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
				3303	goto fail_tree_roots;
				3304
				3305	free_root_pointers(fs_info, 0);
				3306
				3307	/* don't use the log in recovery mode, it won't be valid */
				3308	btrfs_set_super_log_root(disk_super, 0);
				3309
				3310	/* we can't trust the free space cache either */
				3311	btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
				3312
				3313	ret = next_root_backup(fs_info, fs_info->super_copy,
				3314	&num_backups_tried, &backup_index);
				3315	if (ret == -1)
				3316	goto fail_block_groups;
				3317	goto retry_root_backup;
				3318	}
				3319	ALLOW_ERROR_INJECTION(open_ctree, ERRNO);
				3320
				3321	static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
				3322	{
				3323	if (uptodate) {
				3324	set_buffer_uptodate(bh);
				3325	} else {
				3326	struct btrfs_device device = (struct btrfs_device )
				3327	bh->b_private;
				3328
				3329	btrfs_warn_rl_in_rcu(device->fs_info,
				3330	"lost page write due to IO error on %s",
				3331	rcu_str_deref(device->name));
				3332	/* note, we don't set_buffer_write_io_error because we have
				3333	* our own ways of dealing with the IO errors
				3334	*/
				3335	clear_buffer_uptodate(bh);
				3336	btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
				3337	}
				3338	unlock_buffer(bh);
				3339	put_bh(bh);
				3340	}
				3341
				3342	int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
				3343	struct buffer_head **bh_ret)
				3344	{
				3345	struct buffer_head *bh;
				3346	struct btrfs_super_block *super;
				3347	u64 bytenr;
				3348
				3349	bytenr = btrfs_sb_offset(copy_num);
				3350	if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
				3351	return -EINVAL;
				3352
				3353	bh = __bread(bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE);
				3354	/*
				3355	* If we fail to read from the underlying devices, as of now
				3356	* the best option we have is to mark it EIO.
				3357	*/
				3358	if (!bh)
				3359	return -EIO;
				3360
				3361	super = (struct btrfs_super_block *)bh->b_data;
				3362	if (btrfs_super_bytenr(super) != bytenr \|\|
				3363	btrfs_super_magic(super) != BTRFS_MAGIC) {
				3364	brelse(bh);
				3365	return -EINVAL;
				3366	}
				3367
				3368	*bh_ret = bh;
				3369	return 0;
				3370	}
				3371
				3372
				3373	struct buffer_head btrfs_read_dev_super(struct block_device bdev)
				3374	{
				3375	struct buffer_head *bh;
				3376	struct buffer_head *latest = NULL;
				3377	struct btrfs_super_block *super;
				3378	int i;
				3379	u64 transid = 0;
				3380	int ret = -EINVAL;
				3381
				3382	/* we would like to check all the supers, but that would make
				3383	* a btrfs mount succeed after a mkfs from a different FS.
				3384	* So, we need to add a special mount option to scan for
				3385	* later supers, using BTRFS_SUPER_MIRROR_MAX instead
				3386	*/
				3387	for (i = 0; i < 1; i++) {
				3388	ret = btrfs_read_dev_one_super(bdev, i, &bh);
				3389	if (ret)
				3390	continue;
				3391
				3392	super = (struct btrfs_super_block *)bh->b_data;
				3393
				3394	if (!latest \|\| btrfs_super_generation(super) > transid) {
				3395	brelse(latest);
				3396	latest = bh;
				3397	transid = btrfs_super_generation(super);
				3398	} else {
				3399	brelse(bh);
				3400	}
				3401	}
				3402
				3403	if (!latest)
				3404	return ERR_PTR(ret);
				3405
				3406	return latest;
				3407	}
				3408
				3409	/*
				3410	* Write superblock @sb to the @device. Do not wait for completion, all the
				3411	* buffer heads we write are pinned.
				3412	*
				3413	* Write @max_mirrors copies of the superblock, where 0 means default that fit
				3414	* the expected device size at commit time. Note that max_mirrors must be
				3415	* same for write and wait phases.
				3416	*
				3417	* Return number of errors when buffer head is not found or submission fails.
				3418	*/
				3419	static int write_dev_supers(struct btrfs_device *device,
				3420	struct btrfs_super_block *sb, int max_mirrors)
				3421	{
				3422	struct buffer_head *bh;
				3423	int i;
				3424	int ret;
				3425	int errors = 0;
				3426	u32 crc;
				3427	u64 bytenr;
				3428	int op_flags;
				3429
				3430	if (max_mirrors == 0)
				3431	max_mirrors = BTRFS_SUPER_MIRROR_MAX;
				3432
				3433	for (i = 0; i < max_mirrors; i++) {
				3434	bytenr = btrfs_sb_offset(i);
				3435	if (bytenr + BTRFS_SUPER_INFO_SIZE >=
				3436	device->commit_total_bytes)
				3437	break;
				3438
				3439	btrfs_set_super_bytenr(sb, bytenr);
				3440
				3441	crc = ~(u32)0;
				3442	crc = btrfs_csum_data((const char *)sb + BTRFS_CSUM_SIZE, crc,
				3443	BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
				3444	btrfs_csum_final(crc, sb->csum);
				3445
				3446	/* One reference for us, and we leave it for the caller */
				3447	bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE,
				3448	BTRFS_SUPER_INFO_SIZE);
				3449	if (!bh) {
				3450	btrfs_err(device->fs_info,
				3451	"couldn't get super buffer head for bytenr %llu",
				3452	bytenr);
				3453	errors++;
				3454	continue;
				3455	}
				3456
				3457	memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
				3458
				3459	/* one reference for submit_bh */
				3460	get_bh(bh);
				3461
				3462	set_buffer_uptodate(bh);
				3463	lock_buffer(bh);
				3464	bh->b_end_io = btrfs_end_buffer_write_sync;
				3465	bh->b_private = device;
				3466
				3467	/*
				3468	* we fua the first super. The others we allow
				3469	* to go down lazy.
				3470	*/
				3471	op_flags = REQ_SYNC \| REQ_META \| REQ_PRIO;
				3472	if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
				3473	op_flags \|= REQ_FUA;
				3474	ret = btrfsic_submit_bh(REQ_OP_WRITE, op_flags, bh);
				3475	if (ret)
				3476	errors++;
				3477	}
				3478	return errors < i ? 0 : -1;
				3479	}
				3480
				3481	/*
				3482	* Wait for write completion of superblocks done by write_dev_supers,
				3483	* @max_mirrors same for write and wait phases.
				3484	*
				3485	* Return number of errors when buffer head is not found or not marked up to
				3486	* date.
				3487	*/
				3488	static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
				3489	{
				3490	struct buffer_head *bh;
				3491	int i;
				3492	int errors = 0;
				3493	bool primary_failed = false;
				3494	u64 bytenr;
				3495
				3496	if (max_mirrors == 0)
				3497	max_mirrors = BTRFS_SUPER_MIRROR_MAX;
				3498
				3499	for (i = 0; i < max_mirrors; i++) {
				3500	bytenr = btrfs_sb_offset(i);
				3501	if (bytenr + BTRFS_SUPER_INFO_SIZE >=
				3502	device->commit_total_bytes)
				3503	break;
				3504
				3505	bh = __find_get_block(device->bdev,
				3506	bytenr / BTRFS_BDEV_BLOCKSIZE,
				3507	BTRFS_SUPER_INFO_SIZE);
				3508	if (!bh) {
				3509	errors++;
				3510	if (i == 0)
				3511	primary_failed = true;
				3512	continue;
				3513	}
				3514	wait_on_buffer(bh);
				3515	if (!buffer_uptodate(bh)) {
				3516	errors++;
				3517	if (i == 0)
				3518	primary_failed = true;
				3519	}
				3520
				3521	/* drop our reference */
				3522	brelse(bh);
				3523
				3524	/* drop the reference from the writing run */
				3525	brelse(bh);
				3526	}
				3527
				3528	/* log error, force error return */
				3529	if (primary_failed) {
				3530	btrfs_err(device->fs_info, "error writing primary super block to device %llu",
				3531	device->devid);
				3532	return -1;
				3533	}
				3534
				3535	return errors < i ? 0 : -1;
				3536	}
				3537
				3538	/*
				3539	* endio for the write_dev_flush, this will wake anyone waiting
				3540	* for the barrier when it is done
				3541	*/
				3542	static void btrfs_end_empty_barrier(struct bio *bio)
				3543	{
				3544	complete(bio->bi_private);
				3545	}
				3546
				3547	/*
				3548	* Submit a flush request to the device if it supports it. Error handling is
				3549	* done in the waiting counterpart.
				3550	*/
				3551	static void write_dev_flush(struct btrfs_device *device)
				3552	{
				3553	struct request_queue *q = bdev_get_queue(device->bdev);
				3554	struct bio *bio = device->flush_bio;
				3555
				3556	if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
				3557	return;
				3558
				3559	bio_reset(bio);
				3560	bio->bi_end_io = btrfs_end_empty_barrier;
				3561	bio_set_dev(bio, device->bdev);
				3562	bio->bi_opf = REQ_OP_WRITE \| REQ_SYNC \| REQ_PREFLUSH;
				3563	init_completion(&device->flush_wait);
				3564	bio->bi_private = &device->flush_wait;
				3565
				3566	btrfsic_submit_bio(bio);
				3567	set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
				3568	}
				3569
				3570	/*
				3571	* If the flush bio has been submitted by write_dev_flush, wait for it.
				3572	*/
				3573	static blk_status_t wait_dev_flush(struct btrfs_device *device)
				3574	{
				3575	struct bio *bio = device->flush_bio;
				3576
				3577	if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
				3578	return BLK_STS_OK;
				3579
				3580	clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
				3581	wait_for_completion_io(&device->flush_wait);
				3582
				3583	return bio->bi_status;
				3584	}
				3585
				3586	static int check_barrier_error(struct btrfs_fs_info *fs_info)
				3587	{
				3588	if (!btrfs_check_rw_degradable(fs_info, NULL))
				3589	return -EIO;
				3590	return 0;
				3591	}
				3592
				3593	/*
				3594	* send an empty flush down to each device in parallel,
				3595	* then wait for them
				3596	*/
				3597	static int barrier_all_devices(struct btrfs_fs_info *info)
				3598	{
				3599	struct list_head *head;
				3600	struct btrfs_device *dev;
				3601	int errors_wait = 0;
				3602	blk_status_t ret;
				3603
				3604	lockdep_assert_held(&info->fs_devices->device_list_mutex);
				3605	/* send down all the barriers */
				3606	head = &info->fs_devices->devices;
				3607	list_for_each_entry(dev, head, dev_list) {
				3608	if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
				3609	continue;
				3610	if (!dev->bdev)
				3611	continue;
				3612	if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) \|\|
				3613	!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
				3614	continue;
				3615
				3616	write_dev_flush(dev);
				3617	dev->last_flush_error = BLK_STS_OK;
				3618	}
				3619
				3620	/* wait for all the barriers */
				3621	list_for_each_entry(dev, head, dev_list) {
				3622	if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
				3623	continue;
				3624	if (!dev->bdev) {
				3625	errors_wait++;
				3626	continue;
				3627	}
				3628	if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) \|\|
				3629	!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
				3630	continue;
				3631
				3632	ret = wait_dev_flush(dev);
				3633	if (ret) {
				3634	dev->last_flush_error = ret;
				3635	btrfs_dev_stat_inc_and_print(dev,
				3636	BTRFS_DEV_STAT_FLUSH_ERRS);
				3637	errors_wait++;
				3638	}
				3639	}
				3640
				3641	if (errors_wait) {
				3642	/*
				3643	* At some point we need the status of all disks
				3644	* to arrive at the volume status. So error checking
				3645	* is being pushed to a separate loop.
				3646	*/
				3647	return check_barrier_error(info);
				3648	}
				3649	return 0;
				3650	}
				3651
				3652	int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
				3653	{
				3654	int raid_type;
				3655	int min_tolerated = INT_MAX;
				3656
				3657	if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 \|\|
				3658	(flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
				3659	min_tolerated = min(min_tolerated,
				3660	btrfs_raid_array[BTRFS_RAID_SINGLE].
				3661	tolerated_failures);
				3662
				3663	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
				3664	if (raid_type == BTRFS_RAID_SINGLE)
				3665	continue;
				3666	if (!(flags & btrfs_raid_array[raid_type].bg_flag))
				3667	continue;
				3668	min_tolerated = min(min_tolerated,
				3669	btrfs_raid_array[raid_type].
				3670	tolerated_failures);
				3671	}
				3672
				3673	if (min_tolerated == INT_MAX) {
				3674	pr_warn("BTRFS: unknown raid flag: %llu", flags);
				3675	min_tolerated = 0;
				3676	}
				3677
				3678	return min_tolerated;
				3679	}
				3680
				3681	int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
				3682	{
				3683	struct list_head *head;
				3684	struct btrfs_device *dev;
				3685	struct btrfs_super_block *sb;
				3686	struct btrfs_dev_item *dev_item;
				3687	int ret;
				3688	int do_barriers;
				3689	int max_errors;
				3690	int total_errors = 0;
				3691	u64 flags;
				3692
				3693	do_barriers = !btrfs_test_opt(fs_info, NOBARRIER);
				3694
				3695	/*
				3696	* max_mirrors == 0 indicates we're from commit_transaction,
				3697	* not from fsync where the tree roots in fs_info have not
				3698	* been consistent on disk.
				3699	*/
				3700	if (max_mirrors == 0)
				3701	backup_super_roots(fs_info);
				3702
				3703	sb = fs_info->super_for_commit;
				3704	dev_item = &sb->dev_item;
				3705
				3706	mutex_lock(&fs_info->fs_devices->device_list_mutex);
				3707	head = &fs_info->fs_devices->devices;
				3708	max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1;
				3709
				3710	if (do_barriers) {
				3711	ret = barrier_all_devices(fs_info);
				3712	if (ret) {
				3713	mutex_unlock(
				3714	&fs_info->fs_devices->device_list_mutex);
				3715	btrfs_handle_fs_error(fs_info, ret,
				3716	"errors while submitting device barriers.");
				3717	return ret;
				3718	}
				3719	}
				3720
				3721	list_for_each_entry(dev, head, dev_list) {
				3722	if (!dev->bdev) {
				3723	total_errors++;
				3724	continue;
				3725	}
				3726	if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) \|\|
				3727	!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
				3728	continue;
				3729
				3730	btrfs_set_stack_device_generation(dev_item, 0);
				3731	btrfs_set_stack_device_type(dev_item, dev->type);
				3732	btrfs_set_stack_device_id(dev_item, dev->devid);
				3733	btrfs_set_stack_device_total_bytes(dev_item,
				3734	dev->commit_total_bytes);
				3735	btrfs_set_stack_device_bytes_used(dev_item,
				3736	dev->commit_bytes_used);
				3737	btrfs_set_stack_device_io_align(dev_item, dev->io_align);
				3738	btrfs_set_stack_device_io_width(dev_item, dev->io_width);
				3739	btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
				3740	memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
				3741	memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_FSID_SIZE);
				3742
				3743	flags = btrfs_super_flags(sb);
				3744	btrfs_set_super_flags(sb, flags \| BTRFS_HEADER_FLAG_WRITTEN);
				3745
				3746	ret = btrfs_validate_write_super(fs_info, sb);
				3747	if (ret < 0) {
				3748	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				3749	btrfs_handle_fs_error(fs_info, -EUCLEAN,
				3750	"unexpected superblock corruption detected");
				3751	return -EUCLEAN;
				3752	}
				3753
				3754	ret = write_dev_supers(dev, sb, max_mirrors);
				3755	if (ret)
				3756	total_errors++;
				3757	}
				3758	if (total_errors > max_errors) {
				3759	btrfs_err(fs_info, "%d errors while writing supers",
				3760	total_errors);
				3761	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				3762
				3763	/* FUA is masked off if unsupported and can't be the reason */
				3764	btrfs_handle_fs_error(fs_info, -EIO,
				3765	"%d errors while writing supers",
				3766	total_errors);
				3767	return -EIO;
				3768	}
				3769
				3770	total_errors = 0;
				3771	list_for_each_entry(dev, head, dev_list) {
				3772	if (!dev->bdev)
				3773	continue;
				3774	if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) \|\|
				3775	!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
				3776	continue;
				3777
				3778	ret = wait_dev_supers(dev, max_mirrors);
				3779	if (ret)
				3780	total_errors++;
				3781	}
				3782	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				3783	if (total_errors > max_errors) {
				3784	btrfs_handle_fs_error(fs_info, -EIO,
				3785	"%d errors while writing supers",
				3786	total_errors);
				3787	return -EIO;
				3788	}
				3789	return 0;
				3790	}
				3791
				3792	/* Drop a fs root from the radix tree and free it. */
				3793	void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
				3794	struct btrfs_root *root)
				3795	{
				3796	spin_lock(&fs_info->fs_roots_radix_lock);
				3797	radix_tree_delete(&fs_info->fs_roots_radix,
				3798	(unsigned long)root->root_key.objectid);
				3799	spin_unlock(&fs_info->fs_roots_radix_lock);
				3800
				3801	if (btrfs_root_refs(&root->root_item) == 0)
				3802	synchronize_srcu(&fs_info->subvol_srcu);
				3803
				3804	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
				3805	btrfs_free_log(NULL, root);
				3806	if (root->reloc_root) {
				3807	free_extent_buffer(root->reloc_root->node);
				3808	free_extent_buffer(root->reloc_root->commit_root);
				3809	btrfs_put_fs_root(root->reloc_root);
				3810	root->reloc_root = NULL;
				3811	}
				3812	}
				3813
				3814	if (root->free_ino_pinned)
				3815	__btrfs_remove_free_space_cache(root->free_ino_pinned);
				3816	if (root->free_ino_ctl)
				3817	__btrfs_remove_free_space_cache(root->free_ino_ctl);
				3818	btrfs_free_fs_root(root);
				3819	}
				3820
				3821	void btrfs_free_fs_root(struct btrfs_root *root)
				3822	{
				3823	iput(root->ino_cache_inode);
				3824	WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
				3825	if (root->anon_dev)
				3826	free_anon_bdev(root->anon_dev);
				3827	if (root->subv_writers)
				3828	btrfs_free_subvolume_writers(root->subv_writers);
				3829	free_extent_buffer(root->node);
				3830	free_extent_buffer(root->commit_root);
				3831	kfree(root->free_ino_ctl);
				3832	kfree(root->free_ino_pinned);
				3833	btrfs_put_fs_root(root);
				3834	}
				3835
				3836	int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
				3837	{
				3838	u64 root_objectid = 0;
				3839	struct btrfs_root *gang[8];
				3840	int i = 0;
				3841	int err = 0;
				3842	unsigned int ret = 0;
				3843	int index;
				3844
				3845	while (1) {
				3846	index = srcu_read_lock(&fs_info->subvol_srcu);
				3847	ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
				3848	(void **)gang, root_objectid,
				3849	ARRAY_SIZE(gang));
				3850	if (!ret) {
				3851	srcu_read_unlock(&fs_info->subvol_srcu, index);
				3852	break;
				3853	}
				3854	root_objectid = gang[ret - 1]->root_key.objectid + 1;
				3855
				3856	for (i = 0; i < ret; i++) {
				3857	/* Avoid to grab roots in dead_roots */
				3858	if (btrfs_root_refs(&gang[i]->root_item) == 0) {
				3859	gang[i] = NULL;
				3860	continue;
				3861	}
				3862	/* grab all the search result for later use */
				3863	gang[i] = btrfs_grab_fs_root(gang[i]);
				3864	}
				3865	srcu_read_unlock(&fs_info->subvol_srcu, index);
				3866
				3867	for (i = 0; i < ret; i++) {
				3868	if (!gang[i])
				3869	continue;
				3870	root_objectid = gang[i]->root_key.objectid;
				3871	err = btrfs_orphan_cleanup(gang[i]);
				3872	if (err)
				3873	break;
				3874	btrfs_put_fs_root(gang[i]);
				3875	}
				3876	root_objectid++;
				3877	}
				3878
				3879	/* release the uncleaned roots due to error */
				3880	for (; i < ret; i++) {
				3881	if (gang[i])
				3882	btrfs_put_fs_root(gang[i]);
				3883	}
				3884	return err;
				3885	}
				3886
				3887	int btrfs_commit_super(struct btrfs_fs_info *fs_info)
				3888	{
				3889	struct btrfs_root *root = fs_info->tree_root;
				3890	struct btrfs_trans_handle *trans;
				3891
				3892	mutex_lock(&fs_info->cleaner_mutex);
				3893	btrfs_run_delayed_iputs(fs_info);
				3894	mutex_unlock(&fs_info->cleaner_mutex);
				3895	wake_up_process(fs_info->cleaner_kthread);
				3896
				3897	/* wait until ongoing cleanup work done */
				3898	down_write(&fs_info->cleanup_work_sem);
				3899	up_write(&fs_info->cleanup_work_sem);
				3900
				3901	trans = btrfs_join_transaction(root);
				3902	if (IS_ERR(trans))
				3903	return PTR_ERR(trans);
				3904	return btrfs_commit_transaction(trans);
				3905	}
				3906
				3907	void close_ctree(struct btrfs_fs_info *fs_info)
				3908	{
				3909	int ret;
				3910
				3911	set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
				3912	/*
				3913	* We don't want the cleaner to start new transactions, add more delayed
				3914	* iputs, etc. while we're closing. We can't use kthread_stop() yet
				3915	* because that frees the task_struct, and the transaction kthread might
				3916	* still try to wake up the cleaner.
				3917	*/
				3918	kthread_park(fs_info->cleaner_kthread);
				3919
				3920	/* wait for the qgroup rescan worker to stop */
				3921	btrfs_qgroup_wait_for_completion(fs_info, false);
				3922
				3923	/* wait for the uuid_scan task to finish */
				3924	down(&fs_info->uuid_tree_rescan_sem);
				3925	/* avoid complains from lockdep et al., set sem back to initial state */
				3926	up(&fs_info->uuid_tree_rescan_sem);
				3927
				3928	/* pause restriper - we want to resume on mount */
				3929	btrfs_pause_balance(fs_info);
				3930
				3931	btrfs_dev_replace_suspend_for_unmount(fs_info);
				3932
				3933	btrfs_scrub_cancel(fs_info);
				3934
				3935	/* wait for any defraggers to finish */
				3936	wait_event(fs_info->transaction_wait,
				3937	(atomic_read(&fs_info->defrag_running) == 0));
				3938
				3939	/* clear out the rbtree of defraggable inodes */
				3940	btrfs_cleanup_defrag_inodes(fs_info);
				3941
				3942	cancel_work_sync(&fs_info->async_reclaim_work);
				3943
				3944	if (!sb_rdonly(fs_info->sb)) {
				3945	/*
				3946	* The cleaner kthread is stopped, so do one final pass over
				3947	* unused block groups.
				3948	*/
				3949	btrfs_delete_unused_bgs(fs_info);
				3950
				3951	ret = btrfs_commit_super(fs_info);
				3952	if (ret)
				3953	btrfs_err(fs_info, "commit super ret %d", ret);
				3954	}
				3955
				3956	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) \|\|
				3957	test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state))
				3958	btrfs_error_commit_super(fs_info);
				3959
				3960	kthread_stop(fs_info->transaction_kthread);
				3961	kthread_stop(fs_info->cleaner_kthread);
				3962
				3963	set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
				3964
				3965	btrfs_free_qgroup_config(fs_info);
				3966	ASSERT(list_empty(&fs_info->delalloc_roots));
				3967
				3968	if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
				3969	btrfs_info(fs_info, "at unmount delalloc count %lld",
				3970	percpu_counter_sum(&fs_info->delalloc_bytes));
				3971	}
				3972
				3973	btrfs_sysfs_remove_mounted(fs_info);
				3974	btrfs_sysfs_remove_fsid(fs_info->fs_devices);
				3975
				3976	btrfs_free_fs_roots(fs_info);
				3977
				3978	btrfs_put_block_group_cache(fs_info);
				3979
				3980	/*
				3981	* we must make sure there is not any read request to
				3982	* submit after we stopping all workers.
				3983	*/
				3984	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
				3985	btrfs_stop_all_workers(fs_info);
				3986
				3987	btrfs_free_block_groups(fs_info);
				3988
				3989	clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
				3990	free_root_pointers(fs_info, 1);
				3991
				3992	iput(fs_info->btree_inode);
				3993
				3994	#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
				3995	if (btrfs_test_opt(fs_info, CHECK_INTEGRITY))
				3996	btrfsic_unmount(fs_info->fs_devices);
				3997	#endif
				3998
				3999	btrfs_close_devices(fs_info->fs_devices);
				4000	btrfs_mapping_tree_free(&fs_info->mapping_tree);
				4001
				4002	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
				4003	percpu_counter_destroy(&fs_info->delalloc_bytes);
				4004	percpu_counter_destroy(&fs_info->bio_counter);
				4005	cleanup_srcu_struct(&fs_info->subvol_srcu);
				4006
				4007	btrfs_free_stripe_hash_table(fs_info);
				4008	btrfs_free_ref_cache(fs_info);
				4009
				4010	while (!list_empty(&fs_info->pinned_chunks)) {
				4011	struct extent_map *em;
				4012
				4013	em = list_first_entry(&fs_info->pinned_chunks,
				4014	struct extent_map, list);
				4015	list_del_init(&em->list);
				4016	free_extent_map(em);
				4017	}
				4018	}
				4019
				4020	int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
				4021	int atomic)
				4022	{
				4023	int ret;
				4024	struct inode *btree_inode = buf->pages[0]->mapping->host;
				4025
				4026	ret = extent_buffer_uptodate(buf);
				4027	if (!ret)
				4028	return ret;
				4029
				4030	ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
				4031	parent_transid, atomic);
				4032	if (ret == -EAGAIN)
				4033	return ret;
				4034	return !ret;
				4035	}
				4036
				4037	void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
				4038	{
				4039	struct btrfs_fs_info *fs_info;
				4040	struct btrfs_root *root;
				4041	u64 transid = btrfs_header_generation(buf);
				4042	int was_dirty;
				4043
				4044	#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
				4045	/*
				4046	* This is a fast path so only do this check if we have sanity tests
				4047	* enabled. Normal people shouldn't be using umapped buffers as dirty
				4048	* outside of the sanity tests.
				4049	*/
				4050	if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
				4051	return;
				4052	#endif
				4053	root = BTRFS_I(buf->pages[0]->mapping->host)->root;
				4054	fs_info = root->fs_info;
				4055	btrfs_assert_tree_locked(buf);
				4056	if (transid != fs_info->generation)
				4057	WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
				4058	buf->start, transid, fs_info->generation);
				4059	was_dirty = set_extent_buffer_dirty(buf);
				4060	if (!was_dirty)
				4061	percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
				4062	buf->len,
				4063	fs_info->dirty_metadata_batch);
				4064	#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
				4065	/*
				4066	* Since btrfs_mark_buffer_dirty() can be called with item pointer set
				4067	* but item data not updated.
				4068	* So here we should only check item pointers, not item data.
				4069	*/
				4070	if (btrfs_header_level(buf) == 0 &&
				4071	btrfs_check_leaf_relaxed(fs_info, buf)) {
				4072	btrfs_print_leaf(buf);
				4073	ASSERT(0);
				4074	}
				4075	#endif
				4076	}
				4077
				4078	static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
				4079	int flush_delayed)
				4080	{
				4081	/*
				4082	* looks as though older kernels can get into trouble with
				4083	* this code, they end up stuck in balance_dirty_pages forever
				4084	*/
				4085	int ret;
				4086
				4087	if (current->flags & PF_MEMALLOC)
				4088	return;
				4089
				4090	if (flush_delayed)
				4091	btrfs_balance_delayed_items(fs_info);
				4092
				4093	ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
				4094	BTRFS_DIRTY_METADATA_THRESH,
				4095	fs_info->dirty_metadata_batch);
				4096	if (ret > 0) {
				4097	balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping);
				4098	}
				4099	}
				4100
				4101	void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info)
				4102	{
				4103	__btrfs_btree_balance_dirty(fs_info, 1);
				4104	}
				4105
				4106	void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
				4107	{
				4108	__btrfs_btree_balance_dirty(fs_info, 0);
				4109	}
				4110
				4111	int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
				4112	struct btrfs_key *first_key)
				4113	{
				4114	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
				4115	struct btrfs_fs_info *fs_info = root->fs_info;
				4116
				4117	return btree_read_extent_buffer_pages(fs_info, buf, parent_transid,
				4118	level, first_key);
				4119	}
				4120
				4121	static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
				4122	{
				4123	/* cleanup FS via transaction */
				4124	btrfs_cleanup_transaction(fs_info);
				4125
				4126	mutex_lock(&fs_info->cleaner_mutex);
				4127	btrfs_run_delayed_iputs(fs_info);
				4128	mutex_unlock(&fs_info->cleaner_mutex);
				4129
				4130	down_write(&fs_info->cleanup_work_sem);
				4131	up_write(&fs_info->cleanup_work_sem);
				4132	}
				4133
				4134	static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
				4135	{
				4136	struct btrfs_ordered_extent *ordered;
				4137
				4138	spin_lock(&root->ordered_extent_lock);
				4139	/*
				4140	* This will just short circuit the ordered completion stuff which will
				4141	* make sure the ordered extent gets properly cleaned up.
				4142	*/
				4143	list_for_each_entry(ordered, &root->ordered_extents,
				4144	root_extent_list)
				4145	set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
				4146	spin_unlock(&root->ordered_extent_lock);
				4147	}
				4148
				4149	static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
				4150	{
				4151	struct btrfs_root *root;
				4152	struct list_head splice;
				4153
				4154	INIT_LIST_HEAD(&splice);
				4155
				4156	spin_lock(&fs_info->ordered_root_lock);
				4157	list_splice_init(&fs_info->ordered_roots, &splice);
				4158	while (!list_empty(&splice)) {
				4159	root = list_first_entry(&splice, struct btrfs_root,
				4160	ordered_root);
				4161	list_move_tail(&root->ordered_root,
				4162	&fs_info->ordered_roots);
				4163
				4164	spin_unlock(&fs_info->ordered_root_lock);
				4165	btrfs_destroy_ordered_extents(root);
				4166
				4167	cond_resched();
				4168	spin_lock(&fs_info->ordered_root_lock);
				4169	}
				4170	spin_unlock(&fs_info->ordered_root_lock);
				4171
				4172	/*
				4173	* We need this here because if we've been flipped read-only we won't
				4174	* get sync() from the umount, so we need to make sure any ordered
				4175	* extents that haven't had their dirty pages IO start writeout yet
				4176	* actually get run and error out properly.
				4177	*/
				4178	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
				4179	}
				4180
				4181	static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
				4182	struct btrfs_fs_info *fs_info)
				4183	{
				4184	struct rb_node *node;
				4185	struct btrfs_delayed_ref_root *delayed_refs;
				4186	struct btrfs_delayed_ref_node *ref;
				4187	int ret = 0;
				4188
				4189	delayed_refs = &trans->delayed_refs;
				4190
				4191	spin_lock(&delayed_refs->lock);
				4192	if (atomic_read(&delayed_refs->num_entries) == 0) {
				4193	spin_unlock(&delayed_refs->lock);
				4194	btrfs_info(fs_info, "delayed_refs has NO entry");
				4195	return ret;
				4196	}
				4197
				4198	while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
				4199	struct btrfs_delayed_ref_head *head;
				4200	struct rb_node *n;
				4201	bool pin_bytes = false;
				4202
				4203	head = rb_entry(node, struct btrfs_delayed_ref_head,
				4204	href_node);
				4205	if (!mutex_trylock(&head->mutex)) {
				4206	refcount_inc(&head->refs);
				4207	spin_unlock(&delayed_refs->lock);
				4208
				4209	mutex_lock(&head->mutex);
				4210	mutex_unlock(&head->mutex);
				4211	btrfs_put_delayed_ref_head(head);
				4212	spin_lock(&delayed_refs->lock);
				4213	continue;
				4214	}
				4215	spin_lock(&head->lock);
				4216	while ((n = rb_first(&head->ref_tree)) != NULL) {
				4217	ref = rb_entry(n, struct btrfs_delayed_ref_node,
				4218	ref_node);
				4219	ref->in_tree = 0;
				4220	rb_erase(&ref->ref_node, &head->ref_tree);
				4221	RB_CLEAR_NODE(&ref->ref_node);
				4222	if (!list_empty(&ref->add_list))
				4223	list_del(&ref->add_list);
				4224	atomic_dec(&delayed_refs->num_entries);
				4225	btrfs_put_delayed_ref(ref);
				4226	}
				4227	if (head->must_insert_reserved)
				4228	pin_bytes = true;
				4229	btrfs_free_delayed_extent_op(head->extent_op);
				4230	delayed_refs->num_heads--;
				4231	if (head->processing == 0)
				4232	delayed_refs->num_heads_ready--;
				4233	atomic_dec(&delayed_refs->num_entries);
				4234	rb_erase(&head->href_node, &delayed_refs->href_root);
				4235	RB_CLEAR_NODE(&head->href_node);
				4236	spin_unlock(&head->lock);
				4237	spin_unlock(&delayed_refs->lock);
				4238	mutex_unlock(&head->mutex);
				4239
				4240	if (pin_bytes)
				4241	btrfs_pin_extent(fs_info, head->bytenr,
				4242	head->num_bytes, 1);
				4243	btrfs_put_delayed_ref_head(head);
				4244	cond_resched();
				4245	spin_lock(&delayed_refs->lock);
				4246	}
				4247
				4248	spin_unlock(&delayed_refs->lock);
				4249
				4250	return ret;
				4251	}
				4252
				4253	static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
				4254	{
				4255	struct btrfs_inode *btrfs_inode;
				4256	struct list_head splice;
				4257
				4258	INIT_LIST_HEAD(&splice);
				4259
				4260	spin_lock(&root->delalloc_lock);
				4261	list_splice_init(&root->delalloc_inodes, &splice);
				4262
				4263	while (!list_empty(&splice)) {
				4264	struct inode *inode = NULL;
				4265	btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
				4266	delalloc_inodes);
				4267	__btrfs_del_delalloc_inode(root, btrfs_inode);
				4268	spin_unlock(&root->delalloc_lock);
				4269
				4270	/*
				4271	* Make sure we get a live inode and that it'll not disappear
				4272	* meanwhile.
				4273	*/
				4274	inode = igrab(&btrfs_inode->vfs_inode);
				4275	if (inode) {
				4276	invalidate_inode_pages2(inode->i_mapping);
				4277	iput(inode);
				4278	}
				4279	spin_lock(&root->delalloc_lock);
				4280	}
				4281	spin_unlock(&root->delalloc_lock);
				4282	}
				4283
				4284	static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
				4285	{
				4286	struct btrfs_root *root;
				4287	struct list_head splice;
				4288
				4289	INIT_LIST_HEAD(&splice);
				4290
				4291	spin_lock(&fs_info->delalloc_root_lock);
				4292	list_splice_init(&fs_info->delalloc_roots, &splice);
				4293	while (!list_empty(&splice)) {
				4294	root = list_first_entry(&splice, struct btrfs_root,
				4295	delalloc_root);
				4296	root = btrfs_grab_fs_root(root);
				4297	BUG_ON(!root);
				4298	spin_unlock(&fs_info->delalloc_root_lock);
				4299
				4300	btrfs_destroy_delalloc_inodes(root);
				4301	btrfs_put_fs_root(root);
				4302
				4303	spin_lock(&fs_info->delalloc_root_lock);
				4304	}
				4305	spin_unlock(&fs_info->delalloc_root_lock);
				4306	}
				4307
				4308	static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
				4309	struct extent_io_tree *dirty_pages,
				4310	int mark)
				4311	{
				4312	int ret;
				4313	struct extent_buffer *eb;
				4314	u64 start = 0;
				4315	u64 end;
				4316
				4317	while (1) {
				4318	ret = find_first_extent_bit(dirty_pages, start, &start, &end,
				4319	mark, NULL);
				4320	if (ret)
				4321	break;
				4322
				4323	clear_extent_bits(dirty_pages, start, end, mark);
				4324	while (start <= end) {
				4325	eb = find_extent_buffer(fs_info, start);
				4326	start += fs_info->nodesize;
				4327	if (!eb)
				4328	continue;
				4329	wait_on_extent_buffer_writeback(eb);
				4330
				4331	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
				4332	&eb->bflags))
				4333	clear_extent_buffer_dirty(eb);
				4334	free_extent_buffer_stale(eb);
				4335	}
				4336	}
				4337
				4338	return ret;
				4339	}
				4340
				4341	static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
				4342	struct extent_io_tree *pinned_extents)
				4343	{
				4344	struct extent_io_tree *unpin;
				4345	u64 start;
				4346	u64 end;
				4347	int ret;
				4348	bool loop = true;
				4349
				4350	unpin = pinned_extents;
				4351	again:
				4352	while (1) {
				4353	struct extent_state *cached_state = NULL;
				4354
				4355	/*
				4356	* The btrfs_finish_extent_commit() may get the same range as
				4357	* ours between find_first_extent_bit and clear_extent_dirty.
				4358	* Hence, hold the unused_bg_unpin_mutex to avoid double unpin
				4359	* the same extent range.
				4360	*/
				4361	mutex_lock(&fs_info->unused_bg_unpin_mutex);
				4362	ret = find_first_extent_bit(unpin, 0, &start, &end,
				4363	EXTENT_DIRTY, &cached_state);
				4364	if (ret) {
				4365	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
				4366	break;
				4367	}
				4368
				4369	clear_extent_dirty(unpin, start, end, &cached_state);
				4370	free_extent_state(cached_state);
				4371	btrfs_error_unpin_extent_range(fs_info, start, end);
				4372	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
				4373	cond_resched();
				4374	}
				4375
				4376	if (loop) {
				4377	if (unpin == &fs_info->freed_extents[0])
				4378	unpin = &fs_info->freed_extents[1];
				4379	else
				4380	unpin = &fs_info->freed_extents[0];
				4381	loop = false;
				4382	goto again;
				4383	}
				4384
				4385	return 0;
				4386	}
				4387
				4388	static void btrfs_cleanup_bg_io(struct btrfs_block_group_cache *cache)
				4389	{
				4390	struct inode *inode;
				4391
				4392	inode = cache->io_ctl.inode;
				4393	if (inode) {
				4394	invalidate_inode_pages2(inode->i_mapping);
				4395	BTRFS_I(inode)->generation = 0;
				4396	cache->io_ctl.inode = NULL;
				4397	iput(inode);
				4398	}
				4399	btrfs_put_block_group(cache);
				4400	}
				4401
				4402	void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
				4403	struct btrfs_fs_info *fs_info)
				4404	{
				4405	struct btrfs_block_group_cache *cache;
				4406
				4407	spin_lock(&cur_trans->dirty_bgs_lock);
				4408	while (!list_empty(&cur_trans->dirty_bgs)) {
				4409	cache = list_first_entry(&cur_trans->dirty_bgs,
				4410	struct btrfs_block_group_cache,
				4411	dirty_list);
				4412
				4413	if (!list_empty(&cache->io_list)) {
				4414	spin_unlock(&cur_trans->dirty_bgs_lock);
				4415	list_del_init(&cache->io_list);
				4416	btrfs_cleanup_bg_io(cache);
				4417	spin_lock(&cur_trans->dirty_bgs_lock);
				4418	}
				4419
				4420	list_del_init(&cache->dirty_list);
				4421	spin_lock(&cache->lock);
				4422	cache->disk_cache_state = BTRFS_DC_ERROR;
				4423	spin_unlock(&cache->lock);
				4424
				4425	spin_unlock(&cur_trans->dirty_bgs_lock);
				4426	btrfs_put_block_group(cache);
				4427	spin_lock(&cur_trans->dirty_bgs_lock);
				4428	}
				4429	spin_unlock(&cur_trans->dirty_bgs_lock);
				4430
				4431	/*
				4432	* Refer to the definition of io_bgs member for details why it's safe
				4433	* to use it without any locking
				4434	*/
				4435	while (!list_empty(&cur_trans->io_bgs)) {
				4436	cache = list_first_entry(&cur_trans->io_bgs,
				4437	struct btrfs_block_group_cache,
				4438	io_list);
				4439
				4440	list_del_init(&cache->io_list);
				4441	spin_lock(&cache->lock);
				4442	cache->disk_cache_state = BTRFS_DC_ERROR;
				4443	spin_unlock(&cache->lock);
				4444	btrfs_cleanup_bg_io(cache);
				4445	}
				4446	}
				4447
				4448	void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
				4449	struct btrfs_fs_info *fs_info)
				4450	{
				4451	btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
				4452	ASSERT(list_empty(&cur_trans->dirty_bgs));
				4453	ASSERT(list_empty(&cur_trans->io_bgs));
				4454
				4455	btrfs_destroy_delayed_refs(cur_trans, fs_info);
				4456
				4457	cur_trans->state = TRANS_STATE_COMMIT_START;
				4458	wake_up(&fs_info->transaction_blocked_wait);
				4459
				4460	cur_trans->state = TRANS_STATE_UNBLOCKED;
				4461	wake_up(&fs_info->transaction_wait);
				4462
				4463	btrfs_destroy_delayed_inodes(fs_info);
				4464	btrfs_assert_delayed_root_empty(fs_info);
				4465
				4466	btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
				4467	EXTENT_DIRTY);
				4468	btrfs_destroy_pinned_extent(fs_info,
				4469	fs_info->pinned_extents);
				4470
				4471	cur_trans->state =TRANS_STATE_COMPLETED;
				4472	wake_up(&cur_trans->commit_wait);
				4473	}
				4474
				4475	static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
				4476	{
				4477	struct btrfs_transaction *t;
				4478
				4479	mutex_lock(&fs_info->transaction_kthread_mutex);
				4480
				4481	spin_lock(&fs_info->trans_lock);
				4482	while (!list_empty(&fs_info->trans_list)) {
				4483	t = list_first_entry(&fs_info->trans_list,
				4484	struct btrfs_transaction, list);
				4485	if (t->state >= TRANS_STATE_COMMIT_START) {
				4486	refcount_inc(&t->use_count);
				4487	spin_unlock(&fs_info->trans_lock);
				4488	btrfs_wait_for_commit(fs_info, t->transid);
				4489	btrfs_put_transaction(t);
				4490	spin_lock(&fs_info->trans_lock);
				4491	continue;
				4492	}
				4493	if (t == fs_info->running_transaction) {
				4494	t->state = TRANS_STATE_COMMIT_DOING;
				4495	spin_unlock(&fs_info->trans_lock);
				4496	/*
				4497	* We wait for 0 num_writers since we don't hold a trans
				4498	* handle open currently for this transaction.
				4499	*/
				4500	wait_event(t->writer_wait,
				4501	atomic_read(&t->num_writers) == 0);
				4502	} else {
				4503	spin_unlock(&fs_info->trans_lock);
				4504	}
				4505	btrfs_cleanup_one_transaction(t, fs_info);
				4506
				4507	spin_lock(&fs_info->trans_lock);
				4508	if (t == fs_info->running_transaction)
				4509	fs_info->running_transaction = NULL;
				4510	list_del_init(&t->list);
				4511	spin_unlock(&fs_info->trans_lock);
				4512
				4513	btrfs_put_transaction(t);
				4514	trace_btrfs_transaction_commit(fs_info->tree_root);
				4515	spin_lock(&fs_info->trans_lock);
				4516	}
				4517	spin_unlock(&fs_info->trans_lock);
				4518	btrfs_destroy_all_ordered_extents(fs_info);
				4519	btrfs_destroy_delayed_inodes(fs_info);
				4520	btrfs_assert_delayed_root_empty(fs_info);
				4521	btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents);
				4522	btrfs_destroy_all_delalloc_inodes(fs_info);
				4523	mutex_unlock(&fs_info->transaction_kthread_mutex);
				4524
				4525	return 0;
				4526	}
				4527
				4528	static const struct extent_io_ops btree_extent_io_ops = {
				4529	/* mandatory callbacks */
				4530	.submit_bio_hook = btree_submit_bio_hook,
				4531	.readpage_end_io_hook = btree_readpage_end_io_hook,
				4532	.readpage_io_failed_hook = btree_io_failed_hook,
				4533
				4534	/* optional callbacks */
				4535	};