Blame - src/kernel/linux/v4.19/fs/btrfs/transaction.c - T800

blob: 26317bca56499a406492dd84e4a602e2ee00f1eb [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Copyright (C) 2007 Oracle. All rights reserved.
				4	*/
				5
				6	#include <linux/fs.h>
				7	#include <linux/slab.h>
				8	#include <linux/sched.h>
				9	#include <linux/writeback.h>
				10	#include <linux/pagemap.h>
				11	#include <linux/blkdev.h>
				12	#include <linux/uuid.h>
				13	#include "ctree.h"
				14	#include "disk-io.h"
				15	#include "transaction.h"
				16	#include "locking.h"
				17	#include "tree-log.h"
				18	#include "inode-map.h"
				19	#include "volumes.h"
				20	#include "dev-replace.h"
				21	#include "qgroup.h"
				22
				23	#define BTRFS_ROOT_TRANS_TAG 0
				24
				25	static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
				26	[TRANS_STATE_RUNNING] = 0U,
				27	[TRANS_STATE_BLOCKED] = __TRANS_START,
				28	[TRANS_STATE_COMMIT_START] = (__TRANS_START \| __TRANS_ATTACH),
				29	[TRANS_STATE_COMMIT_DOING] = (__TRANS_START \|
				30	__TRANS_ATTACH \|
				31	__TRANS_JOIN \|
				32	__TRANS_JOIN_NOSTART),
				33	[TRANS_STATE_UNBLOCKED] = (__TRANS_START \|
				34	__TRANS_ATTACH \|
				35	__TRANS_JOIN \|
				36	__TRANS_JOIN_NOLOCK \|
				37	__TRANS_JOIN_NOSTART),
				38	[TRANS_STATE_COMPLETED] = (__TRANS_START \|
				39	__TRANS_ATTACH \|
				40	__TRANS_JOIN \|
				41	__TRANS_JOIN_NOLOCK \|
				42	__TRANS_JOIN_NOSTART),
				43	};
				44
				45	void btrfs_put_transaction(struct btrfs_transaction *transaction)
				46	{
				47	WARN_ON(refcount_read(&transaction->use_count) == 0);
				48	if (refcount_dec_and_test(&transaction->use_count)) {
				49	BUG_ON(!list_empty(&transaction->list));
				50	WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
				51	if (transaction->delayed_refs.pending_csums)
				52	btrfs_err(transaction->fs_info,
				53	"pending csums is %llu",
				54	transaction->delayed_refs.pending_csums);
				55	while (!list_empty(&transaction->pending_chunks)) {
				56	struct extent_map *em;
				57
				58	em = list_first_entry(&transaction->pending_chunks,
				59	struct extent_map, list);
				60	list_del_init(&em->list);
				61	free_extent_map(em);
				62	}
				63	/*
				64	* If any block groups are found in ->deleted_bgs then it's
				65	* because the transaction was aborted and a commit did not
				66	* happen (things failed before writing the new superblock
				67	* and calling btrfs_finish_extent_commit()), so we can not
				68	* discard the physical locations of the block groups.
				69	*/
				70	while (!list_empty(&transaction->deleted_bgs)) {
				71	struct btrfs_block_group_cache *cache;
				72
				73	cache = list_first_entry(&transaction->deleted_bgs,
				74	struct btrfs_block_group_cache,
				75	bg_list);
				76	list_del_init(&cache->bg_list);
				77	btrfs_put_block_group_trimming(cache);
				78	btrfs_put_block_group(cache);
				79	}
				80	kfree(transaction);
				81	}
				82	}
				83
				84	static void clear_btree_io_tree(struct extent_io_tree *tree)
				85	{
				86	spin_lock(&tree->lock);
				87	/*
				88	* Do a single barrier for the waitqueue_active check here, the state
				89	* of the waitqueue should not change once clear_btree_io_tree is
				90	* called.
				91	*/
				92	smp_mb();
				93	while (!RB_EMPTY_ROOT(&tree->state)) {
				94	struct rb_node *node;
				95	struct extent_state *state;
				96
				97	node = rb_first(&tree->state);
				98	state = rb_entry(node, struct extent_state, rb_node);
				99	rb_erase(&state->rb_node, &tree->state);
				100	RB_CLEAR_NODE(&state->rb_node);
				101	/*
				102	* btree io trees aren't supposed to have tasks waiting for
				103	* changes in the flags of extent states ever.
				104	*/
				105	ASSERT(!waitqueue_active(&state->wq));
				106	free_extent_state(state);
				107
				108	cond_resched_lock(&tree->lock);
				109	}
				110	spin_unlock(&tree->lock);
				111	}
				112
				113	static noinline void switch_commit_roots(struct btrfs_transaction *trans)
				114	{
				115	struct btrfs_fs_info *fs_info = trans->fs_info;
				116	struct btrfs_root root, tmp;
				117
				118	down_write(&fs_info->commit_root_sem);
				119	list_for_each_entry_safe(root, tmp, &trans->switch_commits,
				120	dirty_list) {
				121	list_del_init(&root->dirty_list);
				122	free_extent_buffer(root->commit_root);
				123	root->commit_root = btrfs_root_node(root);
				124	if (is_fstree(root->objectid))
				125	btrfs_unpin_free_ino(root);
				126	clear_btree_io_tree(&root->dirty_log_pages);
				127	}
				128
				129	/* We can free old roots now. */
				130	spin_lock(&trans->dropped_roots_lock);
				131	while (!list_empty(&trans->dropped_roots)) {
				132	root = list_first_entry(&trans->dropped_roots,
				133	struct btrfs_root, root_list);
				134	list_del_init(&root->root_list);
				135	spin_unlock(&trans->dropped_roots_lock);
				136	btrfs_drop_and_free_fs_root(fs_info, root);
				137	spin_lock(&trans->dropped_roots_lock);
				138	}
				139	spin_unlock(&trans->dropped_roots_lock);
				140	up_write(&fs_info->commit_root_sem);
				141	}
				142
				143	static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
				144	unsigned int type)
				145	{
				146	if (type & TRANS_EXTWRITERS)
				147	atomic_inc(&trans->num_extwriters);
				148	}
				149
				150	static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
				151	unsigned int type)
				152	{
				153	if (type & TRANS_EXTWRITERS)
				154	atomic_dec(&trans->num_extwriters);
				155	}
				156
				157	static inline void extwriter_counter_init(struct btrfs_transaction *trans,
				158	unsigned int type)
				159	{
				160	atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0));
				161	}
				162
				163	static inline int extwriter_counter_read(struct btrfs_transaction *trans)
				164	{
				165	return atomic_read(&trans->num_extwriters);
				166	}
				167
				168	/*
				169	* either allocate a new transaction or hop into the existing one
				170	*/
				171	static noinline int join_transaction(struct btrfs_fs_info *fs_info,
				172	unsigned int type)
				173	{
				174	struct btrfs_transaction *cur_trans;
				175
				176	spin_lock(&fs_info->trans_lock);
				177	loop:
				178	/* The file system has been taken offline. No new transactions. */
				179	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
				180	spin_unlock(&fs_info->trans_lock);
				181	return -EROFS;
				182	}
				183
				184	cur_trans = fs_info->running_transaction;
				185	if (cur_trans) {
				186	if (cur_trans->aborted) {
				187	spin_unlock(&fs_info->trans_lock);
				188	return cur_trans->aborted;
				189	}
				190	if (btrfs_blocked_trans_types[cur_trans->state] & type) {
				191	spin_unlock(&fs_info->trans_lock);
				192	return -EBUSY;
				193	}
				194	refcount_inc(&cur_trans->use_count);
				195	atomic_inc(&cur_trans->num_writers);
				196	extwriter_counter_inc(cur_trans, type);
				197	spin_unlock(&fs_info->trans_lock);
				198	return 0;
				199	}
				200	spin_unlock(&fs_info->trans_lock);
				201
				202	/*
				203	* If we are ATTACH, we just want to catch the current transaction,
				204	* and commit it. If there is no transaction, just return ENOENT.
				205	*/
				206	if (type == TRANS_ATTACH)
				207	return -ENOENT;
				208
				209	/*
				210	* JOIN_NOLOCK only happens during the transaction commit, so
				211	* it is impossible that ->running_transaction is NULL
				212	*/
				213	BUG_ON(type == TRANS_JOIN_NOLOCK);
				214
				215	cur_trans = kmalloc(sizeof(*cur_trans), GFP_NOFS);
				216	if (!cur_trans)
				217	return -ENOMEM;
				218
				219	spin_lock(&fs_info->trans_lock);
				220	if (fs_info->running_transaction) {
				221	/*
				222	* someone started a transaction after we unlocked. Make sure
				223	* to redo the checks above
				224	*/
				225	kfree(cur_trans);
				226	goto loop;
				227	} else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
				228	spin_unlock(&fs_info->trans_lock);
				229	kfree(cur_trans);
				230	return -EROFS;
				231	}
				232
				233	cur_trans->fs_info = fs_info;
				234	atomic_set(&cur_trans->num_writers, 1);
				235	extwriter_counter_init(cur_trans, type);
				236	init_waitqueue_head(&cur_trans->writer_wait);
				237	init_waitqueue_head(&cur_trans->commit_wait);
				238	init_waitqueue_head(&cur_trans->pending_wait);
				239	cur_trans->state = TRANS_STATE_RUNNING;
				240	/*
				241	* One for this trans handle, one so it will live on until we
				242	* commit the transaction.
				243	*/
				244	refcount_set(&cur_trans->use_count, 2);
				245	atomic_set(&cur_trans->pending_ordered, 0);
				246	cur_trans->flags = 0;
				247	cur_trans->start_time = ktime_get_seconds();
				248
				249	memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
				250
				251	cur_trans->delayed_refs.href_root = RB_ROOT;
				252	cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
				253	atomic_set(&cur_trans->delayed_refs.num_entries, 0);
				254
				255	/*
				256	* although the tree mod log is per file system and not per transaction,
				257	* the log must never go across transaction boundaries.
				258	*/
				259	smp_mb();
				260	if (!list_empty(&fs_info->tree_mod_seq_list))
				261	WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when creating a fresh transaction\n");
				262	if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
				263	WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when creating a fresh transaction\n");
				264	atomic64_set(&fs_info->tree_mod_seq, 0);
				265
				266	spin_lock_init(&cur_trans->delayed_refs.lock);
				267
				268	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
				269	INIT_LIST_HEAD(&cur_trans->pending_chunks);
				270	INIT_LIST_HEAD(&cur_trans->switch_commits);
				271	INIT_LIST_HEAD(&cur_trans->dirty_bgs);
				272	INIT_LIST_HEAD(&cur_trans->io_bgs);
				273	INIT_LIST_HEAD(&cur_trans->dropped_roots);
				274	mutex_init(&cur_trans->cache_write_mutex);
				275	cur_trans->num_dirty_bgs = 0;
				276	spin_lock_init(&cur_trans->dirty_bgs_lock);
				277	INIT_LIST_HEAD(&cur_trans->deleted_bgs);
				278	spin_lock_init(&cur_trans->dropped_roots_lock);
				279	list_add_tail(&cur_trans->list, &fs_info->trans_list);
				280	extent_io_tree_init(&cur_trans->dirty_pages,
				281	fs_info->btree_inode);
				282	fs_info->generation++;
				283	cur_trans->transid = fs_info->generation;
				284	fs_info->running_transaction = cur_trans;
				285	cur_trans->aborted = 0;
				286	spin_unlock(&fs_info->trans_lock);
				287
				288	return 0;
				289	}
				290
				291	/*
				292	* this does all the record keeping required to make sure that a reference
				293	* counted root is properly recorded in a given transaction. This is required
				294	* to make sure the old root from before we joined the transaction is deleted
				295	* when the transaction commits
				296	*/
				297	static int record_root_in_trans(struct btrfs_trans_handle *trans,
				298	struct btrfs_root *root,
				299	int force)
				300	{
				301	struct btrfs_fs_info *fs_info = root->fs_info;
				302
				303	if ((test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
				304	root->last_trans < trans->transid) \|\| force) {
				305	WARN_ON(root == fs_info->extent_root);
				306	WARN_ON(!force && root->commit_root != root->node);
				307
				308	/*
				309	* see below for IN_TRANS_SETUP usage rules
				310	* we have the reloc mutex held now, so there
				311	* is only one writer in this function
				312	*/
				313	set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
				314
				315	/* make sure readers find IN_TRANS_SETUP before
				316	* they find our root->last_trans update
				317	*/
				318	smp_wmb();
				319
				320	spin_lock(&fs_info->fs_roots_radix_lock);
				321	if (root->last_trans == trans->transid && !force) {
				322	spin_unlock(&fs_info->fs_roots_radix_lock);
				323	return 0;
				324	}
				325	radix_tree_tag_set(&fs_info->fs_roots_radix,
				326	(unsigned long)root->root_key.objectid,
				327	BTRFS_ROOT_TRANS_TAG);
				328	spin_unlock(&fs_info->fs_roots_radix_lock);
				329	root->last_trans = trans->transid;
				330
				331	/* this is pretty tricky. We don't want to
				332	* take the relocation lock in btrfs_record_root_in_trans
				333	* unless we're really doing the first setup for this root in
				334	* this transaction.
				335	*
				336	* Normally we'd use root->last_trans as a flag to decide
				337	* if we want to take the expensive mutex.
				338	*
				339	* But, we have to set root->last_trans before we
				340	* init the relocation root, otherwise, we trip over warnings
				341	* in ctree.c. The solution used here is to flag ourselves
				342	* with root IN_TRANS_SETUP. When this is 1, we're still
				343	* fixing up the reloc trees and everyone must wait.
				344	*
				345	* When this is zero, they can trust root->last_trans and fly
				346	* through btrfs_record_root_in_trans without having to take the
				347	* lock. smp_wmb() makes sure that all the writes above are
				348	* done before we pop in the zero below
				349	*/
				350	btrfs_init_reloc_root(trans, root);
				351	smp_mb__before_atomic();
				352	clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
				353	}
				354	return 0;
				355	}
				356
				357
				358	void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
				359	struct btrfs_root *root)
				360	{
				361	struct btrfs_fs_info *fs_info = root->fs_info;
				362	struct btrfs_transaction *cur_trans = trans->transaction;
				363
				364	/* Add ourselves to the transaction dropped list */
				365	spin_lock(&cur_trans->dropped_roots_lock);
				366	list_add_tail(&root->root_list, &cur_trans->dropped_roots);
				367	spin_unlock(&cur_trans->dropped_roots_lock);
				368
				369	/* Make sure we don't try to update the root at commit time */
				370	spin_lock(&fs_info->fs_roots_radix_lock);
				371	radix_tree_tag_clear(&fs_info->fs_roots_radix,
				372	(unsigned long)root->root_key.objectid,
				373	BTRFS_ROOT_TRANS_TAG);
				374	spin_unlock(&fs_info->fs_roots_radix_lock);
				375	}
				376
				377	int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
				378	struct btrfs_root *root)
				379	{
				380	struct btrfs_fs_info *fs_info = root->fs_info;
				381
				382	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
				383	return 0;
				384
				385	/*
				386	* see record_root_in_trans for comments about IN_TRANS_SETUP usage
				387	* and barriers
				388	*/
				389	smp_rmb();
				390	if (root->last_trans == trans->transid &&
				391	!test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
				392	return 0;
				393
				394	mutex_lock(&fs_info->reloc_mutex);
				395	record_root_in_trans(trans, root, 0);
				396	mutex_unlock(&fs_info->reloc_mutex);
				397
				398	return 0;
				399	}
				400
				401	static inline int is_transaction_blocked(struct btrfs_transaction *trans)
				402	{
				403	return (trans->state >= TRANS_STATE_BLOCKED &&
				404	trans->state < TRANS_STATE_UNBLOCKED &&
				405	!trans->aborted);
				406	}
				407
				408	/* wait for commit against the current transaction to become unblocked
				409	* when this is done, it is safe to start a new transaction, but the current
				410	* transaction might not be fully on disk.
				411	*/
				412	static void wait_current_trans(struct btrfs_fs_info *fs_info)
				413	{
				414	struct btrfs_transaction *cur_trans;
				415
				416	spin_lock(&fs_info->trans_lock);
				417	cur_trans = fs_info->running_transaction;
				418	if (cur_trans && is_transaction_blocked(cur_trans)) {
				419	refcount_inc(&cur_trans->use_count);
				420	spin_unlock(&fs_info->trans_lock);
				421
				422	wait_event(fs_info->transaction_wait,
				423	cur_trans->state >= TRANS_STATE_UNBLOCKED \|\|
				424	cur_trans->aborted);
				425	btrfs_put_transaction(cur_trans);
				426	} else {
				427	spin_unlock(&fs_info->trans_lock);
				428	}
				429	}
				430
				431	static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type)
				432	{
				433	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
				434	return 0;
				435
				436	if (type == TRANS_START)
				437	return 1;
				438
				439	return 0;
				440	}
				441
				442	static inline bool need_reserve_reloc_root(struct btrfs_root *root)
				443	{
				444	struct btrfs_fs_info *fs_info = root->fs_info;
				445
				446	if (!fs_info->reloc_ctl \|\|
				447	!test_bit(BTRFS_ROOT_REF_COWS, &root->state) \|\|
				448	root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID \|\|
				449	root->reloc_root)
				450	return false;
				451
				452	return true;
				453	}
				454
				455	static struct btrfs_trans_handle *
				456	start_transaction(struct btrfs_root *root, unsigned int num_items,
				457	unsigned int type, enum btrfs_reserve_flush_enum flush,
				458	bool enforce_qgroups)
				459	{
				460	struct btrfs_fs_info *fs_info = root->fs_info;
				461
				462	struct btrfs_trans_handle *h;
				463	struct btrfs_transaction *cur_trans;
				464	u64 num_bytes = 0;
				465	u64 qgroup_reserved = 0;
				466	bool reloc_reserved = false;
				467	int ret;
				468
				469	/* Send isn't supposed to start transactions. */
				470	ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB);
				471
				472	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
				473	return ERR_PTR(-EROFS);
				474
				475	if (current->journal_info) {
				476	WARN_ON(type & TRANS_EXTWRITERS);
				477	h = current->journal_info;
				478	refcount_inc(&h->use_count);
				479	WARN_ON(refcount_read(&h->use_count) > 2);
				480	h->orig_rsv = h->block_rsv;
				481	h->block_rsv = NULL;
				482	goto got_it;
				483	}
				484
				485	/*
				486	* Do the reservation before we join the transaction so we can do all
				487	* the appropriate flushing if need be.
				488	*/
				489	if (num_items && root != fs_info->chunk_root) {
				490	qgroup_reserved = num_items * fs_info->nodesize;
				491	ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved,
				492	enforce_qgroups);
				493	if (ret)
				494	return ERR_PTR(ret);
				495
				496	num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items);
				497	/*
				498	* Do the reservation for the relocation root creation
				499	*/
				500	if (need_reserve_reloc_root(root)) {
				501	num_bytes += fs_info->nodesize;
				502	reloc_reserved = true;
				503	}
				504
				505	ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv,
				506	num_bytes, flush);
				507	if (ret)
				508	goto reserve_fail;
				509	}
				510	again:
				511	h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS);
				512	if (!h) {
				513	ret = -ENOMEM;
				514	goto alloc_fail;
				515	}
				516
				517	/*
				518	* If we are JOIN_NOLOCK we're already committing a transaction and
				519	* waiting on this guy, so we don't need to do the sb_start_intwrite
				520	* because we're already holding a ref. We need this because we could
				521	* have raced in and did an fsync() on a file which can kick a commit
				522	* and then we deadlock with somebody doing a freeze.
				523	*
				524	* If we are ATTACH, it means we just want to catch the current
				525	* transaction and commit it, so we needn't do sb_start_intwrite().
				526	*/
				527	if (type & __TRANS_FREEZABLE)
				528	sb_start_intwrite(fs_info->sb);
				529
				530	if (may_wait_transaction(fs_info, type))
				531	wait_current_trans(fs_info);
				532
				533	do {
				534	ret = join_transaction(fs_info, type);
				535	if (ret == -EBUSY) {
				536	wait_current_trans(fs_info);
				537	if (unlikely(type == TRANS_ATTACH \|\|
				538	type == TRANS_JOIN_NOSTART))
				539	ret = -ENOENT;
				540	}
				541	} while (ret == -EBUSY);
				542
				543	if (ret < 0)
				544	goto join_fail;
				545
				546	cur_trans = fs_info->running_transaction;
				547
				548	h->transid = cur_trans->transid;
				549	h->transaction = cur_trans;
				550	h->root = root;
				551	refcount_set(&h->use_count, 1);
				552	h->fs_info = root->fs_info;
				553
				554	h->type = type;
				555	h->can_flush_pending_bgs = true;
				556	INIT_LIST_HEAD(&h->new_bgs);
				557
				558	smp_mb();
				559	if (cur_trans->state >= TRANS_STATE_BLOCKED &&
				560	may_wait_transaction(fs_info, type)) {
				561	current->journal_info = h;
				562	btrfs_commit_transaction(h);
				563	goto again;
				564	}
				565
				566	if (num_bytes) {
				567	trace_btrfs_space_reservation(fs_info, "transaction",
				568	h->transid, num_bytes, 1);
				569	h->block_rsv = &fs_info->trans_block_rsv;
				570	h->bytes_reserved = num_bytes;
				571	h->reloc_reserved = reloc_reserved;
				572	}
				573
				574	got_it:
				575	btrfs_record_root_in_trans(h, root);
				576
				577	if (!current->journal_info)
				578	current->journal_info = h;
				579	return h;
				580
				581	join_fail:
				582	if (type & __TRANS_FREEZABLE)
				583	sb_end_intwrite(fs_info->sb);
				584	kmem_cache_free(btrfs_trans_handle_cachep, h);
				585	alloc_fail:
				586	if (num_bytes)
				587	btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv,
				588	num_bytes);
				589	reserve_fail:
				590	btrfs_qgroup_free_meta_pertrans(root, qgroup_reserved);
				591	return ERR_PTR(ret);
				592	}
				593
				594	struct btrfs_trans_handle btrfs_start_transaction(struct btrfs_root root,
				595	unsigned int num_items)
				596	{
				597	return start_transaction(root, num_items, TRANS_START,
				598	BTRFS_RESERVE_FLUSH_ALL, true);
				599	}
				600
				601	struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
				602	struct btrfs_root *root,
				603	unsigned int num_items,
				604	int min_factor)
				605	{
				606	struct btrfs_fs_info *fs_info = root->fs_info;
				607	struct btrfs_trans_handle *trans;
				608	u64 num_bytes;
				609	int ret;
				610
				611	/*
				612	* We have two callers: unlink and block group removal. The
				613	* former should succeed even if we will temporarily exceed
				614	* quota and the latter operates on the extent root so
				615	* qgroup enforcement is ignored anyway.
				616	*/
				617	trans = start_transaction(root, num_items, TRANS_START,
				618	BTRFS_RESERVE_FLUSH_ALL, false);
				619	if (!IS_ERR(trans) \|\| PTR_ERR(trans) != -ENOSPC)
				620	return trans;
				621
				622	trans = btrfs_start_transaction(root, 0);
				623	if (IS_ERR(trans))
				624	return trans;
				625
				626	num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items);
				627	ret = btrfs_cond_migrate_bytes(fs_info, &fs_info->trans_block_rsv,
				628	num_bytes, min_factor);
				629	if (ret) {
				630	btrfs_end_transaction(trans);
				631	return ERR_PTR(ret);
				632	}
				633
				634	trans->block_rsv = &fs_info->trans_block_rsv;
				635	trans->bytes_reserved = num_bytes;
				636	trace_btrfs_space_reservation(fs_info, "transaction",
				637	trans->transid, num_bytes, 1);
				638
				639	return trans;
				640	}
				641
				642	struct btrfs_trans_handle btrfs_join_transaction(struct btrfs_root root)
				643	{
				644	return start_transaction(root, 0, TRANS_JOIN, BTRFS_RESERVE_NO_FLUSH,
				645	true);
				646	}
				647
				648	struct btrfs_trans_handle btrfs_join_transaction_nolock(struct btrfs_root root)
				649	{
				650	return start_transaction(root, 0, TRANS_JOIN_NOLOCK,
				651	BTRFS_RESERVE_NO_FLUSH, true);
				652	}
				653
				654	/*
				655	* Similar to regular join but it never starts a transaction when none is
				656	* running or after waiting for the current one to finish.
				657	*/
				658	struct btrfs_trans_handle btrfs_join_transaction_nostart(struct btrfs_root root)
				659	{
				660	return start_transaction(root, 0, TRANS_JOIN_NOSTART,
				661	BTRFS_RESERVE_NO_FLUSH, true);
				662	}
				663
				664	/*
				665	* btrfs_attach_transaction() - catch the running transaction
				666	*
				667	* It is used when we want to commit the current the transaction, but
				668	* don't want to start a new one.
				669	*
				670	* Note: If this function return -ENOENT, it just means there is no
				671	* running transaction. But it is possible that the inactive transaction
				672	* is still in the memory, not fully on disk. If you hope there is no
				673	* inactive transaction in the fs when -ENOENT is returned, you should
				674	* invoke
				675	* btrfs_attach_transaction_barrier()
				676	*/
				677	struct btrfs_trans_handle btrfs_attach_transaction(struct btrfs_root root)
				678	{
				679	return start_transaction(root, 0, TRANS_ATTACH,
				680	BTRFS_RESERVE_NO_FLUSH, true);
				681	}
				682
				683	/*
				684	* btrfs_attach_transaction_barrier() - catch the running transaction
				685	*
				686	* It is similar to the above function, the differentia is this one
				687	* will wait for all the inactive transactions until they fully
				688	* complete.
				689	*/
				690	struct btrfs_trans_handle *
				691	btrfs_attach_transaction_barrier(struct btrfs_root *root)
				692	{
				693	struct btrfs_trans_handle *trans;
				694
				695	trans = start_transaction(root, 0, TRANS_ATTACH,
				696	BTRFS_RESERVE_NO_FLUSH, true);
				697	if (trans == ERR_PTR(-ENOENT))
				698	btrfs_wait_for_commit(root->fs_info, 0);
				699
				700	return trans;
				701	}
				702
				703	/* wait for a transaction commit to be fully complete */
				704	static noinline void wait_for_commit(struct btrfs_transaction *commit)
				705	{
				706	wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED);
				707	}
				708
				709	int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
				710	{
				711	struct btrfs_transaction cur_trans = NULL, t;
				712	int ret = 0;
				713
				714	if (transid) {
				715	if (transid <= fs_info->last_trans_committed)
				716	goto out;
				717
				718	/* find specified transaction */
				719	spin_lock(&fs_info->trans_lock);
				720	list_for_each_entry(t, &fs_info->trans_list, list) {
				721	if (t->transid == transid) {
				722	cur_trans = t;
				723	refcount_inc(&cur_trans->use_count);
				724	ret = 0;
				725	break;
				726	}
				727	if (t->transid > transid) {
				728	ret = 0;
				729	break;
				730	}
				731	}
				732	spin_unlock(&fs_info->trans_lock);
				733
				734	/*
				735	* The specified transaction doesn't exist, or we
				736	* raced with btrfs_commit_transaction
				737	*/
				738	if (!cur_trans) {
				739	if (transid > fs_info->last_trans_committed)
				740	ret = -EINVAL;
				741	goto out;
				742	}
				743	} else {
				744	/* find newest transaction that is committing \| committed */
				745	spin_lock(&fs_info->trans_lock);
				746	list_for_each_entry_reverse(t, &fs_info->trans_list,
				747	list) {
				748	if (t->state >= TRANS_STATE_COMMIT_START) {
				749	if (t->state == TRANS_STATE_COMPLETED)
				750	break;
				751	cur_trans = t;
				752	refcount_inc(&cur_trans->use_count);
				753	break;
				754	}
				755	}
				756	spin_unlock(&fs_info->trans_lock);
				757	if (!cur_trans)
				758	goto out; /* nothing committing\|committed */
				759	}
				760
				761	wait_for_commit(cur_trans);
				762	btrfs_put_transaction(cur_trans);
				763	out:
				764	return ret;
				765	}
				766
				767	void btrfs_throttle(struct btrfs_fs_info *fs_info)
				768	{
				769	wait_current_trans(fs_info);
				770	}
				771
				772	static int should_end_transaction(struct btrfs_trans_handle *trans)
				773	{
				774	struct btrfs_fs_info *fs_info = trans->fs_info;
				775
				776	if (btrfs_check_space_for_delayed_refs(trans, fs_info))
				777	return 1;
				778
				779	return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
				780	}
				781
				782	int btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
				783	{
				784	struct btrfs_transaction *cur_trans = trans->transaction;
				785	int updates;
				786	int err;
				787
				788	smp_mb();
				789	if (cur_trans->state >= TRANS_STATE_BLOCKED \|\|
				790	cur_trans->delayed_refs.flushing)
				791	return 1;
				792
				793	updates = trans->delayed_ref_updates;
				794	trans->delayed_ref_updates = 0;
				795	if (updates) {
				796	err = btrfs_run_delayed_refs(trans, updates * 2);
				797	if (err) /* Error code will also eval true */
				798	return err;
				799	}
				800
				801	return should_end_transaction(trans);
				802	}
				803
				804	static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
				805
				806	{
				807	struct btrfs_fs_info *fs_info = trans->fs_info;
				808
				809	if (!trans->block_rsv) {
				810	ASSERT(!trans->bytes_reserved);
				811	return;
				812	}
				813
				814	if (!trans->bytes_reserved)
				815	return;
				816
				817	ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
				818	trace_btrfs_space_reservation(fs_info, "transaction",
				819	trans->transid, trans->bytes_reserved, 0);
				820	btrfs_block_rsv_release(fs_info, trans->block_rsv,
				821	trans->bytes_reserved);
				822	trans->bytes_reserved = 0;
				823	}
				824
				825	static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
				826	int throttle)
				827	{
				828	struct btrfs_fs_info *info = trans->fs_info;
				829	struct btrfs_transaction *cur_trans = trans->transaction;
				830	u64 transid = trans->transid;
				831	unsigned long cur = trans->delayed_ref_updates;
				832	int lock = (trans->type != TRANS_JOIN_NOLOCK);
				833	int err = 0;
				834	int must_run_delayed_refs = 0;
				835
				836	if (refcount_read(&trans->use_count) > 1) {
				837	refcount_dec(&trans->use_count);
				838	trans->block_rsv = trans->orig_rsv;
				839	return 0;
				840	}
				841
				842	btrfs_trans_release_metadata(trans);
				843	trans->block_rsv = NULL;
				844
				845	if (!list_empty(&trans->new_bgs))
				846	btrfs_create_pending_block_groups(trans);
				847
				848	trans->delayed_ref_updates = 0;
				849	if (!trans->sync) {
				850	must_run_delayed_refs =
				851	btrfs_should_throttle_delayed_refs(trans, info);
				852	cur = max_t(unsigned long, cur, 32);
				853
				854	/*
				855	* don't make the caller wait if they are from a NOLOCK
				856	* or ATTACH transaction, it will deadlock with commit
				857	*/
				858	if (must_run_delayed_refs == 1 &&
				859	(trans->type & (__TRANS_JOIN_NOLOCK \| __TRANS_ATTACH)))
				860	must_run_delayed_refs = 2;
				861	}
				862
				863	btrfs_trans_release_metadata(trans);
				864	trans->block_rsv = NULL;
				865
				866	if (!list_empty(&trans->new_bgs))
				867	btrfs_create_pending_block_groups(trans);
				868
				869	btrfs_trans_release_chunk_metadata(trans);
				870
				871	if (lock && should_end_transaction(trans) &&
				872	READ_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
				873	spin_lock(&info->trans_lock);
				874	if (cur_trans->state == TRANS_STATE_RUNNING)
				875	cur_trans->state = TRANS_STATE_BLOCKED;
				876	spin_unlock(&info->trans_lock);
				877	}
				878
				879	if (lock && READ_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
				880	if (throttle)
				881	return btrfs_commit_transaction(trans);
				882	else
				883	wake_up_process(info->transaction_kthread);
				884	}
				885
				886	if (trans->type & __TRANS_FREEZABLE)
				887	sb_end_intwrite(info->sb);
				888
				889	WARN_ON(cur_trans != info->running_transaction);
				890	WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
				891	atomic_dec(&cur_trans->num_writers);
				892	extwriter_counter_dec(cur_trans, trans->type);
				893
				894	cond_wake_up(&cur_trans->writer_wait);
				895	btrfs_put_transaction(cur_trans);
				896
				897	if (current->journal_info == trans)
				898	current->journal_info = NULL;
				899
				900	if (throttle)
				901	btrfs_run_delayed_iputs(info);
				902
				903	if (trans->aborted \|\|
				904	test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) {
				905	wake_up_process(info->transaction_kthread);
				906	err = -EIO;
				907	}
				908
				909	kmem_cache_free(btrfs_trans_handle_cachep, trans);
				910	if (must_run_delayed_refs) {
				911	btrfs_async_run_delayed_refs(info, cur, transid,
				912	must_run_delayed_refs == 1);
				913	}
				914	return err;
				915	}
				916
				917	int btrfs_end_transaction(struct btrfs_trans_handle *trans)
				918	{
				919	return __btrfs_end_transaction(trans, 0);
				920	}
				921
				/*
				 * Release a transaction handle and request throttling.
				 *
				 * Thin wrapper that forwards @trans to __btrfs_end_transaction() with the
				 * second argument set and hands back whatever that call returns.
				 */
				int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans)
				{
					const int throttle = 1;

					return __btrfs_end_transaction(trans, throttle);
				}
				926
				927	/*
				928	* when btree blocks are allocated, they have some corresponding bits set for
				929	* them in one of two extent_io trees. This is used to make sure all of
				930	* those extents are sent to disk but does not wait on them
				931	*/
				932	int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
				933	struct extent_io_tree *dirty_pages, int mark)
				934	{
				935	int err = 0;
				936	int werr = 0;
				937	struct address_space *mapping = fs_info->btree_inode->i_mapping;
				938	struct extent_state *cached_state = NULL;
				939	u64 start = 0;
				940	u64 end;
				941
				942	atomic_inc(&BTRFS_I(fs_info->btree_inode)->sync_writers);
				943	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
				944	mark, &cached_state)) {
				945	bool wait_writeback = false;
				946
				947	err = convert_extent_bit(dirty_pages, start, end,
				948	EXTENT_NEED_WAIT,
				949	mark, &cached_state);
				950	/*
				951	* convert_extent_bit can return -ENOMEM, which is most of the
				952	* time a temporary error. So when it happens, ignore the error
				953	* and wait for writeback of this range to finish - because we
				954	* failed to set the bit EXTENT_NEED_WAIT for the range, a call
				955	* to __btrfs_wait_marked_extents() would not know that
				956	* writeback for this range started and therefore wouldn't
				957	* wait for it to finish - we don't want to commit a
				958	* superblock that points to btree nodes/leafs for which
				959	* writeback hasn't finished yet (and without errors).
				960	* We cleanup any entries left in the io tree when committing
				961	* the transaction (through clear_btree_io_tree()).
				962	*/
				963	if (err == -ENOMEM) {
				964	err = 0;
				965	wait_writeback = true;
				966	}
				967	if (!err)
				968	err = filemap_fdatawrite_range(mapping, start, end);
				969	if (err)
				970	werr = err;
				971	else if (wait_writeback)
				972	werr = filemap_fdatawait_range(mapping, start, end);
				973	free_extent_state(cached_state);
				974	cached_state = NULL;
				975	cond_resched();
				976	start = end + 1;
				977	}
				978	atomic_dec(&BTRFS_I(fs_info->btree_inode)->sync_writers);
				979	return werr;
				980	}
				981
				982	/*
				983	* when btree blocks are allocated, they have some corresponding bits set for
				984	* them in one of two extent_io trees. This is used to make sure all of
				985	* those extents are on disk for transaction or log commit. We wait
				986	* on all the pages and clear them from the dirty pages state tree
				987	*/
				988	static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
				989	struct extent_io_tree *dirty_pages)
				990	{
				991	int err = 0;
				992	int werr = 0;
				993	struct address_space *mapping = fs_info->btree_inode->i_mapping;
				994	struct extent_state *cached_state = NULL;
				995	u64 start = 0;
				996	u64 end;
				997
				998	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
				999	EXTENT_NEED_WAIT, &cached_state)) {
				1000	/*
				1001	* Ignore -ENOMEM errors returned by clear_extent_bit().
				1002	* When committing the transaction, we'll remove any entries
				1003	* left in the io tree. For a log commit, we don't remove them
				1004	* after committing the log because the tree can be accessed
				1005	* concurrently - we do it only at transaction commit time when
				1006	* it's safe to do it (through clear_btree_io_tree()).
				1007	*/
				1008	err = clear_extent_bit(dirty_pages, start, end,
				1009	EXTENT_NEED_WAIT, 0, 0, &cached_state);
				1010	if (err == -ENOMEM)
				1011	err = 0;
				1012	if (!err)
				1013	err = filemap_fdatawait_range(mapping, start, end);
				1014	if (err)
				1015	werr = err;
				1016	free_extent_state(cached_state);
				1017	cached_state = NULL;
				1018	cond_resched();
				1019	start = end + 1;
				1020	}
				1021	if (err)
				1022	werr = err;
				1023	return werr;
				1024	}
				1025
				1026	int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
				1027	struct extent_io_tree *dirty_pages)
				1028	{
				1029	bool errors = false;
				1030	int err;
				1031
				1032	err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
				1033	if (test_and_clear_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags))
				1034	errors = true;
				1035
				1036	if (errors && !err)
				1037	err = -EIO;
				1038	return err;
				1039	}
				1040
				1041	int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark)
				1042	{
				1043	struct btrfs_fs_info *fs_info = log_root->fs_info;
				1044	struct extent_io_tree *dirty_pages = &log_root->dirty_log_pages;
				1045	bool errors = false;
				1046	int err;
				1047
				1048	ASSERT(log_root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
				1049
				1050	err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
				1051	if ((mark & EXTENT_DIRTY) &&
				1052	test_and_clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags))
				1053	errors = true;
				1054
				1055	if ((mark & EXTENT_NEW) &&
				1056	test_and_clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags))
				1057	errors = true;
				1058
				1059	if (errors && !err)
				1060	err = -EIO;
				1061	return err;
				1062	}
				1063
				1064	/*
				1065	* When btree blocks are allocated the corresponding extents are marked dirty.
				1066	* This function ensures such extents are persisted on disk for transaction or
				1067	* log commit.
				1068	*
				1069	* @trans: transaction whose dirty pages we'd like to write
				1070	*/
				1071	static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans)
				1072	{
				1073	int ret;
				1074	int ret2;
				1075	struct extent_io_tree *dirty_pages = &trans->transaction->dirty_pages;
				1076	struct btrfs_fs_info *fs_info = trans->fs_info;
				1077	struct blk_plug plug;
				1078
				1079	blk_start_plug(&plug);
				1080	ret = btrfs_write_marked_extents(fs_info, dirty_pages, EXTENT_DIRTY);
				1081	blk_finish_plug(&plug);
				1082	ret2 = btrfs_wait_extents(fs_info, dirty_pages);
				1083
				1084	clear_btree_io_tree(&trans->transaction->dirty_pages);
				1085
				1086	if (ret)
				1087	return ret;
				1088	else if (ret2)
				1089	return ret2;
				1090	else
				1091	return 0;
				1092	}
				1093
				1094	/*
				1095	* this is used to update the root pointer in the tree of tree roots.
				1096	*
				1097	* But, in the case of the extent allocation tree, updating the root
				1098	* pointer may allocate blocks which may change the root of the extent
				1099	* allocation tree.
				1100	*
				1101	* So, this loops and repeats and makes sure the cowonly root didn't
				1102	* change while the root pointer was being updated in the metadata.
				1103	*/
				1104	static int update_cowonly_root(struct btrfs_trans_handle *trans,
				1105	struct btrfs_root *root)
				1106	{
				1107	int ret;
				1108	u64 old_root_bytenr;
				1109	u64 old_root_used;
				1110	struct btrfs_fs_info *fs_info = root->fs_info;
				1111	struct btrfs_root *tree_root = fs_info->tree_root;
				1112
				1113	old_root_used = btrfs_root_used(&root->root_item);
				1114
				1115	while (1) {
				1116	old_root_bytenr = btrfs_root_bytenr(&root->root_item);
				1117	if (old_root_bytenr == root->node->start &&
				1118	old_root_used == btrfs_root_used(&root->root_item))
				1119	break;
				1120
				1121	btrfs_set_root_node(&root->root_item, root->node);
				1122	ret = btrfs_update_root(trans, tree_root,
				1123	&root->root_key,
				1124	&root->root_item);
				1125	if (ret)
				1126	return ret;
				1127
				1128	old_root_used = btrfs_root_used(&root->root_item);
				1129	}
				1130
				1131	return 0;
				1132	}
				1133
				1134	/*
				1135	* Update all the cowonly tree roots on disk.
				1136	*
				1137	* The error handling in this function may not be obvious. Any of the
				1138	* failures will cause the file system to go offline. We still need
				1139	* to clean up the delayed refs.
					*
					* Returns 0 on success, or the first non-zero value returned by one of
					* the helpers called below; nothing is unwound on that early return.
				1140	*/
				1141	static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
				1142	{
				1143	struct btrfs_fs_info *fs_info = trans->fs_info;
				1144	struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
				1145	struct list_head *io_bgs = &trans->transaction->io_bgs;
				1146	struct list_head *next;
				1147	struct extent_buffer *eb;
				1148	int ret;
				1149
					/* COW the root node of the tree root before anything else */
				1150	eb = btrfs_lock_root_node(fs_info->tree_root);
				1151	ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
				1152	0, &eb);
				1153	btrfs_tree_unlock(eb);
				1154	free_extent_buffer(eb);
				1155
				1156	if (ret)
				1157	return ret;
				1158
				1159	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
				1160	if (ret)
				1161	return ret;
				1162
				1163	ret = btrfs_run_dev_stats(trans, fs_info);
				1164	if (ret)
				1165	return ret;
				1166	ret = btrfs_run_dev_replace(trans, fs_info);
				1167	if (ret)
				1168	return ret;
				1169	ret = btrfs_run_qgroups(trans);
				1170	if (ret)
				1171	return ret;
				1172
				1173	ret = btrfs_setup_space_cache(trans, fs_info);
				1174	if (ret)
				1175	return ret;
				1176
				1177	/* run_qgroups might have added some more refs */
				1178	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
				1179	if (ret)
				1180	return ret;
				1181	again:
					/*
					* Drain fs_info->dirty_cowonly_roots: each root is unlinked, its
					* DIRTY bit cleared, its root item updated, and delayed refs are
					* run after every single root.
					*/
				1182	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
				1183	struct btrfs_root *root;
				1184	next = fs_info->dirty_cowonly_roots.next;
				1185	list_del_init(next);
				1186	root = list_entry(next, struct btrfs_root, dirty_list);
				1187	clear_bit(BTRFS_ROOT_DIRTY, &root->state);
				1188
					/* the extent root is queued on switch_commits at the very end */
				1189	if (root != fs_info->extent_root)
				1190	list_add_tail(&root->dirty_list,
				1191	&trans->transaction->switch_commits);
				1192	ret = update_cowonly_root(trans, root);
				1193	if (ret)
				1194	return ret;
				1195	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
				1196	if (ret)
				1197	return ret;
				1198	}
				1199
					/* keep writing block groups until both lists are empty */
				1200	while (!list_empty(dirty_bgs) \|\| !list_empty(io_bgs)) {
				1201	ret = btrfs_write_dirty_block_groups(trans, fs_info);
				1202	if (ret)
				1203	return ret;
				1204	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
				1205	if (ret)
				1206	return ret;
				1207	}
				1208
					/* the work above may have dirtied more cowonly roots; start over */
				1209	if (!list_empty(&fs_info->dirty_cowonly_roots))
				1210	goto again;
				1211
				1212	list_add_tail(&fs_info->extent_root->dirty_list,
				1213	&trans->transaction->switch_commits);
				1214	btrfs_after_dev_replace_commit(fs_info);
				1215
				1216	return 0;
				1217	}
				1218
				1219	/*
				1220	* dead roots are old snapshots that need to be deleted. This allocates
				1221	* a dirty root struct and adds it into the list of dead roots that need to
				1222	* be deleted
				1223	*/
				1224	void btrfs_add_dead_root(struct btrfs_root *root)
				1225	{
				1226	struct btrfs_fs_info *fs_info = root->fs_info;
				1227
				1228	spin_lock(&fs_info->trans_lock);
				1229	if (list_empty(&root->root_list))
				1230	list_add_tail(&root->root_list, &fs_info->dead_roots);
				1231	spin_unlock(&fs_info->trans_lock);
				1232	}
				1233
				1234	/*
				1235	* update all the cowonly tree roots on disk
				1236	*/
				1237	static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
				1238	{
				1239	struct btrfs_fs_info *fs_info = trans->fs_info;
				1240	struct btrfs_root *gang[8];
				1241	int i;
				1242	int ret;
				1243	int err = 0;
				1244
				1245	spin_lock(&fs_info->fs_roots_radix_lock);
				1246	while (1) {
				1247	ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
				1248	(void **)gang, 0,
				1249	ARRAY_SIZE(gang),
				1250	BTRFS_ROOT_TRANS_TAG);
				1251	if (ret == 0)
				1252	break;
				1253	for (i = 0; i < ret; i++) {
				1254	struct btrfs_root *root = gang[i];
				1255	radix_tree_tag_clear(&fs_info->fs_roots_radix,
				1256	(unsigned long)root->root_key.objectid,
				1257	BTRFS_ROOT_TRANS_TAG);
				1258	spin_unlock(&fs_info->fs_roots_radix_lock);
				1259
				1260	btrfs_free_log(trans, root);
				1261	btrfs_update_reloc_root(trans, root);
				1262
				1263	btrfs_save_ino_cache(root, trans);
				1264
				1265	/* see comments in should_cow_block() */
				1266	clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
				1267	smp_mb__after_atomic();
				1268
				1269	if (root->commit_root != root->node) {
				1270	list_add_tail(&root->dirty_list,
				1271	&trans->transaction->switch_commits);
				1272	btrfs_set_root_node(&root->root_item,
				1273	root->node);
				1274	}
				1275
				1276	err = btrfs_update_root(trans, fs_info->tree_root,
				1277	&root->root_key,
				1278	&root->root_item);
				1279	spin_lock(&fs_info->fs_roots_radix_lock);
				1280	if (err)
				1281	break;
				1282	btrfs_qgroup_free_meta_all_pertrans(root);
				1283	}
				1284	}
				1285	spin_unlock(&fs_info->fs_roots_radix_lock);
				1286	return err;
				1287	}
				1288
				1289	/*
				1290	* defrag a given btree.
				1291	* Every leaf in the btree is read and defragged.
				1292	*/
				1293	int btrfs_defrag_root(struct btrfs_root *root)
				1294	{
				1295	struct btrfs_fs_info *info = root->fs_info;
				1296	struct btrfs_trans_handle *trans;
				1297	int ret;
				1298
				1299	if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
				1300	return 0;
				1301
				1302	while (1) {
				1303	trans = btrfs_start_transaction(root, 0);
				1304	if (IS_ERR(trans))
				1305	return PTR_ERR(trans);
				1306
				1307	ret = btrfs_defrag_leaves(trans, root);
				1308
				1309	btrfs_end_transaction(trans);
				1310	btrfs_btree_balance_dirty(info);
				1311	cond_resched();
				1312
				1313	if (btrfs_fs_closing(info) \|\| ret != -EAGAIN)
				1314	break;
				1315
				1316	if (btrfs_defrag_cancelled(info)) {
				1317	btrfs_debug(info, "defrag_root cancelled");
				1318	ret = -EAGAIN;
				1319	break;
				1320	}
				1321	}
				1322	clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
				1323	return ret;
				1324	}
				1325
				1326	/*
				1327	* Do all the special snapshot-related qgroup dirty hacks.
				1328	*
				1329	* This does the needed qgroup inherit and the dirty hacks, such as
				1330	* switching commit roots inside one transaction and writing all btree
				1331	* blocks to disk, to make qgroups work.
					*
					* Returns 0 right away when quotas are not enabled. Otherwise returns 0
					* on success or the first error hit; tree_log_mutex is held around the
					* whole mini-commit and released on every path.
				1332	*/
				1333	static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
				1334	struct btrfs_root *src,
				1335	struct btrfs_root *parent,
				1336	struct btrfs_qgroup_inherit *inherit,
				1337	u64 dst_objectid)
				1338	{
				1339	struct btrfs_fs_info *fs_info = src->fs_info;
				1340	int ret;
				1341
				1342	/*
				1343	* Save some performance in the case that qgroups are not
				1344	* enabled. If this check races with the ioctl, rescan will
				1345	* kick in anyway.
				1346	*/
				1347	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
				1348	return 0;
				1349
				1350	/*
				1351	* Ensure dirty @src will be committed. Otherwise, after the coming
				1352	* commit_fs_roots() and switch_commit_roots(), any dirty but not
				1353	* recorded root will never be updated again, causing an outdated root
				1354	* item.
				1355	*/
				1356	record_root_in_trans(trans, src, 1);
				1357
				1358	/*
				1359	* We are going to commit transaction, see btrfs_commit_transaction()
				1360	* comment for reason locking tree_log_mutex
				1361	*/
				1362	mutex_lock(&fs_info->tree_log_mutex);
				1363
				1364	ret = commit_fs_roots(trans);
				1365	if (ret)
				1366	goto out;
				1367	ret = btrfs_qgroup_account_extents(trans);
				1368	if (ret < 0)
				1369	goto out;
				1370
				1371	/* Now qgroup are all updated, we can inherit it to new qgroups */
				1372	ret = btrfs_qgroup_inherit(trans, src->root_key.objectid, dst_objectid,
				1373	inherit);
				1374	if (ret < 0)
				1375	goto out;
				1376
				1377	/*
				1378	* Now we do a simplified commit transaction, which will:
				1379	* 1) commit all subvolume and extent tree
				1380	* To ensure all subvolume and extent tree have a valid
				1381	* commit_root to accounting later insert_dir_item()
				1382	* 2) write all btree blocks onto disk
				1383	* This is to make sure later btree modification will be cowed
				1384	* Or commit_root can be populated and cause wrong qgroup numbers
				1385	* In this simplified commit, we don't really care about other trees
				1386	* like chunk and root tree, as they won't affect qgroup.
				1387	* And we don't write super to avoid half committed status.
				1388	*/
				1389	ret = commit_cowonly_roots(trans);
				1390	if (ret)
				1391	goto out;
				1392	switch_commit_roots(trans->transaction);
				1393	ret = btrfs_write_and_wait_transaction(trans);
				1394	if (ret)
				1395	btrfs_handle_fs_error(fs_info, ret,
				1396	"Error while writing out transaction for qgroup");
				1397
				1398	out:
				1399	mutex_unlock(&fs_info->tree_log_mutex);
				1400
				1401	/*
				1402	* Force parent root to be updated, as we recorded it before so its
				1403	* last_trans == cur_transid.
				1404	* Or it won't be committed again onto disk after later
				1405	* insert_dir_item()
				1406	*/
				1407	if (!ret)
				1408	record_root_in_trans(trans, parent, 1);
				1409	return ret;
				1410	}
				1411
				1412	/*
				1413	* New snapshots need to be created at a very specific time in the
				1414	* transaction commit. This does the actual creation.
				1415	*
				1416	* Note:
				1417	* If an error happens that may affect the commit of the current
				1418	* transaction, its error number is returned. If the error only affects
				1419	* the creation of the pending snapshot, 0 is returned.
					*
					* In both cases the per-snapshot outcome is stored in pending->error.
					* pending->root_item and pending->path are always freed and reset to
					* NULL before returning.
				1420	*/
				1421	static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
				1422	struct btrfs_pending_snapshot *pending)
				1423	{
				1424
				1425	struct btrfs_fs_info *fs_info = trans->fs_info;
				1426	struct btrfs_key key;
				1427	struct btrfs_root_item *new_root_item;
				1428	struct btrfs_root *tree_root = fs_info->tree_root;
				1429	struct btrfs_root *root = pending->root;
				1430	struct btrfs_root *parent_root;
				1431	struct btrfs_block_rsv *rsv;
				1432	struct inode *parent_inode;
				1433	struct btrfs_path *path;
				1434	struct btrfs_dir_item *dir_item;
				1435	struct dentry *dentry;
				1436	struct extent_buffer *tmp;
				1437	struct extent_buffer *old;
				1438	struct timespec64 cur_time;
				1439	int ret = 0;
				1440	u64 to_reserve = 0;
				1441	u64 index = 0;
				1442	u64 objectid;
				1443	u64 root_flags;
				1444	uuid_le new_uuid;
				1445
				1446	ASSERT(pending->path);
				1447	path = pending->path;
				1448
				1449	ASSERT(pending->root_item);
				1450	new_root_item = pending->root_item;
				1451
					/* snapshot-only failure: ret stays 0, only pending->error is set */
				1452	pending->error = btrfs_find_free_objectid(tree_root, &objectid);
				1453	if (pending->error)
				1454	goto no_free_objectid;
				1455
				1456	/*
				1457	* Make qgroup to skip current new snapshot's qgroupid, as it is
				1458	* accounted by later btrfs_qgroup_inherit().
				1459	*/
				1460	btrfs_set_skip_qgroup(trans, objectid);
				1461
				1462	btrfs_reloc_pre_snapshot(pending, &to_reserve);
				1463
				1464	if (to_reserve > 0) {
				1465	pending->error = btrfs_block_rsv_add(root,
				1466	&pending->block_rsv,
				1467	to_reserve,
				1468	BTRFS_RESERVE_NO_FLUSH);
				1469	if (pending->error)
				1470	goto clear_skip_qgroup;
				1471	}
				1472
				1473	key.objectid = objectid;
				1474	key.offset = (u64)-1;
				1475	key.type = BTRFS_ROOT_ITEM_KEY;
				1476
					/*
					* Switch the handle to the snapshot's own block reservation; the
					* saved rsv is put back at the dir_item_existed label.
					*/
				1477	rsv = trans->block_rsv;
				1478	trans->block_rsv = &pending->block_rsv;
				1479	trans->bytes_reserved = trans->block_rsv->reserved;
				1480	trace_btrfs_space_reservation(fs_info, "transaction",
				1481	trans->transid,
				1482	trans->bytes_reserved, 1);
				1483	dentry = pending->dentry;
				1484	parent_inode = pending->dir;
				1485	parent_root = BTRFS_I(parent_inode)->root;
				1486	record_root_in_trans(trans, parent_root, 0);
				1487
				1488	cur_time = current_time(parent_inode);
				1489
				1490	/*
				1491	* insert the directory item
				1492	*/
				1493	ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index);
				1494	BUG_ON(ret); /* -ENOMEM */
				1495
				1496	/* check if there is a file/dir which has the same name. */
				1497	dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
				1498	btrfs_ino(BTRFS_I(parent_inode)),
				1499	dentry->d_name.name,
				1500	dentry->d_name.len, 0);
				1501	if (dir_item != NULL && !IS_ERR(dir_item)) {
				1502	pending->error = -EEXIST;
				1503	goto dir_item_existed;
				1504	} else if (IS_ERR(dir_item)) {
				1505	ret = PTR_ERR(dir_item);
				1506	btrfs_abort_transaction(trans, ret);
				1507	goto fail;
				1508	}
				1509	btrfs_release_path(path);
				1510
				1511	/*
				1512	* pull in the delayed directory update
				1513	* and the delayed inode item
				1514	* otherwise we corrupt the FS during
				1515	* snapshot
				1516	*/
				1517	ret = btrfs_run_delayed_items(trans);
				1518	if (ret) { /* Transaction aborted */
				1519	btrfs_abort_transaction(trans, ret);
				1520	goto fail;
				1521	}
				1522
					/* build the new root item as a copy of the source root's item */
				1523	record_root_in_trans(trans, root, 0);
				1524	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
				1525	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
				1526	btrfs_check_and_init_root_item(new_root_item);
				1527
				1528	root_flags = btrfs_root_flags(new_root_item);
				1529	if (pending->readonly)
				1530	root_flags \|= BTRFS_ROOT_SUBVOL_RDONLY;
				1531	else
				1532	root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
				1533	btrfs_set_root_flags(new_root_item, root_flags);
				1534
				1535	btrfs_set_root_generation_v2(new_root_item,
				1536	trans->transid);
				1537	uuid_le_gen(&new_uuid);
				1538	memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
				1539	memcpy(new_root_item->parent_uuid, root->root_item.uuid,
				1540	BTRFS_UUID_SIZE);
					/* a writable snapshot starts with its received/send fields zeroed */
				1541	if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) {
				1542	memset(new_root_item->received_uuid, 0,
				1543	sizeof(new_root_item->received_uuid));
				1544	memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
				1545	memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
				1546	btrfs_set_root_stransid(new_root_item, 0);
				1547	btrfs_set_root_rtransid(new_root_item, 0);
				1548	}
				1549	btrfs_set_stack_timespec_sec(&new_root_item->otime, cur_time.tv_sec);
				1550	btrfs_set_stack_timespec_nsec(&new_root_item->otime, cur_time.tv_nsec);
				1551	btrfs_set_root_otransid(new_root_item, trans->transid);
				1552
				1553	old = btrfs_lock_root_node(root);
				1554	ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
				1555	if (ret) {
				1556	btrfs_tree_unlock(old);
				1557	free_extent_buffer(old);
				1558	btrfs_abort_transaction(trans, ret);
				1559	goto fail;
				1560	}
				1561
				1562	btrfs_set_lock_blocking(old);
				1563
				1564	ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
				1565	/* clean up in any case */
				1566	btrfs_tree_unlock(old);
				1567	free_extent_buffer(old);
				1568	if (ret) {
				1569	btrfs_abort_transaction(trans, ret);
				1570	goto fail;
				1571	}
				1572	/* see comments in should_cow_block() */
				1573	set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
				1574	smp_wmb();
				1575
				1576	btrfs_set_root_node(new_root_item, tmp);
				1577	/* record when the snapshot was created in key.offset */
				1578	key.offset = trans->transid;
				1579	ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
				1580	btrfs_tree_unlock(tmp);
				1581	free_extent_buffer(tmp);
				1582	if (ret) {
				1583	btrfs_abort_transaction(trans, ret);
				1584	goto fail;
				1585	}
				1586
				1587	/*
				1588	* insert root back/forward references
				1589	*/
				1590	ret = btrfs_add_root_ref(trans, objectid,
				1591	parent_root->root_key.objectid,
				1592	btrfs_ino(BTRFS_I(parent_inode)), index,
				1593	dentry->d_name.name, dentry->d_name.len);
				1594	if (ret) {
				1595	btrfs_abort_transaction(trans, ret);
				1596	goto fail;
				1597	}
				1598
				1599	key.offset = (u64)-1;
				1600	pending->snap = btrfs_read_fs_root_no_name(fs_info, &key);
				1601	if (IS_ERR(pending->snap)) {
				1602	ret = PTR_ERR(pending->snap);
				1603	btrfs_abort_transaction(trans, ret);
				1604	goto fail;
				1605	}
				1606
				1607	ret = btrfs_reloc_post_snapshot(trans, pending);
				1608	if (ret) {
				1609	btrfs_abort_transaction(trans, ret);
				1610	goto fail;
				1611	}
				1612
				1613	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
				1614	if (ret) {
				1615	btrfs_abort_transaction(trans, ret);
				1616	goto fail;
				1617	}
				1618
				1619	/*
				1620	* Do special qgroup accounting for snapshot, as we do some qgroup
				1621	* snapshot hack to do fast snapshot.
				1622	* To co-operate with that hack, we do hack again.
				1623	* Or snapshot will be greatly slowed down by a subtree qgroup rescan
				1624	*/
				1625	ret = qgroup_account_snapshot(trans, root, parent_root,
				1626	pending->inherit, objectid);
				1627	if (ret < 0)
				1628	goto fail;
				1629
				1630	ret = btrfs_insert_dir_item(trans, parent_root,
				1631	dentry->d_name.name, dentry->d_name.len,
				1632	BTRFS_I(parent_inode), &key,
				1633	BTRFS_FT_DIR, index);
				1634	/* We checked the name at the beginning, so this is impossible. */
				1635	BUG_ON(ret == -EEXIST \|\| ret == -EOVERFLOW);
				1636	if (ret) {
				1637	btrfs_abort_transaction(trans, ret);
				1638	goto fail;
				1639	}
				1640
				1641	btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size +
				1642	dentry->d_name.len * 2);
				1643	parent_inode->i_mtime = parent_inode->i_ctime =
				1644	current_time(parent_inode);
				1645	ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode);
				1646	if (ret) {
				1647	btrfs_abort_transaction(trans, ret);
				1648	goto fail;
				1649	}
				1650	ret = btrfs_uuid_tree_add(trans, new_uuid.b, BTRFS_UUID_KEY_SUBVOL,
				1651	objectid);
				1652	if (ret) {
				1653	btrfs_abort_transaction(trans, ret);
				1654	goto fail;
				1655	}
				1656	if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) {
				1657	ret = btrfs_uuid_tree_add(trans, new_root_item->received_uuid,
				1658	BTRFS_UUID_KEY_RECEIVED_SUBVOL,
				1659	objectid);
				1660	if (ret && ret != -EEXIST) {
				1661	btrfs_abort_transaction(trans, ret);
				1662	goto fail;
				1663	}
				1664	}
				1665
				1666	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
				1667	if (ret) {
				1668	btrfs_abort_transaction(trans, ret);
				1669	goto fail;
				1670	}
				1671
					/* the success path also falls through here, with ret as last set */
				1672	fail:
				1673	pending->error = ret;
				1674	dir_item_existed:
				1675	trans->block_rsv = rsv;
				1676	trans->bytes_reserved = 0;
				1677	clear_skip_qgroup:
				1678	btrfs_clear_skip_qgroup(trans);
				1679	no_free_objectid:
					/* the root item and path handed in via @pending are always released */
				1680	kfree(new_root_item);
				1681	pending->root_item = NULL;
				1682	btrfs_free_path(path);
				1683	pending->path = NULL;
				1684
				1685	return ret;
				1686	}
				1687
				1688	/*
				1689	* create all the snapshots we've scheduled for creation
				1690	*/
				1691	static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans)
				1692	{
				1693	struct btrfs_pending_snapshot pending, next;
				1694	struct list_head *head = &trans->transaction->pending_snapshots;
				1695	int ret = 0;
				1696
				1697	list_for_each_entry_safe(pending, next, head, list) {
				1698	list_del(&pending->list);
				1699	ret = create_pending_snapshot(trans, pending);
				1700	if (ret)
				1701	break;
				1702	}
				1703	return ret;
				1704	}
				1705
				1706	static void update_super_roots(struct btrfs_fs_info *fs_info)
				1707	{
				1708	struct btrfs_root_item *root_item;
				1709	struct btrfs_super_block *super;
				1710
				1711	super = fs_info->super_copy;
				1712
				1713	root_item = &fs_info->chunk_root->root_item;
				1714	super->chunk_root = root_item->bytenr;
				1715	super->chunk_root_generation = root_item->generation;
				1716	super->chunk_root_level = root_item->level;
				1717
				1718	root_item = &fs_info->tree_root->root_item;
				1719	super->root = root_item->bytenr;
				1720	super->generation = root_item->generation;
				1721	super->root_level = root_item->level;
				1722	if (btrfs_test_opt(fs_info, SPACE_CACHE))
				1723	super->cache_generation = root_item->generation;
				1724	if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
				1725	super->uuid_tree_generation = root_item->generation;
				1726	}
				1727
				1728	int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
				1729	{
				1730	struct btrfs_transaction *trans;
				1731	int ret = 0;
				1732
				1733	spin_lock(&info->trans_lock);
				1734	trans = info->running_transaction;
				1735	if (trans)
				1736	ret = (trans->state >= TRANS_STATE_COMMIT_START);
				1737	spin_unlock(&info->trans_lock);
				1738	return ret;
				1739	}
				1740
				1741	int btrfs_transaction_blocked(struct btrfs_fs_info *info)
				1742	{
				1743	struct btrfs_transaction *trans;
				1744	int ret = 0;
				1745
				1746	spin_lock(&info->trans_lock);
				1747	trans = info->running_transaction;
				1748	if (trans)
				1749	ret = is_transaction_blocked(trans);
				1750	spin_unlock(&info->trans_lock);
				1751	return ret;
				1752	}
				1753
				1754	/*
				1755	* Wait for the current transaction commit to start and block subsequent
				1756	* transaction joins.
					*
					* Sleeps on fs_info->transaction_blocked_wait until @trans has reached at
					* least TRANS_STATE_COMMIT_START or has been aborted.
				1757	*/
				1758	static void wait_current_trans_commit_start(struct btrfs_fs_info *fs_info,
				1759	struct btrfs_transaction *trans)
				1760	{
				1761	wait_event(fs_info->transaction_blocked_wait,
				1762	trans->state >= TRANS_STATE_COMMIT_START \|\| trans->aborted);
				1763	}
				1764
				1765	/*
				1766	* Wait for the commit of the current transaction to start and then for
				1767	* the transaction to become unblocked. Caller holds a ref on @trans.
					*
					* Sleeps on fs_info->transaction_wait until @trans has reached at least
					* TRANS_STATE_UNBLOCKED or has been aborted.
				1768	*/
				1769	static void wait_current_trans_commit_start_and_unblock(
				1770	struct btrfs_fs_info *fs_info,
				1771	struct btrfs_transaction *trans)
				1772	{
				1773	wait_event(fs_info->transaction_wait,
				1774	trans->state >= TRANS_STATE_UNBLOCKED \|\| trans->aborted);
				1775	}
				1776
				1777	/*
				1778	* Commit transactions asynchronously. Once btrfs_commit_transaction_async
				1779	* returns, any subsequent transaction will not be allowed to join.
					*
					* This structure carries one such request from
					* btrfs_commit_transaction_async() to the do_async_commit() work
					* function, which frees it when the commit call returns.
				1780	*/
				1781	struct btrfs_async_commit {
					/* handle joined by the requester and committed by the worker */
				1782	struct btrfs_trans_handle *newtrans;
					/* work item that runs do_async_commit() */
				1783	struct work_struct work;
				1784	};
				1785
				1786	static void do_async_commit(struct work_struct *work)
				1787	{
				1788	struct btrfs_async_commit *ac =
				1789	container_of(work, struct btrfs_async_commit, work);
				1790
				1791	/*
				1792	* We've got freeze protection passed with the transaction.
				1793	* Tell lockdep about it.
				1794	*/
				1795	if (ac->newtrans->type & __TRANS_FREEZABLE)
				1796	__sb_writers_acquired(ac->newtrans->fs_info->sb, SB_FREEZE_FS);
				1797
				1798	current->journal_info = ac->newtrans;
				1799
				1800	btrfs_commit_transaction(ac->newtrans);
				1801	kfree(ac);
				1802	}
				1803
				1804	int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
				1805	int wait_for_unblock)
				1806	{
					/*
					* Hand the commit of @trans's transaction to a work item and wait
					* only until that commit has started (or, when @wait_for_unblock is
					* non-zero, until the transaction is unblocked). @trans is ended
					* here. Returns 0, -ENOMEM if the request cannot be allocated, or
					* the error from btrfs_join_transaction().
					*/
				1807	struct btrfs_fs_info *fs_info = trans->fs_info;
				1808	struct btrfs_async_commit *ac;
				1809	struct btrfs_transaction *cur_trans;
				1810
				1811	ac = kmalloc(sizeof(*ac), GFP_NOFS);
				1812	if (!ac)
				1813	return -ENOMEM;
				1814
				1815	INIT_WORK(&ac->work, do_async_commit);
					/* the worker commits through its own handle, joined here */
				1816	ac->newtrans = btrfs_join_transaction(trans->root);
				1817	if (IS_ERR(ac->newtrans)) {
				1818	int err = PTR_ERR(ac->newtrans);
				1819	kfree(ac);
				1820	return err;
				1821	}
				1822
				1823	/* take transaction reference */
				1824	cur_trans = trans->transaction;
				1825	refcount_inc(&cur_trans->use_count);
				1826
				1827	btrfs_end_transaction(trans);
				1828
				1829	/*
				1830	* Tell lockdep we've released the freeze rwsem, since the
				1831	* async commit thread will be the one to unlock it.
				1832	*/
				1833	if (ac->newtrans->type & __TRANS_FREEZABLE)
				1834	__sb_writers_release(fs_info->sb, SB_FREEZE_FS);
				1835
				1836	schedule_work(&ac->work);
				1837
				1838	/* wait for transaction to start and unblock */
				1839	if (wait_for_unblock)
				1840	wait_current_trans_commit_start_and_unblock(fs_info, cur_trans);
				1841	else
				1842	wait_current_trans_commit_start(fs_info, cur_trans);
				1843
					/* only the pointer value of the already ended handle is compared */
				1844	if (current->journal_info == trans)
				1845	current->journal_info = NULL;
				1846
					/* drop the reference taken above */
				1847	btrfs_put_transaction(cur_trans);
				1848	return 0;
				1849	}
				1850
				1851
				1852	static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
				1853	{
				1854	struct btrfs_fs_info *fs_info = trans->fs_info;
				1855	struct btrfs_transaction *cur_trans = trans->transaction;
				1856	DEFINE_WAIT(wait);
				1857
				1858	WARN_ON(refcount_read(&trans->use_count) > 1);
				1859
				1860	btrfs_abort_transaction(trans, err);
				1861
				1862	spin_lock(&fs_info->trans_lock);
				1863
				1864	/*
				1865	* If the transaction is removed from the list, it means this
				1866	* transaction has been committed successfully, so it is impossible
				1867	* to call the cleanup function.
				1868	*/
				1869	BUG_ON(list_empty(&cur_trans->list));
				1870
				1871	list_del_init(&cur_trans->list);
				1872	if (cur_trans == fs_info->running_transaction) {
				1873	cur_trans->state = TRANS_STATE_COMMIT_DOING;
				1874	spin_unlock(&fs_info->trans_lock);
				1875	wait_event(cur_trans->writer_wait,
				1876	atomic_read(&cur_trans->num_writers) == 1);
				1877
				1878	spin_lock(&fs_info->trans_lock);
				1879	}
				1880	spin_unlock(&fs_info->trans_lock);
				1881
				1882	btrfs_cleanup_one_transaction(trans->transaction, fs_info);
				1883
				1884	spin_lock(&fs_info->trans_lock);
				1885	if (cur_trans == fs_info->running_transaction)
				1886	fs_info->running_transaction = NULL;
				1887	spin_unlock(&fs_info->trans_lock);
				1888
				1889	if (trans->type & __TRANS_FREEZABLE)
				1890	sb_end_intwrite(fs_info->sb);
				1891	btrfs_put_transaction(cur_trans);
				1892	btrfs_put_transaction(cur_trans);
				1893
				1894	trace_btrfs_transaction_commit(trans->root);
				1895
				1896	if (current->journal_info == trans)
				1897	current->journal_info = NULL;
				1898	btrfs_scrub_cancel(fs_info);
				1899
				1900	kmem_cache_free(btrfs_trans_handle_cachep, trans);
				1901	}
				1902
				1903	static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
				1904	{
				1905	/*
				1906	* We use writeback_inodes_sb here because if we used
				1907	* btrfs_start_delalloc_roots we would deadlock with fs freeze.
				1908	* Currently are holding the fs freeze lock, if we do an async flush
				1909	* we'll do btrfs_join_transaction() and deadlock because we need to
				1910	* wait for the fs freeze lock. Using the direct flushing we benefit
				1911	* from already being in a transaction and our join_transaction doesn't
				1912	* have to re-take the fs freeze lock.
				1913	*/
				1914	if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
				1915	writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
				1916	return 0;
				1917	}
				1918
				1919	static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
				1920	{
				1921	if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
				1922	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
				1923	}
				1924
				1925	static inline void
				1926	btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans)
				1927	{
				1928	wait_event(cur_trans->pending_wait,
				1929	atomic_read(&cur_trans->pending_ordered) == 0);
				1930	}
				1931
				1932	int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
				1933	{
				1934	struct btrfs_fs_info *fs_info = trans->fs_info;
				1935	struct btrfs_transaction *cur_trans = trans->transaction;
				1936	struct btrfs_transaction *prev_trans = NULL;
				1937	int ret;
				1938
				1939	/* Stop the commit early if ->aborted is set */
				1940	if (unlikely(READ_ONCE(cur_trans->aborted))) {
				1941	ret = cur_trans->aborted;
				1942	btrfs_end_transaction(trans);
				1943	return ret;
				1944	}
				1945
				1946	btrfs_trans_release_metadata(trans);
				1947	trans->block_rsv = NULL;
				1948
				1949	/* make a pass through all the delayed refs we have so far
				1950	* any runnings procs may add more while we are here
				1951	*/
				1952	ret = btrfs_run_delayed_refs(trans, 0);
				1953	if (ret) {
				1954	btrfs_end_transaction(trans);
				1955	return ret;
				1956	}
				1957
				1958	cur_trans = trans->transaction;
				1959
				1960	/*
				1961	* set the flushing flag so procs in this transaction have to
				1962	* start sending their work down.
				1963	*/
				1964	cur_trans->delayed_refs.flushing = 1;
				1965	smp_wmb();
				1966
				1967	if (!list_empty(&trans->new_bgs))
				1968	btrfs_create_pending_block_groups(trans);
				1969
				1970	ret = btrfs_run_delayed_refs(trans, 0);
				1971	if (ret) {
				1972	btrfs_end_transaction(trans);
				1973	return ret;
				1974	}
				1975
				1976	if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
				1977	int run_it = 0;
				1978
				1979	/* this mutex is also taken before trying to set
				1980	* block groups readonly. We need to make sure
				1981	* that nobody has set a block group readonly
				1982	* after a extents from that block group have been
				1983	* allocated for cache files. btrfs_set_block_group_ro
				1984	* will wait for the transaction to commit if it
				1985	* finds BTRFS_TRANS_DIRTY_BG_RUN set.
				1986	*
				1987	* The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure
				1988	* only one process starts all the block group IO. It wouldn't
				1989	* hurt to have more than one go through, but there's no
				1990	* real advantage to it either.
				1991	*/
				1992	mutex_lock(&fs_info->ro_block_group_mutex);
				1993	if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN,
				1994	&cur_trans->flags))
				1995	run_it = 1;
				1996	mutex_unlock(&fs_info->ro_block_group_mutex);
				1997
				1998	if (run_it) {
				1999	ret = btrfs_start_dirty_block_groups(trans);
				2000	if (ret) {
				2001	btrfs_end_transaction(trans);
				2002	return ret;
				2003	}
				2004	}
				2005	}
				2006
				2007	spin_lock(&fs_info->trans_lock);
				2008	if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
				2009	spin_unlock(&fs_info->trans_lock);
				2010	refcount_inc(&cur_trans->use_count);
				2011	ret = btrfs_end_transaction(trans);
				2012
				2013	wait_for_commit(cur_trans);
				2014
				2015	if (unlikely(cur_trans->aborted))
				2016	ret = cur_trans->aborted;
				2017
				2018	btrfs_put_transaction(cur_trans);
				2019
				2020	return ret;
				2021	}
				2022
				2023	cur_trans->state = TRANS_STATE_COMMIT_START;
				2024	wake_up(&fs_info->transaction_blocked_wait);
				2025
				2026	if (cur_trans->list.prev != &fs_info->trans_list) {
				2027	prev_trans = list_entry(cur_trans->list.prev,
				2028	struct btrfs_transaction, list);
				2029	if (prev_trans->state != TRANS_STATE_COMPLETED) {
				2030	refcount_inc(&prev_trans->use_count);
				2031	spin_unlock(&fs_info->trans_lock);
				2032
				2033	wait_for_commit(prev_trans);
				2034	ret = prev_trans->aborted;
				2035
				2036	btrfs_put_transaction(prev_trans);
				2037	if (ret)
				2038	goto cleanup_transaction;
				2039	} else {
				2040	spin_unlock(&fs_info->trans_lock);
				2041	}
				2042	} else {
				2043	spin_unlock(&fs_info->trans_lock);
				2044	/*
				2045	* The previous transaction was aborted and was already removed
				2046	* from the list of transactions at fs_info->trans_list. So we
				2047	* abort to prevent writing a new superblock that reflects a
				2048	* corrupt state (pointing to trees with unwritten nodes/leafs).
				2049	*/
				2050	if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) {
				2051	ret = -EROFS;
				2052	goto cleanup_transaction;
				2053	}
				2054	}
				2055
				2056	extwriter_counter_dec(cur_trans, trans->type);
				2057
				2058	ret = btrfs_start_delalloc_flush(fs_info);
				2059	if (ret)
				2060	goto cleanup_transaction;
				2061
				2062	ret = btrfs_run_delayed_items(trans);
				2063	if (ret)
				2064	goto cleanup_transaction;
				2065
				2066	wait_event(cur_trans->writer_wait,
				2067	extwriter_counter_read(cur_trans) == 0);
				2068
				2069	/* some pending stuffs might be added after the previous flush. */
				2070	ret = btrfs_run_delayed_items(trans);
				2071	if (ret)
				2072	goto cleanup_transaction;
				2073
				2074	btrfs_wait_delalloc_flush(fs_info);
				2075
				2076	btrfs_wait_pending_ordered(cur_trans);
				2077
				2078	btrfs_scrub_pause(fs_info);
				2079	/*
				2080	* Ok now we need to make sure to block out any other joins while we
				2081	* commit the transaction. We could have started a join before setting
				2082	* COMMIT_DOING so make sure to wait for num_writers to == 1 again.
				2083	*/
				2084	spin_lock(&fs_info->trans_lock);
				2085	cur_trans->state = TRANS_STATE_COMMIT_DOING;
				2086	spin_unlock(&fs_info->trans_lock);
				2087	wait_event(cur_trans->writer_wait,
				2088	atomic_read(&cur_trans->num_writers) == 1);
				2089
				2090	/* ->aborted might be set after the previous check, so check it */
				2091	if (unlikely(READ_ONCE(cur_trans->aborted))) {
				2092	ret = cur_trans->aborted;
				2093	goto scrub_continue;
				2094	}
				2095	/*
				2096	* the reloc mutex makes sure that we stop
				2097	* the balancing code from coming in and moving
				2098	* extents around in the middle of the commit
				2099	*/
				2100	mutex_lock(&fs_info->reloc_mutex);
				2101
				2102	/*
				2103	* We needn't worry about the delayed items because we will
				2104	* deal with them in create_pending_snapshot(), which is the
				2105	* core function of the snapshot creation.
				2106	*/
				2107	ret = create_pending_snapshots(trans);
				2108	if (ret) {
				2109	mutex_unlock(&fs_info->reloc_mutex);
				2110	goto scrub_continue;
				2111	}
				2112
				2113	/*
				2114	* We insert the dir indexes of the snapshots and update the inode
				2115	* of the snapshots' parents after the snapshot creation, so there
				2116	* are some delayed items which are not dealt with. Now deal with
				2117	* them.
				2118	*
				2119	* We needn't worry that this operation will corrupt the snapshots,
				2120	* because all the tree which are snapshoted will be forced to COW
				2121	* the nodes and leaves.
				2122	*/
				2123	ret = btrfs_run_delayed_items(trans);
				2124	if (ret) {
				2125	mutex_unlock(&fs_info->reloc_mutex);
				2126	goto scrub_continue;
				2127	}
				2128
				2129	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
				2130	if (ret) {
				2131	mutex_unlock(&fs_info->reloc_mutex);
				2132	goto scrub_continue;
				2133	}
				2134
				2135	/*
				2136	* make sure none of the code above managed to slip in a
				2137	* delayed item
				2138	*/
				2139	btrfs_assert_delayed_root_empty(fs_info);
				2140
				2141	WARN_ON(cur_trans != trans->transaction);
				2142
				2143	/* btrfs_commit_tree_roots is responsible for getting the
				2144	* various roots consistent with each other. Every pointer
				2145	* in the tree of tree roots has to point to the most up to date
				2146	* root for every subvolume and other tree. So, we have to keep
				2147	* the tree logging code from jumping in and changing any
				2148	* of the trees.
				2149	*
				2150	* At this point in the commit, there can't be any tree-log
				2151	* writers, but a little lower down we drop the trans mutex
				2152	* and let new people in. By holding the tree_log_mutex
				2153	* from now until after the super is written, we avoid races
				2154	* with the tree-log code.
				2155	*/
				2156	mutex_lock(&fs_info->tree_log_mutex);
				2157
				2158	ret = commit_fs_roots(trans);
				2159	if (ret) {
				2160	mutex_unlock(&fs_info->tree_log_mutex);
				2161	mutex_unlock(&fs_info->reloc_mutex);
				2162	goto scrub_continue;
				2163	}
				2164
				2165	/*
				2166	* Since the transaction is done, we can apply the pending changes
				2167	* before the next transaction.
				2168	*/
				2169	btrfs_apply_pending_changes(fs_info);
				2170
				2171	/* commit_fs_roots gets rid of all the tree log roots, it is now
				2172	* safe to free the root of tree log roots
				2173	*/
				2174	btrfs_free_log_root_tree(trans, fs_info);
				2175
				2176	/*
				2177	* commit_fs_roots() can call btrfs_save_ino_cache(), which generates
				2178	* new delayed refs. Must handle them or qgroup can be wrong.
				2179	*/
				2180	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
				2181	if (ret) {
				2182	mutex_unlock(&fs_info->tree_log_mutex);
				2183	mutex_unlock(&fs_info->reloc_mutex);
				2184	goto scrub_continue;
				2185	}
				2186
				2187	/*
				2188	* Since fs roots are all committed, we can get a quite accurate
				2189	* new_roots. So let's do quota accounting.
				2190	*/
				2191	ret = btrfs_qgroup_account_extents(trans);
				2192	if (ret < 0) {
				2193	mutex_unlock(&fs_info->tree_log_mutex);
				2194	mutex_unlock(&fs_info->reloc_mutex);
				2195	goto scrub_continue;
				2196	}
				2197
				2198	ret = commit_cowonly_roots(trans);
				2199	if (ret) {
				2200	mutex_unlock(&fs_info->tree_log_mutex);
				2201	mutex_unlock(&fs_info->reloc_mutex);
				2202	goto scrub_continue;
				2203	}
				2204
				2205	/*
				2206	* The tasks which save the space cache and inode cache may also
				2207	* update ->aborted, check it.
				2208	*/
				2209	if (unlikely(READ_ONCE(cur_trans->aborted))) {
				2210	ret = cur_trans->aborted;
				2211	mutex_unlock(&fs_info->tree_log_mutex);
				2212	mutex_unlock(&fs_info->reloc_mutex);
				2213	goto scrub_continue;
				2214	}
				2215
				2216	btrfs_prepare_extent_commit(fs_info);
				2217
				2218	cur_trans = fs_info->running_transaction;
				2219
				2220	btrfs_set_root_node(&fs_info->tree_root->root_item,
				2221	fs_info->tree_root->node);
				2222	list_add_tail(&fs_info->tree_root->dirty_list,
				2223	&cur_trans->switch_commits);
				2224
				2225	btrfs_set_root_node(&fs_info->chunk_root->root_item,
				2226	fs_info->chunk_root->node);
				2227	list_add_tail(&fs_info->chunk_root->dirty_list,
				2228	&cur_trans->switch_commits);
				2229
				2230	switch_commit_roots(cur_trans);
				2231
				2232	ASSERT(list_empty(&cur_trans->dirty_bgs));
				2233	ASSERT(list_empty(&cur_trans->io_bgs));
				2234	update_super_roots(fs_info);
				2235
				2236	btrfs_set_super_log_root(fs_info->super_copy, 0);
				2237	btrfs_set_super_log_root_level(fs_info->super_copy, 0);
				2238	memcpy(fs_info->super_for_commit, fs_info->super_copy,
				2239	sizeof(*fs_info->super_copy));
				2240
				2241	btrfs_update_commit_device_size(fs_info);
				2242	btrfs_update_commit_device_bytes_used(cur_trans);
				2243
				2244	clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
				2245	clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
				2246
				2247	btrfs_trans_release_chunk_metadata(trans);
				2248
				2249	spin_lock(&fs_info->trans_lock);
				2250	cur_trans->state = TRANS_STATE_UNBLOCKED;
				2251	fs_info->running_transaction = NULL;
				2252	spin_unlock(&fs_info->trans_lock);
				2253	mutex_unlock(&fs_info->reloc_mutex);
				2254
				2255	wake_up(&fs_info->transaction_wait);
				2256
				2257	ret = btrfs_write_and_wait_transaction(trans);
				2258	if (ret) {
				2259	btrfs_handle_fs_error(fs_info, ret,
				2260	"Error while writing out transaction");
				2261	mutex_unlock(&fs_info->tree_log_mutex);
				2262	goto scrub_continue;
				2263	}
				2264
				2265	ret = write_all_supers(fs_info, 0);
				2266	/*
				2267	* the super is written, we can safely allow the tree-loggers
				2268	* to go about their business
				2269	*/
				2270	mutex_unlock(&fs_info->tree_log_mutex);
				2271	if (ret)
				2272	goto scrub_continue;
				2273
				2274	btrfs_finish_extent_commit(trans);
				2275
				2276	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
				2277	btrfs_clear_space_info_full(fs_info);
				2278
				2279	fs_info->last_trans_committed = cur_trans->transid;
				2280	/*
				2281	* We needn't acquire the lock here because there is no other task
				2282	* which can change it.
				2283	*/
				2284	cur_trans->state = TRANS_STATE_COMPLETED;
				2285	wake_up(&cur_trans->commit_wait);
				2286	clear_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags);
				2287
				2288	spin_lock(&fs_info->trans_lock);
				2289	list_del_init(&cur_trans->list);
				2290	spin_unlock(&fs_info->trans_lock);
				2291
				2292	btrfs_put_transaction(cur_trans);
				2293	btrfs_put_transaction(cur_trans);
				2294
				2295	if (trans->type & __TRANS_FREEZABLE)
				2296	sb_end_intwrite(fs_info->sb);
				2297
				2298	trace_btrfs_transaction_commit(trans->root);
				2299
				2300	btrfs_scrub_continue(fs_info);
				2301
				2302	if (current->journal_info == trans)
				2303	current->journal_info = NULL;
				2304
				2305	kmem_cache_free(btrfs_trans_handle_cachep, trans);
				2306
				2307	return ret;
				2308
				2309	scrub_continue:
				2310	btrfs_scrub_continue(fs_info);
				2311	cleanup_transaction:
				2312	btrfs_trans_release_metadata(trans);
				2313	btrfs_trans_release_chunk_metadata(trans);
				2314	trans->block_rsv = NULL;
				2315	btrfs_warn(fs_info, "Skipping commit of aborted transaction.");
				2316	if (current->journal_info == trans)
				2317	current->journal_info = NULL;
				2318	cleanup_transaction(trans, ret);
				2319
				2320	return ret;
				2321	}
				2322
				2323	/*
				2324	* return < 0 if error
				2325	* 0 if there are no more dead_roots at the time of call
				2326	* 1 there are more to be processed, call me again
				2327	*
				2328	* The return value indicates there are certainly more snapshots to delete, but
				2329	* if there comes a new one during processing, it may return 0. We don't mind,
				2330	* because btrfs_commit_super will poke cleaner thread and it will process it a
				2331	* few seconds later.
				2332	*/
				2333	int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
				2334	{
				2335	int ret;
				2336	struct btrfs_fs_info *fs_info = root->fs_info;
				2337
				2338	spin_lock(&fs_info->trans_lock);
				2339	if (list_empty(&fs_info->dead_roots)) {
				2340	spin_unlock(&fs_info->trans_lock);
				2341	return 0;
				2342	}
				2343	root = list_first_entry(&fs_info->dead_roots,
				2344	struct btrfs_root, root_list);
				2345	list_del_init(&root->root_list);
				2346	spin_unlock(&fs_info->trans_lock);
				2347
				2348	btrfs_debug(fs_info, "cleaner removing %llu", root->objectid);
				2349
				2350	btrfs_kill_all_delayed_nodes(root);
				2351
				2352	if (btrfs_header_backref_rev(root->node) <
				2353	BTRFS_MIXED_BACKREF_REV)
				2354	ret = btrfs_drop_snapshot(root, NULL, 0, 0);
				2355	else
				2356	ret = btrfs_drop_snapshot(root, NULL, 1, 0);
				2357
				2358	return (ret < 0) ? 0 : 1;
				2359	}
				2360
				2361	void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
				2362	{
				2363	unsigned long prev;
				2364	unsigned long bit;
				2365
				2366	prev = xchg(&fs_info->pending_changes, 0);
				2367	if (!prev)
				2368	return;
				2369
				2370	bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE;
				2371	if (prev & bit)
				2372	btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE);
				2373	prev &= ~bit;
				2374
				2375	bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE;
				2376	if (prev & bit)
				2377	btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE);
				2378	prev &= ~bit;
				2379
				2380	bit = 1 << BTRFS_PENDING_COMMIT;
				2381	if (prev & bit)
				2382	btrfs_debug(fs_info, "pending commit done");
				2383	prev &= ~bit;
				2384
				2385	if (prev)
				2386	btrfs_warn(fs_info,
				2387	"unknown pending changes left 0x%lx, ignoring", prev);
				2388	}