Blame - marvell/linux/fs/btrfs/extent-tree.c - T108

blob: 7e5ac187463ed33e02ca693b4be0d919637e2e90 [file] [log] [blame]

b.liu	e958203	2025-04-17 19:18:16 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Copyright (C) 2007 Oracle. All rights reserved.
				4	*/
				5
				6	#include <linux/sched.h>
				7	#include <linux/sched/signal.h>
				8	#include <linux/pagemap.h>
				9	#include <linux/writeback.h>
				10	#include <linux/blkdev.h>
				11	#include <linux/sort.h>
				12	#include <linux/rcupdate.h>
				13	#include <linux/kthread.h>
				14	#include <linux/slab.h>
				15	#include <linux/ratelimit.h>
				16	#include <linux/percpu_counter.h>
				17	#include <linux/lockdep.h>
				18	#include <linux/crc32c.h>
				19	#include "misc.h"
				20	#include "tree-log.h"
				21	#include "disk-io.h"
				22	#include "print-tree.h"
				23	#include "volumes.h"
				24	#include "raid56.h"
				25	#include "locking.h"
				26	#include "free-space-cache.h"
				27	#include "free-space-tree.h"
				28	#include "sysfs.h"
				29	#include "qgroup.h"
				30	#include "ref-verify.h"
				31	#include "space-info.h"
				32	#include "block-rsv.h"
				33	#include "delalloc-space.h"
				34	#include "block-group.h"
				35	#include "rcu-string.h"
				36
				37	#undef SCRAMBLE_DELAYED_REFS
				38
				39
				40	static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
				41	struct btrfs_delayed_ref_node *node, u64 parent,
				42	u64 root_objectid, u64 owner_objectid,
				43	u64 owner_offset, int refs_to_drop,
				44	struct btrfs_delayed_extent_op *extra_op);
				45	static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				46	struct extent_buffer *leaf,
				47	struct btrfs_extent_item *ei);
				48	static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				49	u64 parent, u64 root_objectid,
				50	u64 flags, u64 owner, u64 offset,
				51	struct btrfs_key *ins, int ref_mod);
				52	static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				53	struct btrfs_delayed_ref_node *node,
				54	struct btrfs_delayed_extent_op *extent_op);
				55	static int find_next_key(struct btrfs_path *path, int level,
				56	struct btrfs_key *key);
				57
				58	static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
				59	{
				60	return (cache->flags & bits) == bits;
				61	}
				62
				63	int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
				64	u64 start, u64 num_bytes)
				65	{
				66	u64 end = start + num_bytes - 1;
				67	set_extent_bits(&fs_info->freed_extents[0],
				68	start, end, EXTENT_UPTODATE);
				69	set_extent_bits(&fs_info->freed_extents[1],
				70	start, end, EXTENT_UPTODATE);
				71	return 0;
				72	}
				73
				74	void btrfs_free_excluded_extents(struct btrfs_block_group_cache *cache)
				75	{
				76	struct btrfs_fs_info *fs_info = cache->fs_info;
				77	u64 start, end;
				78
				79	start = cache->key.objectid;
				80	end = start + cache->key.offset - 1;
				81
				82	clear_extent_bits(&fs_info->freed_extents[0],
				83	start, end, EXTENT_UPTODATE);
				84	clear_extent_bits(&fs_info->freed_extents[1],
				85	start, end, EXTENT_UPTODATE);
				86	}
				87
				88	static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
				89	{
				90	if (ref->type == BTRFS_REF_METADATA) {
				91	if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
				92	return BTRFS_BLOCK_GROUP_SYSTEM;
				93	else
				94	return BTRFS_BLOCK_GROUP_METADATA;
				95	}
				96	return BTRFS_BLOCK_GROUP_DATA;
				97	}
				98
				99	static void add_pinned_bytes(struct btrfs_fs_info *fs_info,
				100	struct btrfs_ref *ref)
				101	{
				102	struct btrfs_space_info *space_info;
				103	u64 flags = generic_ref_to_space_flags(ref);
				104
				105	space_info = btrfs_find_space_info(fs_info, flags);
				106	ASSERT(space_info);
				107	percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len,
				108	BTRFS_TOTAL_BYTES_PINNED_BATCH);
				109	}
				110
				111	static void sub_pinned_bytes(struct btrfs_fs_info *fs_info,
				112	struct btrfs_ref *ref)
				113	{
				114	struct btrfs_space_info *space_info;
				115	u64 flags = generic_ref_to_space_flags(ref);
				116
				117	space_info = btrfs_find_space_info(fs_info, flags);
				118	ASSERT(space_info);
				119	percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len,
				120	BTRFS_TOTAL_BYTES_PINNED_BATCH);
				121	}
				122
				123	/* simple helper to search for an existing data extent at a given offset */
				124	int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
				125	{
				126	int ret;
				127	struct btrfs_key key;
				128	struct btrfs_path *path;
				129
				130	path = btrfs_alloc_path();
				131	if (!path)
				132	return -ENOMEM;
				133
				134	key.objectid = start;
				135	key.offset = len;
				136	key.type = BTRFS_EXTENT_ITEM_KEY;
				137	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
				138	btrfs_free_path(path);
				139	return ret;
				140	}
				141
				142	/*
				143	* helper function to lookup reference count and flags of a tree block.
				144	*
				145	* the head node for delayed ref is used to store the sum of all the
				146	* reference count modifications queued up in the rbtree. the head
				147	* node may also store the extent flags to set. This way you can check
				148	* to see what the reference count and extent flags would be if all of
				149	* the delayed refs are not processed.
				150	*/
				151	int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
				152	struct btrfs_fs_info *fs_info, u64 bytenr,
				153	u64 offset, int metadata, u64 refs, u64 flags)
				154	{
				155	struct btrfs_delayed_ref_head *head;
				156	struct btrfs_delayed_ref_root *delayed_refs;
				157	struct btrfs_path *path;
				158	struct btrfs_extent_item *ei;
				159	struct extent_buffer *leaf;
				160	struct btrfs_key key;
				161	u32 item_size;
				162	u64 num_refs;
				163	u64 extent_flags;
				164	int ret;
				165
				166	/*
				167	* If we don't have skinny metadata, don't bother doing anything
				168	* different
				169	*/
				170	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
				171	offset = fs_info->nodesize;
				172	metadata = 0;
				173	}
				174
				175	path = btrfs_alloc_path();
				176	if (!path)
				177	return -ENOMEM;
				178
				179	if (!trans) {
				180	path->skip_locking = 1;
				181	path->search_commit_root = 1;
				182	}
				183
				184	search_again:
				185	key.objectid = bytenr;
				186	key.offset = offset;
				187	if (metadata)
				188	key.type = BTRFS_METADATA_ITEM_KEY;
				189	else
				190	key.type = BTRFS_EXTENT_ITEM_KEY;
				191
				192	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
				193	if (ret < 0)
				194	goto out_free;
				195
				196	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
				197	if (path->slots[0]) {
				198	path->slots[0]--;
				199	btrfs_item_key_to_cpu(path->nodes[0], &key,
				200	path->slots[0]);
				201	if (key.objectid == bytenr &&
				202	key.type == BTRFS_EXTENT_ITEM_KEY &&
				203	key.offset == fs_info->nodesize)
				204	ret = 0;
				205	}
				206	}
				207
				208	if (ret == 0) {
				209	leaf = path->nodes[0];
				210	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
				211	if (item_size >= sizeof(*ei)) {
				212	ei = btrfs_item_ptr(leaf, path->slots[0],
				213	struct btrfs_extent_item);
				214	num_refs = btrfs_extent_refs(leaf, ei);
				215	extent_flags = btrfs_extent_flags(leaf, ei);
				216	} else {
				217	ret = -EINVAL;
				218	btrfs_print_v0_err(fs_info);
				219	if (trans)
				220	btrfs_abort_transaction(trans, ret);
				221	else
				222	btrfs_handle_fs_error(fs_info, ret, NULL);
				223
				224	goto out_free;
				225	}
				226
				227	BUG_ON(num_refs == 0);
				228	} else {
				229	num_refs = 0;
				230	extent_flags = 0;
				231	ret = 0;
				232	}
				233
				234	if (!trans)
				235	goto out;
				236
				237	delayed_refs = &trans->transaction->delayed_refs;
				238	spin_lock(&delayed_refs->lock);
				239	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
				240	if (head) {
				241	if (!mutex_trylock(&head->mutex)) {
				242	refcount_inc(&head->refs);
				243	spin_unlock(&delayed_refs->lock);
				244
				245	btrfs_release_path(path);
				246
				247	/*
				248	* Mutex was contended, block until it's released and try
				249	* again
				250	*/
				251	mutex_lock(&head->mutex);
				252	mutex_unlock(&head->mutex);
				253	btrfs_put_delayed_ref_head(head);
				254	goto search_again;
				255	}
				256	spin_lock(&head->lock);
				257	if (head->extent_op && head->extent_op->update_flags)
				258	extent_flags \|= head->extent_op->flags_to_set;
				259	else
				260	BUG_ON(num_refs == 0);
				261
				262	num_refs += head->ref_mod;
				263	spin_unlock(&head->lock);
				264	mutex_unlock(&head->mutex);
				265	}
				266	spin_unlock(&delayed_refs->lock);
				267	out:
				268	WARN_ON(num_refs == 0);
				269	if (refs)
				270	*refs = num_refs;
				271	if (flags)
				272	*flags = extent_flags;
				273	out_free:
				274	btrfs_free_path(path);
				275	return ret;
				276	}
				277
				278	/*
				279	* Back reference rules. Back refs have three main goals:
				280	*
				281	* 1) differentiate between all holders of references to an extent so that
				282	* when a reference is dropped we can make sure it was a valid reference
				283	* before freeing the extent.
				284	*
				285	* 2) Provide enough information to quickly find the holders of an extent
				286	* if we notice a given block is corrupted or bad.
				287	*
				288	* 3) Make it easy to migrate blocks for FS shrinking or storage pool
				289	* maintenance. This is actually the same as #2, but with a slightly
				290	* different use case.
				291	*
				292	* There are two kinds of back refs. The implicit back refs is optimized
				293	* for pointers in non-shared tree blocks. For a given pointer in a block,
				294	* back refs of this kind provide information about the block's owner tree
				295	* and the pointer's key. These information allow us to find the block by
				296	* b-tree searching. The full back refs is for pointers in tree blocks not
				297	* referenced by their owner trees. The location of tree block is recorded
				298	* in the back refs. Actually the full back refs is generic, and can be
				299	* used in all cases the implicit back refs is used. The major shortcoming
				300	* of the full back refs is its overhead. Every time a tree block gets
				301	* COWed, we have to update back refs entry for all pointers in it.
				302	*
				303	* For a newly allocated tree block, we use implicit back refs for
				304	* pointers in it. This means most tree related operations only involve
				305	* implicit back refs. For a tree block created in old transaction, the
				306	* only way to drop a reference to it is COW it. So we can detect the
				307	* event that tree block loses its owner tree's reference and do the
				308	* back refs conversion.
				309	*
				310	* When a tree block is COWed through a tree, there are four cases:
				311	*
				312	* The reference count of the block is one and the tree is the block's
				313	* owner tree. Nothing to do in this case.
				314	*
				315	* The reference count of the block is one and the tree is not the
				316	* block's owner tree. In this case, full back refs is used for pointers
				317	* in the block. Remove these full back refs, add implicit back refs for
				318	* every pointers in the new block.
				319	*
				320	* The reference count of the block is greater than one and the tree is
				321	* the block's owner tree. In this case, implicit back refs is used for
				322	* pointers in the block. Add full back refs for every pointers in the
				323	* block, increase lower level extents' reference counts. The original
				324	* implicit back refs are entailed to the new block.
				325	*
				326	* The reference count of the block is greater than one and the tree is
				327	* not the block's owner tree. Add implicit back refs for every pointer in
				328	* the new block, increase lower level extents' reference count.
				329	*
				330	* Back Reference Key composing:
				331	*
				332	* The key objectid corresponds to the first byte in the extent,
				333	* The key type is used to differentiate between types of back refs.
				334	* There are different meanings of the key offset for different types
				335	* of back refs.
				336	*
				337	* File extents can be referenced by:
				338	*
				339	* - multiple snapshots, subvolumes, or different generations in one subvol
				340	* - different files inside a single subvolume
				341	* - different offsets inside a file (bookend extents in file.c)
				342	*
				343	* The extent ref structure for the implicit back refs has fields for:
				344	*
				345	* - Objectid of the subvolume root
				346	* - objectid of the file holding the reference
				347	* - original offset in the file
				348	* - how many bookend extents
				349	*
				350	* The key offset for the implicit back refs is hash of the first
				351	* three fields.
				352	*
				353	* The extent ref structure for the full back refs has field for:
				354	*
				355	* - number of pointers in the tree leaf
				356	*
				357	* The key offset for the full back refs is the first byte of
				358	* the tree leaf
				359	*
				360	* When a file extent is allocated, The implicit back refs is used.
				361	* the fields are filled in:
				362	*
				363	* (root_key.objectid, inode objectid, offset in file, 1)
				364	*
				365	* When a file extent is removed by file truncation, we find the
				366	* corresponding implicit back refs and check the following fields:
				367	*
				368	* (btrfs_header_owner(leaf), inode objectid, offset in file)
				369	*
				370	* Btree extents can be referenced by:
				371	*
				372	* - Different subvolumes
				373	*
				374	* Both the implicit back refs and the full back refs for tree blocks
				375	* only consist of key. The key offset for the implicit back refs is
				376	* objectid of block's owner tree. The key offset for the full back refs
				377	* is the first byte of parent block.
				378	*
				379	* When implicit back refs is used, information about the lowest key and
				380	* level of the tree block are required. These information are stored in
				381	* tree block info structure.
				382	*/
				383
				384	/*
				385	* is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
				386	* is_data == BTRFS_REF_TYPE_DATA, data type is requiried,
				387	* is_data == BTRFS_REF_TYPE_ANY, either type is OK.
				388	*/
				389	int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
				390	struct btrfs_extent_inline_ref *iref,
				391	enum btrfs_inline_ref_type is_data)
				392	{
				393	int type = btrfs_extent_inline_ref_type(eb, iref);
				394	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
				395
				396	if (type == BTRFS_TREE_BLOCK_REF_KEY \|\|
				397	type == BTRFS_SHARED_BLOCK_REF_KEY \|\|
				398	type == BTRFS_SHARED_DATA_REF_KEY \|\|
				399	type == BTRFS_EXTENT_DATA_REF_KEY) {
				400	if (is_data == BTRFS_REF_TYPE_BLOCK) {
				401	if (type == BTRFS_TREE_BLOCK_REF_KEY)
				402	return type;
				403	if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
				404	ASSERT(eb->fs_info);
				405	/*
				406	* Every shared one has parent tree block,
				407	* which must be aligned to sector size.
				408	*/
				409	if (offset &&
				410	IS_ALIGNED(offset, eb->fs_info->sectorsize))
				411	return type;
				412	}
				413	} else if (is_data == BTRFS_REF_TYPE_DATA) {
				414	if (type == BTRFS_EXTENT_DATA_REF_KEY)
				415	return type;
				416	if (type == BTRFS_SHARED_DATA_REF_KEY) {
				417	ASSERT(eb->fs_info);
				418	/*
				419	* Every shared one has parent tree block,
				420	* which must be aligned to sector size.
				421	*/
				422	if (offset &&
				423	IS_ALIGNED(offset, eb->fs_info->sectorsize))
				424	return type;
				425	}
				426	} else {
				427	ASSERT(is_data == BTRFS_REF_TYPE_ANY);
				428	return type;
				429	}
				430	}
				431
				432	btrfs_print_leaf((struct extent_buffer *)eb);
				433	btrfs_err(eb->fs_info,
				434	"eb %llu iref 0x%lx invalid extent inline ref type %d",
				435	eb->start, (unsigned long)iref, type);
				436	WARN_ON(1);
				437
				438	return BTRFS_REF_TYPE_INVALID;
				439	}
				440
				441	u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
				442	{
				443	u32 high_crc = ~(u32)0;
				444	u32 low_crc = ~(u32)0;
				445	__le64 lenum;
				446
				447	lenum = cpu_to_le64(root_objectid);
				448	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
				449	lenum = cpu_to_le64(owner);
				450	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
				451	lenum = cpu_to_le64(offset);
				452	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
				453
				454	return ((u64)high_crc << 31) ^ (u64)low_crc;
				455	}
				456
				457	static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
				458	struct btrfs_extent_data_ref *ref)
				459	{
				460	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
				461	btrfs_extent_data_ref_objectid(leaf, ref),
				462	btrfs_extent_data_ref_offset(leaf, ref));
				463	}
				464
				465	static int match_extent_data_ref(struct extent_buffer *leaf,
				466	struct btrfs_extent_data_ref *ref,
				467	u64 root_objectid, u64 owner, u64 offset)
				468	{
				469	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid \|\|
				470	btrfs_extent_data_ref_objectid(leaf, ref) != owner \|\|
				471	btrfs_extent_data_ref_offset(leaf, ref) != offset)
				472	return 0;
				473	return 1;
				474	}
				475
				476	static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
				477	struct btrfs_path *path,
				478	u64 bytenr, u64 parent,
				479	u64 root_objectid,
				480	u64 owner, u64 offset)
				481	{
				482	struct btrfs_root *root = trans->fs_info->extent_root;
				483	struct btrfs_key key;
				484	struct btrfs_extent_data_ref *ref;
				485	struct extent_buffer *leaf;
				486	u32 nritems;
				487	int ret;
				488	int recow;
				489	int err = -ENOENT;
				490
				491	key.objectid = bytenr;
				492	if (parent) {
				493	key.type = BTRFS_SHARED_DATA_REF_KEY;
				494	key.offset = parent;
				495	} else {
				496	key.type = BTRFS_EXTENT_DATA_REF_KEY;
				497	key.offset = hash_extent_data_ref(root_objectid,
				498	owner, offset);
				499	}
				500	again:
				501	recow = 0;
				502	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				503	if (ret < 0) {
				504	err = ret;
				505	goto fail;
				506	}
				507
				508	if (parent) {
				509	if (!ret)
				510	return 0;
				511	goto fail;
				512	}
				513
				514	leaf = path->nodes[0];
				515	nritems = btrfs_header_nritems(leaf);
				516	while (1) {
				517	if (path->slots[0] >= nritems) {
				518	ret = btrfs_next_leaf(root, path);
				519	if (ret < 0)
				520	err = ret;
				521	if (ret)
				522	goto fail;
				523
				524	leaf = path->nodes[0];
				525	nritems = btrfs_header_nritems(leaf);
				526	recow = 1;
				527	}
				528
				529	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				530	if (key.objectid != bytenr \|\|
				531	key.type != BTRFS_EXTENT_DATA_REF_KEY)
				532	goto fail;
				533
				534	ref = btrfs_item_ptr(leaf, path->slots[0],
				535	struct btrfs_extent_data_ref);
				536
				537	if (match_extent_data_ref(leaf, ref, root_objectid,
				538	owner, offset)) {
				539	if (recow) {
				540	btrfs_release_path(path);
				541	goto again;
				542	}
				543	err = 0;
				544	break;
				545	}
				546	path->slots[0]++;
				547	}
				548	fail:
				549	return err;
				550	}
				551
				552	static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
				553	struct btrfs_path *path,
				554	u64 bytenr, u64 parent,
				555	u64 root_objectid, u64 owner,
				556	u64 offset, int refs_to_add)
				557	{
				558	struct btrfs_root *root = trans->fs_info->extent_root;
				559	struct btrfs_key key;
				560	struct extent_buffer *leaf;
				561	u32 size;
				562	u32 num_refs;
				563	int ret;
				564
				565	key.objectid = bytenr;
				566	if (parent) {
				567	key.type = BTRFS_SHARED_DATA_REF_KEY;
				568	key.offset = parent;
				569	size = sizeof(struct btrfs_shared_data_ref);
				570	} else {
				571	key.type = BTRFS_EXTENT_DATA_REF_KEY;
				572	key.offset = hash_extent_data_ref(root_objectid,
				573	owner, offset);
				574	size = sizeof(struct btrfs_extent_data_ref);
				575	}
				576
				577	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
				578	if (ret && ret != -EEXIST)
				579	goto fail;
				580
				581	leaf = path->nodes[0];
				582	if (parent) {
				583	struct btrfs_shared_data_ref *ref;
				584	ref = btrfs_item_ptr(leaf, path->slots[0],
				585	struct btrfs_shared_data_ref);
				586	if (ret == 0) {
				587	btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
				588	} else {
				589	num_refs = btrfs_shared_data_ref_count(leaf, ref);
				590	num_refs += refs_to_add;
				591	btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
				592	}
				593	} else {
				594	struct btrfs_extent_data_ref *ref;
				595	while (ret == -EEXIST) {
				596	ref = btrfs_item_ptr(leaf, path->slots[0],
				597	struct btrfs_extent_data_ref);
				598	if (match_extent_data_ref(leaf, ref, root_objectid,
				599	owner, offset))
				600	break;
				601	btrfs_release_path(path);
				602	key.offset++;
				603	ret = btrfs_insert_empty_item(trans, root, path, &key,
				604	size);
				605	if (ret && ret != -EEXIST)
				606	goto fail;
				607
				608	leaf = path->nodes[0];
				609	}
				610	ref = btrfs_item_ptr(leaf, path->slots[0],
				611	struct btrfs_extent_data_ref);
				612	if (ret == 0) {
				613	btrfs_set_extent_data_ref_root(leaf, ref,
				614	root_objectid);
				615	btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
				616	btrfs_set_extent_data_ref_offset(leaf, ref, offset);
				617	btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
				618	} else {
				619	num_refs = btrfs_extent_data_ref_count(leaf, ref);
				620	num_refs += refs_to_add;
				621	btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
				622	}
				623	}
				624	btrfs_mark_buffer_dirty(leaf);
				625	ret = 0;
				626	fail:
				627	btrfs_release_path(path);
				628	return ret;
				629	}
				630
				631	static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
				632	struct btrfs_path *path,
				633	int refs_to_drop, int *last_ref)
				634	{
				635	struct btrfs_key key;
				636	struct btrfs_extent_data_ref *ref1 = NULL;
				637	struct btrfs_shared_data_ref *ref2 = NULL;
				638	struct extent_buffer *leaf;
				639	u32 num_refs = 0;
				640	int ret = 0;
				641
				642	leaf = path->nodes[0];
				643	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				644
				645	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
				646	ref1 = btrfs_item_ptr(leaf, path->slots[0],
				647	struct btrfs_extent_data_ref);
				648	num_refs = btrfs_extent_data_ref_count(leaf, ref1);
				649	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
				650	ref2 = btrfs_item_ptr(leaf, path->slots[0],
				651	struct btrfs_shared_data_ref);
				652	num_refs = btrfs_shared_data_ref_count(leaf, ref2);
				653	} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
				654	btrfs_print_v0_err(trans->fs_info);
				655	btrfs_abort_transaction(trans, -EINVAL);
				656	return -EINVAL;
				657	} else {
				658	BUG();
				659	}
				660
				661	BUG_ON(num_refs < refs_to_drop);
				662	num_refs -= refs_to_drop;
				663
				664	if (num_refs == 0) {
				665	ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
				666	*last_ref = 1;
				667	} else {
				668	if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
				669	btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
				670	else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
				671	btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
				672	btrfs_mark_buffer_dirty(leaf);
				673	}
				674	return ret;
				675	}
				676
				677	static noinline u32 extent_data_ref_count(struct btrfs_path *path,
				678	struct btrfs_extent_inline_ref *iref)
				679	{
				680	struct btrfs_key key;
				681	struct extent_buffer *leaf;
				682	struct btrfs_extent_data_ref *ref1;
				683	struct btrfs_shared_data_ref *ref2;
				684	u32 num_refs = 0;
				685	int type;
				686
				687	leaf = path->nodes[0];
				688	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				689
				690	BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
				691	if (iref) {
				692	/*
				693	* If type is invalid, we should have bailed out earlier than
				694	* this call.
				695	*/
				696	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
				697	ASSERT(type != BTRFS_REF_TYPE_INVALID);
				698	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
				699	ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
				700	num_refs = btrfs_extent_data_ref_count(leaf, ref1);
				701	} else {
				702	ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
				703	num_refs = btrfs_shared_data_ref_count(leaf, ref2);
				704	}
				705	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
				706	ref1 = btrfs_item_ptr(leaf, path->slots[0],
				707	struct btrfs_extent_data_ref);
				708	num_refs = btrfs_extent_data_ref_count(leaf, ref1);
				709	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
				710	ref2 = btrfs_item_ptr(leaf, path->slots[0],
				711	struct btrfs_shared_data_ref);
				712	num_refs = btrfs_shared_data_ref_count(leaf, ref2);
				713	} else {
				714	WARN_ON(1);
				715	}
				716	return num_refs;
				717	}
				718
				719	static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
				720	struct btrfs_path *path,
				721	u64 bytenr, u64 parent,
				722	u64 root_objectid)
				723	{
				724	struct btrfs_root *root = trans->fs_info->extent_root;
				725	struct btrfs_key key;
				726	int ret;
				727
				728	key.objectid = bytenr;
				729	if (parent) {
				730	key.type = BTRFS_SHARED_BLOCK_REF_KEY;
				731	key.offset = parent;
				732	} else {
				733	key.type = BTRFS_TREE_BLOCK_REF_KEY;
				734	key.offset = root_objectid;
				735	}
				736
				737	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				738	if (ret > 0)
				739	ret = -ENOENT;
				740	return ret;
				741	}
				742
				743	static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
				744	struct btrfs_path *path,
				745	u64 bytenr, u64 parent,
				746	u64 root_objectid)
				747	{
				748	struct btrfs_key key;
				749	int ret;
				750
				751	key.objectid = bytenr;
				752	if (parent) {
				753	key.type = BTRFS_SHARED_BLOCK_REF_KEY;
				754	key.offset = parent;
				755	} else {
				756	key.type = BTRFS_TREE_BLOCK_REF_KEY;
				757	key.offset = root_objectid;
				758	}
				759
				760	ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
				761	path, &key, 0);
				762	btrfs_release_path(path);
				763	return ret;
				764	}
				765
				766	static inline int extent_ref_type(u64 parent, u64 owner)
				767	{
				768	int type;
				769	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
				770	if (parent > 0)
				771	type = BTRFS_SHARED_BLOCK_REF_KEY;
				772	else
				773	type = BTRFS_TREE_BLOCK_REF_KEY;
				774	} else {
				775	if (parent > 0)
				776	type = BTRFS_SHARED_DATA_REF_KEY;
				777	else
				778	type = BTRFS_EXTENT_DATA_REF_KEY;
				779	}
				780	return type;
				781	}
				782
				783	static int find_next_key(struct btrfs_path *path, int level,
				784	struct btrfs_key *key)
				785
				786	{
				787	for (; level < BTRFS_MAX_LEVEL; level++) {
				788	if (!path->nodes[level])
				789	break;
				790	if (path->slots[level] + 1 >=
				791	btrfs_header_nritems(path->nodes[level]))
				792	continue;
				793	if (level == 0)
				794	btrfs_item_key_to_cpu(path->nodes[level], key,
				795	path->slots[level] + 1);
				796	else
				797	btrfs_node_key_to_cpu(path->nodes[level], key,
				798	path->slots[level] + 1);
				799	return 0;
				800	}
				801	return 1;
				802	}
				803
				804	/*
				805	* look for inline back ref. if back ref is found, *ref_ret is set
				806	* to the address of inline back ref, and 0 is returned.
				807	*
				808	* if back ref isn't found, *ref_ret is set to the address where it
				809	* should be inserted, and -ENOENT is returned.
				810	*
				811	* if insert is true and there are too many inline back refs, the path
				812	* points to the extent item, and -EAGAIN is returned.
				813	*
				814	* NOTE: inline back refs are ordered in the same way that back ref
				815	* items in the tree are ordered.
				816	*/
				817	static noinline_for_stack
				818	int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				819	struct btrfs_path *path,
				820	struct btrfs_extent_inline_ref **ref_ret,
				821	u64 bytenr, u64 num_bytes,
				822	u64 parent, u64 root_objectid,
				823	u64 owner, u64 offset, int insert)
				824	{
				825	struct btrfs_fs_info *fs_info = trans->fs_info;
				826	struct btrfs_root *root = fs_info->extent_root;
				827	struct btrfs_key key;
				828	struct extent_buffer *leaf;
				829	struct btrfs_extent_item *ei;
				830	struct btrfs_extent_inline_ref *iref;
				831	u64 flags;
				832	u64 item_size;
				833	unsigned long ptr;
				834	unsigned long end;
				835	int extra_size;
				836	int type;
				837	int want;
				838	int ret;
				839	int err = 0;
				840	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
				841	int needed;
				842
				843	key.objectid = bytenr;
				844	key.type = BTRFS_EXTENT_ITEM_KEY;
				845	key.offset = num_bytes;
				846
				847	want = extent_ref_type(parent, owner);
				848	if (insert) {
				849	extra_size = btrfs_extent_inline_ref_size(want);
				850	path->keep_locks = 1;
				851	} else
				852	extra_size = -1;
				853
				854	/*
				855	* Owner is our level, so we can just add one to get the level for the
				856	* block we are interested in.
				857	*/
				858	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
				859	key.type = BTRFS_METADATA_ITEM_KEY;
				860	key.offset = owner;
				861	}
				862
				863	again:
				864	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
				865	if (ret < 0) {
				866	err = ret;
				867	goto out;
				868	}
				869
				870	/*
				871	* We may be a newly converted file system which still has the old fat
				872	* extent entries for metadata, so try and see if we have one of those.
				873	*/
				874	if (ret > 0 && skinny_metadata) {
				875	skinny_metadata = false;
				876	if (path->slots[0]) {
				877	path->slots[0]--;
				878	btrfs_item_key_to_cpu(path->nodes[0], &key,
				879	path->slots[0]);
				880	if (key.objectid == bytenr &&
				881	key.type == BTRFS_EXTENT_ITEM_KEY &&
				882	key.offset == num_bytes)
				883	ret = 0;
				884	}
				885	if (ret) {
				886	key.objectid = bytenr;
				887	key.type = BTRFS_EXTENT_ITEM_KEY;
				888	key.offset = num_bytes;
				889	btrfs_release_path(path);
				890	goto again;
				891	}
				892	}
				893
				894	if (ret && !insert) {
				895	err = -ENOENT;
				896	goto out;
				897	} else if (WARN_ON(ret)) {
				898	btrfs_print_leaf(path->nodes[0]);
				899	btrfs_err(fs_info,
				900	"extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu",
				901	bytenr, num_bytes, parent, root_objectid, owner,
				902	offset);
				903	err = -EIO;
				904	goto out;
				905	}
				906
				907	leaf = path->nodes[0];
				908	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
				909	if (unlikely(item_size < sizeof(*ei))) {
				910	err = -EINVAL;
				911	btrfs_print_v0_err(fs_info);
				912	btrfs_abort_transaction(trans, err);
				913	goto out;
				914	}
				915
				916	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
				917	flags = btrfs_extent_flags(leaf, ei);
				918
				919	ptr = (unsigned long)(ei + 1);
				920	end = (unsigned long)ei + item_size;
				921
				922	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
				923	ptr += sizeof(struct btrfs_tree_block_info);
				924	BUG_ON(ptr > end);
				925	}
				926
				927	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
				928	needed = BTRFS_REF_TYPE_DATA;
				929	else
				930	needed = BTRFS_REF_TYPE_BLOCK;
				931
				932	err = -ENOENT;
				933	while (1) {
				934	if (ptr >= end) {
				935	WARN_ON(ptr > end);
				936	break;
				937	}
				938	iref = (struct btrfs_extent_inline_ref *)ptr;
				939	type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
				940	if (type == BTRFS_REF_TYPE_INVALID) {
				941	err = -EUCLEAN;
				942	goto out;
				943	}
				944
				945	if (want < type)
				946	break;
				947	if (want > type) {
				948	ptr += btrfs_extent_inline_ref_size(type);
				949	continue;
				950	}
				951
				952	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
				953	struct btrfs_extent_data_ref *dref;
				954	dref = (struct btrfs_extent_data_ref *)(&iref->offset);
				955	if (match_extent_data_ref(leaf, dref, root_objectid,
				956	owner, offset)) {
				957	err = 0;
				958	break;
				959	}
				960	if (hash_extent_data_ref_item(leaf, dref) <
				961	hash_extent_data_ref(root_objectid, owner, offset))
				962	break;
				963	} else {
				964	u64 ref_offset;
				965	ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
				966	if (parent > 0) {
				967	if (parent == ref_offset) {
				968	err = 0;
				969	break;
				970	}
				971	if (ref_offset < parent)
				972	break;
				973	} else {
				974	if (root_objectid == ref_offset) {
				975	err = 0;
				976	break;
				977	}
				978	if (ref_offset < root_objectid)
				979	break;
				980	}
				981	}
				982	ptr += btrfs_extent_inline_ref_size(type);
				983	}
				984	if (err == -ENOENT && insert) {
				985	if (item_size + extra_size >=
				986	BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
				987	err = -EAGAIN;
				988	goto out;
				989	}
				990	/*
				991	* To add new inline back ref, we have to make sure
				992	* there is no corresponding back ref item.
				993	* For simplicity, we just do not add new inline back
				994	* ref if there is any kind of item for this block
				995	*/
				996	if (find_next_key(path, 0, &key) == 0 &&
				997	key.objectid == bytenr &&
				998	key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
				999	err = -EAGAIN;
				1000	goto out;
				1001	}
				1002	}
				1003	ref_ret = (struct btrfs_extent_inline_ref )ptr;
				1004	out:
				1005	if (insert) {
				1006	path->keep_locks = 0;
				1007	btrfs_unlock_up_safe(path, 1);
				1008	}
				1009	return err;
				1010	}
				1011
				1012	/*
				1013	* helper to add new inline back ref
				1014	*/
				1015	static noinline_for_stack
				1016	void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
				1017	struct btrfs_path *path,
				1018	struct btrfs_extent_inline_ref *iref,
				1019	u64 parent, u64 root_objectid,
				1020	u64 owner, u64 offset, int refs_to_add,
				1021	struct btrfs_delayed_extent_op *extent_op)
				1022	{
				1023	struct extent_buffer *leaf;
				1024	struct btrfs_extent_item *ei;
				1025	unsigned long ptr;
				1026	unsigned long end;
				1027	unsigned long item_offset;
				1028	u64 refs;
				1029	int size;
				1030	int type;
				1031
				1032	leaf = path->nodes[0];
				1033	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
				1034	item_offset = (unsigned long)iref - (unsigned long)ei;
				1035
				1036	type = extent_ref_type(parent, owner);
				1037	size = btrfs_extent_inline_ref_size(type);
				1038
				1039	btrfs_extend_item(path, size);
				1040
				1041	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
				1042	refs = btrfs_extent_refs(leaf, ei);
				1043	refs += refs_to_add;
				1044	btrfs_set_extent_refs(leaf, ei, refs);
				1045	if (extent_op)
				1046	__run_delayed_extent_op(extent_op, leaf, ei);
				1047
				1048	ptr = (unsigned long)ei + item_offset;
				1049	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
				1050	if (ptr < end - size)
				1051	memmove_extent_buffer(leaf, ptr + size, ptr,
				1052	end - size - ptr);
				1053
				1054	iref = (struct btrfs_extent_inline_ref *)ptr;
				1055	btrfs_set_extent_inline_ref_type(leaf, iref, type);
				1056	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
				1057	struct btrfs_extent_data_ref *dref;
				1058	dref = (struct btrfs_extent_data_ref *)(&iref->offset);
				1059	btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
				1060	btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
				1061	btrfs_set_extent_data_ref_offset(leaf, dref, offset);
				1062	btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
				1063	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
				1064	struct btrfs_shared_data_ref *sref;
				1065	sref = (struct btrfs_shared_data_ref *)(iref + 1);
				1066	btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
				1067	btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
				1068	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
				1069	btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
				1070	} else {
				1071	btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
				1072	}
				1073	btrfs_mark_buffer_dirty(leaf);
				1074	}
				1075
				1076	static int lookup_extent_backref(struct btrfs_trans_handle *trans,
				1077	struct btrfs_path *path,
				1078	struct btrfs_extent_inline_ref **ref_ret,
				1079	u64 bytenr, u64 num_bytes, u64 parent,
				1080	u64 root_objectid, u64 owner, u64 offset)
				1081	{
				1082	int ret;
				1083
				1084	ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
				1085	num_bytes, parent, root_objectid,
				1086	owner, offset, 0);
				1087	if (ret != -ENOENT)
				1088	return ret;
				1089
				1090	btrfs_release_path(path);
				1091	*ref_ret = NULL;
				1092
				1093	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
				1094	ret = lookup_tree_block_ref(trans, path, bytenr, parent,
				1095	root_objectid);
				1096	} else {
				1097	ret = lookup_extent_data_ref(trans, path, bytenr, parent,
				1098	root_objectid, owner, offset);
				1099	}
				1100	return ret;
				1101	}
				1102
				1103	/*
				1104	* helper to update/remove inline back ref
				1105	*/
				1106	static noinline_for_stack
				1107	void update_inline_extent_backref(struct btrfs_path *path,
				1108	struct btrfs_extent_inline_ref *iref,
				1109	int refs_to_mod,
				1110	struct btrfs_delayed_extent_op *extent_op,
				1111	int *last_ref)
				1112	{
				1113	struct extent_buffer *leaf = path->nodes[0];
				1114	struct btrfs_extent_item *ei;
				1115	struct btrfs_extent_data_ref *dref = NULL;
				1116	struct btrfs_shared_data_ref *sref = NULL;
				1117	unsigned long ptr;
				1118	unsigned long end;
				1119	u32 item_size;
				1120	int size;
				1121	int type;
				1122	u64 refs;
				1123
				1124	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
				1125	refs = btrfs_extent_refs(leaf, ei);
				1126	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
				1127	refs += refs_to_mod;
				1128	btrfs_set_extent_refs(leaf, ei, refs);
				1129	if (extent_op)
				1130	__run_delayed_extent_op(extent_op, leaf, ei);
				1131
				1132	/*
				1133	* If type is invalid, we should have bailed out after
				1134	* lookup_inline_extent_backref().
				1135	*/
				1136	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
				1137	ASSERT(type != BTRFS_REF_TYPE_INVALID);
				1138
				1139	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
				1140	dref = (struct btrfs_extent_data_ref *)(&iref->offset);
				1141	refs = btrfs_extent_data_ref_count(leaf, dref);
				1142	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
				1143	sref = (struct btrfs_shared_data_ref *)(iref + 1);
				1144	refs = btrfs_shared_data_ref_count(leaf, sref);
				1145	} else {
				1146	refs = 1;
				1147	BUG_ON(refs_to_mod != -1);
				1148	}
				1149
				1150	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
				1151	refs += refs_to_mod;
				1152
				1153	if (refs > 0) {
				1154	if (type == BTRFS_EXTENT_DATA_REF_KEY)
				1155	btrfs_set_extent_data_ref_count(leaf, dref, refs);
				1156	else
				1157	btrfs_set_shared_data_ref_count(leaf, sref, refs);
				1158	} else {
				1159	*last_ref = 1;
				1160	size = btrfs_extent_inline_ref_size(type);
				1161	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
				1162	ptr = (unsigned long)iref;
				1163	end = (unsigned long)ei + item_size;
				1164	if (ptr + size < end)
				1165	memmove_extent_buffer(leaf, ptr, ptr + size,
				1166	end - ptr - size);
				1167	item_size -= size;
				1168	btrfs_truncate_item(path, item_size, 1);
				1169	}
				1170	btrfs_mark_buffer_dirty(leaf);
				1171	}
				1172
				1173	static noinline_for_stack
				1174	int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
				1175	struct btrfs_path *path,
				1176	u64 bytenr, u64 num_bytes, u64 parent,
				1177	u64 root_objectid, u64 owner,
				1178	u64 offset, int refs_to_add,
				1179	struct btrfs_delayed_extent_op *extent_op)
				1180	{
				1181	struct btrfs_extent_inline_ref *iref;
				1182	int ret;
				1183
				1184	ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
				1185	num_bytes, parent, root_objectid,
				1186	owner, offset, 1);
				1187	if (ret == 0) {
				1188	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
				1189	update_inline_extent_backref(path, iref, refs_to_add,
				1190	extent_op, NULL);
				1191	} else if (ret == -ENOENT) {
				1192	setup_inline_extent_backref(trans->fs_info, path, iref, parent,
				1193	root_objectid, owner, offset,
				1194	refs_to_add, extent_op);
				1195	ret = 0;
				1196	}
				1197	return ret;
				1198	}
				1199
				1200	static int insert_extent_backref(struct btrfs_trans_handle *trans,
				1201	struct btrfs_path *path,
				1202	u64 bytenr, u64 parent, u64 root_objectid,
				1203	u64 owner, u64 offset, int refs_to_add)
				1204	{
				1205	int ret;
				1206	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
				1207	BUG_ON(refs_to_add != 1);
				1208	ret = insert_tree_block_ref(trans, path, bytenr, parent,
				1209	root_objectid);
				1210	} else {
				1211	ret = insert_extent_data_ref(trans, path, bytenr, parent,
				1212	root_objectid, owner, offset,
				1213	refs_to_add);
				1214	}
				1215	return ret;
				1216	}
				1217
				1218	static int remove_extent_backref(struct btrfs_trans_handle *trans,
				1219	struct btrfs_path *path,
				1220	struct btrfs_extent_inline_ref *iref,
				1221	int refs_to_drop, int is_data, int *last_ref)
				1222	{
				1223	int ret = 0;
				1224
				1225	BUG_ON(!is_data && refs_to_drop != 1);
				1226	if (iref) {
				1227	update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
				1228	last_ref);
				1229	} else if (is_data) {
				1230	ret = remove_extent_data_ref(trans, path, refs_to_drop,
				1231	last_ref);
				1232	} else {
				1233	*last_ref = 1;
				1234	ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
				1235	}
				1236	return ret;
				1237	}
				1238
				1239	static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
				1240	u64 *discarded_bytes)
				1241	{
				1242	int j, ret = 0;
				1243	u64 bytes_left, end;
				1244	u64 aligned_start = ALIGN(start, 1 << 9);
				1245
				1246	/* Adjust the range to be aligned to 512B sectors if necessary. */
				1247	if (start != aligned_start) {
				1248	len -= aligned_start - start;
				1249	len = round_down(len, 1 << 9);
				1250	start = aligned_start;
				1251	}
				1252
				1253	*discarded_bytes = 0;
				1254
				1255	if (!len)
				1256	return 0;
				1257
				1258	end = start + len;
				1259	bytes_left = len;
				1260
				1261	/* Skip any superblocks on this device. */
				1262	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
				1263	u64 sb_start = btrfs_sb_offset(j);
				1264	u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
				1265	u64 size = sb_start - start;
				1266
				1267	if (!in_range(sb_start, start, bytes_left) &&
				1268	!in_range(sb_end, start, bytes_left) &&
				1269	!in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
				1270	continue;
				1271
				1272	/*
				1273	* Superblock spans beginning of range. Adjust start and
				1274	* try again.
				1275	*/
				1276	if (sb_start <= start) {
				1277	start += sb_end - start;
				1278	if (start > end) {
				1279	bytes_left = 0;
				1280	break;
				1281	}
				1282	bytes_left = end - start;
				1283	continue;
				1284	}
				1285
				1286	if (size) {
				1287	ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
				1288	GFP_NOFS, 0);
				1289	if (!ret)
				1290	*discarded_bytes += size;
				1291	else if (ret != -EOPNOTSUPP)
				1292	return ret;
				1293	}
				1294
				1295	start = sb_end;
				1296	if (start > end) {
				1297	bytes_left = 0;
				1298	break;
				1299	}
				1300	bytes_left = end - start;
				1301	}
				1302
				1303	if (bytes_left) {
				1304	ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
				1305	GFP_NOFS, 0);
				1306	if (!ret)
				1307	*discarded_bytes += bytes_left;
				1308	}
				1309	return ret;
				1310	}
				1311
				1312	int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
				1313	u64 num_bytes, u64 *actual_bytes)
				1314	{
				1315	int ret = 0;
				1316	u64 discarded_bytes = 0;
				1317	u64 end = bytenr + num_bytes;
				1318	u64 cur = bytenr;
				1319	struct btrfs_bio *bbio = NULL;
				1320
				1321
				1322	/*
				1323	* Avoid races with device replace and make sure our bbio has devices
				1324	* associated to its stripes that don't go away while we are discarding.
				1325	*/
				1326	btrfs_bio_counter_inc_blocked(fs_info);
				1327	while (cur < end) {
				1328	struct btrfs_bio_stripe *stripe;
				1329	int i;
				1330
				1331	num_bytes = end - cur;
				1332	/* Tell the block device(s) that the sectors can be discarded */
				1333	ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
				1334	&num_bytes, &bbio, 0);
				1335	/*
				1336	* Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
				1337	* -EOPNOTSUPP. For any such error, @num_bytes is not updated,
				1338	* thus we can't continue anyway.
				1339	*/
				1340	if (ret < 0)
				1341	goto out;
				1342
				1343	stripe = bbio->stripes;
				1344	for (i = 0; i < bbio->num_stripes; i++, stripe++) {
				1345	u64 bytes;
				1346	struct request_queue *req_q;
				1347	struct btrfs_device *device = stripe->dev;
				1348
				1349	if (!device->bdev) {
				1350	ASSERT(btrfs_test_opt(fs_info, DEGRADED));
				1351	continue;
				1352	}
				1353	req_q = bdev_get_queue(device->bdev);
				1354	if (!blk_queue_discard(req_q))
				1355	continue;
				1356
				1357	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
				1358	continue;
				1359
				1360	ret = btrfs_issue_discard(device->bdev,
				1361	stripe->physical,
				1362	stripe->length,
				1363	&bytes);
				1364	if (!ret) {
				1365	discarded_bytes += bytes;
				1366	} else if (ret != -EOPNOTSUPP) {
				1367	/*
				1368	* Logic errors or -ENOMEM, or -EIO, but
				1369	* unlikely to happen.
				1370	*
				1371	* And since there are two loops, explicitly
				1372	* go to out to avoid confusion.
				1373	*/
				1374	btrfs_put_bbio(bbio);
				1375	goto out;
				1376	}
				1377
				1378	/*
				1379	* Just in case we get back EOPNOTSUPP for some reason,
				1380	* just ignore the return value so we don't screw up
				1381	* people calling discard_extent.
				1382	*/
				1383	ret = 0;
				1384	}
				1385	btrfs_put_bbio(bbio);
				1386	cur += num_bytes;
				1387	}
				1388	out:
				1389	btrfs_bio_counter_dec(fs_info);
				1390
				1391	if (actual_bytes)
				1392	*actual_bytes = discarded_bytes;
				1393
				1394
				1395	if (ret == -EOPNOTSUPP)
				1396	ret = 0;
				1397	return ret;
				1398	}
				1399
				1400	/* Can return -ENOMEM */
				1401	int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
				1402	struct btrfs_ref *generic_ref)
				1403	{
				1404	struct btrfs_fs_info *fs_info = trans->fs_info;
				1405	int old_ref_mod, new_ref_mod;
				1406	int ret;
				1407
				1408	ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
				1409	generic_ref->action);
				1410	BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
				1411	generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
				1412
				1413	if (generic_ref->type == BTRFS_REF_METADATA)
				1414	ret = btrfs_add_delayed_tree_ref(trans, generic_ref,
				1415	NULL, &old_ref_mod, &new_ref_mod);
				1416	else
				1417	ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0,
				1418	&old_ref_mod, &new_ref_mod);
				1419
				1420	btrfs_ref_tree_mod(fs_info, generic_ref);
				1421
				1422	if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
				1423	sub_pinned_bytes(fs_info, generic_ref);
				1424
				1425	return ret;
				1426	}
				1427
				1428	/*
				1429	* __btrfs_inc_extent_ref - insert backreference for a given extent
				1430	*
				1431	* @trans: Handle of transaction
				1432	*
				1433	* @node: The delayed ref node used to get the bytenr/length for
				1434	* extent whose references are incremented.
				1435	*
				1436	* @parent: If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
				1437	* BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
				1438	* bytenr of the parent block. Since new extents are always
				1439	* created with indirect references, this will only be the case
				1440	* when relocating a shared extent. In that case, root_objectid
				1441	* will be BTRFS_TREE_RELOC_OBJECTID. Otheriwse, parent must
				1442	* be 0
				1443	*
				1444	* @root_objectid: The id of the root where this modification has originated,
				1445	* this can be either one of the well-known metadata trees or
				1446	* the subvolume id which references this extent.
				1447	*
				1448	* @owner: For data extents it is the inode number of the owning file.
				1449	* For metadata extents this parameter holds the level in the
				1450	* tree of the extent.
				1451	*
				1452	* @offset: For metadata extents the offset is ignored and is currently
				1453	* always passed as 0. For data extents it is the fileoffset
				1454	* this extent belongs to.
				1455	*
				1456	* @refs_to_add Number of references to add
				1457	*
				1458	* @extent_op Pointer to a structure, holding information necessary when
				1459	* updating a tree block's flags
				1460	*
				1461	*/
				1462	static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
				1463	struct btrfs_delayed_ref_node *node,
				1464	u64 parent, u64 root_objectid,
				1465	u64 owner, u64 offset, int refs_to_add,
				1466	struct btrfs_delayed_extent_op *extent_op)
				1467	{
				1468	struct btrfs_path *path;
				1469	struct extent_buffer *leaf;
				1470	struct btrfs_extent_item *item;
				1471	struct btrfs_key key;
				1472	u64 bytenr = node->bytenr;
				1473	u64 num_bytes = node->num_bytes;
				1474	u64 refs;
				1475	int ret;
				1476
				1477	path = btrfs_alloc_path();
				1478	if (!path)
				1479	return -ENOMEM;
				1480
				1481	path->reada = READA_FORWARD;
				1482	path->leave_spinning = 1;
				1483	/* this will setup the path even if it fails to insert the back ref */
				1484	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
				1485	parent, root_objectid, owner,
				1486	offset, refs_to_add, extent_op);
				1487	if ((ret < 0 && ret != -EAGAIN) \|\| !ret)
				1488	goto out;
				1489
				1490	/*
				1491	* Ok we had -EAGAIN which means we didn't have space to insert and
				1492	* inline extent ref, so just update the reference count and add a
				1493	* normal backref.
				1494	*/
				1495	leaf = path->nodes[0];
				1496	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				1497	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
				1498	refs = btrfs_extent_refs(leaf, item);
				1499	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
				1500	if (extent_op)
				1501	__run_delayed_extent_op(extent_op, leaf, item);
				1502
				1503	btrfs_mark_buffer_dirty(leaf);
				1504	btrfs_release_path(path);
				1505
				1506	path->reada = READA_FORWARD;
				1507	path->leave_spinning = 1;
				1508	/* now insert the actual backref */
				1509	ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
				1510	owner, offset, refs_to_add);
				1511	if (ret)
				1512	btrfs_abort_transaction(trans, ret);
				1513	out:
				1514	btrfs_free_path(path);
				1515	return ret;
				1516	}
				1517
				1518	static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
				1519	struct btrfs_delayed_ref_node *node,
				1520	struct btrfs_delayed_extent_op *extent_op,
				1521	int insert_reserved)
				1522	{
				1523	int ret = 0;
				1524	struct btrfs_delayed_data_ref *ref;
				1525	struct btrfs_key ins;
				1526	u64 parent = 0;
				1527	u64 ref_root = 0;
				1528	u64 flags = 0;
				1529
				1530	ins.objectid = node->bytenr;
				1531	ins.offset = node->num_bytes;
				1532	ins.type = BTRFS_EXTENT_ITEM_KEY;
				1533
				1534	ref = btrfs_delayed_node_to_data_ref(node);
				1535	trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
				1536
				1537	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
				1538	parent = ref->parent;
				1539	ref_root = ref->root;
				1540
				1541	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
				1542	if (extent_op)
				1543	flags \|= extent_op->flags_to_set;
				1544	ret = alloc_reserved_file_extent(trans, parent, ref_root,
				1545	flags, ref->objectid,
				1546	ref->offset, &ins,
				1547	node->ref_mod);
				1548	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
				1549	ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
				1550	ref->objectid, ref->offset,
				1551	node->ref_mod, extent_op);
				1552	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
				1553	ret = __btrfs_free_extent(trans, node, parent,
				1554	ref_root, ref->objectid,
				1555	ref->offset, node->ref_mod,
				1556	extent_op);
				1557	} else {
				1558	BUG();
				1559	}
				1560	return ret;
				1561	}
				1562
				1563	static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				1564	struct extent_buffer *leaf,
				1565	struct btrfs_extent_item *ei)
				1566	{
				1567	u64 flags = btrfs_extent_flags(leaf, ei);
				1568	if (extent_op->update_flags) {
				1569	flags \|= extent_op->flags_to_set;
				1570	btrfs_set_extent_flags(leaf, ei, flags);
				1571	}
				1572
				1573	if (extent_op->update_key) {
				1574	struct btrfs_tree_block_info *bi;
				1575	BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
				1576	bi = (struct btrfs_tree_block_info *)(ei + 1);
				1577	btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
				1578	}
				1579	}
				1580
				1581	static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
				1582	struct btrfs_delayed_ref_head *head,
				1583	struct btrfs_delayed_extent_op *extent_op)
				1584	{
				1585	struct btrfs_fs_info *fs_info = trans->fs_info;
				1586	struct btrfs_key key;
				1587	struct btrfs_path *path;
				1588	struct btrfs_extent_item *ei;
				1589	struct extent_buffer *leaf;
				1590	u32 item_size;
				1591	int ret;
				1592	int err = 0;
				1593	int metadata = !extent_op->is_data;
				1594
				1595	if (TRANS_ABORTED(trans))
				1596	return 0;
				1597
				1598	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
				1599	metadata = 0;
				1600
				1601	path = btrfs_alloc_path();
				1602	if (!path)
				1603	return -ENOMEM;
				1604
				1605	key.objectid = head->bytenr;
				1606
				1607	if (metadata) {
				1608	key.type = BTRFS_METADATA_ITEM_KEY;
				1609	key.offset = extent_op->level;
				1610	} else {
				1611	key.type = BTRFS_EXTENT_ITEM_KEY;
				1612	key.offset = head->num_bytes;
				1613	}
				1614
				1615	again:
				1616	path->reada = READA_FORWARD;
				1617	path->leave_spinning = 1;
				1618	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
				1619	if (ret < 0) {
				1620	err = ret;
				1621	goto out;
				1622	}
				1623	if (ret > 0) {
				1624	if (metadata) {
				1625	if (path->slots[0] > 0) {
				1626	path->slots[0]--;
				1627	btrfs_item_key_to_cpu(path->nodes[0], &key,
				1628	path->slots[0]);
				1629	if (key.objectid == head->bytenr &&
				1630	key.type == BTRFS_EXTENT_ITEM_KEY &&
				1631	key.offset == head->num_bytes)
				1632	ret = 0;
				1633	}
				1634	if (ret > 0) {
				1635	btrfs_release_path(path);
				1636	metadata = 0;
				1637
				1638	key.objectid = head->bytenr;
				1639	key.offset = head->num_bytes;
				1640	key.type = BTRFS_EXTENT_ITEM_KEY;
				1641	goto again;
				1642	}
				1643	} else {
				1644	err = -EIO;
				1645	goto out;
				1646	}
				1647	}
				1648
				1649	leaf = path->nodes[0];
				1650	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
				1651
				1652	if (unlikely(item_size < sizeof(*ei))) {
				1653	err = -EINVAL;
				1654	btrfs_print_v0_err(fs_info);
				1655	btrfs_abort_transaction(trans, err);
				1656	goto out;
				1657	}
				1658
				1659	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
				1660	__run_delayed_extent_op(extent_op, leaf, ei);
				1661
				1662	btrfs_mark_buffer_dirty(leaf);
				1663	out:
				1664	btrfs_free_path(path);
				1665	return err;
				1666	}
				1667
				1668	static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
				1669	struct btrfs_delayed_ref_node *node,
				1670	struct btrfs_delayed_extent_op *extent_op,
				1671	int insert_reserved)
				1672	{
				1673	int ret = 0;
				1674	struct btrfs_delayed_tree_ref *ref;
				1675	u64 parent = 0;
				1676	u64 ref_root = 0;
				1677
				1678	ref = btrfs_delayed_node_to_tree_ref(node);
				1679	trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
				1680
				1681	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
				1682	parent = ref->parent;
				1683	ref_root = ref->root;
				1684
				1685	if (unlikely(node->ref_mod != 1)) {
				1686	btrfs_err(trans->fs_info,
				1687	"btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu",
				1688	node->bytenr, node->ref_mod, node->action, ref_root,
				1689	parent);
				1690	return -EUCLEAN;
				1691	}
				1692	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
				1693	BUG_ON(!extent_op \|\| !extent_op->update_flags);
				1694	ret = alloc_reserved_tree_block(trans, node, extent_op);
				1695	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
				1696	ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
				1697	ref->level, 0, 1, extent_op);
				1698	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
				1699	ret = __btrfs_free_extent(trans, node, parent, ref_root,
				1700	ref->level, 0, 1, extent_op);
				1701	} else {
				1702	BUG();
				1703	}
				1704	return ret;
				1705	}
				1706
				1707	/* helper function to actually process a single delayed ref entry */
				1708	static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
				1709	struct btrfs_delayed_ref_node *node,
				1710	struct btrfs_delayed_extent_op *extent_op,
				1711	int insert_reserved)
				1712	{
				1713	int ret = 0;
				1714
				1715	if (TRANS_ABORTED(trans)) {
				1716	if (insert_reserved)
				1717	btrfs_pin_extent(trans->fs_info, node->bytenr,
				1718	node->num_bytes, 1);
				1719	return 0;
				1720	}
				1721
				1722	if (node->type == BTRFS_TREE_BLOCK_REF_KEY \|\|
				1723	node->type == BTRFS_SHARED_BLOCK_REF_KEY)
				1724	ret = run_delayed_tree_ref(trans, node, extent_op,
				1725	insert_reserved);
				1726	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY \|\|
				1727	node->type == BTRFS_SHARED_DATA_REF_KEY)
				1728	ret = run_delayed_data_ref(trans, node, extent_op,
				1729	insert_reserved);
				1730	else
				1731	BUG();
				1732	if (ret && insert_reserved)
				1733	btrfs_pin_extent(trans->fs_info, node->bytenr,
				1734	node->num_bytes, 1);
				1735	return ret;
				1736	}
				1737
				1738	static inline struct btrfs_delayed_ref_node *
				1739	select_delayed_ref(struct btrfs_delayed_ref_head *head)
				1740	{
				1741	struct btrfs_delayed_ref_node *ref;
				1742
				1743	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
				1744	return NULL;
				1745
				1746	/*
				1747	* Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
				1748	* This is to prevent a ref count from going down to zero, which deletes
				1749	* the extent item from the extent tree, when there still are references
				1750	* to add, which would fail because they would not find the extent item.
				1751	*/
				1752	if (!list_empty(&head->ref_add_list))
				1753	return list_first_entry(&head->ref_add_list,
				1754	struct btrfs_delayed_ref_node, add_list);
				1755
				1756	ref = rb_entry(rb_first_cached(&head->ref_tree),
				1757	struct btrfs_delayed_ref_node, ref_node);
				1758	ASSERT(list_empty(&ref->add_list));
				1759	return ref;
				1760	}
				1761
				1762	static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
				1763	struct btrfs_delayed_ref_head *head)
				1764	{
				1765	spin_lock(&delayed_refs->lock);
				1766	head->processing = 0;
				1767	delayed_refs->num_heads_ready++;
				1768	spin_unlock(&delayed_refs->lock);
				1769	btrfs_delayed_ref_unlock(head);
				1770	}
				1771
				1772	static struct btrfs_delayed_extent_op *cleanup_extent_op(
				1773	struct btrfs_delayed_ref_head *head)
				1774	{
				1775	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
				1776
				1777	if (!extent_op)
				1778	return NULL;
				1779
				1780	if (head->must_insert_reserved) {
				1781	head->extent_op = NULL;
				1782	btrfs_free_delayed_extent_op(extent_op);
				1783	return NULL;
				1784	}
				1785	return extent_op;
				1786	}
				1787
				1788	static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
				1789	struct btrfs_delayed_ref_head *head)
				1790	{
				1791	struct btrfs_delayed_extent_op *extent_op;
				1792	int ret;
				1793
				1794	extent_op = cleanup_extent_op(head);
				1795	if (!extent_op)
				1796	return 0;
				1797	head->extent_op = NULL;
				1798	spin_unlock(&head->lock);
				1799	ret = run_delayed_extent_op(trans, head, extent_op);
				1800	btrfs_free_delayed_extent_op(extent_op);
				1801	return ret ? ret : 1;
				1802	}
				1803
				1804	void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
				1805	struct btrfs_delayed_ref_root *delayed_refs,
				1806	struct btrfs_delayed_ref_head *head)
				1807	{
				1808	int nr_items = 1; /* Dropping this ref head update. */
				1809
				1810	if (head->total_ref_mod < 0) {
				1811	struct btrfs_space_info *space_info;
				1812	u64 flags;
				1813
				1814	if (head->is_data)
				1815	flags = BTRFS_BLOCK_GROUP_DATA;
				1816	else if (head->is_system)
				1817	flags = BTRFS_BLOCK_GROUP_SYSTEM;
				1818	else
				1819	flags = BTRFS_BLOCK_GROUP_METADATA;
				1820	space_info = btrfs_find_space_info(fs_info, flags);
				1821	ASSERT(space_info);
				1822	percpu_counter_add_batch(&space_info->total_bytes_pinned,
				1823	-head->num_bytes,
				1824	BTRFS_TOTAL_BYTES_PINNED_BATCH);
				1825
				1826	/*
				1827	* We had csum deletions accounted for in our delayed refs rsv,
				1828	* we need to drop the csum leaves for this update from our
				1829	* delayed_refs_rsv.
				1830	*/
				1831	if (head->is_data) {
				1832	spin_lock(&delayed_refs->lock);
				1833	delayed_refs->pending_csums -= head->num_bytes;
				1834	spin_unlock(&delayed_refs->lock);
				1835	nr_items += btrfs_csum_bytes_to_leaves(fs_info,
				1836	head->num_bytes);
				1837	}
				1838	}
				1839
				1840	btrfs_delayed_refs_rsv_release(fs_info, nr_items);
				1841	}
				1842
				1843	static int cleanup_ref_head(struct btrfs_trans_handle *trans,
				1844	struct btrfs_delayed_ref_head *head)
				1845	{
				1846
				1847	struct btrfs_fs_info *fs_info = trans->fs_info;
				1848	struct btrfs_delayed_ref_root *delayed_refs;
				1849	int ret;
				1850
				1851	delayed_refs = &trans->transaction->delayed_refs;
				1852
				1853	ret = run_and_cleanup_extent_op(trans, head);
				1854	if (ret < 0) {
				1855	unselect_delayed_ref_head(delayed_refs, head);
				1856	btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
				1857	return ret;
				1858	} else if (ret) {
				1859	return ret;
				1860	}
				1861
				1862	/*
				1863	* Need to drop our head ref lock and re-acquire the delayed ref lock
				1864	* and then re-check to make sure nobody got added.
				1865	*/
				1866	spin_unlock(&head->lock);
				1867	spin_lock(&delayed_refs->lock);
				1868	spin_lock(&head->lock);
				1869	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) \|\| head->extent_op) {
				1870	spin_unlock(&head->lock);
				1871	spin_unlock(&delayed_refs->lock);
				1872	return 1;
				1873	}
				1874	btrfs_delete_ref_head(delayed_refs, head);
				1875	spin_unlock(&head->lock);
				1876	spin_unlock(&delayed_refs->lock);
				1877
				1878	if (head->must_insert_reserved) {
				1879	btrfs_pin_extent(fs_info, head->bytenr,
				1880	head->num_bytes, 1);
				1881	if (head->is_data) {
				1882	ret = btrfs_del_csums(trans, fs_info->csum_root,
				1883	head->bytenr, head->num_bytes);
				1884	}
				1885	}
				1886
				1887	btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
				1888
				1889	trace_run_delayed_ref_head(fs_info, head, 0);
				1890	btrfs_delayed_ref_unlock(head);
				1891	btrfs_put_delayed_ref_head(head);
				1892	return ret;
				1893	}
				1894
				1895	static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
				1896	struct btrfs_trans_handle *trans)
				1897	{
				1898	struct btrfs_delayed_ref_root *delayed_refs =
				1899	&trans->transaction->delayed_refs;
				1900	struct btrfs_delayed_ref_head *head = NULL;
				1901	int ret;
				1902
				1903	spin_lock(&delayed_refs->lock);
				1904	head = btrfs_select_ref_head(delayed_refs);
				1905	if (!head) {
				1906	spin_unlock(&delayed_refs->lock);
				1907	return head;
				1908	}
				1909
				1910	/*
				1911	* Grab the lock that says we are going to process all the refs for
				1912	* this head
				1913	*/
				1914	ret = btrfs_delayed_ref_lock(delayed_refs, head);
				1915	spin_unlock(&delayed_refs->lock);
				1916
				1917	/*
				1918	* We may have dropped the spin lock to get the head mutex lock, and
				1919	* that might have given someone else time to free the head. If that's
				1920	* true, it has been removed from our list and we can move on.
				1921	*/
				1922	if (ret == -EAGAIN)
				1923	head = ERR_PTR(-EAGAIN);
				1924
				1925	return head;
				1926	}
				1927
				1928	static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
				1929	struct btrfs_delayed_ref_head *locked_ref,
				1930	unsigned long *run_refs)
				1931	{
				1932	struct btrfs_fs_info *fs_info = trans->fs_info;
				1933	struct btrfs_delayed_ref_root *delayed_refs;
				1934	struct btrfs_delayed_extent_op *extent_op;
				1935	struct btrfs_delayed_ref_node *ref;
				1936	int must_insert_reserved = 0;
				1937	int ret;
				1938
				1939	delayed_refs = &trans->transaction->delayed_refs;
				1940
				1941	lockdep_assert_held(&locked_ref->mutex);
				1942	lockdep_assert_held(&locked_ref->lock);
				1943
				1944	while ((ref = select_delayed_ref(locked_ref))) {
				1945	if (ref->seq &&
				1946	btrfs_check_delayed_seq(fs_info, ref->seq)) {
				1947	spin_unlock(&locked_ref->lock);
				1948	unselect_delayed_ref_head(delayed_refs, locked_ref);
				1949	return -EAGAIN;
				1950	}
				1951
				1952	(*run_refs)++;
				1953	ref->in_tree = 0;
				1954	rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
				1955	RB_CLEAR_NODE(&ref->ref_node);
				1956	if (!list_empty(&ref->add_list))
				1957	list_del(&ref->add_list);
				1958	/*
				1959	* When we play the delayed ref, also correct the ref_mod on
				1960	* head
				1961	*/
				1962	switch (ref->action) {
				1963	case BTRFS_ADD_DELAYED_REF:
				1964	case BTRFS_ADD_DELAYED_EXTENT:
				1965	locked_ref->ref_mod -= ref->ref_mod;
				1966	break;
				1967	case BTRFS_DROP_DELAYED_REF:
				1968	locked_ref->ref_mod += ref->ref_mod;
				1969	break;
				1970	default:
				1971	WARN_ON(1);
				1972	}
				1973	atomic_dec(&delayed_refs->num_entries);
				1974
				1975	/*
				1976	* Record the must_insert_reserved flag before we drop the
				1977	* spin lock.
				1978	*/
				1979	must_insert_reserved = locked_ref->must_insert_reserved;
				1980	locked_ref->must_insert_reserved = 0;
				1981
				1982	extent_op = locked_ref->extent_op;
				1983	locked_ref->extent_op = NULL;
				1984	spin_unlock(&locked_ref->lock);
				1985
				1986	ret = run_one_delayed_ref(trans, ref, extent_op,
				1987	must_insert_reserved);
				1988
				1989	btrfs_free_delayed_extent_op(extent_op);
				1990	if (ret) {
				1991	unselect_delayed_ref_head(delayed_refs, locked_ref);
				1992	btrfs_put_delayed_ref(ref);
				1993	btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
				1994	ret);
				1995	return ret;
				1996	}
				1997
				1998	btrfs_put_delayed_ref(ref);
				1999	cond_resched();
				2000
				2001	spin_lock(&locked_ref->lock);
				2002	btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
				2003	}
				2004
				2005	return 0;
				2006	}
				2007
				2008	/*
				2009	* Returns 0 on success or if called with an already aborted transaction.
				2010	* Returns -ENOMEM or -EIO on failure and will abort the transaction.
				2011	*/
				2012	static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
				2013	unsigned long nr)
				2014	{
				2015	struct btrfs_fs_info *fs_info = trans->fs_info;
				2016	struct btrfs_delayed_ref_root *delayed_refs;
				2017	struct btrfs_delayed_ref_head *locked_ref = NULL;
				2018	ktime_t start = ktime_get();
				2019	int ret;
				2020	unsigned long count = 0;
				2021	unsigned long actual_count = 0;
				2022
				2023	delayed_refs = &trans->transaction->delayed_refs;
				2024	do {
				2025	if (!locked_ref) {
				2026	locked_ref = btrfs_obtain_ref_head(trans);
				2027	if (IS_ERR_OR_NULL(locked_ref)) {
				2028	if (PTR_ERR(locked_ref) == -EAGAIN) {
				2029	continue;
				2030	} else {
				2031	break;
				2032	}
				2033	}
				2034	count++;
				2035	}
				2036	/*
				2037	* We need to try and merge add/drops of the same ref since we
				2038	* can run into issues with relocate dropping the implicit ref
				2039	* and then it being added back again before the drop can
				2040	* finish. If we merged anything we need to re-loop so we can
				2041	* get a good ref.
				2042	* Or we can get node references of the same type that weren't
				2043	* merged when created due to bumps in the tree mod seq, and
				2044	* we need to merge them to prevent adding an inline extent
				2045	* backref before dropping it (triggering a BUG_ON at
				2046	* insert_inline_extent_backref()).
				2047	*/
				2048	spin_lock(&locked_ref->lock);
				2049	btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
				2050
				2051	ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
				2052	&actual_count);
				2053	if (ret < 0 && ret != -EAGAIN) {
				2054	/*
				2055	* Error, btrfs_run_delayed_refs_for_head already
				2056	* unlocked everything so just bail out
				2057	*/
				2058	return ret;
				2059	} else if (!ret) {
				2060	/*
				2061	* Success, perform the usual cleanup of a processed
				2062	* head
				2063	*/
				2064	ret = cleanup_ref_head(trans, locked_ref);
				2065	if (ret > 0 ) {
				2066	/* We dropped our lock, we need to loop. */
				2067	ret = 0;
				2068	continue;
				2069	} else if (ret) {
				2070	return ret;
				2071	}
				2072	}
				2073
				2074	/*
				2075	* Either success case or btrfs_run_delayed_refs_for_head
				2076	* returned -EAGAIN, meaning we need to select another head
				2077	*/
				2078
				2079	locked_ref = NULL;
				2080	cond_resched();
				2081	} while ((nr != -1 && count < nr) \|\| locked_ref);
				2082
				2083	/*
				2084	* We don't want to include ref heads since we can have empty ref heads
				2085	* and those will drastically skew our runtime down since we just do
				2086	* accounting, no actual extent tree updates.
				2087	*/
				2088	if (actual_count > 0) {
				2089	u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
				2090	u64 avg;
				2091
				2092	/*
				2093	* We weigh the current average higher than our current runtime
				2094	* to avoid large swings in the average.
				2095	*/
				2096	spin_lock(&delayed_refs->lock);
				2097	avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
				2098	fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */
				2099	spin_unlock(&delayed_refs->lock);
				2100	}
				2101	return 0;
				2102	}
				2103
				2104	#ifdef SCRAMBLE_DELAYED_REFS
				2105	/*
				2106	* Normally delayed refs get processed in ascending bytenr order. This
				2107	* correlates in most cases to the order added. To expose dependencies on this
				2108	* order, we start to process the tree in the middle instead of the beginning
				2109	*/
				2110	static u64 find_middle(struct rb_root *root)
				2111	{
				2112	struct rb_node *n = root->rb_node;
				2113	struct btrfs_delayed_ref_node *entry;
				2114	int alt = 1;
				2115	u64 middle;
				2116	u64 first = 0, last = 0;
				2117
				2118	n = rb_first(root);
				2119	if (n) {
				2120	entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
				2121	first = entry->bytenr;
				2122	}
				2123	n = rb_last(root);
				2124	if (n) {
				2125	entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
				2126	last = entry->bytenr;
				2127	}
				2128	n = root->rb_node;
				2129
				2130	while (n) {
				2131	entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
				2132	WARN_ON(!entry->in_tree);
				2133
				2134	middle = entry->bytenr;
				2135
				2136	if (alt)
				2137	n = n->rb_left;
				2138	else
				2139	n = n->rb_right;
				2140
				2141	alt = 1 - alt;
				2142	}
				2143	return middle;
				2144	}
				2145	#endif
				2146
				2147	static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
				2148	{
				2149	u64 num_bytes;
				2150
				2151	num_bytes = heads * (sizeof(struct btrfs_extent_item) +
				2152	sizeof(struct btrfs_extent_inline_ref));
				2153	if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
				2154	num_bytes += heads * sizeof(struct btrfs_tree_block_info);
				2155
				2156	/*
				2157	* We don't ever fill up leaves all the way so multiply by 2 just to be
				2158	* closer to what we're really going to want to use.
				2159	*/
				2160	return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
				2161	}
				2162
				2163	/*
				2164	* Takes the number of bytes to be csumm'ed and figures out how many leaves it
				2165	* would require to store the csums for that many bytes.
				2166	*/
				2167	u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
				2168	{
				2169	u64 csum_size;
				2170	u64 num_csums_per_leaf;
				2171	u64 num_csums;
				2172
				2173	csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
				2174	num_csums_per_leaf = div64_u64(csum_size,
				2175	(u64)btrfs_super_csum_size(fs_info->super_copy));
				2176	num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
				2177	num_csums += num_csums_per_leaf - 1;
				2178	num_csums = div64_u64(num_csums, num_csums_per_leaf);
				2179	return num_csums;
				2180	}
				2181
				2182	/*
				2183	* this starts processing the delayed reference count updates and
				2184	* extent insertions we have queued up so far. count can be
				2185	* 0, which means to process everything in the tree at the start
				2186	* of the run (but not newly added entries), or it can be some target
				2187	* number you'd like to process.
				2188	*
				2189	* Returns 0 on success or if called with an aborted transaction
				2190	* Returns <0 on error and aborts the transaction
				2191	*/
				2192	int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
				2193	unsigned long count)
				2194	{
				2195	struct btrfs_fs_info *fs_info = trans->fs_info;
				2196	struct rb_node *node;
				2197	struct btrfs_delayed_ref_root *delayed_refs;
				2198	struct btrfs_delayed_ref_head *head;
				2199	int ret;
				2200	int run_all = count == (unsigned long)-1;
				2201
				2202	/* We'll clean this up in btrfs_cleanup_transaction */
				2203	if (TRANS_ABORTED(trans))
				2204	return 0;
				2205
				2206	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
				2207	return 0;
				2208
				2209	delayed_refs = &trans->transaction->delayed_refs;
				2210	if (count == 0)
				2211	count = atomic_read(&delayed_refs->num_entries) * 2;
				2212
				2213	again:
				2214	#ifdef SCRAMBLE_DELAYED_REFS
				2215	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
				2216	#endif
				2217	ret = __btrfs_run_delayed_refs(trans, count);
				2218	if (ret < 0) {
				2219	btrfs_abort_transaction(trans, ret);
				2220	return ret;
				2221	}
				2222
				2223	if (run_all) {
				2224	btrfs_create_pending_block_groups(trans);
				2225
				2226	spin_lock(&delayed_refs->lock);
				2227	node = rb_first_cached(&delayed_refs->href_root);
				2228	if (!node) {
				2229	spin_unlock(&delayed_refs->lock);
				2230	goto out;
				2231	}
				2232	head = rb_entry(node, struct btrfs_delayed_ref_head,
				2233	href_node);
				2234	refcount_inc(&head->refs);
				2235	spin_unlock(&delayed_refs->lock);
				2236
				2237	/* Mutex was contended, block until it's released and retry. */
				2238	mutex_lock(&head->mutex);
				2239	mutex_unlock(&head->mutex);
				2240
				2241	btrfs_put_delayed_ref_head(head);
				2242	cond_resched();
				2243	goto again;
				2244	}
				2245	out:
				2246	return 0;
				2247	}
				2248
				2249	int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
				2250	u64 bytenr, u64 num_bytes, u64 flags,
				2251	int level, int is_data)
				2252	{
				2253	struct btrfs_delayed_extent_op *extent_op;
				2254	int ret;
				2255
				2256	extent_op = btrfs_alloc_delayed_extent_op();
				2257	if (!extent_op)
				2258	return -ENOMEM;
				2259
				2260	extent_op->flags_to_set = flags;
				2261	extent_op->update_flags = true;
				2262	extent_op->update_key = false;
				2263	extent_op->is_data = is_data ? true : false;
				2264	extent_op->level = level;
				2265
				2266	ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
				2267	if (ret)
				2268	btrfs_free_delayed_extent_op(extent_op);
				2269	return ret;
				2270	}
				2271
				2272	static noinline int check_delayed_ref(struct btrfs_root *root,
				2273	struct btrfs_path *path,
				2274	u64 objectid, u64 offset, u64 bytenr)
				2275	{
				2276	struct btrfs_delayed_ref_head *head;
				2277	struct btrfs_delayed_ref_node *ref;
				2278	struct btrfs_delayed_data_ref *data_ref;
				2279	struct btrfs_delayed_ref_root *delayed_refs;
				2280	struct btrfs_transaction *cur_trans;
				2281	struct rb_node *node;
				2282	int ret = 0;
				2283
				2284	spin_lock(&root->fs_info->trans_lock);
				2285	cur_trans = root->fs_info->running_transaction;
				2286	if (cur_trans)
				2287	refcount_inc(&cur_trans->use_count);
				2288	spin_unlock(&root->fs_info->trans_lock);
				2289	if (!cur_trans)
				2290	return 0;
				2291
				2292	delayed_refs = &cur_trans->delayed_refs;
				2293	spin_lock(&delayed_refs->lock);
				2294	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
				2295	if (!head) {
				2296	spin_unlock(&delayed_refs->lock);
				2297	btrfs_put_transaction(cur_trans);
				2298	return 0;
				2299	}
				2300
				2301	if (!mutex_trylock(&head->mutex)) {
				2302	refcount_inc(&head->refs);
				2303	spin_unlock(&delayed_refs->lock);
				2304
				2305	btrfs_release_path(path);
				2306
				2307	/*
				2308	* Mutex was contended, block until it's released and let
				2309	* caller try again
				2310	*/
				2311	mutex_lock(&head->mutex);
				2312	mutex_unlock(&head->mutex);
				2313	btrfs_put_delayed_ref_head(head);
				2314	btrfs_put_transaction(cur_trans);
				2315	return -EAGAIN;
				2316	}
				2317	spin_unlock(&delayed_refs->lock);
				2318
				2319	spin_lock(&head->lock);
				2320	/*
				2321	* XXX: We should replace this with a proper search function in the
				2322	* future.
				2323	*/
				2324	for (node = rb_first_cached(&head->ref_tree); node;
				2325	node = rb_next(node)) {
				2326	ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
				2327	/* If it's a shared ref we know a cross reference exists */
				2328	if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
				2329	ret = 1;
				2330	break;
				2331	}
				2332
				2333	data_ref = btrfs_delayed_node_to_data_ref(ref);
				2334
				2335	/*
				2336	* If our ref doesn't match the one we're currently looking at
				2337	* then we have a cross reference.
				2338	*/
				2339	if (data_ref->root != root->root_key.objectid \|\|
				2340	data_ref->objectid != objectid \|\|
				2341	data_ref->offset != offset) {
				2342	ret = 1;
				2343	break;
				2344	}
				2345	}
				2346	spin_unlock(&head->lock);
				2347	mutex_unlock(&head->mutex);
				2348	btrfs_put_transaction(cur_trans);
				2349	return ret;
				2350	}
				2351
				2352	static noinline int check_committed_ref(struct btrfs_root *root,
				2353	struct btrfs_path *path,
				2354	u64 objectid, u64 offset, u64 bytenr,
				2355	bool strict)
				2356	{
				2357	struct btrfs_fs_info *fs_info = root->fs_info;
				2358	struct btrfs_root *extent_root = fs_info->extent_root;
				2359	struct extent_buffer *leaf;
				2360	struct btrfs_extent_data_ref *ref;
				2361	struct btrfs_extent_inline_ref *iref;
				2362	struct btrfs_extent_item *ei;
				2363	struct btrfs_key key;
				2364	u32 item_size;
				2365	int type;
				2366	int ret;
				2367
				2368	key.objectid = bytenr;
				2369	key.offset = (u64)-1;
				2370	key.type = BTRFS_EXTENT_ITEM_KEY;
				2371
				2372	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
				2373	if (ret < 0)
				2374	goto out;
				2375	BUG_ON(ret == 0); /* Corruption */
				2376
				2377	ret = -ENOENT;
				2378	if (path->slots[0] == 0)
				2379	goto out;
				2380
				2381	path->slots[0]--;
				2382	leaf = path->nodes[0];
				2383	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				2384
				2385	if (key.objectid != bytenr \|\| key.type != BTRFS_EXTENT_ITEM_KEY)
				2386	goto out;
				2387
				2388	ret = 1;
				2389	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
				2390	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
				2391
				2392	/* If extent item has more than 1 inline ref then it's shared */
				2393	if (item_size != sizeof(*ei) +
				2394	btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
				2395	goto out;
				2396
				2397	/*
				2398	* If extent created before last snapshot => it's shared unless the
				2399	* snapshot has been deleted. Use the heuristic if strict is false.
				2400	*/
				2401	if (!strict &&
				2402	(btrfs_extent_generation(leaf, ei) <=
				2403	btrfs_root_last_snapshot(&root->root_item)))
				2404	goto out;
				2405
				2406	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
				2407
				2408	/* If this extent has SHARED_DATA_REF then it's shared */
				2409	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
				2410	if (type != BTRFS_EXTENT_DATA_REF_KEY)
				2411	goto out;
				2412
				2413	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
				2414	if (btrfs_extent_refs(leaf, ei) !=
				2415	btrfs_extent_data_ref_count(leaf, ref) \|\|
				2416	btrfs_extent_data_ref_root(leaf, ref) !=
				2417	root->root_key.objectid \|\|
				2418	btrfs_extent_data_ref_objectid(leaf, ref) != objectid \|\|
				2419	btrfs_extent_data_ref_offset(leaf, ref) != offset)
				2420	goto out;
				2421
				2422	ret = 0;
				2423	out:
				2424	return ret;
				2425	}
				2426
				2427	int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
				2428	u64 bytenr, bool strict)
				2429	{
				2430	struct btrfs_path *path;
				2431	int ret;
				2432
				2433	path = btrfs_alloc_path();
				2434	if (!path)
				2435	return -ENOMEM;
				2436
				2437	do {
				2438	ret = check_committed_ref(root, path, objectid,
				2439	offset, bytenr, strict);
				2440	if (ret && ret != -ENOENT)
				2441	goto out;
				2442
				2443	ret = check_delayed_ref(root, path, objectid, offset, bytenr);
				2444	} while (ret == -EAGAIN);
				2445
				2446	out:
				2447	btrfs_free_path(path);
				2448	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
				2449	WARN_ON(ret > 0);
				2450	return ret;
				2451	}
				2452
				2453	static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
				2454	struct btrfs_root *root,
				2455	struct extent_buffer *buf,
				2456	int full_backref, int inc)
				2457	{
				2458	struct btrfs_fs_info *fs_info = root->fs_info;
				2459	u64 bytenr;
				2460	u64 num_bytes;
				2461	u64 parent;
				2462	u64 ref_root;
				2463	u32 nritems;
				2464	struct btrfs_key key;
				2465	struct btrfs_file_extent_item *fi;
				2466	struct btrfs_ref generic_ref = { 0 };
				2467	bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
				2468	int i;
				2469	int action;
				2470	int level;
				2471	int ret = 0;
				2472
				2473	if (btrfs_is_testing(fs_info))
				2474	return 0;
				2475
				2476	ref_root = btrfs_header_owner(buf);
				2477	nritems = btrfs_header_nritems(buf);
				2478	level = btrfs_header_level(buf);
				2479
				2480	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
				2481	return 0;
				2482
				2483	if (full_backref)
				2484	parent = buf->start;
				2485	else
				2486	parent = 0;
				2487	if (inc)
				2488	action = BTRFS_ADD_DELAYED_REF;
				2489	else
				2490	action = BTRFS_DROP_DELAYED_REF;
				2491
				2492	for (i = 0; i < nritems; i++) {
				2493	if (level == 0) {
				2494	btrfs_item_key_to_cpu(buf, &key, i);
				2495	if (key.type != BTRFS_EXTENT_DATA_KEY)
				2496	continue;
				2497	fi = btrfs_item_ptr(buf, i,
				2498	struct btrfs_file_extent_item);
				2499	if (btrfs_file_extent_type(buf, fi) ==
				2500	BTRFS_FILE_EXTENT_INLINE)
				2501	continue;
				2502	bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
				2503	if (bytenr == 0)
				2504	continue;
				2505
				2506	num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
				2507	key.offset -= btrfs_file_extent_offset(buf, fi);
				2508	btrfs_init_generic_ref(&generic_ref, action, bytenr,
				2509	num_bytes, parent);
				2510	generic_ref.real_root = root->root_key.objectid;
				2511	btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
				2512	key.offset);
				2513	generic_ref.skip_qgroup = for_reloc;
				2514	if (inc)
				2515	ret = btrfs_inc_extent_ref(trans, &generic_ref);
				2516	else
				2517	ret = btrfs_free_extent(trans, &generic_ref);
				2518	if (ret)
				2519	goto fail;
				2520	} else {
				2521	bytenr = btrfs_node_blockptr(buf, i);
				2522	num_bytes = fs_info->nodesize;
				2523	btrfs_init_generic_ref(&generic_ref, action, bytenr,
				2524	num_bytes, parent);
				2525	generic_ref.real_root = root->root_key.objectid;
				2526	btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
				2527	generic_ref.skip_qgroup = for_reloc;
				2528	if (inc)
				2529	ret = btrfs_inc_extent_ref(trans, &generic_ref);
				2530	else
				2531	ret = btrfs_free_extent(trans, &generic_ref);
				2532	if (ret)
				2533	goto fail;
				2534	}
				2535	}
				2536	return 0;
				2537	fail:
				2538	return ret;
				2539	}
				2540
				2541	int btrfs_inc_ref(struct btrfs_trans_handle trans, struct btrfs_root root,
				2542	struct extent_buffer *buf, int full_backref)
				2543	{
				2544	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
				2545	}
				2546
				2547	int btrfs_dec_ref(struct btrfs_trans_handle trans, struct btrfs_root root,
				2548	struct extent_buffer *buf, int full_backref)
				2549	{
				2550	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
				2551	}
				2552
				2553	int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
				2554	{
				2555	struct btrfs_block_group_cache *block_group;
				2556	int readonly = 0;
				2557
				2558	block_group = btrfs_lookup_block_group(fs_info, bytenr);
				2559	if (!block_group \|\| block_group->ro)
				2560	readonly = 1;
				2561	if (block_group)
				2562	btrfs_put_block_group(block_group);
				2563	return readonly;
				2564	}
				2565
				2566	static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
				2567	{
				2568	struct btrfs_fs_info *fs_info = root->fs_info;
				2569	u64 flags;
				2570	u64 ret;
				2571
				2572	if (data)
				2573	flags = BTRFS_BLOCK_GROUP_DATA;
				2574	else if (root == fs_info->chunk_root)
				2575	flags = BTRFS_BLOCK_GROUP_SYSTEM;
				2576	else
				2577	flags = BTRFS_BLOCK_GROUP_METADATA;
				2578
				2579	ret = btrfs_get_alloc_profile(fs_info, flags);
				2580	return ret;
				2581	}
				2582
				2583	static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
				2584	{
				2585	struct btrfs_block_group_cache *cache;
				2586	u64 bytenr;
				2587
				2588	spin_lock(&fs_info->block_group_cache_lock);
				2589	bytenr = fs_info->first_logical_byte;
				2590	spin_unlock(&fs_info->block_group_cache_lock);
				2591
				2592	if (bytenr < (u64)-1)
				2593	return bytenr;
				2594
				2595	cache = btrfs_lookup_first_block_group(fs_info, search_start);
				2596	if (!cache)
				2597	return 0;
				2598
				2599	bytenr = cache->key.objectid;
				2600	btrfs_put_block_group(cache);
				2601
				2602	return bytenr;
				2603	}
				2604
				2605	static int pin_down_extent(struct btrfs_block_group_cache *cache,
				2606	u64 bytenr, u64 num_bytes, int reserved)
				2607	{
				2608	struct btrfs_fs_info *fs_info = cache->fs_info;
				2609
				2610	spin_lock(&cache->space_info->lock);
				2611	spin_lock(&cache->lock);
				2612	cache->pinned += num_bytes;
				2613	btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
				2614	num_bytes);
				2615	if (reserved) {
				2616	cache->reserved -= num_bytes;
				2617	cache->space_info->bytes_reserved -= num_bytes;
				2618	}
				2619	spin_unlock(&cache->lock);
				2620	spin_unlock(&cache->space_info->lock);
				2621
				2622	percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
				2623	num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
				2624	set_extent_dirty(fs_info->pinned_extents, bytenr,
				2625	bytenr + num_bytes - 1, GFP_NOFS \| __GFP_NOFAIL);
				2626	return 0;
				2627	}
				2628
				2629	/*
				2630	* this function must be called within transaction
				2631	*/
				2632	int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
				2633	u64 bytenr, u64 num_bytes, int reserved)
				2634	{
				2635	struct btrfs_block_group_cache *cache;
				2636
				2637	cache = btrfs_lookup_block_group(fs_info, bytenr);
				2638	BUG_ON(!cache); /* Logic error */
				2639
				2640	pin_down_extent(cache, bytenr, num_bytes, reserved);
				2641
				2642	btrfs_put_block_group(cache);
				2643	return 0;
				2644	}
				2645
				2646	/*
				2647	* this function must be called within transaction
				2648	*/
				2649	int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
				2650	u64 bytenr, u64 num_bytes)
				2651	{
				2652	struct btrfs_block_group_cache *cache;
				2653	int ret;
				2654
				2655	cache = btrfs_lookup_block_group(fs_info, bytenr);
				2656	if (!cache)
				2657	return -EINVAL;
				2658
				2659	/*
				2660	* pull in the free space cache (if any) so that our pin
				2661	* removes the free space from the cache. We have load_only set
				2662	* to one because the slow code to read in the free extents does check
				2663	* the pinned extents.
				2664	*/
				2665	btrfs_cache_block_group(cache, 1);
				2666
				2667	pin_down_extent(cache, bytenr, num_bytes, 0);
				2668
				2669	/* remove us from the free space cache (if we're there at all) */
				2670	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
				2671	btrfs_put_block_group(cache);
				2672	return ret;
				2673	}
				2674
				2675	static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
				2676	u64 start, u64 num_bytes)
				2677	{
				2678	int ret;
				2679	struct btrfs_block_group_cache *block_group;
				2680	struct btrfs_caching_control *caching_ctl;
				2681
				2682	block_group = btrfs_lookup_block_group(fs_info, start);
				2683	if (!block_group)
				2684	return -EINVAL;
				2685
				2686	btrfs_cache_block_group(block_group, 0);
				2687	caching_ctl = btrfs_get_caching_control(block_group);
				2688
				2689	if (!caching_ctl) {
				2690	/* Logic error */
				2691	BUG_ON(!btrfs_block_group_cache_done(block_group));
				2692	ret = btrfs_remove_free_space(block_group, start, num_bytes);
				2693	} else {
				2694	mutex_lock(&caching_ctl->mutex);
				2695
				2696	if (start >= caching_ctl->progress) {
				2697	ret = btrfs_add_excluded_extent(fs_info, start,
				2698	num_bytes);
				2699	} else if (start + num_bytes <= caching_ctl->progress) {
				2700	ret = btrfs_remove_free_space(block_group,
				2701	start, num_bytes);
				2702	} else {
				2703	num_bytes = caching_ctl->progress - start;
				2704	ret = btrfs_remove_free_space(block_group,
				2705	start, num_bytes);
				2706	if (ret)
				2707	goto out_lock;
				2708
				2709	num_bytes = (start + num_bytes) -
				2710	caching_ctl->progress;
				2711	start = caching_ctl->progress;
				2712	ret = btrfs_add_excluded_extent(fs_info, start,
				2713	num_bytes);
				2714	}
				2715	out_lock:
				2716	mutex_unlock(&caching_ctl->mutex);
				2717	btrfs_put_caching_control(caching_ctl);
				2718	}
				2719	btrfs_put_block_group(block_group);
				2720	return ret;
				2721	}
				2722
				2723	int btrfs_exclude_logged_extents(struct extent_buffer *eb)
				2724	{
				2725	struct btrfs_fs_info *fs_info = eb->fs_info;
				2726	struct btrfs_file_extent_item *item;
				2727	struct btrfs_key key;
				2728	int found_type;
				2729	int i;
				2730	int ret = 0;
				2731
				2732	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
				2733	return 0;
				2734
				2735	for (i = 0; i < btrfs_header_nritems(eb); i++) {
				2736	btrfs_item_key_to_cpu(eb, &key, i);
				2737	if (key.type != BTRFS_EXTENT_DATA_KEY)
				2738	continue;
				2739	item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
				2740	found_type = btrfs_file_extent_type(eb, item);
				2741	if (found_type == BTRFS_FILE_EXTENT_INLINE)
				2742	continue;
				2743	if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
				2744	continue;
				2745	key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
				2746	key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
				2747	ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
				2748	if (ret)
				2749	break;
				2750	}
				2751
				2752	return ret;
				2753	}
				2754
				2755	static void
				2756	btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
				2757	{
				2758	atomic_inc(&bg->reservations);
				2759	}
				2760
				2761	void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
				2762	{
				2763	struct btrfs_caching_control *next;
				2764	struct btrfs_caching_control *caching_ctl;
				2765	struct btrfs_block_group_cache *cache;
				2766
				2767	down_write(&fs_info->commit_root_sem);
				2768
				2769	list_for_each_entry_safe(caching_ctl, next,
				2770	&fs_info->caching_block_groups, list) {
				2771	cache = caching_ctl->block_group;
				2772	if (btrfs_block_group_cache_done(cache)) {
				2773	cache->last_byte_to_unpin = (u64)-1;
				2774	list_del_init(&caching_ctl->list);
				2775	btrfs_put_caching_control(caching_ctl);
				2776	} else {
				2777	cache->last_byte_to_unpin = caching_ctl->progress;
				2778	}
				2779	}
				2780
				2781	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
				2782	fs_info->pinned_extents = &fs_info->freed_extents[1];
				2783	else
				2784	fs_info->pinned_extents = &fs_info->freed_extents[0];
				2785
				2786	up_write(&fs_info->commit_root_sem);
				2787
				2788	btrfs_update_global_block_rsv(fs_info);
				2789	}
				2790
				2791	/*
				2792	* Returns the free cluster for the given space info and sets empty_cluster to
				2793	* what it should be based on the mount options.
				2794	*/
				2795	static struct btrfs_free_cluster *
				2796	fetch_cluster_info(struct btrfs_fs_info *fs_info,
				2797	struct btrfs_space_info space_info, u64 empty_cluster)
				2798	{
				2799	struct btrfs_free_cluster *ret = NULL;
				2800
				2801	*empty_cluster = 0;
				2802	if (btrfs_mixed_space_info(space_info))
				2803	return ret;
				2804
				2805	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
				2806	ret = &fs_info->meta_alloc_cluster;
				2807	if (btrfs_test_opt(fs_info, SSD))
				2808	*empty_cluster = SZ_2M;
				2809	else
				2810	*empty_cluster = SZ_64K;
				2811	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
				2812	btrfs_test_opt(fs_info, SSD_SPREAD)) {
				2813	*empty_cluster = SZ_2M;
				2814	ret = &fs_info->data_alloc_cluster;
				2815	}
				2816
				2817	return ret;
				2818	}
				2819
				2820	static int unpin_extent_range(struct btrfs_fs_info *fs_info,
				2821	u64 start, u64 end,
				2822	const bool return_free_space)
				2823	{
				2824	struct btrfs_block_group_cache *cache = NULL;
				2825	struct btrfs_space_info *space_info;
				2826	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
				2827	struct btrfs_free_cluster *cluster = NULL;
				2828	u64 len;
				2829	u64 total_unpinned = 0;
				2830	u64 empty_cluster = 0;
				2831	bool readonly;
				2832
				2833	while (start <= end) {
				2834	readonly = false;
				2835	if (!cache \|\|
				2836	start >= cache->key.objectid + cache->key.offset) {
				2837	if (cache)
				2838	btrfs_put_block_group(cache);
				2839	total_unpinned = 0;
				2840	cache = btrfs_lookup_block_group(fs_info, start);
				2841	BUG_ON(!cache); /* Logic error */
				2842
				2843	cluster = fetch_cluster_info(fs_info,
				2844	cache->space_info,
				2845	&empty_cluster);
				2846	empty_cluster <<= 1;
				2847	}
				2848
				2849	len = cache->key.objectid + cache->key.offset - start;
				2850	len = min(len, end + 1 - start);
				2851
				2852	if (start < cache->last_byte_to_unpin && return_free_space) {
				2853	u64 add_len = min(len, cache->last_byte_to_unpin - start);
				2854
				2855	btrfs_add_free_space(cache, start, add_len);
				2856	}
				2857
				2858	start += len;
				2859	total_unpinned += len;
				2860	space_info = cache->space_info;
				2861
				2862	/*
				2863	* If this space cluster has been marked as fragmented and we've
				2864	* unpinned enough in this block group to potentially allow a
				2865	* cluster to be created inside of it go ahead and clear the
				2866	* fragmented check.
				2867	*/
				2868	if (cluster && cluster->fragmented &&
				2869	total_unpinned > empty_cluster) {
				2870	spin_lock(&cluster->lock);
				2871	cluster->fragmented = 0;
				2872	spin_unlock(&cluster->lock);
				2873	}
				2874
				2875	spin_lock(&space_info->lock);
				2876	spin_lock(&cache->lock);
				2877	cache->pinned -= len;
				2878	btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
				2879	space_info->max_extent_size = 0;
				2880	percpu_counter_add_batch(&space_info->total_bytes_pinned,
				2881	-len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
				2882	if (cache->ro) {
				2883	space_info->bytes_readonly += len;
				2884	readonly = true;
				2885	}
				2886	spin_unlock(&cache->lock);
				2887	if (!readonly && return_free_space &&
				2888	global_rsv->space_info == space_info) {
				2889	u64 to_add = len;
				2890
				2891	spin_lock(&global_rsv->lock);
				2892	if (!global_rsv->full) {
				2893	to_add = min(len, global_rsv->size -
				2894	global_rsv->reserved);
				2895	global_rsv->reserved += to_add;
				2896	btrfs_space_info_update_bytes_may_use(fs_info,
				2897	space_info, to_add);
				2898	if (global_rsv->reserved >= global_rsv->size)
				2899	global_rsv->full = 1;
				2900	len -= to_add;
				2901	}
				2902	spin_unlock(&global_rsv->lock);
				2903	/* Add to any tickets we may have */
				2904	if (len)
				2905	btrfs_try_granting_tickets(fs_info,
				2906	space_info);
				2907	}
				2908	spin_unlock(&space_info->lock);
				2909	}
				2910
				2911	if (cache)
				2912	btrfs_put_block_group(cache);
				2913	return 0;
				2914	}
				2915
				2916	int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
				2917	{
				2918	struct btrfs_fs_info *fs_info = trans->fs_info;
				2919	struct btrfs_block_group_cache block_group, tmp;
				2920	struct list_head *deleted_bgs;
				2921	struct extent_io_tree *unpin;
				2922	u64 start;
				2923	u64 end;
				2924	int ret;
				2925
				2926	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
				2927	unpin = &fs_info->freed_extents[1];
				2928	else
				2929	unpin = &fs_info->freed_extents[0];
				2930
				2931	while (!TRANS_ABORTED(trans)) {
				2932	struct extent_state *cached_state = NULL;
				2933
				2934	mutex_lock(&fs_info->unused_bg_unpin_mutex);
				2935	ret = find_first_extent_bit(unpin, 0, &start, &end,
				2936	EXTENT_DIRTY, &cached_state);
				2937	if (ret) {
				2938	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
				2939	break;
				2940	}
				2941
				2942	if (btrfs_test_opt(fs_info, DISCARD))
				2943	ret = btrfs_discard_extent(fs_info, start,
				2944	end + 1 - start, NULL);
				2945
				2946	clear_extent_dirty(unpin, start, end, &cached_state);
				2947	unpin_extent_range(fs_info, start, end, true);
				2948	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
				2949	free_extent_state(cached_state);
				2950	cond_resched();
				2951	}
				2952
				2953	/*
				2954	* Transaction is finished. We don't need the lock anymore. We
				2955	* do need to clean up the block groups in case of a transaction
				2956	* abort.
				2957	*/
				2958	deleted_bgs = &trans->transaction->deleted_bgs;
				2959	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
				2960	u64 trimmed = 0;
				2961
				2962	ret = -EROFS;
				2963	if (!TRANS_ABORTED(trans))
				2964	ret = btrfs_discard_extent(fs_info,
				2965	block_group->key.objectid,
				2966	block_group->key.offset,
				2967	&trimmed);
				2968
				2969	list_del_init(&block_group->bg_list);
				2970	btrfs_put_block_group_trimming(block_group);
				2971	btrfs_put_block_group(block_group);
				2972
				2973	if (ret) {
				2974	const char *errstr = btrfs_decode_error(ret);
				2975	btrfs_warn(fs_info,
				2976	"discard failed while removing blockgroup: errno=%d %s",
				2977	ret, errstr);
				2978	}
				2979	}
				2980
				2981	return 0;
				2982	}
				2983
				2984	static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
				2985	struct btrfs_delayed_ref_node *node, u64 parent,
				2986	u64 root_objectid, u64 owner_objectid,
				2987	u64 owner_offset, int refs_to_drop,
				2988	struct btrfs_delayed_extent_op *extent_op)
				2989	{
				2990	struct btrfs_fs_info *info = trans->fs_info;
				2991	struct btrfs_key key;
				2992	struct btrfs_path *path;
				2993	struct btrfs_root *extent_root = info->extent_root;
				2994	struct extent_buffer *leaf;
				2995	struct btrfs_extent_item *ei;
				2996	struct btrfs_extent_inline_ref *iref;
				2997	int ret;
				2998	int is_data;
				2999	int extent_slot = 0;
				3000	int found_extent = 0;
				3001	int num_to_del = 1;
				3002	u32 item_size;
				3003	u64 refs;
				3004	u64 bytenr = node->bytenr;
				3005	u64 num_bytes = node->num_bytes;
				3006	int last_ref = 0;
				3007	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
				3008
				3009	path = btrfs_alloc_path();
				3010	if (!path)
				3011	return -ENOMEM;
				3012
				3013	path->reada = READA_FORWARD;
				3014	path->leave_spinning = 1;
				3015
				3016	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
				3017	BUG_ON(!is_data && refs_to_drop != 1);
				3018
				3019	if (is_data)
				3020	skinny_metadata = false;
				3021
				3022	ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
				3023	parent, root_objectid, owner_objectid,
				3024	owner_offset);
				3025	if (ret == 0) {
				3026	extent_slot = path->slots[0];
				3027	while (extent_slot >= 0) {
				3028	btrfs_item_key_to_cpu(path->nodes[0], &key,
				3029	extent_slot);
				3030	if (key.objectid != bytenr)
				3031	break;
				3032	if (key.type == BTRFS_EXTENT_ITEM_KEY &&
				3033	key.offset == num_bytes) {
				3034	found_extent = 1;
				3035	break;
				3036	}
				3037	if (key.type == BTRFS_METADATA_ITEM_KEY &&
				3038	key.offset == owner_objectid) {
				3039	found_extent = 1;
				3040	break;
				3041	}
				3042	if (path->slots[0] - extent_slot > 5)
				3043	break;
				3044	extent_slot--;
				3045	}
				3046
				3047	if (!found_extent) {
				3048	BUG_ON(iref);
				3049	ret = remove_extent_backref(trans, path, NULL,
				3050	refs_to_drop,
				3051	is_data, &last_ref);
				3052	if (ret) {
				3053	btrfs_abort_transaction(trans, ret);
				3054	goto out;
				3055	}
				3056	btrfs_release_path(path);
				3057	path->leave_spinning = 1;
				3058
				3059	key.objectid = bytenr;
				3060	key.type = BTRFS_EXTENT_ITEM_KEY;
				3061	key.offset = num_bytes;
				3062
				3063	if (!is_data && skinny_metadata) {
				3064	key.type = BTRFS_METADATA_ITEM_KEY;
				3065	key.offset = owner_objectid;
				3066	}
				3067
				3068	ret = btrfs_search_slot(trans, extent_root,
				3069	&key, path, -1, 1);
				3070	if (ret > 0 && skinny_metadata && path->slots[0]) {
				3071	/*
				3072	* Couldn't find our skinny metadata item,
				3073	* see if we have ye olde extent item.
				3074	*/
				3075	path->slots[0]--;
				3076	btrfs_item_key_to_cpu(path->nodes[0], &key,
				3077	path->slots[0]);
				3078	if (key.objectid == bytenr &&
				3079	key.type == BTRFS_EXTENT_ITEM_KEY &&
				3080	key.offset == num_bytes)
				3081	ret = 0;
				3082	}
				3083
				3084	if (ret > 0 && skinny_metadata) {
				3085	skinny_metadata = false;
				3086	key.objectid = bytenr;
				3087	key.type = BTRFS_EXTENT_ITEM_KEY;
				3088	key.offset = num_bytes;
				3089	btrfs_release_path(path);
				3090	ret = btrfs_search_slot(trans, extent_root,
				3091	&key, path, -1, 1);
				3092	}
				3093
				3094	if (ret) {
				3095	btrfs_err(info,
				3096	"umm, got %d back from search, was looking for %llu",
				3097	ret, bytenr);
				3098	if (ret > 0)
				3099	btrfs_print_leaf(path->nodes[0]);
				3100	}
				3101	if (ret < 0) {
				3102	btrfs_abort_transaction(trans, ret);
				3103	goto out;
				3104	}
				3105	extent_slot = path->slots[0];
				3106	}
				3107	} else if (WARN_ON(ret == -ENOENT)) {
				3108	btrfs_print_leaf(path->nodes[0]);
				3109	btrfs_err(info,
				3110	"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
				3111	bytenr, parent, root_objectid, owner_objectid,
				3112	owner_offset);
				3113	btrfs_abort_transaction(trans, ret);
				3114	goto out;
				3115	} else {
				3116	btrfs_abort_transaction(trans, ret);
				3117	goto out;
				3118	}
				3119
				3120	leaf = path->nodes[0];
				3121	item_size = btrfs_item_size_nr(leaf, extent_slot);
				3122	if (unlikely(item_size < sizeof(*ei))) {
				3123	ret = -EINVAL;
				3124	btrfs_print_v0_err(info);
				3125	btrfs_abort_transaction(trans, ret);
				3126	goto out;
				3127	}
				3128	ei = btrfs_item_ptr(leaf, extent_slot,
				3129	struct btrfs_extent_item);
				3130	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
				3131	key.type == BTRFS_EXTENT_ITEM_KEY) {
				3132	struct btrfs_tree_block_info *bi;
				3133	BUG_ON(item_size < sizeof(ei) + sizeof(bi));
				3134	bi = (struct btrfs_tree_block_info *)(ei + 1);
				3135	WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
				3136	}
				3137
				3138	refs = btrfs_extent_refs(leaf, ei);
				3139	if (refs < refs_to_drop) {
				3140	btrfs_err(info,
				3141	"trying to drop %d refs but we only have %Lu for bytenr %Lu",
				3142	refs_to_drop, refs, bytenr);
				3143	ret = -EINVAL;
				3144	btrfs_abort_transaction(trans, ret);
				3145	goto out;
				3146	}
				3147	refs -= refs_to_drop;
				3148
				3149	if (refs > 0) {
				3150	if (extent_op)
				3151	__run_delayed_extent_op(extent_op, leaf, ei);
				3152	/*
				3153	* In the case of inline back ref, reference count will
				3154	* be updated by remove_extent_backref
				3155	*/
				3156	if (iref) {
				3157	BUG_ON(!found_extent);
				3158	} else {
				3159	btrfs_set_extent_refs(leaf, ei, refs);
				3160	btrfs_mark_buffer_dirty(leaf);
				3161	}
				3162	if (found_extent) {
				3163	ret = remove_extent_backref(trans, path, iref,
				3164	refs_to_drop, is_data,
				3165	&last_ref);
				3166	if (ret) {
				3167	btrfs_abort_transaction(trans, ret);
				3168	goto out;
				3169	}
				3170	}
				3171	} else {
				3172	if (found_extent) {
				3173	BUG_ON(is_data && refs_to_drop !=
				3174	extent_data_ref_count(path, iref));
				3175	if (iref) {
				3176	BUG_ON(path->slots[0] != extent_slot);
				3177	} else {
				3178	BUG_ON(path->slots[0] != extent_slot + 1);
				3179	path->slots[0] = extent_slot;
				3180	num_to_del = 2;
				3181	}
				3182	}
				3183
				3184	last_ref = 1;
				3185	ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
				3186	num_to_del);
				3187	if (ret) {
				3188	btrfs_abort_transaction(trans, ret);
				3189	goto out;
				3190	}
				3191	btrfs_release_path(path);
				3192
				3193	if (is_data) {
				3194	ret = btrfs_del_csums(trans, info->csum_root, bytenr,
				3195	num_bytes);
				3196	if (ret) {
				3197	btrfs_abort_transaction(trans, ret);
				3198	goto out;
				3199	}
				3200	}
				3201
				3202	ret = add_to_free_space_tree(trans, bytenr, num_bytes);
				3203	if (ret) {
				3204	btrfs_abort_transaction(trans, ret);
				3205	goto out;
				3206	}
				3207
				3208	ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0);
				3209	if (ret) {
				3210	btrfs_abort_transaction(trans, ret);
				3211	goto out;
				3212	}
				3213	}
				3214	btrfs_release_path(path);
				3215
				3216	out:
				3217	btrfs_free_path(path);
				3218	return ret;
				3219	}
				3220
				3221	/*
				3222	* when we free an block, it is possible (and likely) that we free the last
				3223	* delayed ref for that extent as well. This searches the delayed ref tree for
				3224	* a given extent, and if there are no other delayed refs to be processed, it
				3225	* removes it from the tree.
				3226	*/
				3227	static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
				3228	u64 bytenr)
				3229	{
				3230	struct btrfs_delayed_ref_head *head;
				3231	struct btrfs_delayed_ref_root *delayed_refs;
				3232	int ret = 0;
				3233
				3234	delayed_refs = &trans->transaction->delayed_refs;
				3235	spin_lock(&delayed_refs->lock);
				3236	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
				3237	if (!head)
				3238	goto out_delayed_unlock;
				3239
				3240	spin_lock(&head->lock);
				3241	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
				3242	goto out;
				3243
				3244	if (cleanup_extent_op(head) != NULL)
				3245	goto out;
				3246
				3247	/*
				3248	* waiting for the lock here would deadlock. If someone else has it
				3249	* locked they are already in the process of dropping it anyway
				3250	*/
				3251	if (!mutex_trylock(&head->mutex))
				3252	goto out;
				3253
				3254	btrfs_delete_ref_head(delayed_refs, head);
				3255	head->processing = 0;
				3256
				3257	spin_unlock(&head->lock);
				3258	spin_unlock(&delayed_refs->lock);
				3259
				3260	BUG_ON(head->extent_op);
				3261	if (head->must_insert_reserved)
				3262	ret = 1;
				3263
				3264	btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
				3265	mutex_unlock(&head->mutex);
				3266	btrfs_put_delayed_ref_head(head);
				3267	return ret;
				3268	out:
				3269	spin_unlock(&head->lock);
				3270
				3271	out_delayed_unlock:
				3272	spin_unlock(&delayed_refs->lock);
				3273	return 0;
				3274	}
				3275
				3276	void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
				3277	struct btrfs_root *root,
				3278	struct extent_buffer *buf,
				3279	u64 parent, int last_ref)
				3280	{
				3281	struct btrfs_fs_info *fs_info = root->fs_info;
				3282	struct btrfs_ref generic_ref = { 0 };
				3283	int pin = 1;
				3284	int ret;
				3285
				3286	btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
				3287	buf->start, buf->len, parent);
				3288	btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
				3289	root->root_key.objectid);
				3290
				3291	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
				3292	int old_ref_mod, new_ref_mod;
				3293
				3294	btrfs_ref_tree_mod(fs_info, &generic_ref);
				3295	ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL,
				3296	&old_ref_mod, &new_ref_mod);
				3297	BUG_ON(ret); /* -ENOMEM */
				3298	pin = old_ref_mod >= 0 && new_ref_mod < 0;
				3299	}
				3300
				3301	if (last_ref && btrfs_header_generation(buf) == trans->transid) {
				3302	struct btrfs_block_group_cache *cache;
				3303
				3304	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
				3305	ret = check_ref_cleanup(trans, buf->start);
				3306	if (!ret)
				3307	goto out;
				3308	}
				3309
				3310	pin = 0;
				3311	cache = btrfs_lookup_block_group(fs_info, buf->start);
				3312
				3313	if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
				3314	pin_down_extent(cache, buf->start, buf->len, 1);
				3315	btrfs_put_block_group(cache);
				3316	goto out;
				3317	}
				3318
				3319	WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
				3320
				3321	btrfs_add_free_space(cache, buf->start, buf->len);
				3322	btrfs_free_reserved_bytes(cache, buf->len, 0);
				3323	btrfs_put_block_group(cache);
				3324	trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
				3325	}
				3326	out:
				3327	if (pin)
				3328	add_pinned_bytes(fs_info, &generic_ref);
				3329
				3330	if (last_ref) {
				3331	/*
				3332	* Deleting the buffer, clear the corrupt flag since it doesn't
				3333	* matter anymore.
				3334	*/
				3335	clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
				3336	}
				3337	}
				3338
				3339	/* Can return -ENOMEM */
				3340	int btrfs_free_extent(struct btrfs_trans_handle trans, struct btrfs_ref ref)
				3341	{
				3342	struct btrfs_fs_info *fs_info = trans->fs_info;
				3343	int old_ref_mod, new_ref_mod;
				3344	int ret;
				3345
				3346	if (btrfs_is_testing(fs_info))
				3347	return 0;
				3348
				3349	/*
				3350	* tree log blocks never actually go into the extent allocation
				3351	* tree, just update pinning info and exit early.
				3352	*/
				3353	if ((ref->type == BTRFS_REF_METADATA &&
				3354	ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) \|\|
				3355	(ref->type == BTRFS_REF_DATA &&
				3356	ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
				3357	/* unlocks the pinned mutex */
				3358	btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1);
				3359	old_ref_mod = new_ref_mod = 0;
				3360	ret = 0;
				3361	} else if (ref->type == BTRFS_REF_METADATA) {
				3362	ret = btrfs_add_delayed_tree_ref(trans, ref, NULL,
				3363	&old_ref_mod, &new_ref_mod);
				3364	} else {
				3365	ret = btrfs_add_delayed_data_ref(trans, ref, 0,
				3366	&old_ref_mod, &new_ref_mod);
				3367	}
				3368
				3369	if (!((ref->type == BTRFS_REF_METADATA &&
				3370	ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) \|\|
				3371	(ref->type == BTRFS_REF_DATA &&
				3372	ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
				3373	btrfs_ref_tree_mod(fs_info, ref);
				3374
				3375	if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
				3376	add_pinned_bytes(fs_info, ref);
				3377
				3378	return ret;
				3379	}
				3380
				3381	enum btrfs_loop_type {
				3382	LOOP_CACHING_NOWAIT,
				3383	LOOP_CACHING_WAIT,
				3384	LOOP_ALLOC_CHUNK,
				3385	LOOP_NO_EMPTY_SIZE,
				3386	};
				3387
				3388	static inline void
				3389	btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
				3390	int delalloc)
				3391	{
				3392	if (delalloc)
				3393	down_read(&cache->data_rwsem);
				3394	}
				3395
				3396	static inline void
				3397	btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
				3398	int delalloc)
				3399	{
				3400	btrfs_get_block_group(cache);
				3401	if (delalloc)
				3402	down_read(&cache->data_rwsem);
				3403	}
				3404
				3405	static struct btrfs_block_group_cache *
				3406	btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
				3407	struct btrfs_free_cluster *cluster,
				3408	int delalloc)
				3409	{
				3410	struct btrfs_block_group_cache *used_bg = NULL;
				3411
				3412	spin_lock(&cluster->refill_lock);
				3413	while (1) {
				3414	used_bg = cluster->block_group;
				3415	if (!used_bg)
				3416	return NULL;
				3417
				3418	if (used_bg == block_group)
				3419	return used_bg;
				3420
				3421	btrfs_get_block_group(used_bg);
				3422
				3423	if (!delalloc)
				3424	return used_bg;
				3425
				3426	if (down_read_trylock(&used_bg->data_rwsem))
				3427	return used_bg;
				3428
				3429	spin_unlock(&cluster->refill_lock);
				3430
				3431	/* We should only have one-level nested. */
				3432	down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
				3433
				3434	spin_lock(&cluster->refill_lock);
				3435	if (used_bg == cluster->block_group)
				3436	return used_bg;
				3437
				3438	up_read(&used_bg->data_rwsem);
				3439	btrfs_put_block_group(used_bg);
				3440	}
				3441	}
				3442
				3443	static inline void
				3444	btrfs_release_block_group(struct btrfs_block_group_cache *cache,
				3445	int delalloc)
				3446	{
				3447	if (delalloc)
				3448	up_read(&cache->data_rwsem);
				3449	btrfs_put_block_group(cache);
				3450	}
				3451
				3452	/*
				3453	* Structure used internally for find_free_extent() function. Wraps needed
				3454	* parameters.
				3455	*/
				3456	struct find_free_extent_ctl {
				3457	/* Basic allocation info */
				3458	u64 ram_bytes;
				3459	u64 num_bytes;
				3460	u64 empty_size;
				3461	u64 flags;
				3462	int delalloc;
				3463
				3464	/* Where to start the search inside the bg */
				3465	u64 search_start;
				3466
				3467	/* For clustered allocation */
				3468	u64 empty_cluster;
				3469
				3470	bool have_caching_bg;
				3471	bool orig_have_caching_bg;
				3472
				3473	/* RAID index, converted from flags */
				3474	int index;
				3475
				3476	/*
				3477	* Current loop number, check find_free_extent_update_loop() for details
				3478	*/
				3479	int loop;
				3480
				3481	/*
				3482	* Whether we're refilling a cluster, if true we need to re-search
				3483	* current block group but don't try to refill the cluster again.
				3484	*/
				3485	bool retry_clustered;
				3486
				3487	/*
				3488	* Whether we're updating free space cache, if true we need to re-search
				3489	* current block group but don't try updating free space cache again.
				3490	*/
				3491	bool retry_unclustered;
				3492
				3493	/* If current block group is cached */
				3494	int cached;
				3495
				3496	/* Max contiguous hole found */
				3497	u64 max_extent_size;
				3498
				3499	/* Total free space from free space cache, not always contiguous */
				3500	u64 total_free_space;
				3501
				3502	/* Found result */
				3503	u64 found_offset;
				3504	};
				3505
				3506
				3507	/*
				3508	* Helper function for find_free_extent().
				3509	*
				3510	* Return -ENOENT to inform caller that we need fallback to unclustered mode.
				3511	* Return -EAGAIN to inform caller that we need to re-search this block group
				3512	* Return >0 to inform caller that we find nothing
				3513	* Return 0 means we have found a location and set ffe_ctl->found_offset.
				3514	*/
				3515	static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
				3516	struct btrfs_free_cluster *last_ptr,
				3517	struct find_free_extent_ctl *ffe_ctl,
				3518	struct btrfs_block_group_cache **cluster_bg_ret)
				3519	{
				3520	struct btrfs_block_group_cache *cluster_bg;
				3521	u64 aligned_cluster;
				3522	u64 offset;
				3523	int ret;
				3524
				3525	cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
				3526	if (!cluster_bg)
				3527	goto refill_cluster;
				3528	if (cluster_bg != bg && (cluster_bg->ro \|\|
				3529	!block_group_bits(cluster_bg, ffe_ctl->flags)))
				3530	goto release_cluster;
				3531
				3532	offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
				3533	ffe_ctl->num_bytes, cluster_bg->key.objectid,
				3534	&ffe_ctl->max_extent_size);
				3535	if (offset) {
				3536	/* We have a block, we're done */
				3537	spin_unlock(&last_ptr->refill_lock);
				3538	trace_btrfs_reserve_extent_cluster(cluster_bg,
				3539	ffe_ctl->search_start, ffe_ctl->num_bytes);
				3540	*cluster_bg_ret = cluster_bg;
				3541	ffe_ctl->found_offset = offset;
				3542	return 0;
				3543	}
				3544	WARN_ON(last_ptr->block_group != cluster_bg);
				3545
				3546	release_cluster:
				3547	/*
				3548	* If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so
				3549	* lets just skip it and let the allocator find whatever block it can
				3550	* find. If we reach this point, we will have tried the cluster
				3551	* allocator plenty of times and not have found anything, so we are
				3552	* likely way too fragmented for the clustering stuff to find anything.
				3553	*
				3554	* However, if the cluster is taken from the current block group,
				3555	* release the cluster first, so that we stand a better chance of
				3556	* succeeding in the unclustered allocation.
				3557	*/
				3558	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
				3559	spin_unlock(&last_ptr->refill_lock);
				3560	btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
				3561	return -ENOENT;
				3562	}
				3563
				3564	/* This cluster didn't work out, free it and start over */
				3565	btrfs_return_cluster_to_free_space(NULL, last_ptr);
				3566
				3567	if (cluster_bg != bg)
				3568	btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
				3569
				3570	refill_cluster:
				3571	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
				3572	spin_unlock(&last_ptr->refill_lock);
				3573	return -ENOENT;
				3574	}
				3575
				3576	aligned_cluster = max_t(u64,
				3577	ffe_ctl->empty_cluster + ffe_ctl->empty_size,
				3578	bg->full_stripe_len);
				3579	ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
				3580	ffe_ctl->num_bytes, aligned_cluster);
				3581	if (ret == 0) {
				3582	/* Now pull our allocation out of this cluster */
				3583	offset = btrfs_alloc_from_cluster(bg, last_ptr,
				3584	ffe_ctl->num_bytes, ffe_ctl->search_start,
				3585	&ffe_ctl->max_extent_size);
				3586	if (offset) {
				3587	/* We found one, proceed */
				3588	spin_unlock(&last_ptr->refill_lock);
				3589	trace_btrfs_reserve_extent_cluster(bg,
				3590	ffe_ctl->search_start,
				3591	ffe_ctl->num_bytes);
				3592	ffe_ctl->found_offset = offset;
				3593	return 0;
				3594	}
				3595	} else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
				3596	!ffe_ctl->retry_clustered) {
				3597	spin_unlock(&last_ptr->refill_lock);
				3598
				3599	ffe_ctl->retry_clustered = true;
				3600	btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
				3601	ffe_ctl->empty_cluster + ffe_ctl->empty_size);
				3602	return -EAGAIN;
				3603	}
				3604	/*
				3605	* At this point we either didn't find a cluster or we weren't able to
				3606	* allocate a block from our cluster. Free the cluster we've been
				3607	* trying to use, and go to the next block group.
				3608	*/
				3609	btrfs_return_cluster_to_free_space(NULL, last_ptr);
				3610	spin_unlock(&last_ptr->refill_lock);
				3611	return 1;
				3612	}
				3613
				3614	/*
				3615	* Return >0 to inform caller that we find nothing
				3616	* Return 0 when we found an free extent and set ffe_ctrl->found_offset
				3617	* Return -EAGAIN to inform caller that we need to re-search this block group
				3618	*/
				3619	static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
				3620	struct btrfs_free_cluster *last_ptr,
				3621	struct find_free_extent_ctl *ffe_ctl)
				3622	{
				3623	u64 offset;
				3624
				3625	/*
				3626	* We are doing an unclustered allocation, set the fragmented flag so
				3627	* we don't bother trying to setup a cluster again until we get more
				3628	* space.
				3629	*/
				3630	if (unlikely(last_ptr)) {
				3631	spin_lock(&last_ptr->lock);
				3632	last_ptr->fragmented = 1;
				3633	spin_unlock(&last_ptr->lock);
				3634	}
				3635	if (ffe_ctl->cached) {
				3636	struct btrfs_free_space_ctl *free_space_ctl;
				3637
				3638	free_space_ctl = bg->free_space_ctl;
				3639	spin_lock(&free_space_ctl->tree_lock);
				3640	if (free_space_ctl->free_space <
				3641	ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
				3642	ffe_ctl->empty_size) {
				3643	ffe_ctl->total_free_space = max_t(u64,
				3644	ffe_ctl->total_free_space,
				3645	free_space_ctl->free_space);
				3646	spin_unlock(&free_space_ctl->tree_lock);
				3647	return 1;
				3648	}
				3649	spin_unlock(&free_space_ctl->tree_lock);
				3650	}
				3651
				3652	offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
				3653	ffe_ctl->num_bytes, ffe_ctl->empty_size,
				3654	&ffe_ctl->max_extent_size);
				3655
				3656	/*
				3657	* If we didn't find a chunk, and we haven't failed on this block group
				3658	* before, and this block group is in the middle of caching and we are
				3659	* ok with waiting, then go ahead and wait for progress to be made, and
				3660	* set @retry_unclustered to true.
				3661	*
				3662	* If @retry_unclustered is true then we've already waited on this
				3663	* block group once and should move on to the next block group.
				3664	*/
				3665	if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
				3666	ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
				3667	btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
				3668	ffe_ctl->empty_size);
				3669	ffe_ctl->retry_unclustered = true;
				3670	return -EAGAIN;
				3671	} else if (!offset) {
				3672	return 1;
				3673	}
				3674	ffe_ctl->found_offset = offset;
				3675	return 0;
				3676	}
				3677
				3678	/*
				3679	* Return >0 means caller needs to re-search for free extent
				3680	* Return 0 means we have the needed free extent.
				3681	* Return <0 means we failed to locate any free extent.
				3682	*/
				3683	static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
				3684	struct btrfs_free_cluster *last_ptr,
				3685	struct btrfs_key *ins,
				3686	struct find_free_extent_ctl *ffe_ctl,
				3687	int full_search, bool use_cluster)
				3688	{
				3689	struct btrfs_root *root = fs_info->extent_root;
				3690	int ret;
				3691
				3692	if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
				3693	ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
				3694	ffe_ctl->orig_have_caching_bg = true;
				3695
				3696	if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
				3697	ffe_ctl->have_caching_bg)
				3698	return 1;
				3699
				3700	if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
				3701	return 1;
				3702
				3703	if (ins->objectid) {
				3704	if (!use_cluster && last_ptr) {
				3705	spin_lock(&last_ptr->lock);
				3706	last_ptr->window_start = ins->objectid;
				3707	spin_unlock(&last_ptr->lock);
				3708	}
				3709	return 0;
				3710	}
				3711
				3712	/*
				3713	* LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
				3714	* caching kthreads as we move along
				3715	* LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
				3716	* LOOP_ALLOC_CHUNK, force a chunk allocation and try again
				3717	* LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
				3718	* again
				3719	*/
				3720	if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
				3721	ffe_ctl->index = 0;
				3722	if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
				3723	/*
				3724	* We want to skip the LOOP_CACHING_WAIT step if we
				3725	* don't have any uncached bgs and we've already done a
				3726	* full search through.
				3727	*/
				3728	if (ffe_ctl->orig_have_caching_bg \|\| !full_search)
				3729	ffe_ctl->loop = LOOP_CACHING_WAIT;
				3730	else
				3731	ffe_ctl->loop = LOOP_ALLOC_CHUNK;
				3732	} else {
				3733	ffe_ctl->loop++;
				3734	}
				3735
				3736	if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
				3737	struct btrfs_trans_handle *trans;
				3738	int exist = 0;
				3739
				3740	trans = current->journal_info;
				3741	if (trans)
				3742	exist = 1;
				3743	else
				3744	trans = btrfs_join_transaction(root);
				3745
				3746	if (IS_ERR(trans)) {
				3747	ret = PTR_ERR(trans);
				3748	return ret;
				3749	}
				3750
				3751	ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
				3752	CHUNK_ALLOC_FORCE);
				3753
				3754	/*
				3755	* If we can't allocate a new chunk we've already looped
				3756	* through at least once, move on to the NO_EMPTY_SIZE
				3757	* case.
				3758	*/
				3759	if (ret == -ENOSPC)
				3760	ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
				3761
				3762	/* Do not bail out on ENOSPC since we can do more. */
				3763	if (ret < 0 && ret != -ENOSPC)
				3764	btrfs_abort_transaction(trans, ret);
				3765	else
				3766	ret = 0;
				3767	if (!exist)
				3768	btrfs_end_transaction(trans);
				3769	if (ret)
				3770	return ret;
				3771	}
				3772
				3773	if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
				3774	/*
				3775	* Don't loop again if we already have no empty_size and
				3776	* no empty_cluster.
				3777	*/
				3778	if (ffe_ctl->empty_size == 0 &&
				3779	ffe_ctl->empty_cluster == 0)
				3780	return -ENOSPC;
				3781	ffe_ctl->empty_size = 0;
				3782	ffe_ctl->empty_cluster = 0;
				3783	}
				3784	return 1;
				3785	}
				3786	return -ENOSPC;
				3787	}
				3788
				3789	/*
				3790	* walks the btree of allocated extents and find a hole of a given size.
				3791	* The key ins is changed to record the hole:
				3792	* ins->objectid == start position
				3793	* ins->flags = BTRFS_EXTENT_ITEM_KEY
				3794	* ins->offset == the size of the hole.
				3795	* Any available blocks before search_start are skipped.
				3796	*
				3797	* If there is no suitable free space, we will record the max size of
				3798	* the free space extent currently.
				3799	*
				3800	* The overall logic and call chain:
				3801	*
				3802	* find_free_extent()
				3803	* \|- Iterate through all block groups
				3804	* \| \|- Get a valid block group
				3805	* \| \|- Try to do clustered allocation in that block group
				3806	* \| \|- Try to do unclustered allocation in that block group
				3807	* \| \|- Check if the result is valid
				3808	* \| \| \|- If valid, then exit
				3809	* \| \|- Jump to next block group
				3810	* \|
				3811	* \|- Push harder to find free extents
				3812	* \|- If not found, re-iterate all block groups
				3813	*/
				3814	static noinline int find_free_extent(struct btrfs_root *root,
				3815	u64 ram_bytes, u64 num_bytes, u64 empty_size,
				3816	u64 hint_byte, struct btrfs_key *ins,
				3817	u64 flags, int delalloc)
				3818	{
				3819	struct btrfs_fs_info *fs_info = root->fs_info;
				3820	int ret = 0;
				3821	int cache_block_group_error = 0;
				3822	struct btrfs_free_cluster *last_ptr = NULL;
				3823	struct btrfs_block_group_cache *block_group = NULL;
				3824	struct find_free_extent_ctl ffe_ctl = {0};
				3825	struct btrfs_space_info *space_info;
				3826	bool use_cluster = true;
				3827	bool full_search = false;
				3828
				3829	WARN_ON(num_bytes < fs_info->sectorsize);
				3830
				3831	ffe_ctl.ram_bytes = ram_bytes;
				3832	ffe_ctl.num_bytes = num_bytes;
				3833	ffe_ctl.empty_size = empty_size;
				3834	ffe_ctl.flags = flags;
				3835	ffe_ctl.search_start = 0;
				3836	ffe_ctl.retry_clustered = false;
				3837	ffe_ctl.retry_unclustered = false;
				3838	ffe_ctl.delalloc = delalloc;
				3839	ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
				3840	ffe_ctl.have_caching_bg = false;
				3841	ffe_ctl.orig_have_caching_bg = false;
				3842	ffe_ctl.found_offset = 0;
				3843
				3844	ins->type = BTRFS_EXTENT_ITEM_KEY;
				3845	ins->objectid = 0;
				3846	ins->offset = 0;
				3847
				3848	trace_find_free_extent(root, num_bytes, empty_size, flags);
				3849
				3850	space_info = btrfs_find_space_info(fs_info, flags);
				3851	if (!space_info) {
				3852	btrfs_err(fs_info, "No space info for %llu", flags);
				3853	return -ENOSPC;
				3854	}
				3855
				3856	/*
				3857	* If our free space is heavily fragmented we may not be able to make
				3858	* big contiguous allocations, so instead of doing the expensive search
				3859	* for free space, simply return ENOSPC with our max_extent_size so we
				3860	* can go ahead and search for a more manageable chunk.
				3861	*
				3862	* If our max_extent_size is large enough for our allocation simply
				3863	* disable clustering since we will likely not be able to find enough
				3864	* space to create a cluster and induce latency trying.
				3865	*/
				3866	if (unlikely(space_info->max_extent_size)) {
				3867	spin_lock(&space_info->lock);
				3868	if (space_info->max_extent_size &&
				3869	num_bytes > space_info->max_extent_size) {
				3870	ins->offset = space_info->max_extent_size;
				3871	spin_unlock(&space_info->lock);
				3872	return -ENOSPC;
				3873	} else if (space_info->max_extent_size) {
				3874	use_cluster = false;
				3875	}
				3876	spin_unlock(&space_info->lock);
				3877	}
				3878
				3879	last_ptr = fetch_cluster_info(fs_info, space_info,
				3880	&ffe_ctl.empty_cluster);
				3881	if (last_ptr) {
				3882	spin_lock(&last_ptr->lock);
				3883	if (last_ptr->block_group)
				3884	hint_byte = last_ptr->window_start;
				3885	if (last_ptr->fragmented) {
				3886	/*
				3887	* We still set window_start so we can keep track of the
				3888	* last place we found an allocation to try and save
				3889	* some time.
				3890	*/
				3891	hint_byte = last_ptr->window_start;
				3892	use_cluster = false;
				3893	}
				3894	spin_unlock(&last_ptr->lock);
				3895	}
				3896
				3897	ffe_ctl.search_start = max(ffe_ctl.search_start,
				3898	first_logical_byte(fs_info, 0));
				3899	ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
				3900	if (ffe_ctl.search_start == hint_byte) {
				3901	block_group = btrfs_lookup_block_group(fs_info,
				3902	ffe_ctl.search_start);
				3903	/*
				3904	* we don't want to use the block group if it doesn't match our
				3905	* allocation bits, or if its not cached.
				3906	*
				3907	* However if we are re-searching with an ideal block group
				3908	* picked out then we don't care that the block group is cached.
				3909	*/
				3910	if (block_group && block_group_bits(block_group, flags) &&
				3911	block_group->cached != BTRFS_CACHE_NO) {
				3912	down_read(&space_info->groups_sem);
				3913	if (list_empty(&block_group->list) \|\|
				3914	block_group->ro) {
				3915	/*
				3916	* someone is removing this block group,
				3917	* we can't jump into the have_block_group
				3918	* target because our list pointers are not
				3919	* valid
				3920	*/
				3921	btrfs_put_block_group(block_group);
				3922	up_read(&space_info->groups_sem);
				3923	} else {
				3924	ffe_ctl.index = btrfs_bg_flags_to_raid_index(
				3925	block_group->flags);
				3926	btrfs_lock_block_group(block_group, delalloc);
				3927	goto have_block_group;
				3928	}
				3929	} else if (block_group) {
				3930	btrfs_put_block_group(block_group);
				3931	}
				3932	}
				3933	search:
				3934	ffe_ctl.have_caching_bg = false;
				3935	if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) \|\|
				3936	ffe_ctl.index == 0)
				3937	full_search = true;
				3938	down_read(&space_info->groups_sem);
				3939	list_for_each_entry(block_group,
				3940	&space_info->block_groups[ffe_ctl.index], list) {
				3941	/* If the block group is read-only, we can skip it entirely. */
				3942	if (unlikely(block_group->ro))
				3943	continue;
				3944
				3945	btrfs_grab_block_group(block_group, delalloc);
				3946	ffe_ctl.search_start = block_group->key.objectid;
				3947
				3948	/*
				3949	* this can happen if we end up cycling through all the
				3950	* raid types, but we want to make sure we only allocate
				3951	* for the proper type.
				3952	*/
				3953	if (!block_group_bits(block_group, flags)) {
				3954	u64 extra = BTRFS_BLOCK_GROUP_DUP \|
				3955	BTRFS_BLOCK_GROUP_RAID1_MASK \|
				3956	BTRFS_BLOCK_GROUP_RAID56_MASK \|
				3957	BTRFS_BLOCK_GROUP_RAID10;
				3958
				3959	/*
				3960	* if they asked for extra copies and this block group
				3961	* doesn't provide them, bail. This does allow us to
				3962	* fill raid0 from raid1.
				3963	*/
				3964	if ((flags & extra) && !(block_group->flags & extra))
				3965	goto loop;
				3966
				3967	/*
				3968	* This block group has different flags than we want.
				3969	* It's possible that we have MIXED_GROUP flag but no
				3970	* block group is mixed. Just skip such block group.
				3971	*/
				3972	btrfs_release_block_group(block_group, delalloc);
				3973	continue;
				3974	}
				3975
				3976	have_block_group:
				3977	ffe_ctl.cached = btrfs_block_group_cache_done(block_group);
				3978	if (unlikely(!ffe_ctl.cached)) {
				3979	ffe_ctl.have_caching_bg = true;
				3980	ret = btrfs_cache_block_group(block_group, 0);
				3981
				3982	/*
				3983	* If we get ENOMEM here or something else we want to
				3984	* try other block groups, because it may not be fatal.
				3985	* However if we can't find anything else we need to
				3986	* save our return here so that we return the actual
				3987	* error that caused problems, not ENOSPC.
				3988	*/
				3989	if (ret < 0) {
				3990	if (!cache_block_group_error)
				3991	cache_block_group_error = ret;
				3992	ret = 0;
				3993	goto loop;
				3994	}
				3995	ret = 0;
				3996	}
				3997
				3998	if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) {
				3999	if (!cache_block_group_error)
				4000	cache_block_group_error = -EIO;
				4001	goto loop;
				4002	}
				4003
				4004	/*
				4005	* Ok we want to try and use the cluster allocator, so
				4006	* lets look there
				4007	*/
				4008	if (last_ptr && use_cluster) {
				4009	struct btrfs_block_group_cache *cluster_bg = NULL;
				4010
				4011	ret = find_free_extent_clustered(block_group, last_ptr,
				4012	&ffe_ctl, &cluster_bg);
				4013
				4014	if (ret == 0) {
				4015	if (cluster_bg && cluster_bg != block_group) {
				4016	btrfs_release_block_group(block_group,
				4017	delalloc);
				4018	block_group = cluster_bg;
				4019	}
				4020	goto checks;
				4021	} else if (ret == -EAGAIN) {
				4022	goto have_block_group;
				4023	} else if (ret > 0) {
				4024	goto loop;
				4025	}
				4026	/* ret == -ENOENT case falls through */
				4027	}
				4028
				4029	ret = find_free_extent_unclustered(block_group, last_ptr,
				4030	&ffe_ctl);
				4031	if (ret == -EAGAIN)
				4032	goto have_block_group;
				4033	else if (ret > 0)
				4034	goto loop;
				4035	/* ret == 0 case falls through */
				4036	checks:
				4037	ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
				4038	fs_info->stripesize);
				4039
				4040	/* move on to the next group */
				4041	if (ffe_ctl.search_start + num_bytes >
				4042	block_group->key.objectid + block_group->key.offset) {
				4043	btrfs_add_free_space(block_group, ffe_ctl.found_offset,
				4044	num_bytes);
				4045	goto loop;
				4046	}
				4047
				4048	if (ffe_ctl.found_offset < ffe_ctl.search_start)
				4049	btrfs_add_free_space(block_group, ffe_ctl.found_offset,
				4050	ffe_ctl.search_start - ffe_ctl.found_offset);
				4051
				4052	ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
				4053	num_bytes, delalloc);
				4054	if (ret == -EAGAIN) {
				4055	btrfs_add_free_space(block_group, ffe_ctl.found_offset,
				4056	num_bytes);
				4057	goto loop;
				4058	}
				4059	btrfs_inc_block_group_reservations(block_group);
				4060
				4061	/* we are all good, lets return */
				4062	ins->objectid = ffe_ctl.search_start;
				4063	ins->offset = num_bytes;
				4064
				4065	trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
				4066	num_bytes);
				4067	btrfs_release_block_group(block_group, delalloc);
				4068	break;
				4069	loop:
				4070	ffe_ctl.retry_clustered = false;
				4071	ffe_ctl.retry_unclustered = false;
				4072	BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
				4073	ffe_ctl.index);
				4074	btrfs_release_block_group(block_group, delalloc);
				4075	cond_resched();
				4076	}
				4077	up_read(&space_info->groups_sem);
				4078
				4079	ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
				4080	full_search, use_cluster);
				4081	if (ret > 0)
				4082	goto search;
				4083
				4084	if (ret == -ENOSPC && !cache_block_group_error) {
				4085	/*
				4086	* Use ffe_ctl->total_free_space as fallback if we can't find
				4087	* any contiguous hole.
				4088	*/
				4089	if (!ffe_ctl.max_extent_size)
				4090	ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
				4091	spin_lock(&space_info->lock);
				4092	space_info->max_extent_size = ffe_ctl.max_extent_size;
				4093	spin_unlock(&space_info->lock);
				4094	ins->offset = ffe_ctl.max_extent_size;
				4095	} else if (ret == -ENOSPC) {
				4096	ret = cache_block_group_error;
				4097	}
				4098	return ret;
				4099	}
				4100
				4101	/*
				4102	* btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
				4103	* hole that is at least as big as @num_bytes.
				4104	*
				4105	* @root - The root that will contain this extent
				4106	*
				4107	* @ram_bytes - The amount of space in ram that @num_bytes take. This
				4108	* is used for accounting purposes. This value differs
				4109	* from @num_bytes only in the case of compressed extents.
				4110	*
				4111	* @num_bytes - Number of bytes to allocate on-disk.
				4112	*
				4113	* @min_alloc_size - Indicates the minimum amount of space that the
				4114	* allocator should try to satisfy. In some cases
				4115	* @num_bytes may be larger than what is required and if
				4116	* the filesystem is fragmented then allocation fails.
				4117	* However, the presence of @min_alloc_size gives a
				4118	* chance to try and satisfy the smaller allocation.
				4119	*
				4120	* @empty_size - A hint that you plan on doing more COW. This is the
				4121	* size in bytes the allocator should try to find free
				4122	* next to the block it returns. This is just a hint and
				4123	* may be ignored by the allocator.
				4124	*
				4125	* @hint_byte - Hint to the allocator to start searching above the byte
				4126	* address passed. It might be ignored.
				4127	*
				4128	* @ins - This key is modified to record the found hole. It will
				4129	* have the following values:
				4130	* ins->objectid == start position
				4131	* ins->flags = BTRFS_EXTENT_ITEM_KEY
				4132	* ins->offset == the size of the hole.
				4133	*
				4134	* @is_data - Boolean flag indicating whether an extent is
				4135	* allocated for data (true) or metadata (false)
				4136	*
				4137	* @delalloc - Boolean flag indicating whether this allocation is for
				4138	* delalloc or not. If 'true' data_rwsem of block groups
				4139	* is going to be acquired.
				4140	*
				4141	*
				4142	* Returns 0 when an allocation succeeded or < 0 when an error occurred. In
				4143	* case -ENOSPC is returned then @ins->offset will contain the size of the
				4144	* largest available hole the allocator managed to find.
				4145	*/
				4146	int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
				4147	u64 num_bytes, u64 min_alloc_size,
				4148	u64 empty_size, u64 hint_byte,
				4149	struct btrfs_key *ins, int is_data, int delalloc)
				4150	{
				4151	struct btrfs_fs_info *fs_info = root->fs_info;
				4152	bool final_tried = num_bytes == min_alloc_size;
				4153	u64 flags;
				4154	int ret;
				4155
				4156	flags = get_alloc_profile_by_root(root, is_data);
				4157	again:
				4158	WARN_ON(num_bytes < fs_info->sectorsize);
				4159	ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
				4160	hint_byte, ins, flags, delalloc);
				4161	if (!ret && !is_data) {
				4162	btrfs_dec_block_group_reservations(fs_info, ins->objectid);
				4163	} else if (ret == -ENOSPC) {
				4164	if (!final_tried && ins->offset) {
				4165	num_bytes = min(num_bytes >> 1, ins->offset);
				4166	num_bytes = round_down(num_bytes,
				4167	fs_info->sectorsize);
				4168	num_bytes = max(num_bytes, min_alloc_size);
				4169	ram_bytes = num_bytes;
				4170	if (num_bytes == min_alloc_size)
				4171	final_tried = true;
				4172	goto again;
				4173	} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
				4174	struct btrfs_space_info *sinfo;
				4175
				4176	sinfo = btrfs_find_space_info(fs_info, flags);
				4177	btrfs_err(fs_info,
				4178	"allocation failed flags %llu, wanted %llu",
				4179	flags, num_bytes);
				4180	if (sinfo)
				4181	btrfs_dump_space_info(fs_info, sinfo,
				4182	num_bytes, 1);
				4183	}
				4184	}
				4185
				4186	return ret;
				4187	}
				4188
				4189	static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
				4190	u64 start, u64 len,
				4191	int pin, int delalloc)
				4192	{
				4193	struct btrfs_block_group_cache *cache;
				4194	int ret = 0;
				4195
				4196	cache = btrfs_lookup_block_group(fs_info, start);
				4197	if (!cache) {
				4198	btrfs_err(fs_info, "Unable to find block group for %llu",
				4199	start);
				4200	return -ENOSPC;
				4201	}
				4202
				4203	if (pin)
				4204	pin_down_extent(cache, start, len, 1);
				4205	else {
				4206	if (btrfs_test_opt(fs_info, DISCARD))
				4207	ret = btrfs_discard_extent(fs_info, start, len, NULL);
				4208	btrfs_add_free_space(cache, start, len);
				4209	btrfs_free_reserved_bytes(cache, len, delalloc);
				4210	trace_btrfs_reserved_extent_free(fs_info, start, len);
				4211	}
				4212
				4213	btrfs_put_block_group(cache);
				4214	return ret;
				4215	}
				4216
				4217	int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
				4218	u64 start, u64 len, int delalloc)
				4219	{
				4220	return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
				4221	}
				4222
				4223	int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
				4224	u64 start, u64 len)
				4225	{
				4226	return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
				4227	}
				4228
				4229	static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				4230	u64 parent, u64 root_objectid,
				4231	u64 flags, u64 owner, u64 offset,
				4232	struct btrfs_key *ins, int ref_mod)
				4233	{
				4234	struct btrfs_fs_info *fs_info = trans->fs_info;
				4235	int ret;
				4236	struct btrfs_extent_item *extent_item;
				4237	struct btrfs_extent_inline_ref *iref;
				4238	struct btrfs_path *path;
				4239	struct extent_buffer *leaf;
				4240	int type;
				4241	u32 size;
				4242
				4243	if (parent > 0)
				4244	type = BTRFS_SHARED_DATA_REF_KEY;
				4245	else
				4246	type = BTRFS_EXTENT_DATA_REF_KEY;
				4247
				4248	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
				4249
				4250	path = btrfs_alloc_path();
				4251	if (!path)
				4252	return -ENOMEM;
				4253
				4254	path->leave_spinning = 1;
				4255	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
				4256	ins, size);
				4257	if (ret) {
				4258	btrfs_free_path(path);
				4259	return ret;
				4260	}
				4261
				4262	leaf = path->nodes[0];
				4263	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				4264	struct btrfs_extent_item);
				4265	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
				4266	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
				4267	btrfs_set_extent_flags(leaf, extent_item,
				4268	flags \| BTRFS_EXTENT_FLAG_DATA);
				4269
				4270	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
				4271	btrfs_set_extent_inline_ref_type(leaf, iref, type);
				4272	if (parent > 0) {
				4273	struct btrfs_shared_data_ref *ref;
				4274	ref = (struct btrfs_shared_data_ref *)(iref + 1);
				4275	btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
				4276	btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
				4277	} else {
				4278	struct btrfs_extent_data_ref *ref;
				4279	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
				4280	btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
				4281	btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
				4282	btrfs_set_extent_data_ref_offset(leaf, ref, offset);
				4283	btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
				4284	}
				4285
				4286	btrfs_mark_buffer_dirty(path->nodes[0]);
				4287	btrfs_free_path(path);
				4288
				4289	ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
				4290	if (ret)
				4291	return ret;
				4292
				4293	ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1);
				4294	if (ret) { /* -ENOENT, logic error */
				4295	btrfs_err(fs_info, "update block group failed for %llu %llu",
				4296	ins->objectid, ins->offset);
				4297	BUG();
				4298	}
				4299	trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
				4300	return ret;
				4301	}
				4302
				4303	static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				4304	struct btrfs_delayed_ref_node *node,
				4305	struct btrfs_delayed_extent_op *extent_op)
				4306	{
				4307	struct btrfs_fs_info *fs_info = trans->fs_info;
				4308	int ret;
				4309	struct btrfs_extent_item *extent_item;
				4310	struct btrfs_key extent_key;
				4311	struct btrfs_tree_block_info *block_info;
				4312	struct btrfs_extent_inline_ref *iref;
				4313	struct btrfs_path *path;
				4314	struct extent_buffer *leaf;
				4315	struct btrfs_delayed_tree_ref *ref;
				4316	u32 size = sizeof(extent_item) + sizeof(iref);
				4317	u64 num_bytes;
				4318	u64 flags = extent_op->flags_to_set;
				4319	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
				4320
				4321	ref = btrfs_delayed_node_to_tree_ref(node);
				4322
				4323	extent_key.objectid = node->bytenr;
				4324	if (skinny_metadata) {
				4325	extent_key.offset = ref->level;
				4326	extent_key.type = BTRFS_METADATA_ITEM_KEY;
				4327	num_bytes = fs_info->nodesize;
				4328	} else {
				4329	extent_key.offset = node->num_bytes;
				4330	extent_key.type = BTRFS_EXTENT_ITEM_KEY;
				4331	size += sizeof(*block_info);
				4332	num_bytes = node->num_bytes;
				4333	}
				4334
				4335	path = btrfs_alloc_path();
				4336	if (!path)
				4337	return -ENOMEM;
				4338
				4339	path->leave_spinning = 1;
				4340	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
				4341	&extent_key, size);
				4342	if (ret) {
				4343	btrfs_free_path(path);
				4344	return ret;
				4345	}
				4346
				4347	leaf = path->nodes[0];
				4348	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				4349	struct btrfs_extent_item);
				4350	btrfs_set_extent_refs(leaf, extent_item, 1);
				4351	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
				4352	btrfs_set_extent_flags(leaf, extent_item,
				4353	flags \| BTRFS_EXTENT_FLAG_TREE_BLOCK);
				4354
				4355	if (skinny_metadata) {
				4356	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
				4357	} else {
				4358	block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
				4359	btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
				4360	btrfs_set_tree_block_level(leaf, block_info, ref->level);
				4361	iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
				4362	}
				4363
				4364	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
				4365	BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
				4366	btrfs_set_extent_inline_ref_type(leaf, iref,
				4367	BTRFS_SHARED_BLOCK_REF_KEY);
				4368	btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
				4369	} else {
				4370	btrfs_set_extent_inline_ref_type(leaf, iref,
				4371	BTRFS_TREE_BLOCK_REF_KEY);
				4372	btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
				4373	}
				4374
				4375	btrfs_mark_buffer_dirty(leaf);
				4376	btrfs_free_path(path);
				4377
				4378	ret = remove_from_free_space_tree(trans, extent_key.objectid,
				4379	num_bytes);
				4380	if (ret)
				4381	return ret;
				4382
				4383	ret = btrfs_update_block_group(trans, extent_key.objectid,
				4384	fs_info->nodesize, 1);
				4385	if (ret) { /* -ENOENT, logic error */
				4386	btrfs_err(fs_info, "update block group failed for %llu %llu",
				4387	extent_key.objectid, extent_key.offset);
				4388	BUG();
				4389	}
				4390
				4391	trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
				4392	fs_info->nodesize);
				4393	return ret;
				4394	}
				4395
				4396	int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				4397	struct btrfs_root *root, u64 owner,
				4398	u64 offset, u64 ram_bytes,
				4399	struct btrfs_key *ins)
				4400	{
				4401	struct btrfs_ref generic_ref = { 0 };
				4402	int ret;
				4403
				4404	BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
				4405
				4406	btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
				4407	ins->objectid, ins->offset, 0);
				4408	btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
				4409	btrfs_ref_tree_mod(root->fs_info, &generic_ref);
				4410	ret = btrfs_add_delayed_data_ref(trans, &generic_ref,
				4411	ram_bytes, NULL, NULL);
				4412	return ret;
				4413	}
				4414
				4415	/*
				4416	* this is used by the tree logging recovery code. It records that
				4417	* an extent has been allocated and makes sure to clear the free
				4418	* space cache bits as well
				4419	*/
				4420	int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
				4421	u64 root_objectid, u64 owner, u64 offset,
				4422	struct btrfs_key *ins)
				4423	{
				4424	struct btrfs_fs_info *fs_info = trans->fs_info;
				4425	int ret;
				4426	struct btrfs_block_group_cache *block_group;
				4427	struct btrfs_space_info *space_info;
				4428
				4429	/*
				4430	* Mixed block groups will exclude before processing the log so we only
				4431	* need to do the exclude dance if this fs isn't mixed.
				4432	*/
				4433	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
				4434	ret = __exclude_logged_extent(fs_info, ins->objectid,
				4435	ins->offset);
				4436	if (ret)
				4437	return ret;
				4438	}
				4439
				4440	block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
				4441	if (!block_group)
				4442	return -EINVAL;
				4443
				4444	space_info = block_group->space_info;
				4445	spin_lock(&space_info->lock);
				4446	spin_lock(&block_group->lock);
				4447	space_info->bytes_reserved += ins->offset;
				4448	block_group->reserved += ins->offset;
				4449	spin_unlock(&block_group->lock);
				4450	spin_unlock(&space_info->lock);
				4451
				4452	ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
				4453	offset, ins, 1);
				4454	if (ret)
				4455	btrfs_pin_extent(fs_info, ins->objectid, ins->offset, 1);
				4456	btrfs_put_block_group(block_group);
				4457	return ret;
				4458	}
				4459
				4460	static struct extent_buffer *
				4461	btrfs_init_new_buffer(struct btrfs_trans_handle trans, struct btrfs_root root,
				4462	u64 bytenr, int level, u64 owner)
				4463	{
				4464	struct btrfs_fs_info *fs_info = root->fs_info;
				4465	struct extent_buffer *buf;
				4466
				4467	buf = btrfs_find_create_tree_block(fs_info, bytenr);
				4468	if (IS_ERR(buf))
				4469	return buf;
				4470
				4471	/*
				4472	* Extra safety check in case the extent tree is corrupted and extent
				4473	* allocator chooses to use a tree block which is already used and
				4474	* locked.
				4475	*/
				4476	if (buf->lock_owner == current->pid) {
				4477	btrfs_err_rl(fs_info,
				4478	"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
				4479	buf->start, btrfs_header_owner(buf), current->pid);
				4480	free_extent_buffer(buf);
				4481	return ERR_PTR(-EUCLEAN);
				4482	}
				4483
				4484	btrfs_set_buffer_lockdep_class(owner, buf, level);
				4485	btrfs_tree_lock(buf);
				4486	btrfs_clean_tree_block(buf);
				4487	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
				4488
				4489	btrfs_set_lock_blocking_write(buf);
				4490	set_extent_buffer_uptodate(buf);
				4491
				4492	memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
				4493	btrfs_set_header_level(buf, level);
				4494	btrfs_set_header_bytenr(buf, buf->start);
				4495	btrfs_set_header_generation(buf, trans->transid);
				4496	btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
				4497	btrfs_set_header_owner(buf, owner);
				4498	write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
				4499	write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
				4500	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
				4501	buf->log_index = root->log_transid % 2;
				4502	/*
				4503	* we allow two log transactions at a time, use different
				4504	* EXTENT bit to differentiate dirty pages.
				4505	*/
				4506	if (buf->log_index == 0)
				4507	set_extent_dirty(&root->dirty_log_pages, buf->start,
				4508	buf->start + buf->len - 1, GFP_NOFS);
				4509	else
				4510	set_extent_new(&root->dirty_log_pages, buf->start,
				4511	buf->start + buf->len - 1);
				4512	} else {
				4513	buf->log_index = -1;
				4514	set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
				4515	buf->start + buf->len - 1, GFP_NOFS);
				4516	}
				4517	trans->dirty = true;
				4518	/* this returns a buffer locked for blocking */
				4519	return buf;
				4520	}
				4521
				4522	/*
				4523	* finds a free extent and does all the dirty work required for allocation
				4524	* returns the tree buffer or an ERR_PTR on error.
				4525	*/
				4526	struct extent_buffer btrfs_alloc_tree_block(struct btrfs_trans_handle trans,
				4527	struct btrfs_root *root,
				4528	u64 parent, u64 root_objectid,
				4529	const struct btrfs_disk_key *key,
				4530	int level, u64 hint,
				4531	u64 empty_size)
				4532	{
				4533	struct btrfs_fs_info *fs_info = root->fs_info;
				4534	struct btrfs_key ins;
				4535	struct btrfs_block_rsv *block_rsv;
				4536	struct extent_buffer *buf;
				4537	struct btrfs_delayed_extent_op *extent_op;
				4538	struct btrfs_ref generic_ref = { 0 };
				4539	u64 flags = 0;
				4540	int ret;
				4541	u32 blocksize = fs_info->nodesize;
				4542	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
				4543
				4544	#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
				4545	if (btrfs_is_testing(fs_info)) {
				4546	buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
				4547	level, root_objectid);
				4548	if (!IS_ERR(buf))
				4549	root->alloc_bytenr += blocksize;
				4550	return buf;
				4551	}
				4552	#endif
				4553
				4554	block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
				4555	if (IS_ERR(block_rsv))
				4556	return ERR_CAST(block_rsv);
				4557
				4558	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
				4559	empty_size, hint, &ins, 0, 0);
				4560	if (ret)
				4561	goto out_unuse;
				4562
				4563	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
				4564	root_objectid);
				4565	if (IS_ERR(buf)) {
				4566	ret = PTR_ERR(buf);
				4567	goto out_free_reserved;
				4568	}
				4569
				4570	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
				4571	if (parent == 0)
				4572	parent = ins.objectid;
				4573	flags \|= BTRFS_BLOCK_FLAG_FULL_BACKREF;
				4574	} else
				4575	BUG_ON(parent > 0);
				4576
				4577	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
				4578	extent_op = btrfs_alloc_delayed_extent_op();
				4579	if (!extent_op) {
				4580	ret = -ENOMEM;
				4581	goto out_free_buf;
				4582	}
				4583	if (key)
				4584	memcpy(&extent_op->key, key, sizeof(extent_op->key));
				4585	else
				4586	memset(&extent_op->key, 0, sizeof(extent_op->key));
				4587	extent_op->flags_to_set = flags;
				4588	extent_op->update_key = skinny_metadata ? false : true;
				4589	extent_op->update_flags = true;
				4590	extent_op->is_data = false;
				4591	extent_op->level = level;
				4592
				4593	btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
				4594	ins.objectid, ins.offset, parent);
				4595	generic_ref.real_root = root->root_key.objectid;
				4596	btrfs_init_tree_ref(&generic_ref, level, root_objectid);
				4597	btrfs_ref_tree_mod(fs_info, &generic_ref);
				4598	ret = btrfs_add_delayed_tree_ref(trans, &generic_ref,
				4599	extent_op, NULL, NULL);
				4600	if (ret)
				4601	goto out_free_delayed;
				4602	}
				4603	return buf;
				4604
				4605	out_free_delayed:
				4606	btrfs_free_delayed_extent_op(extent_op);
				4607	out_free_buf:
				4608	btrfs_tree_unlock(buf);
				4609	free_extent_buffer(buf);
				4610	out_free_reserved:
				4611	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
				4612	out_unuse:
				4613	btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
				4614	return ERR_PTR(ret);
				4615	}
				4616
				4617	struct walk_control {
				4618	u64 refs[BTRFS_MAX_LEVEL];
				4619	u64 flags[BTRFS_MAX_LEVEL];
				4620	struct btrfs_key update_progress;
				4621	struct btrfs_key drop_progress;
				4622	int drop_level;
				4623	int stage;
				4624	int level;
				4625	int shared_level;
				4626	int update_ref;
				4627	int keep_locks;
				4628	int reada_slot;
				4629	int reada_count;
				4630	int restarted;
				4631	};
				4632
				4633	#define DROP_REFERENCE 1
				4634	#define UPDATE_BACKREF 2
				4635
				4636	static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
				4637	struct btrfs_root *root,
				4638	struct walk_control *wc,
				4639	struct btrfs_path *path)
				4640	{
				4641	struct btrfs_fs_info *fs_info = root->fs_info;
				4642	u64 bytenr;
				4643	u64 generation;
				4644	u64 refs;
				4645	u64 flags;
				4646	u32 nritems;
				4647	struct btrfs_key key;
				4648	struct extent_buffer *eb;
				4649	int ret;
				4650	int slot;
				4651	int nread = 0;
				4652
				4653	if (path->slots[wc->level] < wc->reada_slot) {
				4654	wc->reada_count = wc->reada_count * 2 / 3;
				4655	wc->reada_count = max(wc->reada_count, 2);
				4656	} else {
				4657	wc->reada_count = wc->reada_count * 3 / 2;
				4658	wc->reada_count = min_t(int, wc->reada_count,
				4659	BTRFS_NODEPTRS_PER_BLOCK(fs_info));
				4660	}
				4661
				4662	eb = path->nodes[wc->level];
				4663	nritems = btrfs_header_nritems(eb);
				4664
				4665	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
				4666	if (nread >= wc->reada_count)
				4667	break;
				4668
				4669	cond_resched();
				4670	bytenr = btrfs_node_blockptr(eb, slot);
				4671	generation = btrfs_node_ptr_generation(eb, slot);
				4672
				4673	if (slot == path->slots[wc->level])
				4674	goto reada;
				4675
				4676	if (wc->stage == UPDATE_BACKREF &&
				4677	generation <= root->root_key.offset)
				4678	continue;
				4679
				4680	/* We don't lock the tree block, it's OK to be racy here */
				4681	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
				4682	wc->level - 1, 1, &refs,
				4683	&flags);
				4684	/* We don't care about errors in readahead. */
				4685	if (ret < 0)
				4686	continue;
				4687
				4688	/*
				4689	* This could be racey, it's conceivable that we raced and end
				4690	* up with a bogus refs count, if that's the case just skip, if
				4691	* we are actually corrupt we will notice when we look up
				4692	* everything again with our locks.
				4693	*/
				4694	if (refs == 0)
				4695	continue;
				4696
				4697	if (wc->stage == DROP_REFERENCE) {
				4698	if (refs == 1)
				4699	goto reada;
				4700
				4701	if (wc->level == 1 &&
				4702	(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				4703	continue;
				4704	if (!wc->update_ref \|\|
				4705	generation <= root->root_key.offset)
				4706	continue;
				4707	btrfs_node_key_to_cpu(eb, &key, slot);
				4708	ret = btrfs_comp_cpu_keys(&key,
				4709	&wc->update_progress);
				4710	if (ret < 0)
				4711	continue;
				4712	} else {
				4713	if (wc->level == 1 &&
				4714	(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				4715	continue;
				4716	}
				4717	reada:
				4718	readahead_tree_block(fs_info, bytenr);
				4719	nread++;
				4720	}
				4721	wc->reada_slot = slot;
				4722	}
				4723
				4724	/*
				4725	* helper to process tree block while walking down the tree.
				4726	*
				4727	* when wc->stage == UPDATE_BACKREF, this function updates
				4728	* back refs for pointers in the block.
				4729	*
				4730	* NOTE: return value 1 means we should stop walking down.
				4731	*/
				4732	static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
				4733	struct btrfs_root *root,
				4734	struct btrfs_path *path,
				4735	struct walk_control *wc, int lookup_info)
				4736	{
				4737	struct btrfs_fs_info *fs_info = root->fs_info;
				4738	int level = wc->level;
				4739	struct extent_buffer *eb = path->nodes[level];
				4740	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
				4741	int ret;
				4742
				4743	if (wc->stage == UPDATE_BACKREF &&
				4744	btrfs_header_owner(eb) != root->root_key.objectid)
				4745	return 1;
				4746
				4747	/*
				4748	* when reference count of tree block is 1, it won't increase
				4749	* again. once full backref flag is set, we never clear it.
				4750	*/
				4751	if (lookup_info &&
				4752	((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) \|\|
				4753	(wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
				4754	ASSERT(path->locks[level]);
				4755	ret = btrfs_lookup_extent_info(trans, fs_info,
				4756	eb->start, level, 1,
				4757	&wc->refs[level],
				4758	&wc->flags[level]);
				4759	BUG_ON(ret == -ENOMEM);
				4760	if (ret)
				4761	return ret;
				4762	if (unlikely(wc->refs[level] == 0)) {
				4763	btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
				4764	eb->start);
				4765	return -EUCLEAN;
				4766	}
				4767	}
				4768
				4769	if (wc->stage == DROP_REFERENCE) {
				4770	if (wc->refs[level] > 1)
				4771	return 1;
				4772
				4773	if (path->locks[level] && !wc->keep_locks) {
				4774	btrfs_tree_unlock_rw(eb, path->locks[level]);
				4775	path->locks[level] = 0;
				4776	}
				4777	return 0;
				4778	}
				4779
				4780	/* wc->stage == UPDATE_BACKREF */
				4781	if (!(wc->flags[level] & flag)) {
				4782	ASSERT(path->locks[level]);
				4783	ret = btrfs_inc_ref(trans, root, eb, 1);
				4784	BUG_ON(ret); /* -ENOMEM */
				4785	ret = btrfs_dec_ref(trans, root, eb, 0);
				4786	BUG_ON(ret); /* -ENOMEM */
				4787	ret = btrfs_set_disk_extent_flags(trans, eb->start,
				4788	eb->len, flag,
				4789	btrfs_header_level(eb), 0);
				4790	BUG_ON(ret); /* -ENOMEM */
				4791	wc->flags[level] \|= flag;
				4792	}
				4793
				4794	/*
				4795	* the block is shared by multiple trees, so it's not good to
				4796	* keep the tree lock
				4797	*/
				4798	if (path->locks[level] && level > 0) {
				4799	btrfs_tree_unlock_rw(eb, path->locks[level]);
				4800	path->locks[level] = 0;
				4801	}
				4802	return 0;
				4803	}
				4804
				4805	/*
				4806	* This is used to verify a ref exists for this root to deal with a bug where we
				4807	* would have a drop_progress key that hadn't been updated properly.
				4808	*/
				4809	static int check_ref_exists(struct btrfs_trans_handle *trans,
				4810	struct btrfs_root *root, u64 bytenr, u64 parent,
				4811	int level)
				4812	{
				4813	struct btrfs_path *path;
				4814	struct btrfs_extent_inline_ref *iref;
				4815	int ret;
				4816
				4817	path = btrfs_alloc_path();
				4818	if (!path)
				4819	return -ENOMEM;
				4820
				4821	ret = lookup_extent_backref(trans, path, &iref, bytenr,
				4822	root->fs_info->nodesize, parent,
				4823	root->root_key.objectid, level, 0);
				4824	btrfs_free_path(path);
				4825	if (ret == -ENOENT)
				4826	return 0;
				4827	if (ret < 0)
				4828	return ret;
				4829	return 1;
				4830	}
				4831
				4832	/*
				4833	* helper to process tree block pointer.
				4834	*
				4835	* when wc->stage == DROP_REFERENCE, this function checks
				4836	* reference count of the block pointed to. if the block
				4837	* is shared and we need update back refs for the subtree
				4838	* rooted at the block, this function changes wc->stage to
				4839	* UPDATE_BACKREF. if the block is shared and there is no
				4840	* need to update back, this function drops the reference
				4841	* to the block.
				4842	*
				4843	* NOTE: return value 1 means we should stop walking down.
				4844	*/
				4845	static noinline int do_walk_down(struct btrfs_trans_handle *trans,
				4846	struct btrfs_root *root,
				4847	struct btrfs_path *path,
				4848	struct walk_control wc, int lookup_info)
				4849	{
				4850	struct btrfs_fs_info *fs_info = root->fs_info;
				4851	u64 bytenr;
				4852	u64 generation;
				4853	u64 parent;
				4854	struct btrfs_key key;
				4855	struct btrfs_key first_key;
				4856	struct btrfs_ref ref = { 0 };
				4857	struct extent_buffer *next;
				4858	int level = wc->level;
				4859	int reada = 0;
				4860	int ret = 0;
				4861	bool need_account = false;
				4862
				4863	generation = btrfs_node_ptr_generation(path->nodes[level],
				4864	path->slots[level]);
				4865	/*
				4866	* if the lower level block was created before the snapshot
				4867	* was created, we know there is no need to update back refs
				4868	* for the subtree
				4869	*/
				4870	if (wc->stage == UPDATE_BACKREF &&
				4871	generation <= root->root_key.offset) {
				4872	*lookup_info = 1;
				4873	return 1;
				4874	}
				4875
				4876	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
				4877	btrfs_node_key_to_cpu(path->nodes[level], &first_key,
				4878	path->slots[level]);
				4879
				4880	next = find_extent_buffer(fs_info, bytenr);
				4881	if (!next) {
				4882	next = btrfs_find_create_tree_block(fs_info, bytenr);
				4883	if (IS_ERR(next))
				4884	return PTR_ERR(next);
				4885
				4886	btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
				4887	level - 1);
				4888	reada = 1;
				4889	}
				4890	btrfs_tree_lock(next);
				4891	btrfs_set_lock_blocking_write(next);
				4892
				4893	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
				4894	&wc->refs[level - 1],
				4895	&wc->flags[level - 1]);
				4896	if (ret < 0)
				4897	goto out_unlock;
				4898
				4899	if (unlikely(wc->refs[level - 1] == 0)) {
				4900	btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
				4901	bytenr);
				4902	ret = -EUCLEAN;
				4903	goto out_unlock;
				4904	}
				4905	*lookup_info = 0;
				4906
				4907	if (wc->stage == DROP_REFERENCE) {
				4908	if (wc->refs[level - 1] > 1) {
				4909	need_account = true;
				4910	if (level == 1 &&
				4911	(wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				4912	goto skip;
				4913
				4914	if (!wc->update_ref \|\|
				4915	generation <= root->root_key.offset)
				4916	goto skip;
				4917
				4918	btrfs_node_key_to_cpu(path->nodes[level], &key,
				4919	path->slots[level]);
				4920	ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
				4921	if (ret < 0)
				4922	goto skip;
				4923
				4924	wc->stage = UPDATE_BACKREF;
				4925	wc->shared_level = level - 1;
				4926	}
				4927	} else {
				4928	if (level == 1 &&
				4929	(wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				4930	goto skip;
				4931	}
				4932
				4933	if (!btrfs_buffer_uptodate(next, generation, 0)) {
				4934	btrfs_tree_unlock(next);
				4935	free_extent_buffer(next);
				4936	next = NULL;
				4937	*lookup_info = 1;
				4938	}
				4939
				4940	if (!next) {
				4941	if (reada && level == 1)
				4942	reada_walk_down(trans, root, wc, path);
				4943	next = read_tree_block(fs_info, bytenr, generation, level - 1,
				4944	&first_key);
				4945	if (IS_ERR(next)) {
				4946	return PTR_ERR(next);
				4947	} else if (!extent_buffer_uptodate(next)) {
				4948	free_extent_buffer(next);
				4949	return -EIO;
				4950	}
				4951	btrfs_tree_lock(next);
				4952	btrfs_set_lock_blocking_write(next);
				4953	}
				4954
				4955	level--;
				4956	ASSERT(level == btrfs_header_level(next));
				4957	if (level != btrfs_header_level(next)) {
				4958	btrfs_err(root->fs_info, "mismatched level");
				4959	ret = -EIO;
				4960	goto out_unlock;
				4961	}
				4962	path->nodes[level] = next;
				4963	path->slots[level] = 0;
				4964	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
				4965	wc->level = level;
				4966	if (wc->level == 1)
				4967	wc->reada_slot = 0;
				4968	return 0;
				4969	skip:
				4970	wc->refs[level - 1] = 0;
				4971	wc->flags[level - 1] = 0;
				4972	if (wc->stage == DROP_REFERENCE) {
				4973	if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
				4974	parent = path->nodes[level]->start;
				4975	} else {
				4976	ASSERT(root->root_key.objectid ==
				4977	btrfs_header_owner(path->nodes[level]));
				4978	if (root->root_key.objectid !=
				4979	btrfs_header_owner(path->nodes[level])) {
				4980	btrfs_err(root->fs_info,
				4981	"mismatched block owner");
				4982	ret = -EIO;
				4983	goto out_unlock;
				4984	}
				4985	parent = 0;
				4986	}
				4987
				4988	/*
				4989	* If we had a drop_progress we need to verify the refs are set
				4990	* as expected. If we find our ref then we know that from here
				4991	* on out everything should be correct, and we can clear the
				4992	* ->restarted flag.
				4993	*/
				4994	if (wc->restarted) {
				4995	ret = check_ref_exists(trans, root, bytenr, parent,
				4996	level - 1);
				4997	if (ret < 0)
				4998	goto out_unlock;
				4999	if (ret == 0)
				5000	goto no_delete;
				5001	ret = 0;
				5002	wc->restarted = 0;
				5003	}
				5004
				5005	/*
				5006	* Reloc tree doesn't contribute to qgroup numbers, and we have
				5007	* already accounted them at merge time (replace_path),
				5008	* thus we could skip expensive subtree trace here.
				5009	*/
				5010	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
				5011	need_account) {
				5012	ret = btrfs_qgroup_trace_subtree(trans, next,
				5013	generation, level - 1);
				5014	if (ret) {
				5015	btrfs_err_rl(fs_info,
				5016	"Error %d accounting shared subtree. Quota is out of sync, rescan required.",
				5017	ret);
				5018	}
				5019	}
				5020
				5021	/*
				5022	* We need to update the next key in our walk control so we can
				5023	* update the drop_progress key accordingly. We don't care if
				5024	* find_next_key doesn't find a key because that means we're at
				5025	* the end and are going to clean up now.
				5026	*/
				5027	wc->drop_level = level;
				5028	find_next_key(path, level, &wc->drop_progress);
				5029
				5030	btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
				5031	fs_info->nodesize, parent);
				5032	btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
				5033	ret = btrfs_free_extent(trans, &ref);
				5034	if (ret)
				5035	goto out_unlock;
				5036	}
				5037	no_delete:
				5038	*lookup_info = 1;
				5039	ret = 1;
				5040
				5041	out_unlock:
				5042	btrfs_tree_unlock(next);
				5043	free_extent_buffer(next);
				5044
				5045	return ret;
				5046	}
				5047
				5048	/*
				5049	* helper to process tree block while walking up the tree.
				5050	*
				5051	* when wc->stage == DROP_REFERENCE, this function drops
				5052	* reference count on the block.
				5053	*
				5054	* when wc->stage == UPDATE_BACKREF, this function changes
				5055	* wc->stage back to DROP_REFERENCE if we changed wc->stage
				5056	* to UPDATE_BACKREF previously while processing the block.
				5057	*
				5058	* NOTE: return value 1 means we should stop walking up.
				5059	*/
				5060	static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
				5061	struct btrfs_root *root,
				5062	struct btrfs_path *path,
				5063	struct walk_control *wc)
				5064	{
				5065	struct btrfs_fs_info *fs_info = root->fs_info;
				5066	int ret;
				5067	int level = wc->level;
				5068	struct extent_buffer *eb = path->nodes[level];
				5069	u64 parent = 0;
				5070
				5071	if (wc->stage == UPDATE_BACKREF) {
				5072	BUG_ON(wc->shared_level < level);
				5073	if (level < wc->shared_level)
				5074	goto out;
				5075
				5076	ret = find_next_key(path, level + 1, &wc->update_progress);
				5077	if (ret > 0)
				5078	wc->update_ref = 0;
				5079
				5080	wc->stage = DROP_REFERENCE;
				5081	wc->shared_level = -1;
				5082	path->slots[level] = 0;
				5083
				5084	/*
				5085	* check reference count again if the block isn't locked.
				5086	* we should start walking down the tree again if reference
				5087	* count is one.
				5088	*/
				5089	if (!path->locks[level]) {
				5090	BUG_ON(level == 0);
				5091	btrfs_tree_lock(eb);
				5092	btrfs_set_lock_blocking_write(eb);
				5093	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
				5094
				5095	ret = btrfs_lookup_extent_info(trans, fs_info,
				5096	eb->start, level, 1,
				5097	&wc->refs[level],
				5098	&wc->flags[level]);
				5099	if (ret < 0) {
				5100	btrfs_tree_unlock_rw(eb, path->locks[level]);
				5101	path->locks[level] = 0;
				5102	return ret;
				5103	}
				5104	if (unlikely(wc->refs[level] == 0)) {
				5105	btrfs_tree_unlock_rw(eb, path->locks[level]);
				5106	btrfs_err(fs_info, "bytenr %llu has 0 references, expect > 0",
				5107	eb->start);
				5108	return -EUCLEAN;
				5109	}
				5110	if (wc->refs[level] == 1) {
				5111	btrfs_tree_unlock_rw(eb, path->locks[level]);
				5112	path->locks[level] = 0;
				5113	return 1;
				5114	}
				5115	}
				5116	}
				5117
				5118	/* wc->stage == DROP_REFERENCE */
				5119	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
				5120
				5121	if (wc->refs[level] == 1) {
				5122	if (level == 0) {
				5123	if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
				5124	ret = btrfs_dec_ref(trans, root, eb, 1);
				5125	else
				5126	ret = btrfs_dec_ref(trans, root, eb, 0);
				5127	BUG_ON(ret); /* -ENOMEM */
				5128	if (is_fstree(root->root_key.objectid)) {
				5129	ret = btrfs_qgroup_trace_leaf_items(trans, eb);
				5130	if (ret) {
				5131	btrfs_err_rl(fs_info,
				5132	"error %d accounting leaf items, quota is out of sync, rescan required",
				5133	ret);
				5134	}
				5135	}
				5136	}
				5137	/* make block locked assertion in btrfs_clean_tree_block happy */
				5138	if (!path->locks[level] &&
				5139	btrfs_header_generation(eb) == trans->transid) {
				5140	btrfs_tree_lock(eb);
				5141	btrfs_set_lock_blocking_write(eb);
				5142	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
				5143	}
				5144	btrfs_clean_tree_block(eb);
				5145	}
				5146
				5147	if (eb == root->node) {
				5148	if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
				5149	parent = eb->start;
				5150	else if (root->root_key.objectid != btrfs_header_owner(eb))
				5151	goto owner_mismatch;
				5152	} else {
				5153	if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
				5154	parent = path->nodes[level + 1]->start;
				5155	else if (root->root_key.objectid !=
				5156	btrfs_header_owner(path->nodes[level + 1]))
				5157	goto owner_mismatch;
				5158	}
				5159
				5160	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
				5161	out:
				5162	wc->refs[level] = 0;
				5163	wc->flags[level] = 0;
				5164	return 0;
				5165
				5166	owner_mismatch:
				5167	btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
				5168	btrfs_header_owner(eb), root->root_key.objectid);
				5169	return -EUCLEAN;
				5170	}
				5171
				5172	static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
				5173	struct btrfs_root *root,
				5174	struct btrfs_path *path,
				5175	struct walk_control *wc)
				5176	{
				5177	int level = wc->level;
				5178	int lookup_info = 1;
				5179	int ret;
				5180
				5181	while (level >= 0) {
				5182	ret = walk_down_proc(trans, root, path, wc, lookup_info);
				5183	if (ret > 0)
				5184	break;
				5185
				5186	if (level == 0)
				5187	break;
				5188
				5189	if (path->slots[level] >=
				5190	btrfs_header_nritems(path->nodes[level]))
				5191	break;
				5192
				5193	ret = do_walk_down(trans, root, path, wc, &lookup_info);
				5194	if (ret > 0) {
				5195	path->slots[level]++;
				5196	continue;
				5197	} else if (ret < 0)
				5198	return ret;
				5199	level = wc->level;
				5200	}
				5201	return 0;
				5202	}
				5203
				5204	static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
				5205	struct btrfs_root *root,
				5206	struct btrfs_path *path,
				5207	struct walk_control *wc, int max_level)
				5208	{
				5209	int level = wc->level;
				5210	int ret;
				5211
				5212	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
				5213	while (level < max_level && path->nodes[level]) {
				5214	wc->level = level;
				5215	if (path->slots[level] + 1 <
				5216	btrfs_header_nritems(path->nodes[level])) {
				5217	path->slots[level]++;
				5218	return 0;
				5219	} else {
				5220	ret = walk_up_proc(trans, root, path, wc);
				5221	if (ret > 0)
				5222	return 0;
				5223	if (ret < 0)
				5224	return ret;
				5225
				5226	if (path->locks[level]) {
				5227	btrfs_tree_unlock_rw(path->nodes[level],
				5228	path->locks[level]);
				5229	path->locks[level] = 0;
				5230	}
				5231	free_extent_buffer(path->nodes[level]);
				5232	path->nodes[level] = NULL;
				5233	level++;
				5234	}
				5235	}
				5236	return 1;
				5237	}
				5238
				5239	/*
				5240	* drop a subvolume tree.
				5241	*
				5242	* this function traverses the tree freeing any blocks that only
				5243	* referenced by the tree.
				5244	*
				5245	* when a shared tree block is found. this function decreases its
				5246	* reference count by one. if update_ref is true, this function
				5247	* also make sure backrefs for the shared block and all lower level
				5248	* blocks are properly updated.
				5249	*
				5250	* If called with for_reloc == 0, may exit early with -EAGAIN
				5251	*/
				5252	int btrfs_drop_snapshot(struct btrfs_root *root,
				5253	struct btrfs_block_rsv *block_rsv, int update_ref,
				5254	int for_reloc)
				5255	{
				5256	struct btrfs_fs_info *fs_info = root->fs_info;
				5257	struct btrfs_path *path;
				5258	struct btrfs_trans_handle *trans;
				5259	struct btrfs_root *tree_root = fs_info->tree_root;
				5260	struct btrfs_root_item *root_item = &root->root_item;
				5261	struct walk_control *wc;
				5262	struct btrfs_key key;
				5263	int err = 0;
				5264	int ret;
				5265	int level;
				5266	bool root_dropped = false;
				5267
				5268	btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
				5269
				5270	path = btrfs_alloc_path();
				5271	if (!path) {
				5272	err = -ENOMEM;
				5273	goto out;
				5274	}
				5275
				5276	wc = kzalloc(sizeof(*wc), GFP_NOFS);
				5277	if (!wc) {
				5278	btrfs_free_path(path);
				5279	err = -ENOMEM;
				5280	goto out;
				5281	}
				5282
				5283	/*
				5284	* Use join to avoid potential EINTR from transaction start. See
				5285	* wait_reserve_ticket and the whole reservation callchain.
				5286	*/
				5287	if (for_reloc)
				5288	trans = btrfs_join_transaction(tree_root);
				5289	else
				5290	trans = btrfs_start_transaction(tree_root, 0);
				5291	if (IS_ERR(trans)) {
				5292	err = PTR_ERR(trans);
				5293	goto out_free;
				5294	}
				5295
				5296	err = btrfs_run_delayed_items(trans);
				5297	if (err)
				5298	goto out_end_trans;
				5299
				5300	if (block_rsv)
				5301	trans->block_rsv = block_rsv;
				5302
				5303	/*
				5304	* This will help us catch people modifying the fs tree while we're
				5305	* dropping it. It is unsafe to mess with the fs tree while it's being
				5306	* dropped as we unlock the root node and parent nodes as we walk down
				5307	* the tree, assuming nothing will change. If something does change
				5308	* then we'll have stale information and drop references to blocks we've
				5309	* already dropped.
				5310	*/
				5311	set_bit(BTRFS_ROOT_DELETING, &root->state);
				5312	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
				5313	level = btrfs_header_level(root->node);
				5314	path->nodes[level] = btrfs_lock_root_node(root);
				5315	btrfs_set_lock_blocking_write(path->nodes[level]);
				5316	path->slots[level] = 0;
				5317	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
				5318	memset(&wc->update_progress, 0,
				5319	sizeof(wc->update_progress));
				5320	} else {
				5321	btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
				5322	memcpy(&wc->update_progress, &key,
				5323	sizeof(wc->update_progress));
				5324
				5325	level = root_item->drop_level;
				5326	BUG_ON(level == 0);
				5327	path->lowest_level = level;
				5328	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				5329	path->lowest_level = 0;
				5330	if (ret < 0) {
				5331	err = ret;
				5332	goto out_end_trans;
				5333	}
				5334	WARN_ON(ret > 0);
				5335
				5336	/*
				5337	* unlock our path, this is safe because only this
				5338	* function is allowed to delete this snapshot
				5339	*/
				5340	btrfs_unlock_up_safe(path, 0);
				5341
				5342	level = btrfs_header_level(root->node);
				5343	while (1) {
				5344	btrfs_tree_lock(path->nodes[level]);
				5345	btrfs_set_lock_blocking_write(path->nodes[level]);
				5346	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
				5347
				5348	ret = btrfs_lookup_extent_info(trans, fs_info,
				5349	path->nodes[level]->start,
				5350	level, 1, &wc->refs[level],
				5351	&wc->flags[level]);
				5352	if (ret < 0) {
				5353	err = ret;
				5354	goto out_end_trans;
				5355	}
				5356	BUG_ON(wc->refs[level] == 0);
				5357
				5358	if (level == root_item->drop_level)
				5359	break;
				5360
				5361	btrfs_tree_unlock(path->nodes[level]);
				5362	path->locks[level] = 0;
				5363	WARN_ON(wc->refs[level] != 1);
				5364	level--;
				5365	}
				5366	}
				5367
				5368	wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
				5369	wc->level = level;
				5370	wc->shared_level = -1;
				5371	wc->stage = DROP_REFERENCE;
				5372	wc->update_ref = update_ref;
				5373	wc->keep_locks = 0;
				5374	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
				5375
				5376	while (1) {
				5377
				5378	ret = walk_down_tree(trans, root, path, wc);
				5379	if (ret < 0) {
				5380	err = ret;
				5381	break;
				5382	}
				5383
				5384	ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
				5385	if (ret < 0) {
				5386	err = ret;
				5387	break;
				5388	}
				5389
				5390	if (ret > 0) {
				5391	BUG_ON(wc->stage != DROP_REFERENCE);
				5392	break;
				5393	}
				5394
				5395	if (wc->stage == DROP_REFERENCE) {
				5396	wc->drop_level = wc->level;
				5397	btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
				5398	&wc->drop_progress,
				5399	path->slots[wc->drop_level]);
				5400	}
				5401	btrfs_cpu_key_to_disk(&root_item->drop_progress,
				5402	&wc->drop_progress);
				5403	root_item->drop_level = wc->drop_level;
				5404
				5405	BUG_ON(wc->level == 0);
				5406	if (btrfs_should_end_transaction(trans) \|\|
				5407	(!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
				5408	ret = btrfs_update_root(trans, tree_root,
				5409	&root->root_key,
				5410	root_item);
				5411	if (ret) {
				5412	btrfs_abort_transaction(trans, ret);
				5413	err = ret;
				5414	goto out_end_trans;
				5415	}
				5416
				5417	btrfs_end_transaction_throttle(trans);
				5418	if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
				5419	btrfs_debug(fs_info,
				5420	"drop snapshot early exit");
				5421	err = -EAGAIN;
				5422	goto out_free;
				5423	}
				5424
				5425	/*
				5426	* Use join to avoid potential EINTR from transaction
				5427	* start. See wait_reserve_ticket and the whole
				5428	* reservation callchain.
				5429	*/
				5430	if (for_reloc)
				5431	trans = btrfs_join_transaction(tree_root);
				5432	else
				5433	trans = btrfs_start_transaction(tree_root, 0);
				5434	if (IS_ERR(trans)) {
				5435	err = PTR_ERR(trans);
				5436	goto out_free;
				5437	}
				5438	if (block_rsv)
				5439	trans->block_rsv = block_rsv;
				5440	}
				5441	}
				5442	btrfs_release_path(path);
				5443	if (err)
				5444	goto out_end_trans;
				5445
				5446	ret = btrfs_del_root(trans, &root->root_key);
				5447	if (ret) {
				5448	btrfs_abort_transaction(trans, ret);
				5449	err = ret;
				5450	goto out_end_trans;
				5451	}
				5452
				5453	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
				5454	ret = btrfs_find_root(tree_root, &root->root_key, path,
				5455	NULL, NULL);
				5456	if (ret < 0) {
				5457	btrfs_abort_transaction(trans, ret);
				5458	err = ret;
				5459	goto out_end_trans;
				5460	} else if (ret > 0) {
				5461	/* if we fail to delete the orphan item this time
				5462	* around, it'll get picked up the next time.
				5463	*
				5464	* The most common failure here is just -ENOENT.
				5465	*/
				5466	btrfs_del_orphan_item(trans, tree_root,
				5467	root->root_key.objectid);
				5468	}
				5469	}
				5470
				5471	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
				5472	btrfs_add_dropped_root(trans, root);
				5473	} else {
				5474	free_extent_buffer(root->node);
				5475	free_extent_buffer(root->commit_root);
				5476	btrfs_put_fs_root(root);
				5477	}
				5478	root_dropped = true;
				5479	out_end_trans:
				5480	btrfs_end_transaction_throttle(trans);
				5481	out_free:
				5482	kfree(wc);
				5483	btrfs_free_path(path);
				5484	out:
				5485	/*
				5486	* So if we need to stop dropping the snapshot for whatever reason we
				5487	* need to make sure to add it back to the dead root list so that we
				5488	* keep trying to do the work later. This also cleans up roots if we
				5489	* don't have it in the radix (like when we recover after a power fail
				5490	* or unmount) so we don't leak memory.
				5491	*/
				5492	if (!for_reloc && !root_dropped)
				5493	btrfs_add_dead_root(root);
				5494	return err;
				5495	}
				5496
				5497	/*
				5498	* drop subtree rooted at tree block 'node'.
				5499	*
				5500	* NOTE: this function will unlock and release tree block 'node'
				5501	* only used by relocation code
				5502	*/
				5503	int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
				5504	struct btrfs_root *root,
				5505	struct extent_buffer *node,
				5506	struct extent_buffer *parent)
				5507	{
				5508	struct btrfs_fs_info *fs_info = root->fs_info;
				5509	struct btrfs_path *path;
				5510	struct walk_control *wc;
				5511	int level;
				5512	int parent_level;
				5513	int ret = 0;
				5514	int wret;
				5515
				5516	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
				5517
				5518	path = btrfs_alloc_path();
				5519	if (!path)
				5520	return -ENOMEM;
				5521
				5522	wc = kzalloc(sizeof(*wc), GFP_NOFS);
				5523	if (!wc) {
				5524	btrfs_free_path(path);
				5525	return -ENOMEM;
				5526	}
				5527
				5528	btrfs_assert_tree_locked(parent);
				5529	parent_level = btrfs_header_level(parent);
				5530	extent_buffer_get(parent);
				5531	path->nodes[parent_level] = parent;
				5532	path->slots[parent_level] = btrfs_header_nritems(parent);
				5533
				5534	btrfs_assert_tree_locked(node);
				5535	level = btrfs_header_level(node);
				5536	path->nodes[level] = node;
				5537	path->slots[level] = 0;
				5538	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
				5539
				5540	wc->refs[parent_level] = 1;
				5541	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
				5542	wc->level = level;
				5543	wc->shared_level = -1;
				5544	wc->stage = DROP_REFERENCE;
				5545	wc->update_ref = 0;
				5546	wc->keep_locks = 1;
				5547	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
				5548
				5549	while (1) {
				5550	wret = walk_down_tree(trans, root, path, wc);
				5551	if (wret < 0) {
				5552	ret = wret;
				5553	break;
				5554	}
				5555
				5556	wret = walk_up_tree(trans, root, path, wc, parent_level);
				5557	if (wret < 0)
				5558	ret = wret;
				5559	if (wret != 0)
				5560	break;
				5561	}
				5562
				5563	kfree(wc);
				5564	btrfs_free_path(path);
				5565	return ret;
				5566	}
				5567
				5568	/*
				5569	* helper to account the unused space of all the readonly block group in the
				5570	* space_info. takes mirrors into account.
				5571	*/
				5572	u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
				5573	{
				5574	struct btrfs_block_group_cache *block_group;
				5575	u64 free_bytes = 0;
				5576	int factor;
				5577
				5578	/* It's df, we don't care if it's racy */
				5579	if (list_empty(&sinfo->ro_bgs))
				5580	return 0;
				5581
				5582	spin_lock(&sinfo->lock);
				5583	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
				5584	spin_lock(&block_group->lock);
				5585
				5586	if (!block_group->ro) {
				5587	spin_unlock(&block_group->lock);
				5588	continue;
				5589	}
				5590
				5591	factor = btrfs_bg_type_to_factor(block_group->flags);
				5592	free_bytes += (block_group->key.offset -
				5593	btrfs_block_group_used(&block_group->item)) *
				5594	factor;
				5595
				5596	spin_unlock(&block_group->lock);
				5597	}
				5598	spin_unlock(&sinfo->lock);
				5599
				5600	return free_bytes;
				5601	}
				5602
				5603	int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
				5604	u64 start, u64 end)
				5605	{
				5606	return unpin_extent_range(fs_info, start, end, false);
				5607	}
				5608
				5609	/*
				5610	* It used to be that old block groups would be left around forever.
				5611	* Iterating over them would be enough to trim unused space. Since we
				5612	* now automatically remove them, we also need to iterate over unallocated
				5613	* space.
				5614	*
				5615	* We don't want a transaction for this since the discard may take a
				5616	* substantial amount of time. We don't require that a transaction be
				5617	* running, but we do need to take a running transaction into account
				5618	* to ensure that we're not discarding chunks that were released or
				5619	* allocated in the current transaction.
				5620	*
				5621	* Holding the chunks lock will prevent other threads from allocating
				5622	* or releasing chunks, but it won't prevent a running transaction
				5623	* from committing and releasing the memory that the pending chunks
				5624	* list head uses. For that, we need to take a reference to the
				5625	* transaction and hold the commit root sem. We only need to hold
				5626	* it while performing the free space search since we have already
				5627	* held back allocations.
				5628	*/
				5629	static int btrfs_trim_free_extents(struct btrfs_device device, u64 trimmed)
				5630	{
				5631	u64 start = SZ_1M, len = 0, end = 0;
				5632	int ret;
				5633
				5634	*trimmed = 0;
				5635
				5636	/* Discard not supported = nothing to do. */
				5637	if (!blk_queue_discard(bdev_get_queue(device->bdev)))
				5638	return 0;
				5639
				5640	/* Not writable = nothing to do. */
				5641	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
				5642	return 0;
				5643
				5644	/* No free space = nothing to do. */
				5645	if (device->total_bytes <= device->bytes_used)
				5646	return 0;
				5647
				5648	ret = 0;
				5649
				5650	while (1) {
				5651	struct btrfs_fs_info *fs_info = device->fs_info;
				5652	u64 bytes;
				5653
				5654	ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
				5655	if (ret)
				5656	break;
				5657
				5658	find_first_clear_extent_bit(&device->alloc_state, start,
				5659	&start, &end,
				5660	CHUNK_TRIMMED \| CHUNK_ALLOCATED);
				5661
				5662	/* Check if there are any CHUNK_* bits left */
				5663	if (start > device->total_bytes) {
				5664	WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
				5665	btrfs_warn_in_rcu(fs_info,
				5666	"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
				5667	start, end - start + 1,
				5668	rcu_str_deref(device->name),
				5669	device->total_bytes);
				5670	mutex_unlock(&fs_info->chunk_mutex);
				5671	ret = 0;
				5672	break;
				5673	}
				5674
				5675	/* Ensure we skip the reserved area in the first 1M */
				5676	start = max_t(u64, start, SZ_1M);
				5677
				5678	/*
				5679	* If find_first_clear_extent_bit find a range that spans the
				5680	* end of the device it will set end to -1, in this case it's up
				5681	* to the caller to trim the value to the size of the device.
				5682	*/
				5683	end = min(end, device->total_bytes - 1);
				5684
				5685	len = end - start + 1;
				5686
				5687	/* We didn't find any extents */
				5688	if (!len) {
				5689	mutex_unlock(&fs_info->chunk_mutex);
				5690	ret = 0;
				5691	break;
				5692	}
				5693
				5694	ret = btrfs_issue_discard(device->bdev, start, len,
				5695	&bytes);
				5696	if (!ret)
				5697	set_extent_bits(&device->alloc_state, start,
				5698	start + bytes - 1,
				5699	CHUNK_TRIMMED);
				5700	mutex_unlock(&fs_info->chunk_mutex);
				5701
				5702	if (ret)
				5703	break;
				5704
				5705	start += len;
				5706	*trimmed += bytes;
				5707
				5708	if (fatal_signal_pending(current)) {
				5709	ret = -ERESTARTSYS;
				5710	break;
				5711	}
				5712
				5713	cond_resched();
				5714	}
				5715
				5716	return ret;
				5717	}
				5718
				5719	/*
				5720	* Trim the whole filesystem by:
				5721	* 1) trimming the free space in each block group
				5722	* 2) trimming the unallocated space on each device
				5723	*
				5724	* This will also continue trimming even if a block group or device encounters
				5725	* an error. The return value will be the last error, or 0 if nothing bad
				5726	* happens.
				5727	*/
				5728	int btrfs_trim_fs(struct btrfs_fs_info fs_info, struct fstrim_range range)
				5729	{
				5730	struct btrfs_block_group_cache *cache = NULL;
				5731	struct btrfs_device *device;
				5732	struct list_head *devices;
				5733	u64 group_trimmed;
				5734	u64 range_end = U64_MAX;
				5735	u64 start;
				5736	u64 end;
				5737	u64 trimmed = 0;
				5738	u64 bg_failed = 0;
				5739	u64 dev_failed = 0;
				5740	int bg_ret = 0;
				5741	int dev_ret = 0;
				5742	int ret = 0;
				5743
				5744	/*
				5745	* Check range overflow if range->len is set.
				5746	* The default range->len is U64_MAX.
				5747	*/
				5748	if (range->len != U64_MAX &&
				5749	check_add_overflow(range->start, range->len, &range_end))
				5750	return -EINVAL;
				5751
				5752	cache = btrfs_lookup_first_block_group(fs_info, range->start);
				5753	for (; cache; cache = btrfs_next_block_group(cache)) {
				5754	if (cache->key.objectid >= range_end) {
				5755	btrfs_put_block_group(cache);
				5756	break;
				5757	}
				5758
				5759	start = max(range->start, cache->key.objectid);
				5760	end = min(range_end, cache->key.objectid + cache->key.offset);
				5761
				5762	if (end - start >= range->minlen) {
				5763	if (!btrfs_block_group_cache_done(cache)) {
				5764	ret = btrfs_cache_block_group(cache, 0);
				5765	if (ret) {
				5766	bg_failed++;
				5767	bg_ret = ret;
				5768	continue;
				5769	}
				5770	ret = btrfs_wait_block_group_cache_done(cache);
				5771	if (ret) {
				5772	bg_failed++;
				5773	bg_ret = ret;
				5774	continue;
				5775	}
				5776	}
				5777	ret = btrfs_trim_block_group(cache,
				5778	&group_trimmed,
				5779	start,
				5780	end,
				5781	range->minlen);
				5782
				5783	trimmed += group_trimmed;
				5784	if (ret) {
				5785	bg_failed++;
				5786	bg_ret = ret;
				5787	continue;
				5788	}
				5789	}
				5790	}
				5791
				5792	if (bg_failed)
				5793	btrfs_warn(fs_info,
				5794	"failed to trim %llu block group(s), last error %d",
				5795	bg_failed, bg_ret);
				5796	mutex_lock(&fs_info->fs_devices->device_list_mutex);
				5797	devices = &fs_info->fs_devices->devices;
				5798	list_for_each_entry(device, devices, dev_list) {
				5799	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
				5800	continue;
				5801
				5802	ret = btrfs_trim_free_extents(device, &group_trimmed);
				5803	if (ret) {
				5804	dev_failed++;
				5805	dev_ret = ret;
				5806	break;
				5807	}
				5808
				5809	trimmed += group_trimmed;
				5810	}
				5811	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				5812
				5813	if (dev_failed)
				5814	btrfs_warn(fs_info,
				5815	"failed to trim %llu device(s), last error %d",
				5816	dev_failed, dev_ret);
				5817	range->len = trimmed;
				5818	if (bg_ret)
				5819	return bg_ret;
				5820	return dev_ret;
				5821	}
				5822
				5823	/*
				5824	* btrfs_{start,end}_write_no_snapshotting() are similar to
				5825	* mnt_{want,drop}_write(), they are used to prevent some tasks from writing
				5826	* data into the page cache through nocow before the subvolume is snapshoted,
				5827	* but flush the data into disk after the snapshot creation, or to prevent
				5828	* operations while snapshotting is ongoing and that cause the snapshot to be
				5829	* inconsistent (writes followed by expanding truncates for example).
				5830	*/
				5831	void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
				5832	{
				5833	percpu_counter_dec(&root->subv_writers->counter);
				5834	cond_wake_up(&root->subv_writers->wait);
				5835	}
				5836
				5837	int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
				5838	{
				5839	if (atomic_read(&root->will_be_snapshotted))
				5840	return 0;
				5841
				5842	percpu_counter_inc(&root->subv_writers->counter);
				5843	/*
				5844	* Make sure counter is updated before we check for snapshot creation.
				5845	*/
				5846	smp_mb();
				5847	if (atomic_read(&root->will_be_snapshotted)) {
				5848	btrfs_end_write_no_snapshotting(root);
				5849	return 0;
				5850	}
				5851	return 1;
				5852	}
				5853
				5854	void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
				5855	{
				5856	while (true) {
				5857	int ret;
				5858
				5859	ret = btrfs_start_write_no_snapshotting(root);
				5860	if (ret)
				5861	break;
				5862	wait_var_event(&root->will_be_snapshotted,
				5863	!atomic_read(&root->will_be_snapshotted));
				5864	}
				5865	}