Blame - src/kernel/linux/v4.14/fs/btrfs/extent-tree.c - T103

blob: 00481cfe6cfce574875fe9d5dbaffcc8d8acc268 [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* Copyright (C) 2007 Oracle. All rights reserved.
				3	*
				4	* This program is free software; you can redistribute it and/or
				5	* modify it under the terms of the GNU General Public
				6	* License v2 as published by the Free Software Foundation.
				7	*
				8	* This program is distributed in the hope that it will be useful,
				9	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				11	* General Public License for more details.
				12	*
				13	* You should have received a copy of the GNU General Public
				14	* License along with this program; if not, write to the
				15	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				16	* Boston, MA 021110-1307, USA.
				17	*/
				18	#include <linux/sched.h>
				19	#include <linux/sched/signal.h>
				20	#include <linux/pagemap.h>
				21	#include <linux/writeback.h>
				22	#include <linux/blkdev.h>
				23	#include <linux/sort.h>
				24	#include <linux/rcupdate.h>
				25	#include <linux/kthread.h>
				26	#include <linux/slab.h>
				27	#include <linux/ratelimit.h>
				28	#include <linux/percpu_counter.h>
				29	#include "hash.h"
				30	#include "tree-log.h"
				31	#include "disk-io.h"
				32	#include "print-tree.h"
				33	#include "volumes.h"
				34	#include "raid56.h"
				35	#include "locking.h"
				36	#include "free-space-cache.h"
				37	#include "free-space-tree.h"
				38	#include "math.h"
				39	#include "sysfs.h"
				40	#include "qgroup.h"
				41
				42	#undef SCRAMBLE_DELAYED_REFS
				43
				44	/*
				45	* control flags for do_chunk_alloc's force field
				46	* CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
				47	* if we really need one.
				48	*
				49	* CHUNK_ALLOC_LIMITED means to only try and allocate one
				50	* if we have very few chunks already allocated. This is
				51	* used as part of the clustering code to help make sure
				52	* we have a good pool of storage to cluster in, without
				53	* filling the FS with empty chunks
				54	*
				55	* CHUNK_ALLOC_FORCE means it must try to allocate one
				56	*
				57	*/
				58	enum {
				59	CHUNK_ALLOC_NO_FORCE = 0,
				60	CHUNK_ALLOC_LIMITED = 1,
				61	CHUNK_ALLOC_FORCE = 2,
				62	};
				63
				64	static int update_block_group(struct btrfs_trans_handle *trans,
				65	struct btrfs_fs_info *fs_info, u64 bytenr,
				66	u64 num_bytes, int alloc);
				67	static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
				68	struct btrfs_fs_info *fs_info,
				69	struct btrfs_delayed_ref_node *node, u64 parent,
				70	u64 root_objectid, u64 owner_objectid,
				71	u64 owner_offset, int refs_to_drop,
				72	struct btrfs_delayed_extent_op *extra_op);
				73	static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				74	struct extent_buffer *leaf,
				75	struct btrfs_extent_item *ei);
				76	static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				77	struct btrfs_fs_info *fs_info,
				78	u64 parent, u64 root_objectid,
				79	u64 flags, u64 owner, u64 offset,
				80	struct btrfs_key *ins, int ref_mod);
				81	static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				82	struct btrfs_fs_info *fs_info,
				83	u64 parent, u64 root_objectid,
				84	u64 flags, struct btrfs_disk_key *key,
				85	int level, struct btrfs_key *ins);
				86	static int do_chunk_alloc(struct btrfs_trans_handle *trans,
				87	struct btrfs_fs_info *fs_info, u64 flags,
				88	int force);
				89	static int find_next_key(struct btrfs_path *path, int level,
				90	struct btrfs_key *key);
				91	static void dump_space_info(struct btrfs_fs_info *fs_info,
				92	struct btrfs_space_info *info, u64 bytes,
				93	int dump_block_groups);
				94	static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
				95	u64 ram_bytes, u64 num_bytes, int delalloc);
				96	static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
				97	u64 num_bytes, int delalloc);
				98	static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
				99	u64 num_bytes);
				100	static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
				101	struct btrfs_space_info *space_info,
				102	u64 orig_bytes,
				103	enum btrfs_reserve_flush_enum flush,
				104	bool system_chunk);
				105	static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
				106	struct btrfs_space_info *space_info,
				107	u64 num_bytes);
				108	static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
				109	struct btrfs_space_info *space_info,
				110	u64 num_bytes);
				111
				112	static noinline int
				113	block_group_cache_done(struct btrfs_block_group_cache *cache)
				114	{
				115	smp_mb();
				116	return cache->cached == BTRFS_CACHE_FINISHED \|\|
				117	cache->cached == BTRFS_CACHE_ERROR;
				118	}
				119
				120	static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
				121	{
				122	return (cache->flags & bits) == bits;
				123	}
				124
				125	void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
				126	{
				127	atomic_inc(&cache->count);
				128	}
				129
				130	void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
				131	{
				132	if (atomic_dec_and_test(&cache->count)) {
				133	WARN_ON(cache->pinned > 0);
				134	WARN_ON(cache->reserved > 0);
				135
				136	/*
				137	* If not empty, someone is still holding mutex of
				138	* full_stripe_lock, which can only be released by caller.
				139	* And it will definitely cause use-after-free when caller
				140	* tries to release full stripe lock.
				141	*
				142	* No better way to resolve, but only to warn.
				143	*/
				144	WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
				145	kfree(cache->free_space_ctl);
				146	kfree(cache);
				147	}
				148	}
				149
				150	/*
				151	* this adds the block group to the fs_info rb tree for the block group
				152	* cache
				153	*/
				154	static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				155	struct btrfs_block_group_cache *block_group)
				156	{
				157	struct rb_node **p;
				158	struct rb_node *parent = NULL;
				159	struct btrfs_block_group_cache *cache;
				160
				161	spin_lock(&info->block_group_cache_lock);
				162	p = &info->block_group_cache_tree.rb_node;
				163
				164	while (*p) {
				165	parent = *p;
				166	cache = rb_entry(parent, struct btrfs_block_group_cache,
				167	cache_node);
				168	if (block_group->key.objectid < cache->key.objectid) {
				169	p = &(*p)->rb_left;
				170	} else if (block_group->key.objectid > cache->key.objectid) {
				171	p = &(*p)->rb_right;
				172	} else {
				173	spin_unlock(&info->block_group_cache_lock);
				174	return -EEXIST;
				175	}
				176	}
				177
				178	rb_link_node(&block_group->cache_node, parent, p);
				179	rb_insert_color(&block_group->cache_node,
				180	&info->block_group_cache_tree);
				181
				182	if (info->first_logical_byte > block_group->key.objectid)
				183	info->first_logical_byte = block_group->key.objectid;
				184
				185	spin_unlock(&info->block_group_cache_lock);
				186
				187	return 0;
				188	}
				189
				190	/*
				191	* This will return the block group at or after bytenr if contains is 0, else
				192	* it will return the block group that contains the bytenr
				193	*/
				194	static struct btrfs_block_group_cache *
				195	block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
				196	int contains)
				197	{
				198	struct btrfs_block_group_cache cache, ret = NULL;
				199	struct rb_node *n;
				200	u64 end, start;
				201
				202	spin_lock(&info->block_group_cache_lock);
				203	n = info->block_group_cache_tree.rb_node;
				204
				205	while (n) {
				206	cache = rb_entry(n, struct btrfs_block_group_cache,
				207	cache_node);
				208	end = cache->key.objectid + cache->key.offset - 1;
				209	start = cache->key.objectid;
				210
				211	if (bytenr < start) {
				212	if (!contains && (!ret \|\| start < ret->key.objectid))
				213	ret = cache;
				214	n = n->rb_left;
				215	} else if (bytenr > start) {
				216	if (contains && bytenr <= end) {
				217	ret = cache;
				218	break;
				219	}
				220	n = n->rb_right;
				221	} else {
				222	ret = cache;
				223	break;
				224	}
				225	}
				226	if (ret) {
				227	btrfs_get_block_group(ret);
				228	if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
				229	info->first_logical_byte = ret->key.objectid;
				230	}
				231	spin_unlock(&info->block_group_cache_lock);
				232
				233	return ret;
				234	}
				235
				236	static int add_excluded_extent(struct btrfs_fs_info *fs_info,
				237	u64 start, u64 num_bytes)
				238	{
				239	u64 end = start + num_bytes - 1;
				240	set_extent_bits(&fs_info->freed_extents[0],
				241	start, end, EXTENT_UPTODATE);
				242	set_extent_bits(&fs_info->freed_extents[1],
				243	start, end, EXTENT_UPTODATE);
				244	return 0;
				245	}
				246
				247	static void free_excluded_extents(struct btrfs_fs_info *fs_info,
				248	struct btrfs_block_group_cache *cache)
				249	{
				250	u64 start, end;
				251
				252	start = cache->key.objectid;
				253	end = start + cache->key.offset - 1;
				254
				255	clear_extent_bits(&fs_info->freed_extents[0],
				256	start, end, EXTENT_UPTODATE);
				257	clear_extent_bits(&fs_info->freed_extents[1],
				258	start, end, EXTENT_UPTODATE);
				259	}
				260
				261	static int exclude_super_stripes(struct btrfs_fs_info *fs_info,
				262	struct btrfs_block_group_cache *cache)
				263	{
				264	u64 bytenr;
				265	u64 *logical;
				266	int stripe_len;
				267	int i, nr, ret;
				268
				269	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
				270	stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
				271	cache->bytes_super += stripe_len;
				272	ret = add_excluded_extent(fs_info, cache->key.objectid,
				273	stripe_len);
				274	if (ret)
				275	return ret;
				276	}
				277
				278	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
				279	bytenr = btrfs_sb_offset(i);
				280	ret = btrfs_rmap_block(fs_info, cache->key.objectid,
				281	bytenr, 0, &logical, &nr, &stripe_len);
				282	if (ret)
				283	return ret;
				284
				285	while (nr--) {
				286	u64 start, len;
				287
				288	if (logical[nr] > cache->key.objectid +
				289	cache->key.offset)
				290	continue;
				291
				292	if (logical[nr] + stripe_len <= cache->key.objectid)
				293	continue;
				294
				295	start = logical[nr];
				296	if (start < cache->key.objectid) {
				297	start = cache->key.objectid;
				298	len = (logical[nr] + stripe_len) - start;
				299	} else {
				300	len = min_t(u64, stripe_len,
				301	cache->key.objectid +
				302	cache->key.offset - start);
				303	}
				304
				305	cache->bytes_super += len;
				306	ret = add_excluded_extent(fs_info, start, len);
				307	if (ret) {
				308	kfree(logical);
				309	return ret;
				310	}
				311	}
				312
				313	kfree(logical);
				314	}
				315	return 0;
				316	}
				317
				318	static struct btrfs_caching_control *
				319	get_caching_control(struct btrfs_block_group_cache *cache)
				320	{
				321	struct btrfs_caching_control *ctl;
				322
				323	spin_lock(&cache->lock);
				324	if (!cache->caching_ctl) {
				325	spin_unlock(&cache->lock);
				326	return NULL;
				327	}
				328
				329	ctl = cache->caching_ctl;
				330	refcount_inc(&ctl->count);
				331	spin_unlock(&cache->lock);
				332	return ctl;
				333	}
				334
				335	static void put_caching_control(struct btrfs_caching_control *ctl)
				336	{
				337	if (refcount_dec_and_test(&ctl->count))
				338	kfree(ctl);
				339	}
				340
				341	#ifdef CONFIG_BTRFS_DEBUG
				342	static void fragment_free_space(struct btrfs_block_group_cache *block_group)
				343	{
				344	struct btrfs_fs_info *fs_info = block_group->fs_info;
				345	u64 start = block_group->key.objectid;
				346	u64 len = block_group->key.offset;
				347	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
				348	fs_info->nodesize : fs_info->sectorsize;
				349	u64 step = chunk << 1;
				350
				351	while (len > chunk) {
				352	btrfs_remove_free_space(block_group, start, chunk);
				353	start += step;
				354	if (len < step)
				355	len = 0;
				356	else
				357	len -= step;
				358	}
				359	}
				360	#endif
				361
				362	/*
				363	* this is only called by cache_block_group, since we could have freed extents
				364	* we need to check the pinned_extents for any extents that can't be used yet
				365	* since their free space will be released as soon as the transaction commits.
				366	*/
				367	u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
				368	struct btrfs_fs_info *info, u64 start, u64 end)
				369	{
				370	u64 extent_start, extent_end, size, total_added = 0;
				371	int ret;
				372
				373	while (start < end) {
				374	ret = find_first_extent_bit(info->pinned_extents, start,
				375	&extent_start, &extent_end,
				376	EXTENT_DIRTY \| EXTENT_UPTODATE,
				377	NULL);
				378	if (ret)
				379	break;
				380
				381	if (extent_start <= start) {
				382	start = extent_end + 1;
				383	} else if (extent_start > start && extent_start < end) {
				384	size = extent_start - start;
				385	total_added += size;
				386	ret = btrfs_add_free_space(block_group, start,
				387	size);
				388	BUG_ON(ret); /* -ENOMEM or logic error */
				389	start = extent_end + 1;
				390	} else {
				391	break;
				392	}
				393	}
				394
				395	if (start < end) {
				396	size = end - start;
				397	total_added += size;
				398	ret = btrfs_add_free_space(block_group, start, size);
				399	BUG_ON(ret); /* -ENOMEM or logic error */
				400	}
				401
				402	return total_added;
				403	}
				404
				405	static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
				406	{
				407	struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
				408	struct btrfs_fs_info *fs_info = block_group->fs_info;
				409	struct btrfs_root *extent_root = fs_info->extent_root;
				410	struct btrfs_path *path;
				411	struct extent_buffer *leaf;
				412	struct btrfs_key key;
				413	u64 total_found = 0;
				414	u64 last = 0;
				415	u32 nritems;
				416	int ret;
				417	bool wakeup = true;
				418
				419	path = btrfs_alloc_path();
				420	if (!path)
				421	return -ENOMEM;
				422
				423	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
				424
				425	#ifdef CONFIG_BTRFS_DEBUG
				426	/*
				427	* If we're fragmenting we don't want to make anybody think we can
				428	* allocate from this block group until we've had a chance to fragment
				429	* the free space.
				430	*/
				431	if (btrfs_should_fragment_free_space(block_group))
				432	wakeup = false;
				433	#endif
				434	/*
				435	* We don't want to deadlock with somebody trying to allocate a new
				436	* extent for the extent root while also trying to search the extent
				437	* root to add free space. So we skip locking and search the commit
				438	* root, since its read-only
				439	*/
				440	path->skip_locking = 1;
				441	path->search_commit_root = 1;
				442	path->reada = READA_FORWARD;
				443
				444	key.objectid = last;
				445	key.offset = 0;
				446	key.type = BTRFS_EXTENT_ITEM_KEY;
				447
				448	next:
				449	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
				450	if (ret < 0)
				451	goto out;
				452
				453	leaf = path->nodes[0];
				454	nritems = btrfs_header_nritems(leaf);
				455
				456	while (1) {
				457	if (btrfs_fs_closing(fs_info) > 1) {
				458	last = (u64)-1;
				459	break;
				460	}
				461
				462	if (path->slots[0] < nritems) {
				463	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				464	} else {
				465	ret = find_next_key(path, 0, &key);
				466	if (ret)
				467	break;
				468
				469	if (need_resched() \|\|
				470	rwsem_is_contended(&fs_info->commit_root_sem)) {
				471	if (wakeup)
				472	caching_ctl->progress = last;
				473	btrfs_release_path(path);
				474	up_read(&fs_info->commit_root_sem);
				475	mutex_unlock(&caching_ctl->mutex);
				476	cond_resched();
				477	mutex_lock(&caching_ctl->mutex);
				478	down_read(&fs_info->commit_root_sem);
				479	goto next;
				480	}
				481
				482	ret = btrfs_next_leaf(extent_root, path);
				483	if (ret < 0)
				484	goto out;
				485	if (ret)
				486	break;
				487	leaf = path->nodes[0];
				488	nritems = btrfs_header_nritems(leaf);
				489	continue;
				490	}
				491
				492	if (key.objectid < last) {
				493	key.objectid = last;
				494	key.offset = 0;
				495	key.type = BTRFS_EXTENT_ITEM_KEY;
				496
				497	if (wakeup)
				498	caching_ctl->progress = last;
				499	btrfs_release_path(path);
				500	goto next;
				501	}
				502
				503	if (key.objectid < block_group->key.objectid) {
				504	path->slots[0]++;
				505	continue;
				506	}
				507
				508	if (key.objectid >= block_group->key.objectid +
				509	block_group->key.offset)
				510	break;
				511
				512	if (key.type == BTRFS_EXTENT_ITEM_KEY \|\|
				513	key.type == BTRFS_METADATA_ITEM_KEY) {
				514	total_found += add_new_free_space(block_group,
				515	fs_info, last,
				516	key.objectid);
				517	if (key.type == BTRFS_METADATA_ITEM_KEY)
				518	last = key.objectid +
				519	fs_info->nodesize;
				520	else
				521	last = key.objectid + key.offset;
				522
				523	if (total_found > CACHING_CTL_WAKE_UP) {
				524	total_found = 0;
				525	if (wakeup)
				526	wake_up(&caching_ctl->wait);
				527	}
				528	}
				529	path->slots[0]++;
				530	}
				531	ret = 0;
				532
				533	total_found += add_new_free_space(block_group, fs_info, last,
				534	block_group->key.objectid +
				535	block_group->key.offset);
				536	caching_ctl->progress = (u64)-1;
				537
				538	out:
				539	btrfs_free_path(path);
				540	return ret;
				541	}
				542
				543	static noinline void caching_thread(struct btrfs_work *work)
				544	{
				545	struct btrfs_block_group_cache *block_group;
				546	struct btrfs_fs_info *fs_info;
				547	struct btrfs_caching_control *caching_ctl;
				548	struct btrfs_root *extent_root;
				549	int ret;
				550
				551	caching_ctl = container_of(work, struct btrfs_caching_control, work);
				552	block_group = caching_ctl->block_group;
				553	fs_info = block_group->fs_info;
				554	extent_root = fs_info->extent_root;
				555
				556	mutex_lock(&caching_ctl->mutex);
				557	down_read(&fs_info->commit_root_sem);
				558
				559	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
				560	ret = load_free_space_tree(caching_ctl);
				561	else
				562	ret = load_extent_tree_free(caching_ctl);
				563
				564	spin_lock(&block_group->lock);
				565	block_group->caching_ctl = NULL;
				566	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
				567	spin_unlock(&block_group->lock);
				568
				569	#ifdef CONFIG_BTRFS_DEBUG
				570	if (btrfs_should_fragment_free_space(block_group)) {
				571	u64 bytes_used;
				572
				573	spin_lock(&block_group->space_info->lock);
				574	spin_lock(&block_group->lock);
				575	bytes_used = block_group->key.offset -
				576	btrfs_block_group_used(&block_group->item);
				577	block_group->space_info->bytes_used += bytes_used >> 1;
				578	spin_unlock(&block_group->lock);
				579	spin_unlock(&block_group->space_info->lock);
				580	fragment_free_space(block_group);
				581	}
				582	#endif
				583
				584	caching_ctl->progress = (u64)-1;
				585
				586	up_read(&fs_info->commit_root_sem);
				587	free_excluded_extents(fs_info, block_group);
				588	mutex_unlock(&caching_ctl->mutex);
				589
				590	wake_up(&caching_ctl->wait);
				591
				592	put_caching_control(caching_ctl);
				593	btrfs_put_block_group(block_group);
				594	}
				595
				596	static int cache_block_group(struct btrfs_block_group_cache *cache,
				597	int load_cache_only)
				598	{
				599	DEFINE_WAIT(wait);
				600	struct btrfs_fs_info *fs_info = cache->fs_info;
				601	struct btrfs_caching_control *caching_ctl;
				602	int ret = 0;
				603
				604	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
				605	if (!caching_ctl)
				606	return -ENOMEM;
				607
				608	INIT_LIST_HEAD(&caching_ctl->list);
				609	mutex_init(&caching_ctl->mutex);
				610	init_waitqueue_head(&caching_ctl->wait);
				611	caching_ctl->block_group = cache;
				612	caching_ctl->progress = cache->key.objectid;
				613	refcount_set(&caching_ctl->count, 1);
				614	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
				615	caching_thread, NULL, NULL);
				616
				617	spin_lock(&cache->lock);
				618	/*
				619	* This should be a rare occasion, but this could happen I think in the
				620	* case where one thread starts to load the space cache info, and then
				621	* some other thread starts a transaction commit which tries to do an
				622	* allocation while the other thread is still loading the space cache
				623	* info. The previous loop should have kept us from choosing this block
				624	* group, but if we've moved to the state where we will wait on caching
				625	* block groups we need to first check if we're doing a fast load here,
				626	* so we can wait for it to finish, otherwise we could end up allocating
				627	* from a block group who's cache gets evicted for one reason or
				628	* another.
				629	*/
				630	while (cache->cached == BTRFS_CACHE_FAST) {
				631	struct btrfs_caching_control *ctl;
				632
				633	ctl = cache->caching_ctl;
				634	refcount_inc(&ctl->count);
				635	prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
				636	spin_unlock(&cache->lock);
				637
				638	schedule();
				639
				640	finish_wait(&ctl->wait, &wait);
				641	put_caching_control(ctl);
				642	spin_lock(&cache->lock);
				643	}
				644
				645	if (cache->cached != BTRFS_CACHE_NO) {
				646	spin_unlock(&cache->lock);
				647	kfree(caching_ctl);
				648	return 0;
				649	}
				650	WARN_ON(cache->caching_ctl);
				651	cache->caching_ctl = caching_ctl;
				652	cache->cached = BTRFS_CACHE_FAST;
				653	spin_unlock(&cache->lock);
				654
				655	if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
				656	mutex_lock(&caching_ctl->mutex);
				657	ret = load_free_space_cache(fs_info, cache);
				658
				659	spin_lock(&cache->lock);
				660	if (ret == 1) {
				661	cache->caching_ctl = NULL;
				662	cache->cached = BTRFS_CACHE_FINISHED;
				663	cache->last_byte_to_unpin = (u64)-1;
				664	caching_ctl->progress = (u64)-1;
				665	} else {
				666	if (load_cache_only) {
				667	cache->caching_ctl = NULL;
				668	cache->cached = BTRFS_CACHE_NO;
				669	} else {
				670	cache->cached = BTRFS_CACHE_STARTED;
				671	cache->has_caching_ctl = 1;
				672	}
				673	}
				674	spin_unlock(&cache->lock);
				675	#ifdef CONFIG_BTRFS_DEBUG
				676	if (ret == 1 &&
				677	btrfs_should_fragment_free_space(cache)) {
				678	u64 bytes_used;
				679
				680	spin_lock(&cache->space_info->lock);
				681	spin_lock(&cache->lock);
				682	bytes_used = cache->key.offset -
				683	btrfs_block_group_used(&cache->item);
				684	cache->space_info->bytes_used += bytes_used >> 1;
				685	spin_unlock(&cache->lock);
				686	spin_unlock(&cache->space_info->lock);
				687	fragment_free_space(cache);
				688	}
				689	#endif
				690	mutex_unlock(&caching_ctl->mutex);
				691
				692	wake_up(&caching_ctl->wait);
				693	if (ret == 1) {
				694	put_caching_control(caching_ctl);
				695	free_excluded_extents(fs_info, cache);
				696	return 0;
				697	}
				698	} else {
				699	/*
				700	* We're either using the free space tree or no caching at all.
				701	* Set cached to the appropriate value and wakeup any waiters.
				702	*/
				703	spin_lock(&cache->lock);
				704	if (load_cache_only) {
				705	cache->caching_ctl = NULL;
				706	cache->cached = BTRFS_CACHE_NO;
				707	} else {
				708	cache->cached = BTRFS_CACHE_STARTED;
				709	cache->has_caching_ctl = 1;
				710	}
				711	spin_unlock(&cache->lock);
				712	wake_up(&caching_ctl->wait);
				713	}
				714
				715	if (load_cache_only) {
				716	put_caching_control(caching_ctl);
				717	return 0;
				718	}
				719
				720	down_write(&fs_info->commit_root_sem);
				721	refcount_inc(&caching_ctl->count);
				722	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
				723	up_write(&fs_info->commit_root_sem);
				724
				725	btrfs_get_block_group(cache);
				726
				727	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
				728
				729	return ret;
				730	}
				731
				732	/*
				733	* return the block group that starts at or after bytenr
				734	*/
				735	static struct btrfs_block_group_cache *
				736	btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
				737	{
				738	return block_group_cache_tree_search(info, bytenr, 0);
				739	}
				740
				741	/*
				742	* return the block group that contains the given bytenr
				743	*/
				744	struct btrfs_block_group_cache *btrfs_lookup_block_group(
				745	struct btrfs_fs_info *info,
				746	u64 bytenr)
				747	{
				748	return block_group_cache_tree_search(info, bytenr, 1);
				749	}
				750
				751	static struct btrfs_space_info __find_space_info(struct btrfs_fs_info info,
				752	u64 flags)
				753	{
				754	struct list_head *head = &info->space_info;
				755	struct btrfs_space_info *found;
				756
				757	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
				758
				759	rcu_read_lock();
				760	list_for_each_entry_rcu(found, head, list) {
				761	if (found->flags & flags) {
				762	rcu_read_unlock();
				763	return found;
				764	}
				765	}
				766	rcu_read_unlock();
				767	return NULL;
				768	}
				769
				770	static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
				771	u64 owner, u64 root_objectid)
				772	{
				773	struct btrfs_space_info *space_info;
				774	u64 flags;
				775
				776	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
				777	if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
				778	flags = BTRFS_BLOCK_GROUP_SYSTEM;
				779	else
				780	flags = BTRFS_BLOCK_GROUP_METADATA;
				781	} else {
				782	flags = BTRFS_BLOCK_GROUP_DATA;
				783	}
				784
				785	space_info = __find_space_info(fs_info, flags);
				786	ASSERT(space_info);
				787	percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
				788	}
				789
				790	/*
				791	* after adding space to the filesystem, we need to clear the full flags
				792	* on all the space infos.
				793	*/
				794	void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
				795	{
				796	struct list_head *head = &info->space_info;
				797	struct btrfs_space_info *found;
				798
				799	rcu_read_lock();
				800	list_for_each_entry_rcu(found, head, list)
				801	found->full = 0;
				802	rcu_read_unlock();
				803	}
				804
				805	/* simple helper to search for an existing data extent at a given offset */
				806	int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
				807	{
				808	int ret;
				809	struct btrfs_key key;
				810	struct btrfs_path *path;
				811
				812	path = btrfs_alloc_path();
				813	if (!path)
				814	return -ENOMEM;
				815
				816	key.objectid = start;
				817	key.offset = len;
				818	key.type = BTRFS_EXTENT_ITEM_KEY;
				819	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
				820	btrfs_free_path(path);
				821	return ret;
				822	}
				823
				824	/*
				825	* helper function to lookup reference count and flags of a tree block.
				826	*
				827	* the head node for delayed ref is used to store the sum of all the
				828	* reference count modifications queued up in the rbtree. the head
				829	* node may also store the extent flags to set. This way you can check
				830	* to see what the reference count and extent flags would be if all of
				831	* the delayed refs are not processed.
				832	*/
				833	int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
				834	struct btrfs_fs_info *fs_info, u64 bytenr,
				835	u64 offset, int metadata, u64 refs, u64 flags)
				836	{
				837	struct btrfs_delayed_ref_head *head;
				838	struct btrfs_delayed_ref_root *delayed_refs;
				839	struct btrfs_path *path;
				840	struct btrfs_extent_item *ei;
				841	struct extent_buffer *leaf;
				842	struct btrfs_key key;
				843	u32 item_size;
				844	u64 num_refs;
				845	u64 extent_flags;
				846	int ret;
				847
				848	/*
				849	* If we don't have skinny metadata, don't bother doing anything
				850	* different
				851	*/
				852	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
				853	offset = fs_info->nodesize;
				854	metadata = 0;
				855	}
				856
				857	path = btrfs_alloc_path();
				858	if (!path)
				859	return -ENOMEM;
				860
				861	if (!trans) {
				862	path->skip_locking = 1;
				863	path->search_commit_root = 1;
				864	}
				865
				866	search_again:
				867	key.objectid = bytenr;
				868	key.offset = offset;
				869	if (metadata)
				870	key.type = BTRFS_METADATA_ITEM_KEY;
				871	else
				872	key.type = BTRFS_EXTENT_ITEM_KEY;
				873
				874	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
				875	if (ret < 0)
				876	goto out_free;
				877
				878	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
				879	if (path->slots[0]) {
				880	path->slots[0]--;
				881	btrfs_item_key_to_cpu(path->nodes[0], &key,
				882	path->slots[0]);
				883	if (key.objectid == bytenr &&
				884	key.type == BTRFS_EXTENT_ITEM_KEY &&
				885	key.offset == fs_info->nodesize)
				886	ret = 0;
				887	}
				888	}
				889
				890	if (ret == 0) {
				891	leaf = path->nodes[0];
				892	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
				893	if (item_size >= sizeof(*ei)) {
				894	ei = btrfs_item_ptr(leaf, path->slots[0],
				895	struct btrfs_extent_item);
				896	num_refs = btrfs_extent_refs(leaf, ei);
				897	extent_flags = btrfs_extent_flags(leaf, ei);
				898	} else {
				899	#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
				900	struct btrfs_extent_item_v0 *ei0;
				901	BUG_ON(item_size != sizeof(*ei0));
				902	ei0 = btrfs_item_ptr(leaf, path->slots[0],
				903	struct btrfs_extent_item_v0);
				904	num_refs = btrfs_extent_refs_v0(leaf, ei0);
				905	/* FIXME: this isn't correct for data */
				906	extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
				907	#else
				908	BUG();
				909	#endif
				910	}
				911	BUG_ON(num_refs == 0);
				912	} else {
				913	num_refs = 0;
				914	extent_flags = 0;
				915	ret = 0;
				916	}
				917
				918	if (!trans)
				919	goto out;
				920
				921	delayed_refs = &trans->transaction->delayed_refs;
				922	spin_lock(&delayed_refs->lock);
				923	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
				924	if (head) {
				925	if (!mutex_trylock(&head->mutex)) {
				926	refcount_inc(&head->node.refs);
				927	spin_unlock(&delayed_refs->lock);
				928
				929	btrfs_release_path(path);
				930
				931	/*
				932	* Mutex was contended, block until it's released and try
				933	* again
				934	*/
				935	mutex_lock(&head->mutex);
				936	mutex_unlock(&head->mutex);
				937	btrfs_put_delayed_ref(&head->node);
				938	goto search_again;
				939	}
				940	spin_lock(&head->lock);
				941	if (head->extent_op && head->extent_op->update_flags)
				942	extent_flags \|= head->extent_op->flags_to_set;
				943	else
				944	BUG_ON(num_refs == 0);
				945
				946	num_refs += head->node.ref_mod;
				947	spin_unlock(&head->lock);
				948	mutex_unlock(&head->mutex);
				949	}
				950	spin_unlock(&delayed_refs->lock);
				951	out:
				952	WARN_ON(num_refs == 0);
				953	if (refs)
				954	*refs = num_refs;
				955	if (flags)
				956	*flags = extent_flags;
				957	out_free:
				958	btrfs_free_path(path);
				959	return ret;
				960	}
				961
				962	/*
				963	* Back reference rules. Back refs have three main goals:
				964	*
				965	* 1) differentiate between all holders of references to an extent so that
				966	* when a reference is dropped we can make sure it was a valid reference
				967	* before freeing the extent.
				968	*
				969	* 2) Provide enough information to quickly find the holders of an extent
				970	* if we notice a given block is corrupted or bad.
				971	*
				972	* 3) Make it easy to migrate blocks for FS shrinking or storage pool
				973	* maintenance. This is actually the same as #2, but with a slightly
				974	* different use case.
				975	*
				976	* There are two kinds of back refs. The implicit back refs is optimized
				977	* for pointers in non-shared tree blocks. For a given pointer in a block,
				978	* back refs of this kind provide information about the block's owner tree
				979	* and the pointer's key. These information allow us to find the block by
				980	* b-tree searching. The full back refs is for pointers in tree blocks not
				981	* referenced by their owner trees. The location of tree block is recorded
				982	* in the back refs. Actually the full back refs is generic, and can be
				983	* used in all cases the implicit back refs is used. The major shortcoming
				984	* of the full back refs is its overhead. Every time a tree block gets
				985	* COWed, we have to update back refs entry for all pointers in it.
				986	*
				987	* For a newly allocated tree block, we use implicit back refs for
				988	* pointers in it. This means most tree related operations only involve
				989	* implicit back refs. For a tree block created in old transaction, the
				990	* only way to drop a reference to it is COW it. So we can detect the
				991	* event that tree block loses its owner tree's reference and do the
				992	* back refs conversion.
				993	*
				994	* When a tree block is COWed through a tree, there are four cases:
				995	*
				996	* The reference count of the block is one and the tree is the block's
				997	* owner tree. Nothing to do in this case.
				998	*
				999	* The reference count of the block is one and the tree is not the
				1000	* block's owner tree. In this case, full back refs is used for pointers
				1001	* in the block. Remove these full back refs, add implicit back refs for
				1002	* every pointer in the new block.
				1003	*
				1004	* The reference count of the block is greater than one and the tree is
				1005	* the block's owner tree. In this case, implicit back refs is used for
				1006	* pointers in the block. Add full back refs for every pointer in the
				1007	* block, increase lower level extents' reference counts. The original
				1008	* implicit back refs are entailed to the new block.
				1009	*
				1010	* The reference count of the block is greater than one and the tree is
				1011	* not the block's owner tree. Add implicit back refs for every pointer in
				1012	* the new block, increase lower level extents' reference count.
				1013	*
				1014	* Back Reference Key composing:
				1015	*
				1016	* The key objectid corresponds to the first byte in the extent,
				1017	* The key type is used to differentiate between types of back refs.
				1018	* There are different meanings of the key offset for different types
				1019	* of back refs.
				1020	*
				1021	* File extents can be referenced by:
				1022	*
				1023	* - multiple snapshots, subvolumes, or different generations in one subvol
				1024	* - different files inside a single subvolume
				1025	* - different offsets inside a file (bookend extents in file.c)
				1026	*
				1027	* The extent ref structure for the implicit back refs has fields for:
				1028	*
				1029	* - Objectid of the subvolume root
				1030	* - objectid of the file holding the reference
				1031	* - original offset in the file
				1032	* - how many bookend extents
				1033	*
				1034	* The key offset for the implicit back refs is hash of the first
				1035	* three fields.
				1036	*
				1037	* The extent ref structure for the full back refs has field for:
				1038	*
				1039	* - number of pointers in the tree leaf
				1040	*
				1041	* The key offset for the full back refs is the first byte of
				1042	* the tree leaf
				1043	*
				1044	* When a file extent is allocated, the implicit back refs is used.
				1045	* The fields are filled in:
				1046	*
				1047	* (root_key.objectid, inode objectid, offset in file, 1)
				1048	*
				1049	* When a file extent is removed by file truncation, we find the
				1050	* corresponding implicit back refs and check the following fields:
				1051	*
				1052	* (btrfs_header_owner(leaf), inode objectid, offset in file)
				1053	*
				1054	* Btree extents can be referenced by:
				1055	*
				1056	* - Different subvolumes
				1057	*
				1058	* Both the implicit back refs and the full back refs for tree blocks
				1059	* only consist of key. The key offset for the implicit back refs is
				1060	* objectid of block's owner tree. The key offset for the full back refs
				1061	* is the first byte of parent block.
				1062	*
				1063	* When implicit back refs is used, information about the lowest key and
				1064	* level of the tree block are required. These information are stored in
				1065	* tree block info structure.
				1066	*/
				1067
				1068	#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
				1069	static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
				1070	struct btrfs_fs_info *fs_info,
				1071	struct btrfs_path *path,
				1072	u64 owner, u32 extra_size)
				1073	{
				1074	struct btrfs_root *root = fs_info->extent_root;
				1075	struct btrfs_extent_item *item;
				1076	struct btrfs_extent_item_v0 *ei0;
				1077	struct btrfs_extent_ref_v0 *ref0;
				1078	struct btrfs_tree_block_info *bi;
				1079	struct extent_buffer *leaf;
				1080	struct btrfs_key key;
				1081	struct btrfs_key found_key;
				1082	u32 new_size = sizeof(*item);
				1083	u64 refs;
				1084	int ret;
				1085
				1086	leaf = path->nodes[0];
				1087	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
				1088
				1089	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				1090	ei0 = btrfs_item_ptr(leaf, path->slots[0],
				1091	struct btrfs_extent_item_v0);
				1092	refs = btrfs_extent_refs_v0(leaf, ei0);
				1093
				1094	if (owner == (u64)-1) {
				1095	while (1) {
				1096	if (path->slots[0] >= btrfs_header_nritems(leaf)) {
				1097	ret = btrfs_next_leaf(root, path);
				1098	if (ret < 0)
				1099	return ret;
				1100	BUG_ON(ret > 0); /* Corruption */
				1101	leaf = path->nodes[0];
				1102	}
				1103	btrfs_item_key_to_cpu(leaf, &found_key,
				1104	path->slots[0]);
				1105	BUG_ON(key.objectid != found_key.objectid);
				1106	if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
				1107	path->slots[0]++;
				1108	continue;
				1109	}
				1110	ref0 = btrfs_item_ptr(leaf, path->slots[0],
				1111	struct btrfs_extent_ref_v0);
				1112	owner = btrfs_ref_objectid_v0(leaf, ref0);
				1113	break;
				1114	}
				1115	}
				1116	btrfs_release_path(path);
				1117
				1118	if (owner < BTRFS_FIRST_FREE_OBJECTID)
				1119	new_size += sizeof(*bi);
				1120
				1121	new_size -= sizeof(*ei0);
				1122	ret = btrfs_search_slot(trans, root, &key, path,
				1123	new_size + extra_size, 1);
				1124	if (ret < 0)
				1125	return ret;
				1126	BUG_ON(ret); /* Corruption */
				1127
				1128	btrfs_extend_item(fs_info, path, new_size);
				1129
				1130	leaf = path->nodes[0];
				1131	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
				1132	btrfs_set_extent_refs(leaf, item, refs);
				1133	/* FIXME: get real generation */
				1134	btrfs_set_extent_generation(leaf, item, 0);
				1135	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
				1136	btrfs_set_extent_flags(leaf, item,
				1137	BTRFS_EXTENT_FLAG_TREE_BLOCK \|
				1138	BTRFS_BLOCK_FLAG_FULL_BACKREF);
				1139	bi = (struct btrfs_tree_block_info *)(item + 1);
				1140	/* FIXME: get first key of the block */
				1141	memzero_extent_buffer(leaf, (unsigned long)bi, sizeof(*bi));
				1142	btrfs_set_tree_block_level(leaf, bi, (int)owner);
				1143	} else {
				1144	btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
				1145	}
				1146	btrfs_mark_buffer_dirty(leaf);
				1147	return 0;
				1148	}
				1149	#endif
				1150
				1151	/*
				1152	* is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
				1153	* is_data == BTRFS_REF_TYPE_DATA, data type is requried,
				1154	* is_data == BTRFS_REF_TYPE_ANY, either type is OK.
				1155	*/
				1156	int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
				1157	struct btrfs_extent_inline_ref *iref,
				1158	enum btrfs_inline_ref_type is_data)
				1159	{
				1160	int type = btrfs_extent_inline_ref_type(eb, iref);
				1161	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
				1162
				1163	if (type == BTRFS_TREE_BLOCK_REF_KEY \|\|
				1164	type == BTRFS_SHARED_BLOCK_REF_KEY \|\|
				1165	type == BTRFS_SHARED_DATA_REF_KEY \|\|
				1166	type == BTRFS_EXTENT_DATA_REF_KEY) {
				1167	if (is_data == BTRFS_REF_TYPE_BLOCK) {
				1168	if (type == BTRFS_TREE_BLOCK_REF_KEY)
				1169	return type;
				1170	if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
				1171	ASSERT(eb->fs_info);
				1172	/*
				1173	* Every shared one has parent tree block,
				1174	* which must be aligned to sector size.
				1175	*/
				1176	if (offset &&
				1177	IS_ALIGNED(offset, eb->fs_info->sectorsize))
				1178	return type;
				1179	}
				1180	} else if (is_data == BTRFS_REF_TYPE_DATA) {
				1181	if (type == BTRFS_EXTENT_DATA_REF_KEY)
				1182	return type;
				1183	if (type == BTRFS_SHARED_DATA_REF_KEY) {
				1184	ASSERT(eb->fs_info);
				1185	/*
				1186	* Every shared one has parent tree block,
				1187	* which must be aligned to sector size.
				1188	*/
				1189	if (offset &&
				1190	IS_ALIGNED(offset, eb->fs_info->sectorsize))
				1191	return type;
				1192	}
				1193	} else {
				1194	ASSERT(is_data == BTRFS_REF_TYPE_ANY);
				1195	return type;
				1196	}
				1197	}
				1198
				1199	btrfs_print_leaf((struct extent_buffer *)eb);
				1200	btrfs_err(eb->fs_info,
				1201	"eb %llu iref 0x%lx invalid extent inline ref type %d",
				1202	eb->start, (unsigned long)iref, type);
				1203	WARN_ON(1);
				1204
				1205	return BTRFS_REF_TYPE_INVALID;
				1206	}
				1207
				1208	static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
				1209	{
				1210	u32 high_crc = ~(u32)0;
				1211	u32 low_crc = ~(u32)0;
				1212	__le64 lenum;
				1213
				1214	lenum = cpu_to_le64(root_objectid);
				1215	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
				1216	lenum = cpu_to_le64(owner);
				1217	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
				1218	lenum = cpu_to_le64(offset);
				1219	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
				1220
				1221	return ((u64)high_crc << 31) ^ (u64)low_crc;
				1222	}
				1223
				1224	static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
				1225	struct btrfs_extent_data_ref *ref)
				1226	{
				1227	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
				1228	btrfs_extent_data_ref_objectid(leaf, ref),
				1229	btrfs_extent_data_ref_offset(leaf, ref));
				1230	}
				1231
				1232	static int match_extent_data_ref(struct extent_buffer *leaf,
				1233	struct btrfs_extent_data_ref *ref,
				1234	u64 root_objectid, u64 owner, u64 offset)
				1235	{
				1236	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid \|\|
				1237	btrfs_extent_data_ref_objectid(leaf, ref) != owner \|\|
				1238	btrfs_extent_data_ref_offset(leaf, ref) != offset)
				1239	return 0;
				1240	return 1;
				1241	}
				1242
				1243	static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
				1244	struct btrfs_fs_info *fs_info,
				1245	struct btrfs_path *path,
				1246	u64 bytenr, u64 parent,
				1247	u64 root_objectid,
				1248	u64 owner, u64 offset)
				1249	{
				1250	struct btrfs_root *root = fs_info->extent_root;
				1251	struct btrfs_key key;
				1252	struct btrfs_extent_data_ref *ref;
				1253	struct extent_buffer *leaf;
				1254	u32 nritems;
				1255	int ret;
				1256	int recow;
				1257	int err = -ENOENT;
				1258
				1259	key.objectid = bytenr;
				1260	if (parent) {
				1261	key.type = BTRFS_SHARED_DATA_REF_KEY;
				1262	key.offset = parent;
				1263	} else {
				1264	key.type = BTRFS_EXTENT_DATA_REF_KEY;
				1265	key.offset = hash_extent_data_ref(root_objectid,
				1266	owner, offset);
				1267	}
				1268	again:
				1269	recow = 0;
				1270	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				1271	if (ret < 0) {
				1272	err = ret;
				1273	goto fail;
				1274	}
				1275
				1276	if (parent) {
				1277	if (!ret)
				1278	return 0;
				1279	#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
				1280	key.type = BTRFS_EXTENT_REF_V0_KEY;
				1281	btrfs_release_path(path);
				1282	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				1283	if (ret < 0) {
				1284	err = ret;
				1285	goto fail;
				1286	}
				1287	if (!ret)
				1288	return 0;
				1289	#endif
				1290	goto fail;
				1291	}
				1292
				1293	leaf = path->nodes[0];
				1294	nritems = btrfs_header_nritems(leaf);
				1295	while (1) {
				1296	if (path->slots[0] >= nritems) {
				1297	ret = btrfs_next_leaf(root, path);
				1298	if (ret < 0)
				1299	err = ret;
				1300	if (ret)
				1301	goto fail;
				1302
				1303	leaf = path->nodes[0];
				1304	nritems = btrfs_header_nritems(leaf);
				1305	recow = 1;
				1306	}
				1307
				1308	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				1309	if (key.objectid != bytenr \|\|
				1310	key.type != BTRFS_EXTENT_DATA_REF_KEY)
				1311	goto fail;
				1312
				1313	ref = btrfs_item_ptr(leaf, path->slots[0],
				1314	struct btrfs_extent_data_ref);
				1315
				1316	if (match_extent_data_ref(leaf, ref, root_objectid,
				1317	owner, offset)) {
				1318	if (recow) {
				1319	btrfs_release_path(path);
				1320	goto again;
				1321	}
				1322	err = 0;
				1323	break;
				1324	}
				1325	path->slots[0]++;
				1326	}
				1327	fail:
				1328	return err;
				1329	}
				1330
				1331	static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
				1332	struct btrfs_fs_info *fs_info,
				1333	struct btrfs_path *path,
				1334	u64 bytenr, u64 parent,
				1335	u64 root_objectid, u64 owner,
				1336	u64 offset, int refs_to_add)
				1337	{
				1338	struct btrfs_root *root = fs_info->extent_root;
				1339	struct btrfs_key key;
				1340	struct extent_buffer *leaf;
				1341	u32 size;
				1342	u32 num_refs;
				1343	int ret;
				1344
				1345	key.objectid = bytenr;
				1346	if (parent) {
				1347	key.type = BTRFS_SHARED_DATA_REF_KEY;
				1348	key.offset = parent;
				1349	size = sizeof(struct btrfs_shared_data_ref);
				1350	} else {
				1351	key.type = BTRFS_EXTENT_DATA_REF_KEY;
				1352	key.offset = hash_extent_data_ref(root_objectid,
				1353	owner, offset);
				1354	size = sizeof(struct btrfs_extent_data_ref);
				1355	}
				1356
				1357	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
				1358	if (ret && ret != -EEXIST)
				1359	goto fail;
				1360
				1361	leaf = path->nodes[0];
				1362	if (parent) {
				1363	struct btrfs_shared_data_ref *ref;
				1364	ref = btrfs_item_ptr(leaf, path->slots[0],
				1365	struct btrfs_shared_data_ref);
				1366	if (ret == 0) {
				1367	btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
				1368	} else {
				1369	num_refs = btrfs_shared_data_ref_count(leaf, ref);
				1370	num_refs += refs_to_add;
				1371	btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
				1372	}
				1373	} else {
				1374	struct btrfs_extent_data_ref *ref;
				1375	while (ret == -EEXIST) {
				1376	ref = btrfs_item_ptr(leaf, path->slots[0],
				1377	struct btrfs_extent_data_ref);
				1378	if (match_extent_data_ref(leaf, ref, root_objectid,
				1379	owner, offset))
				1380	break;
				1381	btrfs_release_path(path);
				1382	key.offset++;
				1383	ret = btrfs_insert_empty_item(trans, root, path, &key,
				1384	size);
				1385	if (ret && ret != -EEXIST)
				1386	goto fail;
				1387
				1388	leaf = path->nodes[0];
				1389	}
				1390	ref = btrfs_item_ptr(leaf, path->slots[0],
				1391	struct btrfs_extent_data_ref);
				1392	if (ret == 0) {
				1393	btrfs_set_extent_data_ref_root(leaf, ref,
				1394	root_objectid);
				1395	btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
				1396	btrfs_set_extent_data_ref_offset(leaf, ref, offset);
				1397	btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
				1398	} else {
				1399	num_refs = btrfs_extent_data_ref_count(leaf, ref);
				1400	num_refs += refs_to_add;
				1401	btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
				1402	}
				1403	}
				1404	btrfs_mark_buffer_dirty(leaf);
				1405	ret = 0;
				1406	fail:
				1407	btrfs_release_path(path);
				1408	return ret;
				1409	}
				1410
				1411	static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
				1412	struct btrfs_fs_info *fs_info,
				1413	struct btrfs_path *path,
				1414	int refs_to_drop, int *last_ref)
				1415	{
				1416	struct btrfs_key key;
				1417	struct btrfs_extent_data_ref *ref1 = NULL;
				1418	struct btrfs_shared_data_ref *ref2 = NULL;
				1419	struct extent_buffer *leaf;
				1420	u32 num_refs = 0;
				1421	int ret = 0;
				1422
				1423	leaf = path->nodes[0];
				1424	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				1425
				1426	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
				1427	ref1 = btrfs_item_ptr(leaf, path->slots[0],
				1428	struct btrfs_extent_data_ref);
				1429	num_refs = btrfs_extent_data_ref_count(leaf, ref1);
				1430	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
				1431	ref2 = btrfs_item_ptr(leaf, path->slots[0],
				1432	struct btrfs_shared_data_ref);
				1433	num_refs = btrfs_shared_data_ref_count(leaf, ref2);
				1434	#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
				1435	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
				1436	struct btrfs_extent_ref_v0 *ref0;
				1437	ref0 = btrfs_item_ptr(leaf, path->slots[0],
				1438	struct btrfs_extent_ref_v0);
				1439	num_refs = btrfs_ref_count_v0(leaf, ref0);
				1440	#endif
				1441	} else {
				1442	BUG();
				1443	}
				1444
				1445	BUG_ON(num_refs < refs_to_drop);
				1446	num_refs -= refs_to_drop;
				1447
				1448	if (num_refs == 0) {
				1449	ret = btrfs_del_item(trans, fs_info->extent_root, path);
				1450	*last_ref = 1;
				1451	} else {
				1452	if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
				1453	btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
				1454	else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
				1455	btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
				1456	#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
				1457	else {
				1458	struct btrfs_extent_ref_v0 *ref0;
				1459	ref0 = btrfs_item_ptr(leaf, path->slots[0],
				1460	struct btrfs_extent_ref_v0);
				1461	btrfs_set_ref_count_v0(leaf, ref0, num_refs);
				1462	}
				1463	#endif
				1464	btrfs_mark_buffer_dirty(leaf);
				1465	}
				1466	return ret;
				1467	}
				1468
				1469	static noinline u32 extent_data_ref_count(struct btrfs_path *path,
				1470	struct btrfs_extent_inline_ref *iref)
				1471	{
				1472	struct btrfs_key key;
				1473	struct extent_buffer *leaf;
				1474	struct btrfs_extent_data_ref *ref1;
				1475	struct btrfs_shared_data_ref *ref2;
				1476	u32 num_refs = 0;
				1477	int type;
				1478
				1479	leaf = path->nodes[0];
				1480	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				1481	if (iref) {
				1482	/*
				1483	* If type is invalid, we should have bailed out earlier than
				1484	* this call.
				1485	*/
				1486	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
				1487	ASSERT(type != BTRFS_REF_TYPE_INVALID);
				1488	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
				1489	ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
				1490	num_refs = btrfs_extent_data_ref_count(leaf, ref1);
				1491	} else {
				1492	ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
				1493	num_refs = btrfs_shared_data_ref_count(leaf, ref2);
				1494	}
				1495	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
				1496	ref1 = btrfs_item_ptr(leaf, path->slots[0],
				1497	struct btrfs_extent_data_ref);
				1498	num_refs = btrfs_extent_data_ref_count(leaf, ref1);
				1499	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
				1500	ref2 = btrfs_item_ptr(leaf, path->slots[0],
				1501	struct btrfs_shared_data_ref);
				1502	num_refs = btrfs_shared_data_ref_count(leaf, ref2);
				1503	#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
				1504	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
				1505	struct btrfs_extent_ref_v0 *ref0;
				1506	ref0 = btrfs_item_ptr(leaf, path->slots[0],
				1507	struct btrfs_extent_ref_v0);
				1508	num_refs = btrfs_ref_count_v0(leaf, ref0);
				1509	#endif
				1510	} else {
				1511	WARN_ON(1);
				1512	}
				1513	return num_refs;
				1514	}
				1515
				1516	static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
				1517	struct btrfs_fs_info *fs_info,
				1518	struct btrfs_path *path,
				1519	u64 bytenr, u64 parent,
				1520	u64 root_objectid)
				1521	{
				1522	struct btrfs_root *root = fs_info->extent_root;
				1523	struct btrfs_key key;
				1524	int ret;
				1525
				1526	key.objectid = bytenr;
				1527	if (parent) {
				1528	key.type = BTRFS_SHARED_BLOCK_REF_KEY;
				1529	key.offset = parent;
				1530	} else {
				1531	key.type = BTRFS_TREE_BLOCK_REF_KEY;
				1532	key.offset = root_objectid;
				1533	}
				1534
				1535	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				1536	if (ret > 0)
				1537	ret = -ENOENT;
				1538	#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
				1539	if (ret == -ENOENT && parent) {
				1540	btrfs_release_path(path);
				1541	key.type = BTRFS_EXTENT_REF_V0_KEY;
				1542	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				1543	if (ret > 0)
				1544	ret = -ENOENT;
				1545	}
				1546	#endif
				1547	return ret;
				1548	}
				1549
				1550	static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
				1551	struct btrfs_fs_info *fs_info,
				1552	struct btrfs_path *path,
				1553	u64 bytenr, u64 parent,
				1554	u64 root_objectid)
				1555	{
				1556	struct btrfs_key key;
				1557	int ret;
				1558
				1559	key.objectid = bytenr;
				1560	if (parent) {
				1561	key.type = BTRFS_SHARED_BLOCK_REF_KEY;
				1562	key.offset = parent;
				1563	} else {
				1564	key.type = BTRFS_TREE_BLOCK_REF_KEY;
				1565	key.offset = root_objectid;
				1566	}
				1567
				1568	ret = btrfs_insert_empty_item(trans, fs_info->extent_root,
				1569	path, &key, 0);
				1570	btrfs_release_path(path);
				1571	return ret;
				1572	}
				1573
				1574	static inline int extent_ref_type(u64 parent, u64 owner)
				1575	{
				1576	int type;
				1577	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
				1578	if (parent > 0)
				1579	type = BTRFS_SHARED_BLOCK_REF_KEY;
				1580	else
				1581	type = BTRFS_TREE_BLOCK_REF_KEY;
				1582	} else {
				1583	if (parent > 0)
				1584	type = BTRFS_SHARED_DATA_REF_KEY;
				1585	else
				1586	type = BTRFS_EXTENT_DATA_REF_KEY;
				1587	}
				1588	return type;
				1589	}
				1590
				1591	static int find_next_key(struct btrfs_path *path, int level,
				1592	struct btrfs_key *key)
				1593
				1594	{
				1595	for (; level < BTRFS_MAX_LEVEL; level++) {
				1596	if (!path->nodes[level])
				1597	break;
				1598	if (path->slots[level] + 1 >=
				1599	btrfs_header_nritems(path->nodes[level]))
				1600	continue;
				1601	if (level == 0)
				1602	btrfs_item_key_to_cpu(path->nodes[level], key,
				1603	path->slots[level] + 1);
				1604	else
				1605	btrfs_node_key_to_cpu(path->nodes[level], key,
				1606	path->slots[level] + 1);
				1607	return 0;
				1608	}
				1609	return 1;
				1610	}
				1611
				1612	/*
				1613	* look for inline back ref. if back ref is found, *ref_ret is set
				1614	* to the address of inline back ref, and 0 is returned.
				1615	*
				1616	* if back ref isn't found, *ref_ret is set to the address where it
				1617	* should be inserted, and -ENOENT is returned.
				1618	*
				1619	* if insert is true and there are too many inline back refs, the path
				1620	* points to the extent item, and -EAGAIN is returned.
				1621	*
				1622	* NOTE: inline back refs are ordered in the same way that back ref
				1623	* items in the tree are ordered.
				1624	*/
				1625	static noinline_for_stack
				1626	int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				1627	struct btrfs_fs_info *fs_info,
				1628	struct btrfs_path *path,
				1629	struct btrfs_extent_inline_ref **ref_ret,
				1630	u64 bytenr, u64 num_bytes,
				1631	u64 parent, u64 root_objectid,
				1632	u64 owner, u64 offset, int insert)
				1633	{
				1634	struct btrfs_root *root = fs_info->extent_root;
				1635	struct btrfs_key key;
				1636	struct extent_buffer *leaf;
				1637	struct btrfs_extent_item *ei;
				1638	struct btrfs_extent_inline_ref *iref;
				1639	u64 flags;
				1640	u64 item_size;
				1641	unsigned long ptr;
				1642	unsigned long end;
				1643	int extra_size;
				1644	int type;
				1645	int want;
				1646	int ret;
				1647	int err = 0;
				1648	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
				1649	int needed;
				1650
				1651	key.objectid = bytenr;
				1652	key.type = BTRFS_EXTENT_ITEM_KEY;
				1653	key.offset = num_bytes;
				1654
				1655	want = extent_ref_type(parent, owner);
				1656	if (insert) {
				1657	extra_size = btrfs_extent_inline_ref_size(want);
				1658	path->keep_locks = 1;
				1659	} else
				1660	extra_size = -1;
				1661
				1662	/*
				1663	* Owner is our parent level, so we can just add one to get the level
				1664	* for the block we are interested in.
				1665	*/
				1666	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
				1667	key.type = BTRFS_METADATA_ITEM_KEY;
				1668	key.offset = owner;
				1669	}
				1670
				1671	again:
				1672	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
				1673	if (ret < 0) {
				1674	err = ret;
				1675	goto out;
				1676	}
				1677
				1678	/*
				1679	* We may be a newly converted file system which still has the old fat
				1680	* extent entries for metadata, so try and see if we have one of those.
				1681	*/
				1682	if (ret > 0 && skinny_metadata) {
				1683	skinny_metadata = false;
				1684	if (path->slots[0]) {
				1685	path->slots[0]--;
				1686	btrfs_item_key_to_cpu(path->nodes[0], &key,
				1687	path->slots[0]);
				1688	if (key.objectid == bytenr &&
				1689	key.type == BTRFS_EXTENT_ITEM_KEY &&
				1690	key.offset == num_bytes)
				1691	ret = 0;
				1692	}
				1693	if (ret) {
				1694	key.objectid = bytenr;
				1695	key.type = BTRFS_EXTENT_ITEM_KEY;
				1696	key.offset = num_bytes;
				1697	btrfs_release_path(path);
				1698	goto again;
				1699	}
				1700	}
				1701
				1702	if (ret && !insert) {
				1703	err = -ENOENT;
				1704	goto out;
				1705	} else if (WARN_ON(ret)) {
				1706	err = -EIO;
				1707	goto out;
				1708	}
				1709
				1710	leaf = path->nodes[0];
				1711	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
				1712	#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
				1713	if (item_size < sizeof(*ei)) {
				1714	if (!insert) {
				1715	err = -ENOENT;
				1716	goto out;
				1717	}
				1718	ret = convert_extent_item_v0(trans, fs_info, path, owner,
				1719	extra_size);
				1720	if (ret < 0) {
				1721	err = ret;
				1722	goto out;
				1723	}
				1724	leaf = path->nodes[0];
				1725	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
				1726	}
				1727	#endif
				1728	BUG_ON(item_size < sizeof(*ei));
				1729
				1730	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
				1731	flags = btrfs_extent_flags(leaf, ei);
				1732
				1733	ptr = (unsigned long)(ei + 1);
				1734	end = (unsigned long)ei + item_size;
				1735
				1736	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
				1737	ptr += sizeof(struct btrfs_tree_block_info);
				1738	BUG_ON(ptr > end);
				1739	}
				1740
				1741	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
				1742	needed = BTRFS_REF_TYPE_DATA;
				1743	else
				1744	needed = BTRFS_REF_TYPE_BLOCK;
				1745
				1746	err = -ENOENT;
				1747	while (1) {
				1748	if (ptr >= end) {
				1749	WARN_ON(ptr > end);
				1750	break;
				1751	}
				1752	iref = (struct btrfs_extent_inline_ref *)ptr;
				1753	type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
				1754	if (type == BTRFS_REF_TYPE_INVALID) {
				1755	err = -EINVAL;
				1756	goto out;
				1757	}
				1758
				1759	if (want < type)
				1760	break;
				1761	if (want > type) {
				1762	ptr += btrfs_extent_inline_ref_size(type);
				1763	continue;
				1764	}
				1765
				1766	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
				1767	struct btrfs_extent_data_ref *dref;
				1768	dref = (struct btrfs_extent_data_ref *)(&iref->offset);
				1769	if (match_extent_data_ref(leaf, dref, root_objectid,
				1770	owner, offset)) {
				1771	err = 0;
				1772	break;
				1773	}
				1774	if (hash_extent_data_ref_item(leaf, dref) <
				1775	hash_extent_data_ref(root_objectid, owner, offset))
				1776	break;
				1777	} else {
				1778	u64 ref_offset;
				1779	ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
				1780	if (parent > 0) {
				1781	if (parent == ref_offset) {
				1782	err = 0;
				1783	break;
				1784	}
				1785	if (ref_offset < parent)
				1786	break;
				1787	} else {
				1788	if (root_objectid == ref_offset) {
				1789	err = 0;
				1790	break;
				1791	}
				1792	if (ref_offset < root_objectid)
				1793	break;
				1794	}
				1795	}
				1796	ptr += btrfs_extent_inline_ref_size(type);
				1797	}
				1798	if (err == -ENOENT && insert) {
				1799	if (item_size + extra_size >=
				1800	BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
				1801	err = -EAGAIN;
				1802	goto out;
				1803	}
				1804	/*
				1805	* To add new inline back ref, we have to make sure
				1806	* there is no corresponding back ref item.
				1807	* For simplicity, we just do not add new inline back
				1808	* ref if there is any kind of item for this block
				1809	*/
				1810	if (find_next_key(path, 0, &key) == 0 &&
				1811	key.objectid == bytenr &&
				1812	key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
				1813	err = -EAGAIN;
				1814	goto out;
				1815	}
				1816	}
				1817	ref_ret = (struct btrfs_extent_inline_ref )ptr;
				1818	out:
				1819	if (insert) {
				1820	path->keep_locks = 0;
				1821	btrfs_unlock_up_safe(path, 1);
				1822	}
				1823	return err;
				1824	}
				1825
				1826	/*
				1827	* helper to add new inline back ref
				1828	*/
				1829	static noinline_for_stack
				1830	void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
				1831	struct btrfs_path *path,
				1832	struct btrfs_extent_inline_ref *iref,
				1833	u64 parent, u64 root_objectid,
				1834	u64 owner, u64 offset, int refs_to_add,
				1835	struct btrfs_delayed_extent_op *extent_op)
				1836	{
				1837	struct extent_buffer *leaf;
				1838	struct btrfs_extent_item *ei;
				1839	unsigned long ptr;
				1840	unsigned long end;
				1841	unsigned long item_offset;
				1842	u64 refs;
				1843	int size;
				1844	int type;
				1845
				1846	leaf = path->nodes[0];
				1847	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
				1848	item_offset = (unsigned long)iref - (unsigned long)ei;
				1849
				1850	type = extent_ref_type(parent, owner);
				1851	size = btrfs_extent_inline_ref_size(type);
				1852
				1853	btrfs_extend_item(fs_info, path, size);
				1854
				1855	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
				1856	refs = btrfs_extent_refs(leaf, ei);
				1857	refs += refs_to_add;
				1858	btrfs_set_extent_refs(leaf, ei, refs);
				1859	if (extent_op)
				1860	__run_delayed_extent_op(extent_op, leaf, ei);
				1861
				1862	ptr = (unsigned long)ei + item_offset;
				1863	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
				1864	if (ptr < end - size)
				1865	memmove_extent_buffer(leaf, ptr + size, ptr,
				1866	end - size - ptr);
				1867
				1868	iref = (struct btrfs_extent_inline_ref *)ptr;
				1869	btrfs_set_extent_inline_ref_type(leaf, iref, type);
				1870	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
				1871	struct btrfs_extent_data_ref *dref;
				1872	dref = (struct btrfs_extent_data_ref *)(&iref->offset);
				1873	btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
				1874	btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
				1875	btrfs_set_extent_data_ref_offset(leaf, dref, offset);
				1876	btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
				1877	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
				1878	struct btrfs_shared_data_ref *sref;
				1879	sref = (struct btrfs_shared_data_ref *)(iref + 1);
				1880	btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
				1881	btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
				1882	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
				1883	btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
				1884	} else {
				1885	btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
				1886	}
				1887	btrfs_mark_buffer_dirty(leaf);
				1888	}
				1889
				1890	static int lookup_extent_backref(struct btrfs_trans_handle *trans,
				1891	struct btrfs_fs_info *fs_info,
				1892	struct btrfs_path *path,
				1893	struct btrfs_extent_inline_ref **ref_ret,
				1894	u64 bytenr, u64 num_bytes, u64 parent,
				1895	u64 root_objectid, u64 owner, u64 offset)
				1896	{
				1897	int ret;
				1898
				1899	ret = lookup_inline_extent_backref(trans, fs_info, path, ref_ret,
				1900	bytenr, num_bytes, parent,
				1901	root_objectid, owner, offset, 0);
				1902	if (ret != -ENOENT)
				1903	return ret;
				1904
				1905	btrfs_release_path(path);
				1906	*ref_ret = NULL;
				1907
				1908	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
				1909	ret = lookup_tree_block_ref(trans, fs_info, path, bytenr,
				1910	parent, root_objectid);
				1911	} else {
				1912	ret = lookup_extent_data_ref(trans, fs_info, path, bytenr,
				1913	parent, root_objectid, owner,
				1914	offset);
				1915	}
				1916	return ret;
				1917	}
				1918
				1919	/*
				1920	* helper to update/remove inline back ref
				1921	*/
				1922	static noinline_for_stack
				1923	void update_inline_extent_backref(struct btrfs_fs_info *fs_info,
				1924	struct btrfs_path *path,
				1925	struct btrfs_extent_inline_ref *iref,
				1926	int refs_to_mod,
				1927	struct btrfs_delayed_extent_op *extent_op,
				1928	int *last_ref)
				1929	{
				1930	struct extent_buffer *leaf;
				1931	struct btrfs_extent_item *ei;
				1932	struct btrfs_extent_data_ref *dref = NULL;
				1933	struct btrfs_shared_data_ref *sref = NULL;
				1934	unsigned long ptr;
				1935	unsigned long end;
				1936	u32 item_size;
				1937	int size;
				1938	int type;
				1939	u64 refs;
				1940
				1941	leaf = path->nodes[0];
				1942	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
				1943	refs = btrfs_extent_refs(leaf, ei);
				1944	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
				1945	refs += refs_to_mod;
				1946	btrfs_set_extent_refs(leaf, ei, refs);
				1947	if (extent_op)
				1948	__run_delayed_extent_op(extent_op, leaf, ei);
				1949
				1950	/*
				1951	* If type is invalid, we should have bailed out after
				1952	* lookup_inline_extent_backref().
				1953	*/
				1954	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
				1955	ASSERT(type != BTRFS_REF_TYPE_INVALID);
				1956
				1957	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
				1958	dref = (struct btrfs_extent_data_ref *)(&iref->offset);
				1959	refs = btrfs_extent_data_ref_count(leaf, dref);
				1960	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
				1961	sref = (struct btrfs_shared_data_ref *)(iref + 1);
				1962	refs = btrfs_shared_data_ref_count(leaf, sref);
				1963	} else {
				1964	refs = 1;
				1965	BUG_ON(refs_to_mod != -1);
				1966	}
				1967
				1968	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
				1969	refs += refs_to_mod;
				1970
				1971	if (refs > 0) {
				1972	if (type == BTRFS_EXTENT_DATA_REF_KEY)
				1973	btrfs_set_extent_data_ref_count(leaf, dref, refs);
				1974	else
				1975	btrfs_set_shared_data_ref_count(leaf, sref, refs);
				1976	} else {
				1977	*last_ref = 1;
				1978	size = btrfs_extent_inline_ref_size(type);
				1979	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
				1980	ptr = (unsigned long)iref;
				1981	end = (unsigned long)ei + item_size;
				1982	if (ptr + size < end)
				1983	memmove_extent_buffer(leaf, ptr, ptr + size,
				1984	end - ptr - size);
				1985	item_size -= size;
				1986	btrfs_truncate_item(fs_info, path, item_size, 1);
				1987	}
				1988	btrfs_mark_buffer_dirty(leaf);
				1989	}
				1990
				1991	static noinline_for_stack
				1992	int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
				1993	struct btrfs_fs_info *fs_info,
				1994	struct btrfs_path *path,
				1995	u64 bytenr, u64 num_bytes, u64 parent,
				1996	u64 root_objectid, u64 owner,
				1997	u64 offset, int refs_to_add,
				1998	struct btrfs_delayed_extent_op *extent_op)
				1999	{
				2000	struct btrfs_extent_inline_ref *iref;
				2001	int ret;
				2002
				2003	ret = lookup_inline_extent_backref(trans, fs_info, path, &iref,
				2004	bytenr, num_bytes, parent,
				2005	root_objectid, owner, offset, 1);
				2006	if (ret == 0) {
				2007	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
				2008	update_inline_extent_backref(fs_info, path, iref,
				2009	refs_to_add, extent_op, NULL);
				2010	} else if (ret == -ENOENT) {
				2011	setup_inline_extent_backref(fs_info, path, iref, parent,
				2012	root_objectid, owner, offset,
				2013	refs_to_add, extent_op);
				2014	ret = 0;
				2015	}
				2016	return ret;
				2017	}
				2018
				2019	static int insert_extent_backref(struct btrfs_trans_handle *trans,
				2020	struct btrfs_fs_info *fs_info,
				2021	struct btrfs_path *path,
				2022	u64 bytenr, u64 parent, u64 root_objectid,
				2023	u64 owner, u64 offset, int refs_to_add)
				2024	{
				2025	int ret;
				2026	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
				2027	BUG_ON(refs_to_add != 1);
				2028	ret = insert_tree_block_ref(trans, fs_info, path, bytenr,
				2029	parent, root_objectid);
				2030	} else {
				2031	ret = insert_extent_data_ref(trans, fs_info, path, bytenr,
				2032	parent, root_objectid,
				2033	owner, offset, refs_to_add);
				2034	}
				2035	return ret;
				2036	}
				2037
				2038	static int remove_extent_backref(struct btrfs_trans_handle *trans,
				2039	struct btrfs_fs_info *fs_info,
				2040	struct btrfs_path *path,
				2041	struct btrfs_extent_inline_ref *iref,
				2042	int refs_to_drop, int is_data, int *last_ref)
				2043	{
				2044	int ret = 0;
				2045
				2046	BUG_ON(!is_data && refs_to_drop != 1);
				2047	if (iref) {
				2048	update_inline_extent_backref(fs_info, path, iref,
				2049	-refs_to_drop, NULL, last_ref);
				2050	} else if (is_data) {
				2051	ret = remove_extent_data_ref(trans, fs_info, path, refs_to_drop,
				2052	last_ref);
				2053	} else {
				2054	*last_ref = 1;
				2055	ret = btrfs_del_item(trans, fs_info->extent_root, path);
				2056	}
				2057	return ret;
				2058	}
				2059
				2060	#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
				2061	static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
				2062	u64 *discarded_bytes)
				2063	{
				2064	int j, ret = 0;
				2065	u64 bytes_left, end;
				2066	u64 aligned_start = ALIGN(start, 1 << 9);
				2067
				2068	if (WARN_ON(start != aligned_start)) {
				2069	len -= aligned_start - start;
				2070	len = round_down(len, 1 << 9);
				2071	start = aligned_start;
				2072	}
				2073
				2074	*discarded_bytes = 0;
				2075
				2076	if (!len)
				2077	return 0;
				2078
				2079	end = start + len;
				2080	bytes_left = len;
				2081
				2082	/* Skip any superblocks on this device. */
				2083	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
				2084	u64 sb_start = btrfs_sb_offset(j);
				2085	u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
				2086	u64 size = sb_start - start;
				2087
				2088	if (!in_range(sb_start, start, bytes_left) &&
				2089	!in_range(sb_end, start, bytes_left) &&
				2090	!in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
				2091	continue;
				2092
				2093	/*
				2094	* Superblock spans beginning of range. Adjust start and
				2095	* try again.
				2096	*/
				2097	if (sb_start <= start) {
				2098	start += sb_end - start;
				2099	if (start > end) {
				2100	bytes_left = 0;
				2101	break;
				2102	}
				2103	bytes_left = end - start;
				2104	continue;
				2105	}
				2106
				2107	if (size) {
				2108	ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
				2109	GFP_NOFS, 0);
				2110	if (!ret)
				2111	*discarded_bytes += size;
				2112	else if (ret != -EOPNOTSUPP)
				2113	return ret;
				2114	}
				2115
				2116	start = sb_end;
				2117	if (start > end) {
				2118	bytes_left = 0;
				2119	break;
				2120	}
				2121	bytes_left = end - start;
				2122	}
				2123
				2124	if (bytes_left) {
				2125	ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
				2126	GFP_NOFS, 0);
				2127	if (!ret)
				2128	*discarded_bytes += bytes_left;
				2129	}
				2130	return ret;
				2131	}
				2132
				2133	int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
				2134	u64 num_bytes, u64 *actual_bytes)
				2135	{
				2136	int ret;
				2137	u64 discarded_bytes = 0;
				2138	struct btrfs_bio *bbio = NULL;
				2139
				2140
				2141	/*
				2142	* Avoid races with device replace and make sure our bbio has devices
				2143	* associated to its stripes that don't go away while we are discarding.
				2144	*/
				2145	btrfs_bio_counter_inc_blocked(fs_info);
				2146	/* Tell the block device(s) that the sectors can be discarded */
				2147	ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
				2148	&bbio, 0);
				2149	/* Error condition is -ENOMEM */
				2150	if (!ret) {
				2151	struct btrfs_bio_stripe *stripe = bbio->stripes;
				2152	int i;
				2153
				2154
				2155	for (i = 0; i < bbio->num_stripes; i++, stripe++) {
				2156	u64 bytes;
				2157	if (!stripe->dev->can_discard)
				2158	continue;
				2159
				2160	ret = btrfs_issue_discard(stripe->dev->bdev,
				2161	stripe->physical,
				2162	stripe->length,
				2163	&bytes);
				2164	if (!ret)
				2165	discarded_bytes += bytes;
				2166	else if (ret != -EOPNOTSUPP)
				2167	break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
				2168
				2169	/*
				2170	* Just in case we get back EOPNOTSUPP for some reason,
				2171	* just ignore the return value so we don't screw up
				2172	* people calling discard_extent.
				2173	*/
				2174	ret = 0;
				2175	}
				2176	btrfs_put_bbio(bbio);
				2177	}
				2178	btrfs_bio_counter_dec(fs_info);
				2179
				2180	if (actual_bytes)
				2181	*actual_bytes = discarded_bytes;
				2182
				2183
				2184	if (ret == -EOPNOTSUPP)
				2185	ret = 0;
				2186	return ret;
				2187	}
				2188
				2189	/* Can return -ENOMEM */
				2190	int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
				2191	struct btrfs_fs_info *fs_info,
				2192	u64 bytenr, u64 num_bytes, u64 parent,
				2193	u64 root_objectid, u64 owner, u64 offset)
				2194	{
				2195	int old_ref_mod, new_ref_mod;
				2196	int ret;
				2197
				2198	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
				2199	root_objectid == BTRFS_TREE_LOG_OBJECTID);
				2200
				2201	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
				2202	ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
				2203	num_bytes, parent,
				2204	root_objectid, (int)owner,
				2205	BTRFS_ADD_DELAYED_REF, NULL,
				2206	&old_ref_mod, &new_ref_mod);
				2207	} else {
				2208	ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
				2209	num_bytes, parent,
				2210	root_objectid, owner, offset,
				2211	0, BTRFS_ADD_DELAYED_REF,
				2212	&old_ref_mod, &new_ref_mod);
				2213	}
				2214
				2215	if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
				2216	add_pinned_bytes(fs_info, -num_bytes, owner, root_objectid);
				2217
				2218	return ret;
				2219	}
				2220
				2221	static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
				2222	struct btrfs_fs_info *fs_info,
				2223	struct btrfs_delayed_ref_node *node,
				2224	u64 parent, u64 root_objectid,
				2225	u64 owner, u64 offset, int refs_to_add,
				2226	struct btrfs_delayed_extent_op *extent_op)
				2227	{
				2228	struct btrfs_path *path;
				2229	struct extent_buffer *leaf;
				2230	struct btrfs_extent_item *item;
				2231	struct btrfs_key key;
				2232	u64 bytenr = node->bytenr;
				2233	u64 num_bytes = node->num_bytes;
				2234	u64 refs;
				2235	int ret;
				2236
				2237	path = btrfs_alloc_path();
				2238	if (!path)
				2239	return -ENOMEM;
				2240
				2241	path->reada = READA_FORWARD;
				2242	path->leave_spinning = 1;
				2243	/* this will setup the path even if it fails to insert the back ref */
				2244	ret = insert_inline_extent_backref(trans, fs_info, path, bytenr,
				2245	num_bytes, parent, root_objectid,
				2246	owner, offset,
				2247	refs_to_add, extent_op);
				2248	if ((ret < 0 && ret != -EAGAIN) \|\| !ret)
				2249	goto out;
				2250
				2251	/*
				2252	* Ok we had -EAGAIN which means we didn't have space to insert and
				2253	* inline extent ref, so just update the reference count and add a
				2254	* normal backref.
				2255	*/
				2256	leaf = path->nodes[0];
				2257	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				2258	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
				2259	refs = btrfs_extent_refs(leaf, item);
				2260	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
				2261	if (extent_op)
				2262	__run_delayed_extent_op(extent_op, leaf, item);
				2263
				2264	btrfs_mark_buffer_dirty(leaf);
				2265	btrfs_release_path(path);
				2266
				2267	path->reada = READA_FORWARD;
				2268	path->leave_spinning = 1;
				2269	/* now insert the actual backref */
				2270	ret = insert_extent_backref(trans, fs_info, path, bytenr, parent,
				2271	root_objectid, owner, offset, refs_to_add);
				2272	if (ret)
				2273	btrfs_abort_transaction(trans, ret);
				2274	out:
				2275	btrfs_free_path(path);
				2276	return ret;
				2277	}
				2278
				2279	static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
				2280	struct btrfs_fs_info *fs_info,
				2281	struct btrfs_delayed_ref_node *node,
				2282	struct btrfs_delayed_extent_op *extent_op,
				2283	int insert_reserved)
				2284	{
				2285	int ret = 0;
				2286	struct btrfs_delayed_data_ref *ref;
				2287	struct btrfs_key ins;
				2288	u64 parent = 0;
				2289	u64 ref_root = 0;
				2290	u64 flags = 0;
				2291
				2292	ins.objectid = node->bytenr;
				2293	ins.offset = node->num_bytes;
				2294	ins.type = BTRFS_EXTENT_ITEM_KEY;
				2295
				2296	ref = btrfs_delayed_node_to_data_ref(node);
				2297	trace_run_delayed_data_ref(fs_info, node, ref, node->action);
				2298
				2299	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
				2300	parent = ref->parent;
				2301	ref_root = ref->root;
				2302
				2303	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
				2304	if (extent_op)
				2305	flags \|= extent_op->flags_to_set;
				2306	ret = alloc_reserved_file_extent(trans, fs_info,
				2307	parent, ref_root, flags,
				2308	ref->objectid, ref->offset,
				2309	&ins, node->ref_mod);
				2310	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
				2311	ret = __btrfs_inc_extent_ref(trans, fs_info, node, parent,
				2312	ref_root, ref->objectid,
				2313	ref->offset, node->ref_mod,
				2314	extent_op);
				2315	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
				2316	ret = __btrfs_free_extent(trans, fs_info, node, parent,
				2317	ref_root, ref->objectid,
				2318	ref->offset, node->ref_mod,
				2319	extent_op);
				2320	} else {
				2321	BUG();
				2322	}
				2323	return ret;
				2324	}
				2325
				2326	static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				2327	struct extent_buffer *leaf,
				2328	struct btrfs_extent_item *ei)
				2329	{
				2330	u64 flags = btrfs_extent_flags(leaf, ei);
				2331	if (extent_op->update_flags) {
				2332	flags \|= extent_op->flags_to_set;
				2333	btrfs_set_extent_flags(leaf, ei, flags);
				2334	}
				2335
				2336	if (extent_op->update_key) {
				2337	struct btrfs_tree_block_info *bi;
				2338	BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
				2339	bi = (struct btrfs_tree_block_info *)(ei + 1);
				2340	btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
				2341	}
				2342	}
				2343
				/*
				 * Locate the extent item for node->bytenr in the extent root and apply
				 * @extent_op to it via __run_delayed_extent_op().
				 *
				 * Returns 0 on success or when the transaction is already aborted (in
				 * which case nothing is done), -ENOMEM if no path can be allocated, a
				 * negative btrfs_search_slot() error, -EIO if an EXTENT_ITEM key search
				 * finds no exact match, or (with BTRFS_COMPAT_EXTENT_TREE_V0) the error
				 * from convert_extent_item_v0().
				 */
				2344	static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
				2345	struct btrfs_fs_info *fs_info,
				2346	struct btrfs_delayed_ref_node *node,
				2347	struct btrfs_delayed_extent_op *extent_op)
				2348	{
				2349	struct btrfs_key key;
				2350	struct btrfs_path *path;
				2351	struct btrfs_extent_item *ei;
				2352	struct extent_buffer *leaf;
				2353	u32 item_size;
				2354	int ret;
				2355	int err = 0;
				2356	int metadata = !extent_op->is_data;
				2357
				2358	if (trans->aborted)
				2359	return 0;
				2360
				/* Without SKINNY_METADATA only EXTENT_ITEM keys are searched for. */
				2361	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
				2362	metadata = 0;
				2363
				2364	path = btrfs_alloc_path();
				2365	if (!path)
				2366	return -ENOMEM;
				2367
				2368	key.objectid = node->bytenr;
				2369
				2370	if (metadata) {
				2371	key.type = BTRFS_METADATA_ITEM_KEY;
				2372	key.offset = extent_op->level;
				2373	} else {
				2374	key.type = BTRFS_EXTENT_ITEM_KEY;
				2375	key.offset = node->num_bytes;
				2376	}
				2377
				2378	again:
				2379	path->reada = READA_FORWARD;
				2380	path->leave_spinning = 1;
				2381	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
				2382	if (ret < 0) {
				2383	err = ret;
				2384	goto out;
				2385	}
				/* ret > 0: no exact match for the key that was searched. */
				2386	if (ret > 0) {
				2387	if (metadata) {
				/*
				 * The METADATA_ITEM key was not found.  Check whether the previous
				 * slot holds an EXTENT_ITEM for the same bytenr/num_bytes.  Note
				 * that this overwrites 'key' with the previous slot's key.
				 */
				2388	if (path->slots[0] > 0) {
				2389	path->slots[0]--;
				2390	btrfs_item_key_to_cpu(path->nodes[0], &key,
				2391	path->slots[0]);
				2392	if (key.objectid == node->bytenr &&
				2393	key.type == BTRFS_EXTENT_ITEM_KEY &&
				2394	key.offset == node->num_bytes)
				2395	ret = 0;
				2396	}
				/*
				 * Still nothing: rebuild the key as an EXTENT_ITEM key and search
				 * once more.  'metadata' is cleared, so a second miss ends in -EIO.
				 */
				2397	if (ret > 0) {
				2398	btrfs_release_path(path);
				2399	metadata = 0;
				2400
				2401	key.objectid = node->bytenr;
				2402	key.offset = node->num_bytes;
				2403	key.type = BTRFS_EXTENT_ITEM_KEY;
				2404	goto again;
				2405	}
				2406	} else {
				2407	err = -EIO;
				2408	goto out;
				2409	}
				2410	}
				2411
				2412	leaf = path->nodes[0];
				2413	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
				2414	#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
				/* Item smaller than a current extent item: convert it first, then re-read. */
				2415	if (item_size < sizeof(*ei)) {
				2416	ret = convert_extent_item_v0(trans, fs_info, path, (u64)-1, 0);
				2417	if (ret < 0) {
				2418	err = ret;
				2419	goto out;
				2420	}
				2421	leaf = path->nodes[0];
				2422	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
				2423	}
				2424	#endif
				2425	BUG_ON(item_size < sizeof(*ei));
				2426	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
				2427	__run_delayed_extent_op(extent_op, leaf, ei);
				2428
				2429	btrfs_mark_buffer_dirty(leaf);
				2430	out:
				2431	btrfs_free_path(path);
				2432	return err;
				2433	}
				2434
				2435	static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
				2436	struct btrfs_fs_info *fs_info,
				2437	struct btrfs_delayed_ref_node *node,
				2438	struct btrfs_delayed_extent_op *extent_op,
				2439	int insert_reserved)
				2440	{
				2441	int ret = 0;
				2442	struct btrfs_delayed_tree_ref *ref;
				2443	struct btrfs_key ins;
				2444	u64 parent = 0;
				2445	u64 ref_root = 0;
				2446	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
				2447
				2448	ref = btrfs_delayed_node_to_tree_ref(node);
				2449	trace_run_delayed_tree_ref(fs_info, node, ref, node->action);
				2450
				2451	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
				2452	parent = ref->parent;
				2453	ref_root = ref->root;
				2454
				2455	ins.objectid = node->bytenr;
				2456	if (skinny_metadata) {
				2457	ins.offset = ref->level;
				2458	ins.type = BTRFS_METADATA_ITEM_KEY;
				2459	} else {
				2460	ins.offset = node->num_bytes;
				2461	ins.type = BTRFS_EXTENT_ITEM_KEY;
				2462	}
				2463
				2464	if (node->ref_mod != 1) {
				2465	btrfs_err(fs_info,
				2466	"btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
				2467	node->bytenr, node->ref_mod, node->action, ref_root,
				2468	parent);
				2469	return -EIO;
				2470	}
				2471	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
				2472	BUG_ON(!extent_op \|\| !extent_op->update_flags);
				2473	ret = alloc_reserved_tree_block(trans, fs_info,
				2474	parent, ref_root,
				2475	extent_op->flags_to_set,
				2476	&extent_op->key,
				2477	ref->level, &ins);
				2478	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
				2479	ret = __btrfs_inc_extent_ref(trans, fs_info, node,
				2480	parent, ref_root,
				2481	ref->level, 0, 1,
				2482	extent_op);
				2483	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
				2484	ret = __btrfs_free_extent(trans, fs_info, node,
				2485	parent, ref_root,
				2486	ref->level, 0, 1, extent_op);
				2487	} else {
				2488	BUG();
				2489	}
				2490	return ret;
				2491	}
				2492
				2493	/* helper function to actually process a single delayed ref entry */
				2494	static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
				2495	struct btrfs_fs_info *fs_info,
				2496	struct btrfs_delayed_ref_node *node,
				2497	struct btrfs_delayed_extent_op *extent_op,
				2498	int insert_reserved)
				2499	{
				2500	int ret = 0;
				2501
				2502	if (trans->aborted) {
				2503	if (insert_reserved)
				2504	btrfs_pin_extent(fs_info, node->bytenr,
				2505	node->num_bytes, 1);
				2506	return 0;
				2507	}
				2508
				2509	if (btrfs_delayed_ref_is_head(node)) {
				2510	struct btrfs_delayed_ref_head *head;
				2511	/*
				2512	* we've hit the end of the chain and we were supposed
				2513	* to insert this extent into the tree. But, it got
				2514	* deleted before we ever needed to insert it, so all
				2515	* we have to do is clean up the accounting
				2516	*/
				2517	BUG_ON(extent_op);
				2518	head = btrfs_delayed_node_to_head(node);
				2519	trace_run_delayed_ref_head(fs_info, node, head, node->action);
				2520
				2521	if (head->total_ref_mod < 0) {
				2522	struct btrfs_block_group_cache *cache;
				2523
				2524	cache = btrfs_lookup_block_group(fs_info, node->bytenr);
				2525	ASSERT(cache);
				2526	percpu_counter_add(&cache->space_info->total_bytes_pinned,
				2527	-node->num_bytes);
				2528	btrfs_put_block_group(cache);
				2529	}
				2530
				2531	if (insert_reserved) {
				2532	btrfs_pin_extent(fs_info, node->bytenr,
				2533	node->num_bytes, 1);
				2534	if (head->is_data) {
				2535	ret = btrfs_del_csums(trans, fs_info,
				2536	node->bytenr,
				2537	node->num_bytes);
				2538	}
				2539	}
				2540
				2541	/* Also free its reserved qgroup space */
				2542	btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
				2543	head->qgroup_reserved);
				2544	return ret;
				2545	}
				2546
				2547	if (node->type == BTRFS_TREE_BLOCK_REF_KEY \|\|
				2548	node->type == BTRFS_SHARED_BLOCK_REF_KEY)
				2549	ret = run_delayed_tree_ref(trans, fs_info, node, extent_op,
				2550	insert_reserved);
				2551	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY \|\|
				2552	node->type == BTRFS_SHARED_DATA_REF_KEY)
				2553	ret = run_delayed_data_ref(trans, fs_info, node, extent_op,
				2554	insert_reserved);
				2555	else
				2556	BUG();
				2557	return ret;
				2558	}
				2559
				2560	static inline struct btrfs_delayed_ref_node *
				2561	select_delayed_ref(struct btrfs_delayed_ref_head *head)
				2562	{
				2563	struct btrfs_delayed_ref_node *ref;
				2564
				2565	if (list_empty(&head->ref_list))
				2566	return NULL;
				2567
				2568	/*
				2569	* Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
				2570	* This is to prevent a ref count from going down to zero, which deletes
				2571	* the extent item from the extent tree, when there still are references
				2572	* to add, which would fail because they would not find the extent item.
				2573	*/
				2574	if (!list_empty(&head->ref_add_list))
				2575	return list_first_entry(&head->ref_add_list,
				2576	struct btrfs_delayed_ref_node, add_list);
				2577
				2578	ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
				2579	list);
				2580	ASSERT(list_empty(&ref->add_list));
				2581	return ref;
				2582	}
				2583
				2584	/*
				2585	* Returns 0 on success or if called with an already aborted transaction.
				2586	* Returns -ENOMEM or -EIO on failure and will abort the transaction.
				2587	*/
				/*
				 * @nr bounds the work: once 'count' has reached @nr no further ref head
				 * is selected.  The check is only made while no head is locked, so a
				 * head that is already being processed is carried on with.
				 */
				2588	static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
				2589	struct btrfs_fs_info *fs_info,
				2590	unsigned long nr)
				2591	{
				2592	struct btrfs_delayed_ref_root *delayed_refs;
				2593	struct btrfs_delayed_ref_node *ref;
				2594	struct btrfs_delayed_ref_head *locked_ref = NULL;
				2595	struct btrfs_delayed_extent_op *extent_op;
				2596	ktime_t start = ktime_get();
				2597	int ret;
				2598	unsigned long count = 0;
				2599	unsigned long actual_count = 0;
				2600	int must_insert_reserved = 0;
				2601
				2602	delayed_refs = &trans->transaction->delayed_refs;
				2603	while (1) {
				2604	if (!locked_ref) {
				2605	if (count >= nr)
				2606	break;
				2607
				2608	spin_lock(&delayed_refs->lock);
				2609	locked_ref = btrfs_select_ref_head(trans);
				2610	if (!locked_ref) {
				2611	spin_unlock(&delayed_refs->lock);
				2612	break;
				2613	}
				2614
				2615	/* grab the lock that says we are going to process
				2616	* all the refs for this head */
				2617	ret = btrfs_delayed_ref_lock(trans, locked_ref);
				2618	spin_unlock(&delayed_refs->lock);
				2619	/*
				2620	* we may have dropped the spin lock to get the head
				2621	* mutex lock, and that might have given someone else
				2622	* time to free the head. If that's true, it has been
				2623	* removed from our list and we can move on.
				2624	*/
				2625	if (ret == -EAGAIN) {
				2626	locked_ref = NULL;
				2627	count++;
				2628	continue;
				2629	}
				2630	}
				2631
				2632	/*
				2633	* We need to try and merge add/drops of the same ref since we
				2634	* can run into issues with relocate dropping the implicit ref
				2635	* and then it being added back again before the drop can
				2636	* finish. If we merged anything we need to re-loop so we can
				2637	* get a good ref.
				2638	* Or we can get node references of the same type that weren't
				2639	* merged when created due to bumps in the tree mod seq, and
				2640	* we need to merge them to prevent adding an inline extent
				2641	* backref before dropping it (triggering a BUG_ON at
				2642	* insert_inline_extent_backref()).
				2643	*/
				2644	spin_lock(&locked_ref->lock);
				2645	btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
				2646	locked_ref);
				2647
				2648	/*
				2649	* locked_ref is the head node, so we have to go one
				2650	* node back for any delayed ref updates
				2651	*/
				2652	ref = select_delayed_ref(locked_ref);
				2653
				/*
				 * The selected ref has a seq and btrfs_check_delayed_seq() returned
				 * non-zero: give this head back (clear ->processing, bump
				 * num_heads_ready, unlock it) and go pick another one.
				 */
				2654	if (ref && ref->seq &&
				2655	btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
				2656	spin_unlock(&locked_ref->lock);
				2657	spin_lock(&delayed_refs->lock);
				2658	locked_ref->processing = 0;
				2659	delayed_refs->num_heads_ready++;
				2660	spin_unlock(&delayed_refs->lock);
				2661	btrfs_delayed_ref_unlock(locked_ref);
				2662	locked_ref = NULL;
				2663	cond_resched();
				2664	count++;
				2665	continue;
				2666	}
				2667
				2668	/*
				2669	* record the must insert reserved flag before we
				2670	* drop the spin lock.
				2671	*/
				2672	must_insert_reserved = locked_ref->must_insert_reserved;
				2673	locked_ref->must_insert_reserved = 0;
				2674
				/* Take the pending extent op off the head; it is freed below. */
				2675	extent_op = locked_ref->extent_op;
				2676	locked_ref->extent_op = NULL;
				2677
				2678	if (!ref) {
				2679
				2680
				2681	/* All delayed refs have been processed, Go ahead
				2682	* and send the head node to run_one_delayed_ref,
				2683	* so that any accounting fixes can happen
				2684	*/
				2685	ref = &locked_ref->node;
				2686
				/* With must_insert_reserved set the extent op is discarded unrun. */
				2687	if (extent_op && must_insert_reserved) {
				2688	btrfs_free_delayed_extent_op(extent_op);
				2689	extent_op = NULL;
				2690	}
				2691
				2692	if (extent_op) {
				2693	spin_unlock(&locked_ref->lock);
				2694	ret = run_delayed_extent_op(trans, fs_info,
				2695	ref, extent_op);
				2696	btrfs_free_delayed_extent_op(extent_op);
				2697
				2698	if (ret) {
				2699	/*
				2700	* Need to reset must_insert_reserved if
				2701	* there was an error so the abort stuff
				2702	* can cleanup the reserved space
				2703	* properly.
				2704	*/
				2705	if (must_insert_reserved)
				2706	locked_ref->must_insert_reserved = 1;
				2707	spin_lock(&delayed_refs->lock);
				2708	locked_ref->processing = 0;
				2709	delayed_refs->num_heads_ready++;
				2710	spin_unlock(&delayed_refs->lock);
				2711	btrfs_debug(fs_info,
				2712	"run_delayed_extent_op returned %d",
				2713	ret);
				2714	btrfs_delayed_ref_unlock(locked_ref);
				2715	return ret;
				2716	}
				/* locked_ref is still set, so the next pass looks at this head again. */
				2717	continue;
				2718	}
				2719
				2720	/*
				2721	* Need to drop our head ref lock and re-acquire the
				2722	* delayed ref lock and then re-check to make sure
				2723	* nobody got added.
				2724	*/
				2725	spin_unlock(&locked_ref->lock);
				2726	spin_lock(&delayed_refs->lock);
				2727	spin_lock(&locked_ref->lock);
				2728	if (!list_empty(&locked_ref->ref_list) \|\|
				2729	locked_ref->extent_op) {
				2730	spin_unlock(&locked_ref->lock);
				2731	spin_unlock(&delayed_refs->lock);
				2732	continue;
				2733	}
				/* Head is really empty: take it out of the head rbtree. */
				2734	ref->in_tree = 0;
				2735	delayed_refs->num_heads--;
				2736	rb_erase(&locked_ref->href_node,
				2737	&delayed_refs->href_root);
				2738	spin_unlock(&delayed_refs->lock);
				2739	} else {
				/* A real ref: counted for the runtime average and unlinked from the head. */
				2740	actual_count++;
				2741	ref->in_tree = 0;
				2742	list_del(&ref->list);
				2743	if (!list_empty(&ref->add_list))
				2744	list_del(&ref->add_list);
				2745	}
				2746	atomic_dec(&delayed_refs->num_entries);
				2747
				2748	if (!btrfs_delayed_ref_is_head(ref)) {
				2749	/*
				2750	* when we play the delayed ref, also correct the
				2751	* ref_mod on head
				2752	*/
				2753	switch (ref->action) {
				2754	case BTRFS_ADD_DELAYED_REF:
				2755	case BTRFS_ADD_DELAYED_EXTENT:
				2756	locked_ref->node.ref_mod -= ref->ref_mod;
				2757	break;
				2758	case BTRFS_DROP_DELAYED_REF:
				2759	locked_ref->node.ref_mod += ref->ref_mod;
				2760	break;
				2761	default:
				2762	WARN_ON(1);
				2763	}
				2764	}
				2765	spin_unlock(&locked_ref->lock);
				2766
				/* Run the ref (or the head's accounting) with no spinlock held. */
				2767	ret = run_one_delayed_ref(trans, fs_info, ref, extent_op,
				2768	must_insert_reserved);
				2769
				2770	btrfs_free_delayed_extent_op(extent_op);
				/* On error: clear ->processing, bump num_heads_ready, unlock and bail. */
				2771	if (ret) {
				2772	spin_lock(&delayed_refs->lock);
				2773	locked_ref->processing = 0;
				2774	delayed_refs->num_heads_ready++;
				2775	spin_unlock(&delayed_refs->lock);
				2776	btrfs_delayed_ref_unlock(locked_ref);
				2777	btrfs_put_delayed_ref(ref);
				2778	btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
				2779	ret);
				2780	return ret;
				2781	}
				2782
				2783	/*
				2784	* If this node is a head, that means all the refs in this head
				2785	* have been dealt with, and we will pick the next head to deal
				2786	* with, so we must unlock the head and drop it from the cluster
				2787	* list before we release it.
				2788	*/
				2789	if (btrfs_delayed_ref_is_head(ref)) {
				2790	if (locked_ref->is_data &&
				2791	locked_ref->total_ref_mod < 0) {
				2792	spin_lock(&delayed_refs->lock);
				2793	delayed_refs->pending_csums -= ref->num_bytes;
				2794	spin_unlock(&delayed_refs->lock);
				2795	}
				2796	btrfs_delayed_ref_unlock(locked_ref);
				2797	locked_ref = NULL;
				2798	}
				2799	btrfs_put_delayed_ref(ref);
				2800	count++;
				2801	cond_resched();
				2802	}
				2803
				2804	/*
				2805	* We don't want to include ref heads since we can have empty ref heads
				2806	* and those will drastically skew our runtime down since we just do
				2807	* accounting, no actual extent tree updates.
				2808	*/
				2809	if (actual_count > 0) {
				2810	u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
				2811	u64 avg;
				2812
				2813	/*
				2814	* We weigh the current average higher than our current runtime
				2815	* to avoid large swings in the average.
				2816	*/
				2817	spin_lock(&delayed_refs->lock);
				2818	avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
				2819	fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */
				2820	spin_unlock(&delayed_refs->lock);
				2821	}
				2822	return 0;
				2823	}
				2824
				2825	#ifdef SCRAMBLE_DELAYED_REFS
				2826	/*
				2827	* Normally delayed refs get processed in ascending bytenr order. This
				2828	* correlates in most cases to the order added. To expose dependencies on this
				2829	* order, we start to process the tree in the middle instead of the beginning
				2830	*/
				2831	static u64 find_middle(struct rb_root *root)
				2832	{
				2833	struct rb_node *n = root->rb_node;
				2834	struct btrfs_delayed_ref_node *entry;
				2835	int alt = 1;
				2836	u64 middle;
				2837	u64 first = 0, last = 0;
				2838
				2839	n = rb_first(root);
				2840	if (n) {
				2841	entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
				2842	first = entry->bytenr;
				2843	}
				2844	n = rb_last(root);
				2845	if (n) {
				2846	entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
				2847	last = entry->bytenr;
				2848	}
				2849	n = root->rb_node;
				2850
				2851	while (n) {
				2852	entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
				2853	WARN_ON(!entry->in_tree);
				2854
				2855	middle = entry->bytenr;
				2856
				2857	if (alt)
				2858	n = n->rb_left;
				2859	else
				2860	n = n->rb_right;
				2861
				2862	alt = 1 - alt;
				2863	}
				2864	return middle;
				2865	}
				2866	#endif
				2867
				2868	static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
				2869	{
				2870	u64 num_bytes;
				2871
				2872	num_bytes = heads * (sizeof(struct btrfs_extent_item) +
				2873	sizeof(struct btrfs_extent_inline_ref));
				2874	if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
				2875	num_bytes += heads * sizeof(struct btrfs_tree_block_info);
				2876
				2877	/*
				2878	* We don't ever fill up leaves all the way so multiply by 2 just to be
				2879	* closer to what we're really going to want to use.
				2880	*/
				2881	return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
				2882	}
				2883
				2884	/*
				2885	* Takes the number of bytes to be csumm'ed and figures out how many leaves it
				2886	* would require to store the csums for that many bytes.
				2887	*/
				2888	u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
				2889	{
				2890	u64 csum_size;
				2891	u64 num_csums_per_leaf;
				2892	u64 num_csums;
				2893
				2894	csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
				2895	num_csums_per_leaf = div64_u64(csum_size,
				2896	(u64)btrfs_super_csum_size(fs_info->super_copy));
				2897	num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
				2898	num_csums += num_csums_per_leaf - 1;
				2899	num_csums = div64_u64(num_csums, num_csums_per_leaf);
				2900	return num_csums;
				2901	}
				2902
				2903	int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
				2904	struct btrfs_fs_info *fs_info)
				2905	{
				2906	struct btrfs_block_rsv *global_rsv;
				2907	u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
				2908	u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
				2909	u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
				2910	u64 num_bytes, num_dirty_bgs_bytes;
				2911	int ret = 0;
				2912
				2913	num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
				2914	num_heads = heads_to_leaves(fs_info, num_heads);
				2915	if (num_heads > 1)
				2916	num_bytes += (num_heads - 1) * fs_info->nodesize;
				2917	num_bytes <<= 1;
				2918	num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) *
				2919	fs_info->nodesize;
				2920	num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info,
				2921	num_dirty_bgs);
				2922	global_rsv = &fs_info->global_block_rsv;
				2923
				2924	/*
				2925	* If we can't allocate any more chunks lets make sure we have _lots_ of
				2926	* wiggle room since running delayed refs can create more delayed refs.
				2927	*/
				2928	if (global_rsv->space_info->full) {
				2929	num_dirty_bgs_bytes <<= 1;
				2930	num_bytes <<= 1;
				2931	}
				2932
				2933	spin_lock(&global_rsv->lock);
				2934	if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
				2935	ret = 1;
				2936	spin_unlock(&global_rsv->lock);
				2937	return ret;
				2938	}
				2939
				2940	int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
				2941	struct btrfs_fs_info *fs_info)
				2942	{
				2943	u64 num_entries =
				2944	atomic_read(&trans->transaction->delayed_refs.num_entries);
				2945	u64 avg_runtime;
				2946	u64 val;
				2947
				2948	smp_mb();
				2949	avg_runtime = fs_info->avg_delayed_ref_runtime;
				2950	val = num_entries * avg_runtime;
				2951	if (val >= NSEC_PER_SEC)
				2952	return 1;
				2953	if (val >= NSEC_PER_SEC / 2)
				2954	return 2;
				2955
				2956	return btrfs_check_space_for_delayed_refs(trans, fs_info);
				2957	}
				2958
				2959	struct async_delayed_refs {
				2960	struct btrfs_root *root;
				2961	u64 transid;
				2962	int count;
				2963	int error;
				2964	int sync;
				2965	struct completion wait;
				2966	struct btrfs_work work;
				2967	};
				2968
				2969	static inline struct async_delayed_refs *
				2970	to_async_delayed_refs(struct btrfs_work *work)
				2971	{
				2972	return container_of(work, struct async_delayed_refs, work);
				2973	}
				2974
				2975	static void delayed_ref_async_start(struct btrfs_work *work)
				2976	{
				2977	struct async_delayed_refs *async = to_async_delayed_refs(work);
				2978	struct btrfs_trans_handle *trans;
				2979	struct btrfs_fs_info *fs_info = async->root->fs_info;
				2980	int ret;
				2981
				2982	/* if the commit is already started, we don't need to wait here */
				2983	if (btrfs_transaction_blocked(fs_info))
				2984	goto done;
				2985
				2986	trans = btrfs_join_transaction(async->root);
				2987	if (IS_ERR(trans)) {
				2988	async->error = PTR_ERR(trans);
				2989	goto done;
				2990	}
				2991
				2992	/*
				2993	* trans->sync means that when we call end_transaction, we won't
				2994	* wait on delayed refs
				2995	*/
				2996	trans->sync = true;
				2997
				2998	/* Don't bother flushing if we got into a different transaction */
				2999	if (trans->transid > async->transid)
				3000	goto end;
				3001
				3002	ret = btrfs_run_delayed_refs(trans, fs_info, async->count);
				3003	if (ret)
				3004	async->error = ret;
				3005	end:
				3006	ret = btrfs_end_transaction(trans);
				3007	if (ret && !async->error)
				3008	async->error = ret;
				3009	done:
				3010	if (async->sync)
				3011	complete(&async->wait);
				3012	else
				3013	kfree(async);
				3014	}
				3015
				3016	int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
				3017	unsigned long count, u64 transid, int wait)
				3018	{
				3019	struct async_delayed_refs *async;
				3020	int ret;
				3021
				3022	async = kmalloc(sizeof(*async), GFP_NOFS);
				3023	if (!async)
				3024	return -ENOMEM;
				3025
				3026	async->root = fs_info->tree_root;
				3027	async->count = count;
				3028	async->error = 0;
				3029	async->transid = transid;
				3030	if (wait)
				3031	async->sync = 1;
				3032	else
				3033	async->sync = 0;
				3034	init_completion(&async->wait);
				3035
				3036	btrfs_init_work(&async->work, btrfs_extent_refs_helper,
				3037	delayed_ref_async_start, NULL, NULL);
				3038
				3039	btrfs_queue_work(fs_info->extent_workers, &async->work);
				3040
				3041	if (wait) {
				3042	wait_for_completion(&async->wait);
				3043	ret = async->error;
				3044	kfree(async);
				3045	return ret;
				3046	}
				3047	return 0;
				3048	}
				3049
				3050	/*
				3051	* this starts processing the delayed reference count updates and
				3052	* extent insertions we have queued up so far. count can be
				3053	* 0, which means to process everything in the tree at the start
				3054	* of the run (but not newly added entries), or it can be some target
				3055	* number you'd like to process.
				3056	*
				3057	* Returns 0 on success or if called with an aborted transaction
				3058	* Returns <0 on error and aborts the transaction
				3059	*/
				3060	int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
				3061	struct btrfs_fs_info *fs_info, unsigned long count)
				3062	{
				3063	struct rb_node *node;
				3064	struct btrfs_delayed_ref_root *delayed_refs;
				3065	struct btrfs_delayed_ref_head *head;
				3066	int ret;
				3067	int run_all = count == (unsigned long)-1;
				3068	bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
				3069
				3070	/* We'll clean this up in btrfs_cleanup_transaction */
				3071	if (trans->aborted)
				3072	return 0;
				3073
				3074	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
				3075	return 0;
				3076
				3077	delayed_refs = &trans->transaction->delayed_refs;
				3078	if (count == 0)
				3079	count = atomic_read(&delayed_refs->num_entries) * 2;
				3080
				3081	again:
				3082	#ifdef SCRAMBLE_DELAYED_REFS
				3083	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
				3084	#endif
				3085	trans->can_flush_pending_bgs = false;
				3086	ret = __btrfs_run_delayed_refs(trans, fs_info, count);
				3087	if (ret < 0) {
				3088	btrfs_abort_transaction(trans, ret);
				3089	return ret;
				3090	}
				3091
				3092	if (run_all) {
				3093	if (!list_empty(&trans->new_bgs))
				3094	btrfs_create_pending_block_groups(trans, fs_info);
				3095
				3096	spin_lock(&delayed_refs->lock);
				3097	node = rb_first(&delayed_refs->href_root);
				3098	if (!node) {
				3099	spin_unlock(&delayed_refs->lock);
				3100	goto out;
				3101	}
				3102
				3103	while (node) {
				3104	head = rb_entry(node, struct btrfs_delayed_ref_head,
				3105	href_node);
				3106	if (btrfs_delayed_ref_is_head(&head->node)) {
				3107	struct btrfs_delayed_ref_node *ref;
				3108
				3109	ref = &head->node;
				3110	refcount_inc(&ref->refs);
				3111
				3112	spin_unlock(&delayed_refs->lock);
				3113	/*
				3114	* Mutex was contended, block until it's
				3115	* released and try again
				3116	*/
				3117	mutex_lock(&head->mutex);
				3118	mutex_unlock(&head->mutex);
				3119
				3120	btrfs_put_delayed_ref(ref);
				3121	cond_resched();
				3122	goto again;
				3123	} else {
				3124	WARN_ON(1);
				3125	}
				3126	node = rb_next(node);
				3127	}
				3128	spin_unlock(&delayed_refs->lock);
				3129	cond_resched();
				3130	goto again;
				3131	}
				3132	out:
				3133	trans->can_flush_pending_bgs = can_flush_pending_bgs;
				3134	return 0;
				3135	}
				3136
				3137	int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
				3138	struct btrfs_fs_info *fs_info,
				3139	u64 bytenr, u64 num_bytes, u64 flags,
				3140	int level, int is_data)
				3141	{
				3142	struct btrfs_delayed_extent_op *extent_op;
				3143	int ret;
				3144
				3145	extent_op = btrfs_alloc_delayed_extent_op();
				3146	if (!extent_op)
				3147	return -ENOMEM;
				3148
				3149	extent_op->flags_to_set = flags;
				3150	extent_op->update_flags = true;
				3151	extent_op->update_key = false;
				3152	extent_op->is_data = is_data ? true : false;
				3153	extent_op->level = level;
				3154
				3155	ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr,
				3156	num_bytes, extent_op);
				3157	if (ret)
				3158	btrfs_free_delayed_extent_op(extent_op);
				3159	return ret;
				3160	}
				3161
				3162	static noinline int check_delayed_ref(struct btrfs_root *root,
				3163	struct btrfs_path *path,
				3164	u64 objectid, u64 offset, u64 bytenr)
				3165	{
				3166	struct btrfs_delayed_ref_head *head;
				3167	struct btrfs_delayed_ref_node *ref;
				3168	struct btrfs_delayed_data_ref *data_ref;
				3169	struct btrfs_delayed_ref_root *delayed_refs;
				3170	struct btrfs_transaction *cur_trans;
				3171	int ret = 0;
				3172
				3173	spin_lock(&root->fs_info->trans_lock);
				3174	cur_trans = root->fs_info->running_transaction;
				3175	if (cur_trans)
				3176	refcount_inc(&cur_trans->use_count);
				3177	spin_unlock(&root->fs_info->trans_lock);
				3178	if (!cur_trans)
				3179	return 0;
				3180
				3181	delayed_refs = &cur_trans->delayed_refs;
				3182	spin_lock(&delayed_refs->lock);
				3183	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
				3184	if (!head) {
				3185	spin_unlock(&delayed_refs->lock);
				3186	btrfs_put_transaction(cur_trans);
				3187	return 0;
				3188	}
				3189
				3190	if (!mutex_trylock(&head->mutex)) {
				3191	refcount_inc(&head->node.refs);
				3192	spin_unlock(&delayed_refs->lock);
				3193
				3194	btrfs_release_path(path);
				3195
				3196	/*
				3197	* Mutex was contended, block until it's released and let
				3198	* caller try again
				3199	*/
				3200	mutex_lock(&head->mutex);
				3201	mutex_unlock(&head->mutex);
				3202	btrfs_put_delayed_ref(&head->node);
				3203	btrfs_put_transaction(cur_trans);
				3204	return -EAGAIN;
				3205	}
				3206	spin_unlock(&delayed_refs->lock);
				3207
				3208	spin_lock(&head->lock);
				3209	list_for_each_entry(ref, &head->ref_list, list) {
				3210	/* If it's a shared ref we know a cross reference exists */
				3211	if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
				3212	ret = 1;
				3213	break;
				3214	}
				3215
				3216	data_ref = btrfs_delayed_node_to_data_ref(ref);
				3217
				3218	/*
				3219	* If our ref doesn't match the one we're currently looking at
				3220	* then we have a cross reference.
				3221	*/
				3222	if (data_ref->root != root->root_key.objectid \|\|
				3223	data_ref->objectid != objectid \|\|
				3224	data_ref->offset != offset) {
				3225	ret = 1;
				3226	break;
				3227	}
				3228	}
				3229	spin_unlock(&head->lock);
				3230	mutex_unlock(&head->mutex);
				3231	btrfs_put_transaction(cur_trans);
				3232	return ret;
				3233	}
				3234
				3235	static noinline int check_committed_ref(struct btrfs_root *root,
				3236	struct btrfs_path *path,
				3237	u64 objectid, u64 offset, u64 bytenr)
				3238	{
				3239	struct btrfs_fs_info *fs_info = root->fs_info;
				3240	struct btrfs_root *extent_root = fs_info->extent_root;
				3241	struct extent_buffer *leaf;
				3242	struct btrfs_extent_data_ref *ref;
				3243	struct btrfs_extent_inline_ref *iref;
				3244	struct btrfs_extent_item *ei;
				3245	struct btrfs_key key;
				3246	u32 item_size;
				3247	int type;
				3248	int ret;
				3249
				3250	key.objectid = bytenr;
				3251	key.offset = (u64)-1;
				3252	key.type = BTRFS_EXTENT_ITEM_KEY;
				3253
				3254	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
				3255	if (ret < 0)
				3256	goto out;
				3257	BUG_ON(ret == 0); /* Corruption */
				3258
				3259	ret = -ENOENT;
				3260	if (path->slots[0] == 0)
				3261	goto out;
				3262
				3263	path->slots[0]--;
				3264	leaf = path->nodes[0];
				3265	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				3266
				3267	if (key.objectid != bytenr \|\| key.type != BTRFS_EXTENT_ITEM_KEY)
				3268	goto out;
				3269
				3270	ret = 1;
				3271	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
				3272	#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
				3273	if (item_size < sizeof(*ei)) {
				3274	WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
				3275	goto out;
				3276	}
				3277	#endif
				3278	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
				3279
				3280	if (item_size != sizeof(*ei) +
				3281	btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
				3282	goto out;
				3283
				3284	if (btrfs_extent_generation(leaf, ei) <=
				3285	btrfs_root_last_snapshot(&root->root_item))
				3286	goto out;
				3287
				3288	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
				3289
				3290	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
				3291	if (type != BTRFS_EXTENT_DATA_REF_KEY)
				3292	goto out;
				3293
				3294	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
				3295	if (btrfs_extent_refs(leaf, ei) !=
				3296	btrfs_extent_data_ref_count(leaf, ref) \|\|
				3297	btrfs_extent_data_ref_root(leaf, ref) !=
				3298	root->root_key.objectid \|\|
				3299	btrfs_extent_data_ref_objectid(leaf, ref) != objectid \|\|
				3300	btrfs_extent_data_ref_offset(leaf, ref) != offset)
				3301	goto out;
				3302
				3303	ret = 0;
				3304	out:
				3305	return ret;
				3306	}
				3307
				3308	int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
				3309	u64 bytenr)
				3310	{
				3311	struct btrfs_path *path;
				3312	int ret;
				3313	int ret2;
				3314
				3315	path = btrfs_alloc_path();
				3316	if (!path)
				3317	return -ENOENT;
				3318
				3319	do {
				3320	ret = check_committed_ref(root, path, objectid,
				3321	offset, bytenr);
				3322	if (ret && ret != -ENOENT)
				3323	goto out;
				3324
				3325	ret2 = check_delayed_ref(root, path, objectid,
				3326	offset, bytenr);
				3327	} while (ret2 == -EAGAIN);
				3328
				3329	if (ret2 && ret2 != -ENOENT) {
				3330	ret = ret2;
				3331	goto out;
				3332	}
				3333
				3334	if (ret != -ENOENT \|\| ret2 != -ENOENT)
				3335	ret = 0;
				3336	out:
				3337	btrfs_free_path(path);
				3338	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
				3339	WARN_ON(ret > 0);
				3340	return ret;
				3341	}
				3342
				3343	static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
				3344	struct btrfs_root *root,
				3345	struct extent_buffer *buf,
				3346	int full_backref, int inc)
				3347	{
				3348	struct btrfs_fs_info *fs_info = root->fs_info;
				3349	u64 bytenr;
				3350	u64 num_bytes;
				3351	u64 parent;
				3352	u64 ref_root;
				3353	u32 nritems;
				3354	struct btrfs_key key;
				3355	struct btrfs_file_extent_item *fi;
				3356	int i;
				3357	int level;
				3358	int ret = 0;
				3359	int (process_func)(struct btrfs_trans_handle ,
				3360	struct btrfs_fs_info *,
				3361	u64, u64, u64, u64, u64, u64);
				3362
				3363
				3364	if (btrfs_is_testing(fs_info))
				3365	return 0;
				3366
				3367	ref_root = btrfs_header_owner(buf);
				3368	nritems = btrfs_header_nritems(buf);
				3369	level = btrfs_header_level(buf);
				3370
				3371	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
				3372	return 0;
				3373
				3374	if (inc)
				3375	process_func = btrfs_inc_extent_ref;
				3376	else
				3377	process_func = btrfs_free_extent;
				3378
				3379	if (full_backref)
				3380	parent = buf->start;
				3381	else
				3382	parent = 0;
				3383
				3384	for (i = 0; i < nritems; i++) {
				3385	if (level == 0) {
				3386	btrfs_item_key_to_cpu(buf, &key, i);
				3387	if (key.type != BTRFS_EXTENT_DATA_KEY)
				3388	continue;
				3389	fi = btrfs_item_ptr(buf, i,
				3390	struct btrfs_file_extent_item);
				3391	if (btrfs_file_extent_type(buf, fi) ==
				3392	BTRFS_FILE_EXTENT_INLINE)
				3393	continue;
				3394	bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
				3395	if (bytenr == 0)
				3396	continue;
				3397
				3398	num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
				3399	key.offset -= btrfs_file_extent_offset(buf, fi);
				3400	ret = process_func(trans, fs_info, bytenr, num_bytes,
				3401	parent, ref_root, key.objectid,
				3402	key.offset);
				3403	if (ret)
				3404	goto fail;
				3405	} else {
				3406	bytenr = btrfs_node_blockptr(buf, i);
				3407	num_bytes = fs_info->nodesize;
				3408	ret = process_func(trans, fs_info, bytenr, num_bytes,
				3409	parent, ref_root, level - 1, 0);
				3410	if (ret)
				3411	goto fail;
				3412	}
				3413	}
				3414	return 0;
				3415	fail:
				3416	return ret;
				3417	}
				3418
				3419	int btrfs_inc_ref(struct btrfs_trans_handle trans, struct btrfs_root root,
				3420	struct extent_buffer *buf, int full_backref)
				3421	{
				3422	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
				3423	}
				3424
				3425	int btrfs_dec_ref(struct btrfs_trans_handle trans, struct btrfs_root root,
				3426	struct extent_buffer *buf, int full_backref)
				3427	{
				3428	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
				3429	}
				3430
				3431	static int write_one_cache_group(struct btrfs_trans_handle *trans,
				3432	struct btrfs_fs_info *fs_info,
				3433	struct btrfs_path *path,
				3434	struct btrfs_block_group_cache *cache)
				3435	{
				3436	int ret;
				3437	struct btrfs_root *extent_root = fs_info->extent_root;
				3438	unsigned long bi;
				3439	struct extent_buffer *leaf;
				3440
				3441	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
				3442	if (ret) {
				3443	if (ret > 0)
				3444	ret = -ENOENT;
				3445	goto fail;
				3446	}
				3447
				3448	leaf = path->nodes[0];
				3449	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
				3450	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
				3451	btrfs_mark_buffer_dirty(leaf);
				3452	fail:
				3453	btrfs_release_path(path);
				3454	return ret;
				3455
				3456	}
				3457
				3458	static struct btrfs_block_group_cache *
				3459	next_block_group(struct btrfs_fs_info *fs_info,
				3460	struct btrfs_block_group_cache *cache)
				3461	{
				3462	struct rb_node *node;
				3463
				3464	spin_lock(&fs_info->block_group_cache_lock);
				3465
				3466	/* If our block group was removed, we need a full search. */
				3467	if (RB_EMPTY_NODE(&cache->cache_node)) {
				3468	const u64 next_bytenr = cache->key.objectid + cache->key.offset;
				3469
				3470	spin_unlock(&fs_info->block_group_cache_lock);
				3471	btrfs_put_block_group(cache);
				3472	cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
				3473	}
				3474	node = rb_next(&cache->cache_node);
				3475	btrfs_put_block_group(cache);
				3476	if (node) {
				3477	cache = rb_entry(node, struct btrfs_block_group_cache,
				3478	cache_node);
				3479	btrfs_get_block_group(cache);
				3480	} else
				3481	cache = NULL;
				3482	spin_unlock(&fs_info->block_group_cache_lock);
				3483	return cache;
				3484	}
				3485
				3486	static int cache_save_setup(struct btrfs_block_group_cache *block_group,
				3487	struct btrfs_trans_handle *trans,
				3488	struct btrfs_path *path)
				3489	{
				3490	struct btrfs_fs_info *fs_info = block_group->fs_info;
				3491	struct btrfs_root *root = fs_info->tree_root;
				3492	struct inode *inode = NULL;
				3493	struct extent_changeset *data_reserved = NULL;
				3494	u64 alloc_hint = 0;
				3495	int dcs = BTRFS_DC_ERROR;
				3496	u64 num_pages = 0;
				3497	int retries = 0;
				3498	int ret = 0;
				3499
				3500	/*
				3501	* If this block group is smaller than 100 megs don't bother caching the
				3502	* block group.
				3503	*/
				3504	if (block_group->key.offset < (100 * SZ_1M)) {
				3505	spin_lock(&block_group->lock);
				3506	block_group->disk_cache_state = BTRFS_DC_WRITTEN;
				3507	spin_unlock(&block_group->lock);
				3508	return 0;
				3509	}
				3510
				3511	if (trans->aborted)
				3512	return 0;
				3513	again:
				3514	inode = lookup_free_space_inode(fs_info, block_group, path);
				3515	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
				3516	ret = PTR_ERR(inode);
				3517	btrfs_release_path(path);
				3518	goto out;
				3519	}
				3520
				3521	if (IS_ERR(inode)) {
				3522	BUG_ON(retries);
				3523	retries++;
				3524
				3525	if (block_group->ro)
				3526	goto out_free;
				3527
				3528	ret = create_free_space_inode(fs_info, trans, block_group,
				3529	path);
				3530	if (ret)
				3531	goto out_free;
				3532	goto again;
				3533	}
				3534
				3535	/*
				3536	* We want to set the generation to 0, that way if anything goes wrong
				3537	* from here on out we know not to trust this cache when we load up next
				3538	* time.
				3539	*/
				3540	BTRFS_I(inode)->generation = 0;
				3541	ret = btrfs_update_inode(trans, root, inode);
				3542	if (ret) {
				3543	/*
				3544	* So theoretically we could recover from this, simply set the
				3545	* super cache generation to 0 so we know to invalidate the
				3546	* cache, but then we'd have to keep track of the block groups
				3547	* that fail this way so we know we _have_ to reset this cache
				3548	* before the next commit or risk reading stale cache. So to
				3549	* limit our exposure to horrible edge cases lets just abort the
				3550	* transaction, this only happens in really bad situations
				3551	* anyway.
				3552	*/
				3553	btrfs_abort_transaction(trans, ret);
				3554	goto out_put;
				3555	}
				3556	WARN_ON(ret);
				3557
				3558	/* We've already setup this transaction, go ahead and exit */
				3559	if (block_group->cache_generation == trans->transid &&
				3560	i_size_read(inode)) {
				3561	dcs = BTRFS_DC_SETUP;
				3562	goto out_put;
				3563	}
				3564
				3565	if (i_size_read(inode) > 0) {
				3566	ret = btrfs_check_trunc_cache_free_space(fs_info,
				3567	&fs_info->global_block_rsv);
				3568	if (ret)
				3569	goto out_put;
				3570
				3571	ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
				3572	if (ret)
				3573	goto out_put;
				3574	}
				3575
				3576	spin_lock(&block_group->lock);
				3577	if (block_group->cached != BTRFS_CACHE_FINISHED \|\|
				3578	!btrfs_test_opt(fs_info, SPACE_CACHE)) {
				3579	/*
				3580	* don't bother trying to write stuff out _if_
				3581	* a) we're not cached,
				3582	* b) we're with nospace_cache mount option,
				3583	* c) we're with v2 space_cache (FREE_SPACE_TREE).
				3584	*/
				3585	dcs = BTRFS_DC_WRITTEN;
				3586	spin_unlock(&block_group->lock);
				3587	goto out_put;
				3588	}
				3589	spin_unlock(&block_group->lock);
				3590
				3591	/*
				3592	* We hit an ENOSPC when setting up the cache in this transaction, just
				3593	* skip doing the setup, we've already cleared the cache so we're safe.
				3594	*/
				3595	if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
				3596	ret = -ENOSPC;
				3597	goto out_put;
				3598	}
				3599
				3600	/*
				3601	* Try to preallocate enough space based on how big the block group is.
				3602	* Keep in mind this has to include any pinned space which could end up
				3603	* taking up quite a bit since it's not folded into the other space
				3604	* cache.
				3605	*/
				3606	num_pages = div_u64(block_group->key.offset, SZ_256M);
				3607	if (!num_pages)
				3608	num_pages = 1;
				3609
				3610	num_pages *= 16;
				3611	num_pages *= PAGE_SIZE;
				3612
				3613	ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
				3614	if (ret)
				3615	goto out_put;
				3616
				3617	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
				3618	num_pages, num_pages,
				3619	&alloc_hint);
				3620	/*
				3621	* Our cache requires contiguous chunks so that we don't modify a bunch
				3622	* of metadata or split extents when writing the cache out, which means
				3623	* we can enospc if we are heavily fragmented in addition to just normal
				3624	* out of space conditions. So if we hit this just skip setting up any
				3625	* other block groups for this transaction, maybe we'll unpin enough
				3626	* space the next time around.
				3627	*/
				3628	if (!ret)
				3629	dcs = BTRFS_DC_SETUP;
				3630	else if (ret == -ENOSPC)
				3631	set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
				3632
				3633	out_put:
				3634	iput(inode);
				3635	out_free:
				3636	btrfs_release_path(path);
				3637	out:
				3638	spin_lock(&block_group->lock);
				3639	if (!ret && dcs == BTRFS_DC_SETUP)
				3640	block_group->cache_generation = trans->transid;
				3641	block_group->disk_cache_state = dcs;
				3642	spin_unlock(&block_group->lock);
				3643
				3644	extent_changeset_free(data_reserved);
				3645	return ret;
				3646	}
				3647
				3648	int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
				3649	struct btrfs_fs_info *fs_info)
				3650	{
				3651	struct btrfs_block_group_cache cache, tmp;
				3652	struct btrfs_transaction *cur_trans = trans->transaction;
				3653	struct btrfs_path *path;
				3654
				3655	if (list_empty(&cur_trans->dirty_bgs) \|\|
				3656	!btrfs_test_opt(fs_info, SPACE_CACHE))
				3657	return 0;
				3658
				3659	path = btrfs_alloc_path();
				3660	if (!path)
				3661	return -ENOMEM;
				3662
				3663	/* Could add new block groups, use _safe just in case */
				3664	list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
				3665	dirty_list) {
				3666	if (cache->disk_cache_state == BTRFS_DC_CLEAR)
				3667	cache_save_setup(cache, trans, path);
				3668	}
				3669
				3670	btrfs_free_path(path);
				3671	return 0;
				3672	}
				3673
				3674	/*
				3675	* transaction commit does final block group cache writeback during a
				3676	* critical section where nothing is allowed to change the FS. This is
				3677	* required in order for the cache to actually match the block group,
				3678	* but can introduce a lot of latency into the commit.
				3679	*
				3680	* So, btrfs_start_dirty_block_groups is here to kick off block group
				3681	* cache IO. There's a chance we'll have to redo some of it if the
				3682	* block group changes again during the commit, but it greatly reduces
				3683	* the commit latency by getting rid of the easy block groups while
				3684	* we're still allowing others to join the commit.
				3685	*/
				3686	int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
				3687	struct btrfs_fs_info *fs_info)
				3688	{
				3689	struct btrfs_block_group_cache *cache;
				3690	struct btrfs_transaction *cur_trans = trans->transaction;
				3691	int ret = 0;
				3692	int should_put;
				3693	struct btrfs_path *path = NULL;
				3694	LIST_HEAD(dirty);
				3695	struct list_head *io = &cur_trans->io_bgs;
				3696	int num_started = 0;
				3697	int loops = 0;
				3698
				3699	spin_lock(&cur_trans->dirty_bgs_lock);
				3700	if (list_empty(&cur_trans->dirty_bgs)) {
				3701	spin_unlock(&cur_trans->dirty_bgs_lock);
				3702	return 0;
				3703	}
				3704	list_splice_init(&cur_trans->dirty_bgs, &dirty);
				3705	spin_unlock(&cur_trans->dirty_bgs_lock);
				3706
				3707	again:
				3708	/*
				3709	* make sure all the block groups on our dirty list actually
				3710	* exist
				3711	*/
				3712	btrfs_create_pending_block_groups(trans, fs_info);
				3713
				3714	if (!path) {
				3715	path = btrfs_alloc_path();
				3716	if (!path)
				3717	return -ENOMEM;
				3718	}
				3719
				3720	/*
				3721	* cache_write_mutex is here only to save us from balance or automatic
				3722	* removal of empty block groups deleting this block group while we are
				3723	* writing out the cache
				3724	*/
				3725	mutex_lock(&trans->transaction->cache_write_mutex);
				3726	while (!list_empty(&dirty)) {
				3727	cache = list_first_entry(&dirty,
				3728	struct btrfs_block_group_cache,
				3729	dirty_list);
				3730	/*
				3731	* this can happen if something re-dirties a block
				3732	* group that is already under IO. Just wait for it to
				3733	* finish and then do it all again
				3734	*/
				3735	if (!list_empty(&cache->io_list)) {
				3736	list_del_init(&cache->io_list);
				3737	btrfs_wait_cache_io(trans, cache, path);
				3738	btrfs_put_block_group(cache);
				3739	}
				3740
				3741
				3742	/*
				3743	* btrfs_wait_cache_io uses the cache->dirty_list to decide
				3744	* if it should update the cache_state. Don't delete
				3745	* until after we wait.
				3746	*
				3747	* Since we're not running in the commit critical section
				3748	* we need the dirty_bgs_lock to protect from update_block_group
				3749	*/
				3750	spin_lock(&cur_trans->dirty_bgs_lock);
				3751	list_del_init(&cache->dirty_list);
				3752	spin_unlock(&cur_trans->dirty_bgs_lock);
				3753
				3754	should_put = 1;
				3755
				3756	cache_save_setup(cache, trans, path);
				3757
				3758	if (cache->disk_cache_state == BTRFS_DC_SETUP) {
				3759	cache->io_ctl.inode = NULL;
				3760	ret = btrfs_write_out_cache(fs_info, trans,
				3761	cache, path);
				3762	if (ret == 0 && cache->io_ctl.inode) {
				3763	num_started++;
				3764	should_put = 0;
				3765
				3766	/*
				3767	* the cache_write_mutex is protecting
				3768	* the io_list
				3769	*/
				3770	list_add_tail(&cache->io_list, io);
				3771	} else {
				3772	/*
				3773	* if we failed to write the cache, the
				3774	* generation will be bad and life goes on
				3775	*/
				3776	ret = 0;
				3777	}
				3778	}
				3779	if (!ret) {
				3780	ret = write_one_cache_group(trans, fs_info,
				3781	path, cache);
				3782	/*
				3783	* Our block group might still be attached to the list
				3784	* of new block groups in the transaction handle of some
				3785	* other task (struct btrfs_trans_handle->new_bgs). This
				3786	* means its block group item isn't yet in the extent
				3787	* tree. If this happens ignore the error, as we will
				3788	* try again later in the critical section of the
				3789	* transaction commit.
				3790	*/
				3791	if (ret == -ENOENT) {
				3792	ret = 0;
				3793	spin_lock(&cur_trans->dirty_bgs_lock);
				3794	if (list_empty(&cache->dirty_list)) {
				3795	list_add_tail(&cache->dirty_list,
				3796	&cur_trans->dirty_bgs);
				3797	btrfs_get_block_group(cache);
				3798	}
				3799	spin_unlock(&cur_trans->dirty_bgs_lock);
				3800	} else if (ret) {
				3801	btrfs_abort_transaction(trans, ret);
				3802	}
				3803	}
				3804
				3805	/* if its not on the io list, we need to put the block group */
				3806	if (should_put)
				3807	btrfs_put_block_group(cache);
				3808
				3809	if (ret)
				3810	break;
				3811
				3812	/*
				3813	* Avoid blocking other tasks for too long. It might even save
				3814	* us from writing caches for block groups that are going to be
				3815	* removed.
				3816	*/
				3817	mutex_unlock(&trans->transaction->cache_write_mutex);
				3818	mutex_lock(&trans->transaction->cache_write_mutex);
				3819	}
				3820	mutex_unlock(&trans->transaction->cache_write_mutex);
				3821
				3822	/*
				3823	* go through delayed refs for all the stuff we've just kicked off
				3824	* and then loop back (just once)
				3825	*/
				3826	ret = btrfs_run_delayed_refs(trans, fs_info, 0);
				3827	if (!ret && loops == 0) {
				3828	loops++;
				3829	spin_lock(&cur_trans->dirty_bgs_lock);
				3830	list_splice_init(&cur_trans->dirty_bgs, &dirty);
				3831	/*
				3832	* dirty_bgs_lock protects us from concurrent block group
				3833	* deletes too (not just cache_write_mutex).
				3834	*/
				3835	if (!list_empty(&dirty)) {
				3836	spin_unlock(&cur_trans->dirty_bgs_lock);
				3837	goto again;
				3838	}
				3839	spin_unlock(&cur_trans->dirty_bgs_lock);
				3840	} else if (ret < 0) {
				3841	btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
				3842	}
				3843
				3844	btrfs_free_path(path);
				3845	return ret;
				3846	}
				3847
				/*
				 * Write out every block group queued on the transaction's dirty_bgs list.
				 *
				 * For each dirty block group this waits for cache IO already queued on
				 * it, removes it from the dirty list, runs cache_save_setup(), runs the
				 * delayed refs, starts the free space cache write when the state is
				 * BTRFS_DC_SETUP, and then calls write_one_cache_group() for it. Block
				 * groups whose cache write was started are queued on cur_trans->io_bgs
				 * and are waited for before returning.
				 *
				 * Once ret holds an error, the remaining iterations skip the delayed
				 * refs, the cache write and write_one_cache_group(), but still dequeue
				 * each block group and drop its reference.
				 *
				 * Returns 0 on success, -ENOMEM if the path cannot be allocated, or the
				 * error left in ret by btrfs_run_delayed_refs() or
				 * write_one_cache_group(); an error from write_one_cache_group() also
				 * aborts the transaction.
				 */
				3848	int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
				3849	struct btrfs_fs_info *fs_info)
				3850	{
				3851	struct btrfs_block_group_cache *cache;
				3852	struct btrfs_transaction *cur_trans = trans->transaction;
				3853	int ret = 0;
				3854	int should_put;
				3855	struct btrfs_path *path;
				3856	struct list_head *io = &cur_trans->io_bgs;
				3857	int num_started = 0;
				3858
				3859	path = btrfs_alloc_path();
				3860	if (!path)
				3861	return -ENOMEM;
				3862
				3863	/*
				3864	* Even though we are in the critical section of the transaction commit,
				3865	* we can still have concurrent tasks adding elements to this
				3866	* transaction's list of dirty block groups. These tasks correspond to
				3867	* endio free space workers started when writeback finishes for a
				3868	* space cache, which run inode.c:btrfs_finish_ordered_io(), and can
				3869	* allocate new block groups as a result of COWing nodes of the root
				3870	* tree when updating the free space inode. The writeback for the space
				3871	* caches is triggered by an earlier call to
				3872	* btrfs_start_dirty_block_groups() and iterations of the following
				3873	* loop.
				3874	* Also we want to do the cache_save_setup first and then run the
				3875	* delayed refs to make sure we have the best chance at doing this all
				3876	* in one shot.
				3877	*/
				3878	spin_lock(&cur_trans->dirty_bgs_lock);
				3879	while (!list_empty(&cur_trans->dirty_bgs)) {
				3880	cache = list_first_entry(&cur_trans->dirty_bgs,
				3881	struct btrfs_block_group_cache,
				3882	dirty_list);
				3883
				3884	/*
				3885	* this can happen if cache_save_setup re-dirties a block
				3886	* group that is already under IO. Just wait for it to
				3887	* finish and then do it all again
				3888	*/
				3889	if (!list_empty(&cache->io_list)) {
				/* dirty_bgs_lock is dropped around the wait and re-taken after it */
				3890	spin_unlock(&cur_trans->dirty_bgs_lock);
				3891	list_del_init(&cache->io_list);
				3892	btrfs_wait_cache_io(trans, cache, path);
				3893	btrfs_put_block_group(cache);
				3894	spin_lock(&cur_trans->dirty_bgs_lock);
				3895	}
				3896
				3897	/*
				3898	* don't remove from the dirty list until after we've waited
				3899	* on any pending IO
				3900	*/
				3901	list_del_init(&cache->dirty_list);
				3902	spin_unlock(&cur_trans->dirty_bgs_lock);
				3903	should_put = 1;
				3904
				3905	cache_save_setup(cache, trans, path);
				3906
				/* ret is sticky: after a failure the delayed refs are not run again */
				3907	if (!ret)
				3908	ret = btrfs_run_delayed_refs(trans, fs_info,
				3909	(unsigned long) -1);
				3910
				3911	if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
				/*
				 * Cleared before the call so that a non-NULL value afterwards
				 * marks a block group that must go on the io list (presumably
				 * set by btrfs_write_out_cache() when it starts IO - confirm).
				 */
				3912	cache->io_ctl.inode = NULL;
				3913	ret = btrfs_write_out_cache(fs_info, trans,
				3914	cache, path);
				3915	if (ret == 0 && cache->io_ctl.inode) {
				3916	num_started++;
				3917	should_put = 0;
				3918	list_add_tail(&cache->io_list, io);
				3919	} else {
				3920	/*
				3921	* if we failed to write the cache, the
				3922	* generation will be bad and life goes on
				3923	*/
				3924	ret = 0;
				3925	}
				3926	}
				3927	if (!ret) {
				3928	ret = write_one_cache_group(trans, fs_info,
				3929	path, cache);
				3930	/*
				3931	* One of the free space endio workers might have
				3932	* created a new block group while updating a free space
				3933	* cache's inode (at inode.c:btrfs_finish_ordered_io())
				3934	* and hasn't released its transaction handle yet, in
				3935	* which case the new block group is still attached to
				3936	* its transaction handle and its creation has not
				3937	* finished yet (no block group item in the extent tree
				3938	* yet, etc). If this is the case, wait for all free
				3939	* space endio workers to finish and retry. This is a
				3940	* very rare case so no need for a more efficient and
				3941	* complex approach.
				3942	*/
				3943	if (ret == -ENOENT) {
				3944	wait_event(cur_trans->writer_wait,
				3945	atomic_read(&cur_trans->num_writers) == 1);
				3946	ret = write_one_cache_group(trans, fs_info,
				3947	path, cache);
				3948	}
				3949	if (ret)
				3950	btrfs_abort_transaction(trans, ret);
				3951	}
				3952
				3953	/* if its not on the io list, we need to put the block group */
				3954	if (should_put)
				3955	btrfs_put_block_group(cache);
				3956	spin_lock(&cur_trans->dirty_bgs_lock);
				3957	}
				3958	spin_unlock(&cur_trans->dirty_bgs_lock);
				3959
				/*
				 * Wait for every cache write still queued on the io list and drop
				 * the block group reference that was kept for each entry.
				 */
				3960	while (!list_empty(io)) {
				3961	cache = list_first_entry(io, struct btrfs_block_group_cache,
				3962	io_list);
				3963	list_del_init(&cache->io_list);
				3964	btrfs_wait_cache_io(trans, cache, path);
				3965	btrfs_put_block_group(cache);
				3966	}
				3967
				3968	btrfs_free_path(path);
				3969	return ret;
				3970	}
				3971
				3972	int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
				3973	{
				3974	struct btrfs_block_group_cache *block_group;
				3975	int readonly = 0;
				3976
				3977	block_group = btrfs_lookup_block_group(fs_info, bytenr);
				3978	if (!block_group \|\| block_group->ro)
				3979	readonly = 1;
				3980	if (block_group)
				3981	btrfs_put_block_group(block_group);
				3982	return readonly;
				3983	}
				3984
				3985	bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
				3986	{
				3987	struct btrfs_block_group_cache *bg;
				3988	bool ret = true;
				3989
				3990	bg = btrfs_lookup_block_group(fs_info, bytenr);
				3991	if (!bg)
				3992	return false;
				3993
				3994	spin_lock(&bg->lock);
				3995	if (bg->ro)
				3996	ret = false;
				3997	else
				3998	atomic_inc(&bg->nocow_writers);
				3999	spin_unlock(&bg->lock);
				4000
				4001	/* no put on block group, done by btrfs_dec_nocow_writers */
				4002	if (!ret)
				4003	btrfs_put_block_group(bg);
				4004
				4005	return ret;
				4006
				4007	}
				4008
				4009	void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
				4010	{
				4011	struct btrfs_block_group_cache *bg;
				4012
				4013	bg = btrfs_lookup_block_group(fs_info, bytenr);
				4014	ASSERT(bg);
				4015	if (atomic_dec_and_test(&bg->nocow_writers))
				4016	wake_up_atomic_t(&bg->nocow_writers);
				4017	/*
				4018	* Once for our lookup and once for the lookup done by a previous call
				4019	* to btrfs_inc_nocow_writers()
				4020	*/
				4021	btrfs_put_block_group(bg);
				4022	btrfs_put_block_group(bg);
				4023	}
				4024
				4025	static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a)
				4026	{
				4027	schedule();
				4028	return 0;
				4029	}
				4030
				4031	void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
				4032	{
				4033	wait_on_atomic_t(&bg->nocow_writers,
				4034	btrfs_wait_nocow_writers_atomic_t,
				4035	TASK_UNINTERRUPTIBLE);
				4036	}
				4037
				4038	static const char *alloc_name(u64 flags)
				4039	{
				4040	switch (flags) {
				4041	case BTRFS_BLOCK_GROUP_METADATA\|BTRFS_BLOCK_GROUP_DATA:
				4042	return "mixed";
				4043	case BTRFS_BLOCK_GROUP_METADATA:
				4044	return "metadata";
				4045	case BTRFS_BLOCK_GROUP_DATA:
				4046	return "data";
				4047	case BTRFS_BLOCK_GROUP_SYSTEM:
				4048	return "system";
				4049	default:
				4050	WARN_ON(1);
				4051	return "invalid-combination";
				4052	};
				4053	}
				4054
				4055	static int create_space_info(struct btrfs_fs_info *info, u64 flags,
				4056	struct btrfs_space_info **new)
				4057	{
				4058
				4059	struct btrfs_space_info *space_info;
				4060	int i;
				4061	int ret;
				4062
				4063	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
				4064	if (!space_info)
				4065	return -ENOMEM;
				4066
				4067	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
				4068	GFP_KERNEL);
				4069	if (ret) {
				4070	kfree(space_info);
				4071	return ret;
				4072	}
				4073
				4074	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
				4075	INIT_LIST_HEAD(&space_info->block_groups[i]);
				4076	init_rwsem(&space_info->groups_sem);
				4077	spin_lock_init(&space_info->lock);
				4078	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
				4079	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
				4080	init_waitqueue_head(&space_info->wait);
				4081	INIT_LIST_HEAD(&space_info->ro_bgs);
				4082	INIT_LIST_HEAD(&space_info->tickets);
				4083	INIT_LIST_HEAD(&space_info->priority_tickets);
				4084
				4085	ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
				4086	info->space_info_kobj, "%s",
				4087	alloc_name(space_info->flags));
				4088	if (ret) {
				4089	kobject_put(&space_info->kobj);
				4090	return ret;
				4091	}
				4092
				4093	*new = space_info;
				4094	list_add_rcu(&space_info->list, &info->space_info);
				4095	if (flags & BTRFS_BLOCK_GROUP_DATA)
				4096	info->data_sinfo = space_info;
				4097
				4098	return ret;
				4099	}
				4100
				4101	static void update_space_info(struct btrfs_fs_info *info, u64 flags,
				4102	u64 total_bytes, u64 bytes_used,
				4103	u64 bytes_readonly,
				4104	struct btrfs_space_info **space_info)
				4105	{
				4106	struct btrfs_space_info *found;
				4107	int factor;
				4108
				4109	if (flags & (BTRFS_BLOCK_GROUP_DUP \| BTRFS_BLOCK_GROUP_RAID1 \|
				4110	BTRFS_BLOCK_GROUP_RAID10))
				4111	factor = 2;
				4112	else
				4113	factor = 1;
				4114
				4115	found = __find_space_info(info, flags);
				4116	ASSERT(found);
				4117	spin_lock(&found->lock);
				4118	found->total_bytes += total_bytes;
				4119	found->disk_total += total_bytes * factor;
				4120	found->bytes_used += bytes_used;
				4121	found->disk_used += bytes_used * factor;
				4122	found->bytes_readonly += bytes_readonly;
				4123	if (total_bytes > 0)
				4124	found->full = 0;
				4125	space_info_add_new_bytes(info, found, total_bytes -
				4126	bytes_used - bytes_readonly);
				4127	spin_unlock(&found->lock);
				4128	*space_info = found;
				4129	}
				4130
				4131	static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
				4132	{
				4133	u64 extra_flags = chunk_to_extended(flags) &
				4134	BTRFS_EXTENDED_PROFILE_MASK;
				4135
				4136	write_seqlock(&fs_info->profiles_lock);
				4137	if (flags & BTRFS_BLOCK_GROUP_DATA)
				4138	fs_info->avail_data_alloc_bits \|= extra_flags;
				4139	if (flags & BTRFS_BLOCK_GROUP_METADATA)
				4140	fs_info->avail_metadata_alloc_bits \|= extra_flags;
				4141	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
				4142	fs_info->avail_system_alloc_bits \|= extra_flags;
				4143	write_sequnlock(&fs_info->profiles_lock);
				4144	}
				4145
				4146	/*
				4147	* returns target flags in extended format or 0 if restripe for this
				4148	* chunk_type is not in progress
				4149	*
				4150	* should be called with either volume_mutex or balance_lock held
				4151	*/
				4152	static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
				4153	{
				4154	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
				4155	u64 target = 0;
				4156
				4157	if (!bctl)
				4158	return 0;
				4159
				4160	if (flags & BTRFS_BLOCK_GROUP_DATA &&
				4161	bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
				4162	target = BTRFS_BLOCK_GROUP_DATA \| bctl->data.target;
				4163	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
				4164	bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
				4165	target = BTRFS_BLOCK_GROUP_SYSTEM \| bctl->sys.target;
				4166	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
				4167	bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
				4168	target = BTRFS_BLOCK_GROUP_METADATA \| bctl->meta.target;
				4169	}
				4170
				4171	return target;
				4172	}
				4173
				4174	/*
				4175	* @flags: available profiles in extended format (see ctree.h)
				4176	*
				4177	* Returns reduced profile in chunk format. If profile changing is in
				4178	* progress (either running or paused) picks the target profile (if it's
				4179	* already available), otherwise falls back to plain reducing.
				4180	*/
				4181	static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
				4182	{
				4183	u64 num_devices = fs_info->fs_devices->rw_devices;
				4184	u64 target;
				4185	u64 raid_type;
				4186	u64 allowed = 0;
				4187
				4188	/*
				4189	* see if restripe for this chunk_type is in progress, if so
				4190	* try to reduce to the target profile
				4191	*/
				4192	spin_lock(&fs_info->balance_lock);
				4193	target = get_restripe_target(fs_info, flags);
				4194	if (target) {
				4195	/* pick target profile only if it's already available */
				4196	if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
				4197	spin_unlock(&fs_info->balance_lock);
				4198	return extended_to_chunk(target);
				4199	}
				4200	}
				4201	spin_unlock(&fs_info->balance_lock);
				4202
				4203	/* First, mask out the RAID levels which aren't possible */
				4204	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
				4205	if (num_devices >= btrfs_raid_array[raid_type].devs_min)
				4206	allowed \|= btrfs_raid_group[raid_type];
				4207	}
				4208	allowed &= flags;
				4209
				4210	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
				4211	allowed = BTRFS_BLOCK_GROUP_RAID6;
				4212	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
				4213	allowed = BTRFS_BLOCK_GROUP_RAID5;
				4214	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
				4215	allowed = BTRFS_BLOCK_GROUP_RAID10;
				4216	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
				4217	allowed = BTRFS_BLOCK_GROUP_RAID1;
				4218	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
				4219	allowed = BTRFS_BLOCK_GROUP_RAID0;
				4220
				4221	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
				4222
				4223	return extended_to_chunk(flags \| allowed);
				4224	}
				4225
				4226	static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
				4227	{
				4228	unsigned seq;
				4229	u64 flags;
				4230
				4231	do {
				4232	flags = orig_flags;
				4233	seq = read_seqbegin(&fs_info->profiles_lock);
				4234
				4235	if (flags & BTRFS_BLOCK_GROUP_DATA)
				4236	flags \|= fs_info->avail_data_alloc_bits;
				4237	else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
				4238	flags \|= fs_info->avail_system_alloc_bits;
				4239	else if (flags & BTRFS_BLOCK_GROUP_METADATA)
				4240	flags \|= fs_info->avail_metadata_alloc_bits;
				4241	} while (read_seqretry(&fs_info->profiles_lock, seq));
				4242
				4243	return btrfs_reduce_alloc_profile(fs_info, flags);
				4244	}
				4245
				4246	static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
				4247	{
				4248	struct btrfs_fs_info *fs_info = root->fs_info;
				4249	u64 flags;
				4250	u64 ret;
				4251
				4252	if (data)
				4253	flags = BTRFS_BLOCK_GROUP_DATA;
				4254	else if (root == fs_info->chunk_root)
				4255	flags = BTRFS_BLOCK_GROUP_SYSTEM;
				4256	else
				4257	flags = BTRFS_BLOCK_GROUP_METADATA;
				4258
				4259	ret = get_alloc_profile(fs_info, flags);
				4260	return ret;
				4261	}
				4262
				4263	u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
				4264	{
				4265	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
				4266	}
				4267
				4268	u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
				4269	{
				4270	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
				4271	}
				4272
				4273	u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
				4274	{
				4275	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
				4276	}
				4277
				4278	static u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
				4279	bool may_use_included)
				4280	{
				4281	ASSERT(s_info);
				4282	return s_info->bytes_used + s_info->bytes_reserved +
				4283	s_info->bytes_pinned + s_info->bytes_readonly +
				4284	(may_use_included ? s_info->bytes_may_use : 0);
				4285	}
				4286
				/*
				 * Reserve @bytes (rounded up to the sector size) of data space for
				 * @inode by adding them to data_sinfo->bytes_may_use.
				 *
				 * When the data space_info has no room for the request the function
				 * tries, in this order and retrying from "again" after each step:
				 *  - allocating a new data chunk, as long as the space_info is not
				 *    marked full;
				 *  - flushing delalloc, waiting for ordered extents and committing the
				 *    transaction (first commit attempt);
				 *  - committing the transaction once more, but only if enough bytes are
				 *    pinned or BTRFS_TRANS_HAVE_FREE_BGS is set.
				 * Free space inodes never take the commit path (need_commit is 0).
				 *
				 * Returns 0 on success, -ENOSPC when no space could be found, or the
				 * error from btrfs_join_transaction(), do_chunk_alloc() (other than
				 * -ENOSPC) or btrfs_commit_transaction().
				 */
				4287	int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
				4288	{
				4289	struct btrfs_root *root = inode->root;
				4290	struct btrfs_fs_info *fs_info = root->fs_info;
				4291	struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
				4292	u64 used;
				4293	int ret = 0;
				/* commit attempts left; delalloc is flushed only on the first one */
				4294	int need_commit = 2;
				4295	int have_pinned_space;
				4296
				4297	/* make sure bytes are sectorsize aligned */
				4298	bytes = ALIGN(bytes, fs_info->sectorsize);
				4299
				4300	if (btrfs_is_free_space_inode(inode)) {
				4301	need_commit = 0;
				4302	ASSERT(current->journal_info);
				4303	}
				4304
				4305	again:
				4306	/* make sure we have enough space to handle the data first */
				4307	spin_lock(&data_sinfo->lock);
				4308	used = btrfs_space_info_used(data_sinfo, true);
				4309
				4310	if (used + bytes > data_sinfo->total_bytes) {
				4311	struct btrfs_trans_handle *trans;
				4312
				4313	/*
				4314	* if we don't have enough free bytes in this space then we need
				4315	* to alloc a new chunk.
				4316	*/
				4317	if (!data_sinfo->full) {
				4318	u64 alloc_target;
				4319
				4320	data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
				4321	spin_unlock(&data_sinfo->lock);
				4322
				4323	alloc_target = btrfs_data_alloc_profile(fs_info);
				4324	/*
				4325	* It is ugly that we don't call nolock join
				4326	* transaction for the free space inode case here.
				4327	* But it is safe because we only do the data space
				4328	* reservation for the free space cache in the
				4329	* transaction context, the common join transaction
				4330	* just increase the counter of the current transaction
				4331	* handler, doesn't try to acquire the trans_lock of
				4332	* the fs.
				4333	*/
				4334	trans = btrfs_join_transaction(root);
				4335	if (IS_ERR(trans))
				4336	return PTR_ERR(trans);
				4337
				4338	ret = do_chunk_alloc(trans, fs_info, alloc_target,
				4339	CHUNK_ALLOC_NO_FORCE);
				4340	btrfs_end_transaction(trans);
				4341	if (ret < 0) {
				4342	if (ret != -ENOSPC)
				4343	return ret;
				4344	else {
				/*
				 * A non-negative value satisfies the
				 * "have_pinned_space >= 0" test at commit_trans.
				 */
				4345	have_pinned_space = 1;
				4346	goto commit_trans;
				4347	}
				4348	}
				4349
				4350	goto again;
				4351	}
				4352
				4353	/*
				4354	* If we don't have enough pinned space to deal with this
				4355	* allocation, and no removed chunk in current transaction,
				4356	* don't bother committing the transaction.
				4357	*/
				4358	have_pinned_space = percpu_counter_compare(
				4359	&data_sinfo->total_bytes_pinned,
				4360	used + bytes - data_sinfo->total_bytes);
				4361	spin_unlock(&data_sinfo->lock);
				4362
				4363	/* commit the current transaction and try again */
				4364	commit_trans:
				4365	if (need_commit &&
				4366	!atomic_read(&fs_info->open_ioctl_trans)) {
				4367	need_commit--;
				4368
				4369	if (need_commit > 0) {
				4370	btrfs_start_delalloc_roots(fs_info, 0, -1);
				4371	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
				4372	(u64)-1);
				4373	}
				4374
				4375	trans = btrfs_join_transaction(root);
				4376	if (IS_ERR(trans))
				4377	return PTR_ERR(trans);
				4378	if (have_pinned_space >= 0 \|\|
				4379	test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
				4380	&trans->transaction->flags) \|\|
				4381	need_commit > 0) {
				4382	ret = btrfs_commit_transaction(trans);
				4383	if (ret)
				4384	return ret;
				4385	/*
				4386	* The cleaner kthread might still be doing iput
				4387	* operations. Wait for it to finish so that
				4388	* more space is released.
				4389	*/
				4390	mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
				4391	mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
				4392	goto again;
				4393	} else {
				4394	btrfs_end_transaction(trans);
				4395	}
				4396	}
				4397
				4398	trace_btrfs_space_reservation(fs_info,
				4399	"space_info:enospc",
				4400	data_sinfo->flags, bytes, 1);
				4401	return -ENOSPC;
				4402	}
				/* enough room: account the reservation while still holding the lock */
				4403	data_sinfo->bytes_may_use += bytes;
				4404	trace_btrfs_space_reservation(fs_info, "space_info",
				4405	data_sinfo->flags, bytes, 1);
				4406	spin_unlock(&data_sinfo->lock);
				4407
				4408	return 0;
				4409	}
				4410
				4411	int btrfs_check_data_free_space(struct inode *inode,
				4412	struct extent_changeset **reserved, u64 start, u64 len)
				4413	{
				4414	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				4415	int ret;
				4416
				4417	/* align the range */
				4418	len = round_up(start + len, fs_info->sectorsize) -
				4419	round_down(start, fs_info->sectorsize);
				4420	start = round_down(start, fs_info->sectorsize);
				4421
				4422	ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
				4423	if (ret < 0)
				4424	return ret;
				4425
				4426	/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
				4427	ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
				4428	if (ret < 0)
				4429	btrfs_free_reserved_data_space_noquota(inode, start, len);
				4430	else
				4431	ret = 0;
				4432	return ret;
				4433	}
				4434
				4435	/*
				4436	* Called if we need to clear a data reservation for this inode
				4437	* Normally in a error case.
				4438	*
				4439	* This one will NOT use accurate qgroup reserved space API, just for case
				4440	* which we can't sleep and is sure it won't affect qgroup reserved space.
				4441	* Like clear_bit_hook().
				4442	*/
				4443	void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
				4444	u64 len)
				4445	{
				4446	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				4447	struct btrfs_space_info *data_sinfo;
				4448
				4449	/* Make sure the range is aligned to sectorsize */
				4450	len = round_up(start + len, fs_info->sectorsize) -
				4451	round_down(start, fs_info->sectorsize);
				4452	start = round_down(start, fs_info->sectorsize);
				4453
				4454	data_sinfo = fs_info->data_sinfo;
				4455	spin_lock(&data_sinfo->lock);
				4456	if (WARN_ON(data_sinfo->bytes_may_use < len))
				4457	data_sinfo->bytes_may_use = 0;
				4458	else
				4459	data_sinfo->bytes_may_use -= len;
				4460	trace_btrfs_space_reservation(fs_info, "space_info",
				4461	data_sinfo->flags, len, 0);
				4462	spin_unlock(&data_sinfo->lock);
				4463	}
				4464
				4465	/*
				4466	* Called if we need to clear a data reservation for this inode
				4467	* Normally in a error case.
				4468	*
				4469	* This one will handle the per-inode data rsv map for accurate reserved
				4470	* space framework.
				4471	*/
				4472	void btrfs_free_reserved_data_space(struct inode *inode,
				4473	struct extent_changeset *reserved, u64 start, u64 len)
				4474	{
				4475	struct btrfs_root *root = BTRFS_I(inode)->root;
				4476
				4477	/* Make sure the range is aligned to sectorsize */
				4478	len = round_up(start + len, root->fs_info->sectorsize) -
				4479	round_down(start, root->fs_info->sectorsize);
				4480	start = round_down(start, root->fs_info->sectorsize);
				4481
				4482	btrfs_free_reserved_data_space_noquota(inode, start, len);
				4483	btrfs_qgroup_free_data(inode, reserved, start, len);
				4484	}
				4485
				4486	static void force_metadata_allocation(struct btrfs_fs_info *info)
				4487	{
				4488	struct list_head *head = &info->space_info;
				4489	struct btrfs_space_info *found;
				4490
				4491	rcu_read_lock();
				4492	list_for_each_entry_rcu(found, head, list) {
				4493	if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
				4494	found->force_alloc = CHUNK_ALLOC_FORCE;
				4495	}
				4496	rcu_read_unlock();
				4497	}
				4498
				4499	static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
				4500	{
				4501	return (global->size << 1);
				4502	}
				4503
				4504	static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
				4505	struct btrfs_space_info *sinfo, int force)
				4506	{
				4507	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
				4508	u64 bytes_used = btrfs_space_info_used(sinfo, false);
				4509	u64 thresh;
				4510
				4511	if (force == CHUNK_ALLOC_FORCE)
				4512	return 1;
				4513
				4514	/*
				4515	* We need to take into account the global rsv because for all intents
				4516	* and purposes it's used space. Don't worry about locking the
				4517	* global_rsv, it doesn't change except when the transaction commits.
				4518	*/
				4519	if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
				4520	bytes_used += calc_global_rsv_need_space(global_rsv);
				4521
				4522	/*
				4523	* in limited mode, we want to have some free space up to
				4524	* about 1% of the FS size.
				4525	*/
				4526	if (force == CHUNK_ALLOC_LIMITED) {
				4527	thresh = btrfs_super_total_bytes(fs_info->super_copy);
				4528	thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
				4529
				4530	if (sinfo->total_bytes - bytes_used < thresh)
				4531	return 1;
				4532	}
				4533
				4534	if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
				4535	return 0;
				4536	return 1;
				4537	}
				4538
				4539	static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
				4540	{
				4541	u64 num_dev;
				4542
				4543	if (type & (BTRFS_BLOCK_GROUP_RAID10 \|
				4544	BTRFS_BLOCK_GROUP_RAID0 \|
				4545	BTRFS_BLOCK_GROUP_RAID5 \|
				4546	BTRFS_BLOCK_GROUP_RAID6))
				4547	num_dev = fs_info->fs_devices->rw_devices;
				4548	else if (type & BTRFS_BLOCK_GROUP_RAID1)
				4549	num_dev = 2;
				4550	else
				4551	num_dev = 1; /* DUP or single */
				4552
				4553	return num_dev;
				4554	}
				4555
				4556	/*
				4557	* If @is_allocation is true, reserve space in the system space info necessary
				4558	* for allocating a chunk, otherwise if it's false, reserve space necessary for
				4559	* removing a chunk.
				4560	*/
				4561	void check_system_chunk(struct btrfs_trans_handle *trans,
				4562	struct btrfs_fs_info *fs_info, u64 type)
				4563	{
				4564	struct btrfs_space_info *info;
				4565	u64 left;
				4566	u64 thresh;
				4567	int ret = 0;
				4568	u64 num_devs;
				4569
				4570	/*
				4571	* Needed because we can end up allocating a system chunk and for an
				4572	* atomic and race free space reservation in the chunk block reserve.
				4573	*/
				4574	ASSERT(mutex_is_locked(&fs_info->chunk_mutex));
				4575
				4576	info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
				4577	spin_lock(&info->lock);
				4578	left = info->total_bytes - btrfs_space_info_used(info, true);
				4579	spin_unlock(&info->lock);
				4580
				4581	num_devs = get_profile_num_devs(fs_info, type);
				4582
				4583	/* num_devs device items to update and 1 chunk item to add or remove */
				4584	thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
				4585	btrfs_calc_trans_metadata_size(fs_info, 1);
				4586
				4587	if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
				4588	btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
				4589	left, thresh, type);
				4590	dump_space_info(fs_info, info, 0, 0);
				4591	}
				4592
				4593	if (left < thresh) {
				4594	u64 flags = btrfs_system_alloc_profile(fs_info);
				4595
				4596	/*
				4597	* Ignore failure to create system chunk. We might end up not
				4598	* needing it, as we might not need to COW all nodes/leafs from
				4599	* the paths we visit in the chunk tree (they were already COWed
				4600	* or created in the current transaction for example).
				4601	*/
				4602	ret = btrfs_alloc_chunk(trans, fs_info, flags);
				4603	}
				4604
				4605	if (!ret) {
				4606	ret = btrfs_block_rsv_add(fs_info->chunk_root,
				4607	&fs_info->chunk_block_rsv,
				4608	thresh, BTRFS_RESERVE_NO_FLUSH);
				4609	if (!ret)
				4610	trans->chunk_bytes_reserved += thresh;
				4611	}
				4612	}
				4613
				4614	/*
				4615	* If force is CHUNK_ALLOC_FORCE:
				4616	* - return 1 if it successfully allocates a chunk,
				4617	* - return errors including -ENOSPC otherwise.
				4618	* If force is NOT CHUNK_ALLOC_FORCE:
				4619	* - return 0 if it doesn't need to allocate a new chunk,
				4620	* - return 1 if it successfully allocates a chunk,
				4621	* - return errors including -ENOSPC otherwise.
				4622	*/
				4623	static int do_chunk_alloc(struct btrfs_trans_handle *trans,
				4624	struct btrfs_fs_info *fs_info, u64 flags, int force)
				4625	{
				4626	struct btrfs_space_info *space_info;
				4627	int wait_for_alloc = 0;
				4628	int ret = 0;
				4629
				4630	/* Don't re-enter if we're already allocating a chunk */
				4631	if (trans->allocating_chunk)
				4632	return -ENOSPC;
				4633
				4634	space_info = __find_space_info(fs_info, flags);
				4635	if (!space_info) {
				4636	ret = create_space_info(fs_info, flags, &space_info);
				4637	if (ret)
				4638	return ret;
				4639	}
				4640
				4641	again:
				4642	spin_lock(&space_info->lock);
				4643	if (force < space_info->force_alloc)
				4644	force = space_info->force_alloc;
				4645	if (space_info->full) {
				4646	if (should_alloc_chunk(fs_info, space_info, force))
				4647	ret = -ENOSPC;
				4648	else
				4649	ret = 0;
				4650	spin_unlock(&space_info->lock);
				4651	return ret;
				4652	}
				4653
				4654	if (!should_alloc_chunk(fs_info, space_info, force)) {
				4655	spin_unlock(&space_info->lock);
				4656	return 0;
				4657	} else if (space_info->chunk_alloc) {
				4658	wait_for_alloc = 1;
				4659	} else {
				4660	space_info->chunk_alloc = 1;
				4661	}
				4662
				4663	spin_unlock(&space_info->lock);
				4664
				4665	mutex_lock(&fs_info->chunk_mutex);
				4666
				4667	/*
				4668	* The chunk_mutex is held throughout the entirety of a chunk
				4669	* allocation, so once we've acquired the chunk_mutex we know that the
				4670	* other guy is done and we need to recheck and see if we should
				4671	* allocate.
				4672	*/
				4673	if (wait_for_alloc) {
				4674	mutex_unlock(&fs_info->chunk_mutex);
				4675	wait_for_alloc = 0;
				4676	cond_resched();
				4677	goto again;
				4678	}
				4679
				4680	trans->allocating_chunk = true;
				4681
				4682	/*
				4683	* If we have mixed data/metadata chunks we want to make sure we keep
				4684	* allocating mixed chunks instead of individual chunks.
				4685	*/
				4686	if (btrfs_mixed_space_info(space_info))
				4687	flags \|= (BTRFS_BLOCK_GROUP_DATA \| BTRFS_BLOCK_GROUP_METADATA);
				4688
				4689	/*
				4690	* if we're doing a data chunk, go ahead and make sure that
				4691	* we keep a reasonable number of metadata chunks allocated in the
				4692	* FS as well.
				4693	*/
				4694	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
				4695	fs_info->data_chunk_allocations++;
				4696	if (!(fs_info->data_chunk_allocations %
				4697	fs_info->metadata_ratio))
				4698	force_metadata_allocation(fs_info);
				4699	}
				4700
				4701	/*
				4702	* Check if we have enough space in SYSTEM chunk because we may need
				4703	* to update devices.
				4704	*/
				4705	check_system_chunk(trans, fs_info, flags);
				4706
				4707	ret = btrfs_alloc_chunk(trans, fs_info, flags);
				4708	trans->allocating_chunk = false;
				4709
				4710	spin_lock(&space_info->lock);
				4711	if (ret < 0 && ret != -ENOSPC)
				4712	goto out;
				4713	if (ret)
				4714	space_info->full = 1;
				4715	else
				4716	ret = 1;
				4717
				4718	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
				4719	out:
				4720	space_info->chunk_alloc = 0;
				4721	spin_unlock(&space_info->lock);
				4722	mutex_unlock(&fs_info->chunk_mutex);
				4723	/*
				4724	* When we allocate a new chunk we reserve space in the chunk block
				4725	* reserve to make sure we can COW nodes/leafs in the chunk tree or
				4726	* add new nodes/leafs to it if we end up needing to do it when
				4727	* inserting the chunk item and updating device items as part of the
				4728	* second phase of chunk allocation, performed by
				4729	* btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
				4730	* large number of new block groups to create in our transaction
				4731	* handle's new_bgs list to avoid exhausting the chunk block reserve
				4732	* in extreme cases - like having a single transaction create many new
				4733	* block groups when starting to write out the free space caches of all
				4734	* the block groups that were made dirty during the lifetime of the
				4735	* transaction.
				4736	*/
				4737	if (trans->can_flush_pending_bgs &&
				4738	trans->chunk_bytes_reserved >= (u64)SZ_2M) {
				4739	btrfs_create_pending_block_groups(trans, fs_info);
				4740	btrfs_trans_release_chunk_metadata(trans);
				4741	}
				4742	return ret;
				4743	}
				4744
				4745	static int can_overcommit(struct btrfs_fs_info *fs_info,
				4746	struct btrfs_space_info *space_info, u64 bytes,
				4747	enum btrfs_reserve_flush_enum flush,
				4748	bool system_chunk)
				4749	{
				4750	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
				4751	u64 profile;
				4752	u64 space_size;
				4753	u64 avail;
				4754	u64 used;
				4755
				4756	/* Don't overcommit when in mixed mode. */
				4757	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
				4758	return 0;
				4759
				4760	if (system_chunk)
				4761	profile = btrfs_system_alloc_profile(fs_info);
				4762	else
				4763	profile = btrfs_metadata_alloc_profile(fs_info);
				4764
				4765	used = btrfs_space_info_used(space_info, false);
				4766
				4767	/*
				4768	* We only want to allow over committing if we have lots of actual space
				4769	* free, but if we don't have enough space to handle the global reserve
				4770	* space then we could end up having a real enospc problem when trying
				4771	* to allocate a chunk or some other such important allocation.
				4772	*/
				4773	spin_lock(&global_rsv->lock);
				4774	space_size = calc_global_rsv_need_space(global_rsv);
				4775	spin_unlock(&global_rsv->lock);
				4776	if (used + space_size >= space_info->total_bytes)
				4777	return 0;
				4778
				4779	used += space_info->bytes_may_use;
				4780
				4781	avail = atomic64_read(&fs_info->free_chunk_space);
				4782
				4783	/*
				4784	* If we have dup, raid1 or raid10 then only half of the free
				4785	* space is actually useable. For raid56, the space info used
				4786	* doesn't include the parity drive, so we don't have to
				4787	* change the math
				4788	*/
				4789	if (profile & (BTRFS_BLOCK_GROUP_DUP \|
				4790	BTRFS_BLOCK_GROUP_RAID1 \|
				4791	BTRFS_BLOCK_GROUP_RAID10))
				4792	avail >>= 1;
				4793
				4794	/*
				4795	* If we aren't flushing all things, let us overcommit up to
				4796	* 1/2th of the space. If we can flush, don't let us overcommit
				4797	* too much, let it overcommit up to 1/8 of the space.
				4798	*/
				4799	if (flush == BTRFS_RESERVE_FLUSH_ALL)
				4800	avail >>= 3;
				4801	else
				4802	avail >>= 1;
				4803
				4804	if (used + bytes < space_info->total_bytes + avail)
				4805	return 1;
				4806	return 0;
				4807	}
				4808
				4809	static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
				4810	unsigned long nr_pages, int nr_items)
				4811	{
				4812	struct super_block *sb = fs_info->sb;
				4813
				4814	if (down_read_trylock(&sb->s_umount)) {
				4815	writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
				4816	up_read(&sb->s_umount);
				4817	} else {
				4818	/*
				4819	* We needn't worry the filesystem going from r/w to r/o though
				4820	* we don't acquire ->s_umount mutex, because the filesystem
				4821	* should guarantee the delalloc inodes list be empty after
				4822	* the filesystem is readonly(all dirty pages are written to
				4823	* the disk).
				4824	*/
				4825	btrfs_start_delalloc_roots(fs_info, 0, nr_items);
				4826	if (!current->journal_info)
				4827	btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
				4828	}
				4829	}
				4830
				4831	static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
				4832	u64 to_reclaim)
				4833	{
				4834	u64 bytes;
				4835	u64 nr;
				4836
				4837	bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
				4838	nr = div64_u64(to_reclaim, bytes);
				4839	if (!nr)
				4840	nr = 1;
				4841	return nr;
				4842	}
				4843
				4844	#define EXTENT_SIZE_PER_ITEM SZ_256K
				4845
				4846	/*
				4847	* shrink metadata reservation for delalloc
				4848	*/
				4849	static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
				4850	u64 orig, bool wait_ordered)
				4851	{
				4852	struct btrfs_block_rsv *block_rsv;
				4853	struct btrfs_space_info *space_info;
				4854	struct btrfs_trans_handle *trans;
				4855	u64 delalloc_bytes;
				4856	u64 max_reclaim;
				4857	u64 items;
				4858	long time_left;
				4859	unsigned long nr_pages;
				4860	int loops;
				4861	enum btrfs_reserve_flush_enum flush;
				4862
				4863	/* Calc the number of the pages we need flush for space reservation */
				4864	items = calc_reclaim_items_nr(fs_info, to_reclaim);
				4865	to_reclaim = items * EXTENT_SIZE_PER_ITEM;
				4866
				4867	trans = (struct btrfs_trans_handle *)current->journal_info;
				4868	block_rsv = &fs_info->delalloc_block_rsv;
				4869	space_info = block_rsv->space_info;
				4870
				4871	delalloc_bytes = percpu_counter_sum_positive(
				4872	&fs_info->delalloc_bytes);
				4873	if (delalloc_bytes == 0) {
				4874	if (trans)
				4875	return;
				4876	if (wait_ordered)
				4877	btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
				4878	return;
				4879	}
				4880
				4881	loops = 0;
				4882	while (delalloc_bytes && loops < 3) {
				4883	max_reclaim = min(delalloc_bytes, to_reclaim);
				4884	nr_pages = max_reclaim >> PAGE_SHIFT;
				4885	btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
				4886	/*
				4887	* We need to wait for the async pages to actually start before
				4888	* we do anything.
				4889	*/
				4890	max_reclaim = atomic_read(&fs_info->async_delalloc_pages);
				4891	if (!max_reclaim)
				4892	goto skip_async;
				4893
				4894	if (max_reclaim <= nr_pages)
				4895	max_reclaim = 0;
				4896	else
				4897	max_reclaim -= nr_pages;
				4898
				4899	wait_event(fs_info->async_submit_wait,
				4900	atomic_read(&fs_info->async_delalloc_pages) <=
				4901	(int)max_reclaim);
				4902	skip_async:
				4903	if (!trans)
				4904	flush = BTRFS_RESERVE_FLUSH_ALL;
				4905	else
				4906	flush = BTRFS_RESERVE_NO_FLUSH;
				4907	spin_lock(&space_info->lock);
				4908	if (list_empty(&space_info->tickets) &&
				4909	list_empty(&space_info->priority_tickets)) {
				4910	spin_unlock(&space_info->lock);
				4911	break;
				4912	}
				4913	spin_unlock(&space_info->lock);
				4914
				4915	loops++;
				4916	if (wait_ordered && !trans) {
				4917	btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
				4918	} else {
				4919	time_left = schedule_timeout_killable(1);
				4920	if (time_left)
				4921	break;
				4922	}
				4923	delalloc_bytes = percpu_counter_sum_positive(
				4924	&fs_info->delalloc_bytes);
				4925	}
				4926	}
				4927
				4928	struct reserve_ticket {	/* one queued metadata reservation request */
				4929	u64 bytes;	/* bytes still outstanding; reduced as space is handed over, 0 once satisfied */
				4930	int error;	/* set to -ENOSPC by wake_all_tickets(); returned by wait_reserve_ticket() */
				4931	struct list_head list;	/* entry on space_info->tickets or space_info->priority_tickets */
				4932	wait_queue_head_t wait;	/* woken when the ticket is satisfied or failed */
				4933	};
				4934
				4935	/**
				4936	* maybe_commit_transaction - possibly commit the transaction if its ok to
				4937	* @root - the root we're allocating for
				4938	* @bytes - the number of bytes we want to reserve
				4939	* @force - force the commit
				4940	*
				4941	* This will check to make sure that committing the transaction will actually
				4942	* get us somewhere and then commit the transaction if it does. Otherwise it
				4943	* will return -ENOSPC.
				4944	*/
				4945	static int may_commit_transaction(struct btrfs_fs_info *fs_info,
				4946	struct btrfs_space_info *space_info)
				4947	{
				4948	struct reserve_ticket *ticket = NULL;
				4949	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
				4950	struct btrfs_trans_handle *trans;
				4951	u64 bytes;
				4952
				4953	trans = (struct btrfs_trans_handle *)current->journal_info;
				4954	if (trans)
				4955	return -EAGAIN;
				4956
				4957	spin_lock(&space_info->lock);
				4958	if (!list_empty(&space_info->priority_tickets))
				4959	ticket = list_first_entry(&space_info->priority_tickets,
				4960	struct reserve_ticket, list);
				4961	else if (!list_empty(&space_info->tickets))
				4962	ticket = list_first_entry(&space_info->tickets,
				4963	struct reserve_ticket, list);
				4964	bytes = (ticket) ? ticket->bytes : 0;
				4965	spin_unlock(&space_info->lock);
				4966
				4967	if (!bytes)
				4968	return 0;
				4969
				4970	/* See if there is enough pinned space to make this reservation */
				4971	if (percpu_counter_compare(&space_info->total_bytes_pinned,
				4972	bytes) >= 0)
				4973	goto commit;
				4974
				4975	/*
				4976	* See if there is some space in the delayed insertion reservation for
				4977	* this reservation.
				4978	*/
				4979	if (space_info != delayed_rsv->space_info)
				4980	return -ENOSPC;
				4981
				4982	spin_lock(&delayed_rsv->lock);
				4983	if (delayed_rsv->size > bytes)
				4984	bytes = 0;
				4985	else
				4986	bytes -= delayed_rsv->size;
				4987	if (percpu_counter_compare(&space_info->total_bytes_pinned,
				4988	bytes) < 0) {
				4989	spin_unlock(&delayed_rsv->lock);
				4990	return -ENOSPC;
				4991	}
				4992	spin_unlock(&delayed_rsv->lock);
				4993
				4994	commit:
				4995	trans = btrfs_join_transaction(fs_info->extent_root);
				4996	if (IS_ERR(trans))
				4997	return -ENOSPC;
				4998
				4999	return btrfs_commit_transaction(trans);
				5000	}
				5001
				5002	/*
				5003	* Try to flush some data based on policy set by @state. This is only advisory
				5004	* and may fail for various reasons. The caller is supposed to examine the
				5005	* state of @space_info to detect the outcome.
				5006	*/
				5007	static void flush_space(struct btrfs_fs_info *fs_info,
				5008	struct btrfs_space_info *space_info, u64 num_bytes,
				5009	int state)
				5010	{
				5011	struct btrfs_root *root = fs_info->extent_root;
				5012	struct btrfs_trans_handle *trans;
				5013	int nr;
				5014	int ret = 0;
				5015
				5016	switch (state) {
				5017	case FLUSH_DELAYED_ITEMS_NR:
				5018	case FLUSH_DELAYED_ITEMS:
				5019	if (state == FLUSH_DELAYED_ITEMS_NR)
				5020	nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
				5021	else
				5022	nr = -1;
				5023
				5024	trans = btrfs_join_transaction(root);
				5025	if (IS_ERR(trans)) {
				5026	ret = PTR_ERR(trans);
				5027	break;
				5028	}
				5029	ret = btrfs_run_delayed_items_nr(trans, fs_info, nr);
				5030	btrfs_end_transaction(trans);
				5031	break;
				5032	case FLUSH_DELALLOC:
				5033	case FLUSH_DELALLOC_WAIT:
				5034	shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
				5035	state == FLUSH_DELALLOC_WAIT);
				5036	break;
				5037	case ALLOC_CHUNK:
				5038	trans = btrfs_join_transaction(root);
				5039	if (IS_ERR(trans)) {
				5040	ret = PTR_ERR(trans);
				5041	break;
				5042	}
				5043	ret = do_chunk_alloc(trans, fs_info,
				5044	btrfs_metadata_alloc_profile(fs_info),
				5045	CHUNK_ALLOC_NO_FORCE);
				5046	btrfs_end_transaction(trans);
				5047	if (ret > 0 \|\| ret == -ENOSPC)
				5048	ret = 0;
				5049	break;
				5050	case COMMIT_TRANS:
				5051	ret = may_commit_transaction(fs_info, space_info);
				5052	break;
				5053	default:
				5054	ret = -ENOSPC;
				5055	break;
				5056	}
				5057
				5058	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
				5059	ret);
				5060	return;
				5061	}
				5062
				5063	static inline u64
				5064	btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
				5065	struct btrfs_space_info *space_info,
				5066	bool system_chunk)
				5067	{
				5068	struct reserve_ticket *ticket;
				5069	u64 used;
				5070	u64 expected;
				5071	u64 to_reclaim = 0;
				5072
				5073	list_for_each_entry(ticket, &space_info->tickets, list)
				5074	to_reclaim += ticket->bytes;
				5075	list_for_each_entry(ticket, &space_info->priority_tickets, list)
				5076	to_reclaim += ticket->bytes;
				5077	if (to_reclaim)
				5078	return to_reclaim;
				5079
				5080	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
				5081	if (can_overcommit(fs_info, space_info, to_reclaim,
				5082	BTRFS_RESERVE_FLUSH_ALL, system_chunk))
				5083	return 0;
				5084
				5085	used = btrfs_space_info_used(space_info, true);
				5086
				5087	if (can_overcommit(fs_info, space_info, SZ_1M,
				5088	BTRFS_RESERVE_FLUSH_ALL, system_chunk))
				5089	expected = div_factor_fine(space_info->total_bytes, 95);
				5090	else
				5091	expected = div_factor_fine(space_info->total_bytes, 90);
				5092
				5093	if (used > expected)
				5094	to_reclaim = used - expected;
				5095	else
				5096	to_reclaim = 0;
				5097	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
				5098	space_info->bytes_reserved);
				5099	return to_reclaim;
				5100	}
				5101
				5102	static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
				5103	struct btrfs_space_info *space_info,
				5104	u64 used, bool system_chunk)
				5105	{
				5106	u64 thresh = div_factor_fine(space_info->total_bytes, 98);
				5107
				5108	/* If we're just plain full then async reclaim just slows us down. */
				5109	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
				5110	return 0;
				5111
				5112	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
				5113	system_chunk))
				5114	return 0;
				5115
				5116	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
				5117	!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
				5118	}
				5119
				5120	static void wake_all_tickets(struct list_head *head)
				5121	{
				5122	struct reserve_ticket *ticket;
				5123
				5124	while (!list_empty(head)) {
				5125	ticket = list_first_entry(head, struct reserve_ticket, list);
				5126	list_del_init(&ticket->list);
				5127	ticket->error = -ENOSPC;
				5128	wake_up(&ticket->wait);
				5129	}
				5130	}
				5131
				5132	/*
				5133	* This is for normal flushers, we can wait all goddamned day if we want to. We
				5134	* will loop and continuously try to flush as long as we are making progress.
				5135	* We count progress as clearing off tickets each time we have to loop.
				5136	*/
				5137	static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
				5138	{
				5139	struct btrfs_fs_info *fs_info;
				5140	struct btrfs_space_info *space_info;
				5141	u64 to_reclaim;
				5142	int flush_state;
				5143	int commit_cycles = 0;
				5144	u64 last_tickets_id;
				5145
				5146	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
				5147	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
				5148
				5149	spin_lock(&space_info->lock);
				5150	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
				5151	false);
				5152	if (!to_reclaim) {
				5153	space_info->flush = 0;
				5154	spin_unlock(&space_info->lock);
				5155	return;
				5156	}
				5157	last_tickets_id = space_info->tickets_id;
				5158	spin_unlock(&space_info->lock);
				5159
				5160	flush_state = FLUSH_DELAYED_ITEMS_NR;
				5161	do {
				5162	flush_space(fs_info, space_info, to_reclaim, flush_state);
				5163	spin_lock(&space_info->lock);
				5164	if (list_empty(&space_info->tickets)) {
				5165	space_info->flush = 0;
				5166	spin_unlock(&space_info->lock);
				5167	return;
				5168	}
				5169	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
				5170	space_info,
				5171	false);
				5172	if (last_tickets_id == space_info->tickets_id) {
				5173	flush_state++;
				5174	} else {
				5175	last_tickets_id = space_info->tickets_id;
				5176	flush_state = FLUSH_DELAYED_ITEMS_NR;
				5177	if (commit_cycles)
				5178	commit_cycles--;
				5179	}
				5180
				5181	if (flush_state > COMMIT_TRANS) {
				5182	commit_cycles++;
				5183	if (commit_cycles > 2) {
				5184	wake_all_tickets(&space_info->tickets);
				5185	space_info->flush = 0;
				5186	} else {
				5187	flush_state = FLUSH_DELAYED_ITEMS_NR;
				5188	}
				5189	}
				5190	spin_unlock(&space_info->lock);
				5191	} while (flush_state <= COMMIT_TRANS);
				5192	}
				5193
				/* Initialize @work so that it runs btrfs_async_reclaim_metadata_space(). */
				5194	void btrfs_init_async_reclaim_work(struct work_struct *work)
				5195	{
				5196	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
				5197	}
				5198
				5199	static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
				5200	struct btrfs_space_info *space_info,
				5201	struct reserve_ticket *ticket)
				5202	{
				5203	u64 to_reclaim;
				5204	int flush_state = FLUSH_DELAYED_ITEMS_NR;
				5205
				5206	spin_lock(&space_info->lock);
				5207	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
				5208	false);
				5209	if (!to_reclaim) {
				5210	spin_unlock(&space_info->lock);
				5211	return;
				5212	}
				5213	spin_unlock(&space_info->lock);
				5214
				5215	do {
				5216	flush_space(fs_info, space_info, to_reclaim, flush_state);
				5217	flush_state++;
				5218	spin_lock(&space_info->lock);
				5219	if (ticket->bytes == 0) {
				5220	spin_unlock(&space_info->lock);
				5221	return;
				5222	}
				5223	spin_unlock(&space_info->lock);
				5224
				5225	/*
				5226	* Priority flushers can't wait on delalloc without
				5227	* deadlocking.
				5228	*/
				5229	if (flush_state == FLUSH_DELALLOC \|\|
				5230	flush_state == FLUSH_DELALLOC_WAIT)
				5231	flush_state = ALLOC_CHUNK;
				5232	} while (flush_state < COMMIT_TRANS);
				5233	}
				5234
				5235	static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
				5236	struct btrfs_space_info *space_info,
				5237	struct reserve_ticket *ticket, u64 orig_bytes)
				5238
				5239	{
				5240	DEFINE_WAIT(wait);
				5241	int ret = 0;
				5242
				5243	spin_lock(&space_info->lock);
				5244	while (ticket->bytes > 0 && ticket->error == 0) {
				5245	ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
				5246	if (ret) {
				5247	ret = -EINTR;
				5248	break;
				5249	}
				5250	spin_unlock(&space_info->lock);
				5251
				5252	schedule();
				5253
				5254	finish_wait(&ticket->wait, &wait);
				5255	spin_lock(&space_info->lock);
				5256	}
				5257	if (!ret)
				5258	ret = ticket->error;
				5259	if (!list_empty(&ticket->list))
				5260	list_del_init(&ticket->list);
				5261	if (ticket->bytes && ticket->bytes < orig_bytes) {
				5262	u64 num_bytes = orig_bytes - ticket->bytes;
				5263	space_info->bytes_may_use -= num_bytes;
				5264	trace_btrfs_space_reservation(fs_info, "space_info",
				5265	space_info->flags, num_bytes, 0);
				5266	}
				5267	spin_unlock(&space_info->lock);
				5268
				5269	return ret;
				5270	}
				5271
				5272	/**
				5273	* reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
				5274	* @root - the root we're allocating for
				5275	* @space_info - the space info we want to allocate from
				5276	* @orig_bytes - the number of bytes we want
				5277	* @flush - whether or not we can flush to make our reservation
				5278	*
				5279	* This will reserve orig_bytes number of bytes from the space info associated
				5280	* with the block_rsv. If there is not enough space it will make an attempt to
				5281	* flush out space to make room. It will do this by flushing delalloc if
				5282	* possible or committing the transaction. If flush is 0 then no attempts to
				5283	* regain reservations will be made and this will fail if there is not enough
				5284	* space already.
				5285	*/
				5286	static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
				5287	struct btrfs_space_info *space_info,
				5288	u64 orig_bytes,
				5289	enum btrfs_reserve_flush_enum flush,
				5290	bool system_chunk)
				5291	{
				5292	struct reserve_ticket ticket;
				5293	u64 used;
				5294	int ret = 0;
				5295
				5296	ASSERT(orig_bytes);
				5297	ASSERT(!current->journal_info \|\| flush != BTRFS_RESERVE_FLUSH_ALL);
				5298
				5299	spin_lock(&space_info->lock);
				5300	ret = -ENOSPC;
				5301	used = btrfs_space_info_used(space_info, true);
				5302
				5303	/*
				5304	* If we have enough space then hooray, make our reservation and carry
				5305	* on. If not see if we can overcommit, and if we can, hooray carry on.
				5306	* If not things get more complicated.
				5307	*/
				5308	if (used + orig_bytes <= space_info->total_bytes) {
				5309	space_info->bytes_may_use += orig_bytes;
				5310	trace_btrfs_space_reservation(fs_info, "space_info",
				5311	space_info->flags, orig_bytes, 1);
				5312	ret = 0;
				5313	} else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
				5314	system_chunk)) {
				5315	space_info->bytes_may_use += orig_bytes;
				5316	trace_btrfs_space_reservation(fs_info, "space_info",
				5317	space_info->flags, orig_bytes, 1);
				5318	ret = 0;
				5319	}
				5320
				5321	/*
				5322	* If we couldn't make a reservation then setup our reservation ticket
				5323	* and kick the async worker if it's not already running.
				5324	*
				5325	* If we are a priority flusher then we just need to add our ticket to
				5326	* the list and we will do our own flushing further down.
				5327	*/
				5328	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
				5329	ticket.bytes = orig_bytes;
				5330	ticket.error = 0;
				5331	init_waitqueue_head(&ticket.wait);
				5332	if (flush == BTRFS_RESERVE_FLUSH_ALL) {
				5333	list_add_tail(&ticket.list, &space_info->tickets);
				5334	if (!space_info->flush) {
				5335	space_info->flush = 1;
				5336	trace_btrfs_trigger_flush(fs_info,
				5337	space_info->flags,
				5338	orig_bytes, flush,
				5339	"enospc");
				5340	queue_work(system_unbound_wq,
				5341	&fs_info->async_reclaim_work);
				5342	}
				5343	} else {
				5344	list_add_tail(&ticket.list,
				5345	&space_info->priority_tickets);
				5346	}
				5347	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
				5348	used += orig_bytes;
				5349	/*
				5350	* We will do the space reservation dance during log replay,
				5351	* which means we won't have fs_info->fs_root set, so don't do
				5352	* the async reclaim as we will panic.
				5353	*/
				5354	if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
				5355	need_do_async_reclaim(fs_info, space_info,
				5356	used, system_chunk) &&
				5357	!work_busy(&fs_info->async_reclaim_work)) {
				5358	trace_btrfs_trigger_flush(fs_info, space_info->flags,
				5359	orig_bytes, flush, "preempt");
				5360	queue_work(system_unbound_wq,
				5361	&fs_info->async_reclaim_work);
				5362	}
				5363	}
				5364	spin_unlock(&space_info->lock);
				5365	if (!ret \|\| flush == BTRFS_RESERVE_NO_FLUSH)
				5366	return ret;
				5367
				5368	if (flush == BTRFS_RESERVE_FLUSH_ALL)
				5369	return wait_reserve_ticket(fs_info, space_info, &ticket,
				5370	orig_bytes);
				5371
				5372	ret = 0;
				5373	priority_reclaim_metadata_space(fs_info, space_info, &ticket);
				5374	spin_lock(&space_info->lock);
				5375	if (ticket.bytes) {
				5376	if (ticket.bytes < orig_bytes) {
				5377	u64 num_bytes = orig_bytes - ticket.bytes;
				5378	space_info->bytes_may_use -= num_bytes;
				5379	trace_btrfs_space_reservation(fs_info, "space_info",
				5380	space_info->flags,
				5381	num_bytes, 0);
				5382
				5383	}
				5384	list_del_init(&ticket.list);
				5385	ret = -ENOSPC;
				5386	}
				5387	spin_unlock(&space_info->lock);
				5388	ASSERT(list_empty(&ticket.list));
				5389	return ret;
				5390	}
				5391
				5392	/**
				5393	* reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
				5394	* @root - the root we're allocating for
				5395	* @block_rsv - the block_rsv we're allocating for
				5396	* @orig_bytes - the number of bytes we want
				5397	* @flush - whether or not we can flush to make our reservation
				5398	*
				5399	* This will reserve orgi_bytes number of bytes from the space info associated
				5400	* with the block_rsv. If there is not enough space it will make an attempt to
				5401	* flush out space to make room. It will do this by flushing delalloc if
				5402	* possible or committing the transaction. If flush is 0 then no attempts to
				5403	* regain reservations will be made and this will fail if there is not enough
				5404	* space already.
				5405	*/
				5406	static int reserve_metadata_bytes(struct btrfs_root *root,
				5407	struct btrfs_block_rsv *block_rsv,
				5408	u64 orig_bytes,
				5409	enum btrfs_reserve_flush_enum flush)
				5410	{
				5411	struct btrfs_fs_info *fs_info = root->fs_info;
				5412	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
				5413	int ret;
				5414	bool system_chunk = (root == fs_info->chunk_root);
				5415
				5416	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
				5417	orig_bytes, flush, system_chunk);
				5418	if (ret == -ENOSPC &&
				5419	unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
				5420	if (block_rsv != global_rsv &&
				5421	!block_rsv_use_bytes(global_rsv, orig_bytes))
				5422	ret = 0;
				5423	}
				5424	if (ret == -ENOSPC)
				5425	trace_btrfs_space_reservation(fs_info, "space_info:enospc",
				5426	block_rsv->space_info->flags,
				5427	orig_bytes, 1);
				5428	return ret;
				5429	}
				5430
				5431	static struct btrfs_block_rsv *get_block_rsv(
				5432	const struct btrfs_trans_handle *trans,
				5433	const struct btrfs_root *root)
				5434	{
				5435	struct btrfs_fs_info *fs_info = root->fs_info;
				5436	struct btrfs_block_rsv *block_rsv = NULL;
				5437
				5438	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) \|\|
				5439	(root == fs_info->csum_root && trans->adding_csums) \|\|
				5440	(root == fs_info->uuid_root))
				5441	block_rsv = trans->block_rsv;
				5442
				5443	if (!block_rsv)
				5444	block_rsv = root->block_rsv;
				5445
				5446	if (!block_rsv)
				5447	block_rsv = &fs_info->empty_block_rsv;
				5448
				5449	return block_rsv;
				5450	}
				5451
				5452	static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
				5453	u64 num_bytes)
				5454	{
				5455	int ret = -ENOSPC;
				5456	spin_lock(&block_rsv->lock);
				5457	if (block_rsv->reserved >= num_bytes) {
				5458	block_rsv->reserved -= num_bytes;
				5459	if (block_rsv->reserved < block_rsv->size)
				5460	block_rsv->full = 0;
				5461	ret = 0;
				5462	}
				5463	spin_unlock(&block_rsv->lock);
				5464	return ret;
				5465	}
				5466
				5467	static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
				5468	u64 num_bytes, int update_size)
				5469	{
				5470	spin_lock(&block_rsv->lock);
				5471	block_rsv->reserved += num_bytes;
				5472	if (update_size)
				5473	block_rsv->size += num_bytes;
				5474	else if (block_rsv->reserved >= block_rsv->size)
				5475	block_rsv->full = 1;
				5476	spin_unlock(&block_rsv->lock);
				5477	}
				5478
				5479	int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
				5480	struct btrfs_block_rsv *dest, u64 num_bytes,
				5481	int min_factor)
				5482	{
				5483	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
				5484	u64 min_bytes;
				5485
				5486	if (global_rsv->space_info != dest->space_info)
				5487	return -ENOSPC;
				5488
				5489	spin_lock(&global_rsv->lock);
				5490	min_bytes = div_factor(global_rsv->size, min_factor);
				5491	if (global_rsv->reserved < min_bytes + num_bytes) {
				5492	spin_unlock(&global_rsv->lock);
				5493	return -ENOSPC;
				5494	}
				5495	global_rsv->reserved -= num_bytes;
				5496	if (global_rsv->reserved < global_rsv->size)
				5497	global_rsv->full = 0;
				5498	spin_unlock(&global_rsv->lock);
				5499
				5500	block_rsv_add_bytes(dest, num_bytes, 1);
				5501	return 0;
				5502	}
				5503
				5504	/*
				5505	* This is for space we already have accounted in space_info->bytes_may_use, so
				5506	* basically when we're returning space from block_rsv's.
				5507	*/
				5508	static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
				5509	struct btrfs_space_info *space_info,
				5510	u64 num_bytes)
				5511	{
				5512	struct reserve_ticket *ticket;
				5513	struct list_head *head;
				5514	u64 used;
				5515	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
				5516	bool check_overcommit = false;
				5517
				5518	spin_lock(&space_info->lock);
				5519	head = &space_info->priority_tickets;
				5520
				5521	/*
				5522	* If we are over our limit then we need to check and see if we can
				5523	* overcommit, and if we can't then we just need to free up our space
				5524	* and not satisfy any requests.
				5525	*/
				5526	used = btrfs_space_info_used(space_info, true);
				5527	if (used - num_bytes >= space_info->total_bytes)
				5528	check_overcommit = true;
				5529	again:
				5530	while (!list_empty(head) && num_bytes) {
				5531	ticket = list_first_entry(head, struct reserve_ticket,
				5532	list);
				5533	/*
				5534	* We use 0 bytes because this space is already reserved, so
				5535	* adding the ticket space would be a double count.
				5536	*/
				5537	if (check_overcommit &&
				5538	!can_overcommit(fs_info, space_info, 0, flush, false))
				5539	break;
				5540	if (num_bytes >= ticket->bytes) {
				5541	list_del_init(&ticket->list);
				5542	num_bytes -= ticket->bytes;
				5543	ticket->bytes = 0;
				5544	space_info->tickets_id++;
				5545	wake_up(&ticket->wait);
				5546	} else {
				5547	ticket->bytes -= num_bytes;
				5548	num_bytes = 0;
				5549	}
				5550	}
				5551
				5552	if (num_bytes && head == &space_info->priority_tickets) {
				5553	head = &space_info->tickets;
				5554	flush = BTRFS_RESERVE_FLUSH_ALL;
				5555	goto again;
				5556	}
				5557	space_info->bytes_may_use -= num_bytes;
				5558	trace_btrfs_space_reservation(fs_info, "space_info",
				5559	space_info->flags, num_bytes, 0);
				5560	spin_unlock(&space_info->lock);
				5561	}
				5562
				5563	/*
				5564	* This is for newly allocated space that isn't accounted in
				5565	* space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent
				5566	* we use this helper.
				5567	*/
				5568	static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
				5569	struct btrfs_space_info *space_info,
				5570	u64 num_bytes)
				5571	{
				5572	struct reserve_ticket *ticket;
				5573	struct list_head *head = &space_info->priority_tickets;
				5574
				5575	again:
				5576	while (!list_empty(head) && num_bytes) {
				5577	ticket = list_first_entry(head, struct reserve_ticket,
				5578	list);
				5579	if (num_bytes >= ticket->bytes) {
				5580	trace_btrfs_space_reservation(fs_info, "space_info",
				5581	space_info->flags,
				5582	ticket->bytes, 1);
				5583	list_del_init(&ticket->list);
				5584	num_bytes -= ticket->bytes;
				5585	space_info->bytes_may_use += ticket->bytes;
				5586	ticket->bytes = 0;
				5587	space_info->tickets_id++;
				5588	wake_up(&ticket->wait);
				5589	} else {
				5590	trace_btrfs_space_reservation(fs_info, "space_info",
				5591	space_info->flags,
				5592	num_bytes, 1);
				5593	space_info->bytes_may_use += num_bytes;
				5594	ticket->bytes -= num_bytes;
				5595	num_bytes = 0;
				5596	}
				5597	}
				5598
				5599	if (num_bytes && head == &space_info->priority_tickets) {
				5600	head = &space_info->tickets;
				5601	goto again;
				5602	}
				5603	}
				5604
				5605	static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
				5606	struct btrfs_block_rsv *block_rsv,
				5607	struct btrfs_block_rsv *dest, u64 num_bytes)
				5608	{
				5609	struct btrfs_space_info *space_info = block_rsv->space_info;
				5610
				5611	spin_lock(&block_rsv->lock);
				5612	if (num_bytes == (u64)-1)
				5613	num_bytes = block_rsv->size;
				5614	block_rsv->size -= num_bytes;
				5615	if (block_rsv->reserved >= block_rsv->size) {
				5616	num_bytes = block_rsv->reserved - block_rsv->size;
				5617	block_rsv->reserved = block_rsv->size;
				5618	block_rsv->full = 1;
				5619	} else {
				5620	num_bytes = 0;
				5621	}
				5622	spin_unlock(&block_rsv->lock);
				5623
				5624	if (num_bytes > 0) {
				5625	if (dest) {
				5626	spin_lock(&dest->lock);
				5627	if (!dest->full) {
				5628	u64 bytes_to_add;
				5629
				5630	bytes_to_add = dest->size - dest->reserved;
				5631	bytes_to_add = min(num_bytes, bytes_to_add);
				5632	dest->reserved += bytes_to_add;
				5633	if (dest->reserved >= dest->size)
				5634	dest->full = 1;
				5635	num_bytes -= bytes_to_add;
				5636	}
				5637	spin_unlock(&dest->lock);
				5638	}
				5639	if (num_bytes)
				5640	space_info_add_old_bytes(fs_info, space_info,
				5641	num_bytes);
				5642	}
				5643	}
				5644
				5645	int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
				5646	struct btrfs_block_rsv *dst, u64 num_bytes,
				5647	int update_size)
				5648	{
				5649	int ret;
				5650
				5651	ret = block_rsv_use_bytes(src, num_bytes);
				5652	if (ret)
				5653	return ret;
				5654
				5655	block_rsv_add_bytes(dst, num_bytes, update_size);
				5656	return 0;
				5657	}
				5658
				5659	void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
				5660	{
				5661	memset(rsv, 0, sizeof(*rsv));
				5662	spin_lock_init(&rsv->lock);
				5663	rsv->type = type;
				5664	}
				5665
				5666	struct btrfs_block_rsv btrfs_alloc_block_rsv(struct btrfs_fs_info fs_info,
				5667	unsigned short type)
				5668	{
				5669	struct btrfs_block_rsv *block_rsv;
				5670
				5671	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
				5672	if (!block_rsv)
				5673	return NULL;
				5674
				5675	btrfs_init_block_rsv(block_rsv, type);
				5676	block_rsv->space_info = __find_space_info(fs_info,
				5677	BTRFS_BLOCK_GROUP_METADATA);
				5678	return block_rsv;
				5679	}
				5680
				5681	void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
				5682	struct btrfs_block_rsv *rsv)
				5683	{
				5684	if (!rsv)
				5685	return;
				5686	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
				5687	kfree(rsv);
				5688	}
				5689
				5690	void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv)
				5691	{
				5692	kfree(rsv);
				5693	}
				5694
				5695	int btrfs_block_rsv_add(struct btrfs_root *root,
				5696	struct btrfs_block_rsv *block_rsv, u64 num_bytes,
				5697	enum btrfs_reserve_flush_enum flush)
				5698	{
				5699	int ret;
				5700
				5701	if (num_bytes == 0)
				5702	return 0;
				5703
				5704	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
				5705	if (!ret) {
				5706	block_rsv_add_bytes(block_rsv, num_bytes, 1);
				5707	return 0;
				5708	}
				5709
				5710	return ret;
				5711	}
				5712
				5713	int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
				5714	{
				5715	u64 num_bytes = 0;
				5716	int ret = -ENOSPC;
				5717
				5718	if (!block_rsv)
				5719	return 0;
				5720
				5721	spin_lock(&block_rsv->lock);
				5722	num_bytes = div_factor(block_rsv->size, min_factor);
				5723	if (block_rsv->reserved >= num_bytes)
				5724	ret = 0;
				5725	spin_unlock(&block_rsv->lock);
				5726
				5727	return ret;
				5728	}
				5729
				5730	int btrfs_block_rsv_refill(struct btrfs_root *root,
				5731	struct btrfs_block_rsv *block_rsv, u64 min_reserved,
				5732	enum btrfs_reserve_flush_enum flush)
				5733	{
				5734	u64 num_bytes = 0;
				5735	int ret = -ENOSPC;
				5736
				5737	if (!block_rsv)
				5738	return 0;
				5739
				5740	spin_lock(&block_rsv->lock);
				5741	num_bytes = min_reserved;
				5742	if (block_rsv->reserved >= num_bytes)
				5743	ret = 0;
				5744	else
				5745	num_bytes -= block_rsv->reserved;
				5746	spin_unlock(&block_rsv->lock);
				5747
				5748	if (!ret)
				5749	return 0;
				5750
				5751	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
				5752	if (!ret) {
				5753	block_rsv_add_bytes(block_rsv, num_bytes, 0);
				5754	return 0;
				5755	}
				5756
				5757	return ret;
				5758	}
				5759
				5760	void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
				5761	struct btrfs_block_rsv *block_rsv,
				5762	u64 num_bytes)
				5763	{
				5764	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
				5765
				5766	if (global_rsv == block_rsv \|\|
				5767	block_rsv->space_info != global_rsv->space_info)
				5768	global_rsv = NULL;
				5769	block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes);
				5770	}
				5771
				5772	static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
				5773	{
				5774	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
				5775	struct btrfs_space_info *sinfo = block_rsv->space_info;
				5776	u64 num_bytes;
				5777
				5778	/*
				5779	* The global block rsv is based on the size of the extent tree, the
				5780	* checksum tree and the root tree. If the fs is empty we want to set
				5781	* it to a minimal amount for safety.
				5782	*/
				5783	num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
				5784	btrfs_root_used(&fs_info->csum_root->root_item) +
				5785	btrfs_root_used(&fs_info->tree_root->root_item);
				5786	num_bytes = max_t(u64, num_bytes, SZ_16M);
				5787
				5788	spin_lock(&sinfo->lock);
				5789	spin_lock(&block_rsv->lock);
				5790
				5791	block_rsv->size = min_t(u64, num_bytes, SZ_512M);
				5792
				5793	if (block_rsv->reserved < block_rsv->size) {
				5794	num_bytes = btrfs_space_info_used(sinfo, true);
				5795	if (sinfo->total_bytes > num_bytes) {
				5796	num_bytes = sinfo->total_bytes - num_bytes;
				5797	num_bytes = min(num_bytes,
				5798	block_rsv->size - block_rsv->reserved);
				5799	block_rsv->reserved += num_bytes;
				5800	sinfo->bytes_may_use += num_bytes;
				5801	trace_btrfs_space_reservation(fs_info, "space_info",
				5802	sinfo->flags, num_bytes,
				5803	1);
				5804	}
				5805	} else if (block_rsv->reserved > block_rsv->size) {
				5806	num_bytes = block_rsv->reserved - block_rsv->size;
				5807	sinfo->bytes_may_use -= num_bytes;
				5808	trace_btrfs_space_reservation(fs_info, "space_info",
				5809	sinfo->flags, num_bytes, 0);
				5810	block_rsv->reserved = block_rsv->size;
				5811	}
				5812
				5813	if (block_rsv->reserved == block_rsv->size)
				5814	block_rsv->full = 1;
				5815	else
				5816	block_rsv->full = 0;
				5817
				5818	spin_unlock(&block_rsv->lock);
				5819	spin_unlock(&sinfo->lock);
				5820	}
				5821
				5822	static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
				5823	{
				5824	struct btrfs_space_info *space_info;
				5825
				5826	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
				5827	fs_info->chunk_block_rsv.space_info = space_info;
				5828
				5829	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
				5830	fs_info->global_block_rsv.space_info = space_info;
				5831	fs_info->delalloc_block_rsv.space_info = space_info;
				5832	fs_info->trans_block_rsv.space_info = space_info;
				5833	fs_info->empty_block_rsv.space_info = space_info;
				5834	fs_info->delayed_block_rsv.space_info = space_info;
				5835
				5836	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
				5837	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
				5838	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
				5839	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
				5840	if (fs_info->quota_root)
				5841	fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
				5842	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
				5843
				5844	update_global_block_rsv(fs_info);
				5845	}
				5846
				5847	static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
				5848	{
				5849	block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
				5850	(u64)-1);
				5851	WARN_ON(fs_info->delalloc_block_rsv.size > 0);
				5852	WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
				5853	WARN_ON(fs_info->trans_block_rsv.size > 0);
				5854	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
				5855	WARN_ON(fs_info->chunk_block_rsv.size > 0);
				5856	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
				5857	WARN_ON(fs_info->delayed_block_rsv.size > 0);
				5858	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
				5859	}
				5860
				5861	void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
				5862	struct btrfs_fs_info *fs_info)
				5863	{
				5864	if (!trans->block_rsv)
				5865	return;
				5866
				5867	if (!trans->bytes_reserved)
				5868	return;
				5869
				5870	trace_btrfs_space_reservation(fs_info, "transaction",
				5871	trans->transid, trans->bytes_reserved, 0);
				5872	btrfs_block_rsv_release(fs_info, trans->block_rsv,
				5873	trans->bytes_reserved);
				5874	trans->bytes_reserved = 0;
				5875	}
				5876
				5877	/*
				5878	* To be called after all the new block groups attached to the transaction
				5879	* handle have been created (btrfs_create_pending_block_groups()).
				5880	*/
				5881	void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
				5882	{
				5883	struct btrfs_fs_info *fs_info = trans->fs_info;
				5884
				5885	if (!trans->chunk_bytes_reserved)
				5886	return;
				5887
				5888	WARN_ON_ONCE(!list_empty(&trans->new_bgs));
				5889
				5890	block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
				5891	trans->chunk_bytes_reserved);
				5892	trans->chunk_bytes_reserved = 0;
				5893	}
				5894
				5895	/* Can only return 0 or -ENOSPC */
				5896	int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
				5897	struct btrfs_inode *inode)
				5898	{
				5899	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
				5900	struct btrfs_root *root = inode->root;
				5901	/*
				5902	* We always use trans->block_rsv here as we will have reserved space
				5903	* for our orphan when starting the transaction, using get_block_rsv()
				5904	* here will sometimes make us choose the wrong block rsv as we could be
				5905	* doing a reloc inode for a non refcounted root.
				5906	*/
				5907	struct btrfs_block_rsv *src_rsv = trans->block_rsv;
				5908	struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
				5909
				5910	/*
				5911	* We need to hold space in order to delete our orphan item once we've
				5912	* added it, so this takes the reservation so we can release it later
				5913	* when we are truly done with the orphan item.
				5914	*/
				5915	u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
				5916
				5917	trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
				5918	num_bytes, 1);
				5919	return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
				5920	}
				5921
				5922	void btrfs_orphan_release_metadata(struct btrfs_inode *inode)
				5923	{
				5924	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
				5925	struct btrfs_root *root = inode->root;
				5926	u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
				5927
				5928	trace_btrfs_space_reservation(fs_info, "orphan", btrfs_ino(inode),
				5929	num_bytes, 0);
				5930	btrfs_block_rsv_release(fs_info, root->orphan_block_rsv, num_bytes);
				5931	}
				5932
				5933	/*
				5934	* btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
				5935	* root: the root of the parent directory
				5936	* rsv: block reservation
				5937	* items: the number of items that we need do reservation
				5938	* qgroup_reserved: used to return the reserved size in qgroup
				5939	*
				5940	* This function is used to reserve the space for snapshot/subvolume
				5941	* creation and deletion. Those operations are different with the
				5942	* common file/directory operations, they change two fs/file trees
				5943	* and root tree, the number of items that the qgroup reserves is
				5944	* different with the free space reservation. So we can not use
				5945	* the space reservation mechanism in start_transaction().
				5946	*/
				5947	int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
				5948	struct btrfs_block_rsv *rsv,
				5949	int items,
				5950	u64 *qgroup_reserved,
				5951	bool use_global_rsv)
				5952	{
				5953	u64 num_bytes;
				5954	int ret;
				5955	struct btrfs_fs_info *fs_info = root->fs_info;
				5956	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
				5957
				5958	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
				5959	/* One for parent inode, two for dir entries */
				5960	num_bytes = 3 * fs_info->nodesize;
				5961	ret = btrfs_qgroup_reserve_meta(root, num_bytes, true);
				5962	if (ret)
				5963	return ret;
				5964	} else {
				5965	num_bytes = 0;
				5966	}
				5967
				5968	*qgroup_reserved = num_bytes;
				5969
				5970	num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
				5971	rsv->space_info = __find_space_info(fs_info,
				5972	BTRFS_BLOCK_GROUP_METADATA);
				5973	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
				5974	BTRFS_RESERVE_FLUSH_ALL);
				5975
				5976	if (ret == -ENOSPC && use_global_rsv)
				5977	ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
				5978
				5979	if (ret && *qgroup_reserved)
				5980	btrfs_qgroup_free_meta(root, *qgroup_reserved);
				5981
				5982	return ret;
				5983	}
				5984
				5985	void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
				5986	struct btrfs_block_rsv *rsv)
				5987	{
				5988	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
				5989	}
				5990
				5991	/**
				5992	* drop_outstanding_extent - drop an outstanding extent
				5993	* @inode: the inode we're dropping the extent for
				5994	* @num_bytes: the number of bytes we're releasing.
				5995	*
				5996	* This is called when we are freeing up an outstanding extent, either called
				5997	* after an error or after an extent is written. This will return the number of
				5998	* reserved extents that need to be freed. This must be called with
				5999	* BTRFS_I(inode)->lock held.
				6000	*/
				6001	static unsigned drop_outstanding_extent(struct btrfs_inode *inode,
				6002	u64 num_bytes)
				6003	{
				6004	unsigned drop_inode_space = 0;
				6005	unsigned dropped_extents = 0;
				6006	unsigned num_extents;
				6007
				6008	num_extents = count_max_extents(num_bytes);
				6009	ASSERT(num_extents);
				6010	ASSERT(inode->outstanding_extents >= num_extents);
				6011	inode->outstanding_extents -= num_extents;
				6012
				6013	if (inode->outstanding_extents == 0 &&
				6014	test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
				6015	&inode->runtime_flags))
				6016	drop_inode_space = 1;
				6017
				6018	/*
				6019	* If we have more or the same amount of outstanding extents than we have
				6020	* reserved then we need to leave the reserved extents count alone.
				6021	*/
				6022	if (inode->outstanding_extents >= inode->reserved_extents)
				6023	return drop_inode_space;
				6024
				6025	dropped_extents = inode->reserved_extents - inode->outstanding_extents;
				6026	inode->reserved_extents -= dropped_extents;
				6027	return dropped_extents + drop_inode_space;
				6028	}
				6029
				6030	/**
				6031	* calc_csum_metadata_size - return the amount of metadata space that must be
				6032	* reserved/freed for the given bytes.
				6033	* @inode: the inode we're manipulating
				6034	* @num_bytes: the number of bytes in question
				6035	* @reserve: 1 if we are reserving space, 0 if we are freeing space
				6036	*
				6037	* This adjusts the number of csum_bytes in the inode and then returns the
				6038	* correct amount of metadata that must either be reserved or freed. We
				6039	* calculate how many checksums we can fit into one leaf and then divide the
				6040	* number of bytes that will need to be checksumed by this value to figure out
				6041	* how many checksums will be required. If we are adding bytes then the number
				6042	* may go up and we will return the number of additional bytes that must be
				6043	* reserved. If it is going down we will return the number of bytes that must
				6044	* be freed.
				6045	*
				6046	* This must be called with BTRFS_I(inode)->lock held.
				6047	*/
				6048	static u64 calc_csum_metadata_size(struct btrfs_inode *inode, u64 num_bytes,
				6049	int reserve)
				6050	{
				6051	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
				6052	u64 old_csums, num_csums;
				6053
				6054	if (inode->flags & BTRFS_INODE_NODATASUM && inode->csum_bytes == 0)
				6055	return 0;
				6056
				6057	old_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
				6058	if (reserve)
				6059	inode->csum_bytes += num_bytes;
				6060	else
				6061	inode->csum_bytes -= num_bytes;
				6062	num_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
				6063
				6064	/* No change, no need to reserve more */
				6065	if (old_csums == num_csums)
				6066	return 0;
				6067
				6068	if (reserve)
				6069	return btrfs_calc_trans_metadata_size(fs_info,
				6070	num_csums - old_csums);
				6071
				6072	return btrfs_calc_trans_metadata_size(fs_info, old_csums - num_csums);
				6073	}
				6074
				6075	int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
				6076	{
				6077	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
				6078	struct btrfs_root *root = inode->root;
				6079	struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv;
				6080	u64 to_reserve = 0;
				6081	u64 csum_bytes;
				6082	unsigned nr_extents;
				6083	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
				6084	int ret = 0;
				6085	bool delalloc_lock = true;
				6086	u64 to_free = 0;
				6087	unsigned dropped;
				6088	bool release_extra = false;
				6089
				6090	/* If we are a free space inode we need to not flush since we will be in
				6091	* the middle of a transaction commit. We also don't need the delalloc
				6092	* mutex since we won't race with anybody. We need this mostly to make
				6093	* lockdep shut its filthy mouth.
				6094	*
				6095	* If we have a transaction open (can happen if we call truncate_block
				6096	* from truncate), then we need FLUSH_LIMIT so we don't deadlock.
				6097	*/
				6098	if (btrfs_is_free_space_inode(inode)) {
				6099	flush = BTRFS_RESERVE_NO_FLUSH;
				6100	delalloc_lock = false;
				6101	} else if (current->journal_info) {
				6102	flush = BTRFS_RESERVE_FLUSH_LIMIT;
				6103	}
				6104
				6105	if (flush != BTRFS_RESERVE_NO_FLUSH &&
				6106	btrfs_transaction_in_commit(fs_info))
				6107	schedule_timeout(1);
				6108
				6109	if (delalloc_lock)
				6110	mutex_lock(&inode->delalloc_mutex);
				6111
				6112	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
				6113
				6114	spin_lock(&inode->lock);
				6115	nr_extents = count_max_extents(num_bytes);
				6116	inode->outstanding_extents += nr_extents;
				6117
				6118	nr_extents = 0;
				6119	if (inode->outstanding_extents > inode->reserved_extents)
				6120	nr_extents += inode->outstanding_extents -
				6121	inode->reserved_extents;
				6122
				6123	/* We always want to reserve a slot for updating the inode. */
				6124	to_reserve = btrfs_calc_trans_metadata_size(fs_info, nr_extents + 1);
				6125	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
				6126	csum_bytes = inode->csum_bytes;
				6127	spin_unlock(&inode->lock);
				6128
				6129	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
				6130	ret = btrfs_qgroup_reserve_meta(root,
				6131	nr_extents * fs_info->nodesize, true);
				6132	if (ret)
				6133	goto out_fail;
				6134	}
				6135
				6136	ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush);
				6137	if (unlikely(ret)) {
				6138	btrfs_qgroup_free_meta(root,
				6139	nr_extents * fs_info->nodesize);
				6140	goto out_fail;
				6141	}
				6142
				6143	spin_lock(&inode->lock);
				6144	if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
				6145	&inode->runtime_flags)) {
				6146	to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1);
				6147	release_extra = true;
				6148	}
				6149	inode->reserved_extents += nr_extents;
				6150	spin_unlock(&inode->lock);
				6151
				6152	if (delalloc_lock)
				6153	mutex_unlock(&inode->delalloc_mutex);
				6154
				6155	if (to_reserve)
				6156	trace_btrfs_space_reservation(fs_info, "delalloc",
				6157	btrfs_ino(inode), to_reserve, 1);
				6158	if (release_extra)
				6159	btrfs_block_rsv_release(fs_info, block_rsv,
				6160	btrfs_calc_trans_metadata_size(fs_info, 1));
				6161	return 0;
				6162
				6163	out_fail:
				6164	spin_lock(&inode->lock);
				6165	dropped = drop_outstanding_extent(inode, num_bytes);
				6166	/*
				6167	* If the inodes csum_bytes is the same as the original
				6168	* csum_bytes then we know we haven't raced with any free()ers
				6169	* so we can just reduce our inodes csum bytes and carry on.
				6170	*/
				6171	if (inode->csum_bytes == csum_bytes) {
				6172	calc_csum_metadata_size(inode, num_bytes, 0);
				6173	} else {
				6174	u64 orig_csum_bytes = inode->csum_bytes;
				6175	u64 bytes;
				6176
				6177	/*
				6178	* This is tricky, but first we need to figure out how much we
				6179	* freed from any free-ers that occurred during this
				6180	* reservation, so we reset ->csum_bytes to the csum_bytes
				6181	* before we dropped our lock, and then call the free for the
				6182	* number of bytes that were freed while we were trying our
				6183	* reservation.
				6184	*/
				6185	bytes = csum_bytes - inode->csum_bytes;
				6186	inode->csum_bytes = csum_bytes;
				6187	to_free = calc_csum_metadata_size(inode, bytes, 0);
				6188
				6189
				6190	/*
				6191	* Now we need to see how much we would have freed had we not
				6192	* been making this reservation and our ->csum_bytes were not
				6193	* artificially inflated.
				6194	*/
				6195	inode->csum_bytes = csum_bytes - num_bytes;
				6196	bytes = csum_bytes - orig_csum_bytes;
				6197	bytes = calc_csum_metadata_size(inode, bytes, 0);
				6198
				6199	/*
				6200	* Now reset ->csum_bytes to what it should be. If bytes is
				6201	* more than to_free then we would have freed more space had we
				6202	* not had an artificially high ->csum_bytes, so we need to free
				6203	* the remainder. If bytes is the same or less then we don't
				6204	* need to do anything, the other free-ers did the correct
				6205	* thing.
				6206	*/
				6207	inode->csum_bytes = orig_csum_bytes - num_bytes;
				6208	if (bytes > to_free)
				6209	to_free = bytes - to_free;
				6210	else
				6211	to_free = 0;
				6212	}
				6213	spin_unlock(&inode->lock);
				6214	if (dropped)
				6215	to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
				6216
				6217	if (to_free) {
				6218	btrfs_block_rsv_release(fs_info, block_rsv, to_free);
				6219	trace_btrfs_space_reservation(fs_info, "delalloc",
				6220	btrfs_ino(inode), to_free, 0);
				6221	}
				6222	if (delalloc_lock)
				6223	mutex_unlock(&inode->delalloc_mutex);
				6224	return ret;
				6225	}
				6226
				6227	/**
				6228	* btrfs_delalloc_release_metadata - release a metadata reservation for an inode
				6229	* @inode: the inode to release the reservation for
				6230	* @num_bytes: the number of bytes we're releasing
				6231	*
				6232	* This will release the metadata reservation for an inode. This can be called
				6233	* once we complete IO for a given set of bytes to release their metadata
				6234	* reservations.
				6235	*/
				6236	void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes)
				6237	{
				6238	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
				6239	u64 to_free = 0;
				6240	unsigned dropped;
				6241
				6242	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
				6243	spin_lock(&inode->lock);
				6244	dropped = drop_outstanding_extent(inode, num_bytes);
				6245
				6246	if (num_bytes)
				6247	to_free = calc_csum_metadata_size(inode, num_bytes, 0);
				6248	spin_unlock(&inode->lock);
				6249	if (dropped > 0)
				6250	to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
				6251
				6252	if (btrfs_is_testing(fs_info))
				6253	return;
				6254
				6255	trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode),
				6256	to_free, 0);
				6257
				6258	btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free);
				6259	}
				6260
				6261	/**
				6262	* btrfs_delalloc_reserve_space - reserve data and metadata space for
				6263	* delalloc
				6264	* @inode: inode we're writing to
				6265	* @start: start range we are writing to
				6266	* @len: how long the range we are writing to
				6267	* @reserved: mandatory parameter, record actually reserved qgroup ranges of
				6268	* current reservation.
				6269	*
				6270	* This will do the following things
				6271	*
				6272	* o reserve space in data space info for num bytes
				6273	* and reserve precious corresponding qgroup space
				6274	* (Done in check_data_free_space)
				6275	*
				6276	* o reserve space for metadata space, based on the number of outstanding
				6277	* extents and how much csums will be needed
				6278	* also reserve metadata space in a per root over-reserve method.
				6279	* o add to the inodes->delalloc_bytes
				6280	* o add it to the fs_info's delalloc inodes list.
				6281	* (Above 3 all done in delalloc_reserve_metadata)
				6282	*
				6283	* Return 0 for success
				6284	* Return <0 for error(-ENOSPC or -EQUOT)
				6285	*/
				6286	int btrfs_delalloc_reserve_space(struct inode *inode,
				6287	struct extent_changeset **reserved, u64 start, u64 len)
				6288	{
				6289	int ret;
				6290
				6291	ret = btrfs_check_data_free_space(inode, reserved, start, len);
				6292	if (ret < 0)
				6293	return ret;
				6294	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
				6295	if (ret < 0)
				6296	btrfs_free_reserved_data_space(inode, *reserved, start, len);
				6297	return ret;
				6298	}
				6299
				6300	/**
				6301	* btrfs_delalloc_release_space - release data and metadata space for delalloc
				6302	* @inode: inode we're releasing space for
				6303	* @start: start position of the space already reserved
				6304	* @len: the len of the space already reserved
				6305	*
				6306	* This must be matched with a call to btrfs_delalloc_reserve_space. This is
				6307	* called in the case that we don't need the metadata AND data reservations
				6308	* anymore. So if there is an error or we insert an inline extent.
				6309	*
				6310	* This function will release the metadata space that was not used and will
				6311	* decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
				6312	* list if there are no delalloc bytes left.
				6313	* Also it will handle the qgroup reserved space.
				6314	*/
				6315	void btrfs_delalloc_release_space(struct inode *inode,
				6316	struct extent_changeset *reserved, u64 start, u64 len)
				6317	{
				6318	btrfs_delalloc_release_metadata(BTRFS_I(inode), len);
				6319	btrfs_free_reserved_data_space(inode, reserved, start, len);
				6320	}
				6321
				6322	static int update_block_group(struct btrfs_trans_handle *trans,
				6323	struct btrfs_fs_info *info, u64 bytenr,
				6324	u64 num_bytes, int alloc)
				6325	{
				6326	struct btrfs_block_group_cache *cache = NULL;
				6327	u64 total = num_bytes;
				6328	u64 old_val;
				6329	u64 byte_in_group;
				6330	int factor;
				6331
				6332	/* block accounting for super block */
				6333	spin_lock(&info->delalloc_root_lock);
				6334	old_val = btrfs_super_bytes_used(info->super_copy);
				6335	if (alloc)
				6336	old_val += num_bytes;
				6337	else
				6338	old_val -= num_bytes;
				6339	btrfs_set_super_bytes_used(info->super_copy, old_val);
				6340	spin_unlock(&info->delalloc_root_lock);
				6341
				6342	while (total) {
				6343	cache = btrfs_lookup_block_group(info, bytenr);
				6344	if (!cache)
				6345	return -ENOENT;
				6346	if (cache->flags & (BTRFS_BLOCK_GROUP_DUP \|
				6347	BTRFS_BLOCK_GROUP_RAID1 \|
				6348	BTRFS_BLOCK_GROUP_RAID10))
				6349	factor = 2;
				6350	else
				6351	factor = 1;
				6352	/*
				6353	* If this block group has free space cache written out, we
				6354	* need to make sure to load it if we are removing space. This
				6355	* is because we need the unpinning stage to actually add the
				6356	* space back to the block group, otherwise we will leak space.
				6357	*/
				6358	if (!alloc && cache->cached == BTRFS_CACHE_NO)
				6359	cache_block_group(cache, 1);
				6360
				6361	byte_in_group = bytenr - cache->key.objectid;
				6362	WARN_ON(byte_in_group > cache->key.offset);
				6363
				6364	spin_lock(&cache->space_info->lock);
				6365	spin_lock(&cache->lock);
				6366
				6367	if (btrfs_test_opt(info, SPACE_CACHE) &&
				6368	cache->disk_cache_state < BTRFS_DC_CLEAR)
				6369	cache->disk_cache_state = BTRFS_DC_CLEAR;
				6370
				6371	old_val = btrfs_block_group_used(&cache->item);
				6372	num_bytes = min(total, cache->key.offset - byte_in_group);
				6373	if (alloc) {
				6374	old_val += num_bytes;
				6375	btrfs_set_block_group_used(&cache->item, old_val);
				6376	cache->reserved -= num_bytes;
				6377	cache->space_info->bytes_reserved -= num_bytes;
				6378	cache->space_info->bytes_used += num_bytes;
				6379	cache->space_info->disk_used += num_bytes * factor;
				6380	spin_unlock(&cache->lock);
				6381	spin_unlock(&cache->space_info->lock);
				6382	} else {
				6383	old_val -= num_bytes;
				6384	btrfs_set_block_group_used(&cache->item, old_val);
				6385	cache->pinned += num_bytes;
				6386	cache->space_info->bytes_pinned += num_bytes;
				6387	cache->space_info->bytes_used -= num_bytes;
				6388	cache->space_info->disk_used -= num_bytes * factor;
				6389	spin_unlock(&cache->lock);
				6390	spin_unlock(&cache->space_info->lock);
				6391
				6392	trace_btrfs_space_reservation(info, "pinned",
				6393	cache->space_info->flags,
				6394	num_bytes, 1);
				6395	percpu_counter_add(&cache->space_info->total_bytes_pinned,
				6396	num_bytes);
				6397	set_extent_dirty(info->pinned_extents,
				6398	bytenr, bytenr + num_bytes - 1,
				6399	GFP_NOFS \| __GFP_NOFAIL);
				6400	}
				6401
				6402	spin_lock(&trans->transaction->dirty_bgs_lock);
				6403	if (list_empty(&cache->dirty_list)) {
				6404	list_add_tail(&cache->dirty_list,
				6405	&trans->transaction->dirty_bgs);
				6406	trans->transaction->num_dirty_bgs++;
				6407	btrfs_get_block_group(cache);
				6408	}
				6409	spin_unlock(&trans->transaction->dirty_bgs_lock);
				6410
				6411	/*
				6412	* No longer have used bytes in this block group, queue it for
				6413	* deletion. We do this after adding the block group to the
				6414	* dirty list to avoid races between cleaner kthread and space
				6415	* cache writeout.
				6416	*/
				6417	if (!alloc && old_val == 0) {
				6418	spin_lock(&info->unused_bgs_lock);
				6419	if (list_empty(&cache->bg_list)) {
				6420	btrfs_get_block_group(cache);
				6421	list_add_tail(&cache->bg_list,
				6422	&info->unused_bgs);
				6423	}
				6424	spin_unlock(&info->unused_bgs_lock);
				6425	}
				6426
				6427	btrfs_put_block_group(cache);
				6428	total -= num_bytes;
				6429	bytenr += num_bytes;
				6430	}
				6431	return 0;
				6432	}
				6433
				6434	static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
				6435	{
				6436	struct btrfs_block_group_cache *cache;
				6437	u64 bytenr;
				6438
				6439	spin_lock(&fs_info->block_group_cache_lock);
				6440	bytenr = fs_info->first_logical_byte;
				6441	spin_unlock(&fs_info->block_group_cache_lock);
				6442
				6443	if (bytenr < (u64)-1)
				6444	return bytenr;
				6445
				6446	cache = btrfs_lookup_first_block_group(fs_info, search_start);
				6447	if (!cache)
				6448	return 0;
				6449
				6450	bytenr = cache->key.objectid;
				6451	btrfs_put_block_group(cache);
				6452
				6453	return bytenr;
				6454	}
				6455
				6456	static int pin_down_extent(struct btrfs_fs_info *fs_info,
				6457	struct btrfs_block_group_cache *cache,
				6458	u64 bytenr, u64 num_bytes, int reserved)
				6459	{
				6460	spin_lock(&cache->space_info->lock);
				6461	spin_lock(&cache->lock);
				6462	cache->pinned += num_bytes;
				6463	cache->space_info->bytes_pinned += num_bytes;
				6464	if (reserved) {
				6465	cache->reserved -= num_bytes;
				6466	cache->space_info->bytes_reserved -= num_bytes;
				6467	}
				6468	spin_unlock(&cache->lock);
				6469	spin_unlock(&cache->space_info->lock);
				6470
				6471	trace_btrfs_space_reservation(fs_info, "pinned",
				6472	cache->space_info->flags, num_bytes, 1);
				6473	percpu_counter_add(&cache->space_info->total_bytes_pinned, num_bytes);
				6474	set_extent_dirty(fs_info->pinned_extents, bytenr,
				6475	bytenr + num_bytes - 1, GFP_NOFS \| __GFP_NOFAIL);
				6476	return 0;
				6477	}
				6478
				6479	/*
				6480	* this function must be called within transaction
				6481	*/
				6482	int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
				6483	u64 bytenr, u64 num_bytes, int reserved)
				6484	{
				6485	struct btrfs_block_group_cache *cache;
				6486
				6487	cache = btrfs_lookup_block_group(fs_info, bytenr);
				6488	BUG_ON(!cache); /* Logic error */
				6489
				6490	pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved);
				6491
				6492	btrfs_put_block_group(cache);
				6493	return 0;
				6494	}
				6495
				6496	/*
				6497	* this function must be called within transaction
				6498	*/
				6499	int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
				6500	u64 bytenr, u64 num_bytes)
				6501	{
				6502	struct btrfs_block_group_cache *cache;
				6503	int ret;
				6504
				6505	cache = btrfs_lookup_block_group(fs_info, bytenr);
				6506	if (!cache)
				6507	return -EINVAL;
				6508
				6509	/*
				6510	* pull in the free space cache (if any) so that our pin
				6511	* removes the free space from the cache. We have load_only set
				6512	* to one because the slow code to read in the free extents does check
				6513	* the pinned extents.
				6514	*/
				6515	cache_block_group(cache, 1);
				6516
				6517	pin_down_extent(fs_info, cache, bytenr, num_bytes, 0);
				6518
				6519	/* remove us from the free space cache (if we're there at all) */
				6520	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
				6521	btrfs_put_block_group(cache);
				6522	return ret;
				6523	}
				6524
				6525	static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
				6526	u64 start, u64 num_bytes)
				6527	{
				6528	int ret;
				6529	struct btrfs_block_group_cache *block_group;
				6530	struct btrfs_caching_control *caching_ctl;
				6531
				6532	block_group = btrfs_lookup_block_group(fs_info, start);
				6533	if (!block_group)
				6534	return -EINVAL;
				6535
				6536	cache_block_group(block_group, 0);
				6537	caching_ctl = get_caching_control(block_group);
				6538
				6539	if (!caching_ctl) {
				6540	/* Logic error */
				6541	BUG_ON(!block_group_cache_done(block_group));
				6542	ret = btrfs_remove_free_space(block_group, start, num_bytes);
				6543	} else {
				6544	mutex_lock(&caching_ctl->mutex);
				6545
				6546	if (start >= caching_ctl->progress) {
				6547	ret = add_excluded_extent(fs_info, start, num_bytes);
				6548	} else if (start + num_bytes <= caching_ctl->progress) {
				6549	ret = btrfs_remove_free_space(block_group,
				6550	start, num_bytes);
				6551	} else {
				6552	num_bytes = caching_ctl->progress - start;
				6553	ret = btrfs_remove_free_space(block_group,
				6554	start, num_bytes);
				6555	if (ret)
				6556	goto out_lock;
				6557
				6558	num_bytes = (start + num_bytes) -
				6559	caching_ctl->progress;
				6560	start = caching_ctl->progress;
				6561	ret = add_excluded_extent(fs_info, start, num_bytes);
				6562	}
				6563	out_lock:
				6564	mutex_unlock(&caching_ctl->mutex);
				6565	put_caching_control(caching_ctl);
				6566	}
				6567	btrfs_put_block_group(block_group);
				6568	return ret;
				6569	}
				6570
				6571	int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
				6572	struct extent_buffer *eb)
				6573	{
				6574	struct btrfs_file_extent_item *item;
				6575	struct btrfs_key key;
				6576	int found_type;
				6577	int i;
				6578
				6579	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
				6580	return 0;
				6581
				6582	for (i = 0; i < btrfs_header_nritems(eb); i++) {
				6583	btrfs_item_key_to_cpu(eb, &key, i);
				6584	if (key.type != BTRFS_EXTENT_DATA_KEY)
				6585	continue;
				6586	item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
				6587	found_type = btrfs_file_extent_type(eb, item);
				6588	if (found_type == BTRFS_FILE_EXTENT_INLINE)
				6589	continue;
				6590	if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
				6591	continue;
				6592	key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
				6593	key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
				6594	__exclude_logged_extent(fs_info, key.objectid, key.offset);
				6595	}
				6596
				6597	return 0;
				6598	}
				6599
				6600	static void
				6601	btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
				6602	{
				6603	atomic_inc(&bg->reservations);
				6604	}
				6605
				6606	void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
				6607	const u64 start)
				6608	{
				6609	struct btrfs_block_group_cache *bg;
				6610
				6611	bg = btrfs_lookup_block_group(fs_info, start);
				6612	ASSERT(bg);
				6613	if (atomic_dec_and_test(&bg->reservations))
				6614	wake_up_atomic_t(&bg->reservations);
				6615	btrfs_put_block_group(bg);
				6616	}
				6617
				6618	static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a)
				6619	{
				6620	schedule();
				6621	return 0;
				6622	}
				6623
				6624	void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
				6625	{
				6626	struct btrfs_space_info *space_info = bg->space_info;
				6627
				6628	ASSERT(bg->ro);
				6629
				6630	if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
				6631	return;
				6632
				6633	/*
				6634	* Our block group is read only but before we set it to read only,
				6635	* some task might have had allocated an extent from it already, but it
				6636	* has not yet created a respective ordered extent (and added it to a
				6637	* root's list of ordered extents).
				6638	* Therefore wait for any task currently allocating extents, since the
				6639	* block group's reservations counter is incremented while a read lock
				6640	* on the groups' semaphore is held and decremented after releasing
				6641	* the read access on that semaphore and creating the ordered extent.
				6642	*/
				6643	down_write(&space_info->groups_sem);
				6644	up_write(&space_info->groups_sem);
				6645
				6646	wait_on_atomic_t(&bg->reservations,
				6647	btrfs_wait_bg_reservations_atomic_t,
				6648	TASK_UNINTERRUPTIBLE);
				6649	}
				6650
				6651	/**
				6652	* btrfs_add_reserved_bytes - update the block_group and space info counters
				6653	* @cache: The cache we are manipulating
				6654	* @ram_bytes: The number of bytes of file content, and will be same to
				6655	* @num_bytes except for the compress path.
				6656	* @num_bytes: The number of bytes in question
				6657	* @delalloc: The blocks are allocated for the delalloc write
				6658	*
				6659	* This is called by the allocator when it reserves space. If this is a
				6660	* reservation and the block group has become read only we cannot make the
				6661	* reservation and return -EAGAIN, otherwise this function always succeeds.
				6662	*/
				6663	static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
				6664	u64 ram_bytes, u64 num_bytes, int delalloc)
				6665	{
				6666	struct btrfs_space_info *space_info = cache->space_info;
				6667	int ret = 0;
				6668
				6669	spin_lock(&space_info->lock);
				6670	spin_lock(&cache->lock);
				6671	if (cache->ro) {
				6672	ret = -EAGAIN;
				6673	} else {
				6674	cache->reserved += num_bytes;
				6675	space_info->bytes_reserved += num_bytes;
				6676
				6677	trace_btrfs_space_reservation(cache->fs_info,
				6678	"space_info", space_info->flags,
				6679	ram_bytes, 0);
				6680	space_info->bytes_may_use -= ram_bytes;
				6681	if (delalloc)
				6682	cache->delalloc_bytes += num_bytes;
				6683	}
				6684	spin_unlock(&cache->lock);
				6685	spin_unlock(&space_info->lock);
				6686	return ret;
				6687	}
				6688
				6689	/**
				6690	* btrfs_free_reserved_bytes - update the block_group and space info counters
				6691	* @cache: The cache we are manipulating
				6692	* @num_bytes: The number of bytes in question
				6693	* @delalloc: The blocks are allocated for the delalloc write
				6694	*
				6695	* This is called by somebody who is freeing space that was never actually used
				6696	* on disk. For example if you reserve some space for a new leaf in transaction
				6697	* A and before transaction A commits you free that leaf, you call this with
				6698	* reserve set to 0 in order to clear the reservation.
				6699	*/
				6700
				6701	static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
				6702	u64 num_bytes, int delalloc)
				6703	{
				6704	struct btrfs_space_info *space_info = cache->space_info;
				6705	int ret = 0;
				6706
				6707	spin_lock(&space_info->lock);
				6708	spin_lock(&cache->lock);
				6709	if (cache->ro)
				6710	space_info->bytes_readonly += num_bytes;
				6711	cache->reserved -= num_bytes;
				6712	space_info->bytes_reserved -= num_bytes;
				6713
				6714	if (delalloc)
				6715	cache->delalloc_bytes -= num_bytes;
				6716	spin_unlock(&cache->lock);
				6717	spin_unlock(&space_info->lock);
				6718	return ret;
				6719	}
				6720	void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
				6721	{
				6722	struct btrfs_caching_control *next;
				6723	struct btrfs_caching_control *caching_ctl;
				6724	struct btrfs_block_group_cache *cache;
				6725
				6726	down_write(&fs_info->commit_root_sem);
				6727
				6728	list_for_each_entry_safe(caching_ctl, next,
				6729	&fs_info->caching_block_groups, list) {
				6730	cache = caching_ctl->block_group;
				6731	if (block_group_cache_done(cache)) {
				6732	cache->last_byte_to_unpin = (u64)-1;
				6733	list_del_init(&caching_ctl->list);
				6734	put_caching_control(caching_ctl);
				6735	} else {
				6736	cache->last_byte_to_unpin = caching_ctl->progress;
				6737	}
				6738	}
				6739
				6740	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
				6741	fs_info->pinned_extents = &fs_info->freed_extents[1];
				6742	else
				6743	fs_info->pinned_extents = &fs_info->freed_extents[0];
				6744
				6745	up_write(&fs_info->commit_root_sem);
				6746
				6747	update_global_block_rsv(fs_info);
				6748	}
				6749
				6750	/*
				6751	* Returns the free cluster for the given space info and sets empty_cluster to
				6752	* what it should be based on the mount options.
				6753	*/
				6754	static struct btrfs_free_cluster *
				6755	fetch_cluster_info(struct btrfs_fs_info *fs_info,
				6756	struct btrfs_space_info space_info, u64 empty_cluster)
				6757	{
				6758	struct btrfs_free_cluster *ret = NULL;
				6759
				6760	*empty_cluster = 0;
				6761	if (btrfs_mixed_space_info(space_info))
				6762	return ret;
				6763
				6764	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
				6765	ret = &fs_info->meta_alloc_cluster;
				6766	if (btrfs_test_opt(fs_info, SSD))
				6767	*empty_cluster = SZ_2M;
				6768	else
				6769	*empty_cluster = SZ_64K;
				6770	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
				6771	btrfs_test_opt(fs_info, SSD_SPREAD)) {
				6772	*empty_cluster = SZ_2M;
				6773	ret = &fs_info->data_alloc_cluster;
				6774	}
				6775
				6776	return ret;
				6777	}
				6778
				6779	static int unpin_extent_range(struct btrfs_fs_info *fs_info,
				6780	u64 start, u64 end,
				6781	const bool return_free_space)
				6782	{
				6783	struct btrfs_block_group_cache *cache = NULL;
				6784	struct btrfs_space_info *space_info;
				6785	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
				6786	struct btrfs_free_cluster *cluster = NULL;
				6787	u64 len;
				6788	u64 total_unpinned = 0;
				6789	u64 empty_cluster = 0;
				6790	bool readonly;
				6791
				6792	while (start <= end) {
				6793	readonly = false;
				6794	if (!cache \|\|
				6795	start >= cache->key.objectid + cache->key.offset) {
				6796	if (cache)
				6797	btrfs_put_block_group(cache);
				6798	total_unpinned = 0;
				6799	cache = btrfs_lookup_block_group(fs_info, start);
				6800	BUG_ON(!cache); /* Logic error */
				6801
				6802	cluster = fetch_cluster_info(fs_info,
				6803	cache->space_info,
				6804	&empty_cluster);
				6805	empty_cluster <<= 1;
				6806	}
				6807
				6808	len = cache->key.objectid + cache->key.offset - start;
				6809	len = min(len, end + 1 - start);
				6810
				6811	if (start < cache->last_byte_to_unpin) {
				6812	len = min(len, cache->last_byte_to_unpin - start);
				6813	if (return_free_space)
				6814	btrfs_add_free_space(cache, start, len);
				6815	}
				6816
				6817	start += len;
				6818	total_unpinned += len;
				6819	space_info = cache->space_info;
				6820
				6821	/*
				6822	* If this space cluster has been marked as fragmented and we've
				6823	* unpinned enough in this block group to potentially allow a
				6824	* cluster to be created inside of it go ahead and clear the
				6825	* fragmented check.
				6826	*/
				6827	if (cluster && cluster->fragmented &&
				6828	total_unpinned > empty_cluster) {
				6829	spin_lock(&cluster->lock);
				6830	cluster->fragmented = 0;
				6831	spin_unlock(&cluster->lock);
				6832	}
				6833
				6834	spin_lock(&space_info->lock);
				6835	spin_lock(&cache->lock);
				6836	cache->pinned -= len;
				6837	space_info->bytes_pinned -= len;
				6838
				6839	trace_btrfs_space_reservation(fs_info, "pinned",
				6840	space_info->flags, len, 0);
				6841	space_info->max_extent_size = 0;
				6842	percpu_counter_add(&space_info->total_bytes_pinned, -len);
				6843	if (cache->ro) {
				6844	space_info->bytes_readonly += len;
				6845	readonly = true;
				6846	}
				6847	spin_unlock(&cache->lock);
				6848	if (!readonly && return_free_space &&
				6849	global_rsv->space_info == space_info) {
				6850	u64 to_add = len;
				6851
				6852	spin_lock(&global_rsv->lock);
				6853	if (!global_rsv->full) {
				6854	to_add = min(len, global_rsv->size -
				6855	global_rsv->reserved);
				6856	global_rsv->reserved += to_add;
				6857	space_info->bytes_may_use += to_add;
				6858	if (global_rsv->reserved >= global_rsv->size)
				6859	global_rsv->full = 1;
				6860	trace_btrfs_space_reservation(fs_info,
				6861	"space_info",
				6862	space_info->flags,
				6863	to_add, 1);
				6864	len -= to_add;
				6865	}
				6866	spin_unlock(&global_rsv->lock);
				6867	/* Add to any tickets we may have */
				6868	if (len)
				6869	space_info_add_new_bytes(fs_info, space_info,
				6870	len);
				6871	}
				6872	spin_unlock(&space_info->lock);
				6873	}
				6874
				6875	if (cache)
				6876	btrfs_put_block_group(cache);
				6877	return 0;
				6878	}
				6879
				6880	int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
				6881	struct btrfs_fs_info *fs_info)
				6882	{
				6883	struct btrfs_block_group_cache block_group, tmp;
				6884	struct list_head *deleted_bgs;
				6885	struct extent_io_tree *unpin;
				6886	u64 start;
				6887	u64 end;
				6888	int ret;
				6889
				6890	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
				6891	unpin = &fs_info->freed_extents[1];
				6892	else
				6893	unpin = &fs_info->freed_extents[0];
				6894
				6895	while (!trans->aborted) {
				6896	mutex_lock(&fs_info->unused_bg_unpin_mutex);
				6897	ret = find_first_extent_bit(unpin, 0, &start, &end,
				6898	EXTENT_DIRTY, NULL);
				6899	if (ret) {
				6900	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
				6901	break;
				6902	}
				6903
				6904	if (btrfs_test_opt(fs_info, DISCARD))
				6905	ret = btrfs_discard_extent(fs_info, start,
				6906	end + 1 - start, NULL);
				6907
				6908	clear_extent_dirty(unpin, start, end);
				6909	unpin_extent_range(fs_info, start, end, true);
				6910	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
				6911	cond_resched();
				6912	}
				6913
				6914	/*
				6915	* Transaction is finished. We don't need the lock anymore. We
				6916	* do need to clean up the block groups in case of a transaction
				6917	* abort.
				6918	*/
				6919	deleted_bgs = &trans->transaction->deleted_bgs;
				6920	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
				6921	u64 trimmed = 0;
				6922
				6923	ret = -EROFS;
				6924	if (!trans->aborted)
				6925	ret = btrfs_discard_extent(fs_info,
				6926	block_group->key.objectid,
				6927	block_group->key.offset,
				6928	&trimmed);
				6929
				6930	list_del_init(&block_group->bg_list);
				6931	btrfs_put_block_group_trimming(block_group);
				6932	btrfs_put_block_group(block_group);
				6933
				6934	if (ret) {
				6935	const char *errstr = btrfs_decode_error(ret);
				6936	btrfs_warn(fs_info,
				6937	"discard failed while removing blockgroup: errno=%d %s",
				6938	ret, errstr);
				6939	}
				6940	}
				6941
				6942	return 0;
				6943	}
				6944
				6945	static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
				6946	struct btrfs_fs_info *info,
				6947	struct btrfs_delayed_ref_node *node, u64 parent,
				6948	u64 root_objectid, u64 owner_objectid,
				6949	u64 owner_offset, int refs_to_drop,
				6950	struct btrfs_delayed_extent_op *extent_op)
				6951	{
				6952	struct btrfs_key key;
				6953	struct btrfs_path *path;
				6954	struct btrfs_root *extent_root = info->extent_root;
				6955	struct extent_buffer *leaf;
				6956	struct btrfs_extent_item *ei;
				6957	struct btrfs_extent_inline_ref *iref;
				6958	int ret;
				6959	int is_data;
				6960	int extent_slot = 0;
				6961	int found_extent = 0;
				6962	int num_to_del = 1;
				6963	u32 item_size;
				6964	u64 refs;
				6965	u64 bytenr = node->bytenr;
				6966	u64 num_bytes = node->num_bytes;
				6967	int last_ref = 0;
				6968	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
				6969
				6970	path = btrfs_alloc_path();
				6971	if (!path)
				6972	return -ENOMEM;
				6973
				6974	path->reada = READA_FORWARD;
				6975	path->leave_spinning = 1;
				6976
				6977	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
				6978	BUG_ON(!is_data && refs_to_drop != 1);
				6979
				6980	if (is_data)
				6981	skinny_metadata = 0;
				6982
				6983	ret = lookup_extent_backref(trans, info, path, &iref,
				6984	bytenr, num_bytes, parent,
				6985	root_objectid, owner_objectid,
				6986	owner_offset);
				6987	if (ret == 0) {
				6988	extent_slot = path->slots[0];
				6989	while (extent_slot >= 0) {
				6990	btrfs_item_key_to_cpu(path->nodes[0], &key,
				6991	extent_slot);
				6992	if (key.objectid != bytenr)
				6993	break;
				6994	if (key.type == BTRFS_EXTENT_ITEM_KEY &&
				6995	key.offset == num_bytes) {
				6996	found_extent = 1;
				6997	break;
				6998	}
				6999	if (key.type == BTRFS_METADATA_ITEM_KEY &&
				7000	key.offset == owner_objectid) {
				7001	found_extent = 1;
				7002	break;
				7003	}
				7004	if (path->slots[0] - extent_slot > 5)
				7005	break;
				7006	extent_slot--;
				7007	}
				7008	#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
				7009	item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
				7010	if (found_extent && item_size < sizeof(*ei))
				7011	found_extent = 0;
				7012	#endif
				7013	if (!found_extent) {
				7014	BUG_ON(iref);
				7015	ret = remove_extent_backref(trans, info, path, NULL,
				7016	refs_to_drop,
				7017	is_data, &last_ref);
				7018	if (ret) {
				7019	btrfs_abort_transaction(trans, ret);
				7020	goto out;
				7021	}
				7022	btrfs_release_path(path);
				7023	path->leave_spinning = 1;
				7024
				7025	key.objectid = bytenr;
				7026	key.type = BTRFS_EXTENT_ITEM_KEY;
				7027	key.offset = num_bytes;
				7028
				7029	if (!is_data && skinny_metadata) {
				7030	key.type = BTRFS_METADATA_ITEM_KEY;
				7031	key.offset = owner_objectid;
				7032	}
				7033
				7034	ret = btrfs_search_slot(trans, extent_root,
				7035	&key, path, -1, 1);
				7036	if (ret > 0 && skinny_metadata && path->slots[0]) {
				7037	/*
				7038	* Couldn't find our skinny metadata item,
				7039	* see if we have ye olde extent item.
				7040	*/
				7041	path->slots[0]--;
				7042	btrfs_item_key_to_cpu(path->nodes[0], &key,
				7043	path->slots[0]);
				7044	if (key.objectid == bytenr &&
				7045	key.type == BTRFS_EXTENT_ITEM_KEY &&
				7046	key.offset == num_bytes)
				7047	ret = 0;
				7048	}
				7049
				7050	if (ret > 0 && skinny_metadata) {
				7051	skinny_metadata = false;
				7052	key.objectid = bytenr;
				7053	key.type = BTRFS_EXTENT_ITEM_KEY;
				7054	key.offset = num_bytes;
				7055	btrfs_release_path(path);
				7056	ret = btrfs_search_slot(trans, extent_root,
				7057	&key, path, -1, 1);
				7058	}
				7059
				7060	if (ret) {
				7061	btrfs_err(info,
				7062	"umm, got %d back from search, was looking for %llu",
				7063	ret, bytenr);
				7064	if (ret > 0)
				7065	btrfs_print_leaf(path->nodes[0]);
				7066	}
				7067	if (ret < 0) {
				7068	btrfs_abort_transaction(trans, ret);
				7069	goto out;
				7070	}
				7071	extent_slot = path->slots[0];
				7072	}
				7073	} else if (WARN_ON(ret == -ENOENT)) {
				7074	btrfs_print_leaf(path->nodes[0]);
				7075	btrfs_err(info,
				7076	"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
				7077	bytenr, parent, root_objectid, owner_objectid,
				7078	owner_offset);
				7079	btrfs_abort_transaction(trans, ret);
				7080	goto out;
				7081	} else {
				7082	btrfs_abort_transaction(trans, ret);
				7083	goto out;
				7084	}
				7085
				7086	leaf = path->nodes[0];
				7087	item_size = btrfs_item_size_nr(leaf, extent_slot);
				7088	#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
				7089	if (item_size < sizeof(*ei)) {
				7090	BUG_ON(found_extent \|\| extent_slot != path->slots[0]);
				7091	ret = convert_extent_item_v0(trans, info, path, owner_objectid,
				7092	0);
				7093	if (ret < 0) {
				7094	btrfs_abort_transaction(trans, ret);
				7095	goto out;
				7096	}
				7097
				7098	btrfs_release_path(path);
				7099	path->leave_spinning = 1;
				7100
				7101	key.objectid = bytenr;
				7102	key.type = BTRFS_EXTENT_ITEM_KEY;
				7103	key.offset = num_bytes;
				7104
				7105	ret = btrfs_search_slot(trans, extent_root, &key, path,
				7106	-1, 1);
				7107	if (ret) {
				7108	btrfs_err(info,
				7109	"umm, got %d back from search, was looking for %llu",
				7110	ret, bytenr);
				7111	btrfs_print_leaf(path->nodes[0]);
				7112	}
				7113	if (ret < 0) {
				7114	btrfs_abort_transaction(trans, ret);
				7115	goto out;
				7116	}
				7117
				7118	extent_slot = path->slots[0];
				7119	leaf = path->nodes[0];
				7120	item_size = btrfs_item_size_nr(leaf, extent_slot);
				7121	}
				7122	#endif
				7123	BUG_ON(item_size < sizeof(*ei));
				7124	ei = btrfs_item_ptr(leaf, extent_slot,
				7125	struct btrfs_extent_item);
				7126	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
				7127	key.type == BTRFS_EXTENT_ITEM_KEY) {
				7128	struct btrfs_tree_block_info *bi;
				7129	BUG_ON(item_size < sizeof(ei) + sizeof(bi));
				7130	bi = (struct btrfs_tree_block_info *)(ei + 1);
				7131	WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
				7132	}
				7133
				7134	refs = btrfs_extent_refs(leaf, ei);
				7135	if (refs < refs_to_drop) {
				7136	btrfs_err(info,
				7137	"trying to drop %d refs but we only have %Lu for bytenr %Lu",
				7138	refs_to_drop, refs, bytenr);
				7139	ret = -EINVAL;
				7140	btrfs_abort_transaction(trans, ret);
				7141	goto out;
				7142	}
				7143	refs -= refs_to_drop;
				7144
				7145	if (refs > 0) {
				7146	if (extent_op)
				7147	__run_delayed_extent_op(extent_op, leaf, ei);
				7148	/*
				7149	* In the case of inline back ref, reference count will
				7150	* be updated by remove_extent_backref
				7151	*/
				7152	if (iref) {
				7153	BUG_ON(!found_extent);
				7154	} else {
				7155	btrfs_set_extent_refs(leaf, ei, refs);
				7156	btrfs_mark_buffer_dirty(leaf);
				7157	}
				7158	if (found_extent) {
				7159	ret = remove_extent_backref(trans, info, path,
				7160	iref, refs_to_drop,
				7161	is_data, &last_ref);
				7162	if (ret) {
				7163	btrfs_abort_transaction(trans, ret);
				7164	goto out;
				7165	}
				7166	}
				7167	} else {
				7168	if (found_extent) {
				7169	BUG_ON(is_data && refs_to_drop !=
				7170	extent_data_ref_count(path, iref));
				7171	if (iref) {
				7172	BUG_ON(path->slots[0] != extent_slot);
				7173	} else {
				7174	BUG_ON(path->slots[0] != extent_slot + 1);
				7175	path->slots[0] = extent_slot;
				7176	num_to_del = 2;
				7177	}
				7178	}
				7179
				7180	last_ref = 1;
				7181	ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
				7182	num_to_del);
				7183	if (ret) {
				7184	btrfs_abort_transaction(trans, ret);
				7185	goto out;
				7186	}
				7187	btrfs_release_path(path);
				7188
				7189	if (is_data) {
				7190	ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
				7191	if (ret) {
				7192	btrfs_abort_transaction(trans, ret);
				7193	goto out;
				7194	}
				7195	}
				7196
				7197	ret = add_to_free_space_tree(trans, info, bytenr, num_bytes);
				7198	if (ret) {
				7199	btrfs_abort_transaction(trans, ret);
				7200	goto out;
				7201	}
				7202
				7203	ret = update_block_group(trans, info, bytenr, num_bytes, 0);
				7204	if (ret) {
				7205	btrfs_abort_transaction(trans, ret);
				7206	goto out;
				7207	}
				7208	}
				7209	btrfs_release_path(path);
				7210
				7211	out:
				7212	btrfs_free_path(path);
				7213	return ret;
				7214	}
				7215
				7216	/*
				7217	* when we free an block, it is possible (and likely) that we free the last
				7218	* delayed ref for that extent as well. This searches the delayed ref tree for
				7219	* a given extent, and if there are no other delayed refs to be processed, it
				7220	* removes it from the tree.
				7221	*/
				7222	static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
				7223	u64 bytenr)
				7224	{
				7225	struct btrfs_delayed_ref_head *head;
				7226	struct btrfs_delayed_ref_root *delayed_refs;
				7227	int ret = 0;
				7228
				7229	delayed_refs = &trans->transaction->delayed_refs;
				7230	spin_lock(&delayed_refs->lock);
				7231	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
				7232	if (!head)
				7233	goto out_delayed_unlock;
				7234
				7235	spin_lock(&head->lock);
				7236	if (!list_empty(&head->ref_list))
				7237	goto out;
				7238
				7239	if (head->extent_op) {
				7240	if (!head->must_insert_reserved)
				7241	goto out;
				7242	btrfs_free_delayed_extent_op(head->extent_op);
				7243	head->extent_op = NULL;
				7244	}
				7245
				7246	/*
				7247	* waiting for the lock here would deadlock. If someone else has it
				7248	* locked they are already in the process of dropping it anyway
				7249	*/
				7250	if (!mutex_trylock(&head->mutex))
				7251	goto out;
				7252
				7253	/*
				7254	* at this point we have a head with no other entries. Go
				7255	* ahead and process it.
				7256	*/
				7257	head->node.in_tree = 0;
				7258	rb_erase(&head->href_node, &delayed_refs->href_root);
				7259
				7260	atomic_dec(&delayed_refs->num_entries);
				7261
				7262	/*
				7263	* we don't take a ref on the node because we're removing it from the
				7264	* tree, so we just steal the ref the tree was holding.
				7265	*/
				7266	delayed_refs->num_heads--;
				7267	if (head->processing == 0)
				7268	delayed_refs->num_heads_ready--;
				7269	head->processing = 0;
				7270	spin_unlock(&head->lock);
				7271	spin_unlock(&delayed_refs->lock);
				7272
				7273	BUG_ON(head->extent_op);
				7274	if (head->must_insert_reserved)
				7275	ret = 1;
				7276
				7277	mutex_unlock(&head->mutex);
				7278	btrfs_put_delayed_ref(&head->node);
				7279	return ret;
				7280	out:
				7281	spin_unlock(&head->lock);
				7282
				7283	out_delayed_unlock:
				7284	spin_unlock(&delayed_refs->lock);
				7285	return 0;
				7286	}
				7287
				7288	void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
				7289	struct btrfs_root *root,
				7290	struct extent_buffer *buf,
				7291	u64 parent, int last_ref)
				7292	{
				7293	struct btrfs_fs_info *fs_info = root->fs_info;
				7294	int pin = 1;
				7295	int ret;
				7296
				7297	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
				7298	int old_ref_mod, new_ref_mod;
				7299
				7300	ret = btrfs_add_delayed_tree_ref(fs_info, trans, buf->start,
				7301	buf->len, parent,
				7302	root->root_key.objectid,
				7303	btrfs_header_level(buf),
				7304	BTRFS_DROP_DELAYED_REF, NULL,
				7305	&old_ref_mod, &new_ref_mod);
				7306	BUG_ON(ret); /* -ENOMEM */
				7307	pin = old_ref_mod >= 0 && new_ref_mod < 0;
				7308	}
				7309
				7310	if (last_ref && btrfs_header_generation(buf) == trans->transid) {
				7311	struct btrfs_block_group_cache *cache;
				7312
				7313	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
				7314	ret = check_ref_cleanup(trans, buf->start);
				7315	if (!ret)
				7316	goto out;
				7317	}
				7318
				7319	pin = 0;
				7320	cache = btrfs_lookup_block_group(fs_info, buf->start);
				7321
				7322	if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
				7323	pin_down_extent(fs_info, cache, buf->start,
				7324	buf->len, 1);
				7325	btrfs_put_block_group(cache);
				7326	goto out;
				7327	}
				7328
				7329	WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
				7330
				7331	btrfs_add_free_space(cache, buf->start, buf->len);
				7332	btrfs_free_reserved_bytes(cache, buf->len, 0);
				7333	btrfs_put_block_group(cache);
				7334	trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
				7335	}
				7336	out:
				7337	if (pin)
				7338	add_pinned_bytes(fs_info, buf->len, btrfs_header_level(buf),
				7339	root->root_key.objectid);
				7340
				7341	if (last_ref) {
				7342	/*
				7343	* Deleting the buffer, clear the corrupt flag since it doesn't
				7344	* matter anymore.
				7345	*/
				7346	clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
				7347	}
				7348	}
				7349
				7350	/* Can return -ENOMEM */
				7351	int btrfs_free_extent(struct btrfs_trans_handle *trans,
				7352	struct btrfs_fs_info *fs_info,
				7353	u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
				7354	u64 owner, u64 offset)
				7355	{
				7356	int old_ref_mod, new_ref_mod;
				7357	int ret;
				7358
				7359	if (btrfs_is_testing(fs_info))
				7360	return 0;
				7361
				7362
				7363	/*
				7364	* tree log blocks never actually go into the extent allocation
				7365	* tree, just update pinning info and exit early.
				7366	*/
				7367	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
				7368	WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
				7369	/* unlocks the pinned mutex */
				7370	btrfs_pin_extent(fs_info, bytenr, num_bytes, 1);
				7371	old_ref_mod = new_ref_mod = 0;
				7372	ret = 0;
				7373	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
				7374	ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
				7375	num_bytes, parent,
				7376	root_objectid, (int)owner,
				7377	BTRFS_DROP_DELAYED_REF, NULL,
				7378	&old_ref_mod, &new_ref_mod);
				7379	} else {
				7380	ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
				7381	num_bytes, parent,
				7382	root_objectid, owner, offset,
				7383	0, BTRFS_DROP_DELAYED_REF,
				7384	&old_ref_mod, &new_ref_mod);
				7385	}
				7386
				7387	if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
				7388	add_pinned_bytes(fs_info, num_bytes, owner, root_objectid);
				7389
				7390	return ret;
				7391	}
				7392
				7393	/*
				7394	* when we wait for progress in the block group caching, its because
				7395	* our allocation attempt failed at least once. So, we must sleep
				7396	* and let some progress happen before we try again.
				7397	*
				7398	* This function will sleep at least once waiting for new free space to
				7399	* show up, and then it will check the block group free space numbers
				7400	* for our min num_bytes. Another option is to have it go ahead
				7401	* and look in the rbtree for a free extent of a given size, but this
				7402	* is a good start.
				7403	*
				7404	* Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
				7405	* any of the information in this block group.
				7406	*/
				7407	static noinline void
				7408	wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
				7409	u64 num_bytes)
				7410	{
				7411	struct btrfs_caching_control *caching_ctl;
				7412
				7413	caching_ctl = get_caching_control(cache);
				7414	if (!caching_ctl)
				7415	return;
				7416
				7417	wait_event(caching_ctl->wait, block_group_cache_done(cache) \|\|
				7418	(cache->free_space_ctl->free_space >= num_bytes));
				7419
				7420	put_caching_control(caching_ctl);
				7421	}
				7422
				7423	static noinline int
				7424	wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
				7425	{
				7426	struct btrfs_caching_control *caching_ctl;
				7427	int ret = 0;
				7428
				7429	caching_ctl = get_caching_control(cache);
				7430	if (!caching_ctl)
				7431	return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
				7432
				7433	wait_event(caching_ctl->wait, block_group_cache_done(cache));
				7434	if (cache->cached == BTRFS_CACHE_ERROR)
				7435	ret = -EIO;
				7436	put_caching_control(caching_ctl);
				7437	return ret;
				7438	}
				7439
				7440	int __get_raid_index(u64 flags)
				7441	{
				7442	if (flags & BTRFS_BLOCK_GROUP_RAID10)
				7443	return BTRFS_RAID_RAID10;
				7444	else if (flags & BTRFS_BLOCK_GROUP_RAID1)
				7445	return BTRFS_RAID_RAID1;
				7446	else if (flags & BTRFS_BLOCK_GROUP_DUP)
				7447	return BTRFS_RAID_DUP;
				7448	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
				7449	return BTRFS_RAID_RAID0;
				7450	else if (flags & BTRFS_BLOCK_GROUP_RAID5)
				7451	return BTRFS_RAID_RAID5;
				7452	else if (flags & BTRFS_BLOCK_GROUP_RAID6)
				7453	return BTRFS_RAID_RAID6;
				7454
				7455	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
				7456	}
				7457
				7458	int get_block_group_index(struct btrfs_block_group_cache *cache)
				7459	{
				7460	return __get_raid_index(cache->flags);
				7461	}
				7462
				/* Printable profile names; get_raid_name() indexes this by enum btrfs_raid_types. */
				7463	static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
				7464	[BTRFS_RAID_RAID10] = "raid10",
				7465	[BTRFS_RAID_RAID1] = "raid1",
				7466	[BTRFS_RAID_DUP] = "dup",
				7467	[BTRFS_RAID_RAID0] = "raid0",
				7468	[BTRFS_RAID_SINGLE] = "single",
				7469	[BTRFS_RAID_RAID5] = "raid5",
				7470	[BTRFS_RAID_RAID6] = "raid6",
				7471	};
				7472
				7473	static const char *get_raid_name(enum btrfs_raid_types type)
				7474	{
				7475	if (type >= BTRFS_NR_RAID_TYPES)
				7476	return NULL;
				7477
				7478	return btrfs_raid_type_names[type];
				7479	}
				7480
				/* Escalating search passes used by find_free_extent(), in the order they are tried. */
				7481	enum btrfs_loop_type {
				7482	LOOP_CACHING_NOWAIT = 0, /* search partially cached block groups, kicking caching as we go */
				7483	LOOP_CACHING_WAIT = 1, /* search everything, and wait if a block group is caching */
				7484	LOOP_ALLOC_CHUNK = 2, /* force a chunk allocation and try again */
				7485	LOOP_NO_EMPTY_SIZE = 3, /* set empty_size and empty_cluster to 0 and try again */
				7486	};
				7487
				7488	static inline void
				7489	btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
				7490	int delalloc)
				7491	{
				7492	if (delalloc)
				7493	down_read(&cache->data_rwsem);
				7494	}
				7495
				7496	static inline void
				7497	btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
				7498	int delalloc)
				7499	{
				7500	btrfs_get_block_group(cache);
				7501	if (delalloc)
				7502	down_read(&cache->data_rwsem);
				7503	}
				7504
				7505	static struct btrfs_block_group_cache *
				7506	btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
				7507	struct btrfs_free_cluster *cluster,
				7508	int delalloc)
				7509	{
				7510	struct btrfs_block_group_cache *used_bg = NULL;
				7511
				7512	spin_lock(&cluster->refill_lock);
				7513	while (1) {
				7514	used_bg = cluster->block_group;
				7515	if (!used_bg)
				7516	return NULL;
				7517
				7518	if (used_bg == block_group)
				7519	return used_bg;
				7520
				7521	btrfs_get_block_group(used_bg);
				7522
				7523	if (!delalloc)
				7524	return used_bg;
				7525
				7526	if (down_read_trylock(&used_bg->data_rwsem))
				7527	return used_bg;
				7528
				7529	spin_unlock(&cluster->refill_lock);
				7530
				7531	/* We should only have one-level nested. */
				7532	down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
				7533
				7534	spin_lock(&cluster->refill_lock);
				7535	if (used_bg == cluster->block_group)
				7536	return used_bg;
				7537
				7538	up_read(&used_bg->data_rwsem);
				7539	btrfs_put_block_group(used_bg);
				7540	}
				7541	}
				7542
				7543	static inline void
				7544	btrfs_release_block_group(struct btrfs_block_group_cache *cache,
				7545	int delalloc)
				7546	{
				7547	if (delalloc)
				7548	up_read(&cache->data_rwsem);
				7549	btrfs_put_block_group(cache);
				7550	}
				7551
				7552	/*
				7553	* walks the btree of allocated extents and find a hole of a given size.
				7554	* The key ins is changed to record the hole:
				7555	* ins->objectid == start position
				7556	* ins->flags = BTRFS_EXTENT_ITEM_KEY
				7557	* ins->offset == the size of the hole.
				7558	* Any available blocks before search_start are skipped.
				7559	*
				7560	* If there is no suitable free space, we will record the max size of
				7561	* the free space extent currently.
				* That size is stored in space_info->max_extent_size and returned to the
				* caller in ins->offset.
				*
				* ram_bytes:  passed through to btrfs_add_reserved_bytes()
				* num_bytes:  size of the extent to find (warns if below the sectorsize)
				* empty_size: extra room asked for around the allocation; dropped to 0
				*             in the LOOP_NO_EMPTY_SIZE pass
				* hint_byte:  preferred start; replaced by the cluster's window_start
				*             when the cluster has a block group or is fragmented
				* ins:        output key, see above
				* flags:      block group flags selecting the space_info and the RAID
				*             index the search starts at
				* delalloc:   when set, block groups are additionally locked through
				*             their data_rwsem while they are examined
				*
				* Returns 0 on success, -ENOSPC when nothing suitable was found (or no
				* space_info exists for flags), or the error from joining a transaction
				* or from a failed forced chunk allocation.
				7562	*/
				7563	static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
				7564	u64 ram_bytes, u64 num_bytes, u64 empty_size,
				7565	u64 hint_byte, struct btrfs_key *ins,
				7566	u64 flags, int delalloc)
				7567	{
				7568	int ret = 0;
				7569	struct btrfs_root *root = fs_info->extent_root;
				7570	struct btrfs_free_cluster *last_ptr = NULL;
				7571	struct btrfs_block_group_cache *block_group = NULL;
				7572	u64 search_start = 0;
				7573	u64 max_extent_size = 0;
				7574	u64 max_free_space = 0;
				7575	u64 empty_cluster = 0;
				7576	struct btrfs_space_info *space_info;
				7577	int loop = 0;
				7578	int index = __get_raid_index(flags);
				7579	bool failed_cluster_refill = false;
				7580	bool failed_alloc = false;
				7581	bool use_cluster = true;
				7582	bool have_caching_bg = false;
				7583	bool orig_have_caching_bg = false;
				7584	bool full_search = false;
				7585
				7586	WARN_ON(num_bytes < fs_info->sectorsize);
				7587	ins->type = BTRFS_EXTENT_ITEM_KEY;
				7588	ins->objectid = 0;
				7589	ins->offset = 0;
				7590
				7591	trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
				7592
				7593	space_info = __find_space_info(fs_info, flags);
				7594	if (!space_info) {
				7595	btrfs_err(fs_info, "No space info for %llu", flags);
				7596	return -ENOSPC;
				7597	}
				7598
				7599	/*
				7600	* If our free space is heavily fragmented we may not be able to make
				7601	* big contiguous allocations, so instead of doing the expensive search
				7602	* for free space, simply return ENOSPC with our max_extent_size so we
				7603	* can go ahead and search for a more manageable chunk.
				7604	*
				7605	* If our max_extent_size is large enough for our allocation simply
				7606	* disable clustering since we will likely not be able to find enough
				7607	* space to create a cluster and induce latency trying.
				7608	*/
				7609	if (unlikely(space_info->max_extent_size)) {
				7610	spin_lock(&space_info->lock);
				7611	if (space_info->max_extent_size &&
				7612	num_bytes > space_info->max_extent_size) {
				7613	ins->offset = space_info->max_extent_size;
				7614	spin_unlock(&space_info->lock);
				7615	return -ENOSPC;
				7616	} else if (space_info->max_extent_size) {
				7617	use_cluster = false;
				7618	}
				7619	spin_unlock(&space_info->lock);
				7620	}
				7621
				7622	last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster);
				7623	if (last_ptr) {
				7624	spin_lock(&last_ptr->lock);
				7625	if (last_ptr->block_group)
				7626	hint_byte = last_ptr->window_start;
				7627	if (last_ptr->fragmented) {
				7628	/*
				7629	* We still set window_start so we can keep track of the
				7630	* last place we found an allocation to try and save
				7631	* some time.
				7632	*/
				7633	hint_byte = last_ptr->window_start;
				7634	use_cluster = false;
				7635	}
				7636	spin_unlock(&last_ptr->lock);
				7637	}
				7638
				7639	search_start = max(search_start, first_logical_byte(fs_info, 0));
				7640	search_start = max(search_start, hint_byte);
				7641	if (search_start == hint_byte) {
				7642	block_group = btrfs_lookup_block_group(fs_info, search_start);
				7643	/*
				7644	* we don't want to use the block group if it doesn't match our
				7645	* allocation bits, or if its not cached.
				7646	*
				7647	* However if we are re-searching with an ideal block group
				7648	* picked out then we don't care that the block group is cached.
				7649	*/
				7650	if (block_group && block_group_bits(block_group, flags) &&
				7651	block_group->cached != BTRFS_CACHE_NO) {
				7652	down_read(&space_info->groups_sem);
				7653	if (list_empty(&block_group->list) \|\|
				7654	block_group->ro) {
				7655	/*
				7656	* someone is removing this block group,
				7657	* we can't jump into the have_block_group
				7658	* target because our list pointers are not
				7659	* valid
				7660	*/
				7661	btrfs_put_block_group(block_group);
				7662	up_read(&space_info->groups_sem);
				7663	} else {
				7664	index = get_block_group_index(block_group);
				7665	btrfs_lock_block_group(block_group, delalloc);
				7666	goto have_block_group;
				7667	}
				7668	} else if (block_group) {
				7669	btrfs_put_block_group(block_group);
				7670	}
				7671	}
				7672	search:
				7673	have_caching_bg = false;
				7674	if (index == 0 \|\| index == __get_raid_index(flags))
				7675	full_search = true;
				7676	down_read(&space_info->groups_sem);
				7677	list_for_each_entry(block_group, &space_info->block_groups[index],
				7678	list) {
				7679	u64 offset;
				7680	int cached;
				7681
				7682	/* If the block group is read-only, we can skip it entirely. */
				7683	if (unlikely(block_group->ro))
				7684	continue;
				7685
				7686	btrfs_grab_block_group(block_group, delalloc);
				7687	search_start = block_group->key.objectid;
				7688
				7689	/*
				7690	* this can happen if we end up cycling through all the
				7691	* raid types, but we want to make sure we only allocate
				7692	* for the proper type.
				7693	*/
				7694	if (!block_group_bits(block_group, flags)) {
				7695	u64 extra = BTRFS_BLOCK_GROUP_DUP \|
				7696	BTRFS_BLOCK_GROUP_RAID1 \|
				7697	BTRFS_BLOCK_GROUP_RAID5 \|
				7698	BTRFS_BLOCK_GROUP_RAID6 \|
				7699	BTRFS_BLOCK_GROUP_RAID10;
				7700
				7701	/*
				7702	* if they asked for extra copies and this block group
				7703	* doesn't provide them, bail. This does allow us to
				7704	* fill raid0 from raid1.
				7705	*/
				7706	if ((flags & extra) && !(block_group->flags & extra))
				7707	goto loop;
				7708
				7709	/*
				7710	* This block group has different flags than we want.
				7711	* It's possible that we have MIXED_GROUP flag but no
				7712	* block group is mixed. Just skip such block group.
				7713	*/
				7714	btrfs_release_block_group(block_group, delalloc);
				7715	continue;
				7716	}
				7717
				7718	have_block_group:
				7719	cached = block_group_cache_done(block_group);
				7720	if (unlikely(!cached)) {
				7721	have_caching_bg = true;
				7722	ret = cache_block_group(block_group, 0);
				7723	BUG_ON(ret < 0);
				7724	ret = 0;
				7725	}
				7726
				7727	if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
				7728	goto loop;
				7729
				7730	/*
				7731	* Ok we want to try and use the cluster allocator, so
				7732	* lets look there
				7733	*/
				7734	if (last_ptr && use_cluster) {
				7735	struct btrfs_block_group_cache *used_block_group;
				7736	unsigned long aligned_cluster;
				7737	/*
				7738	* the refill lock keeps out other
				7739	* people trying to start a new cluster
				7740	*/
				7741	used_block_group = btrfs_lock_cluster(block_group,
				7742	last_ptr,
				7743	delalloc);
				7744	if (!used_block_group)
				7745	goto refill_cluster;
				7746
				7747	if (used_block_group != block_group &&
				7748	(used_block_group->ro \|\|
				7749	!block_group_bits(used_block_group, flags)))
				7750	goto release_cluster;
				7751
				7752	offset = btrfs_alloc_from_cluster(used_block_group,
				7753	last_ptr,
				7754	num_bytes,
				7755	used_block_group->key.objectid,
				7756	&max_extent_size);
				7757	if (offset) {
				7758	/* we have a block, we're done */
				7759	spin_unlock(&last_ptr->refill_lock);
				7760	trace_btrfs_reserve_extent_cluster(fs_info,
				7761	used_block_group,
				7762	search_start, num_bytes);
				7763	if (used_block_group != block_group) {
				7764	btrfs_release_block_group(block_group,
				7765	delalloc);
				7766	block_group = used_block_group;
				7767	}
				7768	goto checks;
				7769	}
				7770
				7771	WARN_ON(last_ptr->block_group != used_block_group);
				7772	release_cluster:
				7773	/* If we are on LOOP_NO_EMPTY_SIZE, we can't
				7774	* set up a new clusters, so lets just skip it
				7775	* and let the allocator find whatever block
				7776	* it can find. If we reach this point, we
				7777	* will have tried the cluster allocator
				7778	* plenty of times and not have found
				7779	* anything, so we are likely way too
				7780	* fragmented for the clustering stuff to find
				7781	* anything.
				7782	*
				7783	* However, if the cluster is taken from the
				7784	* current block group, release the cluster
				7785	* first, so that we stand a better chance of
				7786	* succeeding in the unclustered
				7787	* allocation. */
				7788	if (loop >= LOOP_NO_EMPTY_SIZE &&
				7789	used_block_group != block_group) {
				7790	spin_unlock(&last_ptr->refill_lock);
				7791	btrfs_release_block_group(used_block_group,
				7792	delalloc);
				7793	goto unclustered_alloc;
				7794	}
				7795
				7796	/*
				7797	* this cluster didn't work out, free it and
				7798	* start over
				7799	*/
				7800	btrfs_return_cluster_to_free_space(NULL, last_ptr);
				7801
				7802	if (used_block_group != block_group)
				7803	btrfs_release_block_group(used_block_group,
				7804	delalloc);
				7805	refill_cluster:
				7806	if (loop >= LOOP_NO_EMPTY_SIZE) {
				7807	spin_unlock(&last_ptr->refill_lock);
				7808	goto unclustered_alloc;
				7809	}
				7810
				7811	aligned_cluster = max_t(unsigned long,
				7812	empty_cluster + empty_size,
				7813	block_group->full_stripe_len);
				7814
				7815	/* allocate a cluster in this block group */
				7816	ret = btrfs_find_space_cluster(fs_info, block_group,
				7817	last_ptr, search_start,
				7818	num_bytes,
				7819	aligned_cluster);
				7820	if (ret == 0) {
				7821	/*
				7822	* now pull our allocation out of this
				7823	* cluster
				7824	*/
				7825	offset = btrfs_alloc_from_cluster(block_group,
				7826	last_ptr,
				7827	num_bytes,
				7828	search_start,
				7829	&max_extent_size);
				7830	if (offset) {
				7831	/* we found one, proceed */
				7832	spin_unlock(&last_ptr->refill_lock);
				7833	trace_btrfs_reserve_extent_cluster(fs_info,
				7834	block_group, search_start,
				7835	num_bytes);
				7836	goto checks;
				7837	}
				7838	} else if (!cached && loop > LOOP_CACHING_NOWAIT
				7839	&& !failed_cluster_refill) {
				7840	spin_unlock(&last_ptr->refill_lock);
				7841
				7842	failed_cluster_refill = true;
				7843	wait_block_group_cache_progress(block_group,
				7844	num_bytes + empty_cluster + empty_size);
				7845	goto have_block_group;
				7846	}
				7847
				7848	/*
				7849	* at this point we either didn't find a cluster
				7850	* or we weren't able to allocate a block from our
				7851	* cluster. Free the cluster we've been trying
				7852	* to use, and go to the next block group
				7853	*/
				7854	btrfs_return_cluster_to_free_space(NULL, last_ptr);
				7855	spin_unlock(&last_ptr->refill_lock);
				7856	goto loop;
				7857	}
				7858
				7859	unclustered_alloc:
				7860	/*
				7861	* We are doing an unclustered alloc, set the fragmented flag so
				7862	* we don't bother trying to setup a cluster again until we get
				7863	* more space.
				7864	*/
				7865	if (unlikely(last_ptr)) {
				7866	spin_lock(&last_ptr->lock);
				7867	last_ptr->fragmented = 1;
				7868	spin_unlock(&last_ptr->lock);
				7869	}
				7870	if (cached) {
				7871	struct btrfs_free_space_ctl *ctl =
				7872	block_group->free_space_ctl;
				7873
				7874	spin_lock(&ctl->tree_lock);
				7875	if (ctl->free_space <
				7876	num_bytes + empty_cluster + empty_size) {
				7877	max_free_space = max(max_free_space,
				7878	ctl->free_space);
				7879	spin_unlock(&ctl->tree_lock);
				7880	goto loop;
				7881	}
				7882	spin_unlock(&ctl->tree_lock);
				7883	}
				7884
				7885	offset = btrfs_find_space_for_alloc(block_group, search_start,
				7886	num_bytes, empty_size,
				7887	&max_extent_size);
				7888	/*
				7889	* If we didn't find a chunk, and we haven't failed on this
				7890	* block group before, and this block group is in the middle of
				7891	* caching and we are ok with waiting, then go ahead and wait
				7892	* for progress to be made, and set failed_alloc to true.
				7893	*
				7894	* If failed_alloc is true then we've already waited on this
				7895	* block group once and should move on to the next block group.
				7896	*/
				7897	if (!offset && !failed_alloc && !cached &&
				7898	loop > LOOP_CACHING_NOWAIT) {
				7899	wait_block_group_cache_progress(block_group,
				7900	num_bytes + empty_size);
				7901	failed_alloc = true;
				7902	goto have_block_group;
				7903	} else if (!offset) {
				7904	goto loop;
				7905	}
				7906	checks:
				7907	search_start = ALIGN(offset, fs_info->stripesize);
				7908
				7909	/* move on to the next group */
				7910	if (search_start + num_bytes >
				7911	block_group->key.objectid + block_group->key.offset) {
				7912	btrfs_add_free_space(block_group, offset, num_bytes);
				7913	goto loop;
				7914	}
				7915
				7916	if (offset < search_start)
				7917	btrfs_add_free_space(block_group, offset,
				7918	search_start - offset);
				7919	BUG_ON(offset > search_start);
				7920
				7921	ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
				7922	num_bytes, delalloc);
				7923	if (ret == -EAGAIN) {
				7924	btrfs_add_free_space(block_group, offset, num_bytes);
				7925	goto loop;
				7926	}
				7927	btrfs_inc_block_group_reservations(block_group);
				7928
				7929	/* we are all good, lets return */
				7930	ins->objectid = search_start;
				7931	ins->offset = num_bytes;
				7932
				7933	trace_btrfs_reserve_extent(fs_info, block_group,
				7934	search_start, num_bytes);
				7935	btrfs_release_block_group(block_group, delalloc);
				7936	break;
				7937	loop:
				7938	failed_cluster_refill = false;
				7939	failed_alloc = false;
				7940	BUG_ON(index != get_block_group_index(block_group));
				7941	btrfs_release_block_group(block_group, delalloc);
				7942	cond_resched();
				7943	}
				7944	up_read(&space_info->groups_sem);
				7945
				7946	if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
				7947	&& !orig_have_caching_bg)
				7948	orig_have_caching_bg = true;
				7949
				/*
				* Nothing found, but a block group in this list was still caching and
				* this pass is allowed to wait: scan the same RAID index again.
				*/
				7950	if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
				7951	goto search;
				7952
				/* Nothing found at this RAID index: move on to the next one. */
				7953	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
				7954	goto search;
				7955
				7956	/*
				7957	* LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
				7958	* caching kthreads as we move along
				7959	* LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
				7960	* LOOP_ALLOC_CHUNK, force a chunk allocation and try again
				7961	* LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
				7962	* again
				7963	*/
				7964	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
				7965	index = 0;
				7966	if (loop == LOOP_CACHING_NOWAIT) {
				7967	/*
				7968	* We want to skip the LOOP_CACHING_WAIT step if we
				7969	* don't have any uncached bgs and we've already done a
				7970	* full search through.
				7971	*/
				7972	if (orig_have_caching_bg \|\| !full_search)
				7973	loop = LOOP_CACHING_WAIT;
				7974	else
				7975	loop = LOOP_ALLOC_CHUNK;
				7976	} else {
				7977	loop++;
				7978	}
				7979
				7980	if (loop == LOOP_ALLOC_CHUNK) {
				7981	struct btrfs_trans_handle *trans;
				7982	int exist = 0;
				7983
				/* Reuse the task's current transaction handle if it has one. */
				7984	trans = current->journal_info;
				7985	if (trans)
				7986	exist = 1;
				7987	else
				7988	trans = btrfs_join_transaction(root);
				7989
				7990	if (IS_ERR(trans)) {
				7991	ret = PTR_ERR(trans);
				7992	goto out;
				7993	}
				7994
				7995	ret = do_chunk_alloc(trans, fs_info, flags,
				7996	CHUNK_ALLOC_FORCE);
				7997
				7998	/*
				7999	* If we can't allocate a new chunk we've already looped
				8000	* through at least once, move on to the NO_EMPTY_SIZE
				8001	* case.
				8002	*/
				8003	if (ret == -ENOSPC)
				8004	loop = LOOP_NO_EMPTY_SIZE;
				8005
				8006	/*
				8007	* Do not bail out on ENOSPC since we
				8008	* can do more things.
				8009	*/
				8010	if (ret < 0 && ret != -ENOSPC)
				8011	btrfs_abort_transaction(trans, ret);
				8012	else
				8013	ret = 0;
				/* Only end the transaction if it was joined here. */
				8014	if (!exist)
				8015	btrfs_end_transaction(trans);
				8016	if (ret)
				8017	goto out;
				8018	}
				8019
				8020	if (loop == LOOP_NO_EMPTY_SIZE) {
				8021	/*
				8022	* Don't loop again if we already have no empty_size and
				8023	* no empty_cluster.
				8024	*/
				8025	if (empty_size == 0 &&
				8026	empty_cluster == 0) {
				8027	ret = -ENOSPC;
				8028	goto out;
				8029	}
				8030	empty_size = 0;
				8031	empty_cluster = 0;
				8032	}
				8033
				8034	goto search;
				8035	} else if (!ins->objectid) {
				8036	ret = -ENOSPC;
				8037	} else if (ins->objectid) {
				/* Unclustered success: remember where it landed for the next hint. */
				8038	if (!use_cluster && last_ptr) {
				8039	spin_lock(&last_ptr->lock);
				8040	last_ptr->window_start = ins->objectid;
				8041	spin_unlock(&last_ptr->lock);
				8042	}
				8043	ret = 0;
				8044	}
				8045	out:
				/*
				* On ENOSPC record the largest extent seen, falling back to the
				* largest per-block-group free space, in space_info and report it to
				* the caller through ins->offset.
				*/
				8046	if (ret == -ENOSPC) {
				8047	if (!max_extent_size)
				8048	max_extent_size = max_free_space;
				8049	spin_lock(&space_info->lock);
				8050	space_info->max_extent_size = max_extent_size;
				8051	spin_unlock(&space_info->lock);
				8052	ins->offset = max_extent_size;
				8053	}
				8054	return ret;
				8055	}
				8056
				8057	static void dump_space_info(struct btrfs_fs_info *fs_info,
				8058	struct btrfs_space_info *info, u64 bytes,
				8059	int dump_block_groups)
				8060	{
				8061	struct btrfs_block_group_cache *cache;
				8062	int index = 0;
				8063
				8064	spin_lock(&info->lock);
				8065	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
				8066	info->flags,
				8067	info->total_bytes - btrfs_space_info_used(info, true),
				8068	info->full ? "" : "not ");
				8069	btrfs_info(fs_info,
				8070	"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
				8071	info->total_bytes, info->bytes_used, info->bytes_pinned,
				8072	info->bytes_reserved, info->bytes_may_use,
				8073	info->bytes_readonly);
				8074	spin_unlock(&info->lock);
				8075
				8076	if (!dump_block_groups)
				8077	return;
				8078
				8079	down_read(&info->groups_sem);
				8080	again:
				8081	list_for_each_entry(cache, &info->block_groups[index], list) {
				8082	spin_lock(&cache->lock);
				8083	btrfs_info(fs_info,
				8084	"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
				8085	cache->key.objectid, cache->key.offset,
				8086	btrfs_block_group_used(&cache->item), cache->pinned,
				8087	cache->reserved, cache->ro ? "[readonly]" : "");
				8088	btrfs_dump_free_space(cache, bytes);
				8089	spin_unlock(&cache->lock);
				8090	}
				8091	if (++index < BTRFS_NR_RAID_TYPES)
				8092	goto again;
				8093	up_read(&info->groups_sem);
				8094	}
				8095
				8096	int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
				8097	u64 num_bytes, u64 min_alloc_size,
				8098	u64 empty_size, u64 hint_byte,
				8099	struct btrfs_key *ins, int is_data, int delalloc)
				8100	{
				8101	struct btrfs_fs_info *fs_info = root->fs_info;
				8102	bool final_tried = num_bytes == min_alloc_size;
				8103	u64 flags;
				8104	int ret;
				8105
				8106	flags = get_alloc_profile_by_root(root, is_data);
				8107	again:
				8108	WARN_ON(num_bytes < fs_info->sectorsize);
				8109	ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
				8110	hint_byte, ins, flags, delalloc);
				8111	if (!ret && !is_data) {
				8112	btrfs_dec_block_group_reservations(fs_info, ins->objectid);
				8113	} else if (ret == -ENOSPC) {
				8114	if (!final_tried && ins->offset) {
				8115	num_bytes = min(num_bytes >> 1, ins->offset);
				8116	num_bytes = round_down(num_bytes,
				8117	fs_info->sectorsize);
				8118	num_bytes = max(num_bytes, min_alloc_size);
				8119	ram_bytes = num_bytes;
				8120	if (num_bytes == min_alloc_size)
				8121	final_tried = true;
				8122	goto again;
				8123	} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
				8124	struct btrfs_space_info *sinfo;
				8125
				8126	sinfo = __find_space_info(fs_info, flags);
				8127	btrfs_err(fs_info,
				8128	"allocation failed flags %llu, wanted %llu",
				8129	flags, num_bytes);
				8130	if (sinfo)
				8131	dump_space_info(fs_info, sinfo, num_bytes, 1);
				8132	}
				8133	}
				8134
				8135	return ret;
				8136	}
				8137
				8138	static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
				8139	u64 start, u64 len,
				8140	int pin, int delalloc)
				8141	{
				8142	struct btrfs_block_group_cache *cache;
				8143	int ret = 0;
				8144
				8145	cache = btrfs_lookup_block_group(fs_info, start);
				8146	if (!cache) {
				8147	btrfs_err(fs_info, "Unable to find block group for %llu",
				8148	start);
				8149	return -ENOSPC;
				8150	}
				8151
				8152	if (pin)
				8153	pin_down_extent(fs_info, cache, start, len, 1);
				8154	else {
				8155	if (btrfs_test_opt(fs_info, DISCARD))
				8156	ret = btrfs_discard_extent(fs_info, start, len, NULL);
				8157	btrfs_add_free_space(cache, start, len);
				8158	btrfs_free_reserved_bytes(cache, len, delalloc);
				8159	trace_btrfs_reserved_extent_free(fs_info, start, len);
				8160	}
				8161
				8162	btrfs_put_block_group(cache);
				8163	return ret;
				8164	}
				8165
				8166	int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
				8167	u64 start, u64 len, int delalloc)
				8168	{
				8169	return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
				8170	}
				8171
				8172	int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
				8173	u64 start, u64 len)
				8174	{
				8175	return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
				8176	}
				8177
				8178	static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				8179	struct btrfs_fs_info *fs_info,
				8180	u64 parent, u64 root_objectid,
				8181	u64 flags, u64 owner, u64 offset,
				8182	struct btrfs_key *ins, int ref_mod)
				8183	{
				8184	int ret;
				8185	struct btrfs_extent_item *extent_item;
				8186	struct btrfs_extent_inline_ref *iref;
				8187	struct btrfs_path *path;
				8188	struct extent_buffer *leaf;
				8189	int type;
				8190	u32 size;
				8191
				8192	if (parent > 0)
				8193	type = BTRFS_SHARED_DATA_REF_KEY;
				8194	else
				8195	type = BTRFS_EXTENT_DATA_REF_KEY;
				8196
				8197	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
				8198
				8199	path = btrfs_alloc_path();
				8200	if (!path)
				8201	return -ENOMEM;
				8202
				8203	path->leave_spinning = 1;
				8204	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
				8205	ins, size);
				8206	if (ret) {
				8207	btrfs_free_path(path);
				8208	return ret;
				8209	}
				8210
				8211	leaf = path->nodes[0];
				8212	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				8213	struct btrfs_extent_item);
				8214	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
				8215	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
				8216	btrfs_set_extent_flags(leaf, extent_item,
				8217	flags \| BTRFS_EXTENT_FLAG_DATA);
				8218
				8219	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
				8220	btrfs_set_extent_inline_ref_type(leaf, iref, type);
				8221	if (parent > 0) {
				8222	struct btrfs_shared_data_ref *ref;
				8223	ref = (struct btrfs_shared_data_ref *)(iref + 1);
				8224	btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
				8225	btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
				8226	} else {
				8227	struct btrfs_extent_data_ref *ref;
				8228	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
				8229	btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
				8230	btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
				8231	btrfs_set_extent_data_ref_offset(leaf, ref, offset);
				8232	btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
				8233	}
				8234
				8235	btrfs_mark_buffer_dirty(path->nodes[0]);
				8236	btrfs_free_path(path);
				8237
				8238	ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
				8239	ins->offset);
				8240	if (ret)
				8241	return ret;
				8242
				8243	ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1);
				8244	if (ret) { /* -ENOENT, logic error */
				8245	btrfs_err(fs_info, "update block group failed for %llu %llu",
				8246	ins->objectid, ins->offset);
				8247	BUG();
				8248	}
				8249	trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
				8250	return ret;
				8251	}
				8252
				8253	static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				8254	struct btrfs_fs_info *fs_info,
				8255	u64 parent, u64 root_objectid,
				8256	u64 flags, struct btrfs_disk_key *key,
				8257	int level, struct btrfs_key *ins)
				8258	{
				8259	int ret;
				8260	struct btrfs_extent_item *extent_item;
				8261	struct btrfs_tree_block_info *block_info;
				8262	struct btrfs_extent_inline_ref *iref;
				8263	struct btrfs_path *path;
				8264	struct extent_buffer *leaf;
				8265	u32 size = sizeof(extent_item) + sizeof(iref);
				8266	u64 num_bytes = ins->offset;
				8267	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
				8268
				8269	if (!skinny_metadata)
				8270	size += sizeof(*block_info);
				8271
				8272	path = btrfs_alloc_path();
				8273	if (!path) {
				8274	btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid,
				8275	fs_info->nodesize);
				8276	return -ENOMEM;
				8277	}
				8278
				8279	path->leave_spinning = 1;
				8280	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
				8281	ins, size);
				8282	if (ret) {
				8283	btrfs_free_path(path);
				8284	btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid,
				8285	fs_info->nodesize);
				8286	return ret;
				8287	}
				8288
				8289	leaf = path->nodes[0];
				8290	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				8291	struct btrfs_extent_item);
				8292	btrfs_set_extent_refs(leaf, extent_item, 1);
				8293	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
				8294	btrfs_set_extent_flags(leaf, extent_item,
				8295	flags \| BTRFS_EXTENT_FLAG_TREE_BLOCK);
				8296
				8297	if (skinny_metadata) {
				8298	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
				8299	num_bytes = fs_info->nodesize;
				8300	} else {
				8301	block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
				8302	btrfs_set_tree_block_key(leaf, block_info, key);
				8303	btrfs_set_tree_block_level(leaf, block_info, level);
				8304	iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
				8305	}
				8306
				8307	if (parent > 0) {
				8308	BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
				8309	btrfs_set_extent_inline_ref_type(leaf, iref,
				8310	BTRFS_SHARED_BLOCK_REF_KEY);
				8311	btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
				8312	} else {
				8313	btrfs_set_extent_inline_ref_type(leaf, iref,
				8314	BTRFS_TREE_BLOCK_REF_KEY);
				8315	btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
				8316	}
				8317
				8318	btrfs_mark_buffer_dirty(leaf);
				8319	btrfs_free_path(path);
				8320
				8321	ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
				8322	num_bytes);
				8323	if (ret)
				8324	return ret;
				8325
				8326	ret = update_block_group(trans, fs_info, ins->objectid,
				8327	fs_info->nodesize, 1);
				8328	if (ret) { /* -ENOENT, logic error */
				8329	btrfs_err(fs_info, "update block group failed for %llu %llu",
				8330	ins->objectid, ins->offset);
				8331	BUG();
				8332	}
				8333
				8334	trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid,
				8335	fs_info->nodesize);
				8336	return ret;
				8337	}
				8338
				8339	int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				8340	u64 root_objectid, u64 owner,
				8341	u64 offset, u64 ram_bytes,
				8342	struct btrfs_key *ins)
				8343	{
				8344	struct btrfs_fs_info *fs_info = trans->fs_info;
				8345	int ret;
				8346
				8347	BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
				8348
				8349	ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid,
				8350	ins->offset, 0, root_objectid, owner,
				8351	offset, ram_bytes,
				8352	BTRFS_ADD_DELAYED_EXTENT, NULL, NULL);
				8353	return ret;
				8354	}
				8355
				8356	/*
				8357	* this is used by the tree logging recovery code. It records that
				8358	* an extent has been allocated and makes sure to clear the free
				8359	* space cache bits as well
				8360	*/
				8361	int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
				8362	struct btrfs_fs_info *fs_info,
				8363	u64 root_objectid, u64 owner, u64 offset,
				8364	struct btrfs_key *ins)
				8365	{
				8366	int ret;
				8367	struct btrfs_block_group_cache *block_group;
				8368	struct btrfs_space_info *space_info;
				8369
				8370	/*
				8371	* Mixed block groups will exclude before processing the log so we only
				8372	* need to do the exclude dance if this fs isn't mixed.
				8373	*/
				8374	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
				8375	ret = __exclude_logged_extent(fs_info, ins->objectid,
				8376	ins->offset);
				8377	if (ret)
				8378	return ret;
				8379	}
				8380
				8381	block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
				8382	if (!block_group)
				8383	return -EINVAL;
				8384
				8385	space_info = block_group->space_info;
				8386	spin_lock(&space_info->lock);
				8387	spin_lock(&block_group->lock);
				8388	space_info->bytes_reserved += ins->offset;
				8389	block_group->reserved += ins->offset;
				8390	spin_unlock(&block_group->lock);
				8391	spin_unlock(&space_info->lock);
				8392
				8393	ret = alloc_reserved_file_extent(trans, fs_info, 0, root_objectid,
				8394	0, owner, offset, ins, 1);
				8395	btrfs_put_block_group(block_group);
				8396	return ret;
				8397	}
				8398
				8399	static struct extent_buffer *
				8400	btrfs_init_new_buffer(struct btrfs_trans_handle trans, struct btrfs_root root,
				8401	u64 bytenr, int level)
				8402	{
				8403	struct btrfs_fs_info *fs_info = root->fs_info;
				8404	struct extent_buffer *buf;
				8405
				8406	buf = btrfs_find_create_tree_block(fs_info, bytenr);
				8407	if (IS_ERR(buf))
				8408	return buf;
				8409
				8410	/*
				8411	* Extra safety check in case the extent tree is corrupted and extent
				8412	* allocator chooses to use a tree block which is already used and
				8413	* locked.
				8414	*/
				8415	if (buf->lock_owner == current->pid) {
				8416	btrfs_err_rl(fs_info,
				8417	"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
				8418	buf->start, btrfs_header_owner(buf), current->pid);
				8419	free_extent_buffer(buf);
				8420	return ERR_PTR(-EUCLEAN);
				8421	}
				8422
				8423	btrfs_set_header_generation(buf, trans->transid);
				8424	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
				8425	btrfs_tree_lock(buf);
				8426	clean_tree_block(fs_info, buf);
				8427	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
				8428
				8429	btrfs_set_lock_blocking(buf);
				8430	set_extent_buffer_uptodate(buf);
				8431
				8432	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
				8433	buf->log_index = root->log_transid % 2;
				8434	/*
				8435	* we allow two log transactions at a time, use different
				8436	* EXENT bit to differentiate dirty pages.
				8437	*/
				8438	if (buf->log_index == 0)
				8439	set_extent_dirty(&root->dirty_log_pages, buf->start,
				8440	buf->start + buf->len - 1, GFP_NOFS);
				8441	else
				8442	set_extent_new(&root->dirty_log_pages, buf->start,
				8443	buf->start + buf->len - 1);
				8444	} else {
				8445	buf->log_index = -1;
				8446	set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
				8447	buf->start + buf->len - 1, GFP_NOFS);
				8448	}
				8449	trans->dirty = true;
				8450	/* this returns a buffer locked for blocking */
				8451	return buf;
				8452	}
				8453
				8454	static struct btrfs_block_rsv *
				8455	use_block_rsv(struct btrfs_trans_handle *trans,
				8456	struct btrfs_root *root, u32 blocksize)
				8457	{
				8458	struct btrfs_fs_info *fs_info = root->fs_info;
				8459	struct btrfs_block_rsv *block_rsv;
				8460	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
				8461	int ret;
				8462	bool global_updated = false;
				8463
				8464	block_rsv = get_block_rsv(trans, root);
				8465
				8466	if (unlikely(block_rsv->size == 0))
				8467	goto try_reserve;
				8468	again:
				8469	ret = block_rsv_use_bytes(block_rsv, blocksize);
				8470	if (!ret)
				8471	return block_rsv;
				8472
				8473	if (block_rsv->failfast)
				8474	return ERR_PTR(ret);
				8475
				8476	if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
				8477	global_updated = true;
				8478	update_global_block_rsv(fs_info);
				8479	goto again;
				8480	}
				8481
				8482	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
				8483	static DEFINE_RATELIMIT_STATE(_rs,
				8484	DEFAULT_RATELIMIT_INTERVAL * 10,
				8485	/DEFAULT_RATELIMIT_BURST/ 1);
				8486	if (__ratelimit(&_rs))
				8487	WARN(1, KERN_DEBUG
				8488	"BTRFS: block rsv returned %d\n", ret);
				8489	}
				8490	try_reserve:
				8491	ret = reserve_metadata_bytes(root, block_rsv, blocksize,
				8492	BTRFS_RESERVE_NO_FLUSH);
				8493	if (!ret)
				8494	return block_rsv;
				8495	/*
				8496	* If we couldn't reserve metadata bytes try and use some from
				8497	* the global reserve if its space type is the same as the global
				8498	* reservation.
				8499	*/
				8500	if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
				8501	block_rsv->space_info == global_rsv->space_info) {
				8502	ret = block_rsv_use_bytes(global_rsv, blocksize);
				8503	if (!ret)
				8504	return global_rsv;
				8505	}
				8506	return ERR_PTR(ret);
				8507	}
				8508
				8509	static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
				8510	struct btrfs_block_rsv *block_rsv, u32 blocksize)
				8511	{
				8512	block_rsv_add_bytes(block_rsv, blocksize, 0);
				8513	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
				8514	}
				8515
				8516	/*
				8517	* finds a free extent and does all the dirty work required for allocation
				8518	* returns the tree buffer or an ERR_PTR on error.
				8519	*/
				8520	struct extent_buffer btrfs_alloc_tree_block(struct btrfs_trans_handle trans,
				8521	struct btrfs_root *root,
				8522	u64 parent, u64 root_objectid,
				8523	const struct btrfs_disk_key *key,
				8524	int level, u64 hint,
				8525	u64 empty_size)
				8526	{
				8527	struct btrfs_fs_info *fs_info = root->fs_info;
				8528	struct btrfs_key ins;
				8529	struct btrfs_block_rsv *block_rsv;
				8530	struct extent_buffer *buf;
				8531	struct btrfs_delayed_extent_op *extent_op;
				8532	u64 flags = 0;
				8533	int ret;
				8534	u32 blocksize = fs_info->nodesize;
				8535	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
				8536
				8537	#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
				8538	if (btrfs_is_testing(fs_info)) {
				8539	buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
				8540	level);
				8541	if (!IS_ERR(buf))
				8542	root->alloc_bytenr += blocksize;
				8543	return buf;
				8544	}
				8545	#endif
				8546
				8547	block_rsv = use_block_rsv(trans, root, blocksize);
				8548	if (IS_ERR(block_rsv))
				8549	return ERR_CAST(block_rsv);
				8550
				8551	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
				8552	empty_size, hint, &ins, 0, 0);
				8553	if (ret)
				8554	goto out_unuse;
				8555
				8556	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
				8557	if (IS_ERR(buf)) {
				8558	ret = PTR_ERR(buf);
				8559	goto out_free_reserved;
				8560	}
				8561
				8562	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
				8563	if (parent == 0)
				8564	parent = ins.objectid;
				8565	flags \|= BTRFS_BLOCK_FLAG_FULL_BACKREF;
				8566	} else
				8567	BUG_ON(parent > 0);
				8568
				8569	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
				8570	extent_op = btrfs_alloc_delayed_extent_op();
				8571	if (!extent_op) {
				8572	ret = -ENOMEM;
				8573	goto out_free_buf;
				8574	}
				8575	if (key)
				8576	memcpy(&extent_op->key, key, sizeof(extent_op->key));
				8577	else
				8578	memset(&extent_op->key, 0, sizeof(extent_op->key));
				8579	extent_op->flags_to_set = flags;
				8580	extent_op->update_key = skinny_metadata ? false : true;
				8581	extent_op->update_flags = true;
				8582	extent_op->is_data = false;
				8583	extent_op->level = level;
				8584
				8585	ret = btrfs_add_delayed_tree_ref(fs_info, trans, ins.objectid,
				8586	ins.offset, parent,
				8587	root_objectid, level,
				8588	BTRFS_ADD_DELAYED_EXTENT,
				8589	extent_op, NULL, NULL);
				8590	if (ret)
				8591	goto out_free_delayed;
				8592	}
				8593	return buf;
				8594
				8595	out_free_delayed:
				8596	btrfs_free_delayed_extent_op(extent_op);
				8597	out_free_buf:
				8598	free_extent_buffer(buf);
				8599	out_free_reserved:
				8600	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
				8601	out_unuse:
				8602	unuse_block_rsv(fs_info, block_rsv, blocksize);
				8603	return ERR_PTR(ret);
				8604	}
				8605
					/*
					 * State carried while walking a tree to drop it.
					 *
					 * refs/flags:      per-level reference count and extent flags, filled
					 *                  in by btrfs_lookup_extent_info() and zeroed once a
					 *                  level has been processed
					 * update_progress: key compared against node keys to decide whether a
					 *                  shared subtree still needs its backrefs updated
					 * stage:           DROP_REFERENCE or UPDATE_BACKREF
					 * level:           level currently being processed
					 * shared_level:    level of the shared block that switched the walk to
					 *                  UPDATE_BACKREF, or -1 when not in that stage
					 * update_ref:      non-zero when backrefs of shared subtrees are to be
					 *                  updated
					 * keep_locks:      when set, walk_down_proc() keeps the block locked in
					 *                  the DROP_REFERENCE stage
					 * reada_slot:      slot at which the last readahead pass stopped
					 * reada_count:     number of blocks the next readahead pass may issue
					 * for_reloc:       copied from the caller of the walk
					 */
				8606	struct walk_control {
				8607	u64 refs[BTRFS_MAX_LEVEL];
				8608	u64 flags[BTRFS_MAX_LEVEL];
				8609	struct btrfs_key update_progress;
				8610	int stage;
				8611	int level;
				8612	int shared_level;
				8613	int update_ref;
				8614	int keep_locks;
				8615	int reada_slot;
				8616	int reada_count;
				8617	int for_reloc;
				8618	};
				8619
					/* Values for walk_control::stage. */
				8620	#define DROP_REFERENCE 1
				8621	#define UPDATE_BACKREF 2
				8622
				8623	static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
				8624	struct btrfs_root *root,
				8625	struct walk_control *wc,
				8626	struct btrfs_path *path)
				8627	{
				8628	struct btrfs_fs_info *fs_info = root->fs_info;
				8629	u64 bytenr;
				8630	u64 generation;
				8631	u64 refs;
				8632	u64 flags;
				8633	u32 nritems;
				8634	struct btrfs_key key;
				8635	struct extent_buffer *eb;
				8636	int ret;
				8637	int slot;
				8638	int nread = 0;
				8639
				8640	if (path->slots[wc->level] < wc->reada_slot) {
				8641	wc->reada_count = wc->reada_count * 2 / 3;
				8642	wc->reada_count = max(wc->reada_count, 2);
				8643	} else {
				8644	wc->reada_count = wc->reada_count * 3 / 2;
				8645	wc->reada_count = min_t(int, wc->reada_count,
				8646	BTRFS_NODEPTRS_PER_BLOCK(fs_info));
				8647	}
				8648
				8649	eb = path->nodes[wc->level];
				8650	nritems = btrfs_header_nritems(eb);
				8651
				8652	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
				8653	if (nread >= wc->reada_count)
				8654	break;
				8655
				8656	cond_resched();
				8657	bytenr = btrfs_node_blockptr(eb, slot);
				8658	generation = btrfs_node_ptr_generation(eb, slot);
				8659
				8660	if (slot == path->slots[wc->level])
				8661	goto reada;
				8662
				8663	if (wc->stage == UPDATE_BACKREF &&
				8664	generation <= root->root_key.offset)
				8665	continue;
				8666
				8667	/* We don't lock the tree block, it's OK to be racy here */
				8668	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
				8669	wc->level - 1, 1, &refs,
				8670	&flags);
				8671	/* We don't care about errors in readahead. */
				8672	if (ret < 0)
				8673	continue;
				8674	BUG_ON(refs == 0);
				8675
				8676	if (wc->stage == DROP_REFERENCE) {
				8677	if (refs == 1)
				8678	goto reada;
				8679
				8680	if (wc->level == 1 &&
				8681	(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				8682	continue;
				8683	if (!wc->update_ref \|\|
				8684	generation <= root->root_key.offset)
				8685	continue;
				8686	btrfs_node_key_to_cpu(eb, &key, slot);
				8687	ret = btrfs_comp_cpu_keys(&key,
				8688	&wc->update_progress);
				8689	if (ret < 0)
				8690	continue;
				8691	} else {
				8692	if (wc->level == 1 &&
				8693	(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				8694	continue;
				8695	}
				8696	reada:
				8697	readahead_tree_block(fs_info, bytenr);
				8698	nread++;
				8699	}
				8700	wc->reada_slot = slot;
				8701	}
				8702
				8703	/*
				8704	* helper to process tree block while walking down the tree.
				8705	*
				8706	* when wc->stage == UPDATE_BACKREF, this function updates
				8707	* back refs for pointers in the block.
				8708	*
				8709	* NOTE: return value 1 means we should stop walking down.
				8710	*/
				8711	static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
				8712	struct btrfs_root *root,
				8713	struct btrfs_path *path,
				8714	struct walk_control *wc, int lookup_info)
				8715	{
				8716	struct btrfs_fs_info *fs_info = root->fs_info;
				8717	int level = wc->level;
				8718	struct extent_buffer *eb = path->nodes[level];
				8719	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
				8720	int ret;
				8721
				8722	if (wc->stage == UPDATE_BACKREF &&
				8723	btrfs_header_owner(eb) != root->root_key.objectid)
				8724	return 1;
				8725
				8726	/*
				8727	* when reference count of tree block is 1, it won't increase
				8728	* again. once full backref flag is set, we never clear it.
				8729	*/
				8730	if (lookup_info &&
				8731	((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) \|\|
				8732	(wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
				8733	BUG_ON(!path->locks[level]);
				8734	ret = btrfs_lookup_extent_info(trans, fs_info,
				8735	eb->start, level, 1,
				8736	&wc->refs[level],
				8737	&wc->flags[level]);
				8738	BUG_ON(ret == -ENOMEM);
				8739	if (ret)
				8740	return ret;
				8741	BUG_ON(wc->refs[level] == 0);
				8742	}
				8743
				8744	if (wc->stage == DROP_REFERENCE) {
				8745	if (wc->refs[level] > 1)
				8746	return 1;
				8747
				8748	if (path->locks[level] && !wc->keep_locks) {
				8749	btrfs_tree_unlock_rw(eb, path->locks[level]);
				8750	path->locks[level] = 0;
				8751	}
				8752	return 0;
				8753	}
				8754
				8755	/* wc->stage == UPDATE_BACKREF */
				8756	if (!(wc->flags[level] & flag)) {
				8757	BUG_ON(!path->locks[level]);
				8758	ret = btrfs_inc_ref(trans, root, eb, 1);
				8759	BUG_ON(ret); /* -ENOMEM */
				8760	ret = btrfs_dec_ref(trans, root, eb, 0);
				8761	BUG_ON(ret); /* -ENOMEM */
				8762	ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start,
				8763	eb->len, flag,
				8764	btrfs_header_level(eb), 0);
				8765	BUG_ON(ret); /* -ENOMEM */
				8766	wc->flags[level] \|= flag;
				8767	}
				8768
				8769	/*
				8770	* the block is shared by multiple trees, so it's not good to
				8771	* keep the tree lock
				8772	*/
				8773	if (path->locks[level] && level > 0) {
				8774	btrfs_tree_unlock_rw(eb, path->locks[level]);
				8775	path->locks[level] = 0;
				8776	}
				8777	return 0;
				8778	}
				8779
				8780	/*
				8781	* helper to process tree block pointer.
				8782	*
				8783	* when wc->stage == DROP_REFERENCE, this function checks
				8784	* reference count of the block pointed to. if the block
				8785	* is shared and we need update back refs for the subtree
				8786	* rooted at the block, this function changes wc->stage to
				8787	* UPDATE_BACKREF. if the block is shared and there is no
				8788	* need to update back, this function drops the reference
				8789	* to the block.
				8790	*
				8791	* NOTE: return value 1 means we should stop walking down.
				8792	*/
				8793	static noinline int do_walk_down(struct btrfs_trans_handle *trans,
				8794	struct btrfs_root *root,
				8795	struct btrfs_path *path,
				8796	struct walk_control wc, int lookup_info)
				8797	{
				8798	struct btrfs_fs_info *fs_info = root->fs_info;
				8799	u64 bytenr;
				8800	u64 generation;
				8801	u64 parent;
				8802	u32 blocksize;
				8803	struct btrfs_key key;
				8804	struct extent_buffer *next;
				8805	int level = wc->level;
				8806	int reada = 0;
				8807	int ret = 0;
				8808	bool need_account = false;
				8809
				8810	generation = btrfs_node_ptr_generation(path->nodes[level],
				8811	path->slots[level]);
				8812	/*
				8813	* if the lower level block was created before the snapshot
				8814	* was created, we know there is no need to update back refs
				8815	* for the subtree
				8816	*/
				8817	if (wc->stage == UPDATE_BACKREF &&
				8818	generation <= root->root_key.offset) {
				8819	*lookup_info = 1;
				8820	return 1;
				8821	}
				8822
				8823	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
				8824	blocksize = fs_info->nodesize;
				8825
				8826	next = find_extent_buffer(fs_info, bytenr);
				8827	if (!next) {
				8828	next = btrfs_find_create_tree_block(fs_info, bytenr);
				8829	if (IS_ERR(next))
				8830	return PTR_ERR(next);
				8831
				8832	btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
				8833	level - 1);
				8834	reada = 1;
				8835	}
				8836	btrfs_tree_lock(next);
				8837	btrfs_set_lock_blocking(next);
				8838
				8839	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
				8840	&wc->refs[level - 1],
				8841	&wc->flags[level - 1]);
				8842	if (ret < 0)
				8843	goto out_unlock;
				8844
				8845	if (unlikely(wc->refs[level - 1] == 0)) {
				8846	btrfs_err(fs_info, "Missing references.");
				8847	ret = -EIO;
				8848	goto out_unlock;
				8849	}
				8850	*lookup_info = 0;
				8851
				8852	if (wc->stage == DROP_REFERENCE) {
				8853	if (wc->refs[level - 1] > 1) {
				8854	need_account = true;
				8855	if (level == 1 &&
				8856	(wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				8857	goto skip;
				8858
				8859	if (!wc->update_ref \|\|
				8860	generation <= root->root_key.offset)
				8861	goto skip;
				8862
				8863	btrfs_node_key_to_cpu(path->nodes[level], &key,
				8864	path->slots[level]);
				8865	ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
				8866	if (ret < 0)
				8867	goto skip;
				8868
				8869	wc->stage = UPDATE_BACKREF;
				8870	wc->shared_level = level - 1;
				8871	}
				8872	} else {
				8873	if (level == 1 &&
				8874	(wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				8875	goto skip;
				8876	}
				8877
				8878	if (!btrfs_buffer_uptodate(next, generation, 0)) {
				8879	btrfs_tree_unlock(next);
				8880	free_extent_buffer(next);
				8881	next = NULL;
				8882	*lookup_info = 1;
				8883	}
				8884
				8885	if (!next) {
				8886	if (reada && level == 1)
				8887	reada_walk_down(trans, root, wc, path);
				8888	next = read_tree_block(fs_info, bytenr, generation);
				8889	if (IS_ERR(next)) {
				8890	return PTR_ERR(next);
				8891	} else if (!extent_buffer_uptodate(next)) {
				8892	free_extent_buffer(next);
				8893	return -EIO;
				8894	}
				8895	btrfs_tree_lock(next);
				8896	btrfs_set_lock_blocking(next);
				8897	}
				8898
				8899	level--;
				8900	ASSERT(level == btrfs_header_level(next));
				8901	if (level != btrfs_header_level(next)) {
				8902	btrfs_err(root->fs_info, "mismatched level");
				8903	ret = -EIO;
				8904	goto out_unlock;
				8905	}
				8906	path->nodes[level] = next;
				8907	path->slots[level] = 0;
				8908	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
				8909	wc->level = level;
				8910	if (wc->level == 1)
				8911	wc->reada_slot = 0;
				8912	return 0;
				8913	skip:
				8914	wc->refs[level - 1] = 0;
				8915	wc->flags[level - 1] = 0;
				8916	if (wc->stage == DROP_REFERENCE) {
				8917	if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
				8918	parent = path->nodes[level]->start;
				8919	} else {
				8920	ASSERT(root->root_key.objectid ==
				8921	btrfs_header_owner(path->nodes[level]));
				8922	if (root->root_key.objectid !=
				8923	btrfs_header_owner(path->nodes[level])) {
				8924	btrfs_err(root->fs_info,
				8925	"mismatched block owner");
				8926	ret = -EIO;
				8927	goto out_unlock;
				8928	}
				8929	parent = 0;
				8930	}
				8931
				8932	if (need_account) {
				8933	ret = btrfs_qgroup_trace_subtree(trans, root, next,
				8934	generation, level - 1);
				8935	if (ret) {
				8936	btrfs_err_rl(fs_info,
				8937	"Error %d accounting shared subtree. Quota is out of sync, rescan required.",
				8938	ret);
				8939	}
				8940	}
				8941	ret = btrfs_free_extent(trans, fs_info, bytenr, blocksize,
				8942	parent, root->root_key.objectid,
				8943	level - 1, 0);
				8944	if (ret)
				8945	goto out_unlock;
				8946	}
				8947
				8948	*lookup_info = 1;
				8949	ret = 1;
				8950
				8951	out_unlock:
				8952	btrfs_tree_unlock(next);
				8953	free_extent_buffer(next);
				8954
				8955	return ret;
				8956	}
				8957
				8958	/*
				8959	* helper to process tree block while walking up the tree.
				8960	*
				8961	* when wc->stage == DROP_REFERENCE, this function drops
				8962	* reference count on the block.
				8963	*
				8964	* when wc->stage == UPDATE_BACKREF, this function changes
				8965	* wc->stage back to DROP_REFERENCE if we changed wc->stage
				8966	* to UPDATE_BACKREF previously while processing the block.
				8967	*
				8968	* NOTE: return value 1 means we should stop walking up.
					*
					* Returns 0 when the block at wc->level has been handled, 1 when the
					* block turned out to have a single reference after re-locking (the
					* caller should walk down again), a negative error from
					* btrfs_lookup_extent_info(), or -EUCLEAN on a tree owner mismatch.
				8969	*/
				8970	static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
				8971	struct btrfs_root *root,
				8972	struct btrfs_path *path,
				8973	struct walk_control *wc)
				8974	{
				8975	struct btrfs_fs_info *fs_info = root->fs_info;
				8976	int ret;
				8977	int level = wc->level;
				8978	struct extent_buffer *eb = path->nodes[level];
				8979	u64 parent = 0;
				8980
				8981	if (wc->stage == UPDATE_BACKREF) {
				8982	BUG_ON(wc->shared_level < level);
					/* Still below the shared block: nothing to do but reset this level. */
				8983	if (level < wc->shared_level)
				8984	goto out;
				8985
					/* Back at the shared block: record progress and resume dropping. */
				8986	ret = find_next_key(path, level + 1, &wc->update_progress);
				8987	if (ret > 0)
				8988	wc->update_ref = 0;
				8989
				8990	wc->stage = DROP_REFERENCE;
				8991	wc->shared_level = -1;
				8992	path->slots[level] = 0;
				8993
				8994	/*
				8995	* check reference count again if the block isn't locked.
				8996	* we should start walking down the tree again if reference
				8997	* count is one.
				8998	*/
				8999	if (!path->locks[level]) {
				9000	BUG_ON(level == 0);
				9001	btrfs_tree_lock(eb);
				9002	btrfs_set_lock_blocking(eb);
				9003	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
				9004
				9005	ret = btrfs_lookup_extent_info(trans, fs_info,
				9006	eb->start, level, 1,
				9007	&wc->refs[level],
				9008	&wc->flags[level]);
				9009	if (ret < 0) {
				9010	btrfs_tree_unlock_rw(eb, path->locks[level]);
				9011	path->locks[level] = 0;
				9012	return ret;
				9013	}
				9014	BUG_ON(wc->refs[level] == 0);
				9015	if (wc->refs[level] == 1) {
				9016	btrfs_tree_unlock_rw(eb, path->locks[level]);
				9017	path->locks[level] = 0;
				9018	return 1;
				9019	}
				9020	}
				9021	}
				9022
				9023	/* wc->stage == DROP_REFERENCE */
				9024	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
				9025
				9026	if (wc->refs[level] == 1) {
				9027	if (level == 0) {
					/* Last reference to a leaf: drop the refs its items hold. */
				9028	if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
				9029	ret = btrfs_dec_ref(trans, root, eb, 1);
				9030	else
				9031	ret = btrfs_dec_ref(trans, root, eb, 0);
				9032	BUG_ON(ret); /* -ENOMEM */
				9033	ret = btrfs_qgroup_trace_leaf_items(trans, fs_info, eb);
					/* A qgroup tracing failure is only logged; the drop continues. */
				9034	if (ret) {
				9035	btrfs_err_rl(fs_info,
				9036	"error %d accounting leaf items. Quota is out of sync, rescan required.",
				9037	ret);
				9038	}
				9039	}
				9040	/* make block locked assertion in clean_tree_block happy */
				9041	if (!path->locks[level] &&
				9042	btrfs_header_generation(eb) == trans->transid) {
				9043	btrfs_tree_lock(eb);
				9044	btrfs_set_lock_blocking(eb);
				9045	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
				9046	}
				9047	clean_tree_block(fs_info, eb);
				9048	}
				9049
					/*
					 * Pick the parent bytenr passed to btrfs_free_tree_block(): the block
					 * holding the pointer when that block has FULL_BACKREF set (eb itself
					 * for the root node), otherwise 0 after checking the owner matches.
					 */
				9050	if (eb == root->node) {
				9051	if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
				9052	parent = eb->start;
				9053	else if (root->root_key.objectid != btrfs_header_owner(eb))
				9054	goto owner_mismatch;
				9055	} else {
				9056	if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
				9057	parent = path->nodes[level + 1]->start;
				9058	else if (root->root_key.objectid !=
				9059	btrfs_header_owner(path->nodes[level + 1]))
				9060	goto owner_mismatch;
				9061	}
				9062
				9063	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
				9064	out:
					/* This level is done; clear its cached refs/flags. */
				9065	wc->refs[level] = 0;
				9066	wc->flags[level] = 0;
				9067	return 0;
				9068
				9069	owner_mismatch:
				9070	btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
				9071	btrfs_header_owner(eb), root->root_key.objectid);
				9072	return -EUCLEAN;
				9073	}
				9074
				9075	static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
				9076	struct btrfs_root *root,
				9077	struct btrfs_path *path,
				9078	struct walk_control *wc)
				9079	{
				9080	int level = wc->level;
				9081	int lookup_info = 1;
				9082	int ret;
				9083
				9084	while (level >= 0) {
				9085	ret = walk_down_proc(trans, root, path, wc, lookup_info);
				9086	if (ret > 0)
				9087	break;
				9088
				9089	if (level == 0)
				9090	break;
				9091
				9092	if (path->slots[level] >=
				9093	btrfs_header_nritems(path->nodes[level]))
				9094	break;
				9095
				9096	ret = do_walk_down(trans, root, path, wc, &lookup_info);
				9097	if (ret > 0) {
				9098	path->slots[level]++;
				9099	continue;
				9100	} else if (ret < 0)
				9101	return ret;
				9102	level = wc->level;
				9103	}
				9104	return 0;
				9105	}
				9106
				9107	static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
				9108	struct btrfs_root *root,
				9109	struct btrfs_path *path,
				9110	struct walk_control *wc, int max_level)
				9111	{
				9112	int level = wc->level;
				9113	int ret;
				9114
				9115	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
				9116	while (level < max_level && path->nodes[level]) {
				9117	wc->level = level;
				9118	if (path->slots[level] + 1 <
				9119	btrfs_header_nritems(path->nodes[level])) {
				9120	path->slots[level]++;
				9121	return 0;
				9122	} else {
				9123	ret = walk_up_proc(trans, root, path, wc);
				9124	if (ret > 0)
				9125	return 0;
				9126	if (ret < 0)
				9127	return ret;
				9128
				9129	if (path->locks[level]) {
				9130	btrfs_tree_unlock_rw(path->nodes[level],
				9131	path->locks[level]);
				9132	path->locks[level] = 0;
				9133	}
				9134	free_extent_buffer(path->nodes[level]);
				9135	path->nodes[level] = NULL;
				9136	level++;
				9137	}
				9138	}
				9139	return 1;
				9140	}
				9141
				9142	/*
				9143	* drop a subvolume tree.
				9144	*
				9145	* this function traverses the tree, freeing any blocks that are only
				9146	* referenced by the tree.
				9147	*
				9148	* when a shared tree block is found, this function decreases its
				9149	* reference count by one. if update_ref is true, this function
				9150	* also makes sure backrefs for the shared block and all lower level
				9151	* blocks are properly updated.
				9152	*
				9153	* If called with for_reloc == 0, may exit early with -EAGAIN
				9154	*/
				9155	int btrfs_drop_snapshot(struct btrfs_root *root,
				9156	struct btrfs_block_rsv *block_rsv, int update_ref,
				9157	int for_reloc)
				9158	{
				9159	struct btrfs_fs_info *fs_info = root->fs_info;
				9160	struct btrfs_path *path;
				9161	struct btrfs_trans_handle *trans;
				9162	struct btrfs_root *tree_root = fs_info->tree_root;
				9163	struct btrfs_root_item *root_item = &root->root_item;
				9164	struct walk_control *wc;
				9165	struct btrfs_key key;
				9166	int err = 0;
				9167	int ret;
				9168	int level;
				9169	bool root_dropped = false;
				9170
				9171	btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid);
				9172
				9173	path = btrfs_alloc_path();
				9174	if (!path) {
				9175	err = -ENOMEM;
				9176	goto out;
				9177	}
				9178
				9179	wc = kzalloc(sizeof(*wc), GFP_NOFS);
				9180	if (!wc) {
				9181	btrfs_free_path(path);
				9182	err = -ENOMEM;
				9183	goto out;
				9184	}
				9185
				9186	trans = btrfs_start_transaction(tree_root, 0);
				9187	if (IS_ERR(trans)) {
				9188	err = PTR_ERR(trans);
				9189	goto out_free;
				9190	}
				9191
				9192	if (block_rsv)
				9193	trans->block_rsv = block_rsv;
				9194
				9195	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
				9196	level = btrfs_header_level(root->node);
				9197	path->nodes[level] = btrfs_lock_root_node(root);
				9198	btrfs_set_lock_blocking(path->nodes[level]);
				9199	path->slots[level] = 0;
				9200	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
				9201	memset(&wc->update_progress, 0,
				9202	sizeof(wc->update_progress));
				9203	} else {
				9204	btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
				9205	memcpy(&wc->update_progress, &key,
				9206	sizeof(wc->update_progress));
				9207
				9208	level = root_item->drop_level;
				9209	BUG_ON(level == 0);
				9210	path->lowest_level = level;
				9211	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				9212	path->lowest_level = 0;
				9213	if (ret < 0) {
				9214	err = ret;
				9215	goto out_end_trans;
				9216	}
				9217	WARN_ON(ret > 0);
				9218
				9219	/*
				9220	* unlock our path, this is safe because only this
				9221	* function is allowed to delete this snapshot
				9222	*/
				9223	btrfs_unlock_up_safe(path, 0);
				9224
				9225	level = btrfs_header_level(root->node);
				9226	while (1) {
				9227	btrfs_tree_lock(path->nodes[level]);
				9228	btrfs_set_lock_blocking(path->nodes[level]);
				9229	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
				9230
				9231	ret = btrfs_lookup_extent_info(trans, fs_info,
				9232	path->nodes[level]->start,
				9233	level, 1, &wc->refs[level],
				9234	&wc->flags[level]);
				9235	if (ret < 0) {
				9236	err = ret;
				9237	goto out_end_trans;
				9238	}
				9239	BUG_ON(wc->refs[level] == 0);
				9240
				9241	if (level == root_item->drop_level)
				9242	break;
				9243
				9244	btrfs_tree_unlock(path->nodes[level]);
				9245	path->locks[level] = 0;
				9246	WARN_ON(wc->refs[level] != 1);
				9247	level--;
				9248	}
				9249	}
				9250
				9251	wc->level = level;
				9252	wc->shared_level = -1;
				9253	wc->stage = DROP_REFERENCE;
				9254	wc->update_ref = update_ref;
				9255	wc->keep_locks = 0;
				9256	wc->for_reloc = for_reloc;
				9257	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
				9258
				9259	while (1) {
				9260
				9261	ret = walk_down_tree(trans, root, path, wc);
				9262	if (ret < 0) {
				9263	err = ret;
				9264	break;
				9265	}
				9266
				9267	ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
				9268	if (ret < 0) {
				9269	err = ret;
				9270	break;
				9271	}
				9272
				9273	if (ret > 0) {
				9274	BUG_ON(wc->stage != DROP_REFERENCE);
				9275	break;
				9276	}
				9277
				9278	if (wc->stage == DROP_REFERENCE) {
				9279	level = wc->level;
				9280	btrfs_node_key(path->nodes[level],
				9281	&root_item->drop_progress,
				9282	path->slots[level]);
				9283	root_item->drop_level = level;
				9284	}
				9285
				9286	BUG_ON(wc->level == 0);
				9287	if (btrfs_should_end_transaction(trans) \|\|
				9288	(!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
				9289	ret = btrfs_update_root(trans, tree_root,
				9290	&root->root_key,
				9291	root_item);
				9292	if (ret) {
				9293	btrfs_abort_transaction(trans, ret);
				9294	err = ret;
				9295	goto out_end_trans;
				9296	}
				9297
				9298	btrfs_end_transaction_throttle(trans);
				9299	if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
				9300	btrfs_debug(fs_info,
				9301	"drop snapshot early exit");
				9302	err = -EAGAIN;
				9303	goto out_free;
				9304	}
				9305
				9306	trans = btrfs_start_transaction(tree_root, 0);
				9307	if (IS_ERR(trans)) {
				9308	err = PTR_ERR(trans);
				9309	goto out_free;
				9310	}
				9311	if (block_rsv)
				9312	trans->block_rsv = block_rsv;
				9313	}
				9314	}
				9315	btrfs_release_path(path);
				9316	if (err)
				9317	goto out_end_trans;
				9318
				9319	ret = btrfs_del_root(trans, fs_info, &root->root_key);
				9320	if (ret) {
				9321	btrfs_abort_transaction(trans, ret);
				9322	err = ret;
				9323	goto out_end_trans;
				9324	}
				9325
				9326	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
				9327	ret = btrfs_find_root(tree_root, &root->root_key, path,
				9328	NULL, NULL);
				9329	if (ret < 0) {
				9330	btrfs_abort_transaction(trans, ret);
				9331	err = ret;
				9332	goto out_end_trans;
				9333	} else if (ret > 0) {
				9334	/* if we fail to delete the orphan item this time
				9335	* around, it'll get picked up the next time.
				9336	*
				9337	* The most common failure here is just -ENOENT.
				9338	*/
				9339	btrfs_del_orphan_item(trans, tree_root,
				9340	root->root_key.objectid);
				9341	}
				9342	}
				9343
				9344	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
				9345	btrfs_add_dropped_root(trans, root);
				9346	} else {
				9347	free_extent_buffer(root->node);
				9348	free_extent_buffer(root->commit_root);
				9349	btrfs_put_fs_root(root);
				9350	}
				9351	root_dropped = true;
				9352	out_end_trans:
				9353	btrfs_end_transaction_throttle(trans);
				9354	out_free:
				9355	kfree(wc);
				9356	btrfs_free_path(path);
				9357	out:
				9358	/*
				9359	* So if we need to stop dropping the snapshot for whatever reason we
				9360	* need to make sure to add it back to the dead root list so that we
				9361	* keep trying to do the work later. This also cleans up roots if we
				9362	* don't have it in the radix (like when we recover after a power fail
				9363	* or unmount) so we don't leak memory.
				9364	*/
				9365	if (!for_reloc && root_dropped == false)
				9366	btrfs_add_dead_root(root);
				9367	return err;
				9368	}
				9369
				9370	/*
				9371	* drop subtree rooted at tree block 'node'.
				9372	*
				9373	* NOTE: this function will unlock and release tree block 'node'
				9374	* only used by relocation code
				9375	*/
				9376	int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
				9377	struct btrfs_root *root,
				9378	struct extent_buffer *node,
				9379	struct extent_buffer *parent)
				9380	{
				9381	struct btrfs_fs_info *fs_info = root->fs_info;
				9382	struct btrfs_path *path;
				9383	struct walk_control *wc;
				9384	int level;
				9385	int parent_level;
				9386	int ret = 0;
				9387	int wret;
				9388
				9389	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
				9390
				9391	path = btrfs_alloc_path();
				9392	if (!path)
				9393	return -ENOMEM;
				9394
				9395	wc = kzalloc(sizeof(*wc), GFP_NOFS);
				9396	if (!wc) {
				9397	btrfs_free_path(path);
				9398	return -ENOMEM;
				9399	}
				9400
				9401	btrfs_assert_tree_locked(parent);
				9402	parent_level = btrfs_header_level(parent);
				9403	extent_buffer_get(parent);
				9404	path->nodes[parent_level] = parent;
				9405	path->slots[parent_level] = btrfs_header_nritems(parent);
				9406
				9407	btrfs_assert_tree_locked(node);
				9408	level = btrfs_header_level(node);
				9409	path->nodes[level] = node;
				9410	path->slots[level] = 0;
				9411	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
				9412
				9413	wc->refs[parent_level] = 1;
				9414	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
				9415	wc->level = level;
				9416	wc->shared_level = -1;
				9417	wc->stage = DROP_REFERENCE;
				9418	wc->update_ref = 0;
				9419	wc->keep_locks = 1;
				9420	wc->for_reloc = 1;
				9421	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
				9422
				9423	while (1) {
				9424	wret = walk_down_tree(trans, root, path, wc);
				9425	if (wret < 0) {
				9426	ret = wret;
				9427	break;
				9428	}
				9429
				9430	wret = walk_up_tree(trans, root, path, wc, parent_level);
				9431	if (wret < 0)
				9432	ret = wret;
				9433	if (wret != 0)
				9434	break;
				9435	}
				9436
				9437	kfree(wc);
				9438	btrfs_free_path(path);
				9439	return ret;
				9440	}
				9441
				9442	static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
				9443	{
				9444	u64 num_devices;
				9445	u64 stripped;
				9446
				9447	/*
				9448	* if restripe for this chunk_type is on pick target profile and
				9449	* return, otherwise do the usual balance
				9450	*/
				9451	stripped = get_restripe_target(fs_info, flags);
				9452	if (stripped)
				9453	return extended_to_chunk(stripped);
				9454
				9455	num_devices = fs_info->fs_devices->rw_devices;
				9456
				9457	stripped = BTRFS_BLOCK_GROUP_RAID0 \|
				9458	BTRFS_BLOCK_GROUP_RAID5 \| BTRFS_BLOCK_GROUP_RAID6 \|
				9459	BTRFS_BLOCK_GROUP_RAID1 \| BTRFS_BLOCK_GROUP_RAID10;
				9460
				9461	if (num_devices == 1) {
				9462	stripped \|= BTRFS_BLOCK_GROUP_DUP;
				9463	stripped = flags & ~stripped;
				9464
				9465	/* turn raid0 into single device chunks */
				9466	if (flags & BTRFS_BLOCK_GROUP_RAID0)
				9467	return stripped;
				9468
				9469	/* turn mirroring into duplication */
				9470	if (flags & (BTRFS_BLOCK_GROUP_RAID1 \|
				9471	BTRFS_BLOCK_GROUP_RAID10))
				9472	return stripped \| BTRFS_BLOCK_GROUP_DUP;
				9473	} else {
				9474	/* they already had raid on here, just return */
				9475	if (flags & stripped)
				9476	return flags;
				9477
				9478	stripped \|= BTRFS_BLOCK_GROUP_DUP;
				9479	stripped = flags & ~stripped;
				9480
				9481	/* switch duplicated blocks with raid1 */
				9482	if (flags & BTRFS_BLOCK_GROUP_DUP)
				9483	return stripped \| BTRFS_BLOCK_GROUP_RAID1;
				9484
				9485	/* this is drive concat, leave it alone */
				9486	}
				9487
				9488	return flags;
				9489	}
				9490
				9491	static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
				9492	{
				9493	struct btrfs_space_info *sinfo = cache->space_info;
				9494	u64 num_bytes;
				9495	u64 min_allocable_bytes;
				9496	int ret = -ENOSPC;
				9497
				9498	/*
				9499	* We need some metadata space and system metadata space for
				9500	* allocating chunks in some corner cases until we force to set
				9501	* it to be readonly.
				9502	*/
				9503	if ((sinfo->flags &
				9504	(BTRFS_BLOCK_GROUP_SYSTEM \| BTRFS_BLOCK_GROUP_METADATA)) &&
				9505	!force)
				9506	min_allocable_bytes = SZ_1M;
				9507	else
				9508	min_allocable_bytes = 0;
				9509
				9510	spin_lock(&sinfo->lock);
				9511	spin_lock(&cache->lock);
				9512
				9513	if (cache->ro) {
				9514	cache->ro++;
				9515	ret = 0;
				9516	goto out;
				9517	}
				9518
				9519	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
				9520	cache->bytes_super - btrfs_block_group_used(&cache->item);
				9521
				9522	if (btrfs_space_info_used(sinfo, true) + num_bytes +
				9523	min_allocable_bytes <= sinfo->total_bytes) {
				9524	sinfo->bytes_readonly += num_bytes;
				9525	cache->ro++;
				9526	list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
				9527	ret = 0;
				9528	}
				9529	out:
				9530	spin_unlock(&cache->lock);
				9531	spin_unlock(&sinfo->lock);
				9532	return ret;
				9533	}
				9534
				9535	int btrfs_inc_block_group_ro(struct btrfs_fs_info *fs_info,
				9536	struct btrfs_block_group_cache *cache)
				9537
				9538	{
				9539	struct btrfs_trans_handle *trans;
				9540	u64 alloc_flags;
				9541	int ret;
				9542
				9543	again:
				9544	trans = btrfs_join_transaction(fs_info->extent_root);
				9545	if (IS_ERR(trans))
				9546	return PTR_ERR(trans);
				9547
				9548	/*
				9549	* we're not allowed to set block groups readonly after the dirty
				9550	* block groups cache has started writing. If it already started,
				9551	* back off and let this transaction commit
				9552	*/
				9553	mutex_lock(&fs_info->ro_block_group_mutex);
				9554	if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
				9555	u64 transid = trans->transid;
				9556
				9557	mutex_unlock(&fs_info->ro_block_group_mutex);
				9558	btrfs_end_transaction(trans);
				9559
				9560	ret = btrfs_wait_for_commit(fs_info, transid);
				9561	if (ret)
				9562	return ret;
				9563	goto again;
				9564	}
				9565
				9566	/*
				9567	* if we are changing raid levels, try to allocate a corresponding
				9568	* block group with the new raid level.
				9569	*/
				9570	alloc_flags = update_block_group_flags(fs_info, cache->flags);
				9571	if (alloc_flags != cache->flags) {
				9572	ret = do_chunk_alloc(trans, fs_info, alloc_flags,
				9573	CHUNK_ALLOC_FORCE);
				9574	/*
				9575	* ENOSPC is allowed here, we may have enough space
				9576	* already allocated at the new raid level to
				9577	* carry on
				9578	*/
				9579	if (ret == -ENOSPC)
				9580	ret = 0;
				9581	if (ret < 0)
				9582	goto out;
				9583	}
				9584
				9585	ret = inc_block_group_ro(cache, 0);
				9586	if (!ret)
				9587	goto out;
				9588	alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
				9589	ret = do_chunk_alloc(trans, fs_info, alloc_flags,
				9590	CHUNK_ALLOC_FORCE);
				9591	if (ret < 0)
				9592	goto out;
				9593	ret = inc_block_group_ro(cache, 0);
				9594	out:
				9595	if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
				9596	alloc_flags = update_block_group_flags(fs_info, cache->flags);
				9597	mutex_lock(&fs_info->chunk_mutex);
				9598	check_system_chunk(trans, fs_info, alloc_flags);
				9599	mutex_unlock(&fs_info->chunk_mutex);
				9600	}
				9601	mutex_unlock(&fs_info->ro_block_group_mutex);
				9602
				9603	btrfs_end_transaction(trans);
				9604	return ret;
				9605	}
				9606
				9607	int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
				9608	struct btrfs_fs_info *fs_info, u64 type)
				9609	{
				9610	u64 alloc_flags = get_alloc_profile(fs_info, type);
				9611
				9612	return do_chunk_alloc(trans, fs_info, alloc_flags, CHUNK_ALLOC_FORCE);
				9613	}
				9614
				9615	/*
				9616	* helper to account the unused space of all the readonly block group in the
				9617	* space_info. takes mirrors into account.
				9618	*/
				9619	u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
				9620	{
				9621	struct btrfs_block_group_cache *block_group;
				9622	u64 free_bytes = 0;
				9623	int factor;
				9624
				9625	/* It's df, we don't care if it's racy */
				9626	if (list_empty(&sinfo->ro_bgs))
				9627	return 0;
				9628
				9629	spin_lock(&sinfo->lock);
				9630	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
				9631	spin_lock(&block_group->lock);
				9632
				9633	if (!block_group->ro) {
				9634	spin_unlock(&block_group->lock);
				9635	continue;
				9636	}
				9637
				9638	if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 \|
				9639	BTRFS_BLOCK_GROUP_RAID10 \|
				9640	BTRFS_BLOCK_GROUP_DUP))
				9641	factor = 2;
				9642	else
				9643	factor = 1;
				9644
				9645	free_bytes += (block_group->key.offset -
				9646	btrfs_block_group_used(&block_group->item)) *
				9647	factor;
				9648
				9649	spin_unlock(&block_group->lock);
				9650	}
				9651	spin_unlock(&sinfo->lock);
				9652
				9653	return free_bytes;
				9654	}
				9655
				9656	void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
				9657	{
				9658	struct btrfs_space_info *sinfo = cache->space_info;
				9659	u64 num_bytes;
				9660
				9661	BUG_ON(!cache->ro);
				9662
				9663	spin_lock(&sinfo->lock);
				9664	spin_lock(&cache->lock);
				9665	if (!--cache->ro) {
				9666	num_bytes = cache->key.offset - cache->reserved -
				9667	cache->pinned - cache->bytes_super -
				9668	btrfs_block_group_used(&cache->item);
				9669	sinfo->bytes_readonly -= num_bytes;
				9670	list_del_init(&cache->ro_list);
				9671	}
				9672	spin_unlock(&cache->lock);
				9673	spin_unlock(&sinfo->lock);
				9674	}
				9675
				9676	/*
				9677	* checks to see if its even possible to relocate this block group.
				9678	*
				9679	* @return - -1 if it's not a good idea to relocate this block group, 0 if its
				9680	* ok to go ahead and try.
				9681	*/
				9682	int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
				9683	{
				9684	struct btrfs_root *root = fs_info->extent_root;
				9685	struct btrfs_block_group_cache *block_group;
				9686	struct btrfs_space_info *space_info;
				9687	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				9688	struct btrfs_device *device;
				9689	struct btrfs_trans_handle *trans;
				9690	u64 min_free;
				9691	u64 dev_min = 1;
				9692	u64 dev_nr = 0;
				9693	u64 target;
				9694	int debug;
				9695	int index;
				9696	int full = 0;
				9697	int ret = 0;
				9698
				9699	debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
				9700
				9701	block_group = btrfs_lookup_block_group(fs_info, bytenr);
				9702
				9703	/* odd, couldn't find the block group, leave it alone */
				9704	if (!block_group) {
				9705	if (debug)
				9706	btrfs_warn(fs_info,
				9707	"can't find block group for bytenr %llu",
				9708	bytenr);
				9709	return -1;
				9710	}
				9711
				9712	min_free = btrfs_block_group_used(&block_group->item);
				9713
				9714	/* no bytes used, we're good */
				9715	if (!min_free)
				9716	goto out;
				9717
				9718	space_info = block_group->space_info;
				9719	spin_lock(&space_info->lock);
				9720
				9721	full = space_info->full;
				9722
				9723	/*
				9724	* if this is the last block group we have in this space, we can't
				9725	* relocate it unless we're able to allocate a new chunk below.
				9726	*
				9727	* Otherwise, we need to make sure we have room in the space to handle
				9728	* all of the extents from this block group. If we can, we're good
				9729	*/
				9730	if ((space_info->total_bytes != block_group->key.offset) &&
				9731	(btrfs_space_info_used(space_info, false) + min_free <
				9732	space_info->total_bytes)) {
				9733	spin_unlock(&space_info->lock);
				9734	goto out;
				9735	}
				9736	spin_unlock(&space_info->lock);
				9737
				9738	/*
				9739	* ok we don't have enough space, but maybe we have free space on our
				9740	* devices to allocate new chunks for relocation, so loop through our
				9741	* alloc devices and guess if we have enough space. if this block
				9742	* group is going to be restriped, run checks against the target
				9743	* profile instead of the current one.
				9744	*/
				9745	ret = -1;
				9746
				9747	/*
				9748	* index:
				9749	* 0: raid10
				9750	* 1: raid1
				9751	* 2: dup
				9752	* 3: raid0
				9753	* 4: single
				9754	*/
				9755	target = get_restripe_target(fs_info, block_group->flags);
				9756	if (target) {
				9757	index = __get_raid_index(extended_to_chunk(target));
				9758	} else {
				9759	/*
				9760	* this is just a balance, so if we were marked as full
				9761	* we know there is no space for a new chunk
				9762	*/
				9763	if (full) {
				9764	if (debug)
				9765	btrfs_warn(fs_info,
				9766	"no space to alloc new chunk for block group %llu",
				9767	block_group->key.objectid);
				9768	goto out;
				9769	}
				9770
				9771	index = get_block_group_index(block_group);
				9772	}
				9773
				9774	if (index == BTRFS_RAID_RAID10) {
				9775	dev_min = 4;
				9776	/* Divide by 2 */
				9777	min_free >>= 1;
				9778	} else if (index == BTRFS_RAID_RAID1) {
				9779	dev_min = 2;
				9780	} else if (index == BTRFS_RAID_DUP) {
				9781	/* Multiply by 2 */
				9782	min_free <<= 1;
				9783	} else if (index == BTRFS_RAID_RAID0) {
				9784	dev_min = fs_devices->rw_devices;
				9785	min_free = div64_u64(min_free, dev_min);
				9786	}
				9787
				9788	/* We need to do this so that we can look at pending chunks */
				9789	trans = btrfs_join_transaction(root);
				9790	if (IS_ERR(trans)) {
				9791	ret = PTR_ERR(trans);
				9792	goto out;
				9793	}
				9794
				9795	mutex_lock(&fs_info->chunk_mutex);
				9796	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
				9797	u64 dev_offset;
				9798
				9799	/*
				9800	* check to make sure we can actually find a chunk with enough
				9801	* space to fit our block group in.
				9802	*/
				9803	if (device->total_bytes > device->bytes_used + min_free &&
				9804	!device->is_tgtdev_for_dev_replace) {
				9805	ret = find_free_dev_extent(trans, device, min_free,
				9806	&dev_offset, NULL);
				9807	if (!ret)
				9808	dev_nr++;
				9809
				9810	if (dev_nr >= dev_min)
				9811	break;
				9812
				9813	ret = -1;
				9814	}
				9815	}
				9816	if (debug && ret == -1)
				9817	btrfs_warn(fs_info,
				9818	"no space to allocate a new chunk for block group %llu",
				9819	block_group->key.objectid);
				9820	mutex_unlock(&fs_info->chunk_mutex);
				9821	btrfs_end_transaction(trans);
				9822	out:
				9823	btrfs_put_block_group(block_group);
				9824	return ret;
				9825	}
				9826
				9827	static int find_first_block_group(struct btrfs_fs_info *fs_info,
				9828	struct btrfs_path *path,
				9829	struct btrfs_key *key)
				9830	{
				9831	struct btrfs_root *root = fs_info->extent_root;
				9832	int ret = 0;
				9833	struct btrfs_key found_key;
				9834	struct extent_buffer *leaf;
				9835	struct btrfs_block_group_item bg;
				9836	u64 flags;
				9837	int slot;
				9838
				9839	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
				9840	if (ret < 0)
				9841	goto out;
				9842
				9843	while (1) {
				9844	slot = path->slots[0];
				9845	leaf = path->nodes[0];
				9846	if (slot >= btrfs_header_nritems(leaf)) {
				9847	ret = btrfs_next_leaf(root, path);
				9848	if (ret == 0)
				9849	continue;
				9850	if (ret < 0)
				9851	goto out;
				9852	break;
				9853	}
				9854	btrfs_item_key_to_cpu(leaf, &found_key, slot);
				9855
				9856	if (found_key.objectid >= key->objectid &&
				9857	found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
				9858	struct extent_map_tree *em_tree;
				9859	struct extent_map *em;
				9860
				9861	em_tree = &root->fs_info->mapping_tree.map_tree;
				9862	read_lock(&em_tree->lock);
				9863	em = lookup_extent_mapping(em_tree, found_key.objectid,
				9864	found_key.offset);
				9865	read_unlock(&em_tree->lock);
				9866	if (!em) {
				9867	btrfs_err(fs_info,
				9868	"logical %llu len %llu found bg but no related chunk",
				9869	found_key.objectid, found_key.offset);
				9870	ret = -ENOENT;
				9871	} else if (em->start != found_key.objectid \|\|
				9872	em->len != found_key.offset) {
				9873	btrfs_err(fs_info,
				9874	"block group %llu len %llu mismatch with chunk %llu len %llu",
				9875	found_key.objectid, found_key.offset,
				9876	em->start, em->len);
				9877	ret = -EUCLEAN;
				9878	} else {
				9879	read_extent_buffer(leaf, &bg,
				9880	btrfs_item_ptr_offset(leaf, slot),
				9881	sizeof(bg));
				9882	flags = btrfs_block_group_flags(&bg) &
				9883	BTRFS_BLOCK_GROUP_TYPE_MASK;
				9884
				9885	if (flags != (em->map_lookup->type &
				9886	BTRFS_BLOCK_GROUP_TYPE_MASK)) {
				9887	btrfs_err(fs_info,
				9888	"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
				9889	found_key.objectid,
				9890	found_key.offset, flags,
				9891	(BTRFS_BLOCK_GROUP_TYPE_MASK &
				9892	em->map_lookup->type));
				9893	ret = -EUCLEAN;
				9894	} else {
				9895	ret = 0;
				9896	}
				9897	}
				9898	free_extent_map(em);
				9899	goto out;
				9900	}
				9901	path->slots[0]++;
				9902	}
				9903	out:
				9904	return ret;
				9905	}
				9906
				9907	void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
				9908	{
				9909	struct btrfs_block_group_cache *block_group;
				9910	u64 last = 0;
				9911
				9912	while (1) {
				9913	struct inode *inode;
				9914
				9915	block_group = btrfs_lookup_first_block_group(info, last);
				9916	while (block_group) {
				9917	wait_block_group_cache_done(block_group);
				9918	spin_lock(&block_group->lock);
				9919	if (block_group->iref)
				9920	break;
				9921	spin_unlock(&block_group->lock);
				9922	block_group = next_block_group(info, block_group);
				9923	}
				9924	if (!block_group) {
				9925	if (last == 0)
				9926	break;
				9927	last = 0;
				9928	continue;
				9929	}
				9930
				9931	inode = block_group->inode;
				9932	block_group->iref = 0;
				9933	block_group->inode = NULL;
				9934	spin_unlock(&block_group->lock);
				9935	ASSERT(block_group->io_ctl.inode == NULL);
				9936	iput(inode);
				9937	last = block_group->key.objectid + block_group->key.offset;
				9938	btrfs_put_block_group(block_group);
				9939	}
				9940	}
				9941
				9942	/*
				9943	* Must be called only after stopping all workers, since we could have block
				9944	* group caching kthreads running, and therefore they could race with us if we
				9945	* freed the block groups before stopping them.
				9946	*/
				9947	int btrfs_free_block_groups(struct btrfs_fs_info *info)
				9948	{
				9949	struct btrfs_block_group_cache *block_group;
				9950	struct btrfs_space_info *space_info;
				9951	struct btrfs_caching_control *caching_ctl;
				9952	struct rb_node *n;
				9953
				9954	down_write(&info->commit_root_sem);
				9955	while (!list_empty(&info->caching_block_groups)) {
				9956	caching_ctl = list_entry(info->caching_block_groups.next,
				9957	struct btrfs_caching_control, list);
				9958	list_del(&caching_ctl->list);
				9959	put_caching_control(caching_ctl);
				9960	}
				9961	up_write(&info->commit_root_sem);
				9962
				9963	spin_lock(&info->unused_bgs_lock);
				9964	while (!list_empty(&info->unused_bgs)) {
				9965	block_group = list_first_entry(&info->unused_bgs,
				9966	struct btrfs_block_group_cache,
				9967	bg_list);
				9968	list_del_init(&block_group->bg_list);
				9969	btrfs_put_block_group(block_group);
				9970	}
				9971	spin_unlock(&info->unused_bgs_lock);
				9972
				9973	spin_lock(&info->block_group_cache_lock);
				9974	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
				9975	block_group = rb_entry(n, struct btrfs_block_group_cache,
				9976	cache_node);
				9977	rb_erase(&block_group->cache_node,
				9978	&info->block_group_cache_tree);
				9979	RB_CLEAR_NODE(&block_group->cache_node);
				9980	spin_unlock(&info->block_group_cache_lock);
				9981
				9982	down_write(&block_group->space_info->groups_sem);
				9983	list_del(&block_group->list);
				9984	up_write(&block_group->space_info->groups_sem);
				9985
				9986	/*
				9987	* We haven't cached this block group, which means we could
				9988	* possibly have excluded extents on this block group.
				9989	*/
				9990	if (block_group->cached == BTRFS_CACHE_NO \|\|
				9991	block_group->cached == BTRFS_CACHE_ERROR)
				9992	free_excluded_extents(info, block_group);
				9993
				9994	btrfs_remove_free_space_cache(block_group);
				9995	ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
				9996	ASSERT(list_empty(&block_group->dirty_list));
				9997	ASSERT(list_empty(&block_group->io_list));
				9998	ASSERT(list_empty(&block_group->bg_list));
				9999	ASSERT(atomic_read(&block_group->count) == 1);
				10000	btrfs_put_block_group(block_group);
				10001
				10002	spin_lock(&info->block_group_cache_lock);
				10003	}
				10004	spin_unlock(&info->block_group_cache_lock);
				10005
				10006	/* now that all the block groups are freed, go through and
				10007	* free all the space_info structs. This is only called during
				10008	* the final stages of unmount, and so we know nobody is
				10009	* using them. We call synchronize_rcu() once before we start,
				10010	* just to be on the safe side.
				10011	*/
				10012	synchronize_rcu();
				10013
				10014	release_global_block_rsv(info);
				10015
				10016	while (!list_empty(&info->space_info)) {
				10017	int i;
				10018
				10019	space_info = list_entry(info->space_info.next,
				10020	struct btrfs_space_info,
				10021	list);
				10022
				10023	/*
				10024	* Do not hide this behind enospc_debug, this is actually
				10025	* important and indicates a real bug if this happens.
				10026	*/
				10027	if (WARN_ON(space_info->bytes_pinned > 0 \|\|
				10028	space_info->bytes_reserved > 0 \|\|
				10029	space_info->bytes_may_use > 0))
				10030	dump_space_info(info, space_info, 0, 0);
				10031	list_del(&space_info->list);
				10032	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
				10033	struct kobject *kobj;
				10034	kobj = space_info->block_group_kobjs[i];
				10035	space_info->block_group_kobjs[i] = NULL;
				10036	if (kobj) {
				10037	kobject_del(kobj);
				10038	kobject_put(kobj);
				10039	}
				10040	}
				10041	kobject_del(&space_info->kobj);
				10042	kobject_put(&space_info->kobj);
				10043	}
				10044	return 0;
				10045	}
				10046
				10047	static void __link_block_group(struct btrfs_space_info *space_info,
				10048	struct btrfs_block_group_cache *cache)
				10049	{
				10050	int index = get_block_group_index(cache);
				10051	bool first = false;
				10052
				10053	down_write(&space_info->groups_sem);
				10054	if (list_empty(&space_info->block_groups[index]))
				10055	first = true;
				10056	list_add_tail(&cache->list, &space_info->block_groups[index]);
				10057	up_write(&space_info->groups_sem);
				10058
				10059	if (first) {
				10060	struct raid_kobject *rkobj;
				10061	int ret;
				10062
				10063	rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
				10064	if (!rkobj)
				10065	goto out_err;
				10066	rkobj->raid_type = index;
				10067	kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
				10068	ret = kobject_add(&rkobj->kobj, &space_info->kobj,
				10069	"%s", get_raid_name(index));
				10070	if (ret) {
				10071	kobject_put(&rkobj->kobj);
				10072	goto out_err;
				10073	}
				10074	space_info->block_group_kobjs[index] = &rkobj->kobj;
				10075	}
				10076
				10077	return;
				10078	out_err:
				10079	btrfs_warn(cache->fs_info,
				10080	"failed to add kobject for block cache, ignoring");
				10081	}
				10082
				10083	static struct btrfs_block_group_cache *
				10084	btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
				10085	u64 start, u64 size)
				10086	{
				10087	struct btrfs_block_group_cache *cache;
				10088
				10089	cache = kzalloc(sizeof(*cache), GFP_NOFS);
				10090	if (!cache)
				10091	return NULL;
				10092
				10093	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
				10094	GFP_NOFS);
				10095	if (!cache->free_space_ctl) {
				10096	kfree(cache);
				10097	return NULL;
				10098	}
				10099
				10100	cache->key.objectid = start;
				10101	cache->key.offset = size;
				10102	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
				10103
				10104	cache->fs_info = fs_info;
				10105	cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
				10106	set_free_space_tree_thresholds(cache);
				10107
				10108	atomic_set(&cache->count, 1);
				10109	spin_lock_init(&cache->lock);
				10110	init_rwsem(&cache->data_rwsem);
				10111	INIT_LIST_HEAD(&cache->list);
				10112	INIT_LIST_HEAD(&cache->cluster_list);
				10113	INIT_LIST_HEAD(&cache->bg_list);
				10114	INIT_LIST_HEAD(&cache->ro_list);
				10115	INIT_LIST_HEAD(&cache->dirty_list);
				10116	INIT_LIST_HEAD(&cache->io_list);
				10117	btrfs_init_free_space_ctl(cache);
				10118	atomic_set(&cache->trimming, 0);
				10119	mutex_init(&cache->free_space_lock);
				10120	btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
				10121
				10122	return cache;
				10123	}
				10124
				10125
				10126	/*
				10127	* Iterate all chunks and verify that each of them has the corresponding block
				10128	* group
				10129	*/
				10130	static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
				10131	{
				10132	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
				10133	struct extent_map *em;
				10134	struct btrfs_block_group_cache *bg;
				10135	u64 start = 0;
				10136	int ret = 0;
				10137
				10138	while (1) {
				10139	read_lock(&map_tree->map_tree.lock);
				10140	/*
				10141	* lookup_extent_mapping will return the first extent map
				10142	* intersecting the range, so setting @len to 1 is enough to
				10143	* get the first chunk.
				10144	*/
				10145	em = lookup_extent_mapping(&map_tree->map_tree, start, 1);
				10146	read_unlock(&map_tree->map_tree.lock);
				10147	if (!em)
				10148	break;
				10149
				10150	bg = btrfs_lookup_block_group(fs_info, em->start);
				10151	if (!bg) {
				10152	btrfs_err(fs_info,
				10153	"chunk start=%llu len=%llu doesn't have corresponding block group",
				10154	em->start, em->len);
				10155	ret = -EUCLEAN;
				10156	free_extent_map(em);
				10157	break;
				10158	}
				10159	if (bg->key.objectid != em->start \|\|
				10160	bg->key.offset != em->len \|\|
				10161	(bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
				10162	(em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
				10163	btrfs_err(fs_info,
				10164	"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
				10165	em->start, em->len,
				10166	em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
				10167	bg->key.objectid, bg->key.offset,
				10168	bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
				10169	ret = -EUCLEAN;
				10170	free_extent_map(em);
				10171	btrfs_put_block_group(bg);
				10172	break;
				10173	}
				10174	start = em->start + em->len;
				10175	free_extent_map(em);
				10176	btrfs_put_block_group(bg);
				10177	}
				10178	return ret;
				10179	}
				10180
				10181	int btrfs_read_block_groups(struct btrfs_fs_info *info)
				10182	{
				10183	struct btrfs_path *path;
				10184	int ret;
				10185	struct btrfs_block_group_cache *cache;
				10186	struct btrfs_space_info *space_info;
				10187	struct btrfs_key key;
				10188	struct btrfs_key found_key;
				10189	struct extent_buffer *leaf;
				10190	int need_clear = 0;
				10191	u64 cache_gen;
				10192	u64 feature;
				10193	int mixed;
				10194
				10195	feature = btrfs_super_incompat_flags(info->super_copy);
				10196	mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
				10197
				10198	key.objectid = 0;
				10199	key.offset = 0;
				10200	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
				10201	path = btrfs_alloc_path();
				10202	if (!path)
				10203	return -ENOMEM;
				10204	path->reada = READA_FORWARD;
				10205
				10206	cache_gen = btrfs_super_cache_generation(info->super_copy);
				10207	if (btrfs_test_opt(info, SPACE_CACHE) &&
				10208	btrfs_super_generation(info->super_copy) != cache_gen)
				10209	need_clear = 1;
				10210	if (btrfs_test_opt(info, CLEAR_CACHE))
				10211	need_clear = 1;
				10212
				10213	while (1) {
				10214	ret = find_first_block_group(info, path, &key);
				10215	if (ret > 0)
				10216	break;
				10217	if (ret != 0)
				10218	goto error;
				10219
				10220	leaf = path->nodes[0];
				10221	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
				10222
				10223	cache = btrfs_create_block_group_cache(info, found_key.objectid,
				10224	found_key.offset);
				10225	if (!cache) {
				10226	ret = -ENOMEM;
				10227	goto error;
				10228	}
				10229
				10230	if (need_clear) {
				10231	/*
				10232	* When we mount with old space cache, we need to
				10233	* set BTRFS_DC_CLEAR and set dirty flag.
				10234	*
				10235	* a) Setting 'BTRFS_DC_CLEAR' makes sure that we
				10236	* truncate the old free space cache inode and
				10237	* setup a new one.
				10238	* b) Setting 'dirty flag' makes sure that we flush
				10239	* the new space cache info onto disk.
				10240	*/
				10241	if (btrfs_test_opt(info, SPACE_CACHE))
				10242	cache->disk_cache_state = BTRFS_DC_CLEAR;
				10243	}
				10244
				10245	read_extent_buffer(leaf, &cache->item,
				10246	btrfs_item_ptr_offset(leaf, path->slots[0]),
				10247	sizeof(cache->item));
				10248	cache->flags = btrfs_block_group_flags(&cache->item);
				10249	if (!mixed &&
				10250	((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
				10251	(cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
				10252	btrfs_err(info,
				10253	"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
				10254	cache->key.objectid);
				10255	btrfs_put_block_group(cache);
				10256	ret = -EINVAL;
				10257	goto error;
				10258	}
				10259
				10260	key.objectid = found_key.objectid + found_key.offset;
				10261	btrfs_release_path(path);
				10262
				10263	/*
				10264	* We need to exclude the super stripes now so that the space
				10265	* info has super bytes accounted for, otherwise we'll think
				10266	* we have more space than we actually do.
				10267	*/
				10268	ret = exclude_super_stripes(info, cache);
				10269	if (ret) {
				10270	/*
				10271	* We may have excluded something, so call this just in
				10272	* case.
				10273	*/
				10274	free_excluded_extents(info, cache);
				10275	btrfs_put_block_group(cache);
				10276	goto error;
				10277	}
				10278
				10279	/*
				10280	* check for two cases, either we are full, and therefore
				10281	* don't need to bother with the caching work since we won't
				10282	* find any space, or we are empty, and we can just add all
				10283	* the space in and be done with it. This saves us _alot_ of
				10284	* time, particularly in the full case.
				10285	*/
				10286	if (found_key.offset == btrfs_block_group_used(&cache->item)) {
				10287	cache->last_byte_to_unpin = (u64)-1;
				10288	cache->cached = BTRFS_CACHE_FINISHED;
				10289	free_excluded_extents(info, cache);
				10290	} else if (btrfs_block_group_used(&cache->item) == 0) {
				10291	cache->last_byte_to_unpin = (u64)-1;
				10292	cache->cached = BTRFS_CACHE_FINISHED;
				10293	add_new_free_space(cache, info,
				10294	found_key.objectid,
				10295	found_key.objectid +
				10296	found_key.offset);
				10297	free_excluded_extents(info, cache);
				10298	}
				10299
				10300	ret = btrfs_add_block_group_cache(info, cache);
				10301	if (ret) {
				10302	btrfs_remove_free_space_cache(cache);
				10303	btrfs_put_block_group(cache);
				10304	goto error;
				10305	}
				10306
				10307	trace_btrfs_add_block_group(info, cache, 0);
				10308	update_space_info(info, cache->flags, found_key.offset,
				10309	btrfs_block_group_used(&cache->item),
				10310	cache->bytes_super, &space_info);
				10311
				10312	cache->space_info = space_info;
				10313
				10314	__link_block_group(space_info, cache);
				10315
				10316	set_avail_alloc_bits(info, cache->flags);
				10317	if (btrfs_chunk_readonly(info, cache->key.objectid)) {
				10318	inc_block_group_ro(cache, 1);
				10319	} else if (btrfs_block_group_used(&cache->item) == 0) {
				10320	spin_lock(&info->unused_bgs_lock);
				10321	/* Should always be true but just in case. */
				10322	if (list_empty(&cache->bg_list)) {
				10323	btrfs_get_block_group(cache);
				10324	list_add_tail(&cache->bg_list,
				10325	&info->unused_bgs);
				10326	}
				10327	spin_unlock(&info->unused_bgs_lock);
				10328	}
				10329	}
				10330
				10331	list_for_each_entry_rcu(space_info, &info->space_info, list) {
				10332	if (!(get_alloc_profile(info, space_info->flags) &
				10333	(BTRFS_BLOCK_GROUP_RAID10 \|
				10334	BTRFS_BLOCK_GROUP_RAID1 \|
				10335	BTRFS_BLOCK_GROUP_RAID5 \|
				10336	BTRFS_BLOCK_GROUP_RAID6 \|
				10337	BTRFS_BLOCK_GROUP_DUP)))
				10338	continue;
				10339	/*
				10340	* avoid allocating from un-mirrored block group if there are
				10341	* mirrored block groups.
				10342	*/
				10343	list_for_each_entry(cache,
				10344	&space_info->block_groups[BTRFS_RAID_RAID0],
				10345	list)
				10346	inc_block_group_ro(cache, 1);
				10347	list_for_each_entry(cache,
				10348	&space_info->block_groups[BTRFS_RAID_SINGLE],
				10349	list)
				10350	inc_block_group_ro(cache, 1);
				10351	}
				10352
				10353	init_global_block_rsv(info);
				10354	ret = check_chunk_block_group_mappings(info);
				10355	error:
				10356	btrfs_free_path(path);
				10357	return ret;
				10358	}
				10359
				10360	void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
				10361	struct btrfs_fs_info *fs_info)
				10362	{
				10363	struct btrfs_block_group_cache *block_group;
				10364	struct btrfs_root *extent_root = fs_info->extent_root;
				10365	struct btrfs_block_group_item item;
				10366	struct btrfs_key key;
				10367	int ret = 0;
				10368	bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
				10369
				10370	trans->can_flush_pending_bgs = false;
				10371	while (!list_empty(&trans->new_bgs)) {
				10372	block_group = list_first_entry(&trans->new_bgs,
				10373	struct btrfs_block_group_cache,
				10374	bg_list);
				10375	if (ret)
				10376	goto next;
				10377
				10378	spin_lock(&block_group->lock);
				10379	memcpy(&item, &block_group->item, sizeof(item));
				10380	memcpy(&key, &block_group->key, sizeof(key));
				10381	spin_unlock(&block_group->lock);
				10382
				10383	ret = btrfs_insert_item(trans, extent_root, &key, &item,
				10384	sizeof(item));
				10385	if (ret)
				10386	btrfs_abort_transaction(trans, ret);
				10387	ret = btrfs_finish_chunk_alloc(trans, fs_info, key.objectid,
				10388	key.offset);
				10389	if (ret)
				10390	btrfs_abort_transaction(trans, ret);
				10391	add_block_group_free_space(trans, fs_info, block_group);
				10392	/* already aborted the transaction if it failed. */
				10393	next:
				10394	list_del_init(&block_group->bg_list);
				10395	}
				10396	trans->can_flush_pending_bgs = can_flush_pending_bgs;
				10397	}
				10398
				10399	int btrfs_make_block_group(struct btrfs_trans_handle *trans,
				10400	struct btrfs_fs_info *fs_info, u64 bytes_used,
				10401	u64 type, u64 chunk_offset, u64 size)
				10402	{
				10403	struct btrfs_block_group_cache *cache;
				10404	int ret;
				10405
				10406	btrfs_set_log_full_commit(fs_info, trans);
				10407
				10408	cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
				10409	if (!cache)
				10410	return -ENOMEM;
				10411
				10412	btrfs_set_block_group_used(&cache->item, bytes_used);
				10413	btrfs_set_block_group_chunk_objectid(&cache->item,
				10414	BTRFS_FIRST_CHUNK_TREE_OBJECTID);
				10415	btrfs_set_block_group_flags(&cache->item, type);
				10416
				10417	cache->flags = type;
				10418	cache->last_byte_to_unpin = (u64)-1;
				10419	cache->cached = BTRFS_CACHE_FINISHED;
				10420	cache->needs_free_space = 1;
				10421	ret = exclude_super_stripes(fs_info, cache);
				10422	if (ret) {
				10423	/*
				10424	* We may have excluded something, so call this just in
				10425	* case.
				10426	*/
				10427	free_excluded_extents(fs_info, cache);
				10428	btrfs_put_block_group(cache);
				10429	return ret;
				10430	}
				10431
				10432	add_new_free_space(cache, fs_info, chunk_offset, chunk_offset + size);
				10433
				10434	free_excluded_extents(fs_info, cache);
				10435
				10436	#ifdef CONFIG_BTRFS_DEBUG
				10437	if (btrfs_should_fragment_free_space(cache)) {
				10438	u64 new_bytes_used = size - bytes_used;
				10439
				10440	bytes_used += new_bytes_used >> 1;
				10441	fragment_free_space(cache);
				10442	}
				10443	#endif
				10444	/*
				10445	* Ensure the corresponding space_info object is created and
				10446	* assigned to our block group. We want our bg to be added to the rbtree
				10447	* with its ->space_info set.
				10448	*/
				10449	cache->space_info = __find_space_info(fs_info, cache->flags);
				10450	if (!cache->space_info) {
				10451	ret = create_space_info(fs_info, cache->flags,
				10452	&cache->space_info);
				10453	if (ret) {
				10454	btrfs_remove_free_space_cache(cache);
				10455	btrfs_put_block_group(cache);
				10456	return ret;
				10457	}
				10458	}
				10459
				10460	ret = btrfs_add_block_group_cache(fs_info, cache);
				10461	if (ret) {
				10462	btrfs_remove_free_space_cache(cache);
				10463	btrfs_put_block_group(cache);
				10464	return ret;
				10465	}
				10466
				10467	/*
				10468	* Now that our block group has its ->space_info set and is inserted in
				10469	* the rbtree, update the space info's counters.
				10470	*/
				10471	trace_btrfs_add_block_group(fs_info, cache, 1);
				10472	update_space_info(fs_info, cache->flags, size, bytes_used,
				10473	cache->bytes_super, &cache->space_info);
				10474	update_global_block_rsv(fs_info);
				10475
				10476	__link_block_group(cache->space_info, cache);
				10477
				10478	list_add_tail(&cache->bg_list, &trans->new_bgs);
				10479
				10480	set_avail_alloc_bits(fs_info, type);
				10481	return 0;
				10482	}
				10483
				10484	static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
				10485	{
				10486	u64 extra_flags = chunk_to_extended(flags) &
				10487	BTRFS_EXTENDED_PROFILE_MASK;
				10488
				10489	write_seqlock(&fs_info->profiles_lock);
				10490	if (flags & BTRFS_BLOCK_GROUP_DATA)
				10491	fs_info->avail_data_alloc_bits &= ~extra_flags;
				10492	if (flags & BTRFS_BLOCK_GROUP_METADATA)
				10493	fs_info->avail_metadata_alloc_bits &= ~extra_flags;
				10494	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
				10495	fs_info->avail_system_alloc_bits &= ~extra_flags;
				10496	write_sequnlock(&fs_info->profiles_lock);
				10497	}
				10498
				10499	int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
				10500	struct btrfs_fs_info *fs_info, u64 group_start,
				10501	struct extent_map *em)
				10502	{
				10503	struct btrfs_root *root = fs_info->extent_root;
				10504	struct btrfs_path *path;
				10505	struct btrfs_block_group_cache *block_group;
				10506	struct btrfs_free_cluster *cluster;
				10507	struct btrfs_root *tree_root = fs_info->tree_root;
				10508	struct btrfs_key key;
				10509	struct inode *inode;
				10510	struct kobject *kobj = NULL;
				10511	int ret;
				10512	int index;
				10513	int factor;
				10514	struct btrfs_caching_control *caching_ctl = NULL;
				10515	bool remove_em;
				10516
				10517	block_group = btrfs_lookup_block_group(fs_info, group_start);
				10518	BUG_ON(!block_group);
				10519	BUG_ON(!block_group->ro);
				10520
				10521	/*
				10522	* Free the reserved super bytes from this block group before
				10523	* remove it.
				10524	*/
				10525	free_excluded_extents(fs_info, block_group);
				10526
				10527	memcpy(&key, &block_group->key, sizeof(key));
				10528	index = get_block_group_index(block_group);
				10529	if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP \|
				10530	BTRFS_BLOCK_GROUP_RAID1 \|
				10531	BTRFS_BLOCK_GROUP_RAID10))
				10532	factor = 2;
				10533	else
				10534	factor = 1;
				10535
				10536	/* make sure this block group isn't part of an allocation cluster */
				10537	cluster = &fs_info->data_alloc_cluster;
				10538	spin_lock(&cluster->refill_lock);
				10539	btrfs_return_cluster_to_free_space(block_group, cluster);
				10540	spin_unlock(&cluster->refill_lock);
				10541
				10542	/*
				10543	* make sure this block group isn't part of a metadata
				10544	* allocation cluster
				10545	*/
				10546	cluster = &fs_info->meta_alloc_cluster;
				10547	spin_lock(&cluster->refill_lock);
				10548	btrfs_return_cluster_to_free_space(block_group, cluster);
				10549	spin_unlock(&cluster->refill_lock);
				10550
				10551	path = btrfs_alloc_path();
				10552	if (!path) {
				10553	ret = -ENOMEM;
				10554	goto out;
				10555	}
				10556
				10557	/*
				10558	* get the inode first so any iput calls done for the io_list
				10559	* aren't the final iput (no unlinks allowed now)
				10560	*/
				10561	inode = lookup_free_space_inode(fs_info, block_group, path);
				10562
				10563	mutex_lock(&trans->transaction->cache_write_mutex);
				10564	/*
				10565	* make sure our free spache cache IO is done before remove the
				10566	* free space inode
				10567	*/
				10568	spin_lock(&trans->transaction->dirty_bgs_lock);
				10569	if (!list_empty(&block_group->io_list)) {
				10570	list_del_init(&block_group->io_list);
				10571
				10572	WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
				10573
				10574	spin_unlock(&trans->transaction->dirty_bgs_lock);
				10575	btrfs_wait_cache_io(trans, block_group, path);
				10576	btrfs_put_block_group(block_group);
				10577	spin_lock(&trans->transaction->dirty_bgs_lock);
				10578	}
				10579
				10580	if (!list_empty(&block_group->dirty_list)) {
				10581	list_del_init(&block_group->dirty_list);
				10582	btrfs_put_block_group(block_group);
				10583	}
				10584	spin_unlock(&trans->transaction->dirty_bgs_lock);
				10585	mutex_unlock(&trans->transaction->cache_write_mutex);
				10586
				10587	if (!IS_ERR(inode)) {
				10588	ret = btrfs_orphan_add(trans, BTRFS_I(inode));
				10589	if (ret) {
				10590	btrfs_add_delayed_iput(inode);
				10591	goto out;
				10592	}
				10593	clear_nlink(inode);
				10594	/* One for the block groups ref */
				10595	spin_lock(&block_group->lock);
				10596	if (block_group->iref) {
				10597	block_group->iref = 0;
				10598	block_group->inode = NULL;
				10599	spin_unlock(&block_group->lock);
				10600	iput(inode);
				10601	} else {
				10602	spin_unlock(&block_group->lock);
				10603	}
				10604	/* One for our lookup ref */
				10605	btrfs_add_delayed_iput(inode);
				10606	}
				10607
				10608	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
				10609	key.offset = block_group->key.objectid;
				10610	key.type = 0;
				10611
				10612	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
				10613	if (ret < 0)
				10614	goto out;
				10615	if (ret > 0)
				10616	btrfs_release_path(path);
				10617	if (ret == 0) {
				10618	ret = btrfs_del_item(trans, tree_root, path);
				10619	if (ret)
				10620	goto out;
				10621	btrfs_release_path(path);
				10622	}
				10623
				10624	spin_lock(&fs_info->block_group_cache_lock);
				10625	rb_erase(&block_group->cache_node,
				10626	&fs_info->block_group_cache_tree);
				10627	RB_CLEAR_NODE(&block_group->cache_node);
				10628
				10629	/* Once for the block groups rbtree */
				10630	btrfs_put_block_group(block_group);
				10631
				10632	if (fs_info->first_logical_byte == block_group->key.objectid)
				10633	fs_info->first_logical_byte = (u64)-1;
				10634	spin_unlock(&fs_info->block_group_cache_lock);
				10635
				10636	down_write(&block_group->space_info->groups_sem);
				10637	/*
				10638	* we must use list_del_init so people can check to see if they
				10639	* are still on the list after taking the semaphore
				10640	*/
				10641	list_del_init(&block_group->list);
				10642	if (list_empty(&block_group->space_info->block_groups[index])) {
				10643	kobj = block_group->space_info->block_group_kobjs[index];
				10644	block_group->space_info->block_group_kobjs[index] = NULL;
				10645	clear_avail_alloc_bits(fs_info, block_group->flags);
				10646	}
				10647	up_write(&block_group->space_info->groups_sem);
				10648	if (kobj) {
				10649	kobject_del(kobj);
				10650	kobject_put(kobj);
				10651	}
				10652
				10653	if (block_group->has_caching_ctl)
				10654	caching_ctl = get_caching_control(block_group);
				10655	if (block_group->cached == BTRFS_CACHE_STARTED)
				10656	wait_block_group_cache_done(block_group);
				10657	if (block_group->has_caching_ctl) {
				10658	down_write(&fs_info->commit_root_sem);
				10659	if (!caching_ctl) {
				10660	struct btrfs_caching_control *ctl;
				10661
				10662	list_for_each_entry(ctl,
				10663	&fs_info->caching_block_groups, list)
				10664	if (ctl->block_group == block_group) {
				10665	caching_ctl = ctl;
				10666	refcount_inc(&caching_ctl->count);
				10667	break;
				10668	}
				10669	}
				10670	if (caching_ctl)
				10671	list_del_init(&caching_ctl->list);
				10672	up_write(&fs_info->commit_root_sem);
				10673	if (caching_ctl) {
				10674	/* Once for the caching bgs list and once for us. */
				10675	put_caching_control(caching_ctl);
				10676	put_caching_control(caching_ctl);
				10677	}
				10678	}
				10679
				10680	spin_lock(&trans->transaction->dirty_bgs_lock);
				10681	if (!list_empty(&block_group->dirty_list)) {
				10682	WARN_ON(1);
				10683	}
				10684	if (!list_empty(&block_group->io_list)) {
				10685	WARN_ON(1);
				10686	}
				10687	spin_unlock(&trans->transaction->dirty_bgs_lock);
				10688	btrfs_remove_free_space_cache(block_group);
				10689
				10690	spin_lock(&block_group->space_info->lock);
				10691	list_del_init(&block_group->ro_list);
				10692
				10693	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
				10694	WARN_ON(block_group->space_info->total_bytes
				10695	< block_group->key.offset);
				10696	WARN_ON(block_group->space_info->bytes_readonly
				10697	< block_group->key.offset);
				10698	WARN_ON(block_group->space_info->disk_total
				10699	< block_group->key.offset * factor);
				10700	}
				10701	block_group->space_info->total_bytes -= block_group->key.offset;
				10702	block_group->space_info->bytes_readonly -= block_group->key.offset;
				10703	block_group->space_info->disk_total -= block_group->key.offset * factor;
				10704
				10705	spin_unlock(&block_group->space_info->lock);
				10706
				10707	memcpy(&key, &block_group->key, sizeof(key));
				10708
				10709	mutex_lock(&fs_info->chunk_mutex);
				10710	if (!list_empty(&em->list)) {
				10711	/* We're in the transaction->pending_chunks list. */
				10712	free_extent_map(em);
				10713	}
				10714	spin_lock(&block_group->lock);
				10715	block_group->removed = 1;
				10716	/*
				10717	* At this point trimming can't start on this block group, because we
				10718	* removed the block group from the tree fs_info->block_group_cache_tree
				10719	* so no one can't find it anymore and even if someone already got this
				10720	* block group before we removed it from the rbtree, they have already
				10721	* incremented block_group->trimming - if they didn't, they won't find
				10722	* any free space entries because we already removed them all when we
				10723	* called btrfs_remove_free_space_cache().
				10724	*
				10725	* And we must not remove the extent map from the fs_info->mapping_tree
				10726	* to prevent the same logical address range and physical device space
				10727	* ranges from being reused for a new block group. This is because our
				10728	* fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
				10729	* completely transactionless, so while it is trimming a range the
				10730	* currently running transaction might finish and a new one start,
				10731	* allowing for new block groups to be created that can reuse the same
				10732	* physical device locations unless we take this special care.
				10733	*
				10734	* There may also be an implicit trim operation if the file system
				10735	* is mounted with -odiscard. The same protections must remain
				10736	* in place until the extents have been discarded completely when
				10737	* the transaction commit has completed.
				10738	*/
				10739	remove_em = (atomic_read(&block_group->trimming) == 0);
				10740	/*
				10741	* Make sure a trimmer task always sees the em in the pinned_chunks list
				10742	* if it sees block_group->removed == 1 (needs to lock block_group->lock
				10743	* before checking block_group->removed).
				10744	*/
				10745	if (!remove_em) {
				10746	/*
				10747	* Our em might be in trans->transaction->pending_chunks which
				10748	* is protected by fs_info->chunk_mutex ([lock\|unlock]_chunks),
				10749	* and so is the fs_info->pinned_chunks list.
				10750	*
				10751	* So at this point we must be holding the chunk_mutex to avoid
				10752	* any races with chunk allocation (more specifically at
				10753	* volumes.c:contains_pending_extent()), to ensure it always
				10754	* sees the em, either in the pending_chunks list or in the
				10755	* pinned_chunks list.
				10756	*/
				10757	list_move_tail(&em->list, &fs_info->pinned_chunks);
				10758	}
				10759	spin_unlock(&block_group->lock);
				10760
				10761	if (remove_em) {
				10762	struct extent_map_tree *em_tree;
				10763
				10764	em_tree = &fs_info->mapping_tree.map_tree;
				10765	write_lock(&em_tree->lock);
				10766	/*
				10767	* The em might be in the pending_chunks list, so make sure the
				10768	* chunk mutex is locked, since remove_extent_mapping() will
				10769	* delete us from that list.
				10770	*/
				10771	remove_extent_mapping(em_tree, em);
				10772	write_unlock(&em_tree->lock);
				10773	/* once for the tree */
				10774	free_extent_map(em);
				10775	}
				10776
				10777	mutex_unlock(&fs_info->chunk_mutex);
				10778
				10779	ret = remove_block_group_free_space(trans, fs_info, block_group);
				10780	if (ret)
				10781	goto out;
				10782
				10783	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				10784	if (ret > 0)
				10785	ret = -EIO;
				10786	if (ret < 0)
				10787	goto out;
				10788
				10789	ret = btrfs_del_item(trans, root, path);
				10790
				10791	out:
				10792	/* Once for the lookup reference */
				10793	btrfs_put_block_group(block_group);
				10794	btrfs_free_path(path);
				10795	return ret;
				10796	}
				10797
				10798	struct btrfs_trans_handle *
				10799	btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
				10800	const u64 chunk_offset)
				10801	{
				10802	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
				10803	struct extent_map *em;
				10804	struct map_lookup *map;
				10805	unsigned int num_items;
				10806
				10807	read_lock(&em_tree->lock);
				10808	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
				10809	read_unlock(&em_tree->lock);
				10810	ASSERT(em && em->start == chunk_offset);
				10811
				10812	/*
				10813	* We need to reserve 3 + N units from the metadata space info in order
				10814	* to remove a block group (done at btrfs_remove_chunk() and at
				10815	* btrfs_remove_block_group()), which are used for:
				10816	*
				10817	* 1 unit for adding the free space inode's orphan (located in the tree
				10818	* of tree roots).
				10819	* 1 unit for deleting the block group item (located in the extent
				10820	* tree).
				10821	* 1 unit for deleting the free space item (located in tree of tree
				10822	* roots).
				10823	* N units for deleting N device extent items corresponding to each
				10824	* stripe (located in the device tree).
				10825	*
				10826	* In order to remove a block group we also need to reserve units in the
				10827	* system space info in order to update the chunk tree (update one or
				10828	* more device items and remove one chunk item), but this is done at
				10829	* btrfs_remove_chunk() through a call to check_system_chunk().
				10830	*/
				10831	map = em->map_lookup;
				10832	num_items = 3 + map->num_stripes;
				10833	free_extent_map(em);
				10834
				10835	return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
				10836	num_items, 1);
				10837	}
				10838
				10839	/*
				10840	* Process the unused_bgs list and remove any that don't have any allocated
				10841	* space inside of them.
				10842	*/
				10843	void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
				10844	{
				10845	struct btrfs_block_group_cache *block_group;
				10846	struct btrfs_space_info *space_info;
				10847	struct btrfs_trans_handle *trans;
				10848	int ret = 0;
				10849
				10850	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
				10851	return;
				10852
				10853	spin_lock(&fs_info->unused_bgs_lock);
				10854	while (!list_empty(&fs_info->unused_bgs)) {
				10855	u64 start, end;
				10856	int trimming;
				10857
				10858	block_group = list_first_entry(&fs_info->unused_bgs,
				10859	struct btrfs_block_group_cache,
				10860	bg_list);
				10861	list_del_init(&block_group->bg_list);
				10862
				10863	space_info = block_group->space_info;
				10864
				10865	if (ret \|\| btrfs_mixed_space_info(space_info)) {
				10866	btrfs_put_block_group(block_group);
				10867	continue;
				10868	}
				10869	spin_unlock(&fs_info->unused_bgs_lock);
				10870
				10871	mutex_lock(&fs_info->delete_unused_bgs_mutex);
				10872
				10873	/* Don't want to race with allocators so take the groups_sem */
				10874	down_write(&space_info->groups_sem);
				10875	spin_lock(&block_group->lock);
				10876	if (block_group->reserved \|\| block_group->pinned \|\|
				10877	btrfs_block_group_used(&block_group->item) \|\|
				10878	block_group->ro \|\|
				10879	list_is_singular(&block_group->list)) {
				10880	/*
				10881	* We want to bail if we made new allocations or have
				10882	* outstanding allocations in this block group. We do
				10883	* the ro check in case balance is currently acting on
				10884	* this block group.
				10885	*/
				10886	spin_unlock(&block_group->lock);
				10887	up_write(&space_info->groups_sem);
				10888	goto next;
				10889	}
				10890	spin_unlock(&block_group->lock);
				10891
				10892	/* We don't want to force the issue, only flip if it's ok. */
				10893	ret = inc_block_group_ro(block_group, 0);
				10894	up_write(&space_info->groups_sem);
				10895	if (ret < 0) {
				10896	ret = 0;
				10897	goto next;
				10898	}
				10899
				10900	/*
				10901	* Want to do this before we do anything else so we can recover
				10902	* properly if we fail to join the transaction.
				10903	*/
				10904	trans = btrfs_start_trans_remove_block_group(fs_info,
				10905	block_group->key.objectid);
				10906	if (IS_ERR(trans)) {
				10907	btrfs_dec_block_group_ro(block_group);
				10908	ret = PTR_ERR(trans);
				10909	goto next;
				10910	}
				10911
				10912	/*
				10913	* We could have pending pinned extents for this block group,
				10914	* just delete them, we don't care about them anymore.
				10915	*/
				10916	start = block_group->key.objectid;
				10917	end = start + block_group->key.offset - 1;
				10918	/*
				10919	* Hold the unused_bg_unpin_mutex lock to avoid racing with
				10920	* btrfs_finish_extent_commit(). If we are at transaction N,
				10921	* another task might be running finish_extent_commit() for the
				10922	* previous transaction N - 1, and have seen a range belonging
				10923	* to the block group in freed_extents[] before we were able to
				10924	* clear the whole block group range from freed_extents[]. This
				10925	* means that task can lookup for the block group after we
				10926	* unpinned it from freed_extents[] and removed it, leading to
				10927	* a BUG_ON() at btrfs_unpin_extent_range().
				10928	*/
				10929	mutex_lock(&fs_info->unused_bg_unpin_mutex);
				10930	ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
				10931	EXTENT_DIRTY);
				10932	if (ret) {
				10933	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
				10934	btrfs_dec_block_group_ro(block_group);
				10935	goto end_trans;
				10936	}
				10937	ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
				10938	EXTENT_DIRTY);
				10939	if (ret) {
				10940	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
				10941	btrfs_dec_block_group_ro(block_group);
				10942	goto end_trans;
				10943	}
				10944	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
				10945
				10946	/* Reset pinned so btrfs_put_block_group doesn't complain */
				10947	spin_lock(&space_info->lock);
				10948	spin_lock(&block_group->lock);
				10949
				10950	space_info->bytes_pinned -= block_group->pinned;
				10951	space_info->bytes_readonly += block_group->pinned;
				10952	percpu_counter_add(&space_info->total_bytes_pinned,
				10953	-block_group->pinned);
				10954	block_group->pinned = 0;
				10955
				10956	spin_unlock(&block_group->lock);
				10957	spin_unlock(&space_info->lock);
				10958
				10959	/* DISCARD can flip during remount */
				10960	trimming = btrfs_test_opt(fs_info, DISCARD);
				10961
				10962	/* Implicit trim during transaction commit. */
				10963	if (trimming)
				10964	btrfs_get_block_group_trimming(block_group);
				10965
				10966	/*
				10967	* Btrfs_remove_chunk will abort the transaction if things go
				10968	* horribly wrong.
				10969	*/
				10970	ret = btrfs_remove_chunk(trans, fs_info,
				10971	block_group->key.objectid);
				10972
				10973	if (ret) {
				10974	if (trimming)
				10975	btrfs_put_block_group_trimming(block_group);
				10976	goto end_trans;
				10977	}
				10978
				10979	/*
				10980	* If we're not mounted with -odiscard, we can just forget
				10981	* about this block group. Otherwise we'll need to wait
				10982	* until transaction commit to do the actual discard.
				10983	*/
				10984	if (trimming) {
				10985	spin_lock(&fs_info->unused_bgs_lock);
				10986	/*
				10987	* A concurrent scrub might have added us to the list
				10988	* fs_info->unused_bgs, so use a list_move operation
				10989	* to add the block group to the deleted_bgs list.
				10990	*/
				10991	list_move(&block_group->bg_list,
				10992	&trans->transaction->deleted_bgs);
				10993	spin_unlock(&fs_info->unused_bgs_lock);
				10994	btrfs_get_block_group(block_group);
				10995	}
				10996	end_trans:
				10997	btrfs_end_transaction(trans);
				10998	next:
				10999	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				11000	btrfs_put_block_group(block_group);
				11001	spin_lock(&fs_info->unused_bgs_lock);
				11002	}
				11003	spin_unlock(&fs_info->unused_bgs_lock);
				11004	}
				11005
				11006	int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
				11007	{
				11008	struct btrfs_space_info *space_info;
				11009	struct btrfs_super_block *disk_super;
				11010	u64 features;
				11011	u64 flags;
				11012	int mixed = 0;
				11013	int ret;
				11014
				11015	disk_super = fs_info->super_copy;
				11016	if (!btrfs_super_root(disk_super))
				11017	return -EINVAL;
				11018
				11019	features = btrfs_super_incompat_flags(disk_super);
				11020	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
				11021	mixed = 1;
				11022
				11023	flags = BTRFS_BLOCK_GROUP_SYSTEM;
				11024	ret = create_space_info(fs_info, flags, &space_info);
				11025	if (ret)
				11026	goto out;
				11027
				11028	if (mixed) {
				11029	flags = BTRFS_BLOCK_GROUP_METADATA \| BTRFS_BLOCK_GROUP_DATA;
				11030	ret = create_space_info(fs_info, flags, &space_info);
				11031	} else {
				11032	flags = BTRFS_BLOCK_GROUP_METADATA;
				11033	ret = create_space_info(fs_info, flags, &space_info);
				11034	if (ret)
				11035	goto out;
				11036
				11037	flags = BTRFS_BLOCK_GROUP_DATA;
				11038	ret = create_space_info(fs_info, flags, &space_info);
				11039	}
				11040	out:
				11041	return ret;
				11042	}
				11043
				11044	int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
				11045	u64 start, u64 end)
				11046	{
				11047	return unpin_extent_range(fs_info, start, end, false);
				11048	}
				11049
				11050	/*
				11051	* It used to be that old block groups would be left around forever.
				11052	* Iterating over them would be enough to trim unused space. Since we
				11053	* now automatically remove them, we also need to iterate over unallocated
				11054	* space.
				11055	*
				11056	* We don't want a transaction for this since the discard may take a
				11057	* substantial amount of time. We don't require that a transaction be
				11058	* running, but we do need to take a running transaction into account
				11059	* to ensure that we're not discarding chunks that were released in
				11060	* the current transaction.
				11061	*
				11062	* Holding the chunks lock will prevent other threads from allocating
				11063	* or releasing chunks, but it won't prevent a running transaction
				11064	* from committing and releasing the memory that the pending chunks
				11065	* list head uses. For that, we need to take a reference to the
				11066	* transaction.
				11067	*/
				11068	static int btrfs_trim_free_extents(struct btrfs_device *device,
				11069	u64 minlen, u64 *trimmed)
				11070	{
				11071	u64 start = 0, len = 0;
				11072	int ret;
				11073
				11074	*trimmed = 0;
				11075
				11076	/* Discard not supported = nothing to do. */
				11077	if (!blk_queue_discard(bdev_get_queue(device->bdev)))
				11078	return 0;
				11079
				11080	/* Not writeable = nothing to do. */
				11081	if (!device->writeable)
				11082	return 0;
				11083
				11084	/* No free space = nothing to do. */
				11085	if (device->total_bytes <= device->bytes_used)
				11086	return 0;
				11087
				11088	ret = 0;
				11089
				11090	while (1) {
				11091	struct btrfs_fs_info *fs_info = device->fs_info;
				11092	struct btrfs_transaction *trans;
				11093	u64 bytes;
				11094
				11095	ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
				11096	if (ret)
				11097	return ret;
				11098
				11099	down_read(&fs_info->commit_root_sem);
				11100
				11101	spin_lock(&fs_info->trans_lock);
				11102	trans = fs_info->running_transaction;
				11103	if (trans)
				11104	refcount_inc(&trans->use_count);
				11105	spin_unlock(&fs_info->trans_lock);
				11106
				11107	ret = find_free_dev_extent_start(trans, device, minlen, start,
				11108	&start, &len);
				11109	if (trans)
				11110	btrfs_put_transaction(trans);
				11111
				11112	if (ret) {
				11113	up_read(&fs_info->commit_root_sem);
				11114	mutex_unlock(&fs_info->chunk_mutex);
				11115	if (ret == -ENOSPC)
				11116	ret = 0;
				11117	break;
				11118	}
				11119
				11120	ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
				11121	up_read(&fs_info->commit_root_sem);
				11122	mutex_unlock(&fs_info->chunk_mutex);
				11123
				11124	if (ret)
				11125	break;
				11126
				11127	start += len;
				11128	*trimmed += bytes;
				11129
				11130	if (fatal_signal_pending(current)) {
				11131	ret = -ERESTARTSYS;
				11132	break;
				11133	}
				11134
				11135	cond_resched();
				11136	}
				11137
				11138	return ret;
				11139	}
				11140
				11141	/*
				11142	* Trim the whole filesystem by:
				11143	* 1) trimming the free space in each block group
				11144	* 2) trimming the unallocated space on each device
				11145	*
				11146	* This will also continue trimming even if a block group or device encounters
				11147	* an error. The return value will be the last error, or 0 if nothing bad
				11148	* happens.
				11149	*/
				11150	int btrfs_trim_fs(struct btrfs_fs_info fs_info, struct fstrim_range range)
				11151	{
				11152	struct btrfs_block_group_cache *cache = NULL;
				11153	struct btrfs_device *device;
				11154	struct list_head *devices;
				11155	u64 group_trimmed;
				11156	u64 start;
				11157	u64 end;
				11158	u64 trimmed = 0;
				11159	u64 bg_failed = 0;
				11160	u64 dev_failed = 0;
				11161	int bg_ret = 0;
				11162	int dev_ret = 0;
				11163	int ret = 0;
				11164
				11165	cache = btrfs_lookup_first_block_group(fs_info, range->start);
				11166	for (; cache; cache = next_block_group(fs_info, cache)) {
				11167	if (cache->key.objectid >= (range->start + range->len)) {
				11168	btrfs_put_block_group(cache);
				11169	break;
				11170	}
				11171
				11172	start = max(range->start, cache->key.objectid);
				11173	end = min(range->start + range->len,
				11174	cache->key.objectid + cache->key.offset);
				11175
				11176	if (end - start >= range->minlen) {
				11177	if (!block_group_cache_done(cache)) {
				11178	ret = cache_block_group(cache, 0);
				11179	if (ret) {
				11180	bg_failed++;
				11181	bg_ret = ret;
				11182	continue;
				11183	}
				11184	ret = wait_block_group_cache_done(cache);
				11185	if (ret) {
				11186	bg_failed++;
				11187	bg_ret = ret;
				11188	continue;
				11189	}
				11190	}
				11191	ret = btrfs_trim_block_group(cache,
				11192	&group_trimmed,
				11193	start,
				11194	end,
				11195	range->minlen);
				11196
				11197	trimmed += group_trimmed;
				11198	if (ret) {
				11199	bg_failed++;
				11200	bg_ret = ret;
				11201	continue;
				11202	}
				11203	}
				11204	}
				11205
				11206	if (bg_failed)
				11207	btrfs_warn(fs_info,
				11208	"failed to trim %llu block group(s), last error %d",
				11209	bg_failed, bg_ret);
				11210	mutex_lock(&fs_info->fs_devices->device_list_mutex);
				11211	devices = &fs_info->fs_devices->devices;
				11212	list_for_each_entry(device, devices, dev_list) {
				11213	ret = btrfs_trim_free_extents(device, range->minlen,
				11214	&group_trimmed);
				11215	if (ret) {
				11216	dev_failed++;
				11217	dev_ret = ret;
				11218	break;
				11219	}
				11220
				11221	trimmed += group_trimmed;
				11222	}
				11223	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				11224
				11225	if (dev_failed)
				11226	btrfs_warn(fs_info,
				11227	"failed to trim %llu device(s), last error %d",
				11228	dev_failed, dev_ret);
				11229	range->len = trimmed;
				11230	if (bg_ret)
				11231	return bg_ret;
				11232	return dev_ret;
				11233	}
				11234
				11235	/*
				11236	* btrfs_{start,end}_write_no_snapshotting() are similar to
				11237	* mnt_{want,drop}_write(), they are used to prevent some tasks from writing
				11238	* data into the page cache through nocow before the subvolume is snapshoted,
				11239	* but flush the data into disk after the snapshot creation, or to prevent
				11240	* operations while snapshotting is ongoing and that cause the snapshot to be
				11241	* inconsistent (writes followed by expanding truncates for example).
				11242	*/
				11243	void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
				11244	{
				11245	percpu_counter_dec(&root->subv_writers->counter);
				11246	/*
				11247	* Make sure counter is updated before we wake up waiters.
				11248	*/
				11249	smp_mb();
				11250	if (waitqueue_active(&root->subv_writers->wait))
				11251	wake_up(&root->subv_writers->wait);
				11252	}
				11253
				11254	int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
				11255	{
				11256	if (atomic_read(&root->will_be_snapshotted))
				11257	return 0;
				11258
				11259	percpu_counter_inc(&root->subv_writers->counter);
				11260	/*
				11261	* Make sure counter is updated before we check for snapshot creation.
				11262	*/
				11263	smp_mb();
				11264	if (atomic_read(&root->will_be_snapshotted)) {
				11265	btrfs_end_write_no_snapshotting(root);
				11266	return 0;
				11267	}
				11268	return 1;
				11269	}
				11270
				11271	static int wait_snapshotting_atomic_t(atomic_t *a)
				11272	{
				11273	schedule();
				11274	return 0;
				11275	}
				11276
				11277	void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
				11278	{
				11279	while (true) {
				11280	int ret;
				11281
				11282	ret = btrfs_start_write_no_snapshotting(root);
				11283	if (ret)
				11284	break;
				11285	wait_on_atomic_t(&root->will_be_snapshotted,
				11286	wait_snapshotting_atomic_t,
				11287	TASK_UNINTERRUPTIBLE);
				11288	}
				11289	}