Blame - src/kernel/linux/v4.19/fs/btrfs/extent-tree.c - T800

blob: 47ca1ebda056d4af270f9790f73ed01596c827a5 [file] [log] [blame]

xj	b04a402	2021-11-25 15:01:52 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Copyright (C) 2007 Oracle. All rights reserved.
				4	*/
				5
				6	#include <linux/sched.h>
				7	#include <linux/sched/signal.h>
				8	#include <linux/pagemap.h>
				9	#include <linux/writeback.h>
				10	#include <linux/blkdev.h>
				11	#include <linux/sort.h>
				12	#include <linux/rcupdate.h>
				13	#include <linux/kthread.h>
				14	#include <linux/slab.h>
				15	#include <linux/ratelimit.h>
				16	#include <linux/percpu_counter.h>
				17	#include <linux/lockdep.h>
				18	#include <linux/crc32c.h>
				19	#include "tree-log.h"
				20	#include "disk-io.h"
				21	#include "print-tree.h"
				22	#include "volumes.h"
				23	#include "raid56.h"
				24	#include "locking.h"
				25	#include "free-space-cache.h"
				26	#include "free-space-tree.h"
				27	#include "math.h"
				28	#include "sysfs.h"
				29	#include "qgroup.h"
				30	#include "ref-verify.h"
				31
				32	#undef SCRAMBLE_DELAYED_REFS
				33
				/*
				 * Values accepted by the "force" argument of do_chunk_alloc():
				 *
				 * CHUNK_ALLOC_NO_FORCE - allocate a chunk only when one is really needed.
				 *
				 * CHUNK_ALLOC_LIMITED  - try to allocate only while very few chunks have
				 *                        been allocated so far.  The clustering code uses
				 *                        this to keep a good pool of storage to cluster
				 *                        in without filling the FS with empty chunks.
				 *
				 * CHUNK_ALLOC_FORCE    - an allocation must be attempted.
				 */
				enum {
					CHUNK_ALLOC_NO_FORCE,	/* 0 */
					CHUNK_ALLOC_LIMITED,	/* 1 */
					CHUNK_ALLOC_FORCE,	/* 2 */
				};
				53
				54	static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
				55	struct btrfs_delayed_ref_node *node, u64 parent,
				56	u64 root_objectid, u64 owner_objectid,
				57	u64 owner_offset, int refs_to_drop,
				58	struct btrfs_delayed_extent_op *extra_op);
				59	static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				60	struct extent_buffer *leaf,
				61	struct btrfs_extent_item *ei);
				62	static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				63	u64 parent, u64 root_objectid,
				64	u64 flags, u64 owner, u64 offset,
				65	struct btrfs_key *ins, int ref_mod);
				66	static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				67	struct btrfs_delayed_ref_node *node,
				68	struct btrfs_delayed_extent_op *extent_op);
				69	static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
				70	int force);
				71	static int find_next_key(struct btrfs_path *path, int level,
				72	struct btrfs_key *key);
				73	static void dump_space_info(struct btrfs_fs_info *fs_info,
				74	struct btrfs_space_info *info, u64 bytes,
				75	int dump_block_groups);
				76	static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
				77	u64 num_bytes);
				78	static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
				79	struct btrfs_space_info *space_info,
				80	u64 num_bytes);
				81	static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
				82	struct btrfs_space_info *space_info,
				83	u64 num_bytes);
				84
				85	static noinline int
				86	block_group_cache_done(struct btrfs_block_group_cache *cache)
				87	{
				88	smp_mb();
				89	return cache->cached == BTRFS_CACHE_FINISHED \|\|
				90	cache->cached == BTRFS_CACHE_ERROR;
				91	}
				92
				93	static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
				94	{
				95	return (cache->flags & bits) == bits;
				96	}
				97
				98	void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
				99	{
				100	atomic_inc(&cache->count);
				101	}
				102
				103	void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
				104	{
				105	if (atomic_dec_and_test(&cache->count)) {
				106	WARN_ON(cache->pinned > 0);
				107	WARN_ON(cache->reserved > 0);
				108
				109	/*
				110	* If not empty, someone is still holding mutex of
				111	* full_stripe_lock, which can only be released by caller.
				112	* And it will definitely cause use-after-free when caller
				113	* tries to release full stripe lock.
				114	*
				115	* No better way to resolve, but only to warn.
				116	*/
				117	WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
				118	kfree(cache->free_space_ctl);
				119	kfree(cache);
				120	}
				121	}
				122
				123	/*
				124	* this adds the block group to the fs_info rb tree for the block group
				125	* cache
				126	*/
				127	static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				128	struct btrfs_block_group_cache *block_group)
				129	{
				130	struct rb_node **p;
				131	struct rb_node *parent = NULL;
				132	struct btrfs_block_group_cache *cache;
				133
				134	spin_lock(&info->block_group_cache_lock);
				135	p = &info->block_group_cache_tree.rb_node;
				136
				137	while (*p) {
				138	parent = *p;
				139	cache = rb_entry(parent, struct btrfs_block_group_cache,
				140	cache_node);
				141	if (block_group->key.objectid < cache->key.objectid) {
				142	p = &(*p)->rb_left;
				143	} else if (block_group->key.objectid > cache->key.objectid) {
				144	p = &(*p)->rb_right;
				145	} else {
				146	spin_unlock(&info->block_group_cache_lock);
				147	return -EEXIST;
				148	}
				149	}
				150
				151	rb_link_node(&block_group->cache_node, parent, p);
				152	rb_insert_color(&block_group->cache_node,
				153	&info->block_group_cache_tree);
				154
				155	if (info->first_logical_byte > block_group->key.objectid)
				156	info->first_logical_byte = block_group->key.objectid;
				157
				158	spin_unlock(&info->block_group_cache_lock);
				159
				160	return 0;
				161	}
				162
				163	/*
				164	* This will return the block group at or after bytenr if contains is 0, else
				165	* it will return the block group that contains the bytenr
				166	*/
				167	static struct btrfs_block_group_cache *
				168	block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
				169	int contains)
				170	{
				171	struct btrfs_block_group_cache cache, ret = NULL;
				172	struct rb_node *n;
				173	u64 end, start;
				174
				175	spin_lock(&info->block_group_cache_lock);
				176	n = info->block_group_cache_tree.rb_node;
				177
				178	while (n) {
				179	cache = rb_entry(n, struct btrfs_block_group_cache,
				180	cache_node);
				181	end = cache->key.objectid + cache->key.offset - 1;
				182	start = cache->key.objectid;
				183
				184	if (bytenr < start) {
				185	if (!contains && (!ret \|\| start < ret->key.objectid))
				186	ret = cache;
				187	n = n->rb_left;
				188	} else if (bytenr > start) {
				189	if (contains && bytenr <= end) {
				190	ret = cache;
				191	break;
				192	}
				193	n = n->rb_right;
				194	} else {
				195	ret = cache;
				196	break;
				197	}
				198	}
				199	if (ret) {
				200	btrfs_get_block_group(ret);
				201	if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
				202	info->first_logical_byte = ret->key.objectid;
				203	}
				204	spin_unlock(&info->block_group_cache_lock);
				205
				206	return ret;
				207	}
				208
				209	static int add_excluded_extent(struct btrfs_fs_info *fs_info,
				210	u64 start, u64 num_bytes)
				211	{
				212	u64 end = start + num_bytes - 1;
				213	set_extent_bits(&fs_info->freed_extents[0],
				214	start, end, EXTENT_UPTODATE);
				215	set_extent_bits(&fs_info->freed_extents[1],
				216	start, end, EXTENT_UPTODATE);
				217	return 0;
				218	}
				219
				220	static void free_excluded_extents(struct btrfs_block_group_cache *cache)
				221	{
				222	struct btrfs_fs_info *fs_info = cache->fs_info;
				223	u64 start, end;
				224
				225	start = cache->key.objectid;
				226	end = start + cache->key.offset - 1;
				227
				228	clear_extent_bits(&fs_info->freed_extents[0],
				229	start, end, EXTENT_UPTODATE);
				230	clear_extent_bits(&fs_info->freed_extents[1],
				231	start, end, EXTENT_UPTODATE);
				232	}
				233
				234	static int exclude_super_stripes(struct btrfs_block_group_cache *cache)
				235	{
				236	struct btrfs_fs_info *fs_info = cache->fs_info;
				237	u64 bytenr;
				238	u64 *logical;
				239	int stripe_len;
				240	int i, nr, ret;
				241
				242	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
				243	stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
				244	cache->bytes_super += stripe_len;
				245	ret = add_excluded_extent(fs_info, cache->key.objectid,
				246	stripe_len);
				247	if (ret)
				248	return ret;
				249	}
				250
				251	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
				252	bytenr = btrfs_sb_offset(i);
				253	ret = btrfs_rmap_block(fs_info, cache->key.objectid,
				254	bytenr, &logical, &nr, &stripe_len);
				255	if (ret)
				256	return ret;
				257
				258	while (nr--) {
				259	u64 start, len;
				260
				261	if (logical[nr] > cache->key.objectid +
				262	cache->key.offset)
				263	continue;
				264
				265	if (logical[nr] + stripe_len <= cache->key.objectid)
				266	continue;
				267
				268	start = logical[nr];
				269	if (start < cache->key.objectid) {
				270	start = cache->key.objectid;
				271	len = (logical[nr] + stripe_len) - start;
				272	} else {
				273	len = min_t(u64, stripe_len,
				274	cache->key.objectid +
				275	cache->key.offset - start);
				276	}
				277
				278	cache->bytes_super += len;
				279	ret = add_excluded_extent(fs_info, start, len);
				280	if (ret) {
				281	kfree(logical);
				282	return ret;
				283	}
				284	}
				285
				286	kfree(logical);
				287	}
				288	return 0;
				289	}
				290
				291	static struct btrfs_caching_control *
				292	get_caching_control(struct btrfs_block_group_cache *cache)
				293	{
				294	struct btrfs_caching_control *ctl;
				295
				296	spin_lock(&cache->lock);
				297	if (!cache->caching_ctl) {
				298	spin_unlock(&cache->lock);
				299	return NULL;
				300	}
				301
				302	ctl = cache->caching_ctl;
				303	refcount_inc(&ctl->count);
				304	spin_unlock(&cache->lock);
				305	return ctl;
				306	}
				307
				308	static void put_caching_control(struct btrfs_caching_control *ctl)
				309	{
				310	if (refcount_dec_and_test(&ctl->count))
				311	kfree(ctl);
				312	}
				313
				#ifdef CONFIG_BTRFS_DEBUG
				/*
				 * Debug helper: punch holes into the free space of @block_group.
				 *
				 * Starting at the beginning of the block group, one unit of free space is
				 * removed out of every two, where the unit is fs_info->nodesize for
				 * metadata block groups and fs_info->sectorsize otherwise.
				 */
				static void fragment_free_space(struct btrfs_block_group_cache *block_group)
				{
					struct btrfs_fs_info *fs_info = block_group->fs_info;
					u64 remaining = block_group->key.offset;
					u64 unit;
					u64 stride;
					u64 pos;

					if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA)
						unit = fs_info->nodesize;
					else
						unit = fs_info->sectorsize;
					stride = unit << 1;

					for (pos = block_group->key.objectid; remaining > unit; pos += stride) {
						btrfs_remove_free_space(block_group, pos, unit);
						/* Clamp at zero instead of letting the u64 wrap. */
						remaining = (remaining < stride) ? 0 : remaining - stride;
					}
				}
				#endif
				334
				335	/*
				336	* this is only called by cache_block_group, since we could have freed extents
				337	* we need to check the pinned_extents for any extents that can't be used yet
				338	* since their free space will be released as soon as the transaction commits.
				339	*/
				340	u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
				341	u64 start, u64 end)
				342	{
				343	struct btrfs_fs_info *info = block_group->fs_info;
				344	u64 extent_start, extent_end, size, total_added = 0;
				345	int ret;
				346
				347	while (start < end) {
				348	ret = find_first_extent_bit(info->pinned_extents, start,
				349	&extent_start, &extent_end,
				350	EXTENT_DIRTY \| EXTENT_UPTODATE,
				351	NULL);
				352	if (ret)
				353	break;
				354
				355	if (extent_start <= start) {
				356	start = extent_end + 1;
				357	} else if (extent_start > start && extent_start < end) {
				358	size = extent_start - start;
				359	total_added += size;
				360	ret = btrfs_add_free_space(block_group, start,
				361	size);
				362	BUG_ON(ret); /* -ENOMEM or logic error */
				363	start = extent_end + 1;
				364	} else {
				365	break;
				366	}
				367	}
				368
				369	if (start < end) {
				370	size = end - start;
				371	total_added += size;
				372	ret = btrfs_add_free_space(block_group, start, size);
				373	BUG_ON(ret); /* -ENOMEM or logic error */
				374	}
				375
				376	return total_added;
				377	}
				378
				379	static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
				380	{
				381	struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
				382	struct btrfs_fs_info *fs_info = block_group->fs_info;
				383	struct btrfs_root *extent_root = fs_info->extent_root;
				384	struct btrfs_path *path;
				385	struct extent_buffer *leaf;
				386	struct btrfs_key key;
				387	u64 total_found = 0;
				388	u64 last = 0;
				389	u32 nritems;
				390	int ret;
				391	bool wakeup = true;
				392
				393	path = btrfs_alloc_path();
				394	if (!path)
				395	return -ENOMEM;
				396
				397	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
				398
				399	#ifdef CONFIG_BTRFS_DEBUG
				400	/*
				401	* If we're fragmenting we don't want to make anybody think we can
				402	* allocate from this block group until we've had a chance to fragment
				403	* the free space.
				404	*/
				405	if (btrfs_should_fragment_free_space(block_group))
				406	wakeup = false;
				407	#endif
				408	/*
				409	* We don't want to deadlock with somebody trying to allocate a new
				410	* extent for the extent root while also trying to search the extent
				411	* root to add free space. So we skip locking and search the commit
				412	* root, since its read-only
				413	*/
				414	path->skip_locking = 1;
				415	path->search_commit_root = 1;
				416	path->reada = READA_FORWARD;
				417
				418	key.objectid = last;
				419	key.offset = 0;
				420	key.type = BTRFS_EXTENT_ITEM_KEY;
				421
				422	next:
				423	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
				424	if (ret < 0)
				425	goto out;
				426
				427	leaf = path->nodes[0];
				428	nritems = btrfs_header_nritems(leaf);
				429
				430	while (1) {
				431	if (btrfs_fs_closing(fs_info) > 1) {
				432	last = (u64)-1;
				433	break;
				434	}
				435
				436	if (path->slots[0] < nritems) {
				437	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				438	} else {
				439	ret = find_next_key(path, 0, &key);
				440	if (ret)
				441	break;
				442
				443	if (need_resched() \|\|
				444	rwsem_is_contended(&fs_info->commit_root_sem)) {
				445	if (wakeup)
				446	caching_ctl->progress = last;
				447	btrfs_release_path(path);
				448	up_read(&fs_info->commit_root_sem);
				449	mutex_unlock(&caching_ctl->mutex);
				450	cond_resched();
				451	mutex_lock(&caching_ctl->mutex);
				452	down_read(&fs_info->commit_root_sem);
				453	goto next;
				454	}
				455
				456	ret = btrfs_next_leaf(extent_root, path);
				457	if (ret < 0)
				458	goto out;
				459	if (ret)
				460	break;
				461	leaf = path->nodes[0];
				462	nritems = btrfs_header_nritems(leaf);
				463	continue;
				464	}
				465
				466	if (key.objectid < last) {
				467	key.objectid = last;
				468	key.offset = 0;
				469	key.type = BTRFS_EXTENT_ITEM_KEY;
				470
				471	if (wakeup)
				472	caching_ctl->progress = last;
				473	btrfs_release_path(path);
				474	goto next;
				475	}
				476
				477	if (key.objectid < block_group->key.objectid) {
				478	path->slots[0]++;
				479	continue;
				480	}
				481
				482	if (key.objectid >= block_group->key.objectid +
				483	block_group->key.offset)
				484	break;
				485
				486	if (key.type == BTRFS_EXTENT_ITEM_KEY \|\|
				487	key.type == BTRFS_METADATA_ITEM_KEY) {
				488	total_found += add_new_free_space(block_group, last,
				489	key.objectid);
				490	if (key.type == BTRFS_METADATA_ITEM_KEY)
				491	last = key.objectid +
				492	fs_info->nodesize;
				493	else
				494	last = key.objectid + key.offset;
				495
				496	if (total_found > CACHING_CTL_WAKE_UP) {
				497	total_found = 0;
				498	if (wakeup)
				499	wake_up(&caching_ctl->wait);
				500	}
				501	}
				502	path->slots[0]++;
				503	}
				504	ret = 0;
				505
				506	total_found += add_new_free_space(block_group, last,
				507	block_group->key.objectid +
				508	block_group->key.offset);
				509	caching_ctl->progress = (u64)-1;
				510
				511	out:
				512	btrfs_free_path(path);
				513	return ret;
				514	}
				515
				516	static noinline void caching_thread(struct btrfs_work *work)
				517	{
				518	struct btrfs_block_group_cache *block_group;
				519	struct btrfs_fs_info *fs_info;
				520	struct btrfs_caching_control *caching_ctl;
				521	int ret;
				522
				523	caching_ctl = container_of(work, struct btrfs_caching_control, work);
				524	block_group = caching_ctl->block_group;
				525	fs_info = block_group->fs_info;
				526
				527	mutex_lock(&caching_ctl->mutex);
				528	down_read(&fs_info->commit_root_sem);
				529
				530	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
				531	ret = load_free_space_tree(caching_ctl);
				532	else
				533	ret = load_extent_tree_free(caching_ctl);
				534
				535	spin_lock(&block_group->lock);
				536	block_group->caching_ctl = NULL;
				537	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
				538	spin_unlock(&block_group->lock);
				539
				540	#ifdef CONFIG_BTRFS_DEBUG
				541	if (btrfs_should_fragment_free_space(block_group)) {
				542	u64 bytes_used;
				543
				544	spin_lock(&block_group->space_info->lock);
				545	spin_lock(&block_group->lock);
				546	bytes_used = block_group->key.offset -
				547	btrfs_block_group_used(&block_group->item);
				548	block_group->space_info->bytes_used += bytes_used >> 1;
				549	spin_unlock(&block_group->lock);
				550	spin_unlock(&block_group->space_info->lock);
				551	fragment_free_space(block_group);
				552	}
				553	#endif
				554
				555	caching_ctl->progress = (u64)-1;
				556
				557	up_read(&fs_info->commit_root_sem);
				558	free_excluded_extents(block_group);
				559	mutex_unlock(&caching_ctl->mutex);
				560
				561	wake_up(&caching_ctl->wait);
				562
				563	put_caching_control(caching_ctl);
				564	btrfs_put_block_group(block_group);
				565	}
				566
				567	static int cache_block_group(struct btrfs_block_group_cache *cache,
				568	int load_cache_only)
				569	{
				570	DEFINE_WAIT(wait);
				571	struct btrfs_fs_info *fs_info = cache->fs_info;
				572	struct btrfs_caching_control *caching_ctl;
				573	int ret = 0;
				574
				575	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
				576	if (!caching_ctl)
				577	return -ENOMEM;
				578
				579	INIT_LIST_HEAD(&caching_ctl->list);
				580	mutex_init(&caching_ctl->mutex);
				581	init_waitqueue_head(&caching_ctl->wait);
				582	caching_ctl->block_group = cache;
				583	caching_ctl->progress = cache->key.objectid;
				584	refcount_set(&caching_ctl->count, 1);
				585	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
				586	caching_thread, NULL, NULL);
				587
				588	spin_lock(&cache->lock);
				589	/*
				590	* This should be a rare occasion, but this could happen I think in the
				591	* case where one thread starts to load the space cache info, and then
				592	* some other thread starts a transaction commit which tries to do an
				593	* allocation while the other thread is still loading the space cache
				594	* info. The previous loop should have kept us from choosing this block
				595	* group, but if we've moved to the state where we will wait on caching
				596	* block groups we need to first check if we're doing a fast load here,
				597	* so we can wait for it to finish, otherwise we could end up allocating
				598	* from a block group who's cache gets evicted for one reason or
				599	* another.
				600	*/
				601	while (cache->cached == BTRFS_CACHE_FAST) {
				602	struct btrfs_caching_control *ctl;
				603
				604	ctl = cache->caching_ctl;
				605	refcount_inc(&ctl->count);
				606	prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
				607	spin_unlock(&cache->lock);
				608
				609	schedule();
				610
				611	finish_wait(&ctl->wait, &wait);
				612	put_caching_control(ctl);
				613	spin_lock(&cache->lock);
				614	}
				615
				616	if (cache->cached != BTRFS_CACHE_NO) {
				617	spin_unlock(&cache->lock);
				618	kfree(caching_ctl);
				619	return 0;
				620	}
				621	WARN_ON(cache->caching_ctl);
				622	cache->caching_ctl = caching_ctl;
				623	cache->cached = BTRFS_CACHE_FAST;
				624	spin_unlock(&cache->lock);
				625
				626	if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
				627	mutex_lock(&caching_ctl->mutex);
				628	ret = load_free_space_cache(fs_info, cache);
				629
				630	spin_lock(&cache->lock);
				631	if (ret == 1) {
				632	cache->caching_ctl = NULL;
				633	cache->cached = BTRFS_CACHE_FINISHED;
				634	cache->last_byte_to_unpin = (u64)-1;
				635	caching_ctl->progress = (u64)-1;
				636	} else {
				637	if (load_cache_only) {
				638	cache->caching_ctl = NULL;
				639	cache->cached = BTRFS_CACHE_NO;
				640	} else {
				641	cache->cached = BTRFS_CACHE_STARTED;
				642	cache->has_caching_ctl = 1;
				643	}
				644	}
				645	spin_unlock(&cache->lock);
				646	#ifdef CONFIG_BTRFS_DEBUG
				647	if (ret == 1 &&
				648	btrfs_should_fragment_free_space(cache)) {
				649	u64 bytes_used;
				650
				651	spin_lock(&cache->space_info->lock);
				652	spin_lock(&cache->lock);
				653	bytes_used = cache->key.offset -
				654	btrfs_block_group_used(&cache->item);
				655	cache->space_info->bytes_used += bytes_used >> 1;
				656	spin_unlock(&cache->lock);
				657	spin_unlock(&cache->space_info->lock);
				658	fragment_free_space(cache);
				659	}
				660	#endif
				661	mutex_unlock(&caching_ctl->mutex);
				662
				663	wake_up(&caching_ctl->wait);
				664	if (ret == 1) {
				665	put_caching_control(caching_ctl);
				666	free_excluded_extents(cache);
				667	return 0;
				668	}
				669	} else {
				670	/*
				671	* We're either using the free space tree or no caching at all.
				672	* Set cached to the appropriate value and wakeup any waiters.
				673	*/
				674	spin_lock(&cache->lock);
				675	if (load_cache_only) {
				676	cache->caching_ctl = NULL;
				677	cache->cached = BTRFS_CACHE_NO;
				678	} else {
				679	cache->cached = BTRFS_CACHE_STARTED;
				680	cache->has_caching_ctl = 1;
				681	}
				682	spin_unlock(&cache->lock);
				683	wake_up(&caching_ctl->wait);
				684	}
				685
				686	if (load_cache_only) {
				687	put_caching_control(caching_ctl);
				688	return 0;
				689	}
				690
				691	down_write(&fs_info->commit_root_sem);
				692	refcount_inc(&caching_ctl->count);
				693	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
				694	up_write(&fs_info->commit_root_sem);
				695
				696	btrfs_get_block_group(cache);
				697
				698	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
				699
				700	return ret;
				701	}
				702
				703	/*
				704	* return the block group that starts at or after bytenr
				705	*/
				706	static struct btrfs_block_group_cache *
				707	btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
				708	{
				709	return block_group_cache_tree_search(info, bytenr, 0);
				710	}
				711
				712	/*
				713	* return the block group that contains the given bytenr
				714	*/
				715	struct btrfs_block_group_cache *btrfs_lookup_block_group(
				716	struct btrfs_fs_info *info,
				717	u64 bytenr)
				718	{
				719	return block_group_cache_tree_search(info, bytenr, 1);
				720	}
				721
				722	static struct btrfs_space_info __find_space_info(struct btrfs_fs_info info,
				723	u64 flags)
				724	{
				725	struct list_head *head = &info->space_info;
				726	struct btrfs_space_info *found;
				727
				728	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
				729
				730	rcu_read_lock();
				731	list_for_each_entry_rcu(found, head, list) {
				732	if (found->flags & flags) {
				733	rcu_read_unlock();
				734	return found;
				735	}
				736	}
				737	rcu_read_unlock();
				738	return NULL;
				739	}
				740
				741	static void add_pinned_bytes(struct btrfs_fs_info *fs_info, s64 num_bytes,
				742	bool metadata, u64 root_objectid)
				743	{
				744	struct btrfs_space_info *space_info;
				745	u64 flags;
				746
				747	if (metadata) {
				748	if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
				749	flags = BTRFS_BLOCK_GROUP_SYSTEM;
				750	else
				751	flags = BTRFS_BLOCK_GROUP_METADATA;
				752	} else {
				753	flags = BTRFS_BLOCK_GROUP_DATA;
				754	}
				755
				756	space_info = __find_space_info(fs_info, flags);
				757	ASSERT(space_info);
				758	percpu_counter_add_batch(&space_info->total_bytes_pinned, num_bytes,
				759	BTRFS_TOTAL_BYTES_PINNED_BATCH);
				760	}
				761
				762	/*
				763	* after adding space to the filesystem, we need to clear the full flags
				764	* on all the space infos.
				765	*/
				766	void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
				767	{
				768	struct list_head *head = &info->space_info;
				769	struct btrfs_space_info *found;
				770
				771	rcu_read_lock();
				772	list_for_each_entry_rcu(found, head, list)
				773	found->full = 0;
				774	rcu_read_unlock();
				775	}
				776
				777	/* simple helper to search for an existing data extent at a given offset */
				778	int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
				779	{
				780	int ret;
				781	struct btrfs_key key;
				782	struct btrfs_path *path;
				783
				784	path = btrfs_alloc_path();
				785	if (!path)
				786	return -ENOMEM;
				787
				788	key.objectid = start;
				789	key.offset = len;
				790	key.type = BTRFS_EXTENT_ITEM_KEY;
				791	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
				792	btrfs_free_path(path);
				793	return ret;
				794	}
				795
				796	/*
				797	* helper function to lookup reference count and flags of a tree block.
				798	*
				799	* the head node for delayed ref is used to store the sum of all the
				800	* reference count modifications queued up in the rbtree. the head
				801	* node may also store the extent flags to set. This way you can check
				802	* to see what the reference count and extent flags would be if all of
				803	* the delayed refs are not processed.
				804	*/
				805	int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
				806	struct btrfs_fs_info *fs_info, u64 bytenr,
				807	u64 offset, int metadata, u64 refs, u64 flags)
				808	{
				809	struct btrfs_delayed_ref_head *head;
				810	struct btrfs_delayed_ref_root *delayed_refs;
				811	struct btrfs_path *path;
				812	struct btrfs_extent_item *ei;
				813	struct extent_buffer *leaf;
				814	struct btrfs_key key;
				815	u32 item_size;
				816	u64 num_refs;
				817	u64 extent_flags;
				818	int ret;
				819
				820	/*
				821	* If we don't have skinny metadata, don't bother doing anything
				822	* different
				823	*/
				824	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
				825	offset = fs_info->nodesize;
				826	metadata = 0;
				827	}
				828
				829	path = btrfs_alloc_path();
				830	if (!path)
				831	return -ENOMEM;
				832
				833	if (!trans) {
				834	path->skip_locking = 1;
				835	path->search_commit_root = 1;
				836	}
				837
				838	search_again:
				839	key.objectid = bytenr;
				840	key.offset = offset;
				841	if (metadata)
				842	key.type = BTRFS_METADATA_ITEM_KEY;
				843	else
				844	key.type = BTRFS_EXTENT_ITEM_KEY;
				845
				846	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
				847	if (ret < 0)
				848	goto out_free;
				849
				850	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
				851	if (path->slots[0]) {
				852	path->slots[0]--;
				853	btrfs_item_key_to_cpu(path->nodes[0], &key,
				854	path->slots[0]);
				855	if (key.objectid == bytenr &&
				856	key.type == BTRFS_EXTENT_ITEM_KEY &&
				857	key.offset == fs_info->nodesize)
				858	ret = 0;
				859	}
				860	}
				861
				862	if (ret == 0) {
				863	leaf = path->nodes[0];
				864	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
				865	if (item_size >= sizeof(*ei)) {
				866	ei = btrfs_item_ptr(leaf, path->slots[0],
				867	struct btrfs_extent_item);
				868	num_refs = btrfs_extent_refs(leaf, ei);
				869	extent_flags = btrfs_extent_flags(leaf, ei);
				870	} else {
				871	ret = -EINVAL;
				872	btrfs_print_v0_err(fs_info);
				873	if (trans)
				874	btrfs_abort_transaction(trans, ret);
				875	else
				876	btrfs_handle_fs_error(fs_info, ret, NULL);
				877
				878	goto out_free;
				879	}
				880
				881	BUG_ON(num_refs == 0);
				882	} else {
				883	num_refs = 0;
				884	extent_flags = 0;
				885	ret = 0;
				886	}
				887
				888	if (!trans)
				889	goto out;
				890
				891	delayed_refs = &trans->transaction->delayed_refs;
				892	spin_lock(&delayed_refs->lock);
				893	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
				894	if (head) {
				895	if (!mutex_trylock(&head->mutex)) {
				896	refcount_inc(&head->refs);
				897	spin_unlock(&delayed_refs->lock);
				898
				899	btrfs_release_path(path);
				900
				901	/*
				902	* Mutex was contended, block until it's released and try
				903	* again
				904	*/
				905	mutex_lock(&head->mutex);
				906	mutex_unlock(&head->mutex);
				907	btrfs_put_delayed_ref_head(head);
				908	goto search_again;
				909	}
				910	spin_lock(&head->lock);
				911	if (head->extent_op && head->extent_op->update_flags)
				912	extent_flags \|= head->extent_op->flags_to_set;
				913	else
				914	BUG_ON(num_refs == 0);
				915
				916	num_refs += head->ref_mod;
				917	spin_unlock(&head->lock);
				918	mutex_unlock(&head->mutex);
				919	}
				920	spin_unlock(&delayed_refs->lock);
				921	out:
				922	WARN_ON(num_refs == 0);
				923	if (refs)
				924	*refs = num_refs;
				925	if (flags)
				926	*flags = extent_flags;
				927	out_free:
				928	btrfs_free_path(path);
				929	return ret;
				930	}
				931
				932	/*
				933	* Back reference rules. Back refs have three main goals:
				934	*
				935	* 1) differentiate between all holders of references to an extent so that
				936	* when a reference is dropped we can make sure it was a valid reference
				937	* before freeing the extent.
				938	*
				939	* 2) Provide enough information to quickly find the holders of an extent
				940	* if we notice a given block is corrupted or bad.
				941	*
				942	* 3) Make it easy to migrate blocks for FS shrinking or storage pool
				943	* maintenance. This is actually the same as #2, but with a slightly
				944	* different use case.
				945	*
				946	* There are two kinds of back refs. The implicit back refs is optimized
				947	* for pointers in non-shared tree blocks. For a given pointer in a block,
				948	* back refs of this kind provide information about the block's owner tree
				949	* and the pointer's key. These information allow us to find the block by
				950	* b-tree searching. The full back refs is for pointers in tree blocks not
				951	* referenced by their owner trees. The location of tree block is recorded
				952	* in the back refs. Actually the full back refs is generic, and can be
				953	* used in all cases the implicit back refs is used. The major shortcoming
				954	* of the full back refs is its overhead. Every time a tree block gets
				955	* COWed, we have to update back refs entry for all pointers in it.
				956	*
				957	* For a newly allocated tree block, we use implicit back refs for
				958	* pointers in it. This means most tree related operations only involve
				959	* implicit back refs. For a tree block created in old transaction, the
				960	* only way to drop a reference to it is COW it. So we can detect the
				961	* event that tree block loses its owner tree's reference and do the
				962	* back refs conversion.
				963	*
				964	* When a tree block is COWed through a tree, there are four cases:
				965	*
				966	* The reference count of the block is one and the tree is the block's
				967	* owner tree. Nothing to do in this case.
				968	*
				969	* The reference count of the block is one and the tree is not the
				970	* block's owner tree. In this case, full back refs is used for pointers
				971	* in the block. Remove these full back refs, add implicit back refs for
				972	* every pointers in the new block.
				973	*
				974	* The reference count of the block is greater than one and the tree is
				975	* the block's owner tree. In this case, implicit back refs is used for
				976	* pointers in the block. Add full back refs for every pointers in the
				977	* block, increase lower level extents' reference counts. The original
				978	* implicit back refs are entailed to the new block.
				979	*
				980	* The reference count of the block is greater than one and the tree is
				981	* not the block's owner tree. Add implicit back refs for every pointer in
				982	* the new block, increase lower level extents' reference count.
				983	*
				984	* Back Reference Key composing:
				985	*
				986	* The key objectid corresponds to the first byte in the extent,
				987	* The key type is used to differentiate between types of back refs.
				988	* There are different meanings of the key offset for different types
				989	* of back refs.
				990	*
				991	* File extents can be referenced by:
				992	*
				993	* - multiple snapshots, subvolumes, or different generations in one subvol
				994	* - different files inside a single subvolume
				995	* - different offsets inside a file (bookend extents in file.c)
				996	*
				997	* The extent ref structure for the implicit back refs has fields for:
				998	*
				999	* - Objectid of the subvolume root
				1000	* - objectid of the file holding the reference
				1001	* - original offset in the file
				1002	* - how many bookend extents
				1003	*
				1004	* The key offset for the implicit back refs is hash of the first
				1005	* three fields.
				1006	*
				1007	* The extent ref structure for the full back refs has field for:
				1008	*
				1009	* - number of pointers in the tree leaf
				1010	*
				1011	* The key offset for the full back refs is the first byte of
				1012	* the tree leaf
				1013	*
				1014	* When a file extent is allocated, the implicit back refs are used.
				1015	* The fields are filled in:
				1016	*
				1017	* (root_key.objectid, inode objectid, offset in file, 1)
				1018	*
				1019	* When a file extent is removed by file truncation, we find the
				1020	* corresponding implicit back refs and check the following fields:
				1021	*
				1022	* (btrfs_header_owner(leaf), inode objectid, offset in file)
				1023	*
				1024	* Btree extents can be referenced by:
				1025	*
				1026	* - Different subvolumes
				1027	*
				1028	* Both the implicit back refs and the full back refs for tree blocks
				1029	* only consist of key. The key offset for the implicit back refs is
				1030	* objectid of block's owner tree. The key offset for the full back refs
				1031	* is the first byte of parent block.
				1032	*
				1033	* When implicit back refs is used, information about the lowest key and
				1034	* level of the tree block are required. These information are stored in
				1035	* tree block info structure.
				1036	*/
				1037
				1038	/*
				1039	* is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
				1040	* is_data == BTRFS_REF_TYPE_DATA, data type is requried,
				1041	* is_data == BTRFS_REF_TYPE_ANY, either type is OK.
				1042	*/
				1043	int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
				1044	struct btrfs_extent_inline_ref *iref,
				1045	enum btrfs_inline_ref_type is_data)
				1046	{
				1047	int type = btrfs_extent_inline_ref_type(eb, iref);
				1048	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
				1049
				1050	if (type == BTRFS_TREE_BLOCK_REF_KEY \|\|
				1051	type == BTRFS_SHARED_BLOCK_REF_KEY \|\|
				1052	type == BTRFS_SHARED_DATA_REF_KEY \|\|
				1053	type == BTRFS_EXTENT_DATA_REF_KEY) {
				1054	if (is_data == BTRFS_REF_TYPE_BLOCK) {
				1055	if (type == BTRFS_TREE_BLOCK_REF_KEY)
				1056	return type;
				1057	if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
				1058	ASSERT(eb->fs_info);
				1059	/*
				1060	* Every shared one has parent tree
				1061	* block, which must be aligned to
				1062	* nodesize.
				1063	*/
				1064	if (offset &&
				1065	IS_ALIGNED(offset, eb->fs_info->nodesize))
				1066	return type;
				1067	}
				1068	} else if (is_data == BTRFS_REF_TYPE_DATA) {
				1069	if (type == BTRFS_EXTENT_DATA_REF_KEY)
				1070	return type;
				1071	if (type == BTRFS_SHARED_DATA_REF_KEY) {
				1072	ASSERT(eb->fs_info);
				1073	/*
				1074	* Every shared one has parent tree
				1075	* block, which must be aligned to
				1076	* nodesize.
				1077	*/
				1078	if (offset &&
				1079	IS_ALIGNED(offset, eb->fs_info->nodesize))
				1080	return type;
				1081	}
				1082	} else {
				1083	ASSERT(is_data == BTRFS_REF_TYPE_ANY);
				1084	return type;
				1085	}
				1086	}
				1087
				1088	btrfs_print_leaf((struct extent_buffer *)eb);
				1089	btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
				1090	eb->start, type);
				1091	WARN_ON(1);
				1092
				1093	return BTRFS_REF_TYPE_INVALID;
				1094	}
				1095
				1096	static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
				1097	{
				1098	u32 high_crc = ~(u32)0;
				1099	u32 low_crc = ~(u32)0;
				1100	__le64 lenum;
				1101
				1102	lenum = cpu_to_le64(root_objectid);
				1103	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
				1104	lenum = cpu_to_le64(owner);
				1105	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
				1106	lenum = cpu_to_le64(offset);
				1107	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
				1108
				1109	return ((u64)high_crc << 31) ^ (u64)low_crc;
				1110	}
				1111
				1112	static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
				1113	struct btrfs_extent_data_ref *ref)
				1114	{
				1115	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
				1116	btrfs_extent_data_ref_objectid(leaf, ref),
				1117	btrfs_extent_data_ref_offset(leaf, ref));
				1118	}
				1119
				1120	static int match_extent_data_ref(struct extent_buffer *leaf,
				1121	struct btrfs_extent_data_ref *ref,
				1122	u64 root_objectid, u64 owner, u64 offset)
				1123	{
				1124	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid \|\|
				1125	btrfs_extent_data_ref_objectid(leaf, ref) != owner \|\|
				1126	btrfs_extent_data_ref_offset(leaf, ref) != offset)
				1127	return 0;
				1128	return 1;
				1129	}
				1130
				1131	static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
				1132	struct btrfs_path *path,
				1133	u64 bytenr, u64 parent,
				1134	u64 root_objectid,
				1135	u64 owner, u64 offset)
				1136	{
				1137	struct btrfs_root *root = trans->fs_info->extent_root;
				1138	struct btrfs_key key;
				1139	struct btrfs_extent_data_ref *ref;
				1140	struct extent_buffer *leaf;
				1141	u32 nritems;
				1142	int ret;
				1143	int recow;
				1144	int err = -ENOENT;
				1145
				1146	key.objectid = bytenr;
				1147	if (parent) {
				1148	key.type = BTRFS_SHARED_DATA_REF_KEY;
				1149	key.offset = parent;
				1150	} else {
				1151	key.type = BTRFS_EXTENT_DATA_REF_KEY;
				1152	key.offset = hash_extent_data_ref(root_objectid,
				1153	owner, offset);
				1154	}
				1155	again:
				1156	recow = 0;
				1157	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				1158	if (ret < 0) {
				1159	err = ret;
				1160	goto fail;
				1161	}
				1162
				1163	if (parent) {
				1164	if (!ret)
				1165	return 0;
				1166	goto fail;
				1167	}
				1168
				1169	leaf = path->nodes[0];
				1170	nritems = btrfs_header_nritems(leaf);
				1171	while (1) {
				1172	if (path->slots[0] >= nritems) {
				1173	ret = btrfs_next_leaf(root, path);
				1174	if (ret < 0)
				1175	err = ret;
				1176	if (ret)
				1177	goto fail;
				1178
				1179	leaf = path->nodes[0];
				1180	nritems = btrfs_header_nritems(leaf);
				1181	recow = 1;
				1182	}
				1183
				1184	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				1185	if (key.objectid != bytenr \|\|
				1186	key.type != BTRFS_EXTENT_DATA_REF_KEY)
				1187	goto fail;
				1188
				1189	ref = btrfs_item_ptr(leaf, path->slots[0],
				1190	struct btrfs_extent_data_ref);
				1191
				1192	if (match_extent_data_ref(leaf, ref, root_objectid,
				1193	owner, offset)) {
				1194	if (recow) {
				1195	btrfs_release_path(path);
				1196	goto again;
				1197	}
				1198	err = 0;
				1199	break;
				1200	}
				1201	path->slots[0]++;
				1202	}
				1203	fail:
				1204	return err;
				1205	}
				1206
				1207	static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
				1208	struct btrfs_path *path,
				1209	u64 bytenr, u64 parent,
				1210	u64 root_objectid, u64 owner,
				1211	u64 offset, int refs_to_add)
				1212	{
				1213	struct btrfs_root *root = trans->fs_info->extent_root;
				1214	struct btrfs_key key;
				1215	struct extent_buffer *leaf;
				1216	u32 size;
				1217	u32 num_refs;
				1218	int ret;
				1219
				1220	key.objectid = bytenr;
				1221	if (parent) {
				1222	key.type = BTRFS_SHARED_DATA_REF_KEY;
				1223	key.offset = parent;
				1224	size = sizeof(struct btrfs_shared_data_ref);
				1225	} else {
				1226	key.type = BTRFS_EXTENT_DATA_REF_KEY;
				1227	key.offset = hash_extent_data_ref(root_objectid,
				1228	owner, offset);
				1229	size = sizeof(struct btrfs_extent_data_ref);
				1230	}
				1231
				1232	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
				1233	if (ret && ret != -EEXIST)
				1234	goto fail;
				1235
				1236	leaf = path->nodes[0];
				1237	if (parent) {
				1238	struct btrfs_shared_data_ref *ref;
				1239	ref = btrfs_item_ptr(leaf, path->slots[0],
				1240	struct btrfs_shared_data_ref);
				1241	if (ret == 0) {
				1242	btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
				1243	} else {
				1244	num_refs = btrfs_shared_data_ref_count(leaf, ref);
				1245	num_refs += refs_to_add;
				1246	btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
				1247	}
				1248	} else {
				1249	struct btrfs_extent_data_ref *ref;
				1250	while (ret == -EEXIST) {
				1251	ref = btrfs_item_ptr(leaf, path->slots[0],
				1252	struct btrfs_extent_data_ref);
				1253	if (match_extent_data_ref(leaf, ref, root_objectid,
				1254	owner, offset))
				1255	break;
				1256	btrfs_release_path(path);
				1257	key.offset++;
				1258	ret = btrfs_insert_empty_item(trans, root, path, &key,
				1259	size);
				1260	if (ret && ret != -EEXIST)
				1261	goto fail;
				1262
				1263	leaf = path->nodes[0];
				1264	}
				1265	ref = btrfs_item_ptr(leaf, path->slots[0],
				1266	struct btrfs_extent_data_ref);
				1267	if (ret == 0) {
				1268	btrfs_set_extent_data_ref_root(leaf, ref,
				1269	root_objectid);
				1270	btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
				1271	btrfs_set_extent_data_ref_offset(leaf, ref, offset);
				1272	btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
				1273	} else {
				1274	num_refs = btrfs_extent_data_ref_count(leaf, ref);
				1275	num_refs += refs_to_add;
				1276	btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
				1277	}
				1278	}
				1279	btrfs_mark_buffer_dirty(leaf);
				1280	ret = 0;
				1281	fail:
				1282	btrfs_release_path(path);
				1283	return ret;
				1284	}
				1285
				1286	static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
				1287	struct btrfs_path *path,
				1288	int refs_to_drop, int *last_ref)
				1289	{
				1290	struct btrfs_key key;
				1291	struct btrfs_extent_data_ref *ref1 = NULL;
				1292	struct btrfs_shared_data_ref *ref2 = NULL;
				1293	struct extent_buffer *leaf;
				1294	u32 num_refs = 0;
				1295	int ret = 0;
				1296
				1297	leaf = path->nodes[0];
				1298	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				1299
				1300	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
				1301	ref1 = btrfs_item_ptr(leaf, path->slots[0],
				1302	struct btrfs_extent_data_ref);
				1303	num_refs = btrfs_extent_data_ref_count(leaf, ref1);
				1304	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
				1305	ref2 = btrfs_item_ptr(leaf, path->slots[0],
				1306	struct btrfs_shared_data_ref);
				1307	num_refs = btrfs_shared_data_ref_count(leaf, ref2);
				1308	} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
				1309	btrfs_print_v0_err(trans->fs_info);
				1310	btrfs_abort_transaction(trans, -EINVAL);
				1311	return -EINVAL;
				1312	} else {
				1313	BUG();
				1314	}
				1315
				1316	BUG_ON(num_refs < refs_to_drop);
				1317	num_refs -= refs_to_drop;
				1318
				1319	if (num_refs == 0) {
				1320	ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
				1321	*last_ref = 1;
				1322	} else {
				1323	if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
				1324	btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
				1325	else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
				1326	btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
				1327	btrfs_mark_buffer_dirty(leaf);
				1328	}
				1329	return ret;
				1330	}
				1331
				1332	static noinline u32 extent_data_ref_count(struct btrfs_path *path,
				1333	struct btrfs_extent_inline_ref *iref)
				1334	{
				1335	struct btrfs_key key;
				1336	struct extent_buffer *leaf;
				1337	struct btrfs_extent_data_ref *ref1;
				1338	struct btrfs_shared_data_ref *ref2;
				1339	u32 num_refs = 0;
				1340	int type;
				1341
				1342	leaf = path->nodes[0];
				1343	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				1344
				1345	BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
				1346	if (iref) {
				1347	/*
				1348	* If type is invalid, we should have bailed out earlier than
				1349	* this call.
				1350	*/
				1351	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
				1352	ASSERT(type != BTRFS_REF_TYPE_INVALID);
				1353	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
				1354	ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
				1355	num_refs = btrfs_extent_data_ref_count(leaf, ref1);
				1356	} else {
				1357	ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
				1358	num_refs = btrfs_shared_data_ref_count(leaf, ref2);
				1359	}
				1360	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
				1361	ref1 = btrfs_item_ptr(leaf, path->slots[0],
				1362	struct btrfs_extent_data_ref);
				1363	num_refs = btrfs_extent_data_ref_count(leaf, ref1);
				1364	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
				1365	ref2 = btrfs_item_ptr(leaf, path->slots[0],
				1366	struct btrfs_shared_data_ref);
				1367	num_refs = btrfs_shared_data_ref_count(leaf, ref2);
				1368	} else {
				1369	WARN_ON(1);
				1370	}
				1371	return num_refs;
				1372	}
				1373
				1374	static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
				1375	struct btrfs_path *path,
				1376	u64 bytenr, u64 parent,
				1377	u64 root_objectid)
				1378	{
				1379	struct btrfs_root *root = trans->fs_info->extent_root;
				1380	struct btrfs_key key;
				1381	int ret;
				1382
				1383	key.objectid = bytenr;
				1384	if (parent) {
				1385	key.type = BTRFS_SHARED_BLOCK_REF_KEY;
				1386	key.offset = parent;
				1387	} else {
				1388	key.type = BTRFS_TREE_BLOCK_REF_KEY;
				1389	key.offset = root_objectid;
				1390	}
				1391
				1392	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				1393	if (ret > 0)
				1394	ret = -ENOENT;
				1395	return ret;
				1396	}
				1397
				1398	static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
				1399	struct btrfs_path *path,
				1400	u64 bytenr, u64 parent,
				1401	u64 root_objectid)
				1402	{
				1403	struct btrfs_key key;
				1404	int ret;
				1405
				1406	key.objectid = bytenr;
				1407	if (parent) {
				1408	key.type = BTRFS_SHARED_BLOCK_REF_KEY;
				1409	key.offset = parent;
				1410	} else {
				1411	key.type = BTRFS_TREE_BLOCK_REF_KEY;
				1412	key.offset = root_objectid;
				1413	}
				1414
				1415	ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
				1416	path, &key, 0);
				1417	btrfs_release_path(path);
				1418	return ret;
				1419	}
				1420
				1421	static inline int extent_ref_type(u64 parent, u64 owner)
				1422	{
				1423	int type;
				1424	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
				1425	if (parent > 0)
				1426	type = BTRFS_SHARED_BLOCK_REF_KEY;
				1427	else
				1428	type = BTRFS_TREE_BLOCK_REF_KEY;
				1429	} else {
				1430	if (parent > 0)
				1431	type = BTRFS_SHARED_DATA_REF_KEY;
				1432	else
				1433	type = BTRFS_EXTENT_DATA_REF_KEY;
				1434	}
				1435	return type;
				1436	}
				1437
				1438	static int find_next_key(struct btrfs_path *path, int level,
				1439	struct btrfs_key *key)
				1440
				1441	{
				1442	for (; level < BTRFS_MAX_LEVEL; level++) {
				1443	if (!path->nodes[level])
				1444	break;
				1445	if (path->slots[level] + 1 >=
				1446	btrfs_header_nritems(path->nodes[level]))
				1447	continue;
				1448	if (level == 0)
				1449	btrfs_item_key_to_cpu(path->nodes[level], key,
				1450	path->slots[level] + 1);
				1451	else
				1452	btrfs_node_key_to_cpu(path->nodes[level], key,
				1453	path->slots[level] + 1);
				1454	return 0;
				1455	}
				1456	return 1;
				1457	}
				1458
				1459	/*
				1460	* look for inline back ref. if back ref is found, *ref_ret is set
				1461	* to the address of inline back ref, and 0 is returned.
				1462	*
				1463	* if back ref isn't found, *ref_ret is set to the address where it
				1464	* should be inserted, and -ENOENT is returned.
				1465	*
				1466	* if insert is true and there are too many inline back refs, the path
				1467	* points to the extent item, and -EAGAIN is returned.
				1468	*
				1469	* NOTE: inline back refs are ordered in the same way that back ref
				1470	* items in the tree are ordered.
				1471	*/
				1472	static noinline_for_stack
				1473	int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				1474	struct btrfs_path *path,
				1475	struct btrfs_extent_inline_ref **ref_ret,
				1476	u64 bytenr, u64 num_bytes,
				1477	u64 parent, u64 root_objectid,
				1478	u64 owner, u64 offset, int insert)
				1479	{
				1480	struct btrfs_fs_info *fs_info = trans->fs_info;
				1481	struct btrfs_root *root = fs_info->extent_root;
				1482	struct btrfs_key key;
				1483	struct extent_buffer *leaf;
				1484	struct btrfs_extent_item *ei;
				1485	struct btrfs_extent_inline_ref *iref;
				1486	u64 flags;
				1487	u64 item_size;
				1488	unsigned long ptr;
				1489	unsigned long end;
				1490	int extra_size;
				1491	int type;
				1492	int want;
				1493	int ret;
				1494	int err = 0;
				1495	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
				1496	int needed;
				1497
				1498	key.objectid = bytenr;
				1499	key.type = BTRFS_EXTENT_ITEM_KEY;
				1500	key.offset = num_bytes;
				1501
				1502	want = extent_ref_type(parent, owner);
				1503	if (insert) {
				1504	extra_size = btrfs_extent_inline_ref_size(want);
				1505	path->keep_locks = 1;
				1506	} else
				1507	extra_size = -1;
				1508
				1509	/*
				1510	* Owner is our level, so we can just add one to get the level for the
				1511	* block we are interested in.
				1512	*/
				1513	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
				1514	key.type = BTRFS_METADATA_ITEM_KEY;
				1515	key.offset = owner;
				1516	}
				1517
				1518	again:
				1519	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
				1520	if (ret < 0) {
				1521	err = ret;
				1522	goto out;
				1523	}
				1524
				1525	/*
				1526	* We may be a newly converted file system which still has the old fat
				1527	* extent entries for metadata, so try and see if we have one of those.
				1528	*/
				1529	if (ret > 0 && skinny_metadata) {
				1530	skinny_metadata = false;
				1531	if (path->slots[0]) {
				1532	path->slots[0]--;
				1533	btrfs_item_key_to_cpu(path->nodes[0], &key,
				1534	path->slots[0]);
				1535	if (key.objectid == bytenr &&
				1536	key.type == BTRFS_EXTENT_ITEM_KEY &&
				1537	key.offset == num_bytes)
				1538	ret = 0;
				1539	}
				1540	if (ret) {
				1541	key.objectid = bytenr;
				1542	key.type = BTRFS_EXTENT_ITEM_KEY;
				1543	key.offset = num_bytes;
				1544	btrfs_release_path(path);
				1545	goto again;
				1546	}
				1547	}
				1548
				1549	if (ret && !insert) {
				1550	err = -ENOENT;
				1551	goto out;
				1552	} else if (WARN_ON(ret)) {
				1553	err = -EIO;
				1554	goto out;
				1555	}
				1556
				1557	leaf = path->nodes[0];
				1558	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
				1559	if (unlikely(item_size < sizeof(*ei))) {
				1560	err = -EINVAL;
				1561	btrfs_print_v0_err(fs_info);
				1562	btrfs_abort_transaction(trans, err);
				1563	goto out;
				1564	}
				1565
				1566	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
				1567	flags = btrfs_extent_flags(leaf, ei);
				1568
				1569	ptr = (unsigned long)(ei + 1);
				1570	end = (unsigned long)ei + item_size;
				1571
				1572	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
				1573	ptr += sizeof(struct btrfs_tree_block_info);
				1574	BUG_ON(ptr > end);
				1575	}
				1576
				1577	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
				1578	needed = BTRFS_REF_TYPE_DATA;
				1579	else
				1580	needed = BTRFS_REF_TYPE_BLOCK;
				1581
				1582	err = -ENOENT;
				1583	while (1) {
				1584	if (ptr >= end) {
				1585	WARN_ON(ptr > end);
				1586	break;
				1587	}
				1588	iref = (struct btrfs_extent_inline_ref *)ptr;
				1589	type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
				1590	if (type == BTRFS_REF_TYPE_INVALID) {
				1591	err = -EUCLEAN;
				1592	goto out;
				1593	}
				1594
				1595	if (want < type)
				1596	break;
				1597	if (want > type) {
				1598	ptr += btrfs_extent_inline_ref_size(type);
				1599	continue;
				1600	}
				1601
				1602	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
				1603	struct btrfs_extent_data_ref *dref;
				1604	dref = (struct btrfs_extent_data_ref *)(&iref->offset);
				1605	if (match_extent_data_ref(leaf, dref, root_objectid,
				1606	owner, offset)) {
				1607	err = 0;
				1608	break;
				1609	}
				1610	if (hash_extent_data_ref_item(leaf, dref) <
				1611	hash_extent_data_ref(root_objectid, owner, offset))
				1612	break;
				1613	} else {
				1614	u64 ref_offset;
				1615	ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
				1616	if (parent > 0) {
				1617	if (parent == ref_offset) {
				1618	err = 0;
				1619	break;
				1620	}
				1621	if (ref_offset < parent)
				1622	break;
				1623	} else {
				1624	if (root_objectid == ref_offset) {
				1625	err = 0;
				1626	break;
				1627	}
				1628	if (ref_offset < root_objectid)
				1629	break;
				1630	}
				1631	}
				1632	ptr += btrfs_extent_inline_ref_size(type);
				1633	}
				1634	if (err == -ENOENT && insert) {
				1635	if (item_size + extra_size >=
				1636	BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
				1637	err = -EAGAIN;
				1638	goto out;
				1639	}
				1640	/*
				1641	* To add new inline back ref, we have to make sure
				1642	* there is no corresponding back ref item.
				1643	* For simplicity, we just do not add new inline back
				1644	* ref if there is any kind of item for this block
				1645	*/
				1646	if (find_next_key(path, 0, &key) == 0 &&
				1647	key.objectid == bytenr &&
				1648	key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
				1649	err = -EAGAIN;
				1650	goto out;
				1651	}
				1652	}
				1653	ref_ret = (struct btrfs_extent_inline_ref )ptr;
				1654	out:
				1655	if (insert) {
				1656	path->keep_locks = 0;
				1657	btrfs_unlock_up_safe(path, 1);
				1658	}
				1659	return err;
				1660	}
				1661
				1662	/*
				1663	* helper to add new inline back ref
				1664	*/
				1665	static noinline_for_stack
				1666	void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
				1667	struct btrfs_path *path,
				1668	struct btrfs_extent_inline_ref *iref,
				1669	u64 parent, u64 root_objectid,
				1670	u64 owner, u64 offset, int refs_to_add,
				1671	struct btrfs_delayed_extent_op *extent_op)
				1672	{
				1673	struct extent_buffer *leaf;
				1674	struct btrfs_extent_item *ei;
				1675	unsigned long ptr;
				1676	unsigned long end;
				1677	unsigned long item_offset;
				1678	u64 refs;
				1679	int size;
				1680	int type;
				1681
				1682	leaf = path->nodes[0];
				1683	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
				1684	item_offset = (unsigned long)iref - (unsigned long)ei;
				1685
				1686	type = extent_ref_type(parent, owner);
				1687	size = btrfs_extent_inline_ref_size(type);
				1688
				1689	btrfs_extend_item(fs_info, path, size);
				1690
				1691	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
				1692	refs = btrfs_extent_refs(leaf, ei);
				1693	refs += refs_to_add;
				1694	btrfs_set_extent_refs(leaf, ei, refs);
				1695	if (extent_op)
				1696	__run_delayed_extent_op(extent_op, leaf, ei);
				1697
				1698	ptr = (unsigned long)ei + item_offset;
				1699	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
				1700	if (ptr < end - size)
				1701	memmove_extent_buffer(leaf, ptr + size, ptr,
				1702	end - size - ptr);
				1703
				1704	iref = (struct btrfs_extent_inline_ref *)ptr;
				1705	btrfs_set_extent_inline_ref_type(leaf, iref, type);
				1706	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
				1707	struct btrfs_extent_data_ref *dref;
				1708	dref = (struct btrfs_extent_data_ref *)(&iref->offset);
				1709	btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
				1710	btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
				1711	btrfs_set_extent_data_ref_offset(leaf, dref, offset);
				1712	btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
				1713	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
				1714	struct btrfs_shared_data_ref *sref;
				1715	sref = (struct btrfs_shared_data_ref *)(iref + 1);
				1716	btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
				1717	btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
				1718	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
				1719	btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
				1720	} else {
				1721	btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
				1722	}
				1723	btrfs_mark_buffer_dirty(leaf);
				1724	}
				1725
				1726	static int lookup_extent_backref(struct btrfs_trans_handle *trans,
				1727	struct btrfs_path *path,
				1728	struct btrfs_extent_inline_ref **ref_ret,
				1729	u64 bytenr, u64 num_bytes, u64 parent,
				1730	u64 root_objectid, u64 owner, u64 offset)
				1731	{
				1732	int ret;
				1733
				1734	ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
				1735	num_bytes, parent, root_objectid,
				1736	owner, offset, 0);
				1737	if (ret != -ENOENT)
				1738	return ret;
				1739
				1740	btrfs_release_path(path);
				1741	*ref_ret = NULL;
				1742
				1743	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
				1744	ret = lookup_tree_block_ref(trans, path, bytenr, parent,
				1745	root_objectid);
				1746	} else {
				1747	ret = lookup_extent_data_ref(trans, path, bytenr, parent,
				1748	root_objectid, owner, offset);
				1749	}
				1750	return ret;
				1751	}
				1752
				1753	/*
				1754	* helper to update/remove inline back ref
				1755	*/
				1756	static noinline_for_stack
				1757	void update_inline_extent_backref(struct btrfs_path *path,
				1758	struct btrfs_extent_inline_ref *iref,
				1759	int refs_to_mod,
				1760	struct btrfs_delayed_extent_op *extent_op,
				1761	int *last_ref)
				1762	{
				1763	struct extent_buffer *leaf = path->nodes[0];
				1764	struct btrfs_fs_info *fs_info = leaf->fs_info;
				1765	struct btrfs_extent_item *ei;
				1766	struct btrfs_extent_data_ref *dref = NULL;
				1767	struct btrfs_shared_data_ref *sref = NULL;
				1768	unsigned long ptr;
				1769	unsigned long end;
				1770	u32 item_size;
				1771	int size;
				1772	int type;
				1773	u64 refs;
				1774
				1775	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
				1776	refs = btrfs_extent_refs(leaf, ei);
				1777	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
				1778	refs += refs_to_mod;
				1779	btrfs_set_extent_refs(leaf, ei, refs);
				1780	if (extent_op)
				1781	__run_delayed_extent_op(extent_op, leaf, ei);
				1782
				1783	/*
				1784	* If type is invalid, we should have bailed out after
				1785	* lookup_inline_extent_backref().
				1786	*/
				1787	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
				1788	ASSERT(type != BTRFS_REF_TYPE_INVALID);
				1789
				1790	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
				1791	dref = (struct btrfs_extent_data_ref *)(&iref->offset);
				1792	refs = btrfs_extent_data_ref_count(leaf, dref);
				1793	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
				1794	sref = (struct btrfs_shared_data_ref *)(iref + 1);
				1795	refs = btrfs_shared_data_ref_count(leaf, sref);
				1796	} else {
				1797	refs = 1;
				1798	BUG_ON(refs_to_mod != -1);
				1799	}
				1800
				1801	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
				1802	refs += refs_to_mod;
				1803
				1804	if (refs > 0) {
				1805	if (type == BTRFS_EXTENT_DATA_REF_KEY)
				1806	btrfs_set_extent_data_ref_count(leaf, dref, refs);
				1807	else
				1808	btrfs_set_shared_data_ref_count(leaf, sref, refs);
				1809	} else {
				1810	*last_ref = 1;
				1811	size = btrfs_extent_inline_ref_size(type);
				1812	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
				1813	ptr = (unsigned long)iref;
				1814	end = (unsigned long)ei + item_size;
				1815	if (ptr + size < end)
				1816	memmove_extent_buffer(leaf, ptr, ptr + size,
				1817	end - ptr - size);
				1818	item_size -= size;
				1819	btrfs_truncate_item(fs_info, path, item_size, 1);
				1820	}
				1821	btrfs_mark_buffer_dirty(leaf);
				1822	}
				1823
				1824	static noinline_for_stack
				1825	int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
				1826	struct btrfs_path *path,
				1827	u64 bytenr, u64 num_bytes, u64 parent,
				1828	u64 root_objectid, u64 owner,
				1829	u64 offset, int refs_to_add,
				1830	struct btrfs_delayed_extent_op *extent_op)
				1831	{
				1832	struct btrfs_extent_inline_ref *iref;
				1833	int ret;
				1834
				1835	ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
				1836	num_bytes, parent, root_objectid,
				1837	owner, offset, 1);
				1838	if (ret == 0) {
				1839	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
				1840	update_inline_extent_backref(path, iref, refs_to_add,
				1841	extent_op, NULL);
				1842	} else if (ret == -ENOENT) {
				1843	setup_inline_extent_backref(trans->fs_info, path, iref, parent,
				1844	root_objectid, owner, offset,
				1845	refs_to_add, extent_op);
				1846	ret = 0;
				1847	}
				1848	return ret;
				1849	}
				1850
				1851	static int insert_extent_backref(struct btrfs_trans_handle *trans,
				1852	struct btrfs_path *path,
				1853	u64 bytenr, u64 parent, u64 root_objectid,
				1854	u64 owner, u64 offset, int refs_to_add)
				1855	{
				1856	int ret;
				1857	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
				1858	BUG_ON(refs_to_add != 1);
				1859	ret = insert_tree_block_ref(trans, path, bytenr, parent,
				1860	root_objectid);
				1861	} else {
				1862	ret = insert_extent_data_ref(trans, path, bytenr, parent,
				1863	root_objectid, owner, offset,
				1864	refs_to_add);
				1865	}
				1866	return ret;
				1867	}
				1868
				1869	static int remove_extent_backref(struct btrfs_trans_handle *trans,
				1870	struct btrfs_path *path,
				1871	struct btrfs_extent_inline_ref *iref,
				1872	int refs_to_drop, int is_data, int *last_ref)
				1873	{
				1874	int ret = 0;
				1875
				1876	BUG_ON(!is_data && refs_to_drop != 1);
				1877	if (iref) {
				1878	update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
				1879	last_ref);
				1880	} else if (is_data) {
				1881	ret = remove_extent_data_ref(trans, path, refs_to_drop,
				1882	last_ref);
				1883	} else {
				1884	*last_ref = 1;
				1885	ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
				1886	}
				1887	return ret;
				1888	}
				1889
				/* True when @b lies in the half-open interval [@first, @first + @len). */
				1890	#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
				/*
				 * Discard the byte range [@start, @start + @len) on @bdev while leaving
				 * every superblock copy location untouched.
				 *
				 * @start is rounded up to a 512-byte boundary (with a WARN if it was not
				 * already aligned) and @len is trimmed accordingly.  The range is then
				 * cut around each of the BTRFS_SUPER_MIRROR_MAX superblock locations
				 * (btrfs_sb_offset(j) .. + BTRFS_SUPER_INFO_SIZE) and the remaining pieces
				 * are handed to blkdev_issue_discard().
				 *
				 * @discarded_bytes: out parameter, reset to 0 and then increased by the
				 * size of every piece whose discard returned 0.
				 *
				 * Returns 0 or the result of a blkdev_issue_discard() call.  Inside the
				 * superblock loop -EOPNOTSUPP does not stop processing, but it can still
				 * be the value returned if no later discard overwrites it.
				 */
				1891	static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
				1892	u64 *discarded_bytes)
				1893	{
				1894	int j, ret = 0;
				1895	u64 bytes_left, end;
				1896	u64 aligned_start = ALIGN(start, 1 << 9);
				1897
				1898	if (WARN_ON(start != aligned_start)) {
				1899	len -= aligned_start - start;
				1900	len = round_down(len, 1 << 9);
				1901	start = aligned_start;
				1902	}
				1903
				1904	*discarded_bytes = 0;
				1905
				1906	if (!len)
				1907	return 0;
				1908
				1909	end = start + len;
				1910	bytes_left = len;
				1911
				1912	/* Skip any superblocks on this device. */
				1913	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
				1914	u64 sb_start = btrfs_sb_offset(j);
				1915	u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
				/* Bytes in front of this superblock; only used when sb_start > start. */
				1916	u64 size = sb_start - start;
				1917
				/* No overlap between this superblock and what is left of the range. */
				1918	if (!in_range(sb_start, start, bytes_left) &&
				1919	!in_range(sb_end, start, bytes_left) &&
				1920	!in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
				1921	continue;
				1922
				1923	/*
				1924	* Superblock spans beginning of range. Adjust start and
				1925	* try again.
				1926	*/
				1927	if (sb_start <= start) {
				1928	start += sb_end - start;
				1929	if (start > end) {
				1930	bytes_left = 0;
				1931	break;
				1932	}
				1933	bytes_left = end - start;
				1934	continue;
				1935	}
				1936
				/* Discard the piece in front of the superblock. */
				1937	if (size) {
				1938	ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
				1939	GFP_NOFS, 0);
				1940	if (!ret)
				1941	*discarded_bytes += size;
				1942	else if (ret != -EOPNOTSUPP)
				1943	return ret;
				1944	}
				1945
				/* Resume right behind the superblock. */
				1946	start = sb_end;
				1947	if (start > end) {
				1948	bytes_left = 0;
				1949	break;
				1950	}
				1951	bytes_left = end - start;
				1952	}
				1953
				/* Whatever remains after the last superblock that was cut out. */
				1954	if (bytes_left) {
				1955	ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
				1956	GFP_NOFS, 0);
				1957	if (!ret)
				1958	*discarded_bytes += bytes_left;
				1959	}
				1960	return ret;
				1961	}
				1962
				/*
				 * Discard the logical range [@bytenr, @bytenr + @num_bytes) on every
				 * stripe it maps to.
				 *
				 * The range is mapped with btrfs_map_block(BTRFS_MAP_DISCARD) and each
				 * resulting stripe is passed to btrfs_issue_discard().  Stripes without a
				 * block device, or whose request queue does not support discard, are
				 * skipped.
				 *
				 * @actual_bytes: optional out parameter; when non-NULL it receives the sum
				 * of the bytes reported as discarded for the stripes.
				 *
				 * Returns 0 on success (including when discard is unsupported), or the
				 * error from btrfs_map_block()/btrfs_issue_discard().
				 */
				1963	int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
				1964	u64 num_bytes, u64 *actual_bytes)
				1965	{
				1966	int ret;
				1967	u64 discarded_bytes = 0;
				1968	struct btrfs_bio *bbio = NULL;
				1969
				1970
				1971	/*
				1972	* Avoid races with device replace and make sure our bbio has devices
				1973	* associated to its stripes that don't go away while we are discarding.
				1974	*/
				1975	btrfs_bio_counter_inc_blocked(fs_info);
				1976	/* Tell the block device(s) that the sectors can be discarded */
				1977	ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
				1978	&bbio, 0);
				1979	/* Error condition is -ENOMEM */
				1980	if (!ret) {
				1981	struct btrfs_bio_stripe *stripe = bbio->stripes;
				1982	int i;
				1983
				1984
				1985	for (i = 0; i < bbio->num_stripes; i++, stripe++) {
				1986	u64 bytes;
				1987	struct request_queue *req_q;
				1988
				/* A stripe without a bdev is only expected on a DEGRADED mount. */
				1989	if (!stripe->dev->bdev) {
				1990	ASSERT(btrfs_test_opt(fs_info, DEGRADED));
				1991	continue;
				1992	}
				1993	req_q = bdev_get_queue(stripe->dev->bdev);
				1994	if (!blk_queue_discard(req_q))
				1995	continue;
				1996
				1997	ret = btrfs_issue_discard(stripe->dev->bdev,
				1998	stripe->physical,
				1999	stripe->length,
				2000	&bytes);
				2001	if (!ret)
				2002	discarded_bytes += bytes;
				2003	else if (ret != -EOPNOTSUPP)
				2004	break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
				2005
				2006	/*
				2007	* Just in case we get back EOPNOTSUPP for some reason,
				2008	* just ignore the return value so we don't screw up
				2009	* people calling discard_extent.
				2010	*/
				2011	ret = 0;
				2012	}
				2013	btrfs_put_bbio(bbio);
				2014	}
				2015	btrfs_bio_counter_dec(fs_info);
				2016
				2017	if (actual_bytes)
				2018	*actual_bytes = discarded_bytes;
				2019
				2020
				/* Covers -EOPNOTSUPP coming straight from btrfs_map_block(). */
				2021	if (ret == -EOPNOTSUPP)
				2022	ret = 0;
				2023	return ret;
				2024	}
				2025
				2026	/* Can return -ENOMEM */
				2027	int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
				2028	struct btrfs_root *root,
				2029	u64 bytenr, u64 num_bytes, u64 parent,
				2030	u64 root_objectid, u64 owner, u64 offset)
				2031	{
				2032	struct btrfs_fs_info *fs_info = root->fs_info;
				2033	int old_ref_mod, new_ref_mod;
				2034	int ret;
				2035
				2036	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
				2037	root_objectid == BTRFS_TREE_LOG_OBJECTID);
				2038
				2039	btrfs_ref_tree_mod(root, bytenr, num_bytes, parent, root_objectid,
				2040	owner, offset, BTRFS_ADD_DELAYED_REF);
				2041
				2042	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
				2043	ret = btrfs_add_delayed_tree_ref(trans, bytenr,
				2044	num_bytes, parent,
				2045	root_objectid, (int)owner,
				2046	BTRFS_ADD_DELAYED_REF, NULL,
				2047	&old_ref_mod, &new_ref_mod);
				2048	} else {
				2049	ret = btrfs_add_delayed_data_ref(trans, bytenr,
				2050	num_bytes, parent,
				2051	root_objectid, owner, offset,
				2052	0, BTRFS_ADD_DELAYED_REF,
				2053	&old_ref_mod, &new_ref_mod);
				2054	}
				2055
				2056	if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0) {
				2057	bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
				2058
				2059	add_pinned_bytes(fs_info, -num_bytes, metadata, root_objectid);
				2060	}
				2061
				2062	return ret;
				2063	}
				2064
				2065	/*
				2066	* __btrfs_inc_extent_ref - insert backreference for a given extent
				2067	*
				2068	* @trans: Handle of transaction
				2069	*
				2070	* @node: The delayed ref node used to get the bytenr/length for
				2071	* extent whose references are incremented.
				2072	*
				2073	* @parent: If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
				2074	* BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
				2075	* bytenr of the parent block. Since new extents are always
				2076	* created with indirect references, this will only be the case
				2077	* when relocating a shared extent. In that case, root_objectid
				2078	* will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
				2079	* be 0
				2080	*
				2081	* @root_objectid: The id of the root where this modification has originated,
				2082	* this can be either one of the well-known metadata trees or
				2083	* the subvolume id which references this extent.
				2084	*
				2085	* @owner: For data extents it is the inode number of the owning file.
				2086	* For metadata extents this parameter holds the level in the
				2087	* tree of the extent.
				2088	*
				2089	* @offset: For metadata extents the offset is ignored and is currently
				2090	* always passed as 0. For data extents it is the fileoffset
				2091	* this extent belongs to.
				2092	*
				2093	* @refs_to_add Number of references to add
				2094	*
				2095	* @extent_op Pointer to a structure, holding information necessary when
				2096	* updating a tree block's flags
				2097	*
				* Return: -ENOMEM if no path could be allocated. If the inline backref
				* insertion returns 0 or a negative error other than -EAGAIN, that value
				* is returned as is. Otherwise the extent item's ref count is bumped and
				* the result of insert_extent_backref() is returned; a failure there
				* aborts the transaction.
				*
				2098	*/
				2099	static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
				2100	struct btrfs_delayed_ref_node *node,
				2101	u64 parent, u64 root_objectid,
				2102	u64 owner, u64 offset, int refs_to_add,
				2103	struct btrfs_delayed_extent_op *extent_op)
				2104	{
				2105	struct btrfs_path *path;
				2106	struct extent_buffer *leaf;
				2107	struct btrfs_extent_item *item;
				2108	struct btrfs_key key;
				2109	u64 bytenr = node->bytenr;
				2110	u64 num_bytes = node->num_bytes;
				2111	u64 refs;
				2112	int ret;
				2113
				2114	path = btrfs_alloc_path();
				2115	if (!path)
				2116	return -ENOMEM;
				2117
				2118	path->reada = READA_FORWARD;
				2119	path->leave_spinning = 1;
				2120	/* this will setup the path even if it fails to insert the back ref */
				2121	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
				2122	parent, root_objectid, owner,
				2123	offset, refs_to_add, extent_op);
				/* Done on success (0) and on every hard error; only -EAGAIN continues. */
				2124	if ((ret < 0 && ret != -EAGAIN) \|\| !ret)
				2125	goto out;
				2126
				2127	/*
				2128	* Ok we had -EAGAIN which means we didn't have space to insert an
				2129	* inline extent ref, so just update the reference count and add a
				2130	* normal backref.
				2131	*/
				2132	leaf = path->nodes[0];
				2133	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				2134	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
				2135	refs = btrfs_extent_refs(leaf, item);
				2136	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
				2137	if (extent_op)
				2138	__run_delayed_extent_op(extent_op, leaf, item);
				2139
				2140	btrfs_mark_buffer_dirty(leaf);
				2141	btrfs_release_path(path);
				2142
				/* The path was released above, so set its search hints again. */
				2143	path->reada = READA_FORWARD;
				2144	path->leave_spinning = 1;
				2145	/* now insert the actual backref */
				2146	ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
				2147	owner, offset, refs_to_add);
				2148	if (ret)
				2149	btrfs_abort_transaction(trans, ret);
				2150	out:
				2151	btrfs_free_path(path);
				2152	return ret;
				2153	}
				2154
				2155	static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
				2156	struct btrfs_delayed_ref_node *node,
				2157	struct btrfs_delayed_extent_op *extent_op,
				2158	int insert_reserved)
				2159	{
				2160	int ret = 0;
				2161	struct btrfs_delayed_data_ref *ref;
				2162	struct btrfs_key ins;
				2163	u64 parent = 0;
				2164	u64 ref_root = 0;
				2165	u64 flags = 0;
				2166
				2167	ins.objectid = node->bytenr;
				2168	ins.offset = node->num_bytes;
				2169	ins.type = BTRFS_EXTENT_ITEM_KEY;
				2170
				2171	ref = btrfs_delayed_node_to_data_ref(node);
				2172	trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
				2173
				2174	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
				2175	parent = ref->parent;
				2176	ref_root = ref->root;
				2177
				2178	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
				2179	if (extent_op)
				2180	flags \|= extent_op->flags_to_set;
				2181	ret = alloc_reserved_file_extent(trans, parent, ref_root,
				2182	flags, ref->objectid,
				2183	ref->offset, &ins,
				2184	node->ref_mod);
				2185	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
				2186	ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
				2187	ref->objectid, ref->offset,
				2188	node->ref_mod, extent_op);
				2189	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
				2190	ret = __btrfs_free_extent(trans, node, parent,
				2191	ref_root, ref->objectid,
				2192	ref->offset, node->ref_mod,
				2193	extent_op);
				2194	} else {
				2195	BUG();
				2196	}
				2197	return ret;
				2198	}
				2199
				2200	static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				2201	struct extent_buffer *leaf,
				2202	struct btrfs_extent_item *ei)
				2203	{
				2204	u64 flags = btrfs_extent_flags(leaf, ei);
				2205	if (extent_op->update_flags) {
				2206	flags \|= extent_op->flags_to_set;
				2207	btrfs_set_extent_flags(leaf, ei, flags);
				2208	}
				2209
				2210	if (extent_op->update_key) {
				2211	struct btrfs_tree_block_info *bi;
				2212	BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
				2213	bi = (struct btrfs_tree_block_info *)(ei + 1);
				2214	btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
				2215	}
				2216	}
				2217
				/*
				 * Look up the extent item described by @head and apply @extent_op to it.
				 *
				 * For metadata on a filesystem with the SKINNY_METADATA incompat bit the
				 * item is searched by (bytenr, METADATA_ITEM, level) first; if that key
				 * is not found, the previous slot is checked for a matching EXTENT_ITEM
				 * and, failing that, the search is repeated with the
				 * (bytenr, EXTENT_ITEM, num_bytes) key.
				 *
				 * Returns 0 on success or when the transaction is already aborted,
				 * -ENOMEM if no path could be allocated, a negative error from the tree
				 * search, -EIO if the extent item does not exist, or -EINVAL (after
				 * aborting the transaction) if the item is smaller than
				 * struct btrfs_extent_item.
				 */
				2218	static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
				2219	struct btrfs_delayed_ref_head *head,
				2220	struct btrfs_delayed_extent_op *extent_op)
				2221	{
				2222	struct btrfs_fs_info *fs_info = trans->fs_info;
				2223	struct btrfs_key key;
				2224	struct btrfs_path *path;
				2225	struct btrfs_extent_item *ei;
				2226	struct extent_buffer *leaf;
				2227	u32 item_size;
				2228	int ret;
				2229	int err = 0;
				2230	int metadata = !extent_op->is_data;
				2231
				2232	if (trans->aborted)
				2233	return 0;
				2234
				/* Without skinny metadata, tree blocks use EXTENT_ITEM keys as well. */
				2235	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
				2236	metadata = 0;
				2237
				2238	path = btrfs_alloc_path();
				2239	if (!path)
				2240	return -ENOMEM;
				2241
				2242	key.objectid = head->bytenr;
				2243
				2244	if (metadata) {
				2245	key.type = BTRFS_METADATA_ITEM_KEY;
				2246	key.offset = extent_op->level;
				2247	} else {
				2248	key.type = BTRFS_EXTENT_ITEM_KEY;
				2249	key.offset = head->num_bytes;
				2250	}
				2251
				2252	again:
				2253	path->reada = READA_FORWARD;
				2254	path->leave_spinning = 1;
				2255	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
				2256	if (ret < 0) {
				2257	err = ret;
				2258	goto out;
				2259	}
				2260	if (ret > 0) {
				2261	if (metadata) {
				/* Check whether the previous slot holds the EXTENT_ITEM form. */
				2262	if (path->slots[0] > 0) {
				2263	path->slots[0]--;
				2264	btrfs_item_key_to_cpu(path->nodes[0], &key,
				2265	path->slots[0]);
				2266	if (key.objectid == head->bytenr &&
				2267	key.type == BTRFS_EXTENT_ITEM_KEY &&
				2268	key.offset == head->num_bytes)
				2269	ret = 0;
				2270	}
				/* Still not found: search once more with the EXTENT_ITEM key. */
				2271	if (ret > 0) {
				2272	btrfs_release_path(path);
				2273	metadata = 0;
				2274
				2275	key.objectid = head->bytenr;
				2276	key.offset = head->num_bytes;
				2277	key.type = BTRFS_EXTENT_ITEM_KEY;
				2278	goto again;
				2279	}
				2280	} else {
				2281	err = -EIO;
				2282	goto out;
				2283	}
				2284	}
				2285
				2286	leaf = path->nodes[0];
				2287	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
				2288
				2289	if (unlikely(item_size < sizeof(*ei))) {
				2290	err = -EINVAL;
				2291	btrfs_print_v0_err(fs_info);
				2292	btrfs_abort_transaction(trans, err);
				2293	goto out;
				2294	}
				2295
				2296	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
				2297	__run_delayed_extent_op(extent_op, leaf, ei);
				2298
				2299	btrfs_mark_buffer_dirty(leaf);
				2300	out:
				2301	btrfs_free_path(path);
				2302	return err;
				2303	}
				2304
				2305	static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
				2306	struct btrfs_delayed_ref_node *node,
				2307	struct btrfs_delayed_extent_op *extent_op,
				2308	int insert_reserved)
				2309	{
				2310	int ret = 0;
				2311	struct btrfs_delayed_tree_ref *ref;
				2312	u64 parent = 0;
				2313	u64 ref_root = 0;
				2314
				2315	ref = btrfs_delayed_node_to_tree_ref(node);
				2316	trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
				2317
				2318	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
				2319	parent = ref->parent;
				2320	ref_root = ref->root;
				2321
				2322	if (node->ref_mod != 1) {
				2323	btrfs_err(trans->fs_info,
				2324	"btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
				2325	node->bytenr, node->ref_mod, node->action, ref_root,
				2326	parent);
				2327	return -EIO;
				2328	}
				2329	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
				2330	BUG_ON(!extent_op \|\| !extent_op->update_flags);
				2331	ret = alloc_reserved_tree_block(trans, node, extent_op);
				2332	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
				2333	ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
				2334	ref->level, 0, 1, extent_op);
				2335	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
				2336	ret = __btrfs_free_extent(trans, node, parent, ref_root,
				2337	ref->level, 0, 1, extent_op);
				2338	} else {
				2339	BUG();
				2340	}
				2341	return ret;
				2342	}
				2343
				2344	/* helper function to actually process a single delayed ref entry */
				2345	static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
				2346	struct btrfs_delayed_ref_node *node,
				2347	struct btrfs_delayed_extent_op *extent_op,
				2348	int insert_reserved)
				2349	{
				2350	int ret = 0;
				2351
				2352	if (trans->aborted) {
				2353	if (insert_reserved)
				2354	btrfs_pin_extent(trans->fs_info, node->bytenr,
				2355	node->num_bytes, 1);
				2356	return 0;
				2357	}
				2358
				2359	if (node->type == BTRFS_TREE_BLOCK_REF_KEY \|\|
				2360	node->type == BTRFS_SHARED_BLOCK_REF_KEY)
				2361	ret = run_delayed_tree_ref(trans, node, extent_op,
				2362	insert_reserved);
				2363	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY \|\|
				2364	node->type == BTRFS_SHARED_DATA_REF_KEY)
				2365	ret = run_delayed_data_ref(trans, node, extent_op,
				2366	insert_reserved);
				2367	else
				2368	BUG();
				2369	if (ret && insert_reserved)
				2370	btrfs_pin_extent(trans->fs_info, node->bytenr,
				2371	node->num_bytes, 1);
				2372	return ret;
				2373	}
				2374
				2375	static inline struct btrfs_delayed_ref_node *
				2376	select_delayed_ref(struct btrfs_delayed_ref_head *head)
				2377	{
				2378	struct btrfs_delayed_ref_node *ref;
				2379
				2380	if (RB_EMPTY_ROOT(&head->ref_tree))
				2381	return NULL;
				2382
				2383	/*
				2384	* Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
				2385	* This is to prevent a ref count from going down to zero, which deletes
				2386	* the extent item from the extent tree, when there still are references
				2387	* to add, which would fail because they would not find the extent item.
				2388	*/
				2389	if (!list_empty(&head->ref_add_list))
				2390	return list_first_entry(&head->ref_add_list,
				2391	struct btrfs_delayed_ref_node, add_list);
				2392
				2393	ref = rb_entry(rb_first(&head->ref_tree),
				2394	struct btrfs_delayed_ref_node, ref_node);
				2395	ASSERT(list_empty(&ref->add_list));
				2396	return ref;
				2397	}
				2398
				/*
				 * Give a ref head back without finishing it: clear its processing flag
				 * and count it as ready again under delayed_refs->lock, then release the
				 * head via btrfs_delayed_ref_unlock().
				 */
				2399	static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
				2400	struct btrfs_delayed_ref_head *head)
				2401	{
				2402	spin_lock(&delayed_refs->lock);
				2403	head->processing = 0;
				2404	delayed_refs->num_heads_ready++;
				2405	spin_unlock(&delayed_refs->lock);
				2406	btrfs_delayed_ref_unlock(head);
				2407	}
				2408
				/*
				 * Detach and dispose of the extent op attached to @head, running it if
				 * needed.
				 *
				 * The function drops head->lock on one path only, so the caller has to
				 * hold that lock on entry and must look at the return value to know
				 * whether it still does:
				 *
				 *   0   - there was no extent op, or must_insert_reserved is set and the
				 *         op was freed without being run; head->lock is still held.
				 *   1   - head->lock was dropped and the op was run successfully.
				 *   < 0 - head->lock was dropped and run_delayed_extent_op() failed with
				 *         this error.
				 */
				2409	static int cleanup_extent_op(struct btrfs_trans_handle *trans,
				2410	struct btrfs_delayed_ref_head *head)
				2411	{
				2412	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
				2413	int ret;
				2414
				2415	if (!extent_op)
				2416	return 0;
				2417	head->extent_op = NULL;
				2418	if (head->must_insert_reserved) {
				2419	btrfs_free_delayed_extent_op(extent_op);
				2420	return 0;
				2421	}
				2422	spin_unlock(&head->lock);
				2423	ret = run_delayed_extent_op(trans, head, extent_op);
				2424	btrfs_free_delayed_extent_op(extent_op);
				2425	return ret ? ret : 1;
				2426	}
				2427
				2428	static int cleanup_ref_head(struct btrfs_trans_handle *trans,
				2429	struct btrfs_delayed_ref_head *head)
				2430	{
				2431
				2432	struct btrfs_fs_info *fs_info = trans->fs_info;
				2433	struct btrfs_delayed_ref_root *delayed_refs;
				2434	int ret;
				2435
				2436	delayed_refs = &trans->transaction->delayed_refs;
				2437
				2438	ret = cleanup_extent_op(trans, head);
				2439	if (ret < 0) {
				2440	unselect_delayed_ref_head(delayed_refs, head);
				2441	btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
				2442	return ret;
				2443	} else if (ret) {
				2444	return ret;
				2445	}
				2446
				2447	/*
				2448	* Need to drop our head ref lock and re-acquire the delayed ref lock
				2449	* and then re-check to make sure nobody got added.
				2450	*/
				2451	spin_unlock(&head->lock);
				2452	spin_lock(&delayed_refs->lock);
				2453	spin_lock(&head->lock);
				2454	if (!RB_EMPTY_ROOT(&head->ref_tree) \|\| head->extent_op) {
				2455	spin_unlock(&head->lock);
				2456	spin_unlock(&delayed_refs->lock);
				2457	return 1;
				2458	}
				2459	delayed_refs->num_heads--;
				2460	rb_erase(&head->href_node, &delayed_refs->href_root);
				2461	RB_CLEAR_NODE(&head->href_node);
				2462	spin_unlock(&head->lock);
				2463	spin_unlock(&delayed_refs->lock);
				2464	atomic_dec(&delayed_refs->num_entries);
				2465
				2466	trace_run_delayed_ref_head(fs_info, head, 0);
				2467
				2468	if (head->total_ref_mod < 0) {
				2469	struct btrfs_space_info *space_info;
				2470	u64 flags;
				2471
				2472	if (head->is_data)
				2473	flags = BTRFS_BLOCK_GROUP_DATA;
				2474	else if (head->is_system)
				2475	flags = BTRFS_BLOCK_GROUP_SYSTEM;
				2476	else
				2477	flags = BTRFS_BLOCK_GROUP_METADATA;
				2478	space_info = __find_space_info(fs_info, flags);
				2479	ASSERT(space_info);
				2480	percpu_counter_add_batch(&space_info->total_bytes_pinned,
				2481	-head->num_bytes,
				2482	BTRFS_TOTAL_BYTES_PINNED_BATCH);
				2483
				2484	if (head->is_data) {
				2485	spin_lock(&delayed_refs->lock);
				2486	delayed_refs->pending_csums -= head->num_bytes;
				2487	spin_unlock(&delayed_refs->lock);
				2488	}
				2489	}
				2490
				2491	if (head->must_insert_reserved) {
				2492	btrfs_pin_extent(fs_info, head->bytenr,
				2493	head->num_bytes, 1);
				2494	if (head->is_data) {
				2495	ret = btrfs_del_csums(trans, fs_info->csum_root,
				2496	head->bytenr, head->num_bytes);
				2497	}
				2498	}
				2499
				2500	/* Also free its reserved qgroup space */
				2501	btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
				2502	head->qgroup_reserved);
				2503	btrfs_delayed_ref_unlock(head);
				2504	btrfs_put_delayed_ref_head(head);
				2505	return 0;
				2506	}
				2507
				2508	/*
				* Run delayed refs of the current transaction, one ref head at a time.
				*
				* @nr bounds the work: a new ref head is only picked up while the running
				* count is below @nr. The count goes up for every ref that was run and
				* for every head that was finished, skipped or found to be gone. The loop
				* also ends when btrfs_select_ref_head() has no head to offer.
				*
				* After at least one ref was actually run, the elapsed time is folded into
				* fs_info->avg_delayed_ref_runtime.
				*
				2509	* Returns 0 on success or if called with an already aborted transaction.
				2510	* Returns -ENOMEM or -EIO on failure and will abort the transaction.
				2511	*/
				2512	static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
				2513	unsigned long nr)
				2514	{
				2515	struct btrfs_fs_info *fs_info = trans->fs_info;
				2516	struct btrfs_delayed_ref_root *delayed_refs;
				2517	struct btrfs_delayed_ref_node *ref;
				2518	struct btrfs_delayed_ref_head *locked_ref = NULL;
				2519	struct btrfs_delayed_extent_op *extent_op;
				2520	ktime_t start = ktime_get();
				2521	int ret;
				2522	unsigned long count = 0;
				2523	unsigned long actual_count = 0;
				2524	int must_insert_reserved = 0;
				2525
				2526	delayed_refs = &trans->transaction->delayed_refs;
				2527	while (1) {
				2528	if (!locked_ref) {
				2529	if (count >= nr)
				2530	break;
				2531
				2532	spin_lock(&delayed_refs->lock);
				2533	locked_ref = btrfs_select_ref_head(trans);
				2534	if (!locked_ref) {
				2535	spin_unlock(&delayed_refs->lock);
				2536	break;
				2537	}
				2538
				2539	/* grab the lock that says we are going to process
				2540	* all the refs for this head */
				2541	ret = btrfs_delayed_ref_lock(trans, locked_ref);
				2542	spin_unlock(&delayed_refs->lock);
				2543	/*
				2544	* we may have dropped the spin lock to get the head
				2545	* mutex lock, and that might have given someone else
				2546	* time to free the head. If that's true, it has been
				2547	* removed from our list and we can move on.
				2548	*/
				2549	if (ret == -EAGAIN) {
				2550	locked_ref = NULL;
				2551	count++;
				2552	continue;
				2553	}
				2554	}
				2555
				2556	/*
				2557	* We need to try and merge add/drops of the same ref since we
				2558	* can run into issues with relocate dropping the implicit ref
				2559	* and then it being added back again before the drop can
				2560	* finish. If we merged anything we need to re-loop so we can
				2561	* get a good ref.
				2562	* Or we can get node references of the same type that weren't
				2563	* merged when created due to bumps in the tree mod seq, and
				2564	* we need to merge them to prevent adding an inline extent
				2565	* backref before dropping it (triggering a BUG_ON at
				2566	* insert_inline_extent_backref()).
				2567	*/
				2568	spin_lock(&locked_ref->lock);
				2569	btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
				2570
				2571	ref = select_delayed_ref(locked_ref);
				2572
				/* The ref cannot be run yet: hand the head back and move on. */
				2573	if (ref && ref->seq &&
				2574	btrfs_check_delayed_seq(fs_info, ref->seq)) {
				2575	spin_unlock(&locked_ref->lock);
				2576	unselect_delayed_ref_head(delayed_refs, locked_ref);
				2577	locked_ref = NULL;
				2578	cond_resched();
				2579	count++;
				2580	continue;
				2581	}
				2582
				2583	/*
				2584	* We're done processing refs in this ref_head, clean everything
				2585	* up and move on to the next ref_head.
				2586	*/
				2587	if (!ref) {
				2588	ret = cleanup_ref_head(trans, locked_ref);
				2589	if (ret > 0 ) {
				2590	/* We dropped our lock, we need to loop. */
				2591	ret = 0;
				2592	continue;
				2593	} else if (ret) {
				2594	return ret;
				2595	}
				2596	locked_ref = NULL;
				2597	count++;
				2598	continue;
				2599	}
				2600
				/* Take the ref out of the head's tree (and add list) before running it. */
				2601	actual_count++;
				2602	ref->in_tree = 0;
				2603	rb_erase(&ref->ref_node, &locked_ref->ref_tree);
				2604	RB_CLEAR_NODE(&ref->ref_node);
				2605	if (!list_empty(&ref->add_list))
				2606	list_del(&ref->add_list);
				2607	/*
				2608	* When we play the delayed ref, also correct the ref_mod on
				2609	* head
				2610	*/
				2611	switch (ref->action) {
				2612	case BTRFS_ADD_DELAYED_REF:
				2613	case BTRFS_ADD_DELAYED_EXTENT:
				2614	locked_ref->ref_mod -= ref->ref_mod;
				2615	break;
				2616	case BTRFS_DROP_DELAYED_REF:
				2617	locked_ref->ref_mod += ref->ref_mod;
				2618	break;
				2619	default:
				2620	WARN_ON(1);
				2621	}
				2622	atomic_dec(&delayed_refs->num_entries);
				2623
				2624	/*
				2625	* Record the must-insert_reserved flag before we drop the spin
				2626	* lock.
				2627	*/
				2628	must_insert_reserved = locked_ref->must_insert_reserved;
				2629	locked_ref->must_insert_reserved = 0;
				2630
				/* The head's extent op is consumed together with this ref. */
				2631	extent_op = locked_ref->extent_op;
				2632	locked_ref->extent_op = NULL;
				2633	spin_unlock(&locked_ref->lock);
				2634
				2635	ret = run_one_delayed_ref(trans, ref, extent_op,
				2636	must_insert_reserved);
				2637
				2638	btrfs_free_delayed_extent_op(extent_op);
				2639	if (ret) {
				2640	unselect_delayed_ref_head(delayed_refs, locked_ref);
				2641	btrfs_put_delayed_ref(ref);
				2642	btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
				2643	ret);
				2644	return ret;
				2645	}
				2646
				2647	btrfs_put_delayed_ref(ref);
				2648	count++;
				2649	cond_resched();
				2650	}
				2651
				2652	/*
				2653	* We don't want to include ref heads since we can have empty ref heads
				2654	* and those will drastically skew our runtime down since we just do
				2655	* accounting, no actual extent tree updates.
				2656	*/
				2657	if (actual_count > 0) {
				2658	u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
				2659	u64 avg;
				2660
				2661	/*
				2662	* We weigh the current average higher than our current runtime
				2663	* to avoid large swings in the average.
				2664	*/
				2665	spin_lock(&delayed_refs->lock);
				2666	avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
				2667	fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */
				2668	spin_unlock(&delayed_refs->lock);
				2669	}
				2670	return 0;
				2671	}
				2672
				#ifdef SCRAMBLE_DELAYED_REFS
				/*
				 * Normally delayed refs get processed in ascending bytenr order. This
				 * correlates in most cases to the order added. To expose dependencies on this
				 * order, we start to process the tree in the middle instead of the beginning.
				 *
				 * Walks down from the root of @root, alternating between the left and the
				 * right child, and returns the bytenr of the last node visited.  Returns 0
				 * for an empty tree.
				 *
				 * NOTE(review): this is only built when SCRAMBLE_DELAYED_REFS is defined.
				 * The rb_entry() member name used here (rb_node) differs from the
				 * ref_node member used for the same struct elsewhere in this file -
				 * confirm before enabling.
				 */
				static u64 find_middle(struct rb_root *root)
				{
					struct rb_node *n = root->rb_node;
					struct btrfs_delayed_ref_node *entry;
					int alt = 1;
					/* Defined result even when the tree has no nodes at all. */
					u64 middle = 0;

					while (n) {
						entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
						WARN_ON(!entry->in_tree);

						middle = entry->bytenr;

						n = alt ? n->rb_left : n->rb_right;
						alt = 1 - alt;
					}
					return middle;
				}
				#endif
				2715
				2716	static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
				2717	{
				2718	u64 num_bytes;
				2719
				2720	num_bytes = heads * (sizeof(struct btrfs_extent_item) +
				2721	sizeof(struct btrfs_extent_inline_ref));
				2722	if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
				2723	num_bytes += heads * sizeof(struct btrfs_tree_block_info);
				2724
				2725	/*
				2726	* We don't ever fill up leaves all the way so multiply by 2 just to be
				2727	* closer to what we're really going to want to use.
				2728	*/
				2729	return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
				2730	}
				2731
				2732	/*
				2733	* Takes the number of bytes to be csumm'ed and figures out how many leaves it
				2734	* would require to store the csums for that many bytes.
				2735	*/
				2736	u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
				2737	{
				2738	u64 csum_size;
				2739	u64 num_csums_per_leaf;
				2740	u64 num_csums;
				2741
				2742	csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
				2743	num_csums_per_leaf = div64_u64(csum_size,
				2744	(u64)btrfs_super_csum_size(fs_info->super_copy));
				2745	num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
				2746	num_csums += num_csums_per_leaf - 1;
				2747	num_csums = div64_u64(num_csums, num_csums_per_leaf);
				2748	return num_csums;
				2749	}
				2750
				2751	int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
				2752	struct btrfs_fs_info *fs_info)
				2753	{
				2754	struct btrfs_block_rsv *global_rsv;
				2755	u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
				2756	u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
				2757	unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs;
				2758	u64 num_bytes, num_dirty_bgs_bytes;
				2759	int ret = 0;
				2760
				2761	num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
				2762	num_heads = heads_to_leaves(fs_info, num_heads);
				2763	if (num_heads > 1)
				2764	num_bytes += (num_heads - 1) * fs_info->nodesize;
				2765	num_bytes <<= 1;
				2766	num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) *
				2767	fs_info->nodesize;
				2768	num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info,
				2769	num_dirty_bgs);
				2770	global_rsv = &fs_info->global_block_rsv;
				2771
				2772	/*
				2773	* If we can't allocate any more chunks lets make sure we have _lots_ of
				2774	* wiggle room since running delayed refs can create more delayed refs.
				2775	*/
				2776	if (global_rsv->space_info->full) {
				2777	num_dirty_bgs_bytes <<= 1;
				2778	num_bytes <<= 1;
				2779	}
				2780
				2781	spin_lock(&global_rsv->lock);
				2782	if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
				2783	ret = 1;
				2784	spin_unlock(&global_rsv->lock);
				2785	return ret;
				2786	}
				2787
				2788	int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
				2789	struct btrfs_fs_info *fs_info)
				2790	{
				2791	u64 num_entries =
				2792	atomic_read(&trans->transaction->delayed_refs.num_entries);
				2793	u64 avg_runtime;
				2794	u64 val;
				2795
				2796	smp_mb();
				2797	avg_runtime = fs_info->avg_delayed_ref_runtime;
				2798	val = num_entries * avg_runtime;
				2799	if (val >= NSEC_PER_SEC)
				2800	return 1;
				2801	if (val >= NSEC_PER_SEC / 2)
				2802	return 2;
				2803
				2804	return btrfs_check_space_for_delayed_refs(trans, fs_info);
				2805	}
				2806
				/*
				 * State for one asynchronous delayed ref run queued as a btrfs work item.
				 *
				 * @root:    root used to join the transaction in the worker
				 * @transid: transaction the request was made for; the worker skips the run
				 *           if the transaction it joined has a larger id
				 * @count:   number of refs handed to btrfs_run_delayed_refs()
				 * @error:   first error seen by the worker, 0 if none
				 * @sync:    non-zero if the submitter waits on @wait; otherwise the worker
				 *           frees this structure itself
				 * @wait:    completed by the worker when @sync is set
				 * @work:    the work item; container_of() on it recovers this structure
				 */
				2807	struct async_delayed_refs {
				2808	struct btrfs_root *root;
				2809	u64 transid;
				2810	int count;
				2811	int error;
				2812	int sync;
				2813	struct completion wait;
				2814	struct btrfs_work work;
				2815	};
				2816
				2817	static inline struct async_delayed_refs *
				2818	to_async_delayed_refs(struct btrfs_work *work)
				2819	{
				2820	return container_of(work, struct async_delayed_refs, work);
				2821	}
				2822
				/*
				 * Work function for an asynchronous delayed ref run.
				 *
				 * Joins the running transaction on async->root and runs up to
				 * async->count delayed refs, unless the transaction is already blocked or
				 * the joined transaction is newer than the one the request was made for.
				 * The first error encountered is stored in async->error.
				 *
				 * On exit the submitter is woken through async->wait when async->sync is
				 * set; otherwise nobody waits and the request is freed here.
				 */
				2823	static void delayed_ref_async_start(struct btrfs_work *work)
				2824	{
				2825	struct async_delayed_refs *async = to_async_delayed_refs(work);
				2826	struct btrfs_trans_handle *trans;
				2827	struct btrfs_fs_info *fs_info = async->root->fs_info;
				2828	int ret;
				2829
				2830	/* if the commit is already started, we don't need to wait here */
				2831	if (btrfs_transaction_blocked(fs_info))
				2832	goto done;
				2833
				2834	trans = btrfs_join_transaction(async->root);
				2835	if (IS_ERR(trans)) {
				2836	async->error = PTR_ERR(trans);
				2837	goto done;
				2838	}
				2839
				2840	/*
				2841	* trans->sync means that when we call end_transaction, we won't
				2842	* wait on delayed refs
				2843	*/
				2844	trans->sync = true;
				2845
				2846	/* Don't bother flushing if we got into a different transaction */
				2847	if (trans->transid > async->transid)
				2848	goto end;
				2849
				2850	ret = btrfs_run_delayed_refs(trans, async->count);
				2851	if (ret)
				2852	async->error = ret;
				2853	end:
				2854	ret = btrfs_end_transaction(trans);
				/* Keep the first error; an end_transaction failure only counts if none. */
				2855	if (ret && !async->error)
				2856	async->error = ret;
				2857	done:
				2858	if (async->sync)
				2859	complete(&async->wait);
				2860	else
				2861	kfree(async);
				2862	}
				2863
				2864	int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
				2865	unsigned long count, u64 transid, int wait)
				2866	{
				2867	struct async_delayed_refs *async;
				2868	int ret;
				2869
				2870	async = kmalloc(sizeof(*async), GFP_NOFS);
				2871	if (!async)
				2872	return -ENOMEM;
				2873
				2874	async->root = fs_info->tree_root;
				2875	async->count = count;
				2876	async->error = 0;
				2877	async->transid = transid;
				2878	if (wait)
				2879	async->sync = 1;
				2880	else
				2881	async->sync = 0;
				2882	init_completion(&async->wait);
				2883
				2884	btrfs_init_work(&async->work, btrfs_extent_refs_helper,
				2885	delayed_ref_async_start, NULL, NULL);
				2886
				2887	btrfs_queue_work(fs_info->extent_workers, &async->work);
				2888
				2889	if (wait) {
				2890	wait_for_completion(&async->wait);
				2891	ret = async->error;
				2892	kfree(async);
				2893	return ret;
				2894	}
				2895	return 0;
				2896	}
				2897
				2898	/*
				2899	* this starts processing the delayed reference count updates and
				2900	* extent insertions we have queued up so far. count can be
				2901	* 0, which means to process everything in the tree at the start
				2902	* of the run (but not newly added entries), or it can be some target
				2903	* number you'd like to process.
				2904	*
				2905	* Returns 0 on success or if called with an aborted transaction
				2906	* Returns <0 on error and aborts the transaction
				2907	*/
				2908	int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
				2909	unsigned long count)
				2910	{
				2911	struct btrfs_fs_info *fs_info = trans->fs_info;
				2912	struct rb_node *node;
				2913	struct btrfs_delayed_ref_root *delayed_refs;
				2914	struct btrfs_delayed_ref_head *head;
				2915	int ret;
				2916	int run_all = count == (unsigned long)-1;
				2917
				2918	/* We'll clean this up in btrfs_cleanup_transaction */
				2919	if (trans->aborted)
				2920	return 0;
				2921
				2922	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
				2923	return 0;
				2924
				2925	delayed_refs = &trans->transaction->delayed_refs;
				2926	if (count == 0)
				2927	count = atomic_read(&delayed_refs->num_entries) * 2;
				2928
				2929	again:
				2930	#ifdef SCRAMBLE_DELAYED_REFS
				2931	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
				2932	#endif
				2933	ret = __btrfs_run_delayed_refs(trans, count);
				2934	if (ret < 0) {
				2935	btrfs_abort_transaction(trans, ret);
				2936	return ret;
				2937	}
				2938
				2939	if (run_all) {
				2940	if (!list_empty(&trans->new_bgs))
				2941	btrfs_create_pending_block_groups(trans);
				2942
				2943	spin_lock(&delayed_refs->lock);
				2944	node = rb_first(&delayed_refs->href_root);
				2945	if (!node) {
				2946	spin_unlock(&delayed_refs->lock);
				2947	goto out;
				2948	}
				2949	head = rb_entry(node, struct btrfs_delayed_ref_head,
				2950	href_node);
				2951	refcount_inc(&head->refs);
				2952	spin_unlock(&delayed_refs->lock);
				2953
				2954	/* Mutex was contended, block until it's released and retry. */
				2955	mutex_lock(&head->mutex);
				2956	mutex_unlock(&head->mutex);
				2957
				2958	btrfs_put_delayed_ref_head(head);
				2959	cond_resched();
				2960	goto again;
				2961	}
				2962	out:
				2963	return 0;
				2964	}
				2965
				2966	int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
				2967	struct btrfs_fs_info *fs_info,
				2968	u64 bytenr, u64 num_bytes, u64 flags,
				2969	int level, int is_data)
				2970	{
				2971	struct btrfs_delayed_extent_op *extent_op;
				2972	int ret;
				2973
				2974	extent_op = btrfs_alloc_delayed_extent_op();
				2975	if (!extent_op)
				2976	return -ENOMEM;
				2977
				2978	extent_op->flags_to_set = flags;
				2979	extent_op->update_flags = true;
				2980	extent_op->update_key = false;
				2981	extent_op->is_data = is_data ? true : false;
				2982	extent_op->level = level;
				2983
				2984	ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr,
				2985	num_bytes, extent_op);
				2986	if (ret)
				2987	btrfs_free_delayed_extent_op(extent_op);
				2988	return ret;
				2989	}
				2990
				2991	static noinline int check_delayed_ref(struct btrfs_root *root,
				2992	struct btrfs_path *path,
				2993	u64 objectid, u64 offset, u64 bytenr)
				2994	{
				2995	struct btrfs_delayed_ref_head *head;
				2996	struct btrfs_delayed_ref_node *ref;
				2997	struct btrfs_delayed_data_ref *data_ref;
				2998	struct btrfs_delayed_ref_root *delayed_refs;
				2999	struct btrfs_transaction *cur_trans;
				3000	struct rb_node *node;
				3001	int ret = 0;
				3002
				3003	spin_lock(&root->fs_info->trans_lock);
				3004	cur_trans = root->fs_info->running_transaction;
				3005	if (cur_trans)
				3006	refcount_inc(&cur_trans->use_count);
				3007	spin_unlock(&root->fs_info->trans_lock);
				3008	if (!cur_trans)
				3009	return 0;
				3010
				3011	delayed_refs = &cur_trans->delayed_refs;
				3012	spin_lock(&delayed_refs->lock);
				3013	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
				3014	if (!head) {
				3015	spin_unlock(&delayed_refs->lock);
				3016	btrfs_put_transaction(cur_trans);
				3017	return 0;
				3018	}
				3019
				3020	if (!mutex_trylock(&head->mutex)) {
				3021	refcount_inc(&head->refs);
				3022	spin_unlock(&delayed_refs->lock);
				3023
				3024	btrfs_release_path(path);
				3025
				3026	/*
				3027	* Mutex was contended, block until it's released and let
				3028	* caller try again
				3029	*/
				3030	mutex_lock(&head->mutex);
				3031	mutex_unlock(&head->mutex);
				3032	btrfs_put_delayed_ref_head(head);
				3033	btrfs_put_transaction(cur_trans);
				3034	return -EAGAIN;
				3035	}
				3036	spin_unlock(&delayed_refs->lock);
				3037
				3038	spin_lock(&head->lock);
				3039	/*
				3040	* XXX: We should replace this with a proper search function in the
				3041	* future.
				3042	*/
				3043	for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
				3044	ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
				3045	/* If it's a shared ref we know a cross reference exists */
				3046	if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
				3047	ret = 1;
				3048	break;
				3049	}
				3050
				3051	data_ref = btrfs_delayed_node_to_data_ref(ref);
				3052
				3053	/*
				3054	* If our ref doesn't match the one we're currently looking at
				3055	* then we have a cross reference.
				3056	*/
				3057	if (data_ref->root != root->root_key.objectid \|\|
				3058	data_ref->objectid != objectid \|\|
				3059	data_ref->offset != offset) {
				3060	ret = 1;
				3061	break;
				3062	}
				3063	}
				3064	spin_unlock(&head->lock);
				3065	mutex_unlock(&head->mutex);
				3066	btrfs_put_transaction(cur_trans);
				3067	return ret;
				3068	}
				3069
				3070	static noinline int check_committed_ref(struct btrfs_root *root,
				3071	struct btrfs_path *path,
				3072	u64 objectid, u64 offset, u64 bytenr)
				3073	{
				3074	struct btrfs_fs_info *fs_info = root->fs_info;
				3075	struct btrfs_root *extent_root = fs_info->extent_root;
				3076	struct extent_buffer *leaf;
				3077	struct btrfs_extent_data_ref *ref;
				3078	struct btrfs_extent_inline_ref *iref;
				3079	struct btrfs_extent_item *ei;
				3080	struct btrfs_key key;
				3081	u32 item_size;
				3082	int type;
				3083	int ret;
				3084
				3085	key.objectid = bytenr;
				3086	key.offset = (u64)-1;
				3087	key.type = BTRFS_EXTENT_ITEM_KEY;
				3088
				3089	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
				3090	if (ret < 0)
				3091	goto out;
				3092	BUG_ON(ret == 0); /* Corruption */
				3093
				3094	ret = -ENOENT;
				3095	if (path->slots[0] == 0)
				3096	goto out;
				3097
				3098	path->slots[0]--;
				3099	leaf = path->nodes[0];
				3100	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				3101
				3102	if (key.objectid != bytenr \|\| key.type != BTRFS_EXTENT_ITEM_KEY)
				3103	goto out;
				3104
				3105	ret = 1;
				3106	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
				3107	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
				3108
				3109	if (item_size != sizeof(*ei) +
				3110	btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
				3111	goto out;
				3112
				3113	if (btrfs_extent_generation(leaf, ei) <=
				3114	btrfs_root_last_snapshot(&root->root_item))
				3115	goto out;
				3116
				3117	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
				3118
				3119	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
				3120	if (type != BTRFS_EXTENT_DATA_REF_KEY)
				3121	goto out;
				3122
				3123	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
				3124	if (btrfs_extent_refs(leaf, ei) !=
				3125	btrfs_extent_data_ref_count(leaf, ref) \|\|
				3126	btrfs_extent_data_ref_root(leaf, ref) !=
				3127	root->root_key.objectid \|\|
				3128	btrfs_extent_data_ref_objectid(leaf, ref) != objectid \|\|
				3129	btrfs_extent_data_ref_offset(leaf, ref) != offset)
				3130	goto out;
				3131
				3132	ret = 0;
				3133	out:
				3134	return ret;
				3135	}
				3136
				3137	int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
				3138	u64 bytenr)
				3139	{
				3140	struct btrfs_path *path;
				3141	int ret;
				3142	int ret2;
				3143
				3144	path = btrfs_alloc_path();
				3145	if (!path)
				3146	return -ENOMEM;
				3147
				3148	do {
				3149	ret = check_committed_ref(root, path, objectid,
				3150	offset, bytenr);
				3151	if (ret && ret != -ENOENT)
				3152	goto out;
				3153
				3154	ret2 = check_delayed_ref(root, path, objectid,
				3155	offset, bytenr);
				3156	} while (ret2 == -EAGAIN);
				3157
				3158	if (ret2 && ret2 != -ENOENT) {
				3159	ret = ret2;
				3160	goto out;
				3161	}
				3162
				3163	if (ret != -ENOENT \|\| ret2 != -ENOENT)
				3164	ret = 0;
				3165	out:
				3166	btrfs_free_path(path);
				3167	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
				3168	WARN_ON(ret > 0);
				3169	return ret;
				3170	}
				3171
				3172	static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
				3173	struct btrfs_root *root,
				3174	struct extent_buffer *buf,
				3175	int full_backref, int inc)
				3176	{
				3177	struct btrfs_fs_info *fs_info = root->fs_info;
				3178	u64 bytenr;
				3179	u64 num_bytes;
				3180	u64 parent;
				3181	u64 ref_root;
				3182	u32 nritems;
				3183	struct btrfs_key key;
				3184	struct btrfs_file_extent_item *fi;
				3185	int i;
				3186	int level;
				3187	int ret = 0;
				3188	int (process_func)(struct btrfs_trans_handle ,
				3189	struct btrfs_root *,
				3190	u64, u64, u64, u64, u64, u64);
				3191
				3192
				3193	if (btrfs_is_testing(fs_info))
				3194	return 0;
				3195
				3196	ref_root = btrfs_header_owner(buf);
				3197	nritems = btrfs_header_nritems(buf);
				3198	level = btrfs_header_level(buf);
				3199
				3200	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
				3201	return 0;
				3202
				3203	if (inc)
				3204	process_func = btrfs_inc_extent_ref;
				3205	else
				3206	process_func = btrfs_free_extent;
				3207
				3208	if (full_backref)
				3209	parent = buf->start;
				3210	else
				3211	parent = 0;
				3212
				3213	for (i = 0; i < nritems; i++) {
				3214	if (level == 0) {
				3215	btrfs_item_key_to_cpu(buf, &key, i);
				3216	if (key.type != BTRFS_EXTENT_DATA_KEY)
				3217	continue;
				3218	fi = btrfs_item_ptr(buf, i,
				3219	struct btrfs_file_extent_item);
				3220	if (btrfs_file_extent_type(buf, fi) ==
				3221	BTRFS_FILE_EXTENT_INLINE)
				3222	continue;
				3223	bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
				3224	if (bytenr == 0)
				3225	continue;
				3226
				3227	num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
				3228	key.offset -= btrfs_file_extent_offset(buf, fi);
				3229	ret = process_func(trans, root, bytenr, num_bytes,
				3230	parent, ref_root, key.objectid,
				3231	key.offset);
				3232	if (ret)
				3233	goto fail;
				3234	} else {
				3235	bytenr = btrfs_node_blockptr(buf, i);
				3236	num_bytes = fs_info->nodesize;
				3237	ret = process_func(trans, root, bytenr, num_bytes,
				3238	parent, ref_root, level - 1, 0);
				3239	if (ret)
				3240	goto fail;
				3241	}
				3242	}
				3243	return 0;
				3244	fail:
				3245	return ret;
				3246	}
				3247
				3248	int btrfs_inc_ref(struct btrfs_trans_handle trans, struct btrfs_root root,
				3249	struct extent_buffer *buf, int full_backref)
				3250	{
				3251	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
				3252	}
				3253
				3254	int btrfs_dec_ref(struct btrfs_trans_handle trans, struct btrfs_root root,
				3255	struct extent_buffer *buf, int full_backref)
				3256	{
				3257	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
				3258	}
				3259
				3260	static int write_one_cache_group(struct btrfs_trans_handle *trans,
				3261	struct btrfs_fs_info *fs_info,
				3262	struct btrfs_path *path,
				3263	struct btrfs_block_group_cache *cache)
				3264	{
				3265	int ret;
				3266	struct btrfs_root *extent_root = fs_info->extent_root;
				3267	unsigned long bi;
				3268	struct extent_buffer *leaf;
				3269
				3270	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
				3271	if (ret) {
				3272	if (ret > 0)
				3273	ret = -ENOENT;
				3274	goto fail;
				3275	}
				3276
				3277	leaf = path->nodes[0];
				3278	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
				3279	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
				3280	btrfs_mark_buffer_dirty(leaf);
				3281	fail:
				3282	btrfs_release_path(path);
				3283	return ret;
				3284
				3285	}
				3286
				3287	static struct btrfs_block_group_cache *
				3288	next_block_group(struct btrfs_fs_info *fs_info,
				3289	struct btrfs_block_group_cache *cache)
				3290	{
				3291	struct rb_node *node;
				3292
				3293	spin_lock(&fs_info->block_group_cache_lock);
				3294
				3295	/* If our block group was removed, we need a full search. */
				3296	if (RB_EMPTY_NODE(&cache->cache_node)) {
				3297	const u64 next_bytenr = cache->key.objectid + cache->key.offset;
				3298
				3299	spin_unlock(&fs_info->block_group_cache_lock);
				3300	btrfs_put_block_group(cache);
				3301	cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
				3302	}
				3303	node = rb_next(&cache->cache_node);
				3304	btrfs_put_block_group(cache);
				3305	if (node) {
				3306	cache = rb_entry(node, struct btrfs_block_group_cache,
				3307	cache_node);
				3308	btrfs_get_block_group(cache);
				3309	} else
				3310	cache = NULL;
				3311	spin_unlock(&fs_info->block_group_cache_lock);
				3312	return cache;
				3313	}
				3314
				3315	static int cache_save_setup(struct btrfs_block_group_cache *block_group,
				3316	struct btrfs_trans_handle *trans,
				3317	struct btrfs_path *path)
				3318	{
				3319	struct btrfs_fs_info *fs_info = block_group->fs_info;
				3320	struct btrfs_root *root = fs_info->tree_root;
				3321	struct inode *inode = NULL;
				3322	struct extent_changeset *data_reserved = NULL;
				3323	u64 alloc_hint = 0;
				3324	int dcs = BTRFS_DC_ERROR;
				3325	u64 num_pages = 0;
				3326	int retries = 0;
				3327	int ret = 0;
				3328
				3329	/*
				3330	* If this block group is smaller than 100 megs don't bother caching the
				3331	* block group.
				3332	*/
				3333	if (block_group->key.offset < (100 * SZ_1M)) {
				3334	spin_lock(&block_group->lock);
				3335	block_group->disk_cache_state = BTRFS_DC_WRITTEN;
				3336	spin_unlock(&block_group->lock);
				3337	return 0;
				3338	}
				3339
				3340	if (trans->aborted)
				3341	return 0;
				3342	again:
				3343	inode = lookup_free_space_inode(fs_info, block_group, path);
				3344	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
				3345	ret = PTR_ERR(inode);
				3346	btrfs_release_path(path);
				3347	goto out;
				3348	}
				3349
				3350	if (IS_ERR(inode)) {
				3351	BUG_ON(retries);
				3352	retries++;
				3353
				3354	if (block_group->ro)
				3355	goto out_free;
				3356
				3357	ret = create_free_space_inode(fs_info, trans, block_group,
				3358	path);
				3359	if (ret)
				3360	goto out_free;
				3361	goto again;
				3362	}
				3363
				3364	/*
				3365	* We want to set the generation to 0, that way if anything goes wrong
				3366	* from here on out we know not to trust this cache when we load up next
				3367	* time.
				3368	*/
				3369	BTRFS_I(inode)->generation = 0;
				3370	ret = btrfs_update_inode(trans, root, inode);
				3371	if (ret) {
				3372	/*
				3373	* So theoretically we could recover from this, simply set the
				3374	* super cache generation to 0 so we know to invalidate the
				3375	* cache, but then we'd have to keep track of the block groups
				3376	* that fail this way so we know we _have_ to reset this cache
				3377	* before the next commit or risk reading stale cache. So to
				3378	* limit our exposure to horrible edge cases lets just abort the
				3379	* transaction, this only happens in really bad situations
				3380	* anyway.
				3381	*/
				3382	btrfs_abort_transaction(trans, ret);
				3383	goto out_put;
				3384	}
				3385	WARN_ON(ret);
				3386
				3387	/* We've already setup this transaction, go ahead and exit */
				3388	if (block_group->cache_generation == trans->transid &&
				3389	i_size_read(inode)) {
				3390	dcs = BTRFS_DC_SETUP;
				3391	goto out_put;
				3392	}
				3393
				3394	if (i_size_read(inode) > 0) {
				3395	ret = btrfs_check_trunc_cache_free_space(fs_info,
				3396	&fs_info->global_block_rsv);
				3397	if (ret)
				3398	goto out_put;
				3399
				3400	ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
				3401	if (ret)
				3402	goto out_put;
				3403	}
				3404
				3405	spin_lock(&block_group->lock);
				3406	if (block_group->cached != BTRFS_CACHE_FINISHED \|\|
				3407	!btrfs_test_opt(fs_info, SPACE_CACHE)) {
				3408	/*
				3409	* don't bother trying to write stuff out _if_
				3410	* a) we're not cached,
				3411	* b) we're with nospace_cache mount option,
				3412	* c) we're with v2 space_cache (FREE_SPACE_TREE).
				3413	*/
				3414	dcs = BTRFS_DC_WRITTEN;
				3415	spin_unlock(&block_group->lock);
				3416	goto out_put;
				3417	}
				3418	spin_unlock(&block_group->lock);
				3419
				3420	/*
				3421	* We hit an ENOSPC when setting up the cache in this transaction, just
				3422	* skip doing the setup, we've already cleared the cache so we're safe.
				3423	*/
				3424	if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
				3425	ret = -ENOSPC;
				3426	goto out_put;
				3427	}
				3428
				3429	/*
				3430	* Try to preallocate enough space based on how big the block group is.
				3431	* Keep in mind this has to include any pinned space which could end up
				3432	* taking up quite a bit since it's not folded into the other space
				3433	* cache.
				3434	*/
				3435	num_pages = div_u64(block_group->key.offset, SZ_256M);
				3436	if (!num_pages)
				3437	num_pages = 1;
				3438
				3439	num_pages *= 16;
				3440	num_pages *= PAGE_SIZE;
				3441
				3442	ret = btrfs_check_data_free_space(inode, &data_reserved, 0, num_pages);
				3443	if (ret)
				3444	goto out_put;
				3445
				3446	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
				3447	num_pages, num_pages,
				3448	&alloc_hint);
				3449	/*
				3450	* Our cache requires contiguous chunks so that we don't modify a bunch
				3451	* of metadata or split extents when writing the cache out, which means
				3452	* we can enospc if we are heavily fragmented in addition to just normal
				3453	* out of space conditions. So if we hit this just skip setting up any
				3454	* other block groups for this transaction, maybe we'll unpin enough
				3455	* space the next time around.
				3456	*/
				3457	if (!ret)
				3458	dcs = BTRFS_DC_SETUP;
				3459	else if (ret == -ENOSPC)
				3460	set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
				3461
				3462	out_put:
				3463	iput(inode);
				3464	out_free:
				3465	btrfs_release_path(path);
				3466	out:
				3467	spin_lock(&block_group->lock);
				3468	if (!ret && dcs == BTRFS_DC_SETUP)
				3469	block_group->cache_generation = trans->transid;
				3470	block_group->disk_cache_state = dcs;
				3471	spin_unlock(&block_group->lock);
				3472
				3473	extent_changeset_free(data_reserved);
				3474	return ret;
				3475	}
				3476
				3477	int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
				3478	struct btrfs_fs_info *fs_info)
				3479	{
				3480	struct btrfs_block_group_cache cache, tmp;
				3481	struct btrfs_transaction *cur_trans = trans->transaction;
				3482	struct btrfs_path *path;
				3483
				3484	if (list_empty(&cur_trans->dirty_bgs) \|\|
				3485	!btrfs_test_opt(fs_info, SPACE_CACHE))
				3486	return 0;
				3487
				3488	path = btrfs_alloc_path();
				3489	if (!path)
				3490	return -ENOMEM;
				3491
				3492	/* Could add new block groups, use _safe just in case */
				3493	list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
				3494	dirty_list) {
				3495	if (cache->disk_cache_state == BTRFS_DC_CLEAR)
				3496	cache_save_setup(cache, trans, path);
				3497	}
				3498
				3499	btrfs_free_path(path);
				3500	return 0;
				3501	}
				3502
				3503	/*
				3504	* transaction commit does final block group cache writeback during a
				3505	* critical section where nothing is allowed to change the FS. This is
				3506	* required in order for the cache to actually match the block group,
				3507	* but can introduce a lot of latency into the commit.
				3508	*
				3509	* So, btrfs_start_dirty_block_groups is here to kick off block group
				3510	* cache IO. There's a chance we'll have to redo some of it if the
				3511	* block group changes again during the commit, but it greatly reduces
				3512	* the commit latency by getting rid of the easy block groups while
				3513	* we're still allowing others to join the commit.
				3514	*/
				3515	int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
				3516	{
				3517	struct btrfs_fs_info *fs_info = trans->fs_info;
				3518	struct btrfs_block_group_cache *cache;
				3519	struct btrfs_transaction *cur_trans = trans->transaction;
				3520	int ret = 0;
				3521	int should_put;
				3522	struct btrfs_path *path = NULL;
				3523	LIST_HEAD(dirty);
				3524	struct list_head *io = &cur_trans->io_bgs;
				3525	int num_started = 0;
				3526	int loops = 0;
				3527
				3528	spin_lock(&cur_trans->dirty_bgs_lock);
				3529	if (list_empty(&cur_trans->dirty_bgs)) {
				3530	spin_unlock(&cur_trans->dirty_bgs_lock);
				3531	return 0;
				3532	}
				3533	list_splice_init(&cur_trans->dirty_bgs, &dirty);
				3534	spin_unlock(&cur_trans->dirty_bgs_lock);
				3535
				3536	again:
				3537	/*
				3538	* make sure all the block groups on our dirty list actually
				3539	* exist
				3540	*/
				3541	btrfs_create_pending_block_groups(trans);
				3542
				3543	if (!path) {
				3544	path = btrfs_alloc_path();
				3545	if (!path)
				3546	return -ENOMEM;
				3547	}
				3548
				3549	/*
				3550	* cache_write_mutex is here only to save us from balance or automatic
				3551	* removal of empty block groups deleting this block group while we are
				3552	* writing out the cache
				3553	*/
				3554	mutex_lock(&trans->transaction->cache_write_mutex);
				3555	while (!list_empty(&dirty)) {
				3556	cache = list_first_entry(&dirty,
				3557	struct btrfs_block_group_cache,
				3558	dirty_list);
				3559	/*
				3560	* this can happen if something re-dirties a block
				3561	* group that is already under IO. Just wait for it to
				3562	* finish and then do it all again
				3563	*/
				3564	if (!list_empty(&cache->io_list)) {
				3565	list_del_init(&cache->io_list);
				3566	btrfs_wait_cache_io(trans, cache, path);
				3567	btrfs_put_block_group(cache);
				3568	}
				3569
				3570
				3571	/*
				3572	* btrfs_wait_cache_io uses the cache->dirty_list to decide
				3573	* if it should update the cache_state. Don't delete
				3574	* until after we wait.
				3575	*
				3576	* Since we're not running in the commit critical section
				3577	* we need the dirty_bgs_lock to protect from update_block_group
				3578	*/
				3579	spin_lock(&cur_trans->dirty_bgs_lock);
				3580	list_del_init(&cache->dirty_list);
				3581	spin_unlock(&cur_trans->dirty_bgs_lock);
				3582
				3583	should_put = 1;
				3584
				3585	cache_save_setup(cache, trans, path);
				3586
				3587	if (cache->disk_cache_state == BTRFS_DC_SETUP) {
				3588	cache->io_ctl.inode = NULL;
				3589	ret = btrfs_write_out_cache(fs_info, trans,
				3590	cache, path);
				3591	if (ret == 0 && cache->io_ctl.inode) {
				3592	num_started++;
				3593	should_put = 0;
				3594
				3595	/*
				3596	* The cache_write_mutex is protecting the
				3597	* io_list, also refer to the definition of
				3598	* btrfs_transaction::io_bgs for more details
				3599	*/
				3600	list_add_tail(&cache->io_list, io);
				3601	} else {
				3602	/*
				3603	* if we failed to write the cache, the
				3604	* generation will be bad and life goes on
				3605	*/
				3606	ret = 0;
				3607	}
				3608	}
				3609	if (!ret) {
				3610	ret = write_one_cache_group(trans, fs_info,
				3611	path, cache);
				3612	/*
				3613	* Our block group might still be attached to the list
				3614	* of new block groups in the transaction handle of some
				3615	* other task (struct btrfs_trans_handle->new_bgs). This
				3616	* means its block group item isn't yet in the extent
				3617	* tree. If this happens ignore the error, as we will
				3618	* try again later in the critical section of the
				3619	* transaction commit.
				3620	*/
				3621	if (ret == -ENOENT) {
				3622	ret = 0;
				3623	spin_lock(&cur_trans->dirty_bgs_lock);
				3624	if (list_empty(&cache->dirty_list)) {
				3625	list_add_tail(&cache->dirty_list,
				3626	&cur_trans->dirty_bgs);
				3627	btrfs_get_block_group(cache);
				3628	}
				3629	spin_unlock(&cur_trans->dirty_bgs_lock);
				3630	} else if (ret) {
				3631	btrfs_abort_transaction(trans, ret);
				3632	}
				3633	}
				3634
				3635	/* if its not on the io list, we need to put the block group */
				3636	if (should_put)
				3637	btrfs_put_block_group(cache);
				3638
				3639	if (ret)
				3640	break;
				3641
				3642	/*
				3643	* Avoid blocking other tasks for too long. It might even save
				3644	* us from writing caches for block groups that are going to be
				3645	* removed.
				3646	*/
				3647	mutex_unlock(&trans->transaction->cache_write_mutex);
				3648	mutex_lock(&trans->transaction->cache_write_mutex);
				3649	}
				3650	mutex_unlock(&trans->transaction->cache_write_mutex);
				3651
				3652	/*
				3653	* go through delayed refs for all the stuff we've just kicked off
				3654	* and then loop back (just once)
				3655	*/
				3656	ret = btrfs_run_delayed_refs(trans, 0);
				3657	if (!ret && loops == 0) {
				3658	loops++;
				3659	spin_lock(&cur_trans->dirty_bgs_lock);
				3660	list_splice_init(&cur_trans->dirty_bgs, &dirty);
				3661	/*
				3662	* dirty_bgs_lock protects us from concurrent block group
				3663	* deletes too (not just cache_write_mutex).
				3664	*/
				3665	if (!list_empty(&dirty)) {
				3666	spin_unlock(&cur_trans->dirty_bgs_lock);
				3667	goto again;
				3668	}
				3669	spin_unlock(&cur_trans->dirty_bgs_lock);
				3670	} else if (ret < 0) {
				3671	btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
				3672	}
				3673
				3674	btrfs_free_path(path);
				3675	return ret;
				3676	}
				3677
				3678	int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
				3679	struct btrfs_fs_info *fs_info)
				3680	{
				3681	struct btrfs_block_group_cache *cache;
				3682	struct btrfs_transaction *cur_trans = trans->transaction;
				3683	int ret = 0;
				3684	int should_put;
				3685	struct btrfs_path *path;
				3686	struct list_head *io = &cur_trans->io_bgs;
				3687	int num_started = 0;
				3688
				3689	path = btrfs_alloc_path();
				3690	if (!path)
				3691	return -ENOMEM;
				3692
				3693	/*
				3694	* Even though we are in the critical section of the transaction commit,
				3695	* we can still have concurrent tasks adding elements to this
				3696	* transaction's list of dirty block groups. These tasks correspond to
				3697	* endio free space workers started when writeback finishes for a
				3698	* space cache, which run inode.c:btrfs_finish_ordered_io(), and can
				3699	* allocate new block groups as a result of COWing nodes of the root
				3700	* tree when updating the free space inode. The writeback for the space
				3701	* caches is triggered by an earlier call to
				3702	* btrfs_start_dirty_block_groups() and iterations of the following
				3703	* loop.
				3704	* Also we want to do the cache_save_setup first and then run the
				3705	* delayed refs to make sure we have the best chance at doing this all
				3706	* in one shot.
				3707	*/
				3708	spin_lock(&cur_trans->dirty_bgs_lock);
				3709	while (!list_empty(&cur_trans->dirty_bgs)) {
				3710	cache = list_first_entry(&cur_trans->dirty_bgs,
				3711	struct btrfs_block_group_cache,
				3712	dirty_list);
				3713
				3714	/*
				3715	* this can happen if cache_save_setup re-dirties a block
				3716	* group that is already under IO. Just wait for it to
				3717	* finish and then do it all again
				3718	*/
				3719	if (!list_empty(&cache->io_list)) {
				3720	spin_unlock(&cur_trans->dirty_bgs_lock);
				3721	list_del_init(&cache->io_list);
				3722	btrfs_wait_cache_io(trans, cache, path);
				3723	btrfs_put_block_group(cache);
				3724	spin_lock(&cur_trans->dirty_bgs_lock);
				3725	}
				3726
				3727	/*
				3728	* don't remove from the dirty list until after we've waited
				3729	* on any pending IO
				3730	*/
				3731	list_del_init(&cache->dirty_list);
				3732	spin_unlock(&cur_trans->dirty_bgs_lock);
				3733	should_put = 1;
				3734
				3735	cache_save_setup(cache, trans, path);
				3736
				3737	if (!ret)
				3738	ret = btrfs_run_delayed_refs(trans,
				3739	(unsigned long) -1);
				3740
				3741	if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
				3742	cache->io_ctl.inode = NULL;
				3743	ret = btrfs_write_out_cache(fs_info, trans,
				3744	cache, path);
				3745	if (ret == 0 && cache->io_ctl.inode) {
				3746	num_started++;
				3747	should_put = 0;
				3748	list_add_tail(&cache->io_list, io);
				3749	} else {
				3750	/*
				3751	* if we failed to write the cache, the
				3752	* generation will be bad and life goes on
				3753	*/
				3754	ret = 0;
				3755	}
				3756	}
				3757	if (!ret) {
				3758	ret = write_one_cache_group(trans, fs_info,
				3759	path, cache);
				3760	/*
				3761	* One of the free space endio workers might have
				3762	* created a new block group while updating a free space
				3763	* cache's inode (at inode.c:btrfs_finish_ordered_io())
				3764	* and hasn't released its transaction handle yet, in
				3765	* which case the new block group is still attached to
				3766	* its transaction handle and its creation has not
				3767	* finished yet (no block group item in the extent tree
				3768	* yet, etc). If this is the case, wait for all free
				3769	* space endio workers to finish and retry. This is a
				3770	* very rare case so no need for a more efficient and
				3771	* complex approach.
				3772	*/
				3773	if (ret == -ENOENT) {
				3774	wait_event(cur_trans->writer_wait,
				3775	atomic_read(&cur_trans->num_writers) == 1);
				3776	ret = write_one_cache_group(trans, fs_info,
				3777	path, cache);
				3778	}
				3779	if (ret)
				3780	btrfs_abort_transaction(trans, ret);
				3781	}
				3782
				3783	/* if its not on the io list, we need to put the block group */
				3784	if (should_put)
				3785	btrfs_put_block_group(cache);
				3786	spin_lock(&cur_trans->dirty_bgs_lock);
				3787	}
				3788	spin_unlock(&cur_trans->dirty_bgs_lock);
				3789
				3790	/*
				3791	* Refer to the definition of io_bgs member for details why it's safe
				3792	* to use it without any locking
				3793	*/
				3794	while (!list_empty(io)) {
				3795	cache = list_first_entry(io, struct btrfs_block_group_cache,
				3796	io_list);
				3797	list_del_init(&cache->io_list);
				3798	btrfs_wait_cache_io(trans, cache, path);
				3799	btrfs_put_block_group(cache);
				3800	}
				3801
				3802	btrfs_free_path(path);
				3803	return ret;
				3804	}
				3805
				3806	int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
				3807	{
				3808	struct btrfs_block_group_cache *block_group;
				3809	int readonly = 0;
				3810
				3811	block_group = btrfs_lookup_block_group(fs_info, bytenr);
				3812	if (!block_group \|\| block_group->ro)
				3813	readonly = 1;
				3814	if (block_group)
				3815	btrfs_put_block_group(block_group);
				3816	return readonly;
				3817	}
				3818
				3819	bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
				3820	{
				3821	struct btrfs_block_group_cache *bg;
				3822	bool ret = true;
				3823
				3824	bg = btrfs_lookup_block_group(fs_info, bytenr);
				3825	if (!bg)
				3826	return false;
				3827
				3828	spin_lock(&bg->lock);
				3829	if (bg->ro)
				3830	ret = false;
				3831	else
				3832	atomic_inc(&bg->nocow_writers);
				3833	spin_unlock(&bg->lock);
				3834
				3835	/* no put on block group, done by btrfs_dec_nocow_writers */
				3836	if (!ret)
				3837	btrfs_put_block_group(bg);
				3838
				3839	return ret;
				3840
				3841	}
				3842
				3843	void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
				3844	{
				3845	struct btrfs_block_group_cache *bg;
				3846
				3847	bg = btrfs_lookup_block_group(fs_info, bytenr);
				3848	ASSERT(bg);
				3849	if (atomic_dec_and_test(&bg->nocow_writers))
				3850	wake_up_var(&bg->nocow_writers);
				3851	/*
				3852	* Once for our lookup and once for the lookup done by a previous call
				3853	* to btrfs_inc_nocow_writers()
				3854	*/
				3855	btrfs_put_block_group(bg);
				3856	btrfs_put_block_group(bg);
				3857	}
				3858
				3859	void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
				3860	{
				3861	wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
				3862	}
				3863
				3864	static const char *alloc_name(u64 flags)
				3865	{
				3866	switch (flags) {
				3867	case BTRFS_BLOCK_GROUP_METADATA\|BTRFS_BLOCK_GROUP_DATA:
				3868	return "mixed";
				3869	case BTRFS_BLOCK_GROUP_METADATA:
				3870	return "metadata";
				3871	case BTRFS_BLOCK_GROUP_DATA:
				3872	return "data";
				3873	case BTRFS_BLOCK_GROUP_SYSTEM:
				3874	return "system";
				3875	default:
				3876	WARN_ON(1);
				3877	return "invalid-combination";
				3878	};
				3879	}
				3880
				3881	static int create_space_info(struct btrfs_fs_info *info, u64 flags)
				3882	{
				3883
				3884	struct btrfs_space_info *space_info;
				3885	int i;
				3886	int ret;
				3887
				3888	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
				3889	if (!space_info)
				3890	return -ENOMEM;
				3891
				3892	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
				3893	GFP_KERNEL);
				3894	if (ret) {
				3895	kfree(space_info);
				3896	return ret;
				3897	}
				3898
				3899	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
				3900	INIT_LIST_HEAD(&space_info->block_groups[i]);
				3901	init_rwsem(&space_info->groups_sem);
				3902	spin_lock_init(&space_info->lock);
				3903	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
				3904	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
				3905	init_waitqueue_head(&space_info->wait);
				3906	INIT_LIST_HEAD(&space_info->ro_bgs);
				3907	INIT_LIST_HEAD(&space_info->tickets);
				3908	INIT_LIST_HEAD(&space_info->priority_tickets);
				3909
				3910	ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
				3911	info->space_info_kobj, "%s",
				3912	alloc_name(space_info->flags));
				3913	if (ret) {
				3914	kobject_put(&space_info->kobj);
				3915	return ret;
				3916	}
				3917
				3918	list_add_rcu(&space_info->list, &info->space_info);
				3919	if (flags & BTRFS_BLOCK_GROUP_DATA)
				3920	info->data_sinfo = space_info;
				3921
				3922	return ret;
				3923	}
				3924
				3925	static void update_space_info(struct btrfs_fs_info *info, u64 flags,
				3926	u64 total_bytes, u64 bytes_used,
				3927	u64 bytes_readonly,
				3928	struct btrfs_space_info **space_info)
				3929	{
				3930	struct btrfs_space_info *found;
				3931	int factor;
				3932
				3933	factor = btrfs_bg_type_to_factor(flags);
				3934
				3935	found = __find_space_info(info, flags);
				3936	ASSERT(found);
				3937	spin_lock(&found->lock);
				3938	found->total_bytes += total_bytes;
				3939	found->disk_total += total_bytes * factor;
				3940	found->bytes_used += bytes_used;
				3941	found->disk_used += bytes_used * factor;
				3942	found->bytes_readonly += bytes_readonly;
				3943	if (total_bytes > 0)
				3944	found->full = 0;
				3945	space_info_add_new_bytes(info, found, total_bytes -
				3946	bytes_used - bytes_readonly);
				3947	spin_unlock(&found->lock);
				3948	*space_info = found;
				3949	}
				3950
				3951	static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
				3952	{
				3953	u64 extra_flags = chunk_to_extended(flags) &
				3954	BTRFS_EXTENDED_PROFILE_MASK;
				3955
				3956	write_seqlock(&fs_info->profiles_lock);
				3957	if (flags & BTRFS_BLOCK_GROUP_DATA)
				3958	fs_info->avail_data_alloc_bits \|= extra_flags;
				3959	if (flags & BTRFS_BLOCK_GROUP_METADATA)
				3960	fs_info->avail_metadata_alloc_bits \|= extra_flags;
				3961	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
				3962	fs_info->avail_system_alloc_bits \|= extra_flags;
				3963	write_sequnlock(&fs_info->profiles_lock);
				3964	}
				3965
				3966	/*
				3967	* returns target flags in extended format or 0 if restripe for this
				3968	* chunk_type is not in progress
				3969	*
				3970	* should be called with balance_lock held
				3971	*/
				3972	static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
				3973	{
				3974	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
				3975	u64 target = 0;
				3976
				3977	if (!bctl)
				3978	return 0;
				3979
				3980	if (flags & BTRFS_BLOCK_GROUP_DATA &&
				3981	bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
				3982	target = BTRFS_BLOCK_GROUP_DATA \| bctl->data.target;
				3983	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
				3984	bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
				3985	target = BTRFS_BLOCK_GROUP_SYSTEM \| bctl->sys.target;
				3986	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
				3987	bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
				3988	target = BTRFS_BLOCK_GROUP_METADATA \| bctl->meta.target;
				3989	}
				3990
				3991	return target;
				3992	}
				3993
				3994	/*
				3995	* @flags: available profiles in extended format (see ctree.h)
				3996	*
				3997	* Returns reduced profile in chunk format. If profile changing is in
				3998	* progress (either running or paused) picks the target profile (if it's
				3999	* already available), otherwise falls back to plain reducing.
				4000	*/
				4001	static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
				4002	{
				4003	u64 num_devices = fs_info->fs_devices->rw_devices;
				4004	u64 target;
				4005	u64 raid_type;
				4006	u64 allowed = 0;
				4007
				4008	/*
				4009	* see if restripe for this chunk_type is in progress, if so
				4010	* try to reduce to the target profile
				4011	*/
				4012	spin_lock(&fs_info->balance_lock);
				4013	target = get_restripe_target(fs_info, flags);
				4014	if (target) {
				4015	/* pick target profile only if it's already available */
				4016	if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
				4017	spin_unlock(&fs_info->balance_lock);
				4018	return extended_to_chunk(target);
				4019	}
				4020	}
				4021	spin_unlock(&fs_info->balance_lock);
				4022
				4023	/* First, mask out the RAID levels which aren't possible */
				4024	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
				4025	if (num_devices >= btrfs_raid_array[raid_type].devs_min)
				4026	allowed \|= btrfs_raid_array[raid_type].bg_flag;
				4027	}
				4028	allowed &= flags;
				4029
				4030	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
				4031	allowed = BTRFS_BLOCK_GROUP_RAID6;
				4032	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
				4033	allowed = BTRFS_BLOCK_GROUP_RAID5;
				4034	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
				4035	allowed = BTRFS_BLOCK_GROUP_RAID10;
				4036	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
				4037	allowed = BTRFS_BLOCK_GROUP_RAID1;
				4038	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
				4039	allowed = BTRFS_BLOCK_GROUP_RAID0;
				4040
				4041	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
				4042
				4043	return extended_to_chunk(flags \| allowed);
				4044	}
				4045
				4046	static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
				4047	{
				4048	unsigned seq;
				4049	u64 flags;
				4050
				4051	do {
				4052	flags = orig_flags;
				4053	seq = read_seqbegin(&fs_info->profiles_lock);
				4054
				4055	if (flags & BTRFS_BLOCK_GROUP_DATA)
				4056	flags \|= fs_info->avail_data_alloc_bits;
				4057	else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
				4058	flags \|= fs_info->avail_system_alloc_bits;
				4059	else if (flags & BTRFS_BLOCK_GROUP_METADATA)
				4060	flags \|= fs_info->avail_metadata_alloc_bits;
				4061	} while (read_seqretry(&fs_info->profiles_lock, seq));
				4062
				4063	return btrfs_reduce_alloc_profile(fs_info, flags);
				4064	}
				4065
				4066	static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
				4067	{
				4068	struct btrfs_fs_info *fs_info = root->fs_info;
				4069	u64 flags;
				4070	u64 ret;
				4071
				4072	if (data)
				4073	flags = BTRFS_BLOCK_GROUP_DATA;
				4074	else if (root == fs_info->chunk_root)
				4075	flags = BTRFS_BLOCK_GROUP_SYSTEM;
				4076	else
				4077	flags = BTRFS_BLOCK_GROUP_METADATA;
				4078
				4079	ret = get_alloc_profile(fs_info, flags);
				4080	return ret;
				4081	}
				4082
				4083	u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
				4084	{
				4085	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
				4086	}
				4087
				4088	u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
				4089	{
				4090	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
				4091	}
				4092
				4093	u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
				4094	{
				4095	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
				4096	}
				4097
				4098	static u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
				4099	bool may_use_included)
				4100	{
				4101	ASSERT(s_info);
				4102	return s_info->bytes_used + s_info->bytes_reserved +
				4103	s_info->bytes_pinned + s_info->bytes_readonly +
				4104	(may_use_included ? s_info->bytes_may_use : 0);
				4105	}
				4106
				4107	int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
				4108	{
				4109	struct btrfs_root *root = inode->root;
				4110	struct btrfs_fs_info *fs_info = root->fs_info;
				4111	struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
				4112	u64 used;
				4113	int ret = 0;
				4114	int need_commit = 2;
				4115	int have_pinned_space;
				4116
				4117	/* make sure bytes are sectorsize aligned */
				4118	bytes = ALIGN(bytes, fs_info->sectorsize);
				4119
				4120	if (btrfs_is_free_space_inode(inode)) {
				4121	need_commit = 0;
				4122	ASSERT(current->journal_info);
				4123	}
				4124
				4125	again:
				4126	/* make sure we have enough space to handle the data first */
				4127	spin_lock(&data_sinfo->lock);
				4128	used = btrfs_space_info_used(data_sinfo, true);
				4129
				4130	if (used + bytes > data_sinfo->total_bytes) {
				4131	struct btrfs_trans_handle *trans;
				4132
				4133	/*
				4134	* if we don't have enough free bytes in this space then we need
				4135	* to alloc a new chunk.
				4136	*/
				4137	if (!data_sinfo->full) {
				4138	u64 alloc_target;
				4139
				4140	data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
				4141	spin_unlock(&data_sinfo->lock);
				4142
				4143	alloc_target = btrfs_data_alloc_profile(fs_info);
				4144	/*
				4145	* It is ugly that we don't call nolock join
				4146	* transaction for the free space inode case here.
				4147	* But it is safe because we only do the data space
				4148	* reservation for the free space cache in the
				4149	* transaction context, the common join transaction
				4150	* just increase the counter of the current transaction
				4151	* handler, doesn't try to acquire the trans_lock of
				4152	* the fs.
				4153	*/
				4154	trans = btrfs_join_transaction(root);
				4155	if (IS_ERR(trans))
				4156	return PTR_ERR(trans);
				4157
				4158	ret = do_chunk_alloc(trans, alloc_target,
				4159	CHUNK_ALLOC_NO_FORCE);
				4160	btrfs_end_transaction(trans);
				4161	if (ret < 0) {
				4162	if (ret != -ENOSPC)
				4163	return ret;
				4164	else {
				4165	have_pinned_space = 1;
				4166	goto commit_trans;
				4167	}
				4168	}
				4169
				4170	goto again;
				4171	}
				4172
				4173	/*
				4174	* If we don't have enough pinned space to deal with this
				4175	* allocation, and no removed chunk in current transaction,
				4176	* don't bother committing the transaction.
				4177	*/
				4178	have_pinned_space = __percpu_counter_compare(
				4179	&data_sinfo->total_bytes_pinned,
				4180	used + bytes - data_sinfo->total_bytes,
				4181	BTRFS_TOTAL_BYTES_PINNED_BATCH);
				4182	spin_unlock(&data_sinfo->lock);
				4183
				4184	/* commit the current transaction and try again */
				4185	commit_trans:
				4186	if (need_commit) {
				4187	need_commit--;
				4188
				4189	if (need_commit > 0) {
				4190	btrfs_start_delalloc_roots(fs_info, -1);
				4191	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0,
				4192	(u64)-1);
				4193	}
				4194
				4195	trans = btrfs_join_transaction(root);
				4196	if (IS_ERR(trans))
				4197	return PTR_ERR(trans);
				4198	if (have_pinned_space >= 0 \|\|
				4199	test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
				4200	&trans->transaction->flags) \|\|
				4201	need_commit > 0) {
				4202	ret = btrfs_commit_transaction(trans);
				4203	if (ret)
				4204	return ret;
				4205	/*
				4206	* The cleaner kthread might still be doing iput
				4207	* operations. Wait for it to finish so that
				4208	* more space is released.
				4209	*/
				4210	mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
				4211	mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
				4212	goto again;
				4213	} else {
				4214	btrfs_end_transaction(trans);
				4215	}
				4216	}
				4217
				4218	trace_btrfs_space_reservation(fs_info,
				4219	"space_info:enospc",
				4220	data_sinfo->flags, bytes, 1);
				4221	return -ENOSPC;
				4222	}
				4223	data_sinfo->bytes_may_use += bytes;
				4224	trace_btrfs_space_reservation(fs_info, "space_info",
				4225	data_sinfo->flags, bytes, 1);
				4226	spin_unlock(&data_sinfo->lock);
				4227
				4228	return 0;
				4229	}
				4230
				4231	int btrfs_check_data_free_space(struct inode *inode,
				4232	struct extent_changeset **reserved, u64 start, u64 len)
				4233	{
				4234	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				4235	int ret;
				4236
				4237	/* align the range */
				4238	len = round_up(start + len, fs_info->sectorsize) -
				4239	round_down(start, fs_info->sectorsize);
				4240	start = round_down(start, fs_info->sectorsize);
				4241
				4242	ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), len);
				4243	if (ret < 0)
				4244	return ret;
				4245
				4246	/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
				4247	ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
				4248	if (ret < 0)
				4249	btrfs_free_reserved_data_space_noquota(inode, start, len);
				4250	else
				4251	ret = 0;
				4252	return ret;
				4253	}
				4254
				4255	/*
				4256	* Called if we need to clear a data reservation for this inode
				4257	* Normally in a error case.
				4258	*
				4259	* This one will NOT use accurate qgroup reserved space API, just for case
				4260	* which we can't sleep and is sure it won't affect qgroup reserved space.
				4261	* Like clear_bit_hook().
				4262	*/
				4263	void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
				4264	u64 len)
				4265	{
				4266	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				4267	struct btrfs_space_info *data_sinfo;
				4268
				4269	/* Make sure the range is aligned to sectorsize */
				4270	len = round_up(start + len, fs_info->sectorsize) -
				4271	round_down(start, fs_info->sectorsize);
				4272	start = round_down(start, fs_info->sectorsize);
				4273
				4274	data_sinfo = fs_info->data_sinfo;
				4275	spin_lock(&data_sinfo->lock);
				4276	if (WARN_ON(data_sinfo->bytes_may_use < len))
				4277	data_sinfo->bytes_may_use = 0;
				4278	else
				4279	data_sinfo->bytes_may_use -= len;
				4280	trace_btrfs_space_reservation(fs_info, "space_info",
				4281	data_sinfo->flags, len, 0);
				4282	spin_unlock(&data_sinfo->lock);
				4283	}
				4284
				4285	/*
				4286	* Called if we need to clear a data reservation for this inode
				4287	* Normally in a error case.
				4288	*
				4289	* This one will handle the per-inode data rsv map for accurate reserved
				4290	* space framework.
				4291	*/
				4292	void btrfs_free_reserved_data_space(struct inode *inode,
				4293	struct extent_changeset *reserved, u64 start, u64 len)
				4294	{
				4295	struct btrfs_root *root = BTRFS_I(inode)->root;
				4296
				4297	/* Make sure the range is aligned to sectorsize */
				4298	len = round_up(start + len, root->fs_info->sectorsize) -
				4299	round_down(start, root->fs_info->sectorsize);
				4300	start = round_down(start, root->fs_info->sectorsize);
				4301
				4302	btrfs_free_reserved_data_space_noquota(inode, start, len);
				4303	btrfs_qgroup_free_data(inode, reserved, start, len);
				4304	}
				4305
				4306	static void force_metadata_allocation(struct btrfs_fs_info *info)
				4307	{
				4308	struct list_head *head = &info->space_info;
				4309	struct btrfs_space_info *found;
				4310
				4311	rcu_read_lock();
				4312	list_for_each_entry_rcu(found, head, list) {
				4313	if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
				4314	found->force_alloc = CHUNK_ALLOC_FORCE;
				4315	}
				4316	rcu_read_unlock();
				4317	}
				4318
				4319	static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
				4320	{
				4321	return (global->size << 1);
				4322	}
				4323
				4324	static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
				4325	struct btrfs_space_info *sinfo, int force)
				4326	{
				4327	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
				4328	u64 bytes_used = btrfs_space_info_used(sinfo, false);
				4329	u64 thresh;
				4330
				4331	if (force == CHUNK_ALLOC_FORCE)
				4332	return 1;
				4333
				4334	/*
				4335	* We need to take into account the global rsv because for all intents
				4336	* and purposes it's used space. Don't worry about locking the
				4337	* global_rsv, it doesn't change except when the transaction commits.
				4338	*/
				4339	if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
				4340	bytes_used += calc_global_rsv_need_space(global_rsv);
				4341
				4342	/*
				4343	* in limited mode, we want to have some free space up to
				4344	* about 1% of the FS size.
				4345	*/
				4346	if (force == CHUNK_ALLOC_LIMITED) {
				4347	thresh = btrfs_super_total_bytes(fs_info->super_copy);
				4348	thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
				4349
				4350	if (sinfo->total_bytes - bytes_used < thresh)
				4351	return 1;
				4352	}
				4353
				4354	if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
				4355	return 0;
				4356	return 1;
				4357	}
				4358
				4359	static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
				4360	{
				4361	u64 num_dev;
				4362
				4363	if (type & (BTRFS_BLOCK_GROUP_RAID10 \|
				4364	BTRFS_BLOCK_GROUP_RAID0 \|
				4365	BTRFS_BLOCK_GROUP_RAID5 \|
				4366	BTRFS_BLOCK_GROUP_RAID6))
				4367	num_dev = fs_info->fs_devices->rw_devices;
				4368	else if (type & BTRFS_BLOCK_GROUP_RAID1)
				4369	num_dev = 2;
				4370	else
				4371	num_dev = 1; /* DUP or single */
				4372
				4373	return num_dev;
				4374	}
				4375
				4376	/*
				4377	* If @is_allocation is true, reserve space in the system space info necessary
				4378	* for allocating a chunk, otherwise if it's false, reserve space necessary for
				4379	* removing a chunk.
				4380	*/
				4381	void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
				4382	{
				4383	struct btrfs_fs_info *fs_info = trans->fs_info;
				4384	struct btrfs_space_info *info;
				4385	u64 left;
				4386	u64 thresh;
				4387	int ret = 0;
				4388	u64 num_devs;
				4389
				4390	/*
				4391	* Needed because we can end up allocating a system chunk and for an
				4392	* atomic and race free space reservation in the chunk block reserve.
				4393	*/
				4394	lockdep_assert_held(&fs_info->chunk_mutex);
				4395
				4396	info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
				4397	spin_lock(&info->lock);
				4398	left = info->total_bytes - btrfs_space_info_used(info, true);
				4399	spin_unlock(&info->lock);
				4400
				4401	num_devs = get_profile_num_devs(fs_info, type);
				4402
				4403	/* num_devs device items to update and 1 chunk item to add or remove */
				4404	thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
				4405	btrfs_calc_trans_metadata_size(fs_info, 1);
				4406
				4407	if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
				4408	btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
				4409	left, thresh, type);
				4410	dump_space_info(fs_info, info, 0, 0);
				4411	}
				4412
				4413	if (left < thresh) {
				4414	u64 flags = btrfs_system_alloc_profile(fs_info);
				4415
				4416	/*
				4417	* Ignore failure to create system chunk. We might end up not
				4418	* needing it, as we might not need to COW all nodes/leafs from
				4419	* the paths we visit in the chunk tree (they were already COWed
				4420	* or created in the current transaction for example).
				4421	*/
				4422	ret = btrfs_alloc_chunk(trans, flags);
				4423	}
				4424
				4425	if (!ret) {
				4426	ret = btrfs_block_rsv_add(fs_info->chunk_root,
				4427	&fs_info->chunk_block_rsv,
				4428	thresh, BTRFS_RESERVE_NO_FLUSH);
				4429	if (!ret)
				4430	trans->chunk_bytes_reserved += thresh;
				4431	}
				4432	}
				4433
				4434	/*
				4435	* If force is CHUNK_ALLOC_FORCE:
				4436	* - return 1 if it successfully allocates a chunk,
				4437	* - return errors including -ENOSPC otherwise.
				4438	* If force is NOT CHUNK_ALLOC_FORCE:
				4439	* - return 0 if it doesn't need to allocate a new chunk,
				4440	* - return 1 if it successfully allocates a chunk,
				4441	* - return errors including -ENOSPC otherwise.
				4442	*/
				4443	static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
				4444	int force)
				4445	{
				4446	struct btrfs_fs_info *fs_info = trans->fs_info;
				4447	struct btrfs_space_info *space_info;
				4448	bool wait_for_alloc = false;
				4449	bool should_alloc = false;
				4450	int ret = 0;
				4451
				4452	/* Don't re-enter if we're already allocating a chunk */
				4453	if (trans->allocating_chunk)
				4454	return -ENOSPC;
				4455
				4456	space_info = __find_space_info(fs_info, flags);
				4457	ASSERT(space_info);
				4458
				4459	do {
				4460	spin_lock(&space_info->lock);
				4461	if (force < space_info->force_alloc)
				4462	force = space_info->force_alloc;
				4463	should_alloc = should_alloc_chunk(fs_info, space_info, force);
				4464	if (space_info->full) {
				4465	/* No more free physical space */
				4466	if (should_alloc)
				4467	ret = -ENOSPC;
				4468	else
				4469	ret = 0;
				4470	spin_unlock(&space_info->lock);
				4471	return ret;
				4472	} else if (!should_alloc) {
				4473	spin_unlock(&space_info->lock);
				4474	return 0;
				4475	} else if (space_info->chunk_alloc) {
				4476	/*
				4477	* Someone is already allocating, so we need to block
				4478	* until this someone is finished and then loop to
				4479	* recheck if we should continue with our allocation
				4480	* attempt.
				4481	*/
				4482	wait_for_alloc = true;
				4483	spin_unlock(&space_info->lock);
				4484	mutex_lock(&fs_info->chunk_mutex);
				4485	mutex_unlock(&fs_info->chunk_mutex);
				4486	} else {
				4487	/* Proceed with allocation */
				4488	space_info->chunk_alloc = 1;
				4489	wait_for_alloc = false;
				4490	spin_unlock(&space_info->lock);
				4491	}
				4492
				4493	cond_resched();
				4494	} while (wait_for_alloc);
				4495
				4496	mutex_lock(&fs_info->chunk_mutex);
				4497	trans->allocating_chunk = true;
				4498
				4499	/*
				4500	* If we have mixed data/metadata chunks we want to make sure we keep
				4501	* allocating mixed chunks instead of individual chunks.
				4502	*/
				4503	if (btrfs_mixed_space_info(space_info))
				4504	flags \|= (BTRFS_BLOCK_GROUP_DATA \| BTRFS_BLOCK_GROUP_METADATA);
				4505
				4506	/*
				4507	* if we're doing a data chunk, go ahead and make sure that
				4508	* we keep a reasonable number of metadata chunks allocated in the
				4509	* FS as well.
				4510	*/
				4511	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
				4512	fs_info->data_chunk_allocations++;
				4513	if (!(fs_info->data_chunk_allocations %
				4514	fs_info->metadata_ratio))
				4515	force_metadata_allocation(fs_info);
				4516	}
				4517
				4518	/*
				4519	* Check if we have enough space in SYSTEM chunk because we may need
				4520	* to update devices.
				4521	*/
				4522	check_system_chunk(trans, flags);
				4523
				4524	ret = btrfs_alloc_chunk(trans, flags);
				4525	trans->allocating_chunk = false;
				4526
				4527	spin_lock(&space_info->lock);
				4528	if (ret < 0) {
				4529	if (ret == -ENOSPC)
				4530	space_info->full = 1;
				4531	else
				4532	goto out;
				4533	} else {
				4534	ret = 1;
				4535	space_info->max_extent_size = 0;
				4536	}
				4537
				4538	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
				4539	out:
				4540	space_info->chunk_alloc = 0;
				4541	spin_unlock(&space_info->lock);
				4542	mutex_unlock(&fs_info->chunk_mutex);
				4543	/*
				4544	* When we allocate a new chunk we reserve space in the chunk block
				4545	* reserve to make sure we can COW nodes/leafs in the chunk tree or
				4546	* add new nodes/leafs to it if we end up needing to do it when
				4547	* inserting the chunk item and updating device items as part of the
				4548	* second phase of chunk allocation, performed by
				4549	* btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
				4550	* large number of new block groups to create in our transaction
				4551	* handle's new_bgs list to avoid exhausting the chunk block reserve
				4552	* in extreme cases - like having a single transaction create many new
				4553	* block groups when starting to write out the free space caches of all
				4554	* the block groups that were made dirty during the lifetime of the
				4555	* transaction.
				4556	*/
				4557	if (trans->chunk_bytes_reserved >= (u64)SZ_2M)
				4558	btrfs_create_pending_block_groups(trans);
				4559
				4560	return ret;
				4561	}
				4562
				4563	static int can_overcommit(struct btrfs_fs_info *fs_info,
				4564	struct btrfs_space_info *space_info, u64 bytes,
				4565	enum btrfs_reserve_flush_enum flush,
				4566	bool system_chunk)
				4567	{
				4568	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
				4569	u64 profile;
				4570	u64 space_size;
				4571	u64 avail;
				4572	u64 used;
				4573	int factor;
				4574
				4575	/* Don't overcommit when in mixed mode. */
				4576	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
				4577	return 0;
				4578
				4579	if (system_chunk)
				4580	profile = btrfs_system_alloc_profile(fs_info);
				4581	else
				4582	profile = btrfs_metadata_alloc_profile(fs_info);
				4583
				4584	used = btrfs_space_info_used(space_info, false);
				4585
				4586	/*
				4587	* We only want to allow over committing if we have lots of actual space
				4588	* free, but if we don't have enough space to handle the global reserve
				4589	* space then we could end up having a real enospc problem when trying
				4590	* to allocate a chunk or some other such important allocation.
				4591	*/
				4592	spin_lock(&global_rsv->lock);
				4593	space_size = calc_global_rsv_need_space(global_rsv);
				4594	spin_unlock(&global_rsv->lock);
				4595	if (used + space_size >= space_info->total_bytes)
				4596	return 0;
				4597
				4598	used += space_info->bytes_may_use;
				4599
				4600	avail = atomic64_read(&fs_info->free_chunk_space);
				4601
				4602	/*
				4603	* If we have dup, raid1 or raid10 then only half of the free
				4604	* space is actually useable. For raid56, the space info used
				4605	* doesn't include the parity drive, so we don't have to
				4606	* change the math
				4607	*/
				4608	factor = btrfs_bg_type_to_factor(profile);
				4609	avail = div_u64(avail, factor);
				4610
				4611	/*
				4612	* If we aren't flushing all things, let us overcommit up to
				4613	* 1/2th of the space. If we can flush, don't let us overcommit
				4614	* too much, let it overcommit up to 1/8 of the space.
				4615	*/
				4616	if (flush == BTRFS_RESERVE_FLUSH_ALL)
				4617	avail >>= 3;
				4618	else
				4619	avail >>= 1;
				4620
				4621	if (used + bytes < space_info->total_bytes + avail)
				4622	return 1;
				4623	return 0;
				4624	}
				4625
				4626	static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
				4627	unsigned long nr_pages, int nr_items)
				4628	{
				4629	struct super_block *sb = fs_info->sb;
				4630
				4631	if (down_read_trylock(&sb->s_umount)) {
				4632	writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
				4633	up_read(&sb->s_umount);
				4634	} else {
				4635	/*
				4636	* We needn't worry the filesystem going from r/w to r/o though
				4637	* we don't acquire ->s_umount mutex, because the filesystem
				4638	* should guarantee the delalloc inodes list be empty after
				4639	* the filesystem is readonly(all dirty pages are written to
				4640	* the disk).
				4641	*/
				4642	btrfs_start_delalloc_roots(fs_info, nr_items);
				4643	if (!current->journal_info)
				4644	btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
				4645	}
				4646	}
				4647
				4648	static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
				4649	u64 to_reclaim)
				4650	{
				4651	u64 bytes;
				4652	u64 nr;
				4653
				4654	bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
				4655	nr = div64_u64(to_reclaim, bytes);
				4656	if (!nr)
				4657	nr = 1;
				4658	return nr;
				4659	}
				4660
				4661	#define EXTENT_SIZE_PER_ITEM SZ_256K
				4662
				4663	/*
				4664	* shrink metadata reservation for delalloc
				4665	*/
				4666	static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
				4667	u64 orig, bool wait_ordered)
				4668	{
				4669	struct btrfs_space_info *space_info;
				4670	struct btrfs_trans_handle *trans;
				4671	u64 delalloc_bytes;
				4672	u64 max_reclaim;
				4673	u64 items;
				4674	long time_left;
				4675	unsigned long nr_pages;
				4676	int loops;
				4677
				4678	/* Calc the number of the pages we need flush for space reservation */
				4679	items = calc_reclaim_items_nr(fs_info, to_reclaim);
				4680	to_reclaim = items * EXTENT_SIZE_PER_ITEM;
				4681
				4682	trans = (struct btrfs_trans_handle *)current->journal_info;
				4683	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
				4684
				4685	delalloc_bytes = percpu_counter_sum_positive(
				4686	&fs_info->delalloc_bytes);
				4687	if (delalloc_bytes == 0) {
				4688	if (trans)
				4689	return;
				4690	if (wait_ordered)
				4691	btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
				4692	return;
				4693	}
				4694
				4695	loops = 0;
				4696	while (delalloc_bytes && loops < 3) {
				4697	max_reclaim = min(delalloc_bytes, to_reclaim);
				4698	nr_pages = max_reclaim >> PAGE_SHIFT;
				4699	btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
				4700	/*
				4701	* We need to wait for the async pages to actually start before
				4702	* we do anything.
				4703	*/
				4704	max_reclaim = atomic_read(&fs_info->async_delalloc_pages);
				4705	if (!max_reclaim)
				4706	goto skip_async;
				4707
				4708	if (max_reclaim <= nr_pages)
				4709	max_reclaim = 0;
				4710	else
				4711	max_reclaim -= nr_pages;
				4712
				4713	wait_event(fs_info->async_submit_wait,
				4714	atomic_read(&fs_info->async_delalloc_pages) <=
				4715	(int)max_reclaim);
				4716	skip_async:
				4717	spin_lock(&space_info->lock);
				4718	if (list_empty(&space_info->tickets) &&
				4719	list_empty(&space_info->priority_tickets)) {
				4720	spin_unlock(&space_info->lock);
				4721	break;
				4722	}
				4723	spin_unlock(&space_info->lock);
				4724
				4725	loops++;
				4726	if (wait_ordered && !trans) {
				4727	btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
				4728	} else {
				4729	time_left = schedule_timeout_killable(1);
				4730	if (time_left)
				4731	break;
				4732	}
				4733	delalloc_bytes = percpu_counter_sum_positive(
				4734	&fs_info->delalloc_bytes);
				4735	}
				4736	}
				4737
				4738	struct reserve_ticket {
				4739	u64 bytes;
				4740	int error;
				4741	struct list_head list;
				4742	wait_queue_head_t wait;
				4743	};
				4744
				4745	/**
				4746	* maybe_commit_transaction - possibly commit the transaction if its ok to
				4747	* @root - the root we're allocating for
				4748	* @bytes - the number of bytes we want to reserve
				4749	* @force - force the commit
				4750	*
				4751	* This will check to make sure that committing the transaction will actually
				4752	* get us somewhere and then commit the transaction if it does. Otherwise it
				4753	* will return -ENOSPC.
				4754	*/
				4755	static int may_commit_transaction(struct btrfs_fs_info *fs_info,
				4756	struct btrfs_space_info *space_info)
				4757	{
				4758	struct reserve_ticket *ticket = NULL;
				4759	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
				4760	struct btrfs_trans_handle *trans;
				4761	u64 bytes;
				4762
				4763	trans = (struct btrfs_trans_handle *)current->journal_info;
				4764	if (trans)
				4765	return -EAGAIN;
				4766
				4767	spin_lock(&space_info->lock);
				4768	if (!list_empty(&space_info->priority_tickets))
				4769	ticket = list_first_entry(&space_info->priority_tickets,
				4770	struct reserve_ticket, list);
				4771	else if (!list_empty(&space_info->tickets))
				4772	ticket = list_first_entry(&space_info->tickets,
				4773	struct reserve_ticket, list);
				4774	bytes = (ticket) ? ticket->bytes : 0;
				4775	spin_unlock(&space_info->lock);
				4776
				4777	if (!bytes)
				4778	return 0;
				4779
				4780	/* See if there is enough pinned space to make this reservation */
				4781	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
				4782	bytes,
				4783	BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
				4784	goto commit;
				4785
				4786	/*
				4787	* See if there is some space in the delayed insertion reservation for
				4788	* this reservation.
				4789	*/
				4790	if (space_info != delayed_rsv->space_info)
				4791	return -ENOSPC;
				4792
				4793	spin_lock(&delayed_rsv->lock);
				4794	if (delayed_rsv->size > bytes)
				4795	bytes = 0;
				4796	else
				4797	bytes -= delayed_rsv->size;
				4798	spin_unlock(&delayed_rsv->lock);
				4799
				4800	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
				4801	bytes,
				4802	BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) {
				4803	return -ENOSPC;
				4804	}
				4805
				4806	commit:
				4807	trans = btrfs_join_transaction(fs_info->extent_root);
				4808	if (IS_ERR(trans))
				4809	return -ENOSPC;
				4810
				4811	return btrfs_commit_transaction(trans);
				4812	}
				4813
				4814	/*
				4815	* Try to flush some data based on policy set by @state. This is only advisory
				4816	* and may fail for various reasons. The caller is supposed to examine the
				4817	* state of @space_info to detect the outcome.
				4818	*/
				4819	static void flush_space(struct btrfs_fs_info *fs_info,
				4820	struct btrfs_space_info *space_info, u64 num_bytes,
				4821	int state)
				4822	{
				4823	struct btrfs_root *root = fs_info->extent_root;
				4824	struct btrfs_trans_handle *trans;
				4825	int nr;
				4826	int ret = 0;
				4827
				4828	switch (state) {
				4829	case FLUSH_DELAYED_ITEMS_NR:
				4830	case FLUSH_DELAYED_ITEMS:
				4831	if (state == FLUSH_DELAYED_ITEMS_NR)
				4832	nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
				4833	else
				4834	nr = -1;
				4835
				4836	trans = btrfs_join_transaction(root);
				4837	if (IS_ERR(trans)) {
				4838	ret = PTR_ERR(trans);
				4839	break;
				4840	}
				4841	ret = btrfs_run_delayed_items_nr(trans, nr);
				4842	btrfs_end_transaction(trans);
				4843	break;
				4844	case FLUSH_DELALLOC:
				4845	case FLUSH_DELALLOC_WAIT:
				4846	shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
				4847	state == FLUSH_DELALLOC_WAIT);
				4848	break;
				4849	case ALLOC_CHUNK:
				4850	trans = btrfs_join_transaction(root);
				4851	if (IS_ERR(trans)) {
				4852	ret = PTR_ERR(trans);
				4853	break;
				4854	}
				4855	ret = do_chunk_alloc(trans,
				4856	btrfs_metadata_alloc_profile(fs_info),
				4857	CHUNK_ALLOC_NO_FORCE);
				4858	btrfs_end_transaction(trans);
				4859	if (ret > 0 \|\| ret == -ENOSPC)
				4860	ret = 0;
				4861	break;
				4862	case COMMIT_TRANS:
				4863	ret = may_commit_transaction(fs_info, space_info);
				4864	break;
				4865	default:
				4866	ret = -ENOSPC;
				4867	break;
				4868	}
				4869
				4870	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
				4871	ret);
				4872	return;
				4873	}
				4874
				4875	static inline u64
				4876	btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
				4877	struct btrfs_space_info *space_info,
				4878	bool system_chunk)
				4879	{
				4880	struct reserve_ticket *ticket;
				4881	u64 used;
				4882	u64 expected;
				4883	u64 to_reclaim = 0;
				4884
				4885	list_for_each_entry(ticket, &space_info->tickets, list)
				4886	to_reclaim += ticket->bytes;
				4887	list_for_each_entry(ticket, &space_info->priority_tickets, list)
				4888	to_reclaim += ticket->bytes;
				4889	if (to_reclaim)
				4890	return to_reclaim;
				4891
				4892	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
				4893	if (can_overcommit(fs_info, space_info, to_reclaim,
				4894	BTRFS_RESERVE_FLUSH_ALL, system_chunk))
				4895	return 0;
				4896
				4897	used = btrfs_space_info_used(space_info, true);
				4898
				4899	if (can_overcommit(fs_info, space_info, SZ_1M,
				4900	BTRFS_RESERVE_FLUSH_ALL, system_chunk))
				4901	expected = div_factor_fine(space_info->total_bytes, 95);
				4902	else
				4903	expected = div_factor_fine(space_info->total_bytes, 90);
				4904
				4905	if (used > expected)
				4906	to_reclaim = used - expected;
				4907	else
				4908	to_reclaim = 0;
				4909	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
				4910	space_info->bytes_reserved);
				4911	return to_reclaim;
				4912	}
				4913
				4914	static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
				4915	struct btrfs_space_info *space_info,
				4916	u64 used, bool system_chunk)
				4917	{
				4918	u64 thresh = div_factor_fine(space_info->total_bytes, 98);
				4919
				4920	/* If we're just plain full then async reclaim just slows us down. */
				4921	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
				4922	return 0;
				4923
				4924	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
				4925	system_chunk))
				4926	return 0;
				4927
				4928	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
				4929	!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
				4930	}
				4931
				4932	static void wake_all_tickets(struct list_head *head)
				4933	{
				4934	struct reserve_ticket *ticket;
				4935
				4936	while (!list_empty(head)) {
				4937	ticket = list_first_entry(head, struct reserve_ticket, list);
				4938	list_del_init(&ticket->list);
				4939	ticket->error = -ENOSPC;
				4940	wake_up(&ticket->wait);
				4941	}
				4942	}
				4943
				4944	/*
				4945	* This is for normal flushers, we can wait all goddamned day if we want to. We
				4946	* will loop and continuously try to flush as long as we are making progress.
				4947	* We count progress as clearing off tickets each time we have to loop.
				4948	*/
				4949	static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
				4950	{
				4951	struct btrfs_fs_info *fs_info;
				4952	struct btrfs_space_info *space_info;
				4953	u64 to_reclaim;
				4954	int flush_state;
				4955	int commit_cycles = 0;
				4956	u64 last_tickets_id;
				4957
				4958	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
				4959	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
				4960
				4961	spin_lock(&space_info->lock);
				4962	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
				4963	false);
				4964	if (!to_reclaim) {
				4965	space_info->flush = 0;
				4966	spin_unlock(&space_info->lock);
				4967	return;
				4968	}
				4969	last_tickets_id = space_info->tickets_id;
				4970	spin_unlock(&space_info->lock);
				4971
				4972	flush_state = FLUSH_DELAYED_ITEMS_NR;
				4973	do {
				4974	flush_space(fs_info, space_info, to_reclaim, flush_state);
				4975	spin_lock(&space_info->lock);
				4976	if (list_empty(&space_info->tickets)) {
				4977	space_info->flush = 0;
				4978	spin_unlock(&space_info->lock);
				4979	return;
				4980	}
				4981	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
				4982	space_info,
				4983	false);
				4984	if (last_tickets_id == space_info->tickets_id) {
				4985	flush_state++;
				4986	} else {
				4987	last_tickets_id = space_info->tickets_id;
				4988	flush_state = FLUSH_DELAYED_ITEMS_NR;
				4989	if (commit_cycles)
				4990	commit_cycles--;
				4991	}
				4992
				4993	if (flush_state > COMMIT_TRANS) {
				4994	commit_cycles++;
				4995	if (commit_cycles > 2) {
				4996	wake_all_tickets(&space_info->tickets);
				4997	space_info->flush = 0;
				4998	} else {
				4999	flush_state = FLUSH_DELAYED_ITEMS_NR;
				5000	}
				5001	}
				5002	spin_unlock(&space_info->lock);
				5003	} while (flush_state <= COMMIT_TRANS);
				5004	}
				5005
				5006	void btrfs_init_async_reclaim_work(struct work_struct *work)
				5007	{
				5008	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
				5009	}
				5010
				5011	static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
				5012	struct btrfs_space_info *space_info,
				5013	struct reserve_ticket *ticket)
				5014	{
				5015	u64 to_reclaim;
				5016	int flush_state = FLUSH_DELAYED_ITEMS_NR;
				5017
				5018	spin_lock(&space_info->lock);
				5019	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
				5020	false);
				5021	if (!to_reclaim) {
				5022	spin_unlock(&space_info->lock);
				5023	return;
				5024	}
				5025	spin_unlock(&space_info->lock);
				5026
				5027	do {
				5028	flush_space(fs_info, space_info, to_reclaim, flush_state);
				5029	flush_state++;
				5030	spin_lock(&space_info->lock);
				5031	if (ticket->bytes == 0) {
				5032	spin_unlock(&space_info->lock);
				5033	return;
				5034	}
				5035	spin_unlock(&space_info->lock);
				5036
				5037	/*
				5038	* Priority flushers can't wait on delalloc without
				5039	* deadlocking.
				5040	*/
				5041	if (flush_state == FLUSH_DELALLOC \|\|
				5042	flush_state == FLUSH_DELALLOC_WAIT)
				5043	flush_state = ALLOC_CHUNK;
				5044	} while (flush_state < COMMIT_TRANS);
				5045	}
				5046
				5047	static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
				5048	struct btrfs_space_info *space_info,
				5049	struct reserve_ticket *ticket, u64 orig_bytes)
				5050
				5051	{
				5052	DEFINE_WAIT(wait);
				5053	int ret = 0;
				5054
				5055	spin_lock(&space_info->lock);
				5056	while (ticket->bytes > 0 && ticket->error == 0) {
				5057	ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
				5058	if (ret) {
				5059	ret = -EINTR;
				5060	break;
				5061	}
				5062	spin_unlock(&space_info->lock);
				5063
				5064	schedule();
				5065
				5066	finish_wait(&ticket->wait, &wait);
				5067	spin_lock(&space_info->lock);
				5068	}
				5069	if (!ret)
				5070	ret = ticket->error;
				5071	if (!list_empty(&ticket->list))
				5072	list_del_init(&ticket->list);
				5073	if (ticket->bytes && ticket->bytes < orig_bytes) {
				5074	u64 num_bytes = orig_bytes - ticket->bytes;
				5075	space_info->bytes_may_use -= num_bytes;
				5076	trace_btrfs_space_reservation(fs_info, "space_info",
				5077	space_info->flags, num_bytes, 0);
				5078	}
				5079	spin_unlock(&space_info->lock);
				5080
				5081	return ret;
				5082	}
				5083
				5084	/**
				5085	* reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
				5086	* @root - the root we're allocating for
				5087	* @space_info - the space info we want to allocate from
				5088	* @orig_bytes - the number of bytes we want
				5089	* @flush - whether or not we can flush to make our reservation
				5090	*
				5091	* This will reserve orig_bytes number of bytes from the space info associated
				5092	* with the block_rsv. If there is not enough space it will make an attempt to
				5093	* flush out space to make room. It will do this by flushing delalloc if
				5094	* possible or committing the transaction. If flush is 0 then no attempts to
				5095	* regain reservations will be made and this will fail if there is not enough
				5096	* space already.
				5097	*/
				5098	static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
				5099	struct btrfs_space_info *space_info,
				5100	u64 orig_bytes,
				5101	enum btrfs_reserve_flush_enum flush,
				5102	bool system_chunk)
				5103	{
				5104	struct reserve_ticket ticket;
				5105	u64 used;
				5106	int ret = 0;
				5107
				5108	ASSERT(orig_bytes);
				5109	ASSERT(!current->journal_info \|\| flush != BTRFS_RESERVE_FLUSH_ALL);
				5110
				5111	spin_lock(&space_info->lock);
				5112	ret = -ENOSPC;
				5113	used = btrfs_space_info_used(space_info, true);
				5114
				5115	/*
				5116	* If we have enough space then hooray, make our reservation and carry
				5117	* on. If not see if we can overcommit, and if we can, hooray carry on.
				5118	* If not things get more complicated.
				5119	*/
				5120	if (used + orig_bytes <= space_info->total_bytes) {
				5121	space_info->bytes_may_use += orig_bytes;
				5122	trace_btrfs_space_reservation(fs_info, "space_info",
				5123	space_info->flags, orig_bytes, 1);
				5124	ret = 0;
				5125	} else if (can_overcommit(fs_info, space_info, orig_bytes, flush,
				5126	system_chunk)) {
				5127	space_info->bytes_may_use += orig_bytes;
				5128	trace_btrfs_space_reservation(fs_info, "space_info",
				5129	space_info->flags, orig_bytes, 1);
				5130	ret = 0;
				5131	}
				5132
				5133	/*
				5134	* If we couldn't make a reservation then setup our reservation ticket
				5135	* and kick the async worker if it's not already running.
				5136	*
				5137	* If we are a priority flusher then we just need to add our ticket to
				5138	* the list and we will do our own flushing further down.
				5139	*/
				5140	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
				5141	ticket.bytes = orig_bytes;
				5142	ticket.error = 0;
				5143	init_waitqueue_head(&ticket.wait);
				5144	if (flush == BTRFS_RESERVE_FLUSH_ALL) {
				5145	list_add_tail(&ticket.list, &space_info->tickets);
				5146	if (!space_info->flush) {
				5147	space_info->flush = 1;
				5148	trace_btrfs_trigger_flush(fs_info,
				5149	space_info->flags,
				5150	orig_bytes, flush,
				5151	"enospc");
				5152	queue_work(system_unbound_wq,
				5153	&fs_info->async_reclaim_work);
				5154	}
				5155	} else {
				5156	list_add_tail(&ticket.list,
				5157	&space_info->priority_tickets);
				5158	}
				5159	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
				5160	used += orig_bytes;
				5161	/*
				5162	* We will do the space reservation dance during log replay,
				5163	* which means we won't have fs_info->fs_root set, so don't do
				5164	* the async reclaim as we will panic.
				5165	*/
				5166	if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
				5167	need_do_async_reclaim(fs_info, space_info,
				5168	used, system_chunk) &&
				5169	!work_busy(&fs_info->async_reclaim_work)) {
				5170	trace_btrfs_trigger_flush(fs_info, space_info->flags,
				5171	orig_bytes, flush, "preempt");
				5172	queue_work(system_unbound_wq,
				5173	&fs_info->async_reclaim_work);
				5174	}
				5175	}
				5176	spin_unlock(&space_info->lock);
				5177	if (!ret \|\| flush == BTRFS_RESERVE_NO_FLUSH)
				5178	return ret;
				5179
				5180	if (flush == BTRFS_RESERVE_FLUSH_ALL)
				5181	return wait_reserve_ticket(fs_info, space_info, &ticket,
				5182	orig_bytes);
				5183
				5184	ret = 0;
				5185	priority_reclaim_metadata_space(fs_info, space_info, &ticket);
				5186	spin_lock(&space_info->lock);
				5187	if (ticket.bytes) {
				5188	if (ticket.bytes < orig_bytes) {
				5189	u64 num_bytes = orig_bytes - ticket.bytes;
				5190	space_info->bytes_may_use -= num_bytes;
				5191	trace_btrfs_space_reservation(fs_info, "space_info",
				5192	space_info->flags,
				5193	num_bytes, 0);
				5194
				5195	}
				5196	list_del_init(&ticket.list);
				5197	ret = -ENOSPC;
				5198	}
				5199	spin_unlock(&space_info->lock);
				5200	ASSERT(list_empty(&ticket.list));
				5201	return ret;
				5202	}
				5203
				5204	/**
				5205	* reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
				5206	* @root - the root we're allocating for
				5207	* @block_rsv - the block_rsv we're allocating for
				5208	* @orig_bytes - the number of bytes we want
				5209	* @flush - whether or not we can flush to make our reservation
				5210	*
				5211	* This will reserve orgi_bytes number of bytes from the space info associated
				5212	* with the block_rsv. If there is not enough space it will make an attempt to
				5213	* flush out space to make room. It will do this by flushing delalloc if
				5214	* possible or committing the transaction. If flush is 0 then no attempts to
				5215	* regain reservations will be made and this will fail if there is not enough
				5216	* space already.
				5217	*/
				5218	static int reserve_metadata_bytes(struct btrfs_root *root,
				5219	struct btrfs_block_rsv *block_rsv,
				5220	u64 orig_bytes,
				5221	enum btrfs_reserve_flush_enum flush)
				5222	{
				5223	struct btrfs_fs_info *fs_info = root->fs_info;
				5224	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
				5225	int ret;
				5226	bool system_chunk = (root == fs_info->chunk_root);
				5227
				5228	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
				5229	orig_bytes, flush, system_chunk);
				5230	if (ret == -ENOSPC &&
				5231	unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
				5232	if (block_rsv != global_rsv &&
				5233	!block_rsv_use_bytes(global_rsv, orig_bytes))
				5234	ret = 0;
				5235	}
				5236	if (ret == -ENOSPC) {
				5237	trace_btrfs_space_reservation(fs_info, "space_info:enospc",
				5238	block_rsv->space_info->flags,
				5239	orig_bytes, 1);
				5240
				5241	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
				5242	dump_space_info(fs_info, block_rsv->space_info,
				5243	orig_bytes, 0);
				5244	}
				5245	return ret;
				5246	}
				5247
				5248	static struct btrfs_block_rsv *get_block_rsv(
				5249	const struct btrfs_trans_handle *trans,
				5250	const struct btrfs_root *root)
				5251	{
				5252	struct btrfs_fs_info *fs_info = root->fs_info;
				5253	struct btrfs_block_rsv *block_rsv = NULL;
				5254
				5255	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) \|\|
				5256	(root == fs_info->csum_root && trans->adding_csums) \|\|
				5257	(root == fs_info->uuid_root))
				5258	block_rsv = trans->block_rsv;
				5259
				5260	if (!block_rsv)
				5261	block_rsv = root->block_rsv;
				5262
				5263	if (!block_rsv)
				5264	block_rsv = &fs_info->empty_block_rsv;
				5265
				5266	return block_rsv;
				5267	}
				5268
				5269	static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
				5270	u64 num_bytes)
				5271	{
				5272	int ret = -ENOSPC;
				5273	spin_lock(&block_rsv->lock);
				5274	if (block_rsv->reserved >= num_bytes) {
				5275	block_rsv->reserved -= num_bytes;
				5276	if (block_rsv->reserved < block_rsv->size)
				5277	block_rsv->full = 0;
				5278	ret = 0;
				5279	}
				5280	spin_unlock(&block_rsv->lock);
				5281	return ret;
				5282	}
				5283
				5284	static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
				5285	u64 num_bytes, int update_size)
				5286	{
				5287	spin_lock(&block_rsv->lock);
				5288	block_rsv->reserved += num_bytes;
				5289	if (update_size)
				5290	block_rsv->size += num_bytes;
				5291	else if (block_rsv->reserved >= block_rsv->size)
				5292	block_rsv->full = 1;
				5293	spin_unlock(&block_rsv->lock);
				5294	}
				5295
				5296	int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
				5297	struct btrfs_block_rsv *dest, u64 num_bytes,
				5298	int min_factor)
				5299	{
				5300	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
				5301	u64 min_bytes;
				5302
				5303	if (global_rsv->space_info != dest->space_info)
				5304	return -ENOSPC;
				5305
				5306	spin_lock(&global_rsv->lock);
				5307	min_bytes = div_factor(global_rsv->size, min_factor);
				5308	if (global_rsv->reserved < min_bytes + num_bytes) {
				5309	spin_unlock(&global_rsv->lock);
				5310	return -ENOSPC;
				5311	}
				5312	global_rsv->reserved -= num_bytes;
				5313	if (global_rsv->reserved < global_rsv->size)
				5314	global_rsv->full = 0;
				5315	spin_unlock(&global_rsv->lock);
				5316
				5317	block_rsv_add_bytes(dest, num_bytes, 1);
				5318	return 0;
				5319	}
				5320
				5321	/*
				5322	* This is for space we already have accounted in space_info->bytes_may_use, so
				5323	* basically when we're returning space from block_rsv's.
				5324	*/
				5325	static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
				5326	struct btrfs_space_info *space_info,
				5327	u64 num_bytes)
				5328	{
				5329	struct reserve_ticket *ticket;
				5330	struct list_head *head;
				5331	u64 used;
				5332	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
				5333	bool check_overcommit = false;
				5334
				5335	spin_lock(&space_info->lock);
				5336	head = &space_info->priority_tickets;
				5337
				5338	/*
				5339	* If we are over our limit then we need to check and see if we can
				5340	* overcommit, and if we can't then we just need to free up our space
				5341	* and not satisfy any requests.
				5342	*/
				5343	used = btrfs_space_info_used(space_info, true);
				5344	if (used - num_bytes >= space_info->total_bytes)
				5345	check_overcommit = true;
				5346	again:
				5347	while (!list_empty(head) && num_bytes) {
				5348	ticket = list_first_entry(head, struct reserve_ticket,
				5349	list);
				5350	/*
				5351	* We use 0 bytes because this space is already reserved, so
				5352	* adding the ticket space would be a double count.
				5353	*/
				5354	if (check_overcommit &&
				5355	!can_overcommit(fs_info, space_info, 0, flush, false))
				5356	break;
				5357	if (num_bytes >= ticket->bytes) {
				5358	list_del_init(&ticket->list);
				5359	num_bytes -= ticket->bytes;
				5360	ticket->bytes = 0;
				5361	space_info->tickets_id++;
				5362	wake_up(&ticket->wait);
				5363	} else {
				5364	ticket->bytes -= num_bytes;
				5365	num_bytes = 0;
				5366	}
				5367	}
				5368
				5369	if (num_bytes && head == &space_info->priority_tickets) {
				5370	head = &space_info->tickets;
				5371	flush = BTRFS_RESERVE_FLUSH_ALL;
				5372	goto again;
				5373	}
				5374	space_info->bytes_may_use -= num_bytes;
				5375	trace_btrfs_space_reservation(fs_info, "space_info",
				5376	space_info->flags, num_bytes, 0);
				5377	spin_unlock(&space_info->lock);
				5378	}
				5379
				5380	/*
				5381	* This is for newly allocated space that isn't accounted in
				5382	* space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent
				5383	* we use this helper.
				5384	*/
				5385	static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
				5386	struct btrfs_space_info *space_info,
				5387	u64 num_bytes)
				5388	{
				5389	struct reserve_ticket *ticket;
				5390	struct list_head *head = &space_info->priority_tickets;
				5391
				5392	again:
				5393	while (!list_empty(head) && num_bytes) {
				5394	ticket = list_first_entry(head, struct reserve_ticket,
				5395	list);
				5396	if (num_bytes >= ticket->bytes) {
				5397	trace_btrfs_space_reservation(fs_info, "space_info",
				5398	space_info->flags,
				5399	ticket->bytes, 1);
				5400	list_del_init(&ticket->list);
				5401	num_bytes -= ticket->bytes;
				5402	space_info->bytes_may_use += ticket->bytes;
				5403	ticket->bytes = 0;
				5404	space_info->tickets_id++;
				5405	wake_up(&ticket->wait);
				5406	} else {
				5407	trace_btrfs_space_reservation(fs_info, "space_info",
				5408	space_info->flags,
				5409	num_bytes, 1);
				5410	space_info->bytes_may_use += num_bytes;
				5411	ticket->bytes -= num_bytes;
				5412	num_bytes = 0;
				5413	}
				5414	}
				5415
				5416	if (num_bytes && head == &space_info->priority_tickets) {
				5417	head = &space_info->tickets;
				5418	goto again;
				5419	}
				5420	}
				5421
				5422	static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
				5423	struct btrfs_block_rsv *block_rsv,
				5424	struct btrfs_block_rsv *dest, u64 num_bytes,
				5425	u64 *qgroup_to_release_ret)
				5426	{
				5427	struct btrfs_space_info *space_info = block_rsv->space_info;
				5428	u64 qgroup_to_release = 0;
				5429	u64 ret;
				5430
				5431	spin_lock(&block_rsv->lock);
				5432	if (num_bytes == (u64)-1) {
				5433	num_bytes = block_rsv->size;
				5434	qgroup_to_release = block_rsv->qgroup_rsv_size;
				5435	}
				5436	block_rsv->size -= num_bytes;
				5437	if (block_rsv->reserved >= block_rsv->size) {
				5438	num_bytes = block_rsv->reserved - block_rsv->size;
				5439	block_rsv->reserved = block_rsv->size;
				5440	block_rsv->full = 1;
				5441	} else {
				5442	num_bytes = 0;
				5443	}
				5444	if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
				5445	qgroup_to_release = block_rsv->qgroup_rsv_reserved -
				5446	block_rsv->qgroup_rsv_size;
				5447	block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
				5448	} else {
				5449	qgroup_to_release = 0;
				5450	}
				5451	spin_unlock(&block_rsv->lock);
				5452
				5453	ret = num_bytes;
				5454	if (num_bytes > 0) {
				5455	if (dest) {
				5456	spin_lock(&dest->lock);
				5457	if (!dest->full) {
				5458	u64 bytes_to_add;
				5459
				5460	bytes_to_add = dest->size - dest->reserved;
				5461	bytes_to_add = min(num_bytes, bytes_to_add);
				5462	dest->reserved += bytes_to_add;
				5463	if (dest->reserved >= dest->size)
				5464	dest->full = 1;
				5465	num_bytes -= bytes_to_add;
				5466	}
				5467	spin_unlock(&dest->lock);
				5468	}
				5469	if (num_bytes)
				5470	space_info_add_old_bytes(fs_info, space_info,
				5471	num_bytes);
				5472	}
				5473	if (qgroup_to_release_ret)
				5474	*qgroup_to_release_ret = qgroup_to_release;
				5475	return ret;
				5476	}
				5477
				5478	int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
				5479	struct btrfs_block_rsv *dst, u64 num_bytes,
				5480	int update_size)
				5481	{
				5482	int ret;
				5483
				5484	ret = block_rsv_use_bytes(src, num_bytes);
				5485	if (ret)
				5486	return ret;
				5487
				5488	block_rsv_add_bytes(dst, num_bytes, update_size);
				5489	return 0;
				5490	}
				5491
				5492	void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
				5493	{
				5494	memset(rsv, 0, sizeof(*rsv));
				5495	spin_lock_init(&rsv->lock);
				5496	rsv->type = type;
				5497	}
				5498
				5499	void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
				5500	struct btrfs_block_rsv *rsv,
				5501	unsigned short type)
				5502	{
				5503	btrfs_init_block_rsv(rsv, type);
				5504	rsv->space_info = __find_space_info(fs_info,
				5505	BTRFS_BLOCK_GROUP_METADATA);
				5506	}
				5507
				5508	struct btrfs_block_rsv btrfs_alloc_block_rsv(struct btrfs_fs_info fs_info,
				5509	unsigned short type)
				5510	{
				5511	struct btrfs_block_rsv *block_rsv;
				5512
				5513	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
				5514	if (!block_rsv)
				5515	return NULL;
				5516
				5517	btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
				5518	return block_rsv;
				5519	}
				5520
				5521	void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
				5522	struct btrfs_block_rsv *rsv)
				5523	{
				5524	if (!rsv)
				5525	return;
				5526	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
				5527	kfree(rsv);
				5528	}
				5529
				5530	int btrfs_block_rsv_add(struct btrfs_root *root,
				5531	struct btrfs_block_rsv *block_rsv, u64 num_bytes,
				5532	enum btrfs_reserve_flush_enum flush)
				5533	{
				5534	int ret;
				5535
				5536	if (num_bytes == 0)
				5537	return 0;
				5538
				5539	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
				5540	if (!ret) {
				5541	block_rsv_add_bytes(block_rsv, num_bytes, 1);
				5542	return 0;
				5543	}
				5544
				5545	return ret;
				5546	}
				5547
				5548	int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
				5549	{
				5550	u64 num_bytes = 0;
				5551	int ret = -ENOSPC;
				5552
				5553	if (!block_rsv)
				5554	return 0;
				5555
				5556	spin_lock(&block_rsv->lock);
				5557	num_bytes = div_factor(block_rsv->size, min_factor);
				5558	if (block_rsv->reserved >= num_bytes)
				5559	ret = 0;
				5560	spin_unlock(&block_rsv->lock);
				5561
				5562	return ret;
				5563	}
				5564
				5565	int btrfs_block_rsv_refill(struct btrfs_root *root,
				5566	struct btrfs_block_rsv *block_rsv, u64 min_reserved,
				5567	enum btrfs_reserve_flush_enum flush)
				5568	{
				5569	u64 num_bytes = 0;
				5570	int ret = -ENOSPC;
				5571
				5572	if (!block_rsv)
				5573	return 0;
				5574
				5575	spin_lock(&block_rsv->lock);
				5576	num_bytes = min_reserved;
				5577	if (block_rsv->reserved >= num_bytes)
				5578	ret = 0;
				5579	else
				5580	num_bytes -= block_rsv->reserved;
				5581	spin_unlock(&block_rsv->lock);
				5582
				5583	if (!ret)
				5584	return 0;
				5585
				5586	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
				5587	if (!ret) {
				5588	block_rsv_add_bytes(block_rsv, num_bytes, 0);
				5589	return 0;
				5590	}
				5591
				5592	return ret;
				5593	}
				5594
				5595	/**
				5596	* btrfs_inode_rsv_refill - refill the inode block rsv.
				5597	* @inode - the inode we are refilling.
				5598	* @flush - the flusing restriction.
				5599	*
				5600	* Essentially the same as btrfs_block_rsv_refill, except it uses the
				5601	* block_rsv->size as the minimum size. We'll either refill the missing amount
				5602	* or return if we already have enough space. This will also handle the resreve
				5603	* tracepoint for the reserved amount.
				5604	*/
				5605	static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
				5606	enum btrfs_reserve_flush_enum flush)
				5607	{
				5608	struct btrfs_root *root = inode->root;
				5609	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
				5610	u64 num_bytes = 0;
				5611	u64 qgroup_num_bytes = 0;
				5612	int ret = -ENOSPC;
				5613
				5614	spin_lock(&block_rsv->lock);
				5615	if (block_rsv->reserved < block_rsv->size)
				5616	num_bytes = block_rsv->size - block_rsv->reserved;
				5617	if (block_rsv->qgroup_rsv_reserved < block_rsv->qgroup_rsv_size)
				5618	qgroup_num_bytes = block_rsv->qgroup_rsv_size -
				5619	block_rsv->qgroup_rsv_reserved;
				5620	spin_unlock(&block_rsv->lock);
				5621
				5622	if (num_bytes == 0)
				5623	return 0;
				5624
				5625	ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_num_bytes, true);
				5626	if (ret)
				5627	return ret;
				5628	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
				5629	if (!ret) {
				5630	block_rsv_add_bytes(block_rsv, num_bytes, 0);
				5631	trace_btrfs_space_reservation(root->fs_info, "delalloc",
				5632	btrfs_ino(inode), num_bytes, 1);
				5633
				5634	/* Don't forget to increase qgroup_rsv_reserved */
				5635	spin_lock(&block_rsv->lock);
				5636	block_rsv->qgroup_rsv_reserved += qgroup_num_bytes;
				5637	spin_unlock(&block_rsv->lock);
				5638	} else
				5639	btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
				5640	return ret;
				5641	}
				5642
				5643	/**
				5644	* btrfs_inode_rsv_release - release any excessive reservation.
				5645	* @inode - the inode we need to release from.
				5646	* @qgroup_free - free or convert qgroup meta.
				5647	* Unlike normal operation, qgroup meta reservation needs to know if we are
				5648	* freeing qgroup reservation or just converting it into per-trans. Normally
				5649	* @qgroup_free is true for error handling, and false for normal release.
				5650	*
				5651	* This is the same as btrfs_block_rsv_release, except that it handles the
				5652	* tracepoint for the reservation.
				5653	*/
				5654	static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
				5655	{
				5656	struct btrfs_fs_info *fs_info = inode->root->fs_info;
				5657	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
				5658	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
				5659	u64 released = 0;
				5660	u64 qgroup_to_release = 0;
				5661
				5662	/*
				5663	* Since we statically set the block_rsv->size we just want to say we
				5664	* are releasing 0 bytes, and then we'll just get the reservation over
				5665	* the size free'd.
				5666	*/
				5667	released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0,
				5668	&qgroup_to_release);
				5669	if (released > 0)
				5670	trace_btrfs_space_reservation(fs_info, "delalloc",
				5671	btrfs_ino(inode), released, 0);
				5672	if (qgroup_free)
				5673	btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
				5674	else
				5675	btrfs_qgroup_convert_reserved_meta(inode->root,
				5676	qgroup_to_release);
				5677	}
				5678
				5679	void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
				5680	struct btrfs_block_rsv *block_rsv,
				5681	u64 num_bytes)
				5682	{
				5683	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
				5684
				5685	if (global_rsv == block_rsv \|\|
				5686	block_rsv->space_info != global_rsv->space_info)
				5687	global_rsv = NULL;
				5688	block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL);
				5689	}
				5690
				5691	static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
				5692	{
				5693	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
				5694	struct btrfs_space_info *sinfo = block_rsv->space_info;
				5695	u64 num_bytes;
				5696
				5697	/*
				5698	* The global block rsv is based on the size of the extent tree, the
				5699	* checksum tree and the root tree. If the fs is empty we want to set
				5700	* it to a minimal amount for safety.
				5701	*/
				5702	num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
				5703	btrfs_root_used(&fs_info->csum_root->root_item) +
				5704	btrfs_root_used(&fs_info->tree_root->root_item);
				5705	num_bytes = max_t(u64, num_bytes, SZ_16M);
				5706
				5707	spin_lock(&sinfo->lock);
				5708	spin_lock(&block_rsv->lock);
				5709
				5710	block_rsv->size = min_t(u64, num_bytes, SZ_512M);
				5711
				5712	if (block_rsv->reserved < block_rsv->size) {
				5713	num_bytes = btrfs_space_info_used(sinfo, true);
				5714	if (sinfo->total_bytes > num_bytes) {
				5715	num_bytes = sinfo->total_bytes - num_bytes;
				5716	num_bytes = min(num_bytes,
				5717	block_rsv->size - block_rsv->reserved);
				5718	block_rsv->reserved += num_bytes;
				5719	sinfo->bytes_may_use += num_bytes;
				5720	trace_btrfs_space_reservation(fs_info, "space_info",
				5721	sinfo->flags, num_bytes,
				5722	1);
				5723	}
				5724	} else if (block_rsv->reserved > block_rsv->size) {
				5725	num_bytes = block_rsv->reserved - block_rsv->size;
				5726	sinfo->bytes_may_use -= num_bytes;
				5727	trace_btrfs_space_reservation(fs_info, "space_info",
				5728	sinfo->flags, num_bytes, 0);
				5729	block_rsv->reserved = block_rsv->size;
				5730	}
				5731
				5732	if (block_rsv->reserved == block_rsv->size)
				5733	block_rsv->full = 1;
				5734	else
				5735	block_rsv->full = 0;
				5736
				5737	spin_unlock(&block_rsv->lock);
				5738	spin_unlock(&sinfo->lock);
				5739	}
				5740
				5741	static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
				5742	{
				5743	struct btrfs_space_info *space_info;
				5744
				5745	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
				5746	fs_info->chunk_block_rsv.space_info = space_info;
				5747
				5748	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
				5749	fs_info->global_block_rsv.space_info = space_info;
				5750	fs_info->trans_block_rsv.space_info = space_info;
				5751	fs_info->empty_block_rsv.space_info = space_info;
				5752	fs_info->delayed_block_rsv.space_info = space_info;
				5753
				5754	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
				5755	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
				5756	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
				5757	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
				5758	if (fs_info->quota_root)
				5759	fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
				5760	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
				5761
				5762	update_global_block_rsv(fs_info);
				5763	}
				5764
				5765	static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
				5766	{
				5767	block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
				5768	(u64)-1, NULL);
				5769	WARN_ON(fs_info->trans_block_rsv.size > 0);
				5770	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
				5771	WARN_ON(fs_info->chunk_block_rsv.size > 0);
				5772	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
				5773	WARN_ON(fs_info->delayed_block_rsv.size > 0);
				5774	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
				5775	}
				5776
				5777
				5778	/*
				5779	* To be called after all the new block groups attached to the transaction
				5780	* handle have been created (btrfs_create_pending_block_groups()).
				5781	*/
				5782	void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
				5783	{
				5784	struct btrfs_fs_info *fs_info = trans->fs_info;
				5785
				5786	if (!trans->chunk_bytes_reserved)
				5787	return;
				5788
				5789	WARN_ON_ONCE(!list_empty(&trans->new_bgs));
				5790
				5791	block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
				5792	trans->chunk_bytes_reserved, NULL);
				5793	trans->chunk_bytes_reserved = 0;
				5794	}
				5795
				5796	/*
				5797	* btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
				5798	* root: the root of the parent directory
				5799	* rsv: block reservation
				5800	* items: the number of items that we need do reservation
				5801	* use_global_rsv: allow fallback to the global block reservation
				5802	*
				5803	* This function is used to reserve the space for snapshot/subvolume
				5804	* creation and deletion. Those operations are different with the
				5805	* common file/directory operations, they change two fs/file trees
				5806	* and root tree, the number of items that the qgroup reserves is
				5807	* different with the free space reservation. So we can not use
				5808	* the space reservation mechanism in start_transaction().
				5809	*/
				5810	int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
				5811	struct btrfs_block_rsv *rsv, int items,
				5812	bool use_global_rsv)
				5813	{
				5814	u64 qgroup_num_bytes = 0;
				5815	u64 num_bytes;
				5816	int ret;
				5817	struct btrfs_fs_info *fs_info = root->fs_info;
				5818	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
				5819
				5820	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
				5821	/* One for parent inode, two for dir entries */
				5822	qgroup_num_bytes = 3 * fs_info->nodesize;
				5823	ret = btrfs_qgroup_reserve_meta_prealloc(root,
				5824	qgroup_num_bytes, true);
				5825	if (ret)
				5826	return ret;
				5827	}
				5828
				5829	num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
				5830	rsv->space_info = __find_space_info(fs_info,
				5831	BTRFS_BLOCK_GROUP_METADATA);
				5832	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
				5833	BTRFS_RESERVE_FLUSH_ALL);
				5834
				5835	if (ret == -ENOSPC && use_global_rsv)
				5836	ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
				5837
				5838	if (ret && qgroup_num_bytes)
				5839	btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
				5840
				5841	return ret;
				5842	}
				5843
				5844	void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
				5845	struct btrfs_block_rsv *rsv)
				5846	{
				5847	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
				5848	}
				5849
				5850	static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
				5851	struct btrfs_inode *inode)
				5852	{
				5853	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
				5854	u64 reserve_size = 0;
				5855	u64 qgroup_rsv_size = 0;
				5856	u64 csum_leaves;
				5857	unsigned outstanding_extents;
				5858
				5859	lockdep_assert_held(&inode->lock);
				5860	outstanding_extents = inode->outstanding_extents;
				5861	if (outstanding_extents)
				5862	reserve_size = btrfs_calc_trans_metadata_size(fs_info,
				5863	outstanding_extents + 1);
				5864	csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
				5865	inode->csum_bytes);
				5866	reserve_size += btrfs_calc_trans_metadata_size(fs_info,
				5867	csum_leaves);
				5868	/*
				5869	* For qgroup rsv, the calculation is very simple:
				5870	* account one nodesize for each outstanding extent
				5871	*
				5872	* This is overestimating in most cases.
				5873	*/
				5874	qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;
				5875
				5876	spin_lock(&block_rsv->lock);
				5877	block_rsv->size = reserve_size;
				5878	block_rsv->qgroup_rsv_size = qgroup_rsv_size;
				5879	spin_unlock(&block_rsv->lock);
				5880	}
				5881
				5882	int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
				5883	{
				5884	struct btrfs_fs_info *fs_info = inode->root->fs_info;
				5885	unsigned nr_extents;
				5886	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
				5887	int ret = 0;
				5888	bool delalloc_lock = true;
				5889
				5890	/* If we are a free space inode we need to not flush since we will be in
				5891	* the middle of a transaction commit. We also don't need the delalloc
				5892	* mutex since we won't race with anybody. We need this mostly to make
				5893	* lockdep shut its filthy mouth.
				5894	*
				5895	* If we have a transaction open (can happen if we call truncate_block
				5896	* from truncate), then we need FLUSH_LIMIT so we don't deadlock.
				5897	*/
				5898	if (btrfs_is_free_space_inode(inode)) {
				5899	flush = BTRFS_RESERVE_NO_FLUSH;
				5900	delalloc_lock = false;
				5901	} else {
				5902	if (current->journal_info)
				5903	flush = BTRFS_RESERVE_FLUSH_LIMIT;
				5904
				5905	if (btrfs_transaction_in_commit(fs_info))
				5906	schedule_timeout(1);
				5907	}
				5908
				5909	if (delalloc_lock)
				5910	mutex_lock(&inode->delalloc_mutex);
				5911
				5912	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
				5913
				5914	/* Add our new extents and calculate the new rsv size. */
				5915	spin_lock(&inode->lock);
				5916	nr_extents = count_max_extents(num_bytes);
				5917	btrfs_mod_outstanding_extents(inode, nr_extents);
				5918	inode->csum_bytes += num_bytes;
				5919	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
				5920	spin_unlock(&inode->lock);
				5921
				5922	ret = btrfs_inode_rsv_refill(inode, flush);
				5923	if (unlikely(ret))
				5924	goto out_fail;
				5925
				5926	if (delalloc_lock)
				5927	mutex_unlock(&inode->delalloc_mutex);
				5928	return 0;
				5929
				5930	out_fail:
				5931	spin_lock(&inode->lock);
				5932	nr_extents = count_max_extents(num_bytes);
				5933	btrfs_mod_outstanding_extents(inode, -nr_extents);
				5934	inode->csum_bytes -= num_bytes;
				5935	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
				5936	spin_unlock(&inode->lock);
				5937
				5938	btrfs_inode_rsv_release(inode, true);
				5939	if (delalloc_lock)
				5940	mutex_unlock(&inode->delalloc_mutex);
				5941	return ret;
				5942	}
				5943
				5944	/**
				5945	* btrfs_delalloc_release_metadata - release a metadata reservation for an inode
				5946	* @inode: the inode to release the reservation for.
				5947	* @num_bytes: the number of bytes we are releasing.
				5948	* @qgroup_free: free qgroup reservation or convert it to per-trans reservation
				5949	*
				5950	* This will release the metadata reservation for an inode. This can be called
				5951	* once we complete IO for a given set of bytes to release their metadata
				5952	* reservations, or on error for the same reason.
				5953	*/
				5954	void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
				5955	bool qgroup_free)
				5956	{
				5957	struct btrfs_fs_info *fs_info = inode->root->fs_info;
				5958
				5959	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
				5960	spin_lock(&inode->lock);
				5961	inode->csum_bytes -= num_bytes;
				5962	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
				5963	spin_unlock(&inode->lock);
				5964
				5965	if (btrfs_is_testing(fs_info))
				5966	return;
				5967
				5968	btrfs_inode_rsv_release(inode, qgroup_free);
				5969	}
				5970
				5971	/**
				5972	* btrfs_delalloc_release_extents - release our outstanding_extents
				5973	* @inode: the inode to balance the reservation for.
				5974	* @num_bytes: the number of bytes we originally reserved with
				5975	* @qgroup_free: do we need to free qgroup meta reservation or convert them.
				5976	*
				5977	* When we reserve space we increase outstanding_extents for the extents we may
				5978	* add. Once we've set the range as delalloc or created our ordered extents we
				5979	* have outstanding_extents to track the real usage, so we use this to free our
				5980	* temporarily tracked outstanding_extents. This _must_ be used in conjunction
				5981	* with btrfs_delalloc_reserve_metadata.
				5982	*/
				5983	void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
				5984	{
				5985	struct btrfs_fs_info *fs_info = inode->root->fs_info;
				5986	unsigned num_extents;
				5987
				5988	spin_lock(&inode->lock);
				5989	num_extents = count_max_extents(num_bytes);
				5990	btrfs_mod_outstanding_extents(inode, -num_extents);
				5991	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
				5992	spin_unlock(&inode->lock);
				5993
				5994	if (btrfs_is_testing(fs_info))
				5995	return;
				5996
				5997	btrfs_inode_rsv_release(inode, true);
				5998	}
				5999
				6000	/**
				6001	* btrfs_delalloc_reserve_space - reserve data and metadata space for
				6002	* delalloc
				6003	* @inode: inode we're writing to
				6004	* @start: start range we are writing to
				6005	* @len: how long the range we are writing to
				6006	* @reserved: mandatory parameter, record actually reserved qgroup ranges of
				6007	* current reservation.
				6008	*
				6009	* This will do the following things
				6010	*
				6011	* o reserve space in data space info for num bytes
				6012	* and reserve precious corresponding qgroup space
				6013	* (Done in check_data_free_space)
				6014	*
				6015	* o reserve space for metadata space, based on the number of outstanding
				6016	* extents and how much csums will be needed
				6017	* also reserve metadata space in a per root over-reserve method.
				6018	* o add to the inodes->delalloc_bytes
				6019	* o add it to the fs_info's delalloc inodes list.
				6020	* (Above 3 all done in delalloc_reserve_metadata)
				6021	*
				6022	* Return 0 for success
				6023	* Return <0 for error(-ENOSPC or -EQUOT)
				6024	*/
				6025	int btrfs_delalloc_reserve_space(struct inode *inode,
				6026	struct extent_changeset **reserved, u64 start, u64 len)
				6027	{
				6028	int ret;
				6029
				6030	ret = btrfs_check_data_free_space(inode, reserved, start, len);
				6031	if (ret < 0)
				6032	return ret;
				6033	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
				6034	if (ret < 0)
				6035	btrfs_free_reserved_data_space(inode, *reserved, start, len);
				6036	return ret;
				6037	}
				6038
				6039	/**
				6040	* btrfs_delalloc_release_space - release data and metadata space for delalloc
				6041	* @inode: inode we're releasing space for
				6042	* @start: start position of the space already reserved
				6043	* @len: the len of the space already reserved
				6044	* @release_bytes: the len of the space we consumed or didn't use
				6045	*
				6046	* This function will release the metadata space that was not used and will
				6047	* decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
				6048	* list if there are no delalloc bytes left.
				6049	* Also it will handle the qgroup reserved space.
				6050	*/
				6051	void btrfs_delalloc_release_space(struct inode *inode,
				6052	struct extent_changeset *reserved,
				6053	u64 start, u64 len, bool qgroup_free)
				6054	{
				6055	btrfs_delalloc_release_metadata(BTRFS_I(inode), len, qgroup_free);
				6056	btrfs_free_reserved_data_space(inode, reserved, start, len);
				6057	}
				6058
				6059	static int update_block_group(struct btrfs_trans_handle *trans,
				6060	struct btrfs_fs_info *info, u64 bytenr,
				6061	u64 num_bytes, int alloc)
				6062	{
				6063	struct btrfs_block_group_cache *cache = NULL;
				6064	u64 total = num_bytes;
				6065	u64 old_val;
				6066	u64 byte_in_group;
				6067	int factor;
				6068
				6069	/* block accounting for super block */
				6070	spin_lock(&info->delalloc_root_lock);
				6071	old_val = btrfs_super_bytes_used(info->super_copy);
				6072	if (alloc)
				6073	old_val += num_bytes;
				6074	else
				6075	old_val -= num_bytes;
				6076	btrfs_set_super_bytes_used(info->super_copy, old_val);
				6077	spin_unlock(&info->delalloc_root_lock);
				6078
				6079	while (total) {
				6080	cache = btrfs_lookup_block_group(info, bytenr);
				6081	if (!cache)
				6082	return -ENOENT;
				6083	factor = btrfs_bg_type_to_factor(cache->flags);
				6084
				6085	/*
				6086	* If this block group has free space cache written out, we
				6087	* need to make sure to load it if we are removing space. This
				6088	* is because we need the unpinning stage to actually add the
				6089	* space back to the block group, otherwise we will leak space.
				6090	*/
				6091	if (!alloc && cache->cached == BTRFS_CACHE_NO)
				6092	cache_block_group(cache, 1);
				6093
				6094	byte_in_group = bytenr - cache->key.objectid;
				6095	WARN_ON(byte_in_group > cache->key.offset);
				6096
				6097	spin_lock(&cache->space_info->lock);
				6098	spin_lock(&cache->lock);
				6099
				6100	if (btrfs_test_opt(info, SPACE_CACHE) &&
				6101	cache->disk_cache_state < BTRFS_DC_CLEAR)
				6102	cache->disk_cache_state = BTRFS_DC_CLEAR;
				6103
				6104	old_val = btrfs_block_group_used(&cache->item);
				6105	num_bytes = min(total, cache->key.offset - byte_in_group);
				6106	if (alloc) {
				6107	old_val += num_bytes;
				6108	btrfs_set_block_group_used(&cache->item, old_val);
				6109	cache->reserved -= num_bytes;
				6110	cache->space_info->bytes_reserved -= num_bytes;
				6111	cache->space_info->bytes_used += num_bytes;
				6112	cache->space_info->disk_used += num_bytes * factor;
				6113	spin_unlock(&cache->lock);
				6114	spin_unlock(&cache->space_info->lock);
				6115	} else {
				6116	old_val -= num_bytes;
				6117	btrfs_set_block_group_used(&cache->item, old_val);
				6118	cache->pinned += num_bytes;
				6119	cache->space_info->bytes_pinned += num_bytes;
				6120	cache->space_info->bytes_used -= num_bytes;
				6121	cache->space_info->disk_used -= num_bytes * factor;
				6122	spin_unlock(&cache->lock);
				6123	spin_unlock(&cache->space_info->lock);
				6124
				6125	trace_btrfs_space_reservation(info, "pinned",
				6126	cache->space_info->flags,
				6127	num_bytes, 1);
				6128	percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
				6129	num_bytes,
				6130	BTRFS_TOTAL_BYTES_PINNED_BATCH);
				6131	set_extent_dirty(info->pinned_extents,
				6132	bytenr, bytenr + num_bytes - 1,
				6133	GFP_NOFS \| __GFP_NOFAIL);
				6134	}
				6135
				6136	spin_lock(&trans->transaction->dirty_bgs_lock);
				6137	if (list_empty(&cache->dirty_list)) {
				6138	list_add_tail(&cache->dirty_list,
				6139	&trans->transaction->dirty_bgs);
				6140	trans->transaction->num_dirty_bgs++;
				6141	btrfs_get_block_group(cache);
				6142	}
				6143	spin_unlock(&trans->transaction->dirty_bgs_lock);
				6144
				6145	/*
				6146	* No longer have used bytes in this block group, queue it for
				6147	* deletion. We do this after adding the block group to the
				6148	* dirty list to avoid races between cleaner kthread and space
				6149	* cache writeout.
				6150	*/
				6151	if (!alloc && old_val == 0)
				6152	btrfs_mark_bg_unused(cache);
				6153
				6154	btrfs_put_block_group(cache);
				6155	total -= num_bytes;
				6156	bytenr += num_bytes;
				6157	}
				6158	return 0;
				6159	}
				6160
				6161	static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
				6162	{
				6163	struct btrfs_block_group_cache *cache;
				6164	u64 bytenr;
				6165
				6166	spin_lock(&fs_info->block_group_cache_lock);
				6167	bytenr = fs_info->first_logical_byte;
				6168	spin_unlock(&fs_info->block_group_cache_lock);
				6169
				6170	if (bytenr < (u64)-1)
				6171	return bytenr;
				6172
				6173	cache = btrfs_lookup_first_block_group(fs_info, search_start);
				6174	if (!cache)
				6175	return 0;
				6176
				6177	bytenr = cache->key.objectid;
				6178	btrfs_put_block_group(cache);
				6179
				6180	return bytenr;
				6181	}
				6182
				6183	static int pin_down_extent(struct btrfs_fs_info *fs_info,
				6184	struct btrfs_block_group_cache *cache,
				6185	u64 bytenr, u64 num_bytes, int reserved)
				6186	{
				6187	spin_lock(&cache->space_info->lock);
				6188	spin_lock(&cache->lock);
				6189	cache->pinned += num_bytes;
				6190	cache->space_info->bytes_pinned += num_bytes;
				6191	if (reserved) {
				6192	cache->reserved -= num_bytes;
				6193	cache->space_info->bytes_reserved -= num_bytes;
				6194	}
				6195	spin_unlock(&cache->lock);
				6196	spin_unlock(&cache->space_info->lock);
				6197
				6198	trace_btrfs_space_reservation(fs_info, "pinned",
				6199	cache->space_info->flags, num_bytes, 1);
				6200	percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
				6201	num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
				6202	set_extent_dirty(fs_info->pinned_extents, bytenr,
				6203	bytenr + num_bytes - 1, GFP_NOFS \| __GFP_NOFAIL);
				6204	return 0;
				6205	}
				6206
				6207	/*
				6208	* this function must be called within transaction
				6209	*/
				6210	int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
				6211	u64 bytenr, u64 num_bytes, int reserved)
				6212	{
				6213	struct btrfs_block_group_cache *cache;
				6214
				6215	cache = btrfs_lookup_block_group(fs_info, bytenr);
				6216	BUG_ON(!cache); /* Logic error */
				6217
				6218	pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved);
				6219
				6220	btrfs_put_block_group(cache);
				6221	return 0;
				6222	}
				6223
				6224	/*
				6225	* this function must be called within transaction
				6226	*/
				6227	int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
				6228	u64 bytenr, u64 num_bytes)
				6229	{
				6230	struct btrfs_block_group_cache *cache;
				6231	int ret;
				6232
				6233	cache = btrfs_lookup_block_group(fs_info, bytenr);
				6234	if (!cache)
				6235	return -EINVAL;
				6236
				6237	/*
				6238	* pull in the free space cache (if any) so that our pin
				6239	* removes the free space from the cache. We have load_only set
				6240	* to one because the slow code to read in the free extents does check
				6241	* the pinned extents.
				6242	*/
				6243	cache_block_group(cache, 1);
				6244
				6245	pin_down_extent(fs_info, cache, bytenr, num_bytes, 0);
				6246
				6247	/* remove us from the free space cache (if we're there at all) */
				6248	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
				6249	btrfs_put_block_group(cache);
				6250	return ret;
				6251	}
				6252
				6253	static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
				6254	u64 start, u64 num_bytes)
				6255	{
				6256	int ret;
				6257	struct btrfs_block_group_cache *block_group;
				6258	struct btrfs_caching_control *caching_ctl;
				6259
				6260	block_group = btrfs_lookup_block_group(fs_info, start);
				6261	if (!block_group)
				6262	return -EINVAL;
				6263
				6264	cache_block_group(block_group, 0);
				6265	caching_ctl = get_caching_control(block_group);
				6266
				6267	if (!caching_ctl) {
				6268	/* Logic error */
				6269	BUG_ON(!block_group_cache_done(block_group));
				6270	ret = btrfs_remove_free_space(block_group, start, num_bytes);
				6271	} else {
				6272	mutex_lock(&caching_ctl->mutex);
				6273
				6274	if (start >= caching_ctl->progress) {
				6275	ret = add_excluded_extent(fs_info, start, num_bytes);
				6276	} else if (start + num_bytes <= caching_ctl->progress) {
				6277	ret = btrfs_remove_free_space(block_group,
				6278	start, num_bytes);
				6279	} else {
				6280	num_bytes = caching_ctl->progress - start;
				6281	ret = btrfs_remove_free_space(block_group,
				6282	start, num_bytes);
				6283	if (ret)
				6284	goto out_lock;
				6285
				6286	num_bytes = (start + num_bytes) -
				6287	caching_ctl->progress;
				6288	start = caching_ctl->progress;
				6289	ret = add_excluded_extent(fs_info, start, num_bytes);
				6290	}
				6291	out_lock:
				6292	mutex_unlock(&caching_ctl->mutex);
				6293	put_caching_control(caching_ctl);
				6294	}
				6295	btrfs_put_block_group(block_group);
				6296	return ret;
				6297	}
				6298
				6299	int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info,
				6300	struct extent_buffer *eb)
				6301	{
				6302	struct btrfs_file_extent_item *item;
				6303	struct btrfs_key key;
				6304	int found_type;
				6305	int i;
				6306	int ret = 0;
				6307
				6308	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
				6309	return 0;
				6310
				6311	for (i = 0; i < btrfs_header_nritems(eb); i++) {
				6312	btrfs_item_key_to_cpu(eb, &key, i);
				6313	if (key.type != BTRFS_EXTENT_DATA_KEY)
				6314	continue;
				6315	item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
				6316	found_type = btrfs_file_extent_type(eb, item);
				6317	if (found_type == BTRFS_FILE_EXTENT_INLINE)
				6318	continue;
				6319	if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
				6320	continue;
				6321	key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
				6322	key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
				6323	ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
				6324	if (ret)
				6325	break;
				6326	}
				6327
				6328	return ret;
				6329	}
				6330
				6331	static void
				6332	btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
				6333	{
				6334	atomic_inc(&bg->reservations);
				6335	}
				6336
				6337	void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
				6338	const u64 start)
				6339	{
				6340	struct btrfs_block_group_cache *bg;
				6341
				6342	bg = btrfs_lookup_block_group(fs_info, start);
				6343	ASSERT(bg);
				6344	if (atomic_dec_and_test(&bg->reservations))
				6345	wake_up_var(&bg->reservations);
				6346	btrfs_put_block_group(bg);
				6347	}
				6348
				6349	void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
				6350	{
				6351	struct btrfs_space_info *space_info = bg->space_info;
				6352
				6353	ASSERT(bg->ro);
				6354
				6355	if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
				6356	return;
				6357
				6358	/*
				6359	* Our block group is read only but before we set it to read only,
				6360	* some task might have had allocated an extent from it already, but it
				6361	* has not yet created a respective ordered extent (and added it to a
				6362	* root's list of ordered extents).
				6363	* Therefore wait for any task currently allocating extents, since the
				6364	* block group's reservations counter is incremented while a read lock
				6365	* on the groups' semaphore is held and decremented after releasing
				6366	* the read access on that semaphore and creating the ordered extent.
				6367	*/
				6368	down_write(&space_info->groups_sem);
				6369	up_write(&space_info->groups_sem);
				6370
				6371	wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
				6372	}
				6373
				6374	/**
				6375	* btrfs_add_reserved_bytes - update the block_group and space info counters
				6376	* @cache: The cache we are manipulating
				6377	* @ram_bytes: The number of bytes of file content, and will be same to
				6378	* @num_bytes except for the compress path.
				6379	* @num_bytes: The number of bytes in question
				6380	* @delalloc: The blocks are allocated for the delalloc write
				6381	*
				6382	* This is called by the allocator when it reserves space. If this is a
				6383	* reservation and the block group has become read only we cannot make the
				6384	* reservation and return -EAGAIN, otherwise this function always succeeds.
				6385	*/
				6386	static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
				6387	u64 ram_bytes, u64 num_bytes, int delalloc)
				6388	{
				6389	struct btrfs_space_info *space_info = cache->space_info;
				6390	int ret = 0;
				6391
				6392	spin_lock(&space_info->lock);
				6393	spin_lock(&cache->lock);
				6394	if (cache->ro) {
				6395	ret = -EAGAIN;
				6396	} else {
				6397	cache->reserved += num_bytes;
				6398	space_info->bytes_reserved += num_bytes;
				6399
				6400	trace_btrfs_space_reservation(cache->fs_info,
				6401	"space_info", space_info->flags,
				6402	ram_bytes, 0);
				6403	space_info->bytes_may_use -= ram_bytes;
				6404	if (delalloc)
				6405	cache->delalloc_bytes += num_bytes;
				6406	}
				6407	spin_unlock(&cache->lock);
				6408	spin_unlock(&space_info->lock);
				6409	return ret;
				6410	}
				6411
				6412	/**
				6413	* btrfs_free_reserved_bytes - update the block_group and space info counters
				6414	* @cache: The cache we are manipulating
				6415	* @num_bytes: The number of bytes in question
				6416	* @delalloc: The blocks are allocated for the delalloc write
				6417	*
				6418	* This is called by somebody who is freeing space that was never actually used
				6419	* on disk. For example if you reserve some space for a new leaf in transaction
				6420	* A and before transaction A commits you free that leaf, you call this with
				6421	* reserve set to 0 in order to clear the reservation.
				6422	*/
				6423
				6424	static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
				6425	u64 num_bytes, int delalloc)
				6426	{
				6427	struct btrfs_space_info *space_info = cache->space_info;
				6428	int ret = 0;
				6429
				6430	spin_lock(&space_info->lock);
				6431	spin_lock(&cache->lock);
				6432	if (cache->ro)
				6433	space_info->bytes_readonly += num_bytes;
				6434	cache->reserved -= num_bytes;
				6435	space_info->bytes_reserved -= num_bytes;
				6436	space_info->max_extent_size = 0;
				6437
				6438	if (delalloc)
				6439	cache->delalloc_bytes -= num_bytes;
				6440	spin_unlock(&cache->lock);
				6441	spin_unlock(&space_info->lock);
				6442	return ret;
				6443	}
				6444	void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
				6445	{
				6446	struct btrfs_caching_control *next;
				6447	struct btrfs_caching_control *caching_ctl;
				6448	struct btrfs_block_group_cache *cache;
				6449
				6450	down_write(&fs_info->commit_root_sem);
				6451
				6452	list_for_each_entry_safe(caching_ctl, next,
				6453	&fs_info->caching_block_groups, list) {
				6454	cache = caching_ctl->block_group;
				6455	if (block_group_cache_done(cache)) {
				6456	cache->last_byte_to_unpin = (u64)-1;
				6457	list_del_init(&caching_ctl->list);
				6458	put_caching_control(caching_ctl);
				6459	} else {
				6460	cache->last_byte_to_unpin = caching_ctl->progress;
				6461	}
				6462	}
				6463
				6464	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
				6465	fs_info->pinned_extents = &fs_info->freed_extents[1];
				6466	else
				6467	fs_info->pinned_extents = &fs_info->freed_extents[0];
				6468
				6469	up_write(&fs_info->commit_root_sem);
				6470
				6471	update_global_block_rsv(fs_info);
				6472	}
				6473
				6474	/*
				6475	* Returns the free cluster for the given space info and sets empty_cluster to
				6476	* what it should be based on the mount options.
				6477	*/
				6478	static struct btrfs_free_cluster *
				6479	fetch_cluster_info(struct btrfs_fs_info *fs_info,
				6480	struct btrfs_space_info space_info, u64 empty_cluster)
				6481	{
				6482	struct btrfs_free_cluster *ret = NULL;
				6483
				6484	*empty_cluster = 0;
				6485	if (btrfs_mixed_space_info(space_info))
				6486	return ret;
				6487
				6488	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
				6489	ret = &fs_info->meta_alloc_cluster;
				6490	if (btrfs_test_opt(fs_info, SSD))
				6491	*empty_cluster = SZ_2M;
				6492	else
				6493	*empty_cluster = SZ_64K;
				6494	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
				6495	btrfs_test_opt(fs_info, SSD_SPREAD)) {
				6496	*empty_cluster = SZ_2M;
				6497	ret = &fs_info->data_alloc_cluster;
				6498	}
				6499
				6500	return ret;
				6501	}
				6502
				6503	static int unpin_extent_range(struct btrfs_fs_info *fs_info,
				6504	u64 start, u64 end,
				6505	const bool return_free_space)
				6506	{
				6507	struct btrfs_block_group_cache *cache = NULL;
				6508	struct btrfs_space_info *space_info;
				6509	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
				6510	struct btrfs_free_cluster *cluster = NULL;
				6511	u64 len;
				6512	u64 total_unpinned = 0;
				6513	u64 empty_cluster = 0;
				6514	bool readonly;
				6515
				6516	while (start <= end) {
				6517	readonly = false;
				6518	if (!cache \|\|
				6519	start >= cache->key.objectid + cache->key.offset) {
				6520	if (cache)
				6521	btrfs_put_block_group(cache);
				6522	total_unpinned = 0;
				6523	cache = btrfs_lookup_block_group(fs_info, start);
				6524	BUG_ON(!cache); /* Logic error */
				6525
				6526	cluster = fetch_cluster_info(fs_info,
				6527	cache->space_info,
				6528	&empty_cluster);
				6529	empty_cluster <<= 1;
				6530	}
				6531
				6532	len = cache->key.objectid + cache->key.offset - start;
				6533	len = min(len, end + 1 - start);
				6534
				6535	if (start < cache->last_byte_to_unpin) {
				6536	len = min(len, cache->last_byte_to_unpin - start);
				6537	if (return_free_space)
				6538	btrfs_add_free_space(cache, start, len);
				6539	}
				6540
				6541	start += len;
				6542	total_unpinned += len;
				6543	space_info = cache->space_info;
				6544
				6545	/*
				6546	* If this space cluster has been marked as fragmented and we've
				6547	* unpinned enough in this block group to potentially allow a
				6548	* cluster to be created inside of it go ahead and clear the
				6549	* fragmented check.
				6550	*/
				6551	if (cluster && cluster->fragmented &&
				6552	total_unpinned > empty_cluster) {
				6553	spin_lock(&cluster->lock);
				6554	cluster->fragmented = 0;
				6555	spin_unlock(&cluster->lock);
				6556	}
				6557
				6558	spin_lock(&space_info->lock);
				6559	spin_lock(&cache->lock);
				6560	cache->pinned -= len;
				6561	space_info->bytes_pinned -= len;
				6562
				6563	trace_btrfs_space_reservation(fs_info, "pinned",
				6564	space_info->flags, len, 0);
				6565	space_info->max_extent_size = 0;
				6566	percpu_counter_add_batch(&space_info->total_bytes_pinned,
				6567	-len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
				6568	if (cache->ro) {
				6569	space_info->bytes_readonly += len;
				6570	readonly = true;
				6571	}
				6572	spin_unlock(&cache->lock);
				6573	if (!readonly && return_free_space &&
				6574	global_rsv->space_info == space_info) {
				6575	u64 to_add = len;
				6576
				6577	spin_lock(&global_rsv->lock);
				6578	if (!global_rsv->full) {
				6579	to_add = min(len, global_rsv->size -
				6580	global_rsv->reserved);
				6581	global_rsv->reserved += to_add;
				6582	space_info->bytes_may_use += to_add;
				6583	if (global_rsv->reserved >= global_rsv->size)
				6584	global_rsv->full = 1;
				6585	trace_btrfs_space_reservation(fs_info,
				6586	"space_info",
				6587	space_info->flags,
				6588	to_add, 1);
				6589	len -= to_add;
				6590	}
				6591	spin_unlock(&global_rsv->lock);
				6592	/* Add to any tickets we may have */
				6593	if (len)
				6594	space_info_add_new_bytes(fs_info, space_info,
				6595	len);
				6596	}
				6597	spin_unlock(&space_info->lock);
				6598	}
				6599
				6600	if (cache)
				6601	btrfs_put_block_group(cache);
				6602	return 0;
				6603	}
				6604
				6605	int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
				6606	{
				6607	struct btrfs_fs_info *fs_info = trans->fs_info;
				6608	struct btrfs_block_group_cache block_group, tmp;
				6609	struct list_head *deleted_bgs;
				6610	struct extent_io_tree *unpin;
				6611	u64 start;
				6612	u64 end;
				6613	int ret;
				6614
				6615	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
				6616	unpin = &fs_info->freed_extents[1];
				6617	else
				6618	unpin = &fs_info->freed_extents[0];
				6619
				6620	while (!trans->aborted) {
				6621	struct extent_state *cached_state = NULL;
				6622
				6623	mutex_lock(&fs_info->unused_bg_unpin_mutex);
				6624	ret = find_first_extent_bit(unpin, 0, &start, &end,
				6625	EXTENT_DIRTY, &cached_state);
				6626	if (ret) {
				6627	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
				6628	break;
				6629	}
				6630
				6631	if (btrfs_test_opt(fs_info, DISCARD))
				6632	ret = btrfs_discard_extent(fs_info, start,
				6633	end + 1 - start, NULL);
				6634
				6635	clear_extent_dirty(unpin, start, end, &cached_state);
				6636	unpin_extent_range(fs_info, start, end, true);
				6637	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
				6638	free_extent_state(cached_state);
				6639	cond_resched();
				6640	}
				6641
				6642	/*
				6643	* Transaction is finished. We don't need the lock anymore. We
				6644	* do need to clean up the block groups in case of a transaction
				6645	* abort.
				6646	*/
				6647	deleted_bgs = &trans->transaction->deleted_bgs;
				6648	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
				6649	u64 trimmed = 0;
				6650
				6651	ret = -EROFS;
				6652	if (!trans->aborted)
				6653	ret = btrfs_discard_extent(fs_info,
				6654	block_group->key.objectid,
				6655	block_group->key.offset,
				6656	&trimmed);
				6657
				6658	list_del_init(&block_group->bg_list);
				6659	btrfs_put_block_group_trimming(block_group);
				6660	btrfs_put_block_group(block_group);
				6661
				6662	if (ret) {
				6663	const char *errstr = btrfs_decode_error(ret);
				6664	btrfs_warn(fs_info,
				6665	"discard failed while removing blockgroup: errno=%d %s",
				6666	ret, errstr);
				6667	}
				6668	}
				6669
				6670	return 0;
				6671	}
				6672
				6673	static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
				6674	struct btrfs_delayed_ref_node *node, u64 parent,
				6675	u64 root_objectid, u64 owner_objectid,
				6676	u64 owner_offset, int refs_to_drop,
				6677	struct btrfs_delayed_extent_op *extent_op)
				6678	{
				6679	struct btrfs_fs_info *info = trans->fs_info;
				6680	struct btrfs_key key;
				6681	struct btrfs_path *path;
				6682	struct btrfs_root *extent_root = info->extent_root;
				6683	struct extent_buffer *leaf;
				6684	struct btrfs_extent_item *ei;
				6685	struct btrfs_extent_inline_ref *iref;
				6686	int ret;
				6687	int is_data;
				6688	int extent_slot = 0;
				6689	int found_extent = 0;
				6690	int num_to_del = 1;
				6691	u32 item_size;
				6692	u64 refs;
				6693	u64 bytenr = node->bytenr;
				6694	u64 num_bytes = node->num_bytes;
				6695	int last_ref = 0;
				6696	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
				6697
				6698	path = btrfs_alloc_path();
				6699	if (!path)
				6700	return -ENOMEM;
				6701
				6702	path->reada = READA_FORWARD;
				6703	path->leave_spinning = 1;
				6704
				6705	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
				6706	BUG_ON(!is_data && refs_to_drop != 1);
				6707
				6708	if (is_data)
				6709	skinny_metadata = false;
				6710
				6711	ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
				6712	parent, root_objectid, owner_objectid,
				6713	owner_offset);
				6714	if (ret == 0) {
				6715	extent_slot = path->slots[0];
				6716	while (extent_slot >= 0) {
				6717	btrfs_item_key_to_cpu(path->nodes[0], &key,
				6718	extent_slot);
				6719	if (key.objectid != bytenr)
				6720	break;
				6721	if (key.type == BTRFS_EXTENT_ITEM_KEY &&
				6722	key.offset == num_bytes) {
				6723	found_extent = 1;
				6724	break;
				6725	}
				6726	if (key.type == BTRFS_METADATA_ITEM_KEY &&
				6727	key.offset == owner_objectid) {
				6728	found_extent = 1;
				6729	break;
				6730	}
				6731	if (path->slots[0] - extent_slot > 5)
				6732	break;
				6733	extent_slot--;
				6734	}
				6735
				6736	if (!found_extent) {
				6737	BUG_ON(iref);
				6738	ret = remove_extent_backref(trans, path, NULL,
				6739	refs_to_drop,
				6740	is_data, &last_ref);
				6741	if (ret) {
				6742	btrfs_abort_transaction(trans, ret);
				6743	goto out;
				6744	}
				6745	btrfs_release_path(path);
				6746	path->leave_spinning = 1;
				6747
				6748	key.objectid = bytenr;
				6749	key.type = BTRFS_EXTENT_ITEM_KEY;
				6750	key.offset = num_bytes;
				6751
				6752	if (!is_data && skinny_metadata) {
				6753	key.type = BTRFS_METADATA_ITEM_KEY;
				6754	key.offset = owner_objectid;
				6755	}
				6756
				6757	ret = btrfs_search_slot(trans, extent_root,
				6758	&key, path, -1, 1);
				6759	if (ret > 0 && skinny_metadata && path->slots[0]) {
				6760	/*
				6761	* Couldn't find our skinny metadata item,
				6762	* see if we have ye olde extent item.
				6763	*/
				6764	path->slots[0]--;
				6765	btrfs_item_key_to_cpu(path->nodes[0], &key,
				6766	path->slots[0]);
				6767	if (key.objectid == bytenr &&
				6768	key.type == BTRFS_EXTENT_ITEM_KEY &&
				6769	key.offset == num_bytes)
				6770	ret = 0;
				6771	}
				6772
				6773	if (ret > 0 && skinny_metadata) {
				6774	skinny_metadata = false;
				6775	key.objectid = bytenr;
				6776	key.type = BTRFS_EXTENT_ITEM_KEY;
				6777	key.offset = num_bytes;
				6778	btrfs_release_path(path);
				6779	ret = btrfs_search_slot(trans, extent_root,
				6780	&key, path, -1, 1);
				6781	}
				6782
				6783	if (ret) {
				6784	btrfs_err(info,
				6785	"umm, got %d back from search, was looking for %llu",
				6786	ret, bytenr);
				6787	if (ret > 0)
				6788	btrfs_print_leaf(path->nodes[0]);
				6789	}
				6790	if (ret < 0) {
				6791	btrfs_abort_transaction(trans, ret);
				6792	goto out;
				6793	}
				6794	extent_slot = path->slots[0];
				6795	}
				6796	} else if (WARN_ON(ret == -ENOENT)) {
				6797	btrfs_print_leaf(path->nodes[0]);
				6798	btrfs_err(info,
				6799	"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
				6800	bytenr, parent, root_objectid, owner_objectid,
				6801	owner_offset);
				6802	btrfs_abort_transaction(trans, ret);
				6803	goto out;
				6804	} else {
				6805	btrfs_abort_transaction(trans, ret);
				6806	goto out;
				6807	}
				6808
				6809	leaf = path->nodes[0];
				6810	item_size = btrfs_item_size_nr(leaf, extent_slot);
				6811	if (unlikely(item_size < sizeof(*ei))) {
				6812	ret = -EINVAL;
				6813	btrfs_print_v0_err(info);
				6814	btrfs_abort_transaction(trans, ret);
				6815	goto out;
				6816	}
				6817	ei = btrfs_item_ptr(leaf, extent_slot,
				6818	struct btrfs_extent_item);
				6819	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
				6820	key.type == BTRFS_EXTENT_ITEM_KEY) {
				6821	struct btrfs_tree_block_info *bi;
				6822	BUG_ON(item_size < sizeof(ei) + sizeof(bi));
				6823	bi = (struct btrfs_tree_block_info *)(ei + 1);
				6824	WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
				6825	}
				6826
				6827	refs = btrfs_extent_refs(leaf, ei);
				6828	if (refs < refs_to_drop) {
				6829	btrfs_err(info,
				6830	"trying to drop %d refs but we only have %Lu for bytenr %Lu",
				6831	refs_to_drop, refs, bytenr);
				6832	ret = -EINVAL;
				6833	btrfs_abort_transaction(trans, ret);
				6834	goto out;
				6835	}
				6836	refs -= refs_to_drop;
				6837
				6838	if (refs > 0) {
				6839	if (extent_op)
				6840	__run_delayed_extent_op(extent_op, leaf, ei);
				6841	/*
				6842	* In the case of inline back ref, reference count will
				6843	* be updated by remove_extent_backref
				6844	*/
				6845	if (iref) {
				6846	BUG_ON(!found_extent);
				6847	} else {
				6848	btrfs_set_extent_refs(leaf, ei, refs);
				6849	btrfs_mark_buffer_dirty(leaf);
				6850	}
				6851	if (found_extent) {
				6852	ret = remove_extent_backref(trans, path, iref,
				6853	refs_to_drop, is_data,
				6854	&last_ref);
				6855	if (ret) {
				6856	btrfs_abort_transaction(trans, ret);
				6857	goto out;
				6858	}
				6859	}
				6860	} else {
				6861	if (found_extent) {
				6862	BUG_ON(is_data && refs_to_drop !=
				6863	extent_data_ref_count(path, iref));
				6864	if (iref) {
				6865	BUG_ON(path->slots[0] != extent_slot);
				6866	} else {
				6867	BUG_ON(path->slots[0] != extent_slot + 1);
				6868	path->slots[0] = extent_slot;
				6869	num_to_del = 2;
				6870	}
				6871	}
				6872
				6873	last_ref = 1;
				6874	ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
				6875	num_to_del);
				6876	if (ret) {
				6877	btrfs_abort_transaction(trans, ret);
				6878	goto out;
				6879	}
				6880	btrfs_release_path(path);
				6881
				6882	if (is_data) {
				6883	ret = btrfs_del_csums(trans, info->csum_root, bytenr,
				6884	num_bytes);
				6885	if (ret) {
				6886	btrfs_abort_transaction(trans, ret);
				6887	goto out;
				6888	}
				6889	}
				6890
				6891	ret = add_to_free_space_tree(trans, bytenr, num_bytes);
				6892	if (ret) {
				6893	btrfs_abort_transaction(trans, ret);
				6894	goto out;
				6895	}
				6896
				6897	ret = update_block_group(trans, info, bytenr, num_bytes, 0);
				6898	if (ret) {
				6899	btrfs_abort_transaction(trans, ret);
				6900	goto out;
				6901	}
				6902	}
				6903	btrfs_release_path(path);
				6904
				6905	out:
				6906	btrfs_free_path(path);
				6907	return ret;
				6908	}
				6909
				6910	/*
				6911	* when we free an block, it is possible (and likely) that we free the last
				6912	* delayed ref for that extent as well. This searches the delayed ref tree for
				6913	* a given extent, and if there are no other delayed refs to be processed, it
				6914	* removes it from the tree.
				6915	*/
				6916	static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
				6917	u64 bytenr)
				6918	{
				6919	struct btrfs_delayed_ref_head *head;
				6920	struct btrfs_delayed_ref_root *delayed_refs;
				6921	int ret = 0;
				6922
				6923	delayed_refs = &trans->transaction->delayed_refs;
				6924	spin_lock(&delayed_refs->lock);
				6925	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
				6926	if (!head)
				6927	goto out_delayed_unlock;
				6928
				6929	spin_lock(&head->lock);
				6930	if (!RB_EMPTY_ROOT(&head->ref_tree))
				6931	goto out;
				6932
				6933	if (head->extent_op) {
				6934	if (!head->must_insert_reserved)
				6935	goto out;
				6936	btrfs_free_delayed_extent_op(head->extent_op);
				6937	head->extent_op = NULL;
				6938	}
				6939
				6940	/*
				6941	* waiting for the lock here would deadlock. If someone else has it
				6942	* locked they are already in the process of dropping it anyway
				6943	*/
				6944	if (!mutex_trylock(&head->mutex))
				6945	goto out;
				6946
				6947	/*
				6948	* at this point we have a head with no other entries. Go
				6949	* ahead and process it.
				6950	*/
				6951	rb_erase(&head->href_node, &delayed_refs->href_root);
				6952	RB_CLEAR_NODE(&head->href_node);
				6953	atomic_dec(&delayed_refs->num_entries);
				6954
				6955	/*
				6956	* we don't take a ref on the node because we're removing it from the
				6957	* tree, so we just steal the ref the tree was holding.
				6958	*/
				6959	delayed_refs->num_heads--;
				6960	if (head->processing == 0)
				6961	delayed_refs->num_heads_ready--;
				6962	head->processing = 0;
				6963	spin_unlock(&head->lock);
				6964	spin_unlock(&delayed_refs->lock);
				6965
				6966	BUG_ON(head->extent_op);
				6967	if (head->must_insert_reserved)
				6968	ret = 1;
				6969
				6970	mutex_unlock(&head->mutex);
				6971	btrfs_put_delayed_ref_head(head);
				6972	return ret;
				6973	out:
				6974	spin_unlock(&head->lock);
				6975
				6976	out_delayed_unlock:
				6977	spin_unlock(&delayed_refs->lock);
				6978	return 0;
				6979	}
				6980
				6981	void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
				6982	struct btrfs_root *root,
				6983	struct extent_buffer *buf,
				6984	u64 parent, int last_ref)
				6985	{
				6986	struct btrfs_fs_info *fs_info = root->fs_info;
				6987	int pin = 1;
				6988	int ret;
				6989
				6990	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
				6991	int old_ref_mod, new_ref_mod;
				6992
				6993	btrfs_ref_tree_mod(root, buf->start, buf->len, parent,
				6994	root->root_key.objectid,
				6995	btrfs_header_level(buf), 0,
				6996	BTRFS_DROP_DELAYED_REF);
				6997	ret = btrfs_add_delayed_tree_ref(trans, buf->start,
				6998	buf->len, parent,
				6999	root->root_key.objectid,
				7000	btrfs_header_level(buf),
				7001	BTRFS_DROP_DELAYED_REF, NULL,
				7002	&old_ref_mod, &new_ref_mod);
				7003	BUG_ON(ret); /* -ENOMEM */
				7004	pin = old_ref_mod >= 0 && new_ref_mod < 0;
				7005	}
				7006
				7007	if (last_ref && btrfs_header_generation(buf) == trans->transid) {
				7008	struct btrfs_block_group_cache *cache;
				7009
				7010	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
				7011	ret = check_ref_cleanup(trans, buf->start);
				7012	if (!ret)
				7013	goto out;
				7014	}
				7015
				7016	pin = 0;
				7017	cache = btrfs_lookup_block_group(fs_info, buf->start);
				7018
				7019	if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
				7020	pin_down_extent(fs_info, cache, buf->start,
				7021	buf->len, 1);
				7022	btrfs_put_block_group(cache);
				7023	goto out;
				7024	}
				7025
				7026	WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
				7027
				7028	btrfs_add_free_space(cache, buf->start, buf->len);
				7029	btrfs_free_reserved_bytes(cache, buf->len, 0);
				7030	btrfs_put_block_group(cache);
				7031	trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
				7032	}
				7033	out:
				7034	if (pin)
				7035	add_pinned_bytes(fs_info, buf->len, true,
				7036	root->root_key.objectid);
				7037
				7038	if (last_ref) {
				7039	/*
				7040	* Deleting the buffer, clear the corrupt flag since it doesn't
				7041	* matter anymore.
				7042	*/
				7043	clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
				7044	}
				7045	}
				7046
				7047	/* Can return -ENOMEM */
				7048	int btrfs_free_extent(struct btrfs_trans_handle *trans,
				7049	struct btrfs_root *root,
				7050	u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
				7051	u64 owner, u64 offset)
				7052	{
				7053	struct btrfs_fs_info *fs_info = root->fs_info;
				7054	int old_ref_mod, new_ref_mod;
				7055	int ret;
				7056
				7057	if (btrfs_is_testing(fs_info))
				7058	return 0;
				7059
				7060	if (root_objectid != BTRFS_TREE_LOG_OBJECTID)
				7061	btrfs_ref_tree_mod(root, bytenr, num_bytes, parent,
				7062	root_objectid, owner, offset,
				7063	BTRFS_DROP_DELAYED_REF);
				7064
				7065	/*
				7066	* tree log blocks never actually go into the extent allocation
				7067	* tree, just update pinning info and exit early.
				7068	*/
				7069	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
				7070	WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
				7071	/* unlocks the pinned mutex */
				7072	btrfs_pin_extent(fs_info, bytenr, num_bytes, 1);
				7073	old_ref_mod = new_ref_mod = 0;
				7074	ret = 0;
				7075	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
				7076	ret = btrfs_add_delayed_tree_ref(trans, bytenr,
				7077	num_bytes, parent,
				7078	root_objectid, (int)owner,
				7079	BTRFS_DROP_DELAYED_REF, NULL,
				7080	&old_ref_mod, &new_ref_mod);
				7081	} else {
				7082	ret = btrfs_add_delayed_data_ref(trans, bytenr,
				7083	num_bytes, parent,
				7084	root_objectid, owner, offset,
				7085	0, BTRFS_DROP_DELAYED_REF,
				7086	&old_ref_mod, &new_ref_mod);
				7087	}
				7088
				7089	if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0) {
				7090	bool metadata = owner < BTRFS_FIRST_FREE_OBJECTID;
				7091
				7092	add_pinned_bytes(fs_info, num_bytes, metadata, root_objectid);
				7093	}
				7094
				7095	return ret;
				7096	}
				7097
				7098	/*
				7099	* when we wait for progress in the block group caching, its because
				7100	* our allocation attempt failed at least once. So, we must sleep
				7101	* and let some progress happen before we try again.
				7102	*
				7103	* This function will sleep at least once waiting for new free space to
				7104	* show up, and then it will check the block group free space numbers
				7105	* for our min num_bytes. Another option is to have it go ahead
				7106	* and look in the rbtree for a free extent of a given size, but this
				7107	* is a good start.
				7108	*
				7109	* Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
				7110	* any of the information in this block group.
				7111	*/
				7112	static noinline void
				7113	wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
				7114	u64 num_bytes)
				7115	{
				7116	struct btrfs_caching_control *caching_ctl;
				7117
				7118	caching_ctl = get_caching_control(cache);
				7119	if (!caching_ctl)
				7120	return;
				7121
				7122	wait_event(caching_ctl->wait, block_group_cache_done(cache) \|\|
				7123	(cache->free_space_ctl->free_space >= num_bytes));
				7124
				7125	put_caching_control(caching_ctl);
				7126	}
				7127
				7128	static noinline int
				7129	wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
				7130	{
				7131	struct btrfs_caching_control *caching_ctl;
				7132	int ret = 0;
				7133
				7134	caching_ctl = get_caching_control(cache);
				7135	if (!caching_ctl)
				7136	return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
				7137
				7138	wait_event(caching_ctl->wait, block_group_cache_done(cache));
				7139	if (cache->cached == BTRFS_CACHE_ERROR)
				7140	ret = -EIO;
				7141	put_caching_control(caching_ctl);
				7142	return ret;
				7143	}
				7144
				7145	enum btrfs_loop_type {
				7146	LOOP_CACHING_NOWAIT = 0,
				7147	LOOP_CACHING_WAIT = 1,
				7148	LOOP_ALLOC_CHUNK = 2,
				7149	LOOP_NO_EMPTY_SIZE = 3,
				7150	};
				7151
				7152	static inline void
				7153	btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
				7154	int delalloc)
				7155	{
				7156	if (delalloc)
				7157	down_read(&cache->data_rwsem);
				7158	}
				7159
				7160	static inline void
				7161	btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
				7162	int delalloc)
				7163	{
				7164	btrfs_get_block_group(cache);
				7165	if (delalloc)
				7166	down_read(&cache->data_rwsem);
				7167	}
				7168
				7169	static struct btrfs_block_group_cache *
				7170	btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
				7171	struct btrfs_free_cluster *cluster,
				7172	int delalloc)
				7173	{
				7174	struct btrfs_block_group_cache *used_bg = NULL;
				7175
				7176	spin_lock(&cluster->refill_lock);
				7177	while (1) {
				7178	used_bg = cluster->block_group;
				7179	if (!used_bg)
				7180	return NULL;
				7181
				7182	if (used_bg == block_group)
				7183	return used_bg;
				7184
				7185	btrfs_get_block_group(used_bg);
				7186
				7187	if (!delalloc)
				7188	return used_bg;
				7189
				7190	if (down_read_trylock(&used_bg->data_rwsem))
				7191	return used_bg;
				7192
				7193	spin_unlock(&cluster->refill_lock);
				7194
				7195	/* We should only have one-level nested. */
				7196	down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
				7197
				7198	spin_lock(&cluster->refill_lock);
				7199	if (used_bg == cluster->block_group)
				7200	return used_bg;
				7201
				7202	up_read(&used_bg->data_rwsem);
				7203	btrfs_put_block_group(used_bg);
				7204	}
				7205	}
				7206
				7207	static inline void
				7208	btrfs_release_block_group(struct btrfs_block_group_cache *cache,
				7209	int delalloc)
				7210	{
				7211	if (delalloc)
				7212	up_read(&cache->data_rwsem);
				7213	btrfs_put_block_group(cache);
				7214	}
				7215
				7216	/*
				7217	* walks the btree of allocated extents and find a hole of a given size.
				7218	* The key ins is changed to record the hole:
				7219	* ins->objectid == start position
				7220	* ins->flags = BTRFS_EXTENT_ITEM_KEY
				7221	* ins->offset == the size of the hole.
				7222	* Any available blocks before search_start are skipped.
				7223	*
				7224	* If there is no suitable free space, we will record the max size of
				7225	* the free space extent currently.
				7226	*/
				7227	static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
				7228	u64 ram_bytes, u64 num_bytes, u64 empty_size,
				7229	u64 hint_byte, struct btrfs_key *ins,
				7230	u64 flags, int delalloc)
				7231	{
				7232	int ret = 0;
				7233	struct btrfs_root *root = fs_info->extent_root;
				7234	struct btrfs_free_cluster *last_ptr = NULL;
				7235	struct btrfs_block_group_cache *block_group = NULL;
				7236	u64 search_start = 0;
				7237	u64 max_extent_size = 0;
				7238	u64 max_free_space = 0;
				7239	u64 empty_cluster = 0;
				7240	struct btrfs_space_info *space_info;
				7241	int loop = 0;
				7242	int index = btrfs_bg_flags_to_raid_index(flags);
				7243	bool failed_cluster_refill = false;
				7244	bool failed_alloc = false;
				7245	bool use_cluster = true;
				7246	bool have_caching_bg = false;
				7247	bool orig_have_caching_bg = false;
				7248	bool full_search = false;
				7249
				7250	WARN_ON(num_bytes < fs_info->sectorsize);
				7251	ins->type = BTRFS_EXTENT_ITEM_KEY;
				7252	ins->objectid = 0;
				7253	ins->offset = 0;
				7254
				7255	trace_find_free_extent(fs_info, num_bytes, empty_size, flags);
				7256
				7257	space_info = __find_space_info(fs_info, flags);
				7258	if (!space_info) {
				7259	btrfs_err(fs_info, "No space info for %llu", flags);
				7260	return -ENOSPC;
				7261	}
				7262
				7263	/*
				7264	* If our free space is heavily fragmented we may not be able to make
				7265	* big contiguous allocations, so instead of doing the expensive search
				7266	* for free space, simply return ENOSPC with our max_extent_size so we
				7267	* can go ahead and search for a more manageable chunk.
				7268	*
				7269	* If our max_extent_size is large enough for our allocation simply
				7270	* disable clustering since we will likely not be able to find enough
				7271	* space to create a cluster and induce latency trying.
				7272	*/
				7273	if (unlikely(space_info->max_extent_size)) {
				7274	spin_lock(&space_info->lock);
				7275	if (space_info->max_extent_size &&
				7276	num_bytes > space_info->max_extent_size) {
				7277	ins->offset = space_info->max_extent_size;
				7278	spin_unlock(&space_info->lock);
				7279	return -ENOSPC;
				7280	} else if (space_info->max_extent_size) {
				7281	use_cluster = false;
				7282	}
				7283	spin_unlock(&space_info->lock);
				7284	}
				7285
				7286	last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster);
				7287	if (last_ptr) {
				7288	spin_lock(&last_ptr->lock);
				7289	if (last_ptr->block_group)
				7290	hint_byte = last_ptr->window_start;
				7291	if (last_ptr->fragmented) {
				7292	/*
				7293	* We still set window_start so we can keep track of the
				7294	* last place we found an allocation to try and save
				7295	* some time.
				7296	*/
				7297	hint_byte = last_ptr->window_start;
				7298	use_cluster = false;
				7299	}
				7300	spin_unlock(&last_ptr->lock);
				7301	}
				7302
				7303	search_start = max(search_start, first_logical_byte(fs_info, 0));
				7304	search_start = max(search_start, hint_byte);
				7305	if (search_start == hint_byte) {
				7306	block_group = btrfs_lookup_block_group(fs_info, search_start);
				7307	/*
				7308	* we don't want to use the block group if it doesn't match our
				7309	* allocation bits, or if its not cached.
				7310	*
				7311	* However if we are re-searching with an ideal block group
				7312	* picked out then we don't care that the block group is cached.
				7313	*/
				7314	if (block_group && block_group_bits(block_group, flags) &&
				7315	block_group->cached != BTRFS_CACHE_NO) {
				7316	down_read(&space_info->groups_sem);
				7317	if (list_empty(&block_group->list) \|\|
				7318	block_group->ro) {
				7319	/*
				7320	* someone is removing this block group,
				7321	* we can't jump into the have_block_group
				7322	* target because our list pointers are not
				7323	* valid
				7324	*/
				7325	btrfs_put_block_group(block_group);
				7326	up_read(&space_info->groups_sem);
				7327	} else {
				7328	index = btrfs_bg_flags_to_raid_index(
				7329	block_group->flags);
				7330	btrfs_lock_block_group(block_group, delalloc);
				7331	goto have_block_group;
				7332	}
				7333	} else if (block_group) {
				7334	btrfs_put_block_group(block_group);
				7335	}
				7336	}
				7337	search:
				7338	have_caching_bg = false;
				7339	if (index == 0 \|\| index == btrfs_bg_flags_to_raid_index(flags))
				7340	full_search = true;
				7341	down_read(&space_info->groups_sem);
				7342	list_for_each_entry(block_group, &space_info->block_groups[index],
				7343	list) {
				7344	u64 offset;
				7345	int cached;
				7346
				7347	/* If the block group is read-only, we can skip it entirely. */
				7348	if (unlikely(block_group->ro))
				7349	continue;
				7350
				7351	btrfs_grab_block_group(block_group, delalloc);
				7352	search_start = block_group->key.objectid;
				7353
				7354	/*
				7355	* this can happen if we end up cycling through all the
				7356	* raid types, but we want to make sure we only allocate
				7357	* for the proper type.
				7358	*/
				7359	if (!block_group_bits(block_group, flags)) {
				7360	u64 extra = BTRFS_BLOCK_GROUP_DUP \|
				7361	BTRFS_BLOCK_GROUP_RAID1 \|
				7362	BTRFS_BLOCK_GROUP_RAID5 \|
				7363	BTRFS_BLOCK_GROUP_RAID6 \|
				7364	BTRFS_BLOCK_GROUP_RAID10;
				7365
				7366	/*
				7367	* if they asked for extra copies and this block group
				7368	* doesn't provide them, bail. This does allow us to
				7369	* fill raid0 from raid1.
				7370	*/
				7371	if ((flags & extra) && !(block_group->flags & extra))
				7372	goto loop;
				7373
				7374	/*
				7375	* This block group has different flags than we want.
				7376	* It's possible that we have MIXED_GROUP flag but no
				7377	* block group is mixed. Just skip such block group.
				7378	*/
				7379	btrfs_release_block_group(block_group, delalloc);
				7380	continue;
				7381	}
				7382
				7383	have_block_group:
				7384	cached = block_group_cache_done(block_group);
				7385	if (unlikely(!cached)) {
				7386	have_caching_bg = true;
				7387	ret = cache_block_group(block_group, 0);
				7388	BUG_ON(ret < 0);
				7389	ret = 0;
				7390	}
				7391
				7392	if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
				7393	goto loop;
				7394
				7395	/*
				7396	* Ok we want to try and use the cluster allocator, so
				7397	* lets look there
				7398	*/
				7399	if (last_ptr && use_cluster) {
				7400	struct btrfs_block_group_cache *used_block_group;
				7401	unsigned long aligned_cluster;
				7402	/*
				7403	* the refill lock keeps out other
				7404	* people trying to start a new cluster
				7405	*/
				7406	used_block_group = btrfs_lock_cluster(block_group,
				7407	last_ptr,
				7408	delalloc);
				7409	if (!used_block_group)
				7410	goto refill_cluster;
				7411
				7412	if (used_block_group != block_group &&
				7413	(used_block_group->ro \|\|
				7414	!block_group_bits(used_block_group, flags)))
				7415	goto release_cluster;
				7416
				7417	offset = btrfs_alloc_from_cluster(used_block_group,
				7418	last_ptr,
				7419	num_bytes,
				7420	used_block_group->key.objectid,
				7421	&max_extent_size);
				7422	if (offset) {
				7423	/* we have a block, we're done */
				7424	spin_unlock(&last_ptr->refill_lock);
				7425	trace_btrfs_reserve_extent_cluster(
				7426	used_block_group,
				7427	search_start, num_bytes);
				7428	if (used_block_group != block_group) {
				7429	btrfs_release_block_group(block_group,
				7430	delalloc);
				7431	block_group = used_block_group;
				7432	}
				7433	goto checks;
				7434	}
				7435
				7436	WARN_ON(last_ptr->block_group != used_block_group);
				7437	release_cluster:
				7438	/* If we are on LOOP_NO_EMPTY_SIZE, we can't
				7439	* set up a new clusters, so lets just skip it
				7440	* and let the allocator find whatever block
				7441	* it can find. If we reach this point, we
				7442	* will have tried the cluster allocator
				7443	* plenty of times and not have found
				7444	* anything, so we are likely way too
				7445	* fragmented for the clustering stuff to find
				7446	* anything.
				7447	*
				7448	* However, if the cluster is taken from the
				7449	* current block group, release the cluster
				7450	* first, so that we stand a better chance of
				7451	* succeeding in the unclustered
				7452	* allocation. */
				7453	if (loop >= LOOP_NO_EMPTY_SIZE &&
				7454	used_block_group != block_group) {
				7455	spin_unlock(&last_ptr->refill_lock);
				7456	btrfs_release_block_group(used_block_group,
				7457	delalloc);
				7458	goto unclustered_alloc;
				7459	}
				7460
				7461	/*
				7462	* this cluster didn't work out, free it and
				7463	* start over
				7464	*/
				7465	btrfs_return_cluster_to_free_space(NULL, last_ptr);
				7466
				7467	if (used_block_group != block_group)
				7468	btrfs_release_block_group(used_block_group,
				7469	delalloc);
				7470	refill_cluster:
				7471	if (loop >= LOOP_NO_EMPTY_SIZE) {
				7472	spin_unlock(&last_ptr->refill_lock);
				7473	goto unclustered_alloc;
				7474	}
				7475
				7476	aligned_cluster = max_t(unsigned long,
				7477	empty_cluster + empty_size,
				7478	block_group->full_stripe_len);
				7479
				7480	/* allocate a cluster in this block group */
				7481	ret = btrfs_find_space_cluster(fs_info, block_group,
				7482	last_ptr, search_start,
				7483	num_bytes,
				7484	aligned_cluster);
				7485	if (ret == 0) {
				7486	/*
				7487	* now pull our allocation out of this
				7488	* cluster
				7489	*/
				7490	offset = btrfs_alloc_from_cluster(block_group,
				7491	last_ptr,
				7492	num_bytes,
				7493	search_start,
				7494	&max_extent_size);
				7495	if (offset) {
				7496	/* we found one, proceed */
				7497	spin_unlock(&last_ptr->refill_lock);
				7498	trace_btrfs_reserve_extent_cluster(
				7499	block_group, search_start,
				7500	num_bytes);
				7501	goto checks;
				7502	}
				7503	} else if (!cached && loop > LOOP_CACHING_NOWAIT
				7504	&& !failed_cluster_refill) {
				7505	spin_unlock(&last_ptr->refill_lock);
				7506
				7507	failed_cluster_refill = true;
				7508	wait_block_group_cache_progress(block_group,
				7509	num_bytes + empty_cluster + empty_size);
				7510	goto have_block_group;
				7511	}
				7512
				7513	/*
				7514	* at this point we either didn't find a cluster
				7515	* or we weren't able to allocate a block from our
				7516	* cluster. Free the cluster we've been trying
				7517	* to use, and go to the next block group
				7518	*/
				7519	btrfs_return_cluster_to_free_space(NULL, last_ptr);
				7520	spin_unlock(&last_ptr->refill_lock);
				7521	goto loop;
				7522	}
				7523
				7524	unclustered_alloc:
				7525	/*
				7526	* We are doing an unclustered alloc, set the fragmented flag so
				7527	* we don't bother trying to setup a cluster again until we get
				7528	* more space.
				7529	*/
				7530	if (unlikely(last_ptr)) {
				7531	spin_lock(&last_ptr->lock);
				7532	last_ptr->fragmented = 1;
				7533	spin_unlock(&last_ptr->lock);
				7534	}
				7535	if (cached) {
				7536	struct btrfs_free_space_ctl *ctl =
				7537	block_group->free_space_ctl;
				7538
				7539	spin_lock(&ctl->tree_lock);
				7540	if (ctl->free_space <
				7541	num_bytes + empty_cluster + empty_size) {
				7542	max_free_space = max(max_free_space,
				7543	ctl->free_space);
				7544	spin_unlock(&ctl->tree_lock);
				7545	goto loop;
				7546	}
				7547	spin_unlock(&ctl->tree_lock);
				7548	}
				7549
				7550	offset = btrfs_find_space_for_alloc(block_group, search_start,
				7551	num_bytes, empty_size,
				7552	&max_extent_size);
				7553	/*
				7554	* If we didn't find a chunk, and we haven't failed on this
				7555	* block group before, and this block group is in the middle of
				7556	* caching and we are ok with waiting, then go ahead and wait
				7557	* for progress to be made, and set failed_alloc to true.
				7558	*
				7559	* If failed_alloc is true then we've already waited on this
				7560	* block group once and should move on to the next block group.
				7561	*/
				7562	if (!offset && !failed_alloc && !cached &&
				7563	loop > LOOP_CACHING_NOWAIT) {
				7564	wait_block_group_cache_progress(block_group,
				7565	num_bytes + empty_size);
				7566	failed_alloc = true;
				7567	goto have_block_group;
				7568	} else if (!offset) {
				7569	goto loop;
				7570	}
				7571	checks:
				7572	search_start = round_up(offset, fs_info->stripesize);
				7573
				7574	/* move on to the next group */
				7575	if (search_start + num_bytes >
				7576	block_group->key.objectid + block_group->key.offset) {
				7577	btrfs_add_free_space(block_group, offset, num_bytes);
				7578	goto loop;
				7579	}
				7580
				7581	if (offset < search_start)
				7582	btrfs_add_free_space(block_group, offset,
				7583	search_start - offset);
				7584
				7585	ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
				7586	num_bytes, delalloc);
				7587	if (ret == -EAGAIN) {
				7588	btrfs_add_free_space(block_group, offset, num_bytes);
				7589	goto loop;
				7590	}
				7591	btrfs_inc_block_group_reservations(block_group);
				7592
				7593	/* we are all good, lets return */
				7594	ins->objectid = search_start;
				7595	ins->offset = num_bytes;
				7596
				7597	trace_btrfs_reserve_extent(block_group, search_start, num_bytes);
				7598	btrfs_release_block_group(block_group, delalloc);
				7599	break;
				7600	loop:
				7601	failed_cluster_refill = false;
				7602	failed_alloc = false;
				7603	BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
				7604	index);
				7605	btrfs_release_block_group(block_group, delalloc);
				7606	cond_resched();
				7607	}
				7608	up_read(&space_info->groups_sem);
				7609
				7610	if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
				7611	&& !orig_have_caching_bg)
				7612	orig_have_caching_bg = true;
				7613
				7614	if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
				7615	goto search;
				7616
				7617	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
				7618	goto search;
				7619
				7620	/*
				7621	* LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
				7622	* caching kthreads as we move along
				7623	* LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
				7624	* LOOP_ALLOC_CHUNK, force a chunk allocation and try again
				7625	* LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
				7626	* again
				7627	*/
				7628	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
				7629	index = 0;
				7630	if (loop == LOOP_CACHING_NOWAIT) {
				7631	/*
				7632	* We want to skip the LOOP_CACHING_WAIT step if we
				7633	* don't have any uncached bgs and we've already done a
				7634	* full search through.
				7635	*/
				7636	if (orig_have_caching_bg \|\| !full_search)
				7637	loop = LOOP_CACHING_WAIT;
				7638	else
				7639	loop = LOOP_ALLOC_CHUNK;
				7640	} else {
				7641	loop++;
				7642	}
				7643
				7644	if (loop == LOOP_ALLOC_CHUNK) {
				7645	struct btrfs_trans_handle *trans;
				7646	int exist = 0;
				7647
				7648	trans = current->journal_info;
				7649	if (trans)
				7650	exist = 1;
				7651	else
				7652	trans = btrfs_join_transaction(root);
				7653
				7654	if (IS_ERR(trans)) {
				7655	ret = PTR_ERR(trans);
				7656	goto out;
				7657	}
				7658
				7659	ret = do_chunk_alloc(trans, flags, CHUNK_ALLOC_FORCE);
				7660
				7661	/*
				7662	* If we can't allocate a new chunk we've already looped
				7663	* through at least once, move on to the NO_EMPTY_SIZE
				7664	* case.
				7665	*/
				7666	if (ret == -ENOSPC)
				7667	loop = LOOP_NO_EMPTY_SIZE;
				7668
				7669	/*
				7670	* Do not bail out on ENOSPC since we
				7671	* can do more things.
				7672	*/
				7673	if (ret < 0 && ret != -ENOSPC)
				7674	btrfs_abort_transaction(trans, ret);
				7675	else
				7676	ret = 0;
				7677	if (!exist)
				7678	btrfs_end_transaction(trans);
				7679	if (ret)
				7680	goto out;
				7681	}
				7682
				7683	if (loop == LOOP_NO_EMPTY_SIZE) {
				7684	/*
				7685	* Don't loop again if we already have no empty_size and
				7686	* no empty_cluster.
				7687	*/
				7688	if (empty_size == 0 &&
				7689	empty_cluster == 0) {
				7690	ret = -ENOSPC;
				7691	goto out;
				7692	}
				7693	empty_size = 0;
				7694	empty_cluster = 0;
				7695	}
				7696
				7697	goto search;
				7698	} else if (!ins->objectid) {
				7699	ret = -ENOSPC;
				7700	} else if (ins->objectid) {
				7701	if (!use_cluster && last_ptr) {
				7702	spin_lock(&last_ptr->lock);
				7703	last_ptr->window_start = ins->objectid;
				7704	spin_unlock(&last_ptr->lock);
				7705	}
				7706	ret = 0;
				7707	}
				7708	out:
				7709	if (ret == -ENOSPC) {
				7710	if (!max_extent_size)
				7711	max_extent_size = max_free_space;
				7712	spin_lock(&space_info->lock);
				7713	space_info->max_extent_size = max_extent_size;
				7714	spin_unlock(&space_info->lock);
				7715	ins->offset = max_extent_size;
				7716	}
				7717	return ret;
				7718	}
				7719
				7720	static void dump_space_info(struct btrfs_fs_info *fs_info,
				7721	struct btrfs_space_info *info, u64 bytes,
				7722	int dump_block_groups)
				7723	{
				7724	struct btrfs_block_group_cache *cache;
				7725	int index = 0;
				7726
				7727	spin_lock(&info->lock);
				7728	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
				7729	info->flags,
				7730	info->total_bytes - btrfs_space_info_used(info, true),
				7731	info->full ? "" : "not ");
				7732	btrfs_info(fs_info,
				7733	"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
				7734	info->total_bytes, info->bytes_used, info->bytes_pinned,
				7735	info->bytes_reserved, info->bytes_may_use,
				7736	info->bytes_readonly);
				7737	spin_unlock(&info->lock);
				7738
				7739	if (!dump_block_groups)
				7740	return;
				7741
				7742	down_read(&info->groups_sem);
				7743	again:
				7744	list_for_each_entry(cache, &info->block_groups[index], list) {
				7745	spin_lock(&cache->lock);
				7746	btrfs_info(fs_info,
				7747	"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
				7748	cache->key.objectid, cache->key.offset,
				7749	btrfs_block_group_used(&cache->item), cache->pinned,
				7750	cache->reserved, cache->ro ? "[readonly]" : "");
				7751	btrfs_dump_free_space(cache, bytes);
				7752	spin_unlock(&cache->lock);
				7753	}
				7754	if (++index < BTRFS_NR_RAID_TYPES)
				7755	goto again;
				7756	up_read(&info->groups_sem);
				7757	}
				7758
				7759	/*
				7760	* btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
				7761	* hole that is at least as big as @num_bytes.
				7762	*
				7763	* @root - The root that will contain this extent
				7764	*
				7765	* @ram_bytes - The amount of space in ram that @num_bytes take. This
				7766	* is used for accounting purposes. This value differs
				7767	* from @num_bytes only in the case of compressed extents.
				7768	*
				7769	* @num_bytes - Number of bytes to allocate on-disk.
				7770	*
				7771	* @min_alloc_size - Indicates the minimum amount of space that the
				7772	* allocator should try to satisfy. In some cases
				7773	* @num_bytes may be larger than what is required and if
				7774	* the filesystem is fragmented then allocation fails.
				7775	* However, the presence of @min_alloc_size gives a
				7776	* chance to try and satisfy the smaller allocation.
				7777	*
				7778	* @empty_size - A hint that you plan on doing more COW. This is the
				7779	* size in bytes the allocator should try to find free
				7780	* next to the block it returns. This is just a hint and
				7781	* may be ignored by the allocator.
				7782	*
				7783	* @hint_byte - Hint to the allocator to start searching above the byte
				7784	* address passed. It might be ignored.
				7785	*
				7786	* @ins - This key is modified to record the found hole. It will
				7787	* have the following values:
				7788	* ins->objectid == start position
				7789	* ins->flags = BTRFS_EXTENT_ITEM_KEY
				7790	* ins->offset == the size of the hole.
				7791	*
				7792	* @is_data - Boolean flag indicating whether an extent is
				7793	* allocated for data (true) or metadata (false)
				7794	*
				7795	* @delalloc - Boolean flag indicating whether this allocation is for
				7796	* delalloc or not. If 'true' data_rwsem of block groups
				7797	* is going to be acquired.
				7798	*
				7799	*
				7800	* Returns 0 when an allocation succeeded or < 0 when an error occurred. In
				7801	* case -ENOSPC is returned then @ins->offset will contain the size of the
				7802	* largest available hole the allocator managed to find.
				7803	*/
				7804	int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
				7805	u64 num_bytes, u64 min_alloc_size,
				7806	u64 empty_size, u64 hint_byte,
				7807	struct btrfs_key *ins, int is_data, int delalloc)
				7808	{
				7809	struct btrfs_fs_info *fs_info = root->fs_info;
				7810	bool final_tried = num_bytes == min_alloc_size;
				7811	u64 flags;
				7812	int ret;
				7813
				7814	flags = get_alloc_profile_by_root(root, is_data);
				7815	again:
				7816	WARN_ON(num_bytes < fs_info->sectorsize);
				7817	ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
				7818	hint_byte, ins, flags, delalloc);
				7819	if (!ret && !is_data) {
				7820	btrfs_dec_block_group_reservations(fs_info, ins->objectid);
				7821	} else if (ret == -ENOSPC) {
				7822	if (!final_tried && ins->offset) {
				7823	num_bytes = min(num_bytes >> 1, ins->offset);
				7824	num_bytes = round_down(num_bytes,
				7825	fs_info->sectorsize);
				7826	num_bytes = max(num_bytes, min_alloc_size);
				7827	ram_bytes = num_bytes;
				7828	if (num_bytes == min_alloc_size)
				7829	final_tried = true;
				7830	goto again;
				7831	} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
				7832	struct btrfs_space_info *sinfo;
				7833
				7834	sinfo = __find_space_info(fs_info, flags);
				7835	btrfs_err(fs_info,
				7836	"allocation failed flags %llu, wanted %llu",
				7837	flags, num_bytes);
				7838	if (sinfo)
				7839	dump_space_info(fs_info, sinfo, num_bytes, 1);
				7840	}
				7841	}
				7842
				7843	return ret;
				7844	}
				7845
				7846	static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
				7847	u64 start, u64 len,
				7848	int pin, int delalloc)
				7849	{
				7850	struct btrfs_block_group_cache *cache;
				7851	int ret = 0;
				7852
				7853	cache = btrfs_lookup_block_group(fs_info, start);
				7854	if (!cache) {
				7855	btrfs_err(fs_info, "Unable to find block group for %llu",
				7856	start);
				7857	return -ENOSPC;
				7858	}
				7859
				7860	if (pin)
				7861	pin_down_extent(fs_info, cache, start, len, 1);
				7862	else {
				7863	if (btrfs_test_opt(fs_info, DISCARD))
				7864	ret = btrfs_discard_extent(fs_info, start, len, NULL);
				7865	btrfs_add_free_space(cache, start, len);
				7866	btrfs_free_reserved_bytes(cache, len, delalloc);
				7867	trace_btrfs_reserved_extent_free(fs_info, start, len);
				7868	}
				7869
				7870	btrfs_put_block_group(cache);
				7871	return ret;
				7872	}
				7873
				7874	int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
				7875	u64 start, u64 len, int delalloc)
				7876	{
				7877	return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
				7878	}
				7879
				7880	int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
				7881	u64 start, u64 len)
				7882	{
				7883	return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
				7884	}
				7885
				7886	static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				7887	u64 parent, u64 root_objectid,
				7888	u64 flags, u64 owner, u64 offset,
				7889	struct btrfs_key *ins, int ref_mod)
				7890	{
				7891	struct btrfs_fs_info *fs_info = trans->fs_info;
				7892	int ret;
				7893	struct btrfs_extent_item *extent_item;
				7894	struct btrfs_extent_inline_ref *iref;
				7895	struct btrfs_path *path;
				7896	struct extent_buffer *leaf;
				7897	int type;
				7898	u32 size;
				7899
				7900	if (parent > 0)
				7901	type = BTRFS_SHARED_DATA_REF_KEY;
				7902	else
				7903	type = BTRFS_EXTENT_DATA_REF_KEY;
				7904
				7905	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
				7906
				7907	path = btrfs_alloc_path();
				7908	if (!path)
				7909	return -ENOMEM;
				7910
				7911	path->leave_spinning = 1;
				7912	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
				7913	ins, size);
				7914	if (ret) {
				7915	btrfs_free_path(path);
				7916	return ret;
				7917	}
				7918
				7919	leaf = path->nodes[0];
				7920	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				7921	struct btrfs_extent_item);
				7922	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
				7923	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
				7924	btrfs_set_extent_flags(leaf, extent_item,
				7925	flags \| BTRFS_EXTENT_FLAG_DATA);
				7926
				7927	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
				7928	btrfs_set_extent_inline_ref_type(leaf, iref, type);
				7929	if (parent > 0) {
				7930	struct btrfs_shared_data_ref *ref;
				7931	ref = (struct btrfs_shared_data_ref *)(iref + 1);
				7932	btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
				7933	btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
				7934	} else {
				7935	struct btrfs_extent_data_ref *ref;
				7936	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
				7937	btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
				7938	btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
				7939	btrfs_set_extent_data_ref_offset(leaf, ref, offset);
				7940	btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
				7941	}
				7942
				7943	btrfs_mark_buffer_dirty(path->nodes[0]);
				7944	btrfs_free_path(path);
				7945
				7946	ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
				7947	if (ret)
				7948	return ret;
				7949
				7950	ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1);
				7951	if (ret) { /* -ENOENT, logic error */
				7952	btrfs_err(fs_info, "update block group failed for %llu %llu",
				7953	ins->objectid, ins->offset);
				7954	BUG();
				7955	}
				7956	trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
				7957	return ret;
				7958	}
				7959
				7960	static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				7961	struct btrfs_delayed_ref_node *node,
				7962	struct btrfs_delayed_extent_op *extent_op)
				7963	{
				7964	struct btrfs_fs_info *fs_info = trans->fs_info;
				7965	int ret;
				7966	struct btrfs_extent_item *extent_item;
				7967	struct btrfs_key extent_key;
				7968	struct btrfs_tree_block_info *block_info;
				7969	struct btrfs_extent_inline_ref *iref;
				7970	struct btrfs_path *path;
				7971	struct extent_buffer *leaf;
				7972	struct btrfs_delayed_tree_ref *ref;
				7973	u32 size = sizeof(extent_item) + sizeof(iref);
				7974	u64 num_bytes;
				7975	u64 flags = extent_op->flags_to_set;
				7976	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
				7977
				7978	ref = btrfs_delayed_node_to_tree_ref(node);
				7979
				7980	extent_key.objectid = node->bytenr;
				7981	if (skinny_metadata) {
				7982	extent_key.offset = ref->level;
				7983	extent_key.type = BTRFS_METADATA_ITEM_KEY;
				7984	num_bytes = fs_info->nodesize;
				7985	} else {
				7986	extent_key.offset = node->num_bytes;
				7987	extent_key.type = BTRFS_EXTENT_ITEM_KEY;
				7988	size += sizeof(*block_info);
				7989	num_bytes = node->num_bytes;
				7990	}
				7991
				7992	path = btrfs_alloc_path();
				7993	if (!path)
				7994	return -ENOMEM;
				7995
				7996	path->leave_spinning = 1;
				7997	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
				7998	&extent_key, size);
				7999	if (ret) {
				8000	btrfs_free_path(path);
				8001	return ret;
				8002	}
				8003
				8004	leaf = path->nodes[0];
				8005	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				8006	struct btrfs_extent_item);
				8007	btrfs_set_extent_refs(leaf, extent_item, 1);
				8008	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
				8009	btrfs_set_extent_flags(leaf, extent_item,
				8010	flags \| BTRFS_EXTENT_FLAG_TREE_BLOCK);
				8011
				8012	if (skinny_metadata) {
				8013	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
				8014	} else {
				8015	block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
				8016	btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
				8017	btrfs_set_tree_block_level(leaf, block_info, ref->level);
				8018	iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
				8019	}
				8020
				8021	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
				8022	BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
				8023	btrfs_set_extent_inline_ref_type(leaf, iref,
				8024	BTRFS_SHARED_BLOCK_REF_KEY);
				8025	btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
				8026	} else {
				8027	btrfs_set_extent_inline_ref_type(leaf, iref,
				8028	BTRFS_TREE_BLOCK_REF_KEY);
				8029	btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
				8030	}
				8031
				8032	btrfs_mark_buffer_dirty(leaf);
				8033	btrfs_free_path(path);
				8034
				8035	ret = remove_from_free_space_tree(trans, extent_key.objectid,
				8036	num_bytes);
				8037	if (ret)
				8038	return ret;
				8039
				8040	ret = update_block_group(trans, fs_info, extent_key.objectid,
				8041	fs_info->nodesize, 1);
				8042	if (ret) { /* -ENOENT, logic error */
				8043	btrfs_err(fs_info, "update block group failed for %llu %llu",
				8044	extent_key.objectid, extent_key.offset);
				8045	BUG();
				8046	}
				8047
				8048	trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
				8049	fs_info->nodesize);
				8050	return ret;
				8051	}
				8052
				8053	int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				8054	struct btrfs_root *root, u64 owner,
				8055	u64 offset, u64 ram_bytes,
				8056	struct btrfs_key *ins)
				8057	{
				8058	int ret;
				8059
				8060	BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
				8061
				8062	btrfs_ref_tree_mod(root, ins->objectid, ins->offset, 0,
				8063	root->root_key.objectid, owner, offset,
				8064	BTRFS_ADD_DELAYED_EXTENT);
				8065
				8066	ret = btrfs_add_delayed_data_ref(trans, ins->objectid,
				8067	ins->offset, 0,
				8068	root->root_key.objectid, owner,
				8069	offset, ram_bytes,
				8070	BTRFS_ADD_DELAYED_EXTENT, NULL, NULL);
				8071	return ret;
				8072	}
				8073
				8074	/*
				8075	* this is used by the tree logging recovery code. It records that
				8076	* an extent has been allocated and makes sure to clear the free
				8077	* space cache bits as well
				8078	*/
				8079	int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
				8080	u64 root_objectid, u64 owner, u64 offset,
				8081	struct btrfs_key *ins)
				8082	{
				8083	struct btrfs_fs_info *fs_info = trans->fs_info;
				8084	int ret;
				8085	struct btrfs_block_group_cache *block_group;
				8086	struct btrfs_space_info *space_info;
				8087
				8088	/*
				8089	* Mixed block groups will exclude before processing the log so we only
				8090	* need to do the exclude dance if this fs isn't mixed.
				8091	*/
				8092	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
				8093	ret = __exclude_logged_extent(fs_info, ins->objectid,
				8094	ins->offset);
				8095	if (ret)
				8096	return ret;
				8097	}
				8098
				8099	block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
				8100	if (!block_group)
				8101	return -EINVAL;
				8102
				8103	space_info = block_group->space_info;
				8104	spin_lock(&space_info->lock);
				8105	spin_lock(&block_group->lock);
				8106	space_info->bytes_reserved += ins->offset;
				8107	block_group->reserved += ins->offset;
				8108	spin_unlock(&block_group->lock);
				8109	spin_unlock(&space_info->lock);
				8110
				8111	ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
				8112	offset, ins, 1);
				8113	btrfs_put_block_group(block_group);
				8114	return ret;
				8115	}
				8116
				8117	static struct extent_buffer *
				8118	btrfs_init_new_buffer(struct btrfs_trans_handle trans, struct btrfs_root root,
				8119	u64 bytenr, int level, u64 owner)
				8120	{
				8121	struct btrfs_fs_info *fs_info = root->fs_info;
				8122	struct extent_buffer *buf;
				8123
				8124	buf = btrfs_find_create_tree_block(fs_info, bytenr);
				8125	if (IS_ERR(buf))
				8126	return buf;
				8127
				8128	/*
				8129	* Extra safety check in case the extent tree is corrupted and extent
				8130	* allocator chooses to use a tree block which is already used and
				8131	* locked.
				8132	*/
				8133	if (buf->lock_owner == current->pid) {
				8134	btrfs_err_rl(fs_info,
				8135	"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
				8136	buf->start, btrfs_header_owner(buf), current->pid);
				8137	free_extent_buffer(buf);
				8138	return ERR_PTR(-EUCLEAN);
				8139	}
				8140
				8141	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
				8142	btrfs_tree_lock(buf);
				8143	clean_tree_block(fs_info, buf);
				8144	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
				8145
				8146	btrfs_set_lock_blocking(buf);
				8147	set_extent_buffer_uptodate(buf);
				8148
				8149	memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
				8150	btrfs_set_header_level(buf, level);
				8151	btrfs_set_header_bytenr(buf, buf->start);
				8152	btrfs_set_header_generation(buf, trans->transid);
				8153	btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
				8154	btrfs_set_header_owner(buf, owner);
				8155	write_extent_buffer_fsid(buf, fs_info->fsid);
				8156	write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
				8157	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
				8158	buf->log_index = root->log_transid % 2;
				8159	/*
				8160	* we allow two log transactions at a time, use different
				8161	* EXENT bit to differentiate dirty pages.
				8162	*/
				8163	if (buf->log_index == 0)
				8164	set_extent_dirty(&root->dirty_log_pages, buf->start,
				8165	buf->start + buf->len - 1, GFP_NOFS);
				8166	else
				8167	set_extent_new(&root->dirty_log_pages, buf->start,
				8168	buf->start + buf->len - 1);
				8169	} else {
				8170	buf->log_index = -1;
				8171	set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
				8172	buf->start + buf->len - 1, GFP_NOFS);
				8173	}
				8174	trans->dirty = true;
				8175	/* this returns a buffer locked for blocking */
				8176	return buf;
				8177	}
				8178
				8179	static struct btrfs_block_rsv *
				8180	use_block_rsv(struct btrfs_trans_handle *trans,
				8181	struct btrfs_root *root, u32 blocksize)
				8182	{
				8183	struct btrfs_fs_info *fs_info = root->fs_info;
				8184	struct btrfs_block_rsv *block_rsv;
				8185	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
				8186	int ret;
				8187	bool global_updated = false;
				8188
				8189	block_rsv = get_block_rsv(trans, root);
				8190
				8191	if (unlikely(block_rsv->size == 0))
				8192	goto try_reserve;
				8193	again:
				8194	ret = block_rsv_use_bytes(block_rsv, blocksize);
				8195	if (!ret)
				8196	return block_rsv;
				8197
				8198	if (block_rsv->failfast)
				8199	return ERR_PTR(ret);
				8200
				8201	if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
				8202	global_updated = true;
				8203	update_global_block_rsv(fs_info);
				8204	goto again;
				8205	}
				8206
				8207	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
				8208	static DEFINE_RATELIMIT_STATE(_rs,
				8209	DEFAULT_RATELIMIT_INTERVAL * 10,
				8210	/DEFAULT_RATELIMIT_BURST/ 1);
				8211	if (__ratelimit(&_rs))
				8212	WARN(1, KERN_DEBUG
				8213	"BTRFS: block rsv returned %d\n", ret);
				8214	}
				8215	try_reserve:
				8216	ret = reserve_metadata_bytes(root, block_rsv, blocksize,
				8217	BTRFS_RESERVE_NO_FLUSH);
				8218	if (!ret)
				8219	return block_rsv;
				8220	/*
				8221	* If we couldn't reserve metadata bytes try and use some from
				8222	* the global reserve if its space type is the same as the global
				8223	* reservation.
				8224	*/
				8225	if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
				8226	block_rsv->space_info == global_rsv->space_info) {
				8227	ret = block_rsv_use_bytes(global_rsv, blocksize);
				8228	if (!ret)
				8229	return global_rsv;
				8230	}
				8231	return ERR_PTR(ret);
				8232	}
				8233
				8234	static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
				8235	struct btrfs_block_rsv *block_rsv, u32 blocksize)
				8236	{
				8237	block_rsv_add_bytes(block_rsv, blocksize, 0);
				8238	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
				8239	}
				8240
				8241	/*
				8242	* finds a free extent and does all the dirty work required for allocation
				8243	* returns the tree buffer or an ERR_PTR on error.
				8244	*/
				8245	struct extent_buffer btrfs_alloc_tree_block(struct btrfs_trans_handle trans,
				8246	struct btrfs_root *root,
				8247	u64 parent, u64 root_objectid,
				8248	const struct btrfs_disk_key *key,
				8249	int level, u64 hint,
				8250	u64 empty_size)
				8251	{
				8252	struct btrfs_fs_info *fs_info = root->fs_info;
				8253	struct btrfs_key ins;
				8254	struct btrfs_block_rsv *block_rsv;
				8255	struct extent_buffer *buf;
				8256	struct btrfs_delayed_extent_op *extent_op;
				8257	u64 flags = 0;
				8258	int ret;
				8259	u32 blocksize = fs_info->nodesize;
				8260	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
				8261
				8262	#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
				8263	if (btrfs_is_testing(fs_info)) {
				8264	buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
				8265	level, root_objectid);
				8266	if (!IS_ERR(buf))
				8267	root->alloc_bytenr += blocksize;
				8268	return buf;
				8269	}
				8270	#endif
				8271
				8272	block_rsv = use_block_rsv(trans, root, blocksize);
				8273	if (IS_ERR(block_rsv))
				8274	return ERR_CAST(block_rsv);
				8275
				8276	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
				8277	empty_size, hint, &ins, 0, 0);
				8278	if (ret)
				8279	goto out_unuse;
				8280
				8281	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
				8282	root_objectid);
				8283	if (IS_ERR(buf)) {
				8284	ret = PTR_ERR(buf);
				8285	goto out_free_reserved;
				8286	}
				8287
				8288	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
				8289	if (parent == 0)
				8290	parent = ins.objectid;
				8291	flags \|= BTRFS_BLOCK_FLAG_FULL_BACKREF;
				8292	} else
				8293	BUG_ON(parent > 0);
				8294
				8295	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
				8296	extent_op = btrfs_alloc_delayed_extent_op();
				8297	if (!extent_op) {
				8298	ret = -ENOMEM;
				8299	goto out_free_buf;
				8300	}
				8301	if (key)
				8302	memcpy(&extent_op->key, key, sizeof(extent_op->key));
				8303	else
				8304	memset(&extent_op->key, 0, sizeof(extent_op->key));
				8305	extent_op->flags_to_set = flags;
				8306	extent_op->update_key = skinny_metadata ? false : true;
				8307	extent_op->update_flags = true;
				8308	extent_op->is_data = false;
				8309	extent_op->level = level;
				8310
				8311	btrfs_ref_tree_mod(root, ins.objectid, ins.offset, parent,
				8312	root_objectid, level, 0,
				8313	BTRFS_ADD_DELAYED_EXTENT);
				8314	ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
				8315	ins.offset, parent,
				8316	root_objectid, level,
				8317	BTRFS_ADD_DELAYED_EXTENT,
				8318	extent_op, NULL, NULL);
				8319	if (ret)
				8320	goto out_free_delayed;
				8321	}
				8322	return buf;
				8323
				8324	out_free_delayed:
				8325	btrfs_free_delayed_extent_op(extent_op);
				8326	out_free_buf:
				8327	free_extent_buffer(buf);
				8328	out_free_reserved:
				8329	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
				8330	out_unuse:
				8331	unuse_block_rsv(fs_info, block_rsv, blocksize);
				8332	return ERR_PTR(ret);
				8333	}
				8334
				8335	struct walk_control {
				8336	u64 refs[BTRFS_MAX_LEVEL];
				8337	u64 flags[BTRFS_MAX_LEVEL];
				8338	struct btrfs_key update_progress;
				8339	int stage;
				8340	int level;
				8341	int shared_level;
				8342	int update_ref;
				8343	int keep_locks;
				8344	int reada_slot;
				8345	int reada_count;
				8346	};
				8347
				8348	#define DROP_REFERENCE 1
				8349	#define UPDATE_BACKREF 2
				8350
				8351	static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
				8352	struct btrfs_root *root,
				8353	struct walk_control *wc,
				8354	struct btrfs_path *path)
				8355	{
				8356	struct btrfs_fs_info *fs_info = root->fs_info;
				8357	u64 bytenr;
				8358	u64 generation;
				8359	u64 refs;
				8360	u64 flags;
				8361	u32 nritems;
				8362	struct btrfs_key key;
				8363	struct extent_buffer *eb;
				8364	int ret;
				8365	int slot;
				8366	int nread = 0;
				8367
				8368	if (path->slots[wc->level] < wc->reada_slot) {
				8369	wc->reada_count = wc->reada_count * 2 / 3;
				8370	wc->reada_count = max(wc->reada_count, 2);
				8371	} else {
				8372	wc->reada_count = wc->reada_count * 3 / 2;
				8373	wc->reada_count = min_t(int, wc->reada_count,
				8374	BTRFS_NODEPTRS_PER_BLOCK(fs_info));
				8375	}
				8376
				8377	eb = path->nodes[wc->level];
				8378	nritems = btrfs_header_nritems(eb);
				8379
				8380	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
				8381	if (nread >= wc->reada_count)
				8382	break;
				8383
				8384	cond_resched();
				8385	bytenr = btrfs_node_blockptr(eb, slot);
				8386	generation = btrfs_node_ptr_generation(eb, slot);
				8387
				8388	if (slot == path->slots[wc->level])
				8389	goto reada;
				8390
				8391	if (wc->stage == UPDATE_BACKREF &&
				8392	generation <= root->root_key.offset)
				8393	continue;
				8394
				8395	/* We don't lock the tree block, it's OK to be racy here */
				8396	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
				8397	wc->level - 1, 1, &refs,
				8398	&flags);
				8399	/* We don't care about errors in readahead. */
				8400	if (ret < 0)
				8401	continue;
				8402	BUG_ON(refs == 0);
				8403
				8404	if (wc->stage == DROP_REFERENCE) {
				8405	if (refs == 1)
				8406	goto reada;
				8407
				8408	if (wc->level == 1 &&
				8409	(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				8410	continue;
				8411	if (!wc->update_ref \|\|
				8412	generation <= root->root_key.offset)
				8413	continue;
				8414	btrfs_node_key_to_cpu(eb, &key, slot);
				8415	ret = btrfs_comp_cpu_keys(&key,
				8416	&wc->update_progress);
				8417	if (ret < 0)
				8418	continue;
				8419	} else {
				8420	if (wc->level == 1 &&
				8421	(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				8422	continue;
				8423	}
				8424	reada:
				8425	readahead_tree_block(fs_info, bytenr);
				8426	nread++;
				8427	}
				8428	wc->reada_slot = slot;
				8429	}
				8430
				8431	/*
				8432	* helper to process tree block while walking down the tree.
				8433	*
				8434	* when wc->stage == UPDATE_BACKREF, this function updates
				8435	* back refs for pointers in the block.
				8436	*
				8437	* NOTE: return value 1 means we should stop walking down.
				8438	*/
				8439	static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
				8440	struct btrfs_root *root,
				8441	struct btrfs_path *path,
				8442	struct walk_control *wc, int lookup_info)
				8443	{
				8444	struct btrfs_fs_info *fs_info = root->fs_info;
				8445	int level = wc->level;
				8446	struct extent_buffer *eb = path->nodes[level];
				8447	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
				8448	int ret;
				8449
				8450	if (wc->stage == UPDATE_BACKREF &&
				8451	btrfs_header_owner(eb) != root->root_key.objectid)
				8452	return 1;
				8453
				8454	/*
				8455	* when reference count of tree block is 1, it won't increase
				8456	* again. once full backref flag is set, we never clear it.
				8457	*/
				8458	if (lookup_info &&
				8459	((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) \|\|
				8460	(wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
				8461	BUG_ON(!path->locks[level]);
				8462	ret = btrfs_lookup_extent_info(trans, fs_info,
				8463	eb->start, level, 1,
				8464	&wc->refs[level],
				8465	&wc->flags[level]);
				8466	BUG_ON(ret == -ENOMEM);
				8467	if (ret)
				8468	return ret;
				8469	BUG_ON(wc->refs[level] == 0);
				8470	}
				8471
				8472	if (wc->stage == DROP_REFERENCE) {
				8473	if (wc->refs[level] > 1)
				8474	return 1;
				8475
				8476	if (path->locks[level] && !wc->keep_locks) {
				8477	btrfs_tree_unlock_rw(eb, path->locks[level]);
				8478	path->locks[level] = 0;
				8479	}
				8480	return 0;
				8481	}
				8482
				8483	/* wc->stage == UPDATE_BACKREF */
				8484	if (!(wc->flags[level] & flag)) {
				8485	BUG_ON(!path->locks[level]);
				8486	ret = btrfs_inc_ref(trans, root, eb, 1);
				8487	BUG_ON(ret); /* -ENOMEM */
				8488	ret = btrfs_dec_ref(trans, root, eb, 0);
				8489	BUG_ON(ret); /* -ENOMEM */
				8490	ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start,
				8491	eb->len, flag,
				8492	btrfs_header_level(eb), 0);
				8493	BUG_ON(ret); /* -ENOMEM */
				8494	wc->flags[level] \|= flag;
				8495	}
				8496
				8497	/*
				8498	* the block is shared by multiple trees, so it's not good to
				8499	* keep the tree lock
				8500	*/
				8501	if (path->locks[level] && level > 0) {
				8502	btrfs_tree_unlock_rw(eb, path->locks[level]);
				8503	path->locks[level] = 0;
				8504	}
				8505	return 0;
				8506	}
				8507
				8508	/*
				8509	* helper to process tree block pointer.
				8510	*
				8511	* when wc->stage == DROP_REFERENCE, this function checks
				8512	* reference count of the block pointed to. if the block
				8513	* is shared and we need update back refs for the subtree
				8514	* rooted at the block, this function changes wc->stage to
				8515	* UPDATE_BACKREF. if the block is shared and there is no
				8516	* need to update back, this function drops the reference
				8517	* to the block.
				8518	*
				8519	* NOTE: return value 1 means we should stop walking down.
				8520	*/
				8521	static noinline int do_walk_down(struct btrfs_trans_handle *trans,
				8522	struct btrfs_root *root,
				8523	struct btrfs_path *path,
				8524	struct walk_control wc, int lookup_info)
				8525	{
				8526	struct btrfs_fs_info *fs_info = root->fs_info;
				8527	u64 bytenr;
				8528	u64 generation;
				8529	u64 parent;
				8530	u32 blocksize;
				8531	struct btrfs_key key;
				8532	struct btrfs_key first_key;
				8533	struct extent_buffer *next;
				8534	int level = wc->level;
				8535	int reada = 0;
				8536	int ret = 0;
				8537	bool need_account = false;
				8538
				8539	generation = btrfs_node_ptr_generation(path->nodes[level],
				8540	path->slots[level]);
				8541	/*
				8542	* if the lower level block was created before the snapshot
				8543	* was created, we know there is no need to update back refs
				8544	* for the subtree
				8545	*/
				8546	if (wc->stage == UPDATE_BACKREF &&
				8547	generation <= root->root_key.offset) {
				8548	*lookup_info = 1;
				8549	return 1;
				8550	}
				8551
				8552	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
				8553	btrfs_node_key_to_cpu(path->nodes[level], &first_key,
				8554	path->slots[level]);
				8555	blocksize = fs_info->nodesize;
				8556
				8557	next = find_extent_buffer(fs_info, bytenr);
				8558	if (!next) {
				8559	next = btrfs_find_create_tree_block(fs_info, bytenr);
				8560	if (IS_ERR(next))
				8561	return PTR_ERR(next);
				8562
				8563	btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
				8564	level - 1);
				8565	reada = 1;
				8566	}
				8567	btrfs_tree_lock(next);
				8568	btrfs_set_lock_blocking(next);
				8569
				8570	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
				8571	&wc->refs[level - 1],
				8572	&wc->flags[level - 1]);
				8573	if (ret < 0)
				8574	goto out_unlock;
				8575
				8576	if (unlikely(wc->refs[level - 1] == 0)) {
				8577	btrfs_err(fs_info, "Missing references.");
				8578	ret = -EIO;
				8579	goto out_unlock;
				8580	}
				8581	*lookup_info = 0;
				8582
				8583	if (wc->stage == DROP_REFERENCE) {
				8584	if (wc->refs[level - 1] > 1) {
				8585	need_account = true;
				8586	if (level == 1 &&
				8587	(wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				8588	goto skip;
				8589
				8590	if (!wc->update_ref \|\|
				8591	generation <= root->root_key.offset)
				8592	goto skip;
				8593
				8594	btrfs_node_key_to_cpu(path->nodes[level], &key,
				8595	path->slots[level]);
				8596	ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
				8597	if (ret < 0)
				8598	goto skip;
				8599
				8600	wc->stage = UPDATE_BACKREF;
				8601	wc->shared_level = level - 1;
				8602	}
				8603	} else {
				8604	if (level == 1 &&
				8605	(wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				8606	goto skip;
				8607	}
				8608
				8609	if (!btrfs_buffer_uptodate(next, generation, 0)) {
				8610	btrfs_tree_unlock(next);
				8611	free_extent_buffer(next);
				8612	next = NULL;
				8613	*lookup_info = 1;
				8614	}
				8615
				8616	if (!next) {
				8617	if (reada && level == 1)
				8618	reada_walk_down(trans, root, wc, path);
				8619	next = read_tree_block(fs_info, bytenr, generation, level - 1,
				8620	&first_key);
				8621	if (IS_ERR(next)) {
				8622	return PTR_ERR(next);
				8623	} else if (!extent_buffer_uptodate(next)) {
				8624	free_extent_buffer(next);
				8625	return -EIO;
				8626	}
				8627	btrfs_tree_lock(next);
				8628	btrfs_set_lock_blocking(next);
				8629	}
				8630
				8631	level--;
				8632	ASSERT(level == btrfs_header_level(next));
				8633	if (level != btrfs_header_level(next)) {
				8634	btrfs_err(root->fs_info, "mismatched level");
				8635	ret = -EIO;
				8636	goto out_unlock;
				8637	}
				8638	path->nodes[level] = next;
				8639	path->slots[level] = 0;
				8640	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
				8641	wc->level = level;
				8642	if (wc->level == 1)
				8643	wc->reada_slot = 0;
				8644	return 0;
				8645	skip:
				8646	wc->refs[level - 1] = 0;
				8647	wc->flags[level - 1] = 0;
				8648	if (wc->stage == DROP_REFERENCE) {
				8649	if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
				8650	parent = path->nodes[level]->start;
				8651	} else {
				8652	ASSERT(root->root_key.objectid ==
				8653	btrfs_header_owner(path->nodes[level]));
				8654	if (root->root_key.objectid !=
				8655	btrfs_header_owner(path->nodes[level])) {
				8656	btrfs_err(root->fs_info,
				8657	"mismatched block owner");
				8658	ret = -EIO;
				8659	goto out_unlock;
				8660	}
				8661	parent = 0;
				8662	}
				8663
				8664	if (need_account) {
				8665	ret = btrfs_qgroup_trace_subtree(trans, next,
				8666	generation, level - 1);
				8667	if (ret) {
				8668	btrfs_err_rl(fs_info,
				8669	"Error %d accounting shared subtree. Quota is out of sync, rescan required.",
				8670	ret);
				8671	}
				8672	}
				8673	ret = btrfs_free_extent(trans, root, bytenr, blocksize,
				8674	parent, root->root_key.objectid,
				8675	level - 1, 0);
				8676	if (ret)
				8677	goto out_unlock;
				8678	}
				8679
				8680	*lookup_info = 1;
				8681	ret = 1;
				8682
				8683	out_unlock:
				8684	btrfs_tree_unlock(next);
				8685	free_extent_buffer(next);
				8686
				8687	return ret;
				8688	}
				8689
				8690	/*
				8691	* helper to process tree block while walking up the tree.
				8692	*
				8693	* when wc->stage == DROP_REFERENCE, this function drops
				8694	* reference count on the block.
				8695	*
				8696	* when wc->stage == UPDATE_BACKREF, this function changes
				8697	* wc->stage back to DROP_REFERENCE if we changed wc->stage
				8698	* to UPDATE_BACKREF previously while processing the block.
				8699	*
				8700	* NOTE: return value 1 means we should stop walking up.
				8701	*/
				8702	static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
				8703	struct btrfs_root *root,
				8704	struct btrfs_path *path,
				8705	struct walk_control *wc)
				8706	{
				8707	struct btrfs_fs_info *fs_info = root->fs_info;
				8708	int ret;
				8709	int level = wc->level;
				8710	struct extent_buffer *eb = path->nodes[level];
				8711	u64 parent = 0;
				8712
				8713	if (wc->stage == UPDATE_BACKREF) {
				8714	BUG_ON(wc->shared_level < level);
				8715	if (level < wc->shared_level)
				8716	goto out;
				8717
				8718	ret = find_next_key(path, level + 1, &wc->update_progress);
				8719	if (ret > 0)
				8720	wc->update_ref = 0;
				8721
				8722	wc->stage = DROP_REFERENCE;
				8723	wc->shared_level = -1;
				8724	path->slots[level] = 0;
				8725
				8726	/*
				8727	* check reference count again if the block isn't locked.
				8728	* we should start walking down the tree again if reference
				8729	* count is one.
				8730	*/
				8731	if (!path->locks[level]) {
				8732	BUG_ON(level == 0);
				8733	btrfs_tree_lock(eb);
				8734	btrfs_set_lock_blocking(eb);
				8735	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
				8736
				8737	ret = btrfs_lookup_extent_info(trans, fs_info,
				8738	eb->start, level, 1,
				8739	&wc->refs[level],
				8740	&wc->flags[level]);
				8741	if (ret < 0) {
				8742	btrfs_tree_unlock_rw(eb, path->locks[level]);
				8743	path->locks[level] = 0;
				8744	return ret;
				8745	}
				8746	BUG_ON(wc->refs[level] == 0);
				8747	if (wc->refs[level] == 1) {
				8748	btrfs_tree_unlock_rw(eb, path->locks[level]);
				8749	path->locks[level] = 0;
				8750	return 1;
				8751	}
				8752	}
				8753	}
				8754
				8755	/* wc->stage == DROP_REFERENCE */
				8756	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
				8757
				8758	if (wc->refs[level] == 1) {
				8759	if (level == 0) {
				8760	if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
				8761	ret = btrfs_dec_ref(trans, root, eb, 1);
				8762	else
				8763	ret = btrfs_dec_ref(trans, root, eb, 0);
				8764	BUG_ON(ret); /* -ENOMEM */
				8765	ret = btrfs_qgroup_trace_leaf_items(trans, eb);
				8766	if (ret) {
				8767	btrfs_err_rl(fs_info,
				8768	"error %d accounting leaf items. Quota is out of sync, rescan required.",
				8769	ret);
				8770	}
				8771	}
				8772	/* make block locked assertion in clean_tree_block happy */
				8773	if (!path->locks[level] &&
				8774	btrfs_header_generation(eb) == trans->transid) {
				8775	btrfs_tree_lock(eb);
				8776	btrfs_set_lock_blocking(eb);
				8777	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
				8778	}
				8779	clean_tree_block(fs_info, eb);
				8780	}
				8781
				8782	if (eb == root->node) {
				8783	if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
				8784	parent = eb->start;
				8785	else if (root->root_key.objectid != btrfs_header_owner(eb))
				8786	goto owner_mismatch;
				8787	} else {
				8788	if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
				8789	parent = path->nodes[level + 1]->start;
				8790	else if (root->root_key.objectid !=
				8791	btrfs_header_owner(path->nodes[level + 1]))
				8792	goto owner_mismatch;
				8793	}
				8794
				8795	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
				8796	out:
				8797	wc->refs[level] = 0;
				8798	wc->flags[level] = 0;
				8799	return 0;
				8800
				8801	owner_mismatch:
				8802	btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
				8803	btrfs_header_owner(eb), root->root_key.objectid);
				8804	return -EUCLEAN;
				8805	}
				8806
				8807	static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
				8808	struct btrfs_root *root,
				8809	struct btrfs_path *path,
				8810	struct walk_control *wc)
				8811	{
				8812	int level = wc->level;
				8813	int lookup_info = 1;
				8814	int ret;
				8815
				8816	while (level >= 0) {
				8817	ret = walk_down_proc(trans, root, path, wc, lookup_info);
				8818	if (ret > 0)
				8819	break;
				8820
				8821	if (level == 0)
				8822	break;
				8823
				8824	if (path->slots[level] >=
				8825	btrfs_header_nritems(path->nodes[level]))
				8826	break;
				8827
				8828	ret = do_walk_down(trans, root, path, wc, &lookup_info);
				8829	if (ret > 0) {
				8830	path->slots[level]++;
				8831	continue;
				8832	} else if (ret < 0)
				8833	return ret;
				8834	level = wc->level;
				8835	}
				8836	return 0;
				8837	}
				8838
				8839	static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
				8840	struct btrfs_root *root,
				8841	struct btrfs_path *path,
				8842	struct walk_control *wc, int max_level)
				8843	{
				8844	int level = wc->level;
				8845	int ret;
				8846
				8847	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
				8848	while (level < max_level && path->nodes[level]) {
				8849	wc->level = level;
				8850	if (path->slots[level] + 1 <
				8851	btrfs_header_nritems(path->nodes[level])) {
				8852	path->slots[level]++;
				8853	return 0;
				8854	} else {
				8855	ret = walk_up_proc(trans, root, path, wc);
				8856	if (ret > 0)
				8857	return 0;
				8858	if (ret < 0)
				8859	return ret;
				8860
				8861	if (path->locks[level]) {
				8862	btrfs_tree_unlock_rw(path->nodes[level],
				8863	path->locks[level]);
				8864	path->locks[level] = 0;
				8865	}
				8866	free_extent_buffer(path->nodes[level]);
				8867	path->nodes[level] = NULL;
				8868	level++;
				8869	}
				8870	}
				8871	return 1;
				8872	}
				8873
				8874	/*
				8875	* drop a subvolume tree.
				8876	*
				8877	* this function traverses the tree freeing any blocks that only
				8878	* referenced by the tree.
				8879	*
				8880	* when a shared tree block is found. this function decreases its
				8881	* reference count by one. if update_ref is true, this function
				8882	* also make sure backrefs for the shared block and all lower level
				8883	* blocks are properly updated.
				8884	*
				8885	* If called with for_reloc == 0, may exit early with -EAGAIN
				8886	*/
				8887	int btrfs_drop_snapshot(struct btrfs_root *root,
				8888	struct btrfs_block_rsv *block_rsv, int update_ref,
				8889	int for_reloc)
				8890	{
				8891	struct btrfs_fs_info *fs_info = root->fs_info;
				8892	struct btrfs_path *path;
				8893	struct btrfs_trans_handle *trans;
				8894	struct btrfs_root *tree_root = fs_info->tree_root;
				8895	struct btrfs_root_item *root_item = &root->root_item;
				8896	struct walk_control *wc;
				8897	struct btrfs_key key;
				8898	int err = 0;
				8899	int ret;
				8900	int level;
				8901	bool root_dropped = false;
				8902
				8903	btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid);
				8904
				8905	path = btrfs_alloc_path();
				8906	if (!path) {
				8907	err = -ENOMEM;
				8908	goto out;
				8909	}
				8910
				8911	wc = kzalloc(sizeof(*wc), GFP_NOFS);
				8912	if (!wc) {
				8913	btrfs_free_path(path);
				8914	err = -ENOMEM;
				8915	goto out;
				8916	}
				8917
				8918	trans = btrfs_start_transaction(tree_root, 0);
				8919	if (IS_ERR(trans)) {
				8920	err = PTR_ERR(trans);
				8921	goto out_free;
				8922	}
				8923
				8924	err = btrfs_run_delayed_items(trans);
				8925	if (err)
				8926	goto out_end_trans;
				8927
				8928	if (block_rsv)
				8929	trans->block_rsv = block_rsv;
				8930
				8931	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
				8932	level = btrfs_header_level(root->node);
				8933	path->nodes[level] = btrfs_lock_root_node(root);
				8934	btrfs_set_lock_blocking(path->nodes[level]);
				8935	path->slots[level] = 0;
				8936	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
				8937	memset(&wc->update_progress, 0,
				8938	sizeof(wc->update_progress));
				8939	} else {
				8940	btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
				8941	memcpy(&wc->update_progress, &key,
				8942	sizeof(wc->update_progress));
				8943
				8944	level = root_item->drop_level;
				8945	BUG_ON(level == 0);
				8946	path->lowest_level = level;
				8947	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				8948	path->lowest_level = 0;
				8949	if (ret < 0) {
				8950	err = ret;
				8951	goto out_end_trans;
				8952	}
				8953	WARN_ON(ret > 0);
				8954
				8955	/*
				8956	* unlock our path, this is safe because only this
				8957	* function is allowed to delete this snapshot
				8958	*/
				8959	btrfs_unlock_up_safe(path, 0);
				8960
				8961	level = btrfs_header_level(root->node);
				8962	while (1) {
				8963	btrfs_tree_lock(path->nodes[level]);
				8964	btrfs_set_lock_blocking(path->nodes[level]);
				8965	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
				8966
				8967	ret = btrfs_lookup_extent_info(trans, fs_info,
				8968	path->nodes[level]->start,
				8969	level, 1, &wc->refs[level],
				8970	&wc->flags[level]);
				8971	if (ret < 0) {
				8972	err = ret;
				8973	goto out_end_trans;
				8974	}
				8975	BUG_ON(wc->refs[level] == 0);
				8976
				8977	if (level == root_item->drop_level)
				8978	break;
				8979
				8980	btrfs_tree_unlock(path->nodes[level]);
				8981	path->locks[level] = 0;
				8982	WARN_ON(wc->refs[level] != 1);
				8983	level--;
				8984	}
				8985	}
				8986
				8987	wc->level = level;
				8988	wc->shared_level = -1;
				8989	wc->stage = DROP_REFERENCE;
				8990	wc->update_ref = update_ref;
				8991	wc->keep_locks = 0;
				8992	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
				8993
				8994	while (1) {
				8995
				8996	ret = walk_down_tree(trans, root, path, wc);
				8997	if (ret < 0) {
				8998	err = ret;
				8999	break;
				9000	}
				9001
				9002	ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
				9003	if (ret < 0) {
				9004	err = ret;
				9005	break;
				9006	}
				9007
				9008	if (ret > 0) {
				9009	BUG_ON(wc->stage != DROP_REFERENCE);
				9010	break;
				9011	}
				9012
				9013	if (wc->stage == DROP_REFERENCE) {
				9014	level = wc->level;
				9015	btrfs_node_key(path->nodes[level],
				9016	&root_item->drop_progress,
				9017	path->slots[level]);
				9018	root_item->drop_level = level;
				9019	}
				9020
				9021	BUG_ON(wc->level == 0);
				9022	if (btrfs_should_end_transaction(trans) \|\|
				9023	(!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
				9024	ret = btrfs_update_root(trans, tree_root,
				9025	&root->root_key,
				9026	root_item);
				9027	if (ret) {
				9028	btrfs_abort_transaction(trans, ret);
				9029	err = ret;
				9030	goto out_end_trans;
				9031	}
				9032
				9033	btrfs_end_transaction_throttle(trans);
				9034	if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
				9035	btrfs_debug(fs_info,
				9036	"drop snapshot early exit");
				9037	err = -EAGAIN;
				9038	goto out_free;
				9039	}
				9040
				9041	trans = btrfs_start_transaction(tree_root, 0);
				9042	if (IS_ERR(trans)) {
				9043	err = PTR_ERR(trans);
				9044	goto out_free;
				9045	}
				9046	if (block_rsv)
				9047	trans->block_rsv = block_rsv;
				9048	}
				9049	}
				9050	btrfs_release_path(path);
				9051	if (err)
				9052	goto out_end_trans;
				9053
				9054	ret = btrfs_del_root(trans, &root->root_key);
				9055	if (ret) {
				9056	btrfs_abort_transaction(trans, ret);
				9057	err = ret;
				9058	goto out_end_trans;
				9059	}
				9060
				9061	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
				9062	ret = btrfs_find_root(tree_root, &root->root_key, path,
				9063	NULL, NULL);
				9064	if (ret < 0) {
				9065	btrfs_abort_transaction(trans, ret);
				9066	err = ret;
				9067	goto out_end_trans;
				9068	} else if (ret > 0) {
				9069	/* if we fail to delete the orphan item this time
				9070	* around, it'll get picked up the next time.
				9071	*
				9072	* The most common failure here is just -ENOENT.
				9073	*/
				9074	btrfs_del_orphan_item(trans, tree_root,
				9075	root->root_key.objectid);
				9076	}
				9077	}
				9078
				9079	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
				9080	btrfs_add_dropped_root(trans, root);
				9081	} else {
				9082	free_extent_buffer(root->node);
				9083	free_extent_buffer(root->commit_root);
				9084	btrfs_put_fs_root(root);
				9085	}
				9086	root_dropped = true;
				9087	out_end_trans:
				9088	btrfs_end_transaction_throttle(trans);
				9089	out_free:
				9090	kfree(wc);
				9091	btrfs_free_path(path);
				9092	out:
				9093	/*
				9094	* So if we need to stop dropping the snapshot for whatever reason we
				9095	* need to make sure to add it back to the dead root list so that we
				9096	* keep trying to do the work later. This also cleans up roots if we
				9097	* don't have it in the radix (like when we recover after a power fail
				9098	* or unmount) so we don't leak memory.
				9099	*/
				9100	if (!for_reloc && !root_dropped)
				9101	btrfs_add_dead_root(root);
				9102	if (err && err != -EAGAIN)
				9103	btrfs_handle_fs_error(fs_info, err, NULL);
				9104	return err;
				9105	}
				9106
				9107	/*
				9108	* drop subtree rooted at tree block 'node'.
				9109	*
				9110	* NOTE: this function will unlock and release tree block 'node'
				9111	* only used by relocation code
				9112	*/
				9113	int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
				9114	struct btrfs_root *root,
				9115	struct extent_buffer *node,
				9116	struct extent_buffer *parent)
				9117	{
				9118	struct btrfs_fs_info *fs_info = root->fs_info;
				9119	struct btrfs_path *path;
				9120	struct walk_control *wc;
				9121	int level;
				9122	int parent_level;
				9123	int ret = 0;
				9124	int wret;
				9125
				9126	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
				9127
				9128	path = btrfs_alloc_path();
				9129	if (!path)
				9130	return -ENOMEM;
				9131
				9132	wc = kzalloc(sizeof(*wc), GFP_NOFS);
				9133	if (!wc) {
				9134	btrfs_free_path(path);
				9135	return -ENOMEM;
				9136	}
				9137
				9138	btrfs_assert_tree_locked(parent);
				9139	parent_level = btrfs_header_level(parent);
				9140	extent_buffer_get(parent);
				9141	path->nodes[parent_level] = parent;
				9142	path->slots[parent_level] = btrfs_header_nritems(parent);
				9143
				9144	btrfs_assert_tree_locked(node);
				9145	level = btrfs_header_level(node);
				9146	path->nodes[level] = node;
				9147	path->slots[level] = 0;
				9148	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
				9149
				9150	wc->refs[parent_level] = 1;
				9151	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
				9152	wc->level = level;
				9153	wc->shared_level = -1;
				9154	wc->stage = DROP_REFERENCE;
				9155	wc->update_ref = 0;
				9156	wc->keep_locks = 1;
				9157	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
				9158
				9159	while (1) {
				9160	wret = walk_down_tree(trans, root, path, wc);
				9161	if (wret < 0) {
				9162	ret = wret;
				9163	break;
				9164	}
				9165
				9166	wret = walk_up_tree(trans, root, path, wc, parent_level);
				9167	if (wret < 0)
				9168	ret = wret;
				9169	if (wret != 0)
				9170	break;
				9171	}
				9172
				9173	kfree(wc);
				9174	btrfs_free_path(path);
				9175	return ret;
				9176	}
				9177
				9178	static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags)
				9179	{
				9180	u64 num_devices;
				9181	u64 stripped;
				9182
				9183	/*
				9184	* if restripe for this chunk_type is on pick target profile and
				9185	* return, otherwise do the usual balance
				9186	*/
				9187	stripped = get_restripe_target(fs_info, flags);
				9188	if (stripped)
				9189	return extended_to_chunk(stripped);
				9190
				9191	num_devices = fs_info->fs_devices->rw_devices;
				9192
				9193	stripped = BTRFS_BLOCK_GROUP_RAID0 \|
				9194	BTRFS_BLOCK_GROUP_RAID5 \| BTRFS_BLOCK_GROUP_RAID6 \|
				9195	BTRFS_BLOCK_GROUP_RAID1 \| BTRFS_BLOCK_GROUP_RAID10;
				9196
				9197	if (num_devices == 1) {
				9198	stripped \|= BTRFS_BLOCK_GROUP_DUP;
				9199	stripped = flags & ~stripped;
				9200
				9201	/* turn raid0 into single device chunks */
				9202	if (flags & BTRFS_BLOCK_GROUP_RAID0)
				9203	return stripped;
				9204
				9205	/* turn mirroring into duplication */
				9206	if (flags & (BTRFS_BLOCK_GROUP_RAID1 \|
				9207	BTRFS_BLOCK_GROUP_RAID10))
				9208	return stripped \| BTRFS_BLOCK_GROUP_DUP;
				9209	} else {
				9210	/* they already had raid on here, just return */
				9211	if (flags & stripped)
				9212	return flags;
				9213
				9214	stripped \|= BTRFS_BLOCK_GROUP_DUP;
				9215	stripped = flags & ~stripped;
				9216
				9217	/* switch duplicated blocks with raid1 */
				9218	if (flags & BTRFS_BLOCK_GROUP_DUP)
				9219	return stripped \| BTRFS_BLOCK_GROUP_RAID1;
				9220
				9221	/* this is drive concat, leave it alone */
				9222	}
				9223
				9224	return flags;
				9225	}
				9226
				9227	static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
				9228	{
				9229	struct btrfs_space_info *sinfo = cache->space_info;
				9230	u64 num_bytes;
				9231	u64 min_allocable_bytes;
				9232	int ret = -ENOSPC;
				9233
				9234	/*
				9235	* We need some metadata space and system metadata space for
				9236	* allocating chunks in some corner cases until we force to set
				9237	* it to be readonly.
				9238	*/
				9239	if ((sinfo->flags &
				9240	(BTRFS_BLOCK_GROUP_SYSTEM \| BTRFS_BLOCK_GROUP_METADATA)) &&
				9241	!force)
				9242	min_allocable_bytes = SZ_1M;
				9243	else
				9244	min_allocable_bytes = 0;
				9245
				9246	spin_lock(&sinfo->lock);
				9247	spin_lock(&cache->lock);
				9248
				9249	if (cache->ro) {
				9250	cache->ro++;
				9251	ret = 0;
				9252	goto out;
				9253	}
				9254
				9255	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
				9256	cache->bytes_super - btrfs_block_group_used(&cache->item);
				9257
				9258	if (btrfs_space_info_used(sinfo, true) + num_bytes +
				9259	min_allocable_bytes <= sinfo->total_bytes) {
				9260	sinfo->bytes_readonly += num_bytes;
				9261	cache->ro++;
				9262	list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
				9263	ret = 0;
				9264	}
				9265	out:
				9266	spin_unlock(&cache->lock);
				9267	spin_unlock(&sinfo->lock);
				9268	return ret;
				9269	}
				9270
				9271	int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache)
				9272
				9273	{
				9274	struct btrfs_fs_info *fs_info = cache->fs_info;
				9275	struct btrfs_trans_handle *trans;
				9276	u64 alloc_flags;
				9277	int ret;
				9278
				9279	again:
				9280	trans = btrfs_join_transaction(fs_info->extent_root);
				9281	if (IS_ERR(trans))
				9282	return PTR_ERR(trans);
				9283
				9284	/*
				9285	* we're not allowed to set block groups readonly after the dirty
				9286	* block groups cache has started writing. If it already started,
				9287	* back off and let this transaction commit
				9288	*/
				9289	mutex_lock(&fs_info->ro_block_group_mutex);
				9290	if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
				9291	u64 transid = trans->transid;
				9292
				9293	mutex_unlock(&fs_info->ro_block_group_mutex);
				9294	btrfs_end_transaction(trans);
				9295
				9296	ret = btrfs_wait_for_commit(fs_info, transid);
				9297	if (ret)
				9298	return ret;
				9299	goto again;
				9300	}
				9301
				9302	/*
				9303	* if we are changing raid levels, try to allocate a corresponding
				9304	* block group with the new raid level.
				9305	*/
				9306	alloc_flags = update_block_group_flags(fs_info, cache->flags);
				9307	if (alloc_flags != cache->flags) {
				9308	ret = do_chunk_alloc(trans, alloc_flags,
				9309	CHUNK_ALLOC_FORCE);
				9310	/*
				9311	* ENOSPC is allowed here, we may have enough space
				9312	* already allocated at the new raid level to
				9313	* carry on
				9314	*/
				9315	if (ret == -ENOSPC)
				9316	ret = 0;
				9317	if (ret < 0)
				9318	goto out;
				9319	}
				9320
				9321	ret = inc_block_group_ro(cache, 0);
				9322	if (!ret)
				9323	goto out;
				9324	alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags);
				9325	ret = do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
				9326	if (ret < 0)
				9327	goto out;
				9328	ret = inc_block_group_ro(cache, 0);
				9329	out:
				9330	if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
				9331	alloc_flags = update_block_group_flags(fs_info, cache->flags);
				9332	mutex_lock(&fs_info->chunk_mutex);
				9333	check_system_chunk(trans, alloc_flags);
				9334	mutex_unlock(&fs_info->chunk_mutex);
				9335	}
				9336	mutex_unlock(&fs_info->ro_block_group_mutex);
				9337
				9338	btrfs_end_transaction(trans);
				9339	return ret;
				9340	}
				9341
				9342	int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
				9343	{
				9344	u64 alloc_flags = get_alloc_profile(trans->fs_info, type);
				9345
				9346	return do_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
				9347	}
				9348
				9349	/*
				9350	* helper to account the unused space of all the readonly block group in the
				9351	* space_info. takes mirrors into account.
				9352	*/
				9353	u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
				9354	{
				9355	struct btrfs_block_group_cache *block_group;
				9356	u64 free_bytes = 0;
				9357	int factor;
				9358
				9359	/* It's df, we don't care if it's racy */
				9360	if (list_empty(&sinfo->ro_bgs))
				9361	return 0;
				9362
				9363	spin_lock(&sinfo->lock);
				9364	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
				9365	spin_lock(&block_group->lock);
				9366
				9367	if (!block_group->ro) {
				9368	spin_unlock(&block_group->lock);
				9369	continue;
				9370	}
				9371
				9372	factor = btrfs_bg_type_to_factor(block_group->flags);
				9373	free_bytes += (block_group->key.offset -
				9374	btrfs_block_group_used(&block_group->item)) *
				9375	factor;
				9376
				9377	spin_unlock(&block_group->lock);
				9378	}
				9379	spin_unlock(&sinfo->lock);
				9380
				9381	return free_bytes;
				9382	}
				9383
				9384	void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache)
				9385	{
				9386	struct btrfs_space_info *sinfo = cache->space_info;
				9387	u64 num_bytes;
				9388
				9389	BUG_ON(!cache->ro);
				9390
				9391	spin_lock(&sinfo->lock);
				9392	spin_lock(&cache->lock);
				9393	if (!--cache->ro) {
				9394	num_bytes = cache->key.offset - cache->reserved -
				9395	cache->pinned - cache->bytes_super -
				9396	btrfs_block_group_used(&cache->item);
				9397	sinfo->bytes_readonly -= num_bytes;
				9398	list_del_init(&cache->ro_list);
				9399	}
				9400	spin_unlock(&cache->lock);
				9401	spin_unlock(&sinfo->lock);
				9402	}
				9403
				9404	/*
				9405	* checks to see if its even possible to relocate this block group.
				9406	*
				9407	* @return - -1 if it's not a good idea to relocate this block group, 0 if its
				9408	* ok to go ahead and try.
				9409	*/
				9410	int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr)
				9411	{
				9412	struct btrfs_root *root = fs_info->extent_root;
				9413	struct btrfs_block_group_cache *block_group;
				9414	struct btrfs_space_info *space_info;
				9415	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
				9416	struct btrfs_device *device;
				9417	struct btrfs_trans_handle *trans;
				9418	u64 min_free;
				9419	u64 dev_min = 1;
				9420	u64 dev_nr = 0;
				9421	u64 target;
				9422	int debug;
				9423	int index;
				9424	int full = 0;
				9425	int ret = 0;
				9426
				9427	debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG);
				9428
				9429	block_group = btrfs_lookup_block_group(fs_info, bytenr);
				9430
				9431	/* odd, couldn't find the block group, leave it alone */
				9432	if (!block_group) {
				9433	if (debug)
				9434	btrfs_warn(fs_info,
				9435	"can't find block group for bytenr %llu",
				9436	bytenr);
				9437	return -1;
				9438	}
				9439
				9440	min_free = btrfs_block_group_used(&block_group->item);
				9441
				9442	/* no bytes used, we're good */
				9443	if (!min_free)
				9444	goto out;
				9445
				9446	space_info = block_group->space_info;
				9447	spin_lock(&space_info->lock);
				9448
				9449	full = space_info->full;
				9450
				9451	/*
				9452	* if this is the last block group we have in this space, we can't
				9453	* relocate it unless we're able to allocate a new chunk below.
				9454	*
				9455	* Otherwise, we need to make sure we have room in the space to handle
				9456	* all of the extents from this block group. If we can, we're good
				9457	*/
				9458	if ((space_info->total_bytes != block_group->key.offset) &&
				9459	(btrfs_space_info_used(space_info, false) + min_free <
				9460	space_info->total_bytes)) {
				9461	spin_unlock(&space_info->lock);
				9462	goto out;
				9463	}
				9464	spin_unlock(&space_info->lock);
				9465
				9466	/*
				9467	* ok we don't have enough space, but maybe we have free space on our
				9468	* devices to allocate new chunks for relocation, so loop through our
				9469	* alloc devices and guess if we have enough space. if this block
				9470	* group is going to be restriped, run checks against the target
				9471	* profile instead of the current one.
				9472	*/
				9473	ret = -1;
				9474
				9475	/*
				9476	* index:
				9477	* 0: raid10
				9478	* 1: raid1
				9479	* 2: dup
				9480	* 3: raid0
				9481	* 4: single
				9482	*/
				9483	target = get_restripe_target(fs_info, block_group->flags);
				9484	if (target) {
				9485	index = btrfs_bg_flags_to_raid_index(extended_to_chunk(target));
				9486	} else {
				9487	/*
				9488	* this is just a balance, so if we were marked as full
				9489	* we know there is no space for a new chunk
				9490	*/
				9491	if (full) {
				9492	if (debug)
				9493	btrfs_warn(fs_info,
				9494	"no space to alloc new chunk for block group %llu",
				9495	block_group->key.objectid);
				9496	goto out;
				9497	}
				9498
				9499	index = btrfs_bg_flags_to_raid_index(block_group->flags);
				9500	}
				9501
				9502	if (index == BTRFS_RAID_RAID10) {
				9503	dev_min = 4;
				9504	/* Divide by 2 */
				9505	min_free >>= 1;
				9506	} else if (index == BTRFS_RAID_RAID1) {
				9507	dev_min = 2;
				9508	} else if (index == BTRFS_RAID_DUP) {
				9509	/* Multiply by 2 */
				9510	min_free <<= 1;
				9511	} else if (index == BTRFS_RAID_RAID0) {
				9512	dev_min = fs_devices->rw_devices;
				9513	min_free = div64_u64(min_free, dev_min);
				9514	}
				9515
				9516	/* We need to do this so that we can look at pending chunks */
				9517	trans = btrfs_join_transaction(root);
				9518	if (IS_ERR(trans)) {
				9519	ret = PTR_ERR(trans);
				9520	goto out;
				9521	}
				9522
				9523	mutex_lock(&fs_info->chunk_mutex);
				9524	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
				9525	u64 dev_offset;
				9526
				9527	/*
				9528	* check to make sure we can actually find a chunk with enough
				9529	* space to fit our block group in.
				9530	*/
				9531	if (device->total_bytes > device->bytes_used + min_free &&
				9532	!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
				9533	ret = find_free_dev_extent(trans, device, min_free,
				9534	&dev_offset, NULL);
				9535	if (!ret)
				9536	dev_nr++;
				9537
				9538	if (dev_nr >= dev_min)
				9539	break;
				9540
				9541	ret = -1;
				9542	}
				9543	}
				9544	if (debug && ret == -1)
				9545	btrfs_warn(fs_info,
				9546	"no space to allocate a new chunk for block group %llu",
				9547	block_group->key.objectid);
				9548	mutex_unlock(&fs_info->chunk_mutex);
				9549	btrfs_end_transaction(trans);
				9550	out:
				9551	btrfs_put_block_group(block_group);
				9552	return ret;
				9553	}
				9554
				9555	static int find_first_block_group(struct btrfs_fs_info *fs_info,
				9556	struct btrfs_path *path,
				9557	struct btrfs_key *key)
				9558	{
				9559	struct btrfs_root *root = fs_info->extent_root;
				9560	int ret = 0;
				9561	struct btrfs_key found_key;
				9562	struct extent_buffer *leaf;
				9563	struct btrfs_block_group_item bg;
				9564	u64 flags;
				9565	int slot;
				9566
				9567	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
				9568	if (ret < 0)
				9569	goto out;
				9570
				9571	while (1) {
				9572	slot = path->slots[0];
				9573	leaf = path->nodes[0];
				9574	if (slot >= btrfs_header_nritems(leaf)) {
				9575	ret = btrfs_next_leaf(root, path);
				9576	if (ret == 0)
				9577	continue;
				9578	if (ret < 0)
				9579	goto out;
				9580	break;
				9581	}
				9582	btrfs_item_key_to_cpu(leaf, &found_key, slot);
				9583
				9584	if (found_key.objectid >= key->objectid &&
				9585	found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
				9586	struct extent_map_tree *em_tree;
				9587	struct extent_map *em;
				9588
				9589	em_tree = &root->fs_info->mapping_tree.map_tree;
				9590	read_lock(&em_tree->lock);
				9591	em = lookup_extent_mapping(em_tree, found_key.objectid,
				9592	found_key.offset);
				9593	read_unlock(&em_tree->lock);
				9594	if (!em) {
				9595	btrfs_err(fs_info,
				9596	"logical %llu len %llu found bg but no related chunk",
				9597	found_key.objectid, found_key.offset);
				9598	ret = -ENOENT;
				9599	} else if (em->start != found_key.objectid \|\|
				9600	em->len != found_key.offset) {
				9601	btrfs_err(fs_info,
				9602	"block group %llu len %llu mismatch with chunk %llu len %llu",
				9603	found_key.objectid, found_key.offset,
				9604	em->start, em->len);
				9605	ret = -EUCLEAN;
				9606	} else {
				9607	read_extent_buffer(leaf, &bg,
				9608	btrfs_item_ptr_offset(leaf, slot),
				9609	sizeof(bg));
				9610	flags = btrfs_block_group_flags(&bg) &
				9611	BTRFS_BLOCK_GROUP_TYPE_MASK;
				9612
				9613	if (flags != (em->map_lookup->type &
				9614	BTRFS_BLOCK_GROUP_TYPE_MASK)) {
				9615	btrfs_err(fs_info,
				9616	"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
				9617	found_key.objectid,
				9618	found_key.offset, flags,
				9619	(BTRFS_BLOCK_GROUP_TYPE_MASK &
				9620	em->map_lookup->type));
				9621	ret = -EUCLEAN;
				9622	} else {
				9623	ret = 0;
				9624	}
				9625	}
				9626	free_extent_map(em);
				9627	goto out;
				9628	}
				9629	path->slots[0]++;
				9630	}
				9631	out:
				9632	return ret;
				9633	}
				9634
				9635	void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
				9636	{
				9637	struct btrfs_block_group_cache *block_group;
				9638	u64 last = 0;
				9639
				9640	while (1) {
				9641	struct inode *inode;
				9642
				9643	block_group = btrfs_lookup_first_block_group(info, last);
				9644	while (block_group) {
				9645	wait_block_group_cache_done(block_group);
				9646	spin_lock(&block_group->lock);
				9647	if (block_group->iref)
				9648	break;
				9649	spin_unlock(&block_group->lock);
				9650	block_group = next_block_group(info, block_group);
				9651	}
				9652	if (!block_group) {
				9653	if (last == 0)
				9654	break;
				9655	last = 0;
				9656	continue;
				9657	}
				9658
				9659	inode = block_group->inode;
				9660	block_group->iref = 0;
				9661	block_group->inode = NULL;
				9662	spin_unlock(&block_group->lock);
				9663	ASSERT(block_group->io_ctl.inode == NULL);
				9664	iput(inode);
				9665	last = block_group->key.objectid + block_group->key.offset;
				9666	btrfs_put_block_group(block_group);
				9667	}
				9668	}
				9669
				9670	/*
				9671	* Must be called only after stopping all workers, since we could have block
				9672	* group caching kthreads running, and therefore they could race with us if we
				9673	* freed the block groups before stopping them.
				9674	*/
				9675	int btrfs_free_block_groups(struct btrfs_fs_info *info)
				9676	{
				9677	struct btrfs_block_group_cache *block_group;
				9678	struct btrfs_space_info *space_info;
				9679	struct btrfs_caching_control *caching_ctl;
				9680	struct rb_node *n;
				9681
				9682	down_write(&info->commit_root_sem);
				9683	while (!list_empty(&info->caching_block_groups)) {
				9684	caching_ctl = list_entry(info->caching_block_groups.next,
				9685	struct btrfs_caching_control, list);
				9686	list_del(&caching_ctl->list);
				9687	put_caching_control(caching_ctl);
				9688	}
				9689	up_write(&info->commit_root_sem);
				9690
				9691	spin_lock(&info->unused_bgs_lock);
				9692	while (!list_empty(&info->unused_bgs)) {
				9693	block_group = list_first_entry(&info->unused_bgs,
				9694	struct btrfs_block_group_cache,
				9695	bg_list);
				9696	list_del_init(&block_group->bg_list);
				9697	btrfs_put_block_group(block_group);
				9698	}
				9699	spin_unlock(&info->unused_bgs_lock);
				9700
				9701	spin_lock(&info->block_group_cache_lock);
				9702	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
				9703	block_group = rb_entry(n, struct btrfs_block_group_cache,
				9704	cache_node);
				9705	rb_erase(&block_group->cache_node,
				9706	&info->block_group_cache_tree);
				9707	RB_CLEAR_NODE(&block_group->cache_node);
				9708	spin_unlock(&info->block_group_cache_lock);
				9709
				9710	down_write(&block_group->space_info->groups_sem);
				9711	list_del(&block_group->list);
				9712	up_write(&block_group->space_info->groups_sem);
				9713
				9714	/*
				9715	* We haven't cached this block group, which means we could
				9716	* possibly have excluded extents on this block group.
				9717	*/
				9718	if (block_group->cached == BTRFS_CACHE_NO \|\|
				9719	block_group->cached == BTRFS_CACHE_ERROR)
				9720	free_excluded_extents(block_group);
				9721
				9722	btrfs_remove_free_space_cache(block_group);
				9723	ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
				9724	ASSERT(list_empty(&block_group->dirty_list));
				9725	ASSERT(list_empty(&block_group->io_list));
				9726	ASSERT(list_empty(&block_group->bg_list));
				9727	ASSERT(atomic_read(&block_group->count) == 1);
				9728	btrfs_put_block_group(block_group);
				9729
				9730	spin_lock(&info->block_group_cache_lock);
				9731	}
				9732	spin_unlock(&info->block_group_cache_lock);
				9733
				9734	/* now that all the block groups are freed, go through and
				9735	* free all the space_info structs. This is only called during
				9736	* the final stages of unmount, and so we know nobody is
				9737	* using them. We call synchronize_rcu() once before we start,
				9738	* just to be on the safe side.
				9739	*/
				9740	synchronize_rcu();
				9741
				9742	release_global_block_rsv(info);
				9743
				9744	while (!list_empty(&info->space_info)) {
				9745	int i;
				9746
				9747	space_info = list_entry(info->space_info.next,
				9748	struct btrfs_space_info,
				9749	list);
				9750
				9751	/*
				9752	* Do not hide this behind enospc_debug, this is actually
				9753	* important and indicates a real bug if this happens.
				9754	*/
				9755	if (WARN_ON(space_info->bytes_pinned > 0 \|\|
				9756	space_info->bytes_reserved > 0 \|\|
				9757	space_info->bytes_may_use > 0))
				9758	dump_space_info(info, space_info, 0, 0);
				9759	list_del(&space_info->list);
				9760	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
				9761	struct kobject *kobj;
				9762	kobj = space_info->block_group_kobjs[i];
				9763	space_info->block_group_kobjs[i] = NULL;
				9764	if (kobj) {
				9765	kobject_del(kobj);
				9766	kobject_put(kobj);
				9767	}
				9768	}
				9769	kobject_del(&space_info->kobj);
				9770	kobject_put(&space_info->kobj);
				9771	}
				9772	return 0;
				9773	}
				9774
				9775	/* link_block_group will queue up kobjects to add when we're reclaim-safe */
				9776	void btrfs_add_raid_kobjects(struct btrfs_fs_info *fs_info)
				9777	{
				9778	struct btrfs_space_info *space_info;
				9779	struct raid_kobject *rkobj;
				9780	LIST_HEAD(list);
				9781	int index;
				9782	int ret = 0;
				9783
				9784	spin_lock(&fs_info->pending_raid_kobjs_lock);
				9785	list_splice_init(&fs_info->pending_raid_kobjs, &list);
				9786	spin_unlock(&fs_info->pending_raid_kobjs_lock);
				9787
				9788	list_for_each_entry(rkobj, &list, list) {
				9789	space_info = __find_space_info(fs_info, rkobj->flags);
				9790	index = btrfs_bg_flags_to_raid_index(rkobj->flags);
				9791
				9792	ret = kobject_add(&rkobj->kobj, &space_info->kobj,
				9793	"%s", get_raid_name(index));
				9794	if (ret) {
				9795	kobject_put(&rkobj->kobj);
				9796	break;
				9797	}
				9798	}
				9799	if (ret)
				9800	btrfs_warn(fs_info,
				9801	"failed to add kobject for block cache, ignoring");
				9802	}
				9803
				9804	static void link_block_group(struct btrfs_block_group_cache *cache)
				9805	{
				9806	struct btrfs_space_info *space_info = cache->space_info;
				9807	struct btrfs_fs_info *fs_info = cache->fs_info;
				9808	int index = btrfs_bg_flags_to_raid_index(cache->flags);
				9809	bool first = false;
				9810
				9811	down_write(&space_info->groups_sem);
				9812	if (list_empty(&space_info->block_groups[index]))
				9813	first = true;
				9814	list_add_tail(&cache->list, &space_info->block_groups[index]);
				9815	up_write(&space_info->groups_sem);
				9816
				9817	if (first) {
				9818	struct raid_kobject rkobj = kzalloc(sizeof(rkobj), GFP_NOFS);
				9819	if (!rkobj) {
				9820	btrfs_warn(cache->fs_info,
				9821	"couldn't alloc memory for raid level kobject");
				9822	return;
				9823	}
				9824	rkobj->flags = cache->flags;
				9825	kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
				9826
				9827	spin_lock(&fs_info->pending_raid_kobjs_lock);
				9828	list_add_tail(&rkobj->list, &fs_info->pending_raid_kobjs);
				9829	spin_unlock(&fs_info->pending_raid_kobjs_lock);
				9830	space_info->block_group_kobjs[index] = &rkobj->kobj;
				9831	}
				9832	}
				9833
				9834	static struct btrfs_block_group_cache *
				9835	btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
				9836	u64 start, u64 size)
				9837	{
				9838	struct btrfs_block_group_cache *cache;
				9839
				9840	cache = kzalloc(sizeof(*cache), GFP_NOFS);
				9841	if (!cache)
				9842	return NULL;
				9843
				9844	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
				9845	GFP_NOFS);
				9846	if (!cache->free_space_ctl) {
				9847	kfree(cache);
				9848	return NULL;
				9849	}
				9850
				9851	cache->key.objectid = start;
				9852	cache->key.offset = size;
				9853	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
				9854
				9855	cache->fs_info = fs_info;
				9856	cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
				9857	set_free_space_tree_thresholds(cache);
				9858
				9859	atomic_set(&cache->count, 1);
				9860	spin_lock_init(&cache->lock);
				9861	init_rwsem(&cache->data_rwsem);
				9862	INIT_LIST_HEAD(&cache->list);
				9863	INIT_LIST_HEAD(&cache->cluster_list);
				9864	INIT_LIST_HEAD(&cache->bg_list);
				9865	INIT_LIST_HEAD(&cache->ro_list);
				9866	INIT_LIST_HEAD(&cache->dirty_list);
				9867	INIT_LIST_HEAD(&cache->io_list);
				9868	btrfs_init_free_space_ctl(cache);
				9869	atomic_set(&cache->trimming, 0);
				9870	mutex_init(&cache->free_space_lock);
				9871	btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
				9872
				9873	return cache;
				9874	}
				9875
				9876
				9877	/*
				9878	* Iterate all chunks and verify that each of them has the corresponding block
				9879	* group
				9880	*/
				9881	static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
				9882	{
				9883	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
				9884	struct extent_map *em;
				9885	struct btrfs_block_group_cache *bg;
				9886	u64 start = 0;
				9887	int ret = 0;
				9888
				9889	while (1) {
				9890	read_lock(&map_tree->map_tree.lock);
				9891	/*
				9892	* lookup_extent_mapping will return the first extent map
				9893	* intersecting the range, so setting @len to 1 is enough to
				9894	* get the first chunk.
				9895	*/
				9896	em = lookup_extent_mapping(&map_tree->map_tree, start, 1);
				9897	read_unlock(&map_tree->map_tree.lock);
				9898	if (!em)
				9899	break;
				9900
				9901	bg = btrfs_lookup_block_group(fs_info, em->start);
				9902	if (!bg) {
				9903	btrfs_err(fs_info,
				9904	"chunk start=%llu len=%llu doesn't have corresponding block group",
				9905	em->start, em->len);
				9906	ret = -EUCLEAN;
				9907	free_extent_map(em);
				9908	break;
				9909	}
				9910	if (bg->key.objectid != em->start \|\|
				9911	bg->key.offset != em->len \|\|
				9912	(bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
				9913	(em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
				9914	btrfs_err(fs_info,
				9915	"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
				9916	em->start, em->len,
				9917	em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
				9918	bg->key.objectid, bg->key.offset,
				9919	bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
				9920	ret = -EUCLEAN;
				9921	free_extent_map(em);
				9922	btrfs_put_block_group(bg);
				9923	break;
				9924	}
				9925	start = em->start + em->len;
				9926	free_extent_map(em);
				9927	btrfs_put_block_group(bg);
				9928	}
				9929	return ret;
				9930	}
				9931
				9932	int btrfs_read_block_groups(struct btrfs_fs_info *info)
				9933	{
				9934	struct btrfs_path *path;
				9935	int ret;
				9936	struct btrfs_block_group_cache *cache;
				9937	struct btrfs_space_info *space_info;
				9938	struct btrfs_key key;
				9939	struct btrfs_key found_key;
				9940	struct extent_buffer *leaf;
				9941	int need_clear = 0;
				9942	u64 cache_gen;
				9943	u64 feature;
				9944	int mixed;
				9945
				9946	feature = btrfs_super_incompat_flags(info->super_copy);
				9947	mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS);
				9948
				9949	key.objectid = 0;
				9950	key.offset = 0;
				9951	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
				9952	path = btrfs_alloc_path();
				9953	if (!path)
				9954	return -ENOMEM;
				9955	path->reada = READA_FORWARD;
				9956
				9957	cache_gen = btrfs_super_cache_generation(info->super_copy);
				9958	if (btrfs_test_opt(info, SPACE_CACHE) &&
				9959	btrfs_super_generation(info->super_copy) != cache_gen)
				9960	need_clear = 1;
				9961	if (btrfs_test_opt(info, CLEAR_CACHE))
				9962	need_clear = 1;
				9963
				9964	while (1) {
				9965	ret = find_first_block_group(info, path, &key);
				9966	if (ret > 0)
				9967	break;
				9968	if (ret != 0)
				9969	goto error;
				9970
				9971	leaf = path->nodes[0];
				9972	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
				9973
				9974	cache = btrfs_create_block_group_cache(info, found_key.objectid,
				9975	found_key.offset);
				9976	if (!cache) {
				9977	ret = -ENOMEM;
				9978	goto error;
				9979	}
				9980
				9981	if (need_clear) {
				9982	/*
				9983	* When we mount with old space cache, we need to
				9984	* set BTRFS_DC_CLEAR and set dirty flag.
				9985	*
				9986	* a) Setting 'BTRFS_DC_CLEAR' makes sure that we
				9987	* truncate the old free space cache inode and
				9988	* setup a new one.
				9989	* b) Setting 'dirty flag' makes sure that we flush
				9990	* the new space cache info onto disk.
				9991	*/
				9992	if (btrfs_test_opt(info, SPACE_CACHE))
				9993	cache->disk_cache_state = BTRFS_DC_CLEAR;
				9994	}
				9995
				9996	read_extent_buffer(leaf, &cache->item,
				9997	btrfs_item_ptr_offset(leaf, path->slots[0]),
				9998	sizeof(cache->item));
				9999	cache->flags = btrfs_block_group_flags(&cache->item);
				10000	if (!mixed &&
				10001	((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
				10002	(cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
				10003	btrfs_err(info,
				10004	"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
				10005	cache->key.objectid);
				10006	btrfs_put_block_group(cache);
				10007	ret = -EINVAL;
				10008	goto error;
				10009	}
				10010
				10011	key.objectid = found_key.objectid + found_key.offset;
				10012	btrfs_release_path(path);
				10013
				10014	/*
				10015	* We need to exclude the super stripes now so that the space
				10016	* info has super bytes accounted for, otherwise we'll think
				10017	* we have more space than we actually do.
				10018	*/
				10019	ret = exclude_super_stripes(cache);
				10020	if (ret) {
				10021	/*
				10022	* We may have excluded something, so call this just in
				10023	* case.
				10024	*/
				10025	free_excluded_extents(cache);
				10026	btrfs_put_block_group(cache);
				10027	goto error;
				10028	}
				10029
				10030	/*
				10031	* check for two cases, either we are full, and therefore
				10032	* don't need to bother with the caching work since we won't
				10033	* find any space, or we are empty, and we can just add all
				10034	* the space in and be done with it. This saves us _alot_ of
				10035	* time, particularly in the full case.
				10036	*/
				10037	if (found_key.offset == btrfs_block_group_used(&cache->item)) {
				10038	cache->last_byte_to_unpin = (u64)-1;
				10039	cache->cached = BTRFS_CACHE_FINISHED;
				10040	free_excluded_extents(cache);
				10041	} else if (btrfs_block_group_used(&cache->item) == 0) {
				10042	cache->last_byte_to_unpin = (u64)-1;
				10043	cache->cached = BTRFS_CACHE_FINISHED;
				10044	add_new_free_space(cache, found_key.objectid,
				10045	found_key.objectid +
				10046	found_key.offset);
				10047	free_excluded_extents(cache);
				10048	}
				10049
				10050	ret = btrfs_add_block_group_cache(info, cache);
				10051	if (ret) {
				10052	btrfs_remove_free_space_cache(cache);
				10053	btrfs_put_block_group(cache);
				10054	goto error;
				10055	}
				10056
				10057	trace_btrfs_add_block_group(info, cache, 0);
				10058	update_space_info(info, cache->flags, found_key.offset,
				10059	btrfs_block_group_used(&cache->item),
				10060	cache->bytes_super, &space_info);
				10061
				10062	cache->space_info = space_info;
				10063
				10064	link_block_group(cache);
				10065
				10066	set_avail_alloc_bits(info, cache->flags);
				10067	if (btrfs_chunk_readonly(info, cache->key.objectid)) {
				10068	inc_block_group_ro(cache, 1);
				10069	} else if (btrfs_block_group_used(&cache->item) == 0) {
				10070	ASSERT(list_empty(&cache->bg_list));
				10071	btrfs_mark_bg_unused(cache);
				10072	}
				10073	}
				10074
				10075	list_for_each_entry_rcu(space_info, &info->space_info, list) {
				10076	if (!(get_alloc_profile(info, space_info->flags) &
				10077	(BTRFS_BLOCK_GROUP_RAID10 \|
				10078	BTRFS_BLOCK_GROUP_RAID1 \|
				10079	BTRFS_BLOCK_GROUP_RAID5 \|
				10080	BTRFS_BLOCK_GROUP_RAID6 \|
				10081	BTRFS_BLOCK_GROUP_DUP)))
				10082	continue;
				10083	/*
				10084	* avoid allocating from un-mirrored block group if there are
				10085	* mirrored block groups.
				10086	*/
				10087	list_for_each_entry(cache,
				10088	&space_info->block_groups[BTRFS_RAID_RAID0],
				10089	list)
				10090	inc_block_group_ro(cache, 1);
				10091	list_for_each_entry(cache,
				10092	&space_info->block_groups[BTRFS_RAID_SINGLE],
				10093	list)
				10094	inc_block_group_ro(cache, 1);
				10095	}
				10096
				10097	btrfs_add_raid_kobjects(info);
				10098	init_global_block_rsv(info);
				10099	ret = check_chunk_block_group_mappings(info);
				10100	error:
				10101	btrfs_free_path(path);
				10102	return ret;
				10103	}
				10104
				10105	void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
				10106	{
				10107	struct btrfs_fs_info *fs_info = trans->fs_info;
				10108	struct btrfs_block_group_cache *block_group;
				10109	struct btrfs_root *extent_root = fs_info->extent_root;
				10110	struct btrfs_block_group_item item;
				10111	struct btrfs_key key;
				10112	int ret = 0;
				10113
				10114	if (!trans->can_flush_pending_bgs)
				10115	return;
				10116
				10117	while (!list_empty(&trans->new_bgs)) {
				10118	block_group = list_first_entry(&trans->new_bgs,
				10119	struct btrfs_block_group_cache,
				10120	bg_list);
				10121	if (ret)
				10122	goto next;
				10123
				10124	spin_lock(&block_group->lock);
				10125	memcpy(&item, &block_group->item, sizeof(item));
				10126	memcpy(&key, &block_group->key, sizeof(key));
				10127	spin_unlock(&block_group->lock);
				10128
				10129	ret = btrfs_insert_item(trans, extent_root, &key, &item,
				10130	sizeof(item));
				10131	if (ret)
				10132	btrfs_abort_transaction(trans, ret);
				10133	ret = btrfs_finish_chunk_alloc(trans, key.objectid, key.offset);
				10134	if (ret)
				10135	btrfs_abort_transaction(trans, ret);
				10136	add_block_group_free_space(trans, block_group);
				10137	/* already aborted the transaction if it failed. */
				10138	next:
				10139	list_del_init(&block_group->bg_list);
				10140	}
				10141	btrfs_trans_release_chunk_metadata(trans);
				10142	}
				10143
				10144	int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
				10145	u64 type, u64 chunk_offset, u64 size)
				10146	{
				10147	struct btrfs_fs_info *fs_info = trans->fs_info;
				10148	struct btrfs_block_group_cache *cache;
				10149	int ret;
				10150
				10151	btrfs_set_log_full_commit(fs_info, trans);
				10152
				10153	cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size);
				10154	if (!cache)
				10155	return -ENOMEM;
				10156
				10157	btrfs_set_block_group_used(&cache->item, bytes_used);
				10158	btrfs_set_block_group_chunk_objectid(&cache->item,
				10159	BTRFS_FIRST_CHUNK_TREE_OBJECTID);
				10160	btrfs_set_block_group_flags(&cache->item, type);
				10161
				10162	cache->flags = type;
				10163	cache->last_byte_to_unpin = (u64)-1;
				10164	cache->cached = BTRFS_CACHE_FINISHED;
				10165	cache->needs_free_space = 1;
				10166	ret = exclude_super_stripes(cache);
				10167	if (ret) {
				10168	/*
				10169	* We may have excluded something, so call this just in
				10170	* case.
				10171	*/
				10172	free_excluded_extents(cache);
				10173	btrfs_put_block_group(cache);
				10174	return ret;
				10175	}
				10176
				10177	add_new_free_space(cache, chunk_offset, chunk_offset + size);
				10178
				10179	free_excluded_extents(cache);
				10180
				10181	#ifdef CONFIG_BTRFS_DEBUG
				10182	if (btrfs_should_fragment_free_space(cache)) {
				10183	u64 new_bytes_used = size - bytes_used;
				10184
				10185	bytes_used += new_bytes_used >> 1;
				10186	fragment_free_space(cache);
				10187	}
				10188	#endif
				10189	/*
				10190	* Ensure the corresponding space_info object is created and
				10191	* assigned to our block group. We want our bg to be added to the rbtree
				10192	* with its ->space_info set.
				10193	*/
				10194	cache->space_info = __find_space_info(fs_info, cache->flags);
				10195	ASSERT(cache->space_info);
				10196
				10197	ret = btrfs_add_block_group_cache(fs_info, cache);
				10198	if (ret) {
				10199	btrfs_remove_free_space_cache(cache);
				10200	btrfs_put_block_group(cache);
				10201	return ret;
				10202	}
				10203
				10204	/*
				10205	* Now that our block group has its ->space_info set and is inserted in
				10206	* the rbtree, update the space info's counters.
				10207	*/
				10208	trace_btrfs_add_block_group(fs_info, cache, 1);
				10209	update_space_info(fs_info, cache->flags, size, bytes_used,
				10210	cache->bytes_super, &cache->space_info);
				10211	update_global_block_rsv(fs_info);
				10212
				10213	link_block_group(cache);
				10214
				10215	list_add_tail(&cache->bg_list, &trans->new_bgs);
				10216
				10217	set_avail_alloc_bits(fs_info, type);
				10218	return 0;
				10219	}
				10220
				10221	static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
				10222	{
				10223	u64 extra_flags = chunk_to_extended(flags) &
				10224	BTRFS_EXTENDED_PROFILE_MASK;
				10225
				10226	write_seqlock(&fs_info->profiles_lock);
				10227	if (flags & BTRFS_BLOCK_GROUP_DATA)
				10228	fs_info->avail_data_alloc_bits &= ~extra_flags;
				10229	if (flags & BTRFS_BLOCK_GROUP_METADATA)
				10230	fs_info->avail_metadata_alloc_bits &= ~extra_flags;
				10231	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
				10232	fs_info->avail_system_alloc_bits &= ~extra_flags;
				10233	write_sequnlock(&fs_info->profiles_lock);
				10234	}
				10235
				10236	int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
				10237	u64 group_start, struct extent_map *em)
				10238	{
				10239	struct btrfs_fs_info *fs_info = trans->fs_info;
				10240	struct btrfs_root *root = fs_info->extent_root;
				10241	struct btrfs_path *path;
				10242	struct btrfs_block_group_cache *block_group;
				10243	struct btrfs_free_cluster *cluster;
				10244	struct btrfs_root *tree_root = fs_info->tree_root;
				10245	struct btrfs_key key;
				10246	struct inode *inode;
				10247	struct kobject *kobj = NULL;
				10248	int ret;
				10249	int index;
				10250	int factor;
				10251	struct btrfs_caching_control *caching_ctl = NULL;
				10252	bool remove_em;
				10253
				10254	block_group = btrfs_lookup_block_group(fs_info, group_start);
				10255	BUG_ON(!block_group);
				10256	BUG_ON(!block_group->ro);
				10257
				10258	trace_btrfs_remove_block_group(block_group);
				10259	/*
				10260	* Free the reserved super bytes from this block group before
				10261	* remove it.
				10262	*/
				10263	free_excluded_extents(block_group);
				10264	btrfs_free_ref_tree_range(fs_info, block_group->key.objectid,
				10265	block_group->key.offset);
				10266
				10267	memcpy(&key, &block_group->key, sizeof(key));
				10268	index = btrfs_bg_flags_to_raid_index(block_group->flags);
				10269	factor = btrfs_bg_type_to_factor(block_group->flags);
				10270
				10271	/* make sure this block group isn't part of an allocation cluster */
				10272	cluster = &fs_info->data_alloc_cluster;
				10273	spin_lock(&cluster->refill_lock);
				10274	btrfs_return_cluster_to_free_space(block_group, cluster);
				10275	spin_unlock(&cluster->refill_lock);
				10276
				10277	/*
				10278	* make sure this block group isn't part of a metadata
				10279	* allocation cluster
				10280	*/
				10281	cluster = &fs_info->meta_alloc_cluster;
				10282	spin_lock(&cluster->refill_lock);
				10283	btrfs_return_cluster_to_free_space(block_group, cluster);
				10284	spin_unlock(&cluster->refill_lock);
				10285
				10286	path = btrfs_alloc_path();
				10287	if (!path) {
				10288	ret = -ENOMEM;
				10289	goto out;
				10290	}
				10291
				10292	/*
				10293	* get the inode first so any iput calls done for the io_list
				10294	* aren't the final iput (no unlinks allowed now)
				10295	*/
				10296	inode = lookup_free_space_inode(fs_info, block_group, path);
				10297
				10298	mutex_lock(&trans->transaction->cache_write_mutex);
				10299	/*
				10300	* make sure our free spache cache IO is done before remove the
				10301	* free space inode
				10302	*/
				10303	spin_lock(&trans->transaction->dirty_bgs_lock);
				10304	if (!list_empty(&block_group->io_list)) {
				10305	list_del_init(&block_group->io_list);
				10306
				10307	WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
				10308
				10309	spin_unlock(&trans->transaction->dirty_bgs_lock);
				10310	btrfs_wait_cache_io(trans, block_group, path);
				10311	btrfs_put_block_group(block_group);
				10312	spin_lock(&trans->transaction->dirty_bgs_lock);
				10313	}
				10314
				10315	if (!list_empty(&block_group->dirty_list)) {
				10316	list_del_init(&block_group->dirty_list);
				10317	btrfs_put_block_group(block_group);
				10318	}
				10319	spin_unlock(&trans->transaction->dirty_bgs_lock);
				10320	mutex_unlock(&trans->transaction->cache_write_mutex);
				10321
				10322	if (!IS_ERR(inode)) {
				10323	ret = btrfs_orphan_add(trans, BTRFS_I(inode));
				10324	if (ret) {
				10325	btrfs_add_delayed_iput(inode);
				10326	goto out;
				10327	}
				10328	clear_nlink(inode);
				10329	/* One for the block groups ref */
				10330	spin_lock(&block_group->lock);
				10331	if (block_group->iref) {
				10332	block_group->iref = 0;
				10333	block_group->inode = NULL;
				10334	spin_unlock(&block_group->lock);
				10335	iput(inode);
				10336	} else {
				10337	spin_unlock(&block_group->lock);
				10338	}
				10339	/* One for our lookup ref */
				10340	btrfs_add_delayed_iput(inode);
				10341	}
				10342
				10343	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
				10344	key.offset = block_group->key.objectid;
				10345	key.type = 0;
				10346
				10347	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
				10348	if (ret < 0)
				10349	goto out;
				10350	if (ret > 0)
				10351	btrfs_release_path(path);
				10352	if (ret == 0) {
				10353	ret = btrfs_del_item(trans, tree_root, path);
				10354	if (ret)
				10355	goto out;
				10356	btrfs_release_path(path);
				10357	}
				10358
				10359	spin_lock(&fs_info->block_group_cache_lock);
				10360	rb_erase(&block_group->cache_node,
				10361	&fs_info->block_group_cache_tree);
				10362	RB_CLEAR_NODE(&block_group->cache_node);
				10363
				10364	if (fs_info->first_logical_byte == block_group->key.objectid)
				10365	fs_info->first_logical_byte = (u64)-1;
				10366	spin_unlock(&fs_info->block_group_cache_lock);
				10367
				10368	down_write(&block_group->space_info->groups_sem);
				10369	/*
				10370	* we must use list_del_init so people can check to see if they
				10371	* are still on the list after taking the semaphore
				10372	*/
				10373	list_del_init(&block_group->list);
				10374	if (list_empty(&block_group->space_info->block_groups[index])) {
				10375	kobj = block_group->space_info->block_group_kobjs[index];
				10376	block_group->space_info->block_group_kobjs[index] = NULL;
				10377	clear_avail_alloc_bits(fs_info, block_group->flags);
				10378	}
				10379	up_write(&block_group->space_info->groups_sem);
				10380	if (kobj) {
				10381	kobject_del(kobj);
				10382	kobject_put(kobj);
				10383	}
				10384
				10385	if (block_group->has_caching_ctl)
				10386	caching_ctl = get_caching_control(block_group);
				10387	if (block_group->cached == BTRFS_CACHE_STARTED)
				10388	wait_block_group_cache_done(block_group);
				10389	if (block_group->has_caching_ctl) {
				10390	down_write(&fs_info->commit_root_sem);
				10391	if (!caching_ctl) {
				10392	struct btrfs_caching_control *ctl;
				10393
				10394	list_for_each_entry(ctl,
				10395	&fs_info->caching_block_groups, list)
				10396	if (ctl->block_group == block_group) {
				10397	caching_ctl = ctl;
				10398	refcount_inc(&caching_ctl->count);
				10399	break;
				10400	}
				10401	}
				10402	if (caching_ctl)
				10403	list_del_init(&caching_ctl->list);
				10404	up_write(&fs_info->commit_root_sem);
				10405	if (caching_ctl) {
				10406	/* Once for the caching bgs list and once for us. */
				10407	put_caching_control(caching_ctl);
				10408	put_caching_control(caching_ctl);
				10409	}
				10410	}
				10411
				10412	spin_lock(&trans->transaction->dirty_bgs_lock);
				10413	if (!list_empty(&block_group->dirty_list)) {
				10414	WARN_ON(1);
				10415	}
				10416	if (!list_empty(&block_group->io_list)) {
				10417	WARN_ON(1);
				10418	}
				10419	spin_unlock(&trans->transaction->dirty_bgs_lock);
				10420	btrfs_remove_free_space_cache(block_group);
				10421
				10422	spin_lock(&block_group->space_info->lock);
				10423	list_del_init(&block_group->ro_list);
				10424
				10425	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
				10426	WARN_ON(block_group->space_info->total_bytes
				10427	< block_group->key.offset);
				10428	WARN_ON(block_group->space_info->bytes_readonly
				10429	< block_group->key.offset);
				10430	WARN_ON(block_group->space_info->disk_total
				10431	< block_group->key.offset * factor);
				10432	}
				10433	block_group->space_info->total_bytes -= block_group->key.offset;
				10434	block_group->space_info->bytes_readonly -= block_group->key.offset;
				10435	block_group->space_info->disk_total -= block_group->key.offset * factor;
				10436
				10437	spin_unlock(&block_group->space_info->lock);
				10438
				10439	memcpy(&key, &block_group->key, sizeof(key));
				10440
				10441	mutex_lock(&fs_info->chunk_mutex);
				10442	if (!list_empty(&em->list)) {
				10443	/* We're in the transaction->pending_chunks list. */
				10444	free_extent_map(em);
				10445	}
				10446	spin_lock(&block_group->lock);
				10447	block_group->removed = 1;
				10448	/*
				10449	* At this point trimming can't start on this block group, because we
				10450	* removed the block group from the tree fs_info->block_group_cache_tree
				10451	* so no one can't find it anymore and even if someone already got this
				10452	* block group before we removed it from the rbtree, they have already
				10453	* incremented block_group->trimming - if they didn't, they won't find
				10454	* any free space entries because we already removed them all when we
				10455	* called btrfs_remove_free_space_cache().
				10456	*
				10457	* And we must not remove the extent map from the fs_info->mapping_tree
				10458	* to prevent the same logical address range and physical device space
				10459	* ranges from being reused for a new block group. This is because our
				10460	* fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
				10461	* completely transactionless, so while it is trimming a range the
				10462	* currently running transaction might finish and a new one start,
				10463	* allowing for new block groups to be created that can reuse the same
				10464	* physical device locations unless we take this special care.
				10465	*
				10466	* There may also be an implicit trim operation if the file system
				10467	* is mounted with -odiscard. The same protections must remain
				10468	* in place until the extents have been discarded completely when
				10469	* the transaction commit has completed.
				10470	*/
				10471	remove_em = (atomic_read(&block_group->trimming) == 0);
				10472	/*
				10473	* Make sure a trimmer task always sees the em in the pinned_chunks list
				10474	* if it sees block_group->removed == 1 (needs to lock block_group->lock
				10475	* before checking block_group->removed).
				10476	*/
				10477	if (!remove_em) {
				10478	/*
				10479	* Our em might be in trans->transaction->pending_chunks which
				10480	* is protected by fs_info->chunk_mutex ([lock\|unlock]_chunks),
				10481	* and so is the fs_info->pinned_chunks list.
				10482	*
				10483	* So at this point we must be holding the chunk_mutex to avoid
				10484	* any races with chunk allocation (more specifically at
				10485	* volumes.c:contains_pending_extent()), to ensure it always
				10486	* sees the em, either in the pending_chunks list or in the
				10487	* pinned_chunks list.
				10488	*/
				10489	list_move_tail(&em->list, &fs_info->pinned_chunks);
				10490	}
				10491	spin_unlock(&block_group->lock);
				10492
				10493	mutex_unlock(&fs_info->chunk_mutex);
				10494
				10495	ret = remove_block_group_free_space(trans, block_group);
				10496	if (ret)
				10497	goto out;
				10498
				10499	btrfs_put_block_group(block_group);
				10500	btrfs_put_block_group(block_group);
				10501
				10502	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				10503	if (ret > 0)
				10504	ret = -EIO;
				10505	if (ret < 0)
				10506	goto out;
				10507
				10508	ret = btrfs_del_item(trans, root, path);
				10509	if (ret)
				10510	goto out;
				10511
				10512	if (remove_em) {
				10513	struct extent_map_tree *em_tree;
				10514
				10515	em_tree = &fs_info->mapping_tree.map_tree;
				10516	write_lock(&em_tree->lock);
				10517	/*
				10518	* The em might be in the pending_chunks list, so make sure the
				10519	* chunk mutex is locked, since remove_extent_mapping() will
				10520	* delete us from that list.
				10521	*/
				10522	remove_extent_mapping(em_tree, em);
				10523	write_unlock(&em_tree->lock);
				10524	/* once for the tree */
				10525	free_extent_map(em);
				10526	}
				10527	out:
				10528	btrfs_free_path(path);
				10529	return ret;
				10530	}
				10531
				10532	struct btrfs_trans_handle *
				10533	btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
				10534	const u64 chunk_offset)
				10535	{
				10536	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
				10537	struct extent_map *em;
				10538	struct map_lookup *map;
				10539	unsigned int num_items;
				10540
				10541	read_lock(&em_tree->lock);
				10542	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
				10543	read_unlock(&em_tree->lock);
				10544	ASSERT(em && em->start == chunk_offset);
				10545
				10546	/*
				10547	* We need to reserve 3 + N units from the metadata space info in order
				10548	* to remove a block group (done at btrfs_remove_chunk() and at
				10549	* btrfs_remove_block_group()), which are used for:
				10550	*
				10551	* 1 unit for adding the free space inode's orphan (located in the tree
				10552	* of tree roots).
				10553	* 1 unit for deleting the block group item (located in the extent
				10554	* tree).
				10555	* 1 unit for deleting the free space item (located in tree of tree
				10556	* roots).
				10557	* N units for deleting N device extent items corresponding to each
				10558	* stripe (located in the device tree).
				10559	*
				10560	* In order to remove a block group we also need to reserve units in the
				10561	* system space info in order to update the chunk tree (update one or
				10562	* more device items and remove one chunk item), but this is done at
				10563	* btrfs_remove_chunk() through a call to check_system_chunk().
				10564	*/
				10565	map = em->map_lookup;
				10566	num_items = 3 + map->num_stripes;
				10567	free_extent_map(em);
				10568
				10569	return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
				10570	num_items, 1);
				10571	}
				10572
				10573	/*
				10574	* Process the unused_bgs list and remove any that don't have any allocated
				10575	* space inside of them.
				10576	*/
				10577	void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
				10578	{
				10579	struct btrfs_block_group_cache *block_group;
				10580	struct btrfs_space_info *space_info;
				10581	struct btrfs_trans_handle *trans;
				10582	int ret = 0;
				10583
				10584	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
				10585	return;
				10586
				10587	spin_lock(&fs_info->unused_bgs_lock);
				10588	while (!list_empty(&fs_info->unused_bgs)) {
				10589	u64 start, end;
				10590	int trimming;
				10591
				10592	block_group = list_first_entry(&fs_info->unused_bgs,
				10593	struct btrfs_block_group_cache,
				10594	bg_list);
				10595	list_del_init(&block_group->bg_list);
				10596
				10597	space_info = block_group->space_info;
				10598
				10599	if (ret \|\| btrfs_mixed_space_info(space_info)) {
				10600	btrfs_put_block_group(block_group);
				10601	continue;
				10602	}
				10603	spin_unlock(&fs_info->unused_bgs_lock);
				10604
				10605	mutex_lock(&fs_info->delete_unused_bgs_mutex);
				10606
				10607	/* Don't want to race with allocators so take the groups_sem */
				10608	down_write(&space_info->groups_sem);
				10609	spin_lock(&block_group->lock);
				10610	if (block_group->reserved \|\| block_group->pinned \|\|
				10611	btrfs_block_group_used(&block_group->item) \|\|
				10612	block_group->ro \|\|
				10613	list_is_singular(&block_group->list)) {
				10614	/*
				10615	* We want to bail if we made new allocations or have
				10616	* outstanding allocations in this block group. We do
				10617	* the ro check in case balance is currently acting on
				10618	* this block group.
				10619	*/
				10620	trace_btrfs_skip_unused_block_group(block_group);
				10621	spin_unlock(&block_group->lock);
				10622	up_write(&space_info->groups_sem);
				10623	goto next;
				10624	}
				10625	spin_unlock(&block_group->lock);
				10626
				10627	/* We don't want to force the issue, only flip if it's ok. */
				10628	ret = inc_block_group_ro(block_group, 0);
				10629	up_write(&space_info->groups_sem);
				10630	if (ret < 0) {
				10631	ret = 0;
				10632	goto next;
				10633	}
				10634
				10635	/*
				10636	* Want to do this before we do anything else so we can recover
				10637	* properly if we fail to join the transaction.
				10638	*/
				10639	trans = btrfs_start_trans_remove_block_group(fs_info,
				10640	block_group->key.objectid);
				10641	if (IS_ERR(trans)) {
				10642	btrfs_dec_block_group_ro(block_group);
				10643	ret = PTR_ERR(trans);
				10644	goto next;
				10645	}
				10646
				10647	/*
				10648	* We could have pending pinned extents for this block group,
				10649	* just delete them, we don't care about them anymore.
				10650	*/
				10651	start = block_group->key.objectid;
				10652	end = start + block_group->key.offset - 1;
				10653	/*
				10654	* Hold the unused_bg_unpin_mutex lock to avoid racing with
				10655	* btrfs_finish_extent_commit(). If we are at transaction N,
				10656	* another task might be running finish_extent_commit() for the
				10657	* previous transaction N - 1, and have seen a range belonging
				10658	* to the block group in freed_extents[] before we were able to
				10659	* clear the whole block group range from freed_extents[]. This
				10660	* means that task can lookup for the block group after we
				10661	* unpinned it from freed_extents[] and removed it, leading to
				10662	* a BUG_ON() at btrfs_unpin_extent_range().
				10663	*/
				10664	mutex_lock(&fs_info->unused_bg_unpin_mutex);
				10665	ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
				10666	EXTENT_DIRTY);
				10667	if (ret) {
				10668	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
				10669	btrfs_dec_block_group_ro(block_group);
				10670	goto end_trans;
				10671	}
				10672	ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
				10673	EXTENT_DIRTY);
				10674	if (ret) {
				10675	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
				10676	btrfs_dec_block_group_ro(block_group);
				10677	goto end_trans;
				10678	}
				10679	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
				10680
				10681	/* Reset pinned so btrfs_put_block_group doesn't complain */
				10682	spin_lock(&space_info->lock);
				10683	spin_lock(&block_group->lock);
				10684
				10685	space_info->bytes_pinned -= block_group->pinned;
				10686	space_info->bytes_readonly += block_group->pinned;
				10687	percpu_counter_add_batch(&space_info->total_bytes_pinned,
				10688	-block_group->pinned,
				10689	BTRFS_TOTAL_BYTES_PINNED_BATCH);
				10690	block_group->pinned = 0;
				10691
				10692	spin_unlock(&block_group->lock);
				10693	spin_unlock(&space_info->lock);
				10694
				10695	/* DISCARD can flip during remount */
				10696	trimming = btrfs_test_opt(fs_info, DISCARD);
				10697
				10698	/* Implicit trim during transaction commit. */
				10699	if (trimming)
				10700	btrfs_get_block_group_trimming(block_group);
				10701
				10702	/*
				10703	* Btrfs_remove_chunk will abort the transaction if things go
				10704	* horribly wrong.
				10705	*/
				10706	ret = btrfs_remove_chunk(trans, block_group->key.objectid);
				10707
				10708	if (ret) {
				10709	if (trimming)
				10710	btrfs_put_block_group_trimming(block_group);
				10711	goto end_trans;
				10712	}
				10713
				10714	/*
				10715	* If we're not mounted with -odiscard, we can just forget
				10716	* about this block group. Otherwise we'll need to wait
				10717	* until transaction commit to do the actual discard.
				10718	*/
				10719	if (trimming) {
				10720	spin_lock(&fs_info->unused_bgs_lock);
				10721	/*
				10722	* A concurrent scrub might have added us to the list
				10723	* fs_info->unused_bgs, so use a list_move operation
				10724	* to add the block group to the deleted_bgs list.
				10725	*/
				10726	list_move(&block_group->bg_list,
				10727	&trans->transaction->deleted_bgs);
				10728	spin_unlock(&fs_info->unused_bgs_lock);
				10729	btrfs_get_block_group(block_group);
				10730	}
				10731	end_trans:
				10732	btrfs_end_transaction(trans);
				10733	next:
				10734	mutex_unlock(&fs_info->delete_unused_bgs_mutex);
				10735	btrfs_put_block_group(block_group);
				10736	spin_lock(&fs_info->unused_bgs_lock);
				10737	}
				10738	spin_unlock(&fs_info->unused_bgs_lock);
				10739	}
				10740
				10741	int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
				10742	{
				10743	struct btrfs_super_block *disk_super;
				10744	u64 features;
				10745	u64 flags;
				10746	int mixed = 0;
				10747	int ret;
				10748
				10749	disk_super = fs_info->super_copy;
				10750	if (!btrfs_super_root(disk_super))
				10751	return -EINVAL;
				10752
				10753	features = btrfs_super_incompat_flags(disk_super);
				10754	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
				10755	mixed = 1;
				10756
				10757	flags = BTRFS_BLOCK_GROUP_SYSTEM;
				10758	ret = create_space_info(fs_info, flags);
				10759	if (ret)
				10760	goto out;
				10761
				10762	if (mixed) {
				10763	flags = BTRFS_BLOCK_GROUP_METADATA \| BTRFS_BLOCK_GROUP_DATA;
				10764	ret = create_space_info(fs_info, flags);
				10765	} else {
				10766	flags = BTRFS_BLOCK_GROUP_METADATA;
				10767	ret = create_space_info(fs_info, flags);
				10768	if (ret)
				10769	goto out;
				10770
				10771	flags = BTRFS_BLOCK_GROUP_DATA;
				10772	ret = create_space_info(fs_info, flags);
				10773	}
				10774	out:
				10775	return ret;
				10776	}
				10777
				10778	int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
				10779	u64 start, u64 end)
				10780	{
				10781	return unpin_extent_range(fs_info, start, end, false);
				10782	}
				10783
				10784	/*
				10785	* It used to be that old block groups would be left around forever.
				10786	* Iterating over them would be enough to trim unused space. Since we
				10787	* now automatically remove them, we also need to iterate over unallocated
				10788	* space.
				10789	*
				10790	* We don't want a transaction for this since the discard may take a
				10791	* substantial amount of time. We don't require that a transaction be
				10792	* running, but we do need to take a running transaction into account
				10793	* to ensure that we're not discarding chunks that were released or
				10794	* allocated in the current transaction.
				10795	*
				10796	* Holding the chunks lock will prevent other threads from allocating
				10797	* or releasing chunks, but it won't prevent a running transaction
				10798	* from committing and releasing the memory that the pending chunks
				10799	* list head uses. For that, we need to take a reference to the
				10800	* transaction and hold the commit root sem. We only need to hold
				10801	* it while performing the free space search since we have already
				10802	* held back allocations.
				10803	*/
				10804	static int btrfs_trim_free_extents(struct btrfs_device *device,
				10805	u64 minlen, u64 *trimmed)
				10806	{
				10807	u64 start = 0, len = 0;
				10808	int ret;
				10809
				10810	*trimmed = 0;
				10811
				10812	/* Discard not supported = nothing to do. */
				10813	if (!blk_queue_discard(bdev_get_queue(device->bdev)))
				10814	return 0;
				10815
				10816	/* Not writeable = nothing to do. */
				10817	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
				10818	return 0;
				10819
				10820	/* No free space = nothing to do. */
				10821	if (device->total_bytes <= device->bytes_used)
				10822	return 0;
				10823
				10824	ret = 0;
				10825
				10826	while (1) {
				10827	struct btrfs_fs_info *fs_info = device->fs_info;
				10828	struct btrfs_transaction *trans;
				10829	u64 bytes;
				10830
				10831	ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
				10832	if (ret)
				10833	break;
				10834
				10835	ret = down_read_killable(&fs_info->commit_root_sem);
				10836	if (ret) {
				10837	mutex_unlock(&fs_info->chunk_mutex);
				10838	break;
				10839	}
				10840
				10841	spin_lock(&fs_info->trans_lock);
				10842	trans = fs_info->running_transaction;
				10843	if (trans)
				10844	refcount_inc(&trans->use_count);
				10845	spin_unlock(&fs_info->trans_lock);
				10846
				10847	if (!trans)
				10848	up_read(&fs_info->commit_root_sem);
				10849
				10850	ret = find_free_dev_extent_start(trans, device, minlen, start,
				10851	&start, &len);
				10852	if (trans) {
				10853	up_read(&fs_info->commit_root_sem);
				10854	btrfs_put_transaction(trans);
				10855	}
				10856
				10857	if (ret) {
				10858	mutex_unlock(&fs_info->chunk_mutex);
				10859	if (ret == -ENOSPC)
				10860	ret = 0;
				10861	break;
				10862	}
				10863
				10864	ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
				10865	mutex_unlock(&fs_info->chunk_mutex);
				10866
				10867	if (ret)
				10868	break;
				10869
				10870	start += len;
				10871	*trimmed += bytes;
				10872
				10873	if (fatal_signal_pending(current)) {
				10874	ret = -ERESTARTSYS;
				10875	break;
				10876	}
				10877
				10878	cond_resched();
				10879	}
				10880
				10881	return ret;
				10882	}
				10883
				10884	/*
				10885	* Trim the whole filesystem by:
				10886	* 1) trimming the free space in each block group
				10887	* 2) trimming the unallocated space on each device
				10888	*
				10889	* This will also continue trimming even if a block group or device encounters
				10890	* an error. The return value will be the last error, or 0 if nothing bad
				10891	* happens.
				10892	*/
				10893	int btrfs_trim_fs(struct btrfs_fs_info fs_info, struct fstrim_range range)
				10894	{
				10895	struct btrfs_block_group_cache *cache = NULL;
				10896	struct btrfs_device *device;
				10897	struct list_head *devices;
				10898	u64 group_trimmed;
				10899	u64 start;
				10900	u64 end;
				10901	u64 trimmed = 0;
				10902	u64 bg_failed = 0;
				10903	u64 dev_failed = 0;
				10904	int bg_ret = 0;
				10905	int dev_ret = 0;
				10906	int ret = 0;
				10907
				10908	cache = btrfs_lookup_first_block_group(fs_info, range->start);
				10909	for (; cache; cache = next_block_group(fs_info, cache)) {
				10910	if (cache->key.objectid >= (range->start + range->len)) {
				10911	btrfs_put_block_group(cache);
				10912	break;
				10913	}
				10914
				10915	start = max(range->start, cache->key.objectid);
				10916	end = min(range->start + range->len,
				10917	cache->key.objectid + cache->key.offset);
				10918
				10919	if (end - start >= range->minlen) {
				10920	if (!block_group_cache_done(cache)) {
				10921	ret = cache_block_group(cache, 0);
				10922	if (ret) {
				10923	bg_failed++;
				10924	bg_ret = ret;
				10925	continue;
				10926	}
				10927	ret = wait_block_group_cache_done(cache);
				10928	if (ret) {
				10929	bg_failed++;
				10930	bg_ret = ret;
				10931	continue;
				10932	}
				10933	}
				10934	ret = btrfs_trim_block_group(cache,
				10935	&group_trimmed,
				10936	start,
				10937	end,
				10938	range->minlen);
				10939
				10940	trimmed += group_trimmed;
				10941	if (ret) {
				10942	bg_failed++;
				10943	bg_ret = ret;
				10944	continue;
				10945	}
				10946	}
				10947	}
				10948
				10949	if (bg_failed)
				10950	btrfs_warn(fs_info,
				10951	"failed to trim %llu block group(s), last error %d",
				10952	bg_failed, bg_ret);
				10953	mutex_lock(&fs_info->fs_devices->device_list_mutex);
				10954	devices = &fs_info->fs_devices->devices;
				10955	list_for_each_entry(device, devices, dev_list) {
				10956	ret = btrfs_trim_free_extents(device, range->minlen,
				10957	&group_trimmed);
				10958	if (ret) {
				10959	dev_failed++;
				10960	dev_ret = ret;
				10961	break;
				10962	}
				10963
				10964	trimmed += group_trimmed;
				10965	}
				10966	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				10967
				10968	if (dev_failed)
				10969	btrfs_warn(fs_info,
				10970	"failed to trim %llu device(s), last error %d",
				10971	dev_failed, dev_ret);
				10972	range->len = trimmed;
				10973	if (bg_ret)
				10974	return bg_ret;
				10975	return dev_ret;
				10976	}
				10977
				10978	/*
				10979	* btrfs_{start,end}_write_no_snapshotting() are similar to
				10980	* mnt_{want,drop}_write(), they are used to prevent some tasks from writing
				10981	* data into the page cache through nocow before the subvolume is snapshoted,
				10982	* but flush the data into disk after the snapshot creation, or to prevent
				10983	* operations while snapshotting is ongoing and that cause the snapshot to be
				10984	* inconsistent (writes followed by expanding truncates for example).
				10985	*/
				10986	void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
				10987	{
				10988	percpu_counter_dec(&root->subv_writers->counter);
				10989	cond_wake_up(&root->subv_writers->wait);
				10990	}
				10991
				10992	int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
				10993	{
				10994	if (atomic_read(&root->will_be_snapshotted))
				10995	return 0;
				10996
				10997	percpu_counter_inc(&root->subv_writers->counter);
				10998	/*
				10999	* Make sure counter is updated before we check for snapshot creation.
				11000	*/
				11001	smp_mb();
				11002	if (atomic_read(&root->will_be_snapshotted)) {
				11003	btrfs_end_write_no_snapshotting(root);
				11004	return 0;
				11005	}
				11006	return 1;
				11007	}
				11008
				11009	void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
				11010	{
				11011	while (true) {
				11012	int ret;
				11013
				11014	ret = btrfs_start_write_no_snapshotting(root);
				11015	if (ret)
				11016	break;
				11017	wait_var_event(&root->will_be_snapshotted,
				11018	!atomic_read(&root->will_be_snapshotted));
				11019	}
				11020	}
				11021
				11022	void btrfs_mark_bg_unused(struct btrfs_block_group_cache *bg)
				11023	{
				11024	struct btrfs_fs_info *fs_info = bg->fs_info;
				11025
				11026	spin_lock(&fs_info->unused_bgs_lock);
				11027	if (list_empty(&bg->bg_list)) {
				11028	btrfs_get_block_group(bg);
				11029	trace_btrfs_add_unused_block_group(bg);
				11030	list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
				11031	}
				11032	spin_unlock(&fs_info->unused_bgs_lock);
				11033	}