Blame - src/kernel/linux/v4.14/fs/btrfs/extent_io.c - T103

blob: 0ba338cffa937407ddec9395c63e6b0e6296b098 [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	// SPDX-License-Identifier: GPL-2.0
				2	#include <linux/bitops.h>
				3	#include <linux/slab.h>
				4	#include <linux/bio.h>
				5	#include <linux/mm.h>
				6	#include <linux/pagemap.h>
				7	#include <linux/page-flags.h>
				8	#include <linux/spinlock.h>
				9	#include <linux/blkdev.h>
				10	#include <linux/swap.h>
				11	#include <linux/writeback.h>
				12	#include <linux/pagevec.h>
				13	#include <linux/prefetch.h>
				14	#include <linux/cleancache.h>
				15	#include "extent_io.h"
				16	#include "extent_map.h"
				17	#include "ctree.h"
				18	#include "btrfs_inode.h"
				19	#include "volumes.h"
				20	#include "check-integrity.h"
				21	#include "locking.h"
				22	#include "rcu-string.h"
				23	#include "backref.h"
				24
				/*
				 * Slab caches used for extent_state and extent_buffer objects, and the
				 * bio set. All three are created in extent_io_init() and destroyed in
				 * extent_io_exit().
				 */
				25	static struct kmem_cache *extent_state_cache;
				26	static struct kmem_cache *extent_buffer_cache;
				27	static struct bio_set *btrfs_bioset;
				28
				29	static inline bool extent_state_in_tree(const struct extent_state *state)
				30	{
				31	return !RB_EMPTY_NODE(&state->rb_node);
				32	}
				33
				34	#ifdef CONFIG_BTRFS_DEBUG
				/*
				 * Debug-only lists used for leak reporting: btrfs_leak_debug_check()
				 * walks them expecting extent_buffer ("buffers") and extent_state
				 * ("states") objects linked through their leak_list member.
				 */
				35	static LIST_HEAD(buffers);
				36	static LIST_HEAD(states);
				37
				/* Taken by btrfs_leak_debug_add()/btrfs_leak_debug_del() around list updates. */
				38	static DEFINE_SPINLOCK(leak_lock);
				39
				40	static inline
				41	void btrfs_leak_debug_add(struct list_head new, struct list_head head)
				42	{
				43	unsigned long flags;
				44
				45	spin_lock_irqsave(&leak_lock, flags);
				46	list_add(new, head);
				47	spin_unlock_irqrestore(&leak_lock, flags);
				48	}
				49
				50	static inline
				51	void btrfs_leak_debug_del(struct list_head *entry)
				52	{
				53	unsigned long flags;
				54
				55	spin_lock_irqsave(&leak_lock, flags);
				56	list_del(entry);
				57	spin_unlock_irqrestore(&leak_lock, flags);
				58	}
				59
				60	static inline
				61	void btrfs_leak_debug_check(void)
				62	{
				63	struct extent_state *state;
				64	struct extent_buffer *eb;
				65
				66	while (!list_empty(&states)) {
				67	state = list_entry(states.next, struct extent_state, leak_list);
				68	pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
				69	state->start, state->end, state->state,
				70	extent_state_in_tree(state),
				71	refcount_read(&state->refs));
				72	list_del(&state->leak_list);
				73	kmem_cache_free(extent_state_cache, state);
				74	}
				75
				76	while (!list_empty(&buffers)) {
				77	eb = list_entry(buffers.next, struct extent_buffer, leak_list);
				78	pr_err("BTRFS: buffer leak start %llu len %lu refs %d\n",
				79	eb->start, eb->len, atomic_read(&eb->refs));
				80	list_del(&eb->leak_list);
				81	kmem_cache_free(extent_buffer_cache, eb);
				82	}
				83	}
				84
				85	#define btrfs_debug_check_extent_io_range(tree, start, end) \
				86	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
				87	static inline void __btrfs_debug_check_extent_io_range(const char *caller,
				88	struct extent_io_tree *tree, u64 start, u64 end)
				89	{
				90	if (tree->ops && tree->ops->check_extent_io_range)
				91	tree->ops->check_extent_io_range(tree->private_data, caller,
				92	start, end);
				93	}
				94	#else
				/*
				 * No-op versions of the leak-tracking and range-check helpers; each
				 * expands to an empty statement so call sites need no conditionals.
				 */
				95	#define btrfs_leak_debug_add(new, head) do {} while (0)
				96	#define btrfs_leak_debug_del(entry) do {} while (0)
				97	#define btrfs_leak_debug_check() do {} while (0)
				98	#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
				99	#endif
				100
				/* NOTE(review): no user of this constant appears in the visible part of the file — confirm it is still needed. */
				101	#define BUFFER_LRU_MAX 64
				102
				/*
				 * Minimal view of an rb-tree element keyed by the inclusive byte range
				 * [start, end]. tree_insert() and __etree_search() reach nodes through
				 * this type via rb_entry() in order to compare offsets.
				 * NOTE(review): presumably struct extent_state starts with the same
				 * start/end/rb_node layout so it can be viewed this way — confirm in
				 * extent_io.h.
				 */
				103	struct tree_entry {
				104	u64 start;
				105	u64 end;
				106	struct rb_node rb_node;
				107	};
				108
				/*
				 * Bundle of a bio, an io tree, a get_extent callback and bio flags, plus
				 * two one-bit switches.
				 * NOTE(review): presumably the context handed to the writepage helpers
				 * (cf. flush_write_bio(void *data)); the users are outside the visible
				 * part of this file — confirm.
				 */
				109	struct extent_page_data {
				110	struct bio *bio;
				111	struct extent_io_tree *tree;
				112	get_extent_t *get_extent;
				113	unsigned long bio_flags;
				114
				115	/* tells writepage not to lock the state bits for this range
				116	* it still does the unlocking
				117	*/
				118	unsigned int extent_locked:1;
				119
				120	/* tells the submit_bio code to use REQ_SYNC */
				121	unsigned int sync_io:1;
				122	};
				123
				124	static void add_extent_changeset(struct extent_state *state, unsigned bits,
				125	struct extent_changeset *changeset,
				126	int set)
				127	{
				128	int ret;
				129
				130	if (!changeset)
				131	return;
				132	if (set && (state->state & bits) == bits)
				133	return;
				134	if (!set && (state->state & bits) == 0)
				135	return;
				136	changeset->bytes_changed += state->end - state->start + 1;
				137	ret = ulist_add(&changeset->range_changed, state->start, state->end,
				138	GFP_ATOMIC);
				139	/* ENOMEM */
				140	BUG_ON(ret < 0);
				141	}
				142
				/* Forward declaration; the definition is not in the visible part of this file. */
				143	static noinline void flush_write_bio(void *data);
				144	static inline struct btrfs_fs_info *
				145	tree_fs_info(struct extent_io_tree *tree)
				146	{
				147	if (tree->ops)
				148	return tree->ops->tree_fs_info(tree->private_data);
				149	return NULL;
				150	}
				151
				152	int __init extent_io_init(void)
				153	{
				154	extent_state_cache = kmem_cache_create("btrfs_extent_state",
				155	sizeof(struct extent_state), 0,
				156	SLAB_MEM_SPREAD, NULL);
				157	if (!extent_state_cache)
				158	return -ENOMEM;
				159
				160	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
				161	sizeof(struct extent_buffer), 0,
				162	SLAB_MEM_SPREAD, NULL);
				163	if (!extent_buffer_cache)
				164	goto free_state_cache;
				165
				166	btrfs_bioset = bioset_create(BIO_POOL_SIZE,
				167	offsetof(struct btrfs_io_bio, bio),
				168	BIOSET_NEED_BVECS);
				169	if (!btrfs_bioset)
				170	goto free_buffer_cache;
				171
				172	if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE))
				173	goto free_bioset;
				174
				175	return 0;
				176
				177	free_bioset:
				178	bioset_free(btrfs_bioset);
				179	btrfs_bioset = NULL;
				180
				181	free_buffer_cache:
				182	kmem_cache_destroy(extent_buffer_cache);
				183	extent_buffer_cache = NULL;
				184
				185	free_state_cache:
				186	kmem_cache_destroy(extent_state_cache);
				187	extent_state_cache = NULL;
				188	return -ENOMEM;
				189	}
				190
				191	void extent_io_exit(void)
				192	{
				193	btrfs_leak_debug_check();
				194
				195	/*
				196	* Make sure all delayed rcu free are flushed before we
				197	* destroy caches.
				198	*/
				199	rcu_barrier();
				200	kmem_cache_destroy(extent_state_cache);
				201	kmem_cache_destroy(extent_buffer_cache);
				202	if (btrfs_bioset)
				203	bioset_free(btrfs_bioset);
				204	}
				205
				206	void extent_io_tree_init(struct extent_io_tree *tree,
				207	void *private_data)
				208	{
				209	tree->state = RB_ROOT;
				210	tree->ops = NULL;
				211	tree->dirty_bytes = 0;
				212	spin_lock_init(&tree->lock);
				213	tree->private_data = private_data;
				214	}
				215
				216	static struct extent_state *alloc_extent_state(gfp_t mask)
				217	{
				218	struct extent_state *state;
				219
				220	/*
				221	* The given mask might be not appropriate for the slab allocator,
				222	* drop the unsupported bits
				223	*/
				224	mask &= ~(__GFP_DMA32\|__GFP_HIGHMEM);
				225	state = kmem_cache_alloc(extent_state_cache, mask);
				226	if (!state)
				227	return state;
				228	state->state = 0;
				229	state->failrec = NULL;
				230	RB_CLEAR_NODE(&state->rb_node);
				231	btrfs_leak_debug_add(&state->leak_list, &states);
				232	refcount_set(&state->refs, 1);
				233	init_waitqueue_head(&state->wq);
				234	trace_alloc_extent_state(state, mask, _RET_IP_);
				235	return state;
				236	}
				237
				238	void free_extent_state(struct extent_state *state)
				239	{
				240	if (!state)
				241	return;
				242	if (refcount_dec_and_test(&state->refs)) {
				243	WARN_ON(extent_state_in_tree(state));
				244	btrfs_leak_debug_del(&state->leak_list);
				245	trace_free_extent_state(state, _RET_IP_);
				246	kmem_cache_free(extent_state_cache, state);
				247	}
				248	}
				249
				250	static struct rb_node tree_insert(struct rb_root root,
				251	struct rb_node *search_start,
				252	u64 offset,
				253	struct rb_node *node,
				254	struct rb_node ***p_in,
				255	struct rb_node **parent_in)
				256	{
				257	struct rb_node **p;
				258	struct rb_node *parent = NULL;
				259	struct tree_entry *entry;
				260
				261	if (p_in && parent_in) {
				262	p = *p_in;
				263	parent = *parent_in;
				264	goto do_insert;
				265	}
				266
				267	p = search_start ? &search_start : &root->rb_node;
				268	while (*p) {
				269	parent = *p;
				270	entry = rb_entry(parent, struct tree_entry, rb_node);
				271
				272	if (offset < entry->start)
				273	p = &(*p)->rb_left;
				274	else if (offset > entry->end)
				275	p = &(*p)->rb_right;
				276	else
				277	return parent;
				278	}
				279
				280	do_insert:
				281	rb_link_node(node, parent, p);
				282	rb_insert_color(node, root);
				283	return NULL;
				284	}
				285
				286	static struct rb_node __etree_search(struct extent_io_tree tree, u64 offset,
				287	struct rb_node **prev_ret,
				288	struct rb_node **next_ret,
				289	struct rb_node ***p_ret,
				290	struct rb_node **parent_ret)
				291	{
				292	struct rb_root *root = &tree->state;
				293	struct rb_node **n = &root->rb_node;
				294	struct rb_node *prev = NULL;
				295	struct rb_node *orig_prev = NULL;
				296	struct tree_entry *entry;
				297	struct tree_entry *prev_entry = NULL;
				298
				299	while (*n) {
				300	prev = *n;
				301	entry = rb_entry(prev, struct tree_entry, rb_node);
				302	prev_entry = entry;
				303
				304	if (offset < entry->start)
				305	n = &(*n)->rb_left;
				306	else if (offset > entry->end)
				307	n = &(*n)->rb_right;
				308	else
				309	return *n;
				310	}
				311
				312	if (p_ret)
				313	*p_ret = n;
				314	if (parent_ret)
				315	*parent_ret = prev;
				316
				317	if (prev_ret) {
				318	orig_prev = prev;
				319	while (prev && offset > prev_entry->end) {
				320	prev = rb_next(prev);
				321	prev_entry = rb_entry(prev, struct tree_entry, rb_node);
				322	}
				323	*prev_ret = prev;
				324	prev = orig_prev;
				325	}
				326
				327	if (next_ret) {
				328	prev_entry = rb_entry(prev, struct tree_entry, rb_node);
				329	while (prev && offset < prev_entry->start) {
				330	prev = rb_prev(prev);
				331	prev_entry = rb_entry(prev, struct tree_entry, rb_node);
				332	}
				333	*next_ret = prev;
				334	}
				335	return NULL;
				336	}
				337
				338	static inline struct rb_node *
				339	tree_search_for_insert(struct extent_io_tree *tree,
				340	u64 offset,
				341	struct rb_node ***p_ret,
				342	struct rb_node **parent_ret)
				343	{
				344	struct rb_node *prev = NULL;
				345	struct rb_node *ret;
				346
				347	ret = __etree_search(tree, offset, &prev, NULL, p_ret, parent_ret);
				348	if (!ret)
				349	return prev;
				350	return ret;
				351	}
				352
				353	static inline struct rb_node tree_search(struct extent_io_tree tree,
				354	u64 offset)
				355	{
				356	return tree_search_for_insert(tree, offset, NULL, NULL);
				357	}
				358
				359	static void merge_cb(struct extent_io_tree tree, struct extent_state new,
				360	struct extent_state *other)
				361	{
				362	if (tree->ops && tree->ops->merge_extent_hook)
				363	tree->ops->merge_extent_hook(tree->private_data, new, other);
				364	}
				365
				366	/*
				367	* utility function to look for merge candidates inside a given range.
				368	* Any extents with matching state are merged together into a single
				369	* extent in the tree. Extents with EXTENT_IO in their state field
				370	* are not merged because the end_io handlers need to be able to do
				371	* operations on them without sleeping (or doing allocations/splits).
				372	*
				373	* This should be called with the tree lock held.
				374	*/
				375	static void merge_state(struct extent_io_tree *tree,
				376	struct extent_state *state)
				377	{
				378	struct extent_state *other;
				379	struct rb_node *other_node;
				380
				381	if (state->state & (EXTENT_IOBITS \| EXTENT_BOUNDARY))
				382	return;
				383
				384	other_node = rb_prev(&state->rb_node);
				385	if (other_node) {
				386	other = rb_entry(other_node, struct extent_state, rb_node);
				387	if (other->end == state->start - 1 &&
				388	other->state == state->state) {
				389	merge_cb(tree, state, other);
				390	state->start = other->start;
				391	rb_erase(&other->rb_node, &tree->state);
				392	RB_CLEAR_NODE(&other->rb_node);
				393	free_extent_state(other);
				394	}
				395	}
				396	other_node = rb_next(&state->rb_node);
				397	if (other_node) {
				398	other = rb_entry(other_node, struct extent_state, rb_node);
				399	if (other->start == state->end + 1 &&
				400	other->state == state->state) {
				401	merge_cb(tree, state, other);
				402	state->end = other->end;
				403	rb_erase(&other->rb_node, &tree->state);
				404	RB_CLEAR_NODE(&other->rb_node);
				405	free_extent_state(other);
				406	}
				407	}
				408	}
				409
				410	static void set_state_cb(struct extent_io_tree *tree,
				411	struct extent_state state, unsigned bits)
				412	{
				413	if (tree->ops && tree->ops->set_bit_hook)
				414	tree->ops->set_bit_hook(tree->private_data, state, bits);
				415	}
				416
				417	static void clear_state_cb(struct extent_io_tree *tree,
				418	struct extent_state state, unsigned bits)
				419	{
				420	if (tree->ops && tree->ops->clear_bit_hook)
				421	tree->ops->clear_bit_hook(tree->private_data, state, bits);
				422	}
				423
				/* Forward declaration: insert_state() needs set_state_bits() before its definition. */
				static void set_state_bits(struct extent_io_tree *tree,
							   struct extent_state *state, unsigned *bits,
							   struct extent_changeset *changeset);
				427
				428	/*
				429	* insert an extent_state struct into the tree. 'bits' are set on the
				430	* struct before it is inserted.
				431	*
				432	* This may return -EEXIST if the extent is already there, in which case the
				433	* state struct is freed.
				434	*
				435	* The tree lock is not taken internally. This is a utility function and
				436	* probably isn't what you want to call (see set/clear_extent_bit).
				437	*/
				438	static int insert_state(struct extent_io_tree *tree,
				439	struct extent_state *state, u64 start, u64 end,
				440	struct rb_node ***p,
				441	struct rb_node **parent,
				442	unsigned bits, struct extent_changeset changeset)
				443	{
				444	struct rb_node *node;
				445
				446	if (end < start)
				447	WARN(1, KERN_ERR "BTRFS: end < start %llu %llu\n",
				448	end, start);
				449	state->start = start;
				450	state->end = end;
				451
				452	set_state_bits(tree, state, bits, changeset);
				453
				454	node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
				455	if (node) {
				456	struct extent_state *found;
				457	found = rb_entry(node, struct extent_state, rb_node);
				458	pr_err("BTRFS: found node %llu %llu on insert of %llu %llu\n",
				459	found->start, found->end, start, end);
				460	return -EEXIST;
				461	}
				462	merge_state(tree, state);
				463	return 0;
				464	}
				465
				466	static void split_cb(struct extent_io_tree tree, struct extent_state orig,
				467	u64 split)
				468	{
				469	if (tree->ops && tree->ops->split_extent_hook)
				470	tree->ops->split_extent_hook(tree->private_data, orig, split);
				471	}
				472
				473	/*
				474	* split a given extent state struct in two, inserting the preallocated
				475	* struct 'prealloc' as the newly created second half. 'split' indicates an
				476	* offset inside 'orig' where it should be split.
				477	*
				478	* Before calling,
				479	* the tree has 'orig' at [orig->start, orig->end]. After calling, there
				480	* are two extent state structs in the tree:
				481	* prealloc: [orig->start, split - 1]
				482	* orig: [ split, orig->end ]
				483	*
				484	* The tree locks are not taken by this function. They need to be held
				485	* by the caller.
				486	*/
				487	static int split_state(struct extent_io_tree tree, struct extent_state orig,
				488	struct extent_state *prealloc, u64 split)
				489	{
				490	struct rb_node *node;
				491
				492	split_cb(tree, orig, split);
				493
				494	prealloc->start = orig->start;
				495	prealloc->end = split - 1;
				496	prealloc->state = orig->state;
				497	orig->start = split;
				498
				499	node = tree_insert(&tree->state, &orig->rb_node, prealloc->end,
				500	&prealloc->rb_node, NULL, NULL);
				501	if (node) {
				502	free_extent_state(prealloc);
				503	return -EEXIST;
				504	}
				505	return 0;
				506	}
				507
				508	static struct extent_state next_state(struct extent_state state)
				509	{
				510	struct rb_node *next = rb_next(&state->rb_node);
				511	if (next)
				512	return rb_entry(next, struct extent_state, rb_node);
				513	else
				514	return NULL;
				515	}
				516
				517	/*
				518	* utility function to clear some bits in an extent state struct.
				519	* it will optionally wake up any one waiting on this state (wake == 1).
				520	*
				521	* If no bits are set on the state struct after clearing things, the
				522	* struct is freed and removed from the tree
				523	*/
				524	static struct extent_state clear_state_bit(struct extent_io_tree tree,
				525	struct extent_state *state,
				526	unsigned *bits, int wake,
				527	struct extent_changeset *changeset)
				528	{
				529	struct extent_state *next;
				530	unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
				531
				532	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
				533	u64 range = state->end - state->start + 1;
				534	WARN_ON(range > tree->dirty_bytes);
				535	tree->dirty_bytes -= range;
				536	}
				537	clear_state_cb(tree, state, bits);
				538	add_extent_changeset(state, bits_to_clear, changeset, 0);
				539	state->state &= ~bits_to_clear;
				540	if (wake)
				541	wake_up(&state->wq);
				542	if (state->state == 0) {
				543	next = next_state(state);
				544	if (extent_state_in_tree(state)) {
				545	rb_erase(&state->rb_node, &tree->state);
				546	RB_CLEAR_NODE(&state->rb_node);
				547	free_extent_state(state);
				548	} else {
				549	WARN_ON(1);
				550	}
				551	} else {
				552	merge_state(tree, state);
				553	next = next_state(state);
				554	}
				555	return next;
				556	}
				557
				558	static struct extent_state *
				559	alloc_extent_state_atomic(struct extent_state *prealloc)
				560	{
				561	if (!prealloc)
				562	prealloc = alloc_extent_state(GFP_ATOMIC);
				563
				564	return prealloc;
				565	}
				566
				/*
				 * Report an unexpected tree modification via btrfs_panic(), passing the
				 * filesystem that owns @tree and the error code @err.
				 */
				static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
				{
					struct btrfs_fs_info *fs_info = tree_fs_info(tree);

					btrfs_panic(fs_info, err,
						    "Locking error: Extent tree was modified by another thread while locked.");
				}
				572
				573	/*
				574	* clear some bits on a range in the tree. This may require splitting
				575	* or inserting elements in the tree, so the gfp mask is used to
				576	* indicate which allocations or sleeping are allowed.
				577	*
				578	* pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
				579	* the given range from the tree regardless of state (ie for truncate).
				580	*
				581	* the range [start, end] is inclusive.
				582	*
				583	* This takes the tree lock, and returns 0 on success and < 0 on error.
				584	*/
				585	static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
				586	unsigned bits, int wake, int delete,
				587	struct extent_state **cached_state,
				588	gfp_t mask, struct extent_changeset *changeset)
				589	{
				590	struct extent_state *state;
				591	struct extent_state *cached;
				592	struct extent_state *prealloc = NULL;
				593	struct rb_node *node;
				594	u64 last_end;
				595	int err;
				596	int clear = 0;
				597
				598	btrfs_debug_check_extent_io_range(tree, start, end);
				599
				600	if (bits & EXTENT_DELALLOC)
				601	bits \|= EXTENT_NORESERVE;
				602
				603	if (delete)
				604	bits \|= ~EXTENT_CTLBITS;
				605	bits \|= EXTENT_FIRST_DELALLOC;
				606
				607	if (bits & (EXTENT_IOBITS \| EXTENT_BOUNDARY))
				608	clear = 1;
				609	again:
				610	if (!prealloc && gfpflags_allow_blocking(mask)) {
				611	/*
				612	* Don't care for allocation failure here because we might end
				613	* up not needing the pre-allocated extent state at all, which
				614	* is the case if we only have in the tree extent states that
				615	* cover our input range and don't cover too any other range.
				616	* If we end up needing a new extent state we allocate it later.
				617	*/
				618	prealloc = alloc_extent_state(mask);
				619	}
				620
				621	spin_lock(&tree->lock);
				622	if (cached_state) {
				623	cached = *cached_state;
				624
				625	if (clear) {
				626	*cached_state = NULL;
				627	cached_state = NULL;
				628	}
				629
				630	if (cached && extent_state_in_tree(cached) &&
				631	cached->start <= start && cached->end > start) {
				632	if (clear)
				633	refcount_dec(&cached->refs);
				634	state = cached;
				635	goto hit_next;
				636	}
				637	if (clear)
				638	free_extent_state(cached);
				639	}
				640	/*
				641	* this search will find the extents that end after
				642	* our range starts
				643	*/
				644	node = tree_search(tree, start);
				645	if (!node)
				646	goto out;
				647	state = rb_entry(node, struct extent_state, rb_node);
				648	hit_next:
				649	if (state->start > end)
				650	goto out;
				651	WARN_ON(state->end < start);
				652	last_end = state->end;
				653
				654	/* the state doesn't have the wanted bits, go ahead */
				655	if (!(state->state & bits)) {
				656	state = next_state(state);
				657	goto next;
				658	}
				659
				660	/*
				661	* \| ---- desired range ---- \|
				662	* \| state \| or
				663	* \| ------------- state -------------- \|
				664	*
				665	* We need to split the extent we found, and may flip
				666	* bits on second half.
				667	*
				668	* If the extent we found extends past our range, we
				669	* just split and search again. It'll get split again
				670	* the next time though.
				671	*
				672	* If the extent we found is inside our range, we clear
				673	* the desired bit on it.
				674	*/
				675
				676	if (state->start < start) {
				677	prealloc = alloc_extent_state_atomic(prealloc);
				678	BUG_ON(!prealloc);
				679	err = split_state(tree, state, prealloc, start);
				680	if (err)
				681	extent_io_tree_panic(tree, err);
				682
				683	prealloc = NULL;
				684	if (err)
				685	goto out;
				686	if (state->end <= end) {
				687	state = clear_state_bit(tree, state, &bits, wake,
				688	changeset);
				689	goto next;
				690	}
				691	goto search_again;
				692	}
				693	/*
				694	* \| ---- desired range ---- \|
				695	* \| state \|
				696	* We need to split the extent, and clear the bit
				697	* on the first half
				698	*/
				699	if (state->start <= end && state->end > end) {
				700	prealloc = alloc_extent_state_atomic(prealloc);
				701	BUG_ON(!prealloc);
				702	err = split_state(tree, state, prealloc, end + 1);
				703	if (err)
				704	extent_io_tree_panic(tree, err);
				705
				706	if (wake)
				707	wake_up(&state->wq);
				708
				709	clear_state_bit(tree, prealloc, &bits, wake, changeset);
				710
				711	prealloc = NULL;
				712	goto out;
				713	}
				714
				715	state = clear_state_bit(tree, state, &bits, wake, changeset);
				716	next:
				717	if (last_end == (u64)-1)
				718	goto out;
				719	start = last_end + 1;
				720	if (start <= end && state && !need_resched())
				721	goto hit_next;
				722
				723	search_again:
				724	if (start > end)
				725	goto out;
				726	spin_unlock(&tree->lock);
				727	if (gfpflags_allow_blocking(mask))
				728	cond_resched();
				729	goto again;
				730
				731	out:
				732	spin_unlock(&tree->lock);
				733	if (prealloc)
				734	free_extent_state(prealloc);
				735
				736	return 0;
				737
				738	}
				739
				740	static void wait_on_state(struct extent_io_tree *tree,
				741	struct extent_state *state)
				742	__releases(tree->lock)
				743	__acquires(tree->lock)
				744	{
				745	DEFINE_WAIT(wait);
				746	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
				747	spin_unlock(&tree->lock);
				748	schedule();
				749	spin_lock(&tree->lock);
				750	finish_wait(&state->wq, &wait);
				751	}
				752
				753	/*
				754	* waits for one or more bits to clear on a range in the state tree.
				755	* The range [start, end] is inclusive.
				756	* The tree lock is taken by this function
				* and released before it returns; it is also dropped while sleeping in
				* wait_on_state(). After each sleep the search restarts from the start
				* of the state that was waited on.
				757	*/
				758	static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
				759	unsigned long bits)
				760	{
				761	struct extent_state *state;
				762	struct rb_node *node;
				763
				764	btrfs_debug_check_extent_io_range(tree, start, end);
				765
				766	spin_lock(&tree->lock);
				767	again:
				768	while (1) {
				769	/*
				770	* this search will find all the extents that end after
				771	* our range starts
				772	*/
				773	node = tree_search(tree, start);
				/* entered from below with 'node' already advanced, skipping the search */
				774	process_node:
				775	if (!node)
				776	break;
				777
				778	state = rb_entry(node, struct extent_state, rb_node);
				779
				780	if (state->start > end)
				781	goto out;
				782
				783	if (state->state & bits) {
				784	start = state->start;
				/* hold a reference across the sleep, during which the tree lock is dropped */
				785	refcount_inc(&state->refs);
				786	wait_on_state(tree, state);
				787	free_extent_state(state);
				788	goto again;
				789	}
				790	start = state->end + 1;
				791
				792	if (start > end)
				793	break;
				794
				/*
				 * NOTE(review): cond_resched_lock() presumably returns non-zero when it
				 * dropped the lock to reschedule; only when it did not is 'node' reused
				 * to step to the successor, otherwise the loop searches again from
				 * 'start' — confirm against the scheduler API.
				 */
				795	if (!cond_resched_lock(&tree->lock)) {
				796	node = rb_next(node);
				797	goto process_node;
				798	}
				799	}
				800	out:
				801	spin_unlock(&tree->lock);
				802	}
				803
				804	static void set_state_bits(struct extent_io_tree *tree,
				805	struct extent_state *state,
				806	unsigned bits, struct extent_changeset changeset)
				807	{
				808	unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
				809
				810	set_state_cb(tree, state, bits);
				811	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
				812	u64 range = state->end - state->start + 1;
				813	tree->dirty_bytes += range;
				814	}
				815	add_extent_changeset(state, bits_to_set, changeset, 1);
				816	state->state \|= bits_to_set;
				817	}
				818
				819	static void cache_state_if_flags(struct extent_state *state,
				820	struct extent_state **cached_ptr,
				821	unsigned flags)
				822	{
				823	if (cached_ptr && !(*cached_ptr)) {
				824	if (!flags \|\| (state->state & flags)) {
				825	*cached_ptr = state;
				826	refcount_inc(&state->refs);
				827	}
				828	}
				829	}
				830
				831	static void cache_state(struct extent_state *state,
				832	struct extent_state **cached_ptr)
				833	{
				834	return cache_state_if_flags(state, cached_ptr,
				835	EXTENT_IOBITS \| EXTENT_BOUNDARY);
				836	}
				837
				838	/*
				839	* set some bits on a range in the tree. This may require allocations or
				840	* sleeping, so the gfp mask is used to indicate what is allowed.
				841	*
				842	* If any of the exclusive bits are set, this will fail with -EEXIST if some
				843	* part of the range already has the desired bits set. The start of the
				844	* existing range is returned in failed_start in this case.
				845	*
				846	* [start, end] is inclusive This takes the tree lock.
				847	*/
				848
				849	static int __must_check
				850	__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
				851	unsigned bits, unsigned exclusive_bits,
				852	u64 failed_start, struct extent_state *cached_state,
				853	gfp_t mask, struct extent_changeset *changeset)
				854	{
				855	struct extent_state *state;
				856	struct extent_state *prealloc = NULL;
				857	struct rb_node *node;
				858	struct rb_node **p;
				859	struct rb_node *parent;
				860	int err = 0;
				861	u64 last_start;
				862	u64 last_end;
				863
				864	btrfs_debug_check_extent_io_range(tree, start, end);
				865
				866	bits \|= EXTENT_FIRST_DELALLOC;
				867	again:
				868	if (!prealloc && gfpflags_allow_blocking(mask)) {
				869	/*
				870	* Don't care for allocation failure here because we might end
				871	* up not needing the pre-allocated extent state at all, which
				872	* is the case if we only have in the tree extent states that
				873	* cover our input range and don't cover too any other range.
				874	* If we end up needing a new extent state we allocate it later.
				875	*/
				876	prealloc = alloc_extent_state(mask);
				877	}
				878
				879	spin_lock(&tree->lock);
				880	if (cached_state && *cached_state) {
				881	state = *cached_state;
				882	if (state->start <= start && state->end > start &&
				883	extent_state_in_tree(state)) {
				884	node = &state->rb_node;
				885	goto hit_next;
				886	}
				887	}
				888	/*
				889	* this search will find all the extents that end after
				890	* our range starts.
				891	*/
				892	node = tree_search_for_insert(tree, start, &p, &parent);
				893	if (!node) {
				894	prealloc = alloc_extent_state_atomic(prealloc);
				895	BUG_ON(!prealloc);
				896	err = insert_state(tree, prealloc, start, end,
				897	&p, &parent, &bits, changeset);
				898	if (err)
				899	extent_io_tree_panic(tree, err);
				900
				901	cache_state(prealloc, cached_state);
				902	prealloc = NULL;
				903	goto out;
				904	}
				905	state = rb_entry(node, struct extent_state, rb_node);
				906	hit_next:
				907	last_start = state->start;
				908	last_end = state->end;
				909
				910	/*
				911	* \| ---- desired range ---- \|
				912	* \| state \|
				913	*
				914	* Just lock what we found and keep going
				915	*/
				916	if (state->start == start && state->end <= end) {
				917	if (state->state & exclusive_bits) {
				918	*failed_start = state->start;
				919	err = -EEXIST;
				920	goto out;
				921	}
				922
				923	set_state_bits(tree, state, &bits, changeset);
				924	cache_state(state, cached_state);
				925	merge_state(tree, state);
				926	if (last_end == (u64)-1)
				927	goto out;
				928	start = last_end + 1;
				929	state = next_state(state);
				930	if (start < end && state && state->start == start &&
				931	!need_resched())
				932	goto hit_next;
				933	goto search_again;
				934	}
				935
				936	/*
				937	* \| ---- desired range ---- \|
				938	* \| state \|
				939	* or
				940	* \| ------------- state -------------- \|
				941	*
				942	* We need to split the extent we found, and may flip bits on
				943	* second half.
				944	*
				945	* If the extent we found extends past our
				946	* range, we just split and search again. It'll get split
				947	* again the next time though.
				948	*
				949	* If the extent we found is inside our range, we set the
				950	* desired bit on it.
				951	*/
				952	if (state->start < start) {
				953	if (state->state & exclusive_bits) {
				954	*failed_start = start;
				955	err = -EEXIST;
				956	goto out;
				957	}
				958
				959	prealloc = alloc_extent_state_atomic(prealloc);
				960	BUG_ON(!prealloc);
				961	err = split_state(tree, state, prealloc, start);
				962	if (err)
				963	extent_io_tree_panic(tree, err);
				964
				965	prealloc = NULL;
				966	if (err)
				967	goto out;
				968	if (state->end <= end) {
				969	set_state_bits(tree, state, &bits, changeset);
				970	cache_state(state, cached_state);
				971	merge_state(tree, state);
				972	if (last_end == (u64)-1)
				973	goto out;
				974	start = last_end + 1;
				975	state = next_state(state);
				976	if (start < end && state && state->start == start &&
				977	!need_resched())
				978	goto hit_next;
				979	}
				980	goto search_again;
				981	}
				982	/*
				983	* \| ---- desired range ---- \|
				984	* \| state \| or \| state \|
				985	*
				986	* There's a hole, we need to insert something in it and
				987	* ignore the extent we found.
				988	*/
				989	if (state->start > start) {
				990	u64 this_end;
				991	if (end < last_start)
				992	this_end = end;
				993	else
				994	this_end = last_start - 1;
				995
				996	prealloc = alloc_extent_state_atomic(prealloc);
				997	BUG_ON(!prealloc);
				998
				999	/*
				1000	* Avoid to free 'prealloc' if it can be merged with
				1001	* the later extent.
				1002	*/
				1003	err = insert_state(tree, prealloc, start, this_end,
				1004	NULL, NULL, &bits, changeset);
				1005	if (err)
				1006	extent_io_tree_panic(tree, err);
				1007
				1008	cache_state(prealloc, cached_state);
				1009	prealloc = NULL;
				1010	start = this_end + 1;
				1011	goto search_again;
				1012	}
				1013	/*
				1014	* \| ---- desired range ---- \|
				1015	* \| state \|
				1016	* We need to split the extent, and set the bit
				1017	* on the first half
				1018	*/
				1019	if (state->start <= end && state->end > end) {
				1020	if (state->state & exclusive_bits) {
				1021	*failed_start = start;
				1022	err = -EEXIST;
				1023	goto out;
				1024	}
				1025
				1026	prealloc = alloc_extent_state_atomic(prealloc);
				1027	BUG_ON(!prealloc);
				1028	err = split_state(tree, state, prealloc, end + 1);
				1029	if (err)
				1030	extent_io_tree_panic(tree, err);
				1031
				1032	set_state_bits(tree, prealloc, &bits, changeset);
				1033	cache_state(prealloc, cached_state);
				1034	merge_state(tree, prealloc);
				1035	prealloc = NULL;
				1036	goto out;
				1037	}
				1038
				1039	search_again:
				1040	if (start > end)
				1041	goto out;
				1042	spin_unlock(&tree->lock);
				1043	if (gfpflags_allow_blocking(mask))
				1044	cond_resched();
				1045	goto again;
				1046
				1047	out:
				1048	spin_unlock(&tree->lock);
				1049	if (prealloc)
				1050	free_extent_state(prealloc);
				1051
				1052	return err;
				1053
				1054	}
				1055
				1056	int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
				1057	unsigned bits, u64 * failed_start,
				1058	struct extent_state **cached_state, gfp_t mask)
				1059	{
				1060	return __set_extent_bit(tree, start, end, bits, 0, failed_start,
				1061	cached_state, mask, NULL);
				1062	}
				1063
				1064
				1065	/**
				1066	* convert_extent_bit - convert all bits in a given range from one bit to
				1067	* another
				1068	* @tree: the io tree to search
				1069	* @start: the start offset in bytes
				1070	* @end: the end offset in bytes (inclusive)
				1071	* @bits: the bits to set in this range
				1072	* @clear_bits: the bits to clear in this range
				1073	* @cached_state: state that we're going to cache
				1074	*
				1075	* This will go through and set bits for the given range. If any states exist
				1076	* already in this range they are set with the given bit and cleared of the
				1077	* clear_bits. This is only meant to be used by things that are mergeable, ie
				1078	* converting from say DELALLOC to DIRTY. This is not meant to be used with
				1079	* boundary bits like LOCK.
				1080	*
				1081	* All allocations are done with GFP_NOFS.
				1082	*/
				1083	int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
				1084	unsigned bits, unsigned clear_bits,
				1085	struct extent_state **cached_state)
				1086	{
				1087	struct extent_state *state;
				1088	struct extent_state *prealloc = NULL;
				1089	struct rb_node *node;
				1090	struct rb_node **p;
				1091	struct rb_node *parent;
				1092	int err = 0;
				1093	u64 last_start;
				1094	u64 last_end;
				1095	bool first_iteration = true;
				1096
				1097	btrfs_debug_check_extent_io_range(tree, start, end);
				1098
				1099	again:
				1100	if (!prealloc) {
				1101	/*
				1102	* Best effort, don't worry if extent state allocation fails
				1103	* here for the first iteration. We might have a cached state
				1104	* that matches exactly the target range, in which case no
				1105	* extent state allocations are needed. We'll only know this
				1106	* after locking the tree.
				1107	*/
				1108	prealloc = alloc_extent_state(GFP_NOFS);
				1109	if (!prealloc && !first_iteration)
				1110	return -ENOMEM;
				1111	}
				1112
				1113	spin_lock(&tree->lock);
				1114	if (cached_state && *cached_state) {
				1115	state = *cached_state;
				1116	if (state->start <= start && state->end > start &&
				1117	extent_state_in_tree(state)) {
				1118	node = &state->rb_node;
				1119	goto hit_next;
				1120	}
				1121	}
				1122
				1123	/*
				1124	* this search will find all the extents that end after
				1125	* our range starts.
				1126	*/
				1127	node = tree_search_for_insert(tree, start, &p, &parent);
				1128	if (!node) {
				1129	prealloc = alloc_extent_state_atomic(prealloc);
				1130	if (!prealloc) {
				1131	err = -ENOMEM;
				1132	goto out;
				1133	}
				1134	err = insert_state(tree, prealloc, start, end,
				1135	&p, &parent, &bits, NULL);
				1136	if (err)
				1137	extent_io_tree_panic(tree, err);
				1138	cache_state(prealloc, cached_state);
				1139	prealloc = NULL;
				1140	goto out;
				1141	}
				1142	state = rb_entry(node, struct extent_state, rb_node);
				1143	hit_next:
				1144	last_start = state->start;
				1145	last_end = state->end;
				1146
				1147	/*
				1148	* \| ---- desired range ---- \|
				1149	* \| state \|
				1150	*
				1151	* Just lock what we found and keep going
				1152	*/
				1153	if (state->start == start && state->end <= end) {
				1154	set_state_bits(tree, state, &bits, NULL);
				1155	cache_state(state, cached_state);
				1156	state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
				1157	if (last_end == (u64)-1)
				1158	goto out;
				1159	start = last_end + 1;
				1160	if (start < end && state && state->start == start &&
				1161	!need_resched())
				1162	goto hit_next;
				1163	goto search_again;
				1164	}
				1165
				1166	/*
				1167	* \| ---- desired range ---- \|
				1168	* \| state \|
				1169	* or
				1170	* \| ------------- state -------------- \|
				1171	*
				1172	* We need to split the extent we found, and may flip bits on
				1173	* second half.
				1174	*
				1175	* If the extent we found extends past our
				1176	* range, we just split and search again. It'll get split
				1177	* again the next time though.
				1178	*
				1179	* If the extent we found is inside our range, we set the
				1180	* desired bit on it.
				1181	*/
				1182	if (state->start < start) {
				1183	prealloc = alloc_extent_state_atomic(prealloc);
				1184	if (!prealloc) {
				1185	err = -ENOMEM;
				1186	goto out;
				1187	}
				1188	err = split_state(tree, state, prealloc, start);
				1189	if (err)
				1190	extent_io_tree_panic(tree, err);
				1191	prealloc = NULL;
				1192	if (err)
				1193	goto out;
				1194	if (state->end <= end) {
				1195	set_state_bits(tree, state, &bits, NULL);
				1196	cache_state(state, cached_state);
				1197	state = clear_state_bit(tree, state, &clear_bits, 0,
				1198	NULL);
				1199	if (last_end == (u64)-1)
				1200	goto out;
				1201	start = last_end + 1;
				1202	if (start < end && state && state->start == start &&
				1203	!need_resched())
				1204	goto hit_next;
				1205	}
				1206	goto search_again;
				1207	}
				1208	/*
				1209	* \| ---- desired range ---- \|
				1210	* \| state \| or \| state \|
				1211	*
				1212	* There's a hole, we need to insert something in it and
				1213	* ignore the extent we found.
				1214	*/
				1215	if (state->start > start) {
				1216	u64 this_end;
				1217	if (end < last_start)
				1218	this_end = end;
				1219	else
				1220	this_end = last_start - 1;
				1221
				1222	prealloc = alloc_extent_state_atomic(prealloc);
				1223	if (!prealloc) {
				1224	err = -ENOMEM;
				1225	goto out;
				1226	}
				1227
				1228	/*
				1229	* Avoid to free 'prealloc' if it can be merged with
				1230	* the later extent.
				1231	*/
				1232	err = insert_state(tree, prealloc, start, this_end,
				1233	NULL, NULL, &bits, NULL);
				1234	if (err)
				1235	extent_io_tree_panic(tree, err);
				1236	cache_state(prealloc, cached_state);
				1237	prealloc = NULL;
				1238	start = this_end + 1;
				1239	goto search_again;
				1240	}
				1241	/*
				1242	* \| ---- desired range ---- \|
				1243	* \| state \|
				1244	* We need to split the extent, and set the bit
				1245	* on the first half
				1246	*/
				1247	if (state->start <= end && state->end > end) {
				1248	prealloc = alloc_extent_state_atomic(prealloc);
				1249	if (!prealloc) {
				1250	err = -ENOMEM;
				1251	goto out;
				1252	}
				1253
				1254	err = split_state(tree, state, prealloc, end + 1);
				1255	if (err)
				1256	extent_io_tree_panic(tree, err);
				1257
				1258	set_state_bits(tree, prealloc, &bits, NULL);
				1259	cache_state(prealloc, cached_state);
				1260	clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
				1261	prealloc = NULL;
				1262	goto out;
				1263	}
				1264
				1265	search_again:
				1266	if (start > end)
				1267	goto out;
				1268	spin_unlock(&tree->lock);
				1269	cond_resched();
				1270	first_iteration = false;
				1271	goto again;
				1272
				1273	out:
				1274	spin_unlock(&tree->lock);
				1275	if (prealloc)
				1276	free_extent_state(prealloc);
				1277
				1278	return err;
				1279	}
				1280
				1281	/* wrappers around set/clear extent bit */
				1282	int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
				1283	unsigned bits, struct extent_changeset *changeset)
				1284	{
				1285	/*
				1286	* We don't support EXTENT_LOCKED yet, as current changeset will
				1287	* record any bits changed, so for EXTENT_LOCKED case, it will
				1288	* either fail with -EEXIST or changeset will record the whole
				1289	* range.
				1290	*/
				1291	BUG_ON(bits & EXTENT_LOCKED);
				1292
				1293	return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
				1294	changeset);
				1295	}
				1296
				1297	int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
				1298	unsigned bits, int wake, int delete,
				1299	struct extent_state **cached, gfp_t mask)
				1300	{
				1301	return __clear_extent_bit(tree, start, end, bits, wake, delete,
				1302	cached, mask, NULL);
				1303	}
				1304
				1305	int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
				1306	unsigned bits, struct extent_changeset *changeset)
				1307	{
				1308	/*
				1309	* Don't support EXTENT_LOCKED case, same reason as
				1310	* set_record_extent_bits().
				1311	*/
				1312	BUG_ON(bits & EXTENT_LOCKED);
				1313
				1314	return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
				1315	changeset);
				1316	}
				1317
				1318	/*
				1319	* either insert or lock state struct between start and end use mask to tell
				1320	* us if waiting is desired.
				1321	*/
				1322	int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
				1323	struct extent_state **cached_state)
				1324	{
				1325	int err;
				1326	u64 failed_start;
				1327
				1328	while (1) {
				1329	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
				1330	EXTENT_LOCKED, &failed_start,
				1331	cached_state, GFP_NOFS, NULL);
				1332	if (err == -EEXIST) {
				1333	wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
				1334	start = failed_start;
				1335	} else
				1336	break;
				1337	WARN_ON(start > end);
				1338	}
				1339	return err;
				1340	}
				1341
				1342	int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
				1343	{
				1344	int err;
				1345	u64 failed_start;
				1346
				1347	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
				1348	&failed_start, NULL, GFP_NOFS, NULL);
				1349	if (err == -EEXIST) {
				1350	if (failed_start > start)
				1351	clear_extent_bit(tree, start, failed_start - 1,
				1352	EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
				1353	return 0;
				1354	}
				1355	return 1;
				1356	}
				1357
				1358	void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
				1359	{
				1360	unsigned long index = start >> PAGE_SHIFT;
				1361	unsigned long end_index = end >> PAGE_SHIFT;
				1362	struct page *page;
				1363
				1364	while (index <= end_index) {
				1365	page = find_get_page(inode->i_mapping, index);
				1366	BUG_ON(!page); /* Pages should be in the extent_io_tree */
				1367	clear_page_dirty_for_io(page);
				1368	put_page(page);
				1369	index++;
				1370	}
				1371	}
				1372
				1373	void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
				1374	{
				1375	unsigned long index = start >> PAGE_SHIFT;
				1376	unsigned long end_index = end >> PAGE_SHIFT;
				1377	struct page *page;
				1378
				1379	while (index <= end_index) {
				1380	page = find_get_page(inode->i_mapping, index);
				1381	BUG_ON(!page); /* Pages should be in the extent_io_tree */
				1382	__set_page_dirty_nobuffers(page);
				1383	account_page_redirty(page);
				1384	put_page(page);
				1385	index++;
				1386	}
				1387	}
				1388
				1389	/*
				1390	* helper function to set both pages and extents in the tree writeback
				1391	*/
				1392	static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
				1393	{
				1394	tree->ops->set_range_writeback(tree->private_data, start, end);
				1395	}
				1396
				1397	/* find the first state struct with 'bits' set after 'start', and
				1398	* return it. tree->lock must be held. NULL will returned if
				1399	* nothing was found after 'start'
				1400	*/
				1401	static struct extent_state *
				1402	find_first_extent_bit_state(struct extent_io_tree *tree,
				1403	u64 start, unsigned bits)
				1404	{
				1405	struct rb_node *node;
				1406	struct extent_state *state;
				1407
				1408	/*
				1409	* this search will find all the extents that end after
				1410	* our range starts.
				1411	*/
				1412	node = tree_search(tree, start);
				1413	if (!node)
				1414	goto out;
				1415
				1416	while (1) {
				1417	state = rb_entry(node, struct extent_state, rb_node);
				1418	if (state->end >= start && (state->state & bits))
				1419	return state;
				1420
				1421	node = rb_next(node);
				1422	if (!node)
				1423	break;
				1424	}
				1425	out:
				1426	return NULL;
				1427	}
				1428
				1429	/*
				1430	* find the first offset in the io tree with 'bits' set. zero is
				1431	* returned if we find something, and start_ret and end_ret are
				1432	* set to reflect the state struct that was found.
				1433	*
				1434	* If nothing was found, 1 is returned. If found something, return 0.
				1435	*/
				1436	int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
				1437	u64 start_ret, u64 end_ret, unsigned bits,
				1438	struct extent_state **cached_state)
				1439	{
				1440	struct extent_state *state;
				1441	struct rb_node *n;
				1442	int ret = 1;
				1443
				1444	spin_lock(&tree->lock);
				1445	if (cached_state && *cached_state) {
				1446	state = *cached_state;
				1447	if (state->end == start - 1 && extent_state_in_tree(state)) {
				1448	n = rb_next(&state->rb_node);
				1449	while (n) {
				1450	state = rb_entry(n, struct extent_state,
				1451	rb_node);
				1452	if (state->state & bits)
				1453	goto got_it;
				1454	n = rb_next(n);
				1455	}
				1456	free_extent_state(*cached_state);
				1457	*cached_state = NULL;
				1458	goto out;
				1459	}
				1460	free_extent_state(*cached_state);
				1461	*cached_state = NULL;
				1462	}
				1463
				1464	state = find_first_extent_bit_state(tree, start, bits);
				1465	got_it:
				1466	if (state) {
				1467	cache_state_if_flags(state, cached_state, 0);
				1468	*start_ret = state->start;
				1469	*end_ret = state->end;
				1470	ret = 0;
				1471	}
				1472	out:
				1473	spin_unlock(&tree->lock);
				1474	return ret;
				1475	}
				1476
				1477	/*
				1478	* find a contiguous range of bytes in the file marked as delalloc, not
				1479	* more than 'max_bytes'. start and end are used to return the range,
				1480	*
				1481	* 1 is returned if we find something, 0 if nothing was in the tree
				1482	*/
				1483	static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
				1484	u64 start, u64 end, u64 max_bytes,
				1485	struct extent_state **cached_state)
				1486	{
				1487	struct rb_node *node;
				1488	struct extent_state *state;
				1489	u64 cur_start = *start;
				1490	u64 found = 0;
				1491	u64 total_bytes = 0;
				1492
				1493	spin_lock(&tree->lock);
				1494
				1495	/*
				1496	* this search will find all the extents that end after
				1497	* our range starts.
				1498	*/
				1499	node = tree_search(tree, cur_start);
				1500	if (!node) {
				1501	if (!found)
				1502	*end = (u64)-1;
				1503	goto out;
				1504	}
				1505
				1506	while (1) {
				1507	state = rb_entry(node, struct extent_state, rb_node);
				1508	if (found && (state->start != cur_start \|\|
				1509	(state->state & EXTENT_BOUNDARY))) {
				1510	goto out;
				1511	}
				1512	if (!(state->state & EXTENT_DELALLOC)) {
				1513	if (!found)
				1514	*end = state->end;
				1515	goto out;
				1516	}
				1517	if (!found) {
				1518	*start = state->start;
				1519	*cached_state = state;
				1520	refcount_inc(&state->refs);
				1521	}
				1522	found++;
				1523	*end = state->end;
				1524	cur_start = state->end + 1;
				1525	node = rb_next(node);
				1526	total_bytes += state->end - state->start + 1;
				1527	if (total_bytes >= max_bytes)
				1528	break;
				1529	if (!node)
				1530	break;
				1531	}
				1532	out:
				1533	spin_unlock(&tree->lock);
				1534	return found;
				1535	}
				1536
				1537	static int __process_pages_contig(struct address_space *mapping,
				1538	struct page *locked_page,
				1539	pgoff_t start_index, pgoff_t end_index,
				1540	unsigned long page_ops, pgoff_t *index_ret);
				1541
				1542	static noinline void __unlock_for_delalloc(struct inode *inode,
				1543	struct page *locked_page,
				1544	u64 start, u64 end)
				1545	{
				1546	unsigned long index = start >> PAGE_SHIFT;
				1547	unsigned long end_index = end >> PAGE_SHIFT;
				1548
				1549	ASSERT(locked_page);
				1550	if (index == locked_page->index && end_index == index)
				1551	return;
				1552
				1553	__process_pages_contig(inode->i_mapping, locked_page, index, end_index,
				1554	PAGE_UNLOCK, NULL);
				1555	}
				1556
				1557	static noinline int lock_delalloc_pages(struct inode *inode,
				1558	struct page *locked_page,
				1559	u64 delalloc_start,
				1560	u64 delalloc_end)
				1561	{
				1562	unsigned long index = delalloc_start >> PAGE_SHIFT;
				1563	unsigned long index_ret = index;
				1564	unsigned long end_index = delalloc_end >> PAGE_SHIFT;
				1565	int ret;
				1566
				1567	ASSERT(locked_page);
				1568	if (index == locked_page->index && index == end_index)
				1569	return 0;
				1570
				1571	ret = __process_pages_contig(inode->i_mapping, locked_page, index,
				1572	end_index, PAGE_LOCK, &index_ret);
				1573	if (ret == -EAGAIN)
				1574	__unlock_for_delalloc(inode, locked_page, delalloc_start,
				1575	(u64)index_ret << PAGE_SHIFT);
				1576	return ret;
				1577	}
				1578
				1579	/*
				1580	* find a contiguous range of bytes in the file marked as delalloc, not
				1581	* more than 'max_bytes'. start and end are used to return the range,
				1582	*
				1583	* 1 is returned if we find something, 0 if nothing was in the tree
				1584	*/
				1585	STATIC u64 find_lock_delalloc_range(struct inode *inode,
				1586	struct extent_io_tree *tree,
				1587	struct page locked_page, u64 start,
				1588	u64 *end, u64 max_bytes)
				1589	{
				1590	u64 delalloc_start;
				1591	u64 delalloc_end;
				1592	u64 found;
				1593	struct extent_state *cached_state = NULL;
				1594	int ret;
				1595	int loops = 0;
				1596
				1597	again:
				1598	/* step one, find a bunch of delalloc bytes starting at start */
				1599	delalloc_start = *start;
				1600	delalloc_end = 0;
				1601	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
				1602	max_bytes, &cached_state);
				1603	if (!found \|\| delalloc_end <= *start) {
				1604	*start = delalloc_start;
				1605	*end = delalloc_end;
				1606	free_extent_state(cached_state);
				1607	return 0;
				1608	}
				1609
				1610	/*
				1611	* start comes from the offset of locked_page. We have to lock
				1612	* pages in order, so we can't process delalloc bytes before
				1613	* locked_page
				1614	*/
				1615	if (delalloc_start < *start)
				1616	delalloc_start = *start;
				1617
				1618	/*
				1619	* make sure to limit the number of pages we try to lock down
				1620	*/
				1621	if (delalloc_end + 1 - delalloc_start > max_bytes)
				1622	delalloc_end = delalloc_start + max_bytes - 1;
				1623
				1624	/* step two, lock all the pages after the page that has start */
				1625	ret = lock_delalloc_pages(inode, locked_page,
				1626	delalloc_start, delalloc_end);
				1627	if (ret == -EAGAIN) {
				1628	/* some of the pages are gone, lets avoid looping by
				1629	* shortening the size of the delalloc range we're searching
				1630	*/
				1631	free_extent_state(cached_state);
				1632	cached_state = NULL;
				1633	if (!loops) {
				1634	max_bytes = PAGE_SIZE;
				1635	loops = 1;
				1636	goto again;
				1637	} else {
				1638	found = 0;
				1639	goto out_failed;
				1640	}
				1641	}
				1642	BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
				1643
				1644	/* step three, lock the state bits for the whole range */
				1645	lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
				1646
				1647	/* then test to make sure it is all still delalloc */
				1648	ret = test_range_bit(tree, delalloc_start, delalloc_end,
				1649	EXTENT_DELALLOC, 1, cached_state);
				1650	if (!ret) {
				1651	unlock_extent_cached(tree, delalloc_start, delalloc_end,
				1652	&cached_state, GFP_NOFS);
				1653	__unlock_for_delalloc(inode, locked_page,
				1654	delalloc_start, delalloc_end);
				1655	cond_resched();
				1656	goto again;
				1657	}
				1658	free_extent_state(cached_state);
				1659	*start = delalloc_start;
				1660	*end = delalloc_end;
				1661	out_failed:
				1662	return found;
				1663	}
				1664
				1665	static int __process_pages_contig(struct address_space *mapping,
				1666	struct page *locked_page,
				1667	pgoff_t start_index, pgoff_t end_index,
				1668	unsigned long page_ops, pgoff_t *index_ret)
				1669	{
				1670	unsigned long nr_pages = end_index - start_index + 1;
				1671	unsigned long pages_locked = 0;
				1672	pgoff_t index = start_index;
				1673	struct page *pages[16];
				1674	unsigned ret;
				1675	int err = 0;
				1676	int i;
				1677
				1678	if (page_ops & PAGE_LOCK) {
				1679	ASSERT(page_ops == PAGE_LOCK);
				1680	ASSERT(index_ret && *index_ret == start_index);
				1681	}
				1682
				1683	if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
				1684	mapping_set_error(mapping, -EIO);
				1685
				1686	while (nr_pages > 0) {
				1687	ret = find_get_pages_contig(mapping, index,
				1688	min_t(unsigned long,
				1689	nr_pages, ARRAY_SIZE(pages)), pages);
				1690	if (ret == 0) {
				1691	/*
				1692	* Only if we're going to lock these pages,
				1693	* can we find nothing at @index.
				1694	*/
				1695	ASSERT(page_ops & PAGE_LOCK);
				1696	err = -EAGAIN;
				1697	goto out;
				1698	}
				1699
				1700	for (i = 0; i < ret; i++) {
				1701	if (page_ops & PAGE_SET_PRIVATE2)
				1702	SetPagePrivate2(pages[i]);
				1703
				1704	if (pages[i] == locked_page) {
				1705	put_page(pages[i]);
				1706	pages_locked++;
				1707	continue;
				1708	}
				1709	if (page_ops & PAGE_CLEAR_DIRTY)
				1710	clear_page_dirty_for_io(pages[i]);
				1711	if (page_ops & PAGE_SET_WRITEBACK)
				1712	set_page_writeback(pages[i]);
				1713	if (page_ops & PAGE_SET_ERROR)
				1714	SetPageError(pages[i]);
				1715	if (page_ops & PAGE_END_WRITEBACK)
				1716	end_page_writeback(pages[i]);
				1717	if (page_ops & PAGE_UNLOCK)
				1718	unlock_page(pages[i]);
				1719	if (page_ops & PAGE_LOCK) {
				1720	lock_page(pages[i]);
				1721	if (!PageDirty(pages[i]) \|\|
				1722	pages[i]->mapping != mapping) {
				1723	unlock_page(pages[i]);
				1724	for (; i < ret; i++)
				1725	put_page(pages[i]);
				1726	err = -EAGAIN;
				1727	goto out;
				1728	}
				1729	}
				1730	put_page(pages[i]);
				1731	pages_locked++;
				1732	}
				1733	nr_pages -= ret;
				1734	index += ret;
				1735	cond_resched();
				1736	}
				1737	out:
				1738	if (err && index_ret)
				1739	*index_ret = start_index + pages_locked - 1;
				1740	return err;
				1741	}
				1742
				1743	void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
				1744	u64 delalloc_end, struct page *locked_page,
				1745	unsigned clear_bits,
				1746	unsigned long page_ops)
				1747	{
				1748	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, 1, 0,
				1749	NULL, GFP_NOFS);
				1750
				1751	__process_pages_contig(inode->i_mapping, locked_page,
				1752	start >> PAGE_SHIFT, end >> PAGE_SHIFT,
				1753	page_ops, NULL);
				1754	}
				1755
				1756	/*
				1757	* count the number of bytes in the tree that have a given bit(s)
				1758	* set. This can be fairly slow, except for EXTENT_DIRTY which is
				1759	* cached. The total number found is returned.
				1760	*/
				1761	u64 count_range_bits(struct extent_io_tree *tree,
				1762	u64 *start, u64 search_end, u64 max_bytes,
				1763	unsigned bits, int contig)
				1764	{
				1765	struct rb_node *node;
				1766	struct extent_state *state;
				1767	u64 cur_start = *start;
				1768	u64 total_bytes = 0;
				1769	u64 last = 0;
				1770	int found = 0;
				1771
				1772	if (WARN_ON(search_end <= cur_start))
				1773	return 0;
				1774
				1775	spin_lock(&tree->lock);
				1776	if (cur_start == 0 && bits == EXTENT_DIRTY) {
				1777	total_bytes = tree->dirty_bytes;
				1778	goto out;
				1779	}
				1780	/*
				1781	* this search will find all the extents that end after
				1782	* our range starts.
				1783	*/
				1784	node = tree_search(tree, cur_start);
				1785	if (!node)
				1786	goto out;
				1787
				1788	while (1) {
				1789	state = rb_entry(node, struct extent_state, rb_node);
				1790	if (state->start > search_end)
				1791	break;
				1792	if (contig && found && state->start > last + 1)
				1793	break;
				1794	if (state->end >= cur_start && (state->state & bits) == bits) {
				1795	total_bytes += min(search_end, state->end) + 1 -
				1796	max(cur_start, state->start);
				1797	if (total_bytes >= max_bytes)
				1798	break;
				1799	if (!found) {
				1800	*start = max(cur_start, state->start);
				1801	found = 1;
				1802	}
				1803	last = state->end;
				1804	} else if (contig && found) {
				1805	break;
				1806	}
				1807	node = rb_next(node);
				1808	if (!node)
				1809	break;
				1810	}
				1811	out:
				1812	spin_unlock(&tree->lock);
				1813	return total_bytes;
				1814	}
				1815
				1816	/*
				1817	* set the private field for a given byte offset in the tree. If there isn't
				1818	* an extent_state there already, this does nothing.
				1819	*/
				1820	static noinline int set_state_failrec(struct extent_io_tree *tree, u64 start,
				1821	struct io_failure_record *failrec)
				1822	{
				1823	struct rb_node *node;
				1824	struct extent_state *state;
				1825	int ret = 0;
				1826
				1827	spin_lock(&tree->lock);
				1828	/*
				1829	* this search will find all the extents that end after
				1830	* our range starts.
				1831	*/
				1832	node = tree_search(tree, start);
				1833	if (!node) {
				1834	ret = -ENOENT;
				1835	goto out;
				1836	}
				1837	state = rb_entry(node, struct extent_state, rb_node);
				1838	if (state->start != start) {
				1839	ret = -ENOENT;
				1840	goto out;
				1841	}
				1842	state->failrec = failrec;
				1843	out:
				1844	spin_unlock(&tree->lock);
				1845	return ret;
				1846	}
				1847
				1848	static noinline int get_state_failrec(struct extent_io_tree *tree, u64 start,
				1849	struct io_failure_record **failrec)
				1850	{
				1851	struct rb_node *node;
				1852	struct extent_state *state;
				1853	int ret = 0;
				1854
				1855	spin_lock(&tree->lock);
				1856	/*
				1857	* this search will find all the extents that end after
				1858	* our range starts.
				1859	*/
				1860	node = tree_search(tree, start);
				1861	if (!node) {
				1862	ret = -ENOENT;
				1863	goto out;
				1864	}
				1865	state = rb_entry(node, struct extent_state, rb_node);
				1866	if (state->start != start) {
				1867	ret = -ENOENT;
				1868	goto out;
				1869	}
				1870	*failrec = state->failrec;
				1871	out:
				1872	spin_unlock(&tree->lock);
				1873	return ret;
				1874	}
				1875
				1876	/*
				1877	* searches a range in the state tree for a given mask.
				1878	* If 'filled' == 1, this returns 1 only if every extent in the tree
				1879	* has the bits set. Otherwise, 1 is returned if any bit in the
				1880	* range is found set.
				1881	*/
				1882	int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
				1883	unsigned bits, int filled, struct extent_state *cached)
				1884	{
				1885	struct extent_state *state = NULL;
				1886	struct rb_node *node;
				1887	int bitset = 0;
				1888
				1889	spin_lock(&tree->lock);
				1890	if (cached && extent_state_in_tree(cached) && cached->start <= start &&
				1891	cached->end > start)
				1892	node = &cached->rb_node;
				1893	else
				1894	node = tree_search(tree, start);
				1895	while (node && start <= end) {
				1896	state = rb_entry(node, struct extent_state, rb_node);
				1897
				1898	if (filled && state->start > start) {
				1899	bitset = 0;
				1900	break;
				1901	}
				1902
				1903	if (state->start > end)
				1904	break;
				1905
				1906	if (state->state & bits) {
				1907	bitset = 1;
				1908	if (!filled)
				1909	break;
				1910	} else if (filled) {
				1911	bitset = 0;
				1912	break;
				1913	}
				1914
				1915	if (state->end == (u64)-1)
				1916	break;
				1917
				1918	start = state->end + 1;
				1919	if (start > end)
				1920	break;
				1921	node = rb_next(node);
				1922	if (!node) {
				1923	if (filled)
				1924	bitset = 0;
				1925	break;
				1926	}
				1927	}
				1928	spin_unlock(&tree->lock);
				1929	return bitset;
				1930	}
				1931
				1932	/*
				1933	* helper function to set a given page up to date if all the
				1934	* extents in the tree for that page are up to date
				1935	*/
				1936	static void check_page_uptodate(struct extent_io_tree tree, struct page page)
				1937	{
				1938	u64 start = page_offset(page);
				1939	u64 end = start + PAGE_SIZE - 1;
				1940	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
				1941	SetPageUptodate(page);
				1942	}
				1943
				1944	int free_io_failure(struct extent_io_tree *failure_tree,
				1945	struct extent_io_tree *io_tree,
				1946	struct io_failure_record *rec)
				1947	{
				1948	int ret;
				1949	int err = 0;
				1950
				1951	set_state_failrec(failure_tree, rec->start, NULL);
				1952	ret = clear_extent_bits(failure_tree, rec->start,
				1953	rec->start + rec->len - 1,
				1954	EXTENT_LOCKED \| EXTENT_DIRTY);
				1955	if (ret)
				1956	err = ret;
				1957
				1958	ret = clear_extent_bits(io_tree, rec->start,
				1959	rec->start + rec->len - 1,
				1960	EXTENT_DAMAGED);
				1961	if (ret && !err)
				1962	err = ret;
				1963
				1964	kfree(rec);
				1965	return err;
				1966	}
				1967
				1968	/*
				1969	* this bypasses the standard btrfs submit functions deliberately, as
				1970	* the standard behavior is to write all copies in a raid setup. here we only
				1971	* want to write the one bad copy. so we do the mapping for ourselves and issue
				1972	* submit_bio directly.
				1973	* to avoid any synchronization issues, wait for the data after writing, which
				1974	* actually prevents the read that triggered the error from finishing.
				1975	* currently, there can be no more than two copies of every data bit. thus,
				1976	* exactly one rewrite is required.
				1977	*/
				1978	int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
				1979	u64 length, u64 logical, struct page *page,
				1980	unsigned int pg_offset, int mirror_num)
				1981	{
				1982	struct bio *bio;
				1983	struct btrfs_device *dev;
				1984	u64 map_length = 0;
				1985	u64 sector;
				1986	struct btrfs_bio *bbio = NULL;
				1987	int ret;
				1988
				1989	ASSERT(!(fs_info->sb->s_flags & MS_RDONLY));
				1990	BUG_ON(!mirror_num);
				1991
				1992	bio = btrfs_io_bio_alloc(1);
				1993	bio->bi_iter.bi_size = 0;
				1994	map_length = length;
				1995
				1996	/*
				1997	* Avoid races with device replace and make sure our bbio has devices
				1998	* associated to its stripes that don't go away while we are doing the
				1999	* read repair operation.
				2000	*/
				2001	btrfs_bio_counter_inc_blocked(fs_info);
				2002	if (btrfs_is_parity_mirror(fs_info, logical, length)) {
				2003	/*
				2004	* Note that we don't use BTRFS_MAP_WRITE because it's supposed
				2005	* to update all raid stripes, but here we just want to correct
				2006	* bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
				2007	* stripe's dev and sector.
				2008	*/
				2009	ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
				2010	&map_length, &bbio, 0);
				2011	if (ret) {
				2012	btrfs_bio_counter_dec(fs_info);
				2013	bio_put(bio);
				2014	return -EIO;
				2015	}
				2016	ASSERT(bbio->mirror_num == 1);
				2017	} else {
				2018	ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
				2019	&map_length, &bbio, mirror_num);
				2020	if (ret) {
				2021	btrfs_bio_counter_dec(fs_info);
				2022	bio_put(bio);
				2023	return -EIO;
				2024	}
				2025	BUG_ON(mirror_num != bbio->mirror_num);
				2026	}
				2027
				2028	sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9;
				2029	bio->bi_iter.bi_sector = sector;
				2030	dev = bbio->stripes[bbio->mirror_num - 1].dev;
				2031	btrfs_put_bbio(bbio);
				2032	if (!dev \|\| !dev->bdev \|\| !dev->writeable) {
				2033	btrfs_bio_counter_dec(fs_info);
				2034	bio_put(bio);
				2035	return -EIO;
				2036	}
				2037	bio_set_dev(bio, dev->bdev);
				2038	bio->bi_opf = REQ_OP_WRITE \| REQ_SYNC;
				2039	bio_add_page(bio, page, length, pg_offset);
				2040
				2041	if (btrfsic_submit_bio_wait(bio)) {
				2042	/* try to remap that extent elsewhere? */
				2043	btrfs_bio_counter_dec(fs_info);
				2044	bio_put(bio);
				2045	btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
				2046	return -EIO;
				2047	}
				2048
				2049	btrfs_info_rl_in_rcu(fs_info,
				2050	"read error corrected: ino %llu off %llu (dev %s sector %llu)",
				2051	ino, start,
				2052	rcu_str_deref(dev->name), sector);
				2053	btrfs_bio_counter_dec(fs_info);
				2054	bio_put(bio);
				2055	return 0;
				2056	}
				2057
				2058	int repair_eb_io_failure(struct btrfs_fs_info *fs_info,
				2059	struct extent_buffer *eb, int mirror_num)
				2060	{
				2061	u64 start = eb->start;
				2062	unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
				2063	int ret = 0;
				2064
				2065	if (sb_rdonly(fs_info->sb))
				2066	return -EROFS;
				2067
				2068	for (i = 0; i < num_pages; i++) {
				2069	struct page *p = eb->pages[i];
				2070
				2071	ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
				2072	start - page_offset(p), mirror_num);
				2073	if (ret)
				2074	break;
				2075	start += PAGE_SIZE;
				2076	}
				2077
				2078	return ret;
				2079	}
				2080
				2081	/*
				2082	* each time an IO finishes, we do a fast check in the IO failure tree
				2083	* to see if we need to process or clean up an io_failure_record
				2084	*/
				2085	int clean_io_failure(struct btrfs_fs_info *fs_info,
				2086	struct extent_io_tree *failure_tree,
				2087	struct extent_io_tree *io_tree, u64 start,
				2088	struct page *page, u64 ino, unsigned int pg_offset)
				2089	{
				2090	u64 private;
				2091	struct io_failure_record *failrec;
				2092	struct extent_state *state;
				2093	int num_copies;
				2094	int ret;
				2095
				2096	private = 0;
				2097	ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
				2098	EXTENT_DIRTY, 0);
				2099	if (!ret)
				2100	return 0;
				2101
				2102	ret = get_state_failrec(failure_tree, start, &failrec);
				2103	if (ret)
				2104	return 0;
				2105
				2106	BUG_ON(!failrec->this_mirror);
				2107
				2108	if (failrec->in_validation) {
				2109	/* there was no real error, just free the record */
				2110	btrfs_debug(fs_info,
				2111	"clean_io_failure: freeing dummy error at %llu",
				2112	failrec->start);
				2113	goto out;
				2114	}
				2115	if (sb_rdonly(fs_info->sb))
				2116	goto out;
				2117
				2118	spin_lock(&io_tree->lock);
				2119	state = find_first_extent_bit_state(io_tree,
				2120	failrec->start,
				2121	EXTENT_LOCKED);
				2122	spin_unlock(&io_tree->lock);
				2123
				2124	if (state && state->start <= failrec->start &&
				2125	state->end >= failrec->start + failrec->len - 1) {
				2126	num_copies = btrfs_num_copies(fs_info, failrec->logical,
				2127	failrec->len);
				2128	if (num_copies > 1) {
				2129	repair_io_failure(fs_info, ino, start, failrec->len,
				2130	failrec->logical, page, pg_offset,
				2131	failrec->failed_mirror);
				2132	}
				2133	}
				2134
				2135	out:
				2136	free_io_failure(failure_tree, io_tree, failrec);
				2137
				2138	return 0;
				2139	}
				2140
				2141	/*
				2142	* Can be called when
				2143	* - hold extent lock
				2144	* - under ordered extent
				2145	* - the inode is freeing
				2146	*/
				2147	void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
				2148	{
				2149	struct extent_io_tree *failure_tree = &inode->io_failure_tree;
				2150	struct io_failure_record *failrec;
				2151	struct extent_state state, next;
				2152
				2153	if (RB_EMPTY_ROOT(&failure_tree->state))
				2154	return;
				2155
				2156	spin_lock(&failure_tree->lock);
				2157	state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
				2158	while (state) {
				2159	if (state->start > end)
				2160	break;
				2161
				2162	ASSERT(state->end <= end);
				2163
				2164	next = next_state(state);
				2165
				2166	failrec = state->failrec;
				2167	free_extent_state(state);
				2168	kfree(failrec);
				2169
				2170	state = next;
				2171	}
				2172	spin_unlock(&failure_tree->lock);
				2173	}
				2174
				2175	int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end,
				2176	struct io_failure_record **failrec_ret)
				2177	{
				2178	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				2179	struct io_failure_record *failrec;
				2180	struct extent_map *em;
				2181	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
				2182	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
				2183	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
				2184	int ret;
				2185	u64 logical;
				2186
				2187	ret = get_state_failrec(failure_tree, start, &failrec);
				2188	if (ret) {
				2189	failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
				2190	if (!failrec)
				2191	return -ENOMEM;
				2192
				2193	failrec->start = start;
				2194	failrec->len = end - start + 1;
				2195	failrec->this_mirror = 0;
				2196	failrec->bio_flags = 0;
				2197	failrec->in_validation = 0;
				2198
				2199	read_lock(&em_tree->lock);
				2200	em = lookup_extent_mapping(em_tree, start, failrec->len);
				2201	if (!em) {
				2202	read_unlock(&em_tree->lock);
				2203	kfree(failrec);
				2204	return -EIO;
				2205	}
				2206
				2207	if (em->start > start \|\| em->start + em->len <= start) {
				2208	free_extent_map(em);
				2209	em = NULL;
				2210	}
				2211	read_unlock(&em_tree->lock);
				2212	if (!em) {
				2213	kfree(failrec);
				2214	return -EIO;
				2215	}
				2216
				2217	logical = start - em->start;
				2218	logical = em->block_start + logical;
				2219	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
				2220	logical = em->block_start;
				2221	failrec->bio_flags = EXTENT_BIO_COMPRESSED;
				2222	extent_set_compress_type(&failrec->bio_flags,
				2223	em->compress_type);
				2224	}
				2225
				2226	btrfs_debug(fs_info,
				2227	"Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu",
				2228	logical, start, failrec->len);
				2229
				2230	failrec->logical = logical;
				2231	free_extent_map(em);
				2232
				2233	/* set the bits in the private failure tree */
				2234	ret = set_extent_bits(failure_tree, start, end,
				2235	EXTENT_LOCKED \| EXTENT_DIRTY);
				2236	if (ret >= 0)
				2237	ret = set_state_failrec(failure_tree, start, failrec);
				2238	/* set the bits in the inode's tree */
				2239	if (ret >= 0)
				2240	ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED);
				2241	if (ret < 0) {
				2242	kfree(failrec);
				2243	return ret;
				2244	}
				2245	} else {
				2246	btrfs_debug(fs_info,
				2247	"Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d",
				2248	failrec->logical, failrec->start, failrec->len,
				2249	failrec->in_validation);
				2250	/*
				2251	* when data can be on disk more than twice, add to failrec here
				2252	* (e.g. with a list for failed_mirror) to make
				2253	* clean_io_failure() clean all those errors at once.
				2254	*/
				2255	}
				2256
				2257	*failrec_ret = failrec;
				2258
				2259	return 0;
				2260	}
				2261
				2262	bool btrfs_check_repairable(struct inode inode, struct bio failed_bio,
				2263	struct io_failure_record *failrec, int failed_mirror)
				2264	{
				2265	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				2266	int num_copies;
				2267
				2268	num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
				2269	if (num_copies == 1) {
				2270	/*
				2271	* we only have a single copy of the data, so don't bother with
				2272	* all the retry and error correction code that follows. no
				2273	* matter what the error is, it is very likely to persist.
				2274	*/
				2275	btrfs_debug(fs_info,
				2276	"Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
				2277	num_copies, failrec->this_mirror, failed_mirror);
				2278	return false;
				2279	}
				2280
				2281	/*
				2282	* there are two premises:
				2283	* a) deliver good data to the caller
				2284	* b) correct the bad sectors on disk
				2285	*/
				2286	if (failed_bio->bi_vcnt > 1) {
				2287	/*
				2288	* to fulfill b), we need to know the exact failing sectors, as
				2289	* we don't want to rewrite any more than the failed ones. thus,
				2290	* we need separate read requests for the failed bio
				2291	*
				2292	* if the following BUG_ON triggers, our validation request got
				2293	* merged. we need separate requests for our algorithm to work.
				2294	*/
				2295	BUG_ON(failrec->in_validation);
				2296	failrec->in_validation = 1;
				2297	failrec->this_mirror = failed_mirror;
				2298	} else {
				2299	/*
				2300	* we're ready to fulfill a) and b) alongside. get a good copy
				2301	* of the failed sector and if we succeed, we have setup
				2302	* everything for repair_io_failure to do the rest for us.
				2303	*/
				2304	if (failrec->in_validation) {
				2305	BUG_ON(failrec->this_mirror != failed_mirror);
				2306	failrec->in_validation = 0;
				2307	failrec->this_mirror = 0;
				2308	}
				2309	failrec->failed_mirror = failed_mirror;
				2310	failrec->this_mirror++;
				2311	if (failrec->this_mirror == failed_mirror)
				2312	failrec->this_mirror++;
				2313	}
				2314
				2315	if (failrec->this_mirror > num_copies) {
				2316	btrfs_debug(fs_info,
				2317	"Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
				2318	num_copies, failrec->this_mirror, failed_mirror);
				2319	return false;
				2320	}
				2321
				2322	return true;
				2323	}
				2324
				2325
				2326	struct bio btrfs_create_repair_bio(struct inode inode, struct bio *failed_bio,
				2327	struct io_failure_record *failrec,
				2328	struct page *page, int pg_offset, int icsum,
				2329	bio_end_io_t endio_func, void data)
				2330	{
				2331	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				2332	struct bio *bio;
				2333	struct btrfs_io_bio *btrfs_failed_bio;
				2334	struct btrfs_io_bio *btrfs_bio;
				2335
				2336	bio = btrfs_io_bio_alloc(1);
				2337	bio->bi_end_io = endio_func;
				2338	bio->bi_iter.bi_sector = failrec->logical >> 9;
				2339	bio_set_dev(bio, fs_info->fs_devices->latest_bdev);
				2340	bio->bi_iter.bi_size = 0;
				2341	bio->bi_private = data;
				2342
				2343	btrfs_failed_bio = btrfs_io_bio(failed_bio);
				2344	if (btrfs_failed_bio->csum) {
				2345	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
				2346
				2347	btrfs_bio = btrfs_io_bio(bio);
				2348	btrfs_bio->csum = btrfs_bio->csum_inline;
				2349	icsum *= csum_size;
				2350	memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum,
				2351	csum_size);
				2352	}
				2353
				2354	bio_add_page(bio, page, failrec->len, pg_offset);
				2355
				2356	return bio;
				2357	}
				2358
				2359	/*
				2360	* this is a generic handler for readpage errors (default
				2361	* readpage_io_failed_hook). if other copies exist, read those and write back
				2362	* good data to the failed position. does not investigate in remapping the
				2363	* failed extent elsewhere, hoping the device will be smart enough to do this as
				2364	* needed
				2365	*/
				2366
				2367	static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
				2368	struct page *page, u64 start, u64 end,
				2369	int failed_mirror)
				2370	{
				2371	struct io_failure_record *failrec;
				2372	struct inode *inode = page->mapping->host;
				2373	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
				2374	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
				2375	struct bio *bio;
				2376	int read_mode = 0;
				2377	blk_status_t status;
				2378	int ret;
				2379
				2380	BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
				2381
				2382	ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
				2383	if (ret)
				2384	return ret;
				2385
				2386	if (!btrfs_check_repairable(inode, failed_bio, failrec,
				2387	failed_mirror)) {
				2388	free_io_failure(failure_tree, tree, failrec);
				2389	return -EIO;
				2390	}
				2391
				2392	if (failed_bio->bi_vcnt > 1)
				2393	read_mode \|= REQ_FAILFAST_DEV;
				2394
				2395	phy_offset >>= inode->i_sb->s_blocksize_bits;
				2396	bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
				2397	start - page_offset(page),
				2398	(int)phy_offset, failed_bio->bi_end_io,
				2399	NULL);
				2400	bio_set_op_attrs(bio, REQ_OP_READ, read_mode);
				2401
				2402	btrfs_debug(btrfs_sb(inode->i_sb),
				2403	"Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d",
				2404	read_mode, failrec->this_mirror, failrec->in_validation);
				2405
				2406	status = tree->ops->submit_bio_hook(tree->private_data, bio, failrec->this_mirror,
				2407	failrec->bio_flags, 0);
				2408	if (status) {
				2409	free_io_failure(failure_tree, tree, failrec);
				2410	bio_put(bio);
				2411	ret = blk_status_to_errno(status);
				2412	}
				2413
				2414	return ret;
				2415	}
				2416
				2417	/* lots and lots of room for performance fixes in the end_bio funcs */
				2418
				2419	void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
				2420	{
				2421	int uptodate = (err == 0);
				2422	struct extent_io_tree *tree;
				2423	int ret = 0;
				2424
				2425	tree = &BTRFS_I(page->mapping->host)->io_tree;
				2426
				2427	if (tree->ops && tree->ops->writepage_end_io_hook)
				2428	tree->ops->writepage_end_io_hook(page, start, end, NULL,
				2429	uptodate);
				2430
				2431	if (!uptodate) {
				2432	ClearPageUptodate(page);
				2433	SetPageError(page);
				2434	ret = err < 0 ? err : -EIO;
				2435	mapping_set_error(page->mapping, ret);
				2436	}
				2437	}
				2438
				2439	/*
				2440	* after a writepage IO is done, we need to:
				2441	* clear the uptodate bits on error
				2442	* clear the writeback bits in the extent tree for this IO
				2443	* end_page_writeback if the page has no more pending IO
				2444	*
				2445	* Scheduling is not allowed, so the extent state tree is expected
				2446	* to have one and only one object corresponding to this IO.
				2447	*/
				2448	static void end_bio_extent_writepage(struct bio *bio)
				2449	{
				2450	int error = blk_status_to_errno(bio->bi_status);
				2451	struct bio_vec *bvec;
				2452	u64 start;
				2453	u64 end;
				2454	int i;
				2455
				2456	ASSERT(!bio_flagged(bio, BIO_CLONED));
				2457	bio_for_each_segment_all(bvec, bio, i) {
				2458	struct page *page = bvec->bv_page;
				2459	struct inode *inode = page->mapping->host;
				2460	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				2461
				2462	/* We always issue full-page reads, but if some block
				2463	* in a page fails to read, blk_update_request() will
				2464	* advance bv_offset and adjust bv_len to compensate.
				2465	* Print a warning for nonzero offsets, and an error
				2466	* if they don't add up to a full page. */
				2467	if (bvec->bv_offset \|\| bvec->bv_len != PAGE_SIZE) {
				2468	if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
				2469	btrfs_err(fs_info,
				2470	"partial page write in btrfs with offset %u and length %u",
				2471	bvec->bv_offset, bvec->bv_len);
				2472	else
				2473	btrfs_info(fs_info,
				2474	"incomplete page write in btrfs with offset %u and length %u",
				2475	bvec->bv_offset, bvec->bv_len);
				2476	}
				2477
				2478	start = page_offset(page);
				2479	end = start + bvec->bv_offset + bvec->bv_len - 1;
				2480
				2481	end_extent_writepage(page, error, start, end);
				2482	end_page_writeback(page);
				2483	}
				2484
				2485	bio_put(bio);
				2486	}
				2487
				2488	static void
				2489	endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
				2490	int uptodate)
				2491	{
				2492	struct extent_state *cached = NULL;
				2493	u64 end = start + len - 1;
				2494
				2495	if (uptodate && tree->track_uptodate)
				2496	set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
				2497	unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
				2498	}
				2499
				2500	/*
				2501	* after a readpage IO is done, we need to:
				2502	* clear the uptodate bits on error
				2503	* set the uptodate bits if things worked
				2504	* set the page up to date if all extents in the tree are uptodate
				2505	* clear the lock bit in the extent tree
				2506	* unlock the page if there are no other extents locked for it
				2507	*
				2508	* Scheduling is not allowed, so the extent state tree is expected
				2509	* to have one and only one object corresponding to this IO.
				2510	*/
				2511	static void end_bio_extent_readpage(struct bio *bio)
				2512	{
				2513	struct bio_vec *bvec;
				2514	int uptodate = !bio->bi_status;
				2515	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
				2516	struct extent_io_tree tree, failure_tree;
				2517	u64 offset = 0;
				2518	u64 start;
				2519	u64 end;
				2520	u64 len;
				2521	u64 extent_start = 0;
				2522	u64 extent_len = 0;
				2523	int mirror;
				2524	int ret;
				2525	int i;
				2526
				2527	ASSERT(!bio_flagged(bio, BIO_CLONED));
				2528	bio_for_each_segment_all(bvec, bio, i) {
				2529	struct page *page = bvec->bv_page;
				2530	struct inode *inode = page->mapping->host;
				2531	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				2532
				2533	btrfs_debug(fs_info,
				2534	"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
				2535	(u64)bio->bi_iter.bi_sector, bio->bi_status,
				2536	io_bio->mirror_num);
				2537	tree = &BTRFS_I(inode)->io_tree;
				2538	failure_tree = &BTRFS_I(inode)->io_failure_tree;
				2539
				2540	/* We always issue full-page reads, but if some block
				2541	* in a page fails to read, blk_update_request() will
				2542	* advance bv_offset and adjust bv_len to compensate.
				2543	* Print a warning for nonzero offsets, and an error
				2544	* if they don't add up to a full page. */
				2545	if (bvec->bv_offset \|\| bvec->bv_len != PAGE_SIZE) {
				2546	if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE)
				2547	btrfs_err(fs_info,
				2548	"partial page read in btrfs with offset %u and length %u",
				2549	bvec->bv_offset, bvec->bv_len);
				2550	else
				2551	btrfs_info(fs_info,
				2552	"incomplete page read in btrfs with offset %u and length %u",
				2553	bvec->bv_offset, bvec->bv_len);
				2554	}
				2555
				2556	start = page_offset(page);
				2557	end = start + bvec->bv_offset + bvec->bv_len - 1;
				2558	len = bvec->bv_len;
				2559
				2560	mirror = io_bio->mirror_num;
				2561	if (likely(uptodate && tree->ops)) {
				2562	ret = tree->ops->readpage_end_io_hook(io_bio, offset,
				2563	page, start, end,
				2564	mirror);
				2565	if (ret)
				2566	uptodate = 0;
				2567	else
				2568	clean_io_failure(BTRFS_I(inode)->root->fs_info,
				2569	failure_tree, tree, start,
				2570	page,
				2571	btrfs_ino(BTRFS_I(inode)), 0);
				2572	}
				2573
				2574	if (likely(uptodate))
				2575	goto readpage_ok;
				2576
				2577	if (tree->ops) {
				2578	ret = tree->ops->readpage_io_failed_hook(page, mirror);
				2579	if (ret == -EAGAIN) {
				2580	/*
				2581	* Data inode's readpage_io_failed_hook() always
				2582	* returns -EAGAIN.
				2583	*
				2584	* The generic bio_readpage_error handles errors
				2585	* the following way: If possible, new read
				2586	* requests are created and submitted and will
				2587	* end up in end_bio_extent_readpage as well (if
				2588	* we're lucky, not in the !uptodate case). In
				2589	* that case it returns 0 and we just go on with
				2590	* the next page in our bio. If it can't handle
				2591	* the error it will return -EIO and we remain
				2592	* responsible for that page.
				2593	*/
				2594	ret = bio_readpage_error(bio, offset, page,
				2595	start, end, mirror);
				2596	if (ret == 0) {
				2597	uptodate = !bio->bi_status;
				2598	offset += len;
				2599	continue;
				2600	}
				2601	}
				2602
				2603	/*
				2604	* metadata's readpage_io_failed_hook() always returns
				2605	* -EIO and fixes nothing. -EIO is also returned if
				2606	* data inode error could not be fixed.
				2607	*/
				2608	ASSERT(ret == -EIO);
				2609	}
				2610	readpage_ok:
				2611	if (likely(uptodate)) {
				2612	loff_t i_size = i_size_read(inode);
				2613	pgoff_t end_index = i_size >> PAGE_SHIFT;
				2614	unsigned off;
				2615
				2616	/* Zero out the end if this page straddles i_size */
				2617	off = i_size & (PAGE_SIZE-1);
				2618	if (page->index == end_index && off)
				2619	zero_user_segment(page, off, PAGE_SIZE);
				2620	SetPageUptodate(page);
				2621	} else {
				2622	ClearPageUptodate(page);
				2623	SetPageError(page);
				2624	}
				2625	unlock_page(page);
				2626	offset += len;
				2627
				2628	if (unlikely(!uptodate)) {
				2629	if (extent_len) {
				2630	endio_readpage_release_extent(tree,
				2631	extent_start,
				2632	extent_len, 1);
				2633	extent_start = 0;
				2634	extent_len = 0;
				2635	}
				2636	endio_readpage_release_extent(tree, start,
				2637	end - start + 1, 0);
				2638	} else if (!extent_len) {
				2639	extent_start = start;
				2640	extent_len = end + 1 - start;
				2641	} else if (extent_start + extent_len == start) {
				2642	extent_len += end + 1 - start;
				2643	} else {
				2644	endio_readpage_release_extent(tree, extent_start,
				2645	extent_len, uptodate);
				2646	extent_start = start;
				2647	extent_len = end + 1 - start;
				2648	}
				2649	}
				2650
				2651	if (extent_len)
				2652	endio_readpage_release_extent(tree, extent_start, extent_len,
				2653	uptodate);
				2654	if (io_bio->end_io)
				2655	io_bio->end_io(io_bio, blk_status_to_errno(bio->bi_status));
				2656	bio_put(bio);
				2657	}
				2658
				2659	/*
				2660	* Initialize the members up to but not including 'bio'. Use after allocating a
				2661	* new bio by bio_alloc_bioset as it does not initialize the bytes outside of
				2662	* 'bio' because use of __GFP_ZERO is not supported.
				2663	*/
				2664	static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio)
				2665	{
				2666	memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio));
				2667	}
				2668
				2669	/*
				2670	* The following helpers allocate a bio. As it's backed by a bioset, it'll
				2671	* never fail. We're returning a bio right now but you can call btrfs_io_bio
				2672	* for the appropriate container_of magic
				2673	*/
				2674	struct bio btrfs_bio_alloc(struct block_device bdev, u64 first_byte)
				2675	{
				2676	struct bio *bio;
				2677
				2678	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, btrfs_bioset);
				2679	bio_set_dev(bio, bdev);
				2680	bio->bi_iter.bi_sector = first_byte >> 9;
				2681	btrfs_io_bio_init(btrfs_io_bio(bio));
				2682	return bio;
				2683	}
				2684
				2685	struct bio btrfs_bio_clone(struct bio bio)
				2686	{
				2687	struct btrfs_io_bio *btrfs_bio;
				2688	struct bio *new;
				2689
				2690	/* Bio allocation backed by a bioset does not fail */
				2691	new = bio_clone_fast(bio, GFP_NOFS, btrfs_bioset);
				2692	btrfs_bio = btrfs_io_bio(new);
				2693	btrfs_io_bio_init(btrfs_bio);
				2694	btrfs_bio->iter = bio->bi_iter;
				2695	return new;
				2696	}
				2697
				2698	struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs)
				2699	{
				2700	struct bio *bio;
				2701
				2702	/* Bio allocation backed by a bioset does not fail */
				2703	bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, btrfs_bioset);
				2704	btrfs_io_bio_init(btrfs_io_bio(bio));
				2705	return bio;
				2706	}
				2707
				2708	struct bio btrfs_bio_clone_partial(struct bio orig, int offset, int size)
				2709	{
				2710	struct bio *bio;
				2711	struct btrfs_io_bio *btrfs_bio;
				2712
				2713	/* this will never fail when it's backed by a bioset */
				2714	bio = bio_clone_fast(orig, GFP_NOFS, btrfs_bioset);
				2715	ASSERT(bio);
				2716
				2717	btrfs_bio = btrfs_io_bio(bio);
				2718	btrfs_io_bio_init(btrfs_bio);
				2719
				2720	bio_trim(bio, offset >> 9, size >> 9);
				2721	btrfs_bio->iter = bio->bi_iter;
				2722	return bio;
				2723	}
				2724
				2725	static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
				2726	unsigned long bio_flags)
				2727	{
				2728	blk_status_t ret = 0;
				2729	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
				2730	struct page *page = bvec->bv_page;
				2731	struct extent_io_tree *tree = bio->bi_private;
				2732	u64 start;
				2733
				2734	start = page_offset(page) + bvec->bv_offset;
				2735
				2736	bio->bi_private = NULL;
				2737	bio_get(bio);
				2738
				2739	if (tree->ops)
				2740	ret = tree->ops->submit_bio_hook(tree->private_data, bio,
				2741	mirror_num, bio_flags, start);
				2742	else
				2743	btrfsic_submit_bio(bio);
				2744
				2745	bio_put(bio);
				2746	return blk_status_to_errno(ret);
				2747	}
				2748
				2749	static int merge_bio(struct extent_io_tree tree, struct page page,
				2750	unsigned long offset, size_t size, struct bio *bio,
				2751	unsigned long bio_flags)
				2752	{
				2753	int ret = 0;
				2754	if (tree->ops)
				2755	ret = tree->ops->merge_bio_hook(page, offset, size, bio,
				2756	bio_flags);
				2757	return ret;
				2758
				2759	}
				2760
				2761	/*
				2762	* @opf: bio REQ_OP_* and REQ_* flags as one value
				2763	*/
				2764	static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree,
				2765	struct writeback_control *wbc,
				2766	struct page *page, sector_t sector,
				2767	size_t size, unsigned long offset,
				2768	struct block_device *bdev,
				2769	struct bio **bio_ret,
				2770	bio_end_io_t end_io_func,
				2771	int mirror_num,
				2772	unsigned long prev_bio_flags,
				2773	unsigned long bio_flags,
				2774	bool force_bio_submit)
				2775	{
				2776	int ret = 0;
				2777	struct bio *bio;
				2778	int contig = 0;
				2779	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
				2780	size_t page_size = min_t(size_t, size, PAGE_SIZE);
				2781
				2782	if (bio_ret && *bio_ret) {
				2783	bio = *bio_ret;
				2784	if (old_compressed)
				2785	contig = bio->bi_iter.bi_sector == sector;
				2786	else
				2787	contig = bio_end_sector(bio) == sector;
				2788
				2789	if (prev_bio_flags != bio_flags \|\| !contig \|\|
				2790	force_bio_submit \|\|
				2791	merge_bio(tree, page, offset, page_size, bio, bio_flags) \|\|
				2792	bio_add_page(bio, page, page_size, offset) < page_size) {
				2793	ret = submit_one_bio(bio, mirror_num, prev_bio_flags);
				2794	if (ret < 0) {
				2795	*bio_ret = NULL;
				2796	return ret;
				2797	}
				2798	bio = NULL;
				2799	} else {
				2800	if (wbc)
				2801	wbc_account_io(wbc, page, page_size);
				2802	return 0;
				2803	}
				2804	}
				2805
				2806	bio = btrfs_bio_alloc(bdev, (u64)sector << 9);
				2807	bio_add_page(bio, page, page_size, offset);
				2808	bio->bi_end_io = end_io_func;
				2809	bio->bi_private = tree;
				2810	bio->bi_write_hint = page->mapping->host->i_write_hint;
				2811	bio->bi_opf = opf;
				2812	if (wbc) {
				2813	wbc_init_bio(wbc, bio);
				2814	wbc_account_io(wbc, page, page_size);
				2815	}
				2816
				2817	if (bio_ret)
				2818	*bio_ret = bio;
				2819	else
				2820	ret = submit_one_bio(bio, mirror_num, bio_flags);
				2821
				2822	return ret;
				2823	}
				2824
				2825	static void attach_extent_buffer_page(struct extent_buffer *eb,
				2826	struct page *page)
				2827	{
				2828	if (!PagePrivate(page)) {
				2829	SetPagePrivate(page);
				2830	get_page(page);
				2831	set_page_private(page, (unsigned long)eb);
				2832	} else {
				2833	WARN_ON(page->private != (unsigned long)eb);
				2834	}
				2835	}
				2836
				2837	void set_page_extent_mapped(struct page *page)
				2838	{
				2839	if (!PagePrivate(page)) {
				2840	SetPagePrivate(page);
				2841	get_page(page);
				2842	set_page_private(page, EXTENT_PAGE_PRIVATE);
				2843	}
				2844	}
				2845
				2846	static struct extent_map *
				2847	__get_extent_map(struct inode inode, struct page page, size_t pg_offset,
				2848	u64 start, u64 len, get_extent_t *get_extent,
				2849	struct extent_map **em_cached)
				2850	{
				2851	struct extent_map *em;
				2852
				2853	if (em_cached && *em_cached) {
				2854	em = *em_cached;
				2855	if (extent_map_in_tree(em) && start >= em->start &&
				2856	start < extent_map_end(em)) {
				2857	refcount_inc(&em->refs);
				2858	return em;
				2859	}
				2860
				2861	free_extent_map(em);
				2862	*em_cached = NULL;
				2863	}
				2864
				2865	em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0);
				2866	if (em_cached && !IS_ERR_OR_NULL(em)) {
				2867	BUG_ON(*em_cached);
				2868	refcount_inc(&em->refs);
				2869	*em_cached = em;
				2870	}
				2871	return em;
				2872	}
				2873	/*
				2874	* basic readpage implementation. Locked extent state structs are inserted
				2875	* into the tree that are removed when the IO is done (by the end_io
				2876	* handlers)
				2877	* XXX JDM: This needs looking at to ensure proper page locking
				2878	* return 0 on success, otherwise return error
				2879	*/
				2880	static int __do_readpage(struct extent_io_tree *tree,
				2881	struct page *page,
				2882	get_extent_t *get_extent,
				2883	struct extent_map **em_cached,
				2884	struct bio **bio, int mirror_num,
				2885	unsigned long *bio_flags, unsigned int read_flags,
				2886	u64 *prev_em_start)
				2887	{
				2888	struct inode *inode = page->mapping->host;
				2889	u64 start = page_offset(page);
				2890	u64 page_end = start + PAGE_SIZE - 1;
				2891	u64 end;
				2892	u64 cur = start;
				2893	u64 extent_offset;
				2894	u64 last_byte = i_size_read(inode);
				2895	u64 block_start;
				2896	u64 cur_end;
				2897	sector_t sector;
				2898	struct extent_map *em;
				2899	struct block_device *bdev;
				2900	int ret = 0;
				2901	int nr = 0;
				2902	size_t pg_offset = 0;
				2903	size_t iosize;
				2904	size_t disk_io_size;
				2905	size_t blocksize = inode->i_sb->s_blocksize;
				2906	unsigned long this_bio_flag = 0;
				2907
				2908	set_page_extent_mapped(page);
				2909
				2910	end = page_end;
				2911	if (!PageUptodate(page)) {
				2912	if (cleancache_get_page(page) == 0) {
				2913	BUG_ON(blocksize != PAGE_SIZE);
				2914	unlock_extent(tree, start, end);
				2915	goto out;
				2916	}
				2917	}
				2918
				2919	if (page->index == last_byte >> PAGE_SHIFT) {
				2920	char *userpage;
				2921	size_t zero_offset = last_byte & (PAGE_SIZE - 1);
				2922
				2923	if (zero_offset) {
				2924	iosize = PAGE_SIZE - zero_offset;
				2925	userpage = kmap_atomic(page);
				2926	memset(userpage + zero_offset, 0, iosize);
				2927	flush_dcache_page(page);
				2928	kunmap_atomic(userpage);
				2929	}
				2930	}
				2931	while (cur <= end) {
				2932	bool force_bio_submit = false;
				2933
				2934	if (cur >= last_byte) {
				2935	char *userpage;
				2936	struct extent_state *cached = NULL;
				2937
				2938	iosize = PAGE_SIZE - pg_offset;
				2939	userpage = kmap_atomic(page);
				2940	memset(userpage + pg_offset, 0, iosize);
				2941	flush_dcache_page(page);
				2942	kunmap_atomic(userpage);
				2943	set_extent_uptodate(tree, cur, cur + iosize - 1,
				2944	&cached, GFP_NOFS);
				2945	unlock_extent_cached(tree, cur,
				2946	cur + iosize - 1,
				2947	&cached, GFP_NOFS);
				2948	break;
				2949	}
				2950	em = __get_extent_map(inode, page, pg_offset, cur,
				2951	end - cur + 1, get_extent, em_cached);
				2952	if (IS_ERR_OR_NULL(em)) {
				2953	SetPageError(page);
				2954	unlock_extent(tree, cur, end);
				2955	break;
				2956	}
				2957	extent_offset = cur - em->start;
				2958	BUG_ON(extent_map_end(em) <= cur);
				2959	BUG_ON(end < cur);
				2960
				2961	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
				2962	this_bio_flag \|= EXTENT_BIO_COMPRESSED;
				2963	extent_set_compress_type(&this_bio_flag,
				2964	em->compress_type);
				2965	}
				2966
				2967	iosize = min(extent_map_end(em) - cur, end - cur + 1);
				2968	cur_end = min(extent_map_end(em) - 1, end);
				2969	iosize = ALIGN(iosize, blocksize);
				2970	if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
				2971	disk_io_size = em->block_len;
				2972	sector = em->block_start >> 9;
				2973	} else {
				2974	sector = (em->block_start + extent_offset) >> 9;
				2975	disk_io_size = iosize;
				2976	}
				2977	bdev = em->bdev;
				2978	block_start = em->block_start;
				2979	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
				2980	block_start = EXTENT_MAP_HOLE;
				2981
				2982	/*
				2983	* If we have a file range that points to a compressed extent
				2984	* and it's followed by a consecutive file range that points to
				2985	* to the same compressed extent (possibly with a different
				2986	* offset and/or length, so it either points to the whole extent
				2987	* or only part of it), we must make sure we do not submit a
				2988	* single bio to populate the pages for the 2 ranges because
				2989	* this makes the compressed extent read zero out the pages
				2990	* belonging to the 2nd range. Imagine the following scenario:
				2991	*
				2992	* File layout
				2993	* [0 - 8K] [8K - 24K]
				2994	* \| \|
				2995	* \| \|
				2996	* points to extent X, points to extent X,
				2997	* offset 4K, length of 8K offset 0, length 16K
				2998	*
				2999	* [extent X, compressed length = 4K uncompressed length = 16K]
				3000	*
				3001	* If the bio to read the compressed extent covers both ranges,
				3002	* it will decompress extent X into the pages belonging to the
				3003	* first range and then it will stop, zeroing out the remaining
				3004	* pages that belong to the other range that points to extent X.
				3005	* So here we make sure we submit 2 bios, one for the first
				3006	* range and another one for the third range. Both will target
				3007	* the same physical extent from disk, but we can't currently
				3008	* make the compressed bio endio callback populate the pages
				3009	* for both ranges because each compressed bio is tightly
				3010	* coupled with a single extent map, and each range can have
				3011	* an extent map with a different offset value relative to the
				3012	* uncompressed data of our extent and different lengths. This
				3013	* is a corner case so we prioritize correctness over
				3014	* non-optimal behavior (submitting 2 bios for the same extent).
				3015	*/
				3016	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
				3017	prev_em_start && *prev_em_start != (u64)-1 &&
				3018	*prev_em_start != em->start)
				3019	force_bio_submit = true;
				3020
				3021	if (prev_em_start)
				3022	*prev_em_start = em->start;
				3023
				3024	free_extent_map(em);
				3025	em = NULL;
				3026
				3027	/* we've found a hole, just zero and go on */
				3028	if (block_start == EXTENT_MAP_HOLE) {
				3029	char *userpage;
				3030	struct extent_state *cached = NULL;
				3031
				3032	userpage = kmap_atomic(page);
				3033	memset(userpage + pg_offset, 0, iosize);
				3034	flush_dcache_page(page);
				3035	kunmap_atomic(userpage);
				3036
				3037	set_extent_uptodate(tree, cur, cur + iosize - 1,
				3038	&cached, GFP_NOFS);
				3039	unlock_extent_cached(tree, cur,
				3040	cur + iosize - 1,
				3041	&cached, GFP_NOFS);
				3042	cur = cur + iosize;
				3043	pg_offset += iosize;
				3044	continue;
				3045	}
				3046	/* the get_extent function already copied into the page */
				3047	if (test_range_bit(tree, cur, cur_end,
				3048	EXTENT_UPTODATE, 1, NULL)) {
				3049	check_page_uptodate(tree, page);
				3050	unlock_extent(tree, cur, cur + iosize - 1);
				3051	cur = cur + iosize;
				3052	pg_offset += iosize;
				3053	continue;
				3054	}
				3055	/* we have an inline extent but it didn't get marked up
				3056	* to date. Error out
				3057	*/
				3058	if (block_start == EXTENT_MAP_INLINE) {
				3059	SetPageError(page);
				3060	unlock_extent(tree, cur, cur + iosize - 1);
				3061	cur = cur + iosize;
				3062	pg_offset += iosize;
				3063	continue;
				3064	}
				3065
				3066	ret = submit_extent_page(REQ_OP_READ \| read_flags, tree, NULL,
				3067	page, sector, disk_io_size, pg_offset,
				3068	bdev, bio,
				3069	end_bio_extent_readpage, mirror_num,
				3070	*bio_flags,
				3071	this_bio_flag,
				3072	force_bio_submit);
				3073	if (!ret) {
				3074	nr++;
				3075	*bio_flags = this_bio_flag;
				3076	} else {
				3077	SetPageError(page);
				3078	unlock_extent(tree, cur, cur + iosize - 1);
				3079	goto out;
				3080	}
				3081	cur = cur + iosize;
				3082	pg_offset += iosize;
				3083	}
				3084	out:
				3085	if (!nr) {
				3086	if (!PageError(page))
				3087	SetPageUptodate(page);
				3088	unlock_page(page);
				3089	}
				3090	return ret;
				3091	}
				3092
				3093	static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
				3094	struct page *pages[], int nr_pages,
				3095	u64 start, u64 end,
				3096	get_extent_t *get_extent,
				3097	struct extent_map **em_cached,
				3098	struct bio **bio, int mirror_num,
				3099	unsigned long *bio_flags,
				3100	u64 *prev_em_start)
				3101	{
				3102	struct inode *inode;
				3103	struct btrfs_ordered_extent *ordered;
				3104	int index;
				3105
				3106	inode = pages[0]->mapping->host;
				3107	while (1) {
				3108	lock_extent(tree, start, end);
				3109	ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
				3110	end - start + 1);
				3111	if (!ordered)
				3112	break;
				3113	unlock_extent(tree, start, end);
				3114	btrfs_start_ordered_extent(inode, ordered, 1);
				3115	btrfs_put_ordered_extent(ordered);
				3116	}
				3117
				3118	for (index = 0; index < nr_pages; index++) {
				3119	__do_readpage(tree, pages[index], get_extent, em_cached, bio,
				3120	mirror_num, bio_flags, 0, prev_em_start);
				3121	put_page(pages[index]);
				3122	}
				3123	}
				3124
				3125	static void __extent_readpages(struct extent_io_tree *tree,
				3126	struct page *pages[],
				3127	int nr_pages, get_extent_t *get_extent,
				3128	struct extent_map **em_cached,
				3129	struct bio **bio, int mirror_num,
				3130	unsigned long *bio_flags,
				3131	u64 *prev_em_start)
				3132	{
				3133	u64 start = 0;
				3134	u64 end = 0;
				3135	u64 page_start;
				3136	int index;
				3137	int first_index = 0;
				3138
				3139	for (index = 0; index < nr_pages; index++) {
				3140	page_start = page_offset(pages[index]);
				3141	if (!end) {
				3142	start = page_start;
				3143	end = start + PAGE_SIZE - 1;
				3144	first_index = index;
				3145	} else if (end + 1 == page_start) {
				3146	end += PAGE_SIZE;
				3147	} else {
				3148	__do_contiguous_readpages(tree, &pages[first_index],
				3149	index - first_index, start,
				3150	end, get_extent, em_cached,
				3151	bio, mirror_num, bio_flags,
				3152	prev_em_start);
				3153	start = page_start;
				3154	end = start + PAGE_SIZE - 1;
				3155	first_index = index;
				3156	}
				3157	}
				3158
				3159	if (end)
				3160	__do_contiguous_readpages(tree, &pages[first_index],
				3161	index - first_index, start,
				3162	end, get_extent, em_cached, bio,
				3163	mirror_num, bio_flags,
				3164	prev_em_start);
				3165	}
				3166
				3167	static int __extent_read_full_page(struct extent_io_tree *tree,
				3168	struct page *page,
				3169	get_extent_t *get_extent,
				3170	struct bio **bio, int mirror_num,
				3171	unsigned long *bio_flags,
				3172	unsigned int read_flags)
				3173	{
				3174	struct inode *inode = page->mapping->host;
				3175	struct btrfs_ordered_extent *ordered;
				3176	u64 start = page_offset(page);
				3177	u64 end = start + PAGE_SIZE - 1;
				3178	int ret;
				3179
				3180	while (1) {
				3181	lock_extent(tree, start, end);
				3182	ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
				3183	PAGE_SIZE);
				3184	if (!ordered)
				3185	break;
				3186	unlock_extent(tree, start, end);
				3187	btrfs_start_ordered_extent(inode, ordered, 1);
				3188	btrfs_put_ordered_extent(ordered);
				3189	}
				3190
				3191	ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
				3192	bio_flags, read_flags, NULL);
				3193	return ret;
				3194	}
				3195
				3196	int extent_read_full_page(struct extent_io_tree tree, struct page page,
				3197	get_extent_t *get_extent, int mirror_num)
				3198	{
				3199	struct bio *bio = NULL;
				3200	unsigned long bio_flags = 0;
				3201	int ret;
				3202
				3203	ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
				3204	&bio_flags, 0);
				3205	if (bio)
				3206	ret = submit_one_bio(bio, mirror_num, bio_flags);
				3207	return ret;
				3208	}
				3209
				3210	static void update_nr_written(struct writeback_control *wbc,
				3211	unsigned long nr_written)
				3212	{
				3213	wbc->nr_to_write -= nr_written;
				3214	}
				3215
				3216	/*
				3217	* helper for __extent_writepage, doing all of the delayed allocation setup.
				3218	*
				3219	* This returns 1 if our fill_delalloc function did all the work required
				3220	* to write the page (copy into inline extent). In this case the IO has
				3221	* been started and the page is already unlocked.
				3222	*
				3223	* This returns 0 if all went well (page still locked)
				3224	* This returns < 0 if there were errors (page still locked)
				3225	*/
				3226	static noinline_for_stack int writepage_delalloc(struct inode *inode,
				3227	struct page page, struct writeback_control wbc,
				3228	struct extent_page_data *epd,
				3229	u64 delalloc_start,
				3230	unsigned long *nr_written)
				3231	{
				3232	struct extent_io_tree *tree = epd->tree;
				3233	u64 page_end = delalloc_start + PAGE_SIZE - 1;
				3234	u64 nr_delalloc;
				3235	u64 delalloc_to_write = 0;
				3236	u64 delalloc_end = 0;
				3237	int ret;
				3238	int page_started = 0;
				3239
				3240	if (epd->extent_locked \|\| !tree->ops \|\| !tree->ops->fill_delalloc)
				3241	return 0;
				3242
				3243	while (delalloc_end < page_end) {
				3244	nr_delalloc = find_lock_delalloc_range(inode, tree,
				3245	page,
				3246	&delalloc_start,
				3247	&delalloc_end,
				3248	BTRFS_MAX_EXTENT_SIZE);
				3249	if (nr_delalloc == 0) {
				3250	delalloc_start = delalloc_end + 1;
				3251	continue;
				3252	}
				3253	ret = tree->ops->fill_delalloc(inode, page,
				3254	delalloc_start,
				3255	delalloc_end,
				3256	&page_started,
				3257	nr_written);
				3258	/* File system has been set read-only */
				3259	if (ret) {
				3260	SetPageError(page);
				3261	/* fill_delalloc should be return < 0 for error
				3262	* but just in case, we use > 0 here meaning the
				3263	* IO is started, so we don't want to return > 0
				3264	* unless things are going well.
				3265	*/
				3266	ret = ret < 0 ? ret : -EIO;
				3267	goto done;
				3268	}
				3269	/*
				3270	* delalloc_end is already one less than the total length, so
				3271	* we don't subtract one from PAGE_SIZE
				3272	*/
				3273	delalloc_to_write += (delalloc_end - delalloc_start +
				3274	PAGE_SIZE) >> PAGE_SHIFT;
				3275	delalloc_start = delalloc_end + 1;
				3276	}
				3277	if (wbc->nr_to_write < delalloc_to_write) {
				3278	int thresh = 8192;
				3279
				3280	if (delalloc_to_write < thresh * 2)
				3281	thresh = delalloc_to_write;
				3282	wbc->nr_to_write = min_t(u64, delalloc_to_write,
				3283	thresh);
				3284	}
				3285
				3286	/* did the fill delalloc function already unlock and start
				3287	* the IO?
				3288	*/
				3289	if (page_started) {
				3290	/*
				3291	* we've unlocked the page, so we can't update
				3292	* the mapping's writeback index, just update
				3293	* nr_to_write.
				3294	*/
				3295	wbc->nr_to_write -= *nr_written;
				3296	return 1;
				3297	}
				3298
				3299	ret = 0;
				3300
				3301	done:
				3302	return ret;
				3303	}
				3304
				3305	/*
				3306	* helper for __extent_writepage. This calls the writepage start hooks,
				3307	* and does the loop to map the page into extents and bios.
				3308	*
				3309	* We return 1 if the IO is started and the page is unlocked,
				3310	* 0 if all went well (page still locked)
				3311	* < 0 if there were errors (page still locked)
				3312	*/
				3313	static noinline_for_stack int __extent_writepage_io(struct inode *inode,
				3314	struct page *page,
				3315	struct writeback_control *wbc,
				3316	struct extent_page_data *epd,
				3317	loff_t i_size,
				3318	unsigned long nr_written,
				3319	unsigned int write_flags, int *nr_ret)
				3320	{
				3321	struct extent_io_tree *tree = epd->tree;
				3322	u64 start = page_offset(page);
				3323	u64 page_end = start + PAGE_SIZE - 1;
				3324	u64 end;
				3325	u64 cur = start;
				3326	u64 extent_offset;
				3327	u64 block_start;
				3328	u64 iosize;
				3329	sector_t sector;
				3330	struct extent_map *em;
				3331	struct block_device *bdev;
				3332	size_t pg_offset = 0;
				3333	size_t blocksize;
				3334	int ret = 0;
				3335	int nr = 0;
				3336	bool compressed;
				3337
				3338	if (tree->ops && tree->ops->writepage_start_hook) {
				3339	ret = tree->ops->writepage_start_hook(page, start,
				3340	page_end);
				3341	if (ret) {
				3342	/* Fixup worker will requeue */
				3343	if (ret == -EBUSY)
				3344	wbc->pages_skipped++;
				3345	else
				3346	redirty_page_for_writepage(wbc, page);
				3347
				3348	update_nr_written(wbc, nr_written);
				3349	unlock_page(page);
				3350	return 1;
				3351	}
				3352	}
				3353
				3354	/*
				3355	* we don't want to touch the inode after unlocking the page,
				3356	* so we update the mapping writeback index now
				3357	*/
				3358	update_nr_written(wbc, nr_written + 1);
				3359
				3360	end = page_end;
				3361	if (i_size <= start) {
				3362	if (tree->ops && tree->ops->writepage_end_io_hook)
				3363	tree->ops->writepage_end_io_hook(page, start,
				3364	page_end, NULL, 1);
				3365	goto done;
				3366	}
				3367
				3368	blocksize = inode->i_sb->s_blocksize;
				3369
				3370	while (cur <= end) {
				3371	u64 em_end;
				3372
				3373	if (cur >= i_size) {
				3374	if (tree->ops && tree->ops->writepage_end_io_hook)
				3375	tree->ops->writepage_end_io_hook(page, cur,
				3376	page_end, NULL, 1);
				3377	break;
				3378	}
				3379	em = epd->get_extent(BTRFS_I(inode), page, pg_offset, cur,
				3380	end - cur + 1, 1);
				3381	if (IS_ERR_OR_NULL(em)) {
				3382	SetPageError(page);
				3383	ret = PTR_ERR_OR_ZERO(em);
				3384	break;
				3385	}
				3386
				3387	extent_offset = cur - em->start;
				3388	em_end = extent_map_end(em);
				3389	BUG_ON(em_end <= cur);
				3390	BUG_ON(end < cur);
				3391	iosize = min(em_end - cur, end - cur + 1);
				3392	iosize = ALIGN(iosize, blocksize);
				3393	sector = (em->block_start + extent_offset) >> 9;
				3394	bdev = em->bdev;
				3395	block_start = em->block_start;
				3396	compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
				3397	free_extent_map(em);
				3398	em = NULL;
				3399
				3400	/*
				3401	* compressed and inline extents are written through other
				3402	* paths in the FS
				3403	*/
				3404	if (compressed \|\| block_start == EXTENT_MAP_HOLE \|\|
				3405	block_start == EXTENT_MAP_INLINE) {
				3406	/*
				3407	* end_io notification does not happen here for
				3408	* compressed extents
				3409	*/
				3410	if (!compressed && tree->ops &&
				3411	tree->ops->writepage_end_io_hook)
				3412	tree->ops->writepage_end_io_hook(page, cur,
				3413	cur + iosize - 1,
				3414	NULL, 1);
				3415	else if (compressed) {
				3416	/* we don't want to end_page_writeback on
				3417	* a compressed extent. this happens
				3418	* elsewhere
				3419	*/
				3420	nr++;
				3421	}
				3422
				3423	cur += iosize;
				3424	pg_offset += iosize;
				3425	continue;
				3426	}
				3427
				3428	set_range_writeback(tree, cur, cur + iosize - 1);
				3429	if (!PageWriteback(page)) {
				3430	btrfs_err(BTRFS_I(inode)->root->fs_info,
				3431	"page %lu not writeback, cur %llu end %llu",
				3432	page->index, cur, end);
				3433	}
				3434
				3435	ret = submit_extent_page(REQ_OP_WRITE \| write_flags, tree, wbc,
				3436	page, sector, iosize, pg_offset,
				3437	bdev, &epd->bio,
				3438	end_bio_extent_writepage,
				3439	0, 0, 0, false);
				3440	if (ret) {
				3441	SetPageError(page);
				3442	if (PageWriteback(page))
				3443	end_page_writeback(page);
				3444	}
				3445
				3446	cur = cur + iosize;
				3447	pg_offset += iosize;
				3448	nr++;
				3449	}
				3450	done:
				3451	*nr_ret = nr;
				3452	return ret;
				3453	}
				3454
				3455	/*
				3456	* the writepage semantics are similar to regular writepage. extent
				3457	* records are inserted to lock ranges in the tree, and as dirty areas
				3458	* are found, they are marked writeback. Then the lock bits are removed
				3459	* and the end_io handler clears the writeback ranges
				3460	*/
				3461	static int __extent_writepage(struct page page, struct writeback_control wbc,
				3462	void *data)
				3463	{
				3464	struct inode *inode = page->mapping->host;
				3465	struct extent_page_data *epd = data;
				3466	u64 start = page_offset(page);
				3467	u64 page_end = start + PAGE_SIZE - 1;
				3468	int ret;
				3469	int nr = 0;
				3470	size_t pg_offset = 0;
				3471	loff_t i_size = i_size_read(inode);
				3472	unsigned long end_index = i_size >> PAGE_SHIFT;
				3473	unsigned int write_flags = 0;
				3474	unsigned long nr_written = 0;
				3475
				3476	write_flags = wbc_to_write_flags(wbc);
				3477
				3478	trace___extent_writepage(page, inode, wbc);
				3479
				3480	WARN_ON(!PageLocked(page));
				3481
				3482	ClearPageError(page);
				3483
				3484	pg_offset = i_size & (PAGE_SIZE - 1);
				3485	if (page->index > end_index \|\|
				3486	(page->index == end_index && !pg_offset)) {
				3487	page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
				3488	unlock_page(page);
				3489	return 0;
				3490	}
				3491
				3492	if (page->index == end_index) {
				3493	char *userpage;
				3494
				3495	userpage = kmap_atomic(page);
				3496	memset(userpage + pg_offset, 0,
				3497	PAGE_SIZE - pg_offset);
				3498	kunmap_atomic(userpage);
				3499	flush_dcache_page(page);
				3500	}
				3501
				3502	pg_offset = 0;
				3503
				3504	set_page_extent_mapped(page);
				3505
				3506	ret = writepage_delalloc(inode, page, wbc, epd, start, &nr_written);
				3507	if (ret == 1)
				3508	goto done_unlocked;
				3509	if (ret)
				3510	goto done;
				3511
				3512	ret = __extent_writepage_io(inode, page, wbc, epd,
				3513	i_size, nr_written, write_flags, &nr);
				3514	if (ret == 1)
				3515	goto done_unlocked;
				3516
				3517	done:
				3518	if (nr == 0) {
				3519	/* make sure the mapping tag for page dirty gets cleared */
				3520	set_page_writeback(page);
				3521	end_page_writeback(page);
				3522	}
				3523	if (PageError(page)) {
				3524	ret = ret < 0 ? ret : -EIO;
				3525	end_extent_writepage(page, ret, start, page_end);
				3526	}
				3527	unlock_page(page);
				3528	return ret;
				3529
				3530	done_unlocked:
				3531	return 0;
				3532	}
				3533
				3534	void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
				3535	{
				3536	wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
				3537	TASK_UNINTERRUPTIBLE);
				3538	}
				3539
				3540	static noinline_for_stack int
				3541	lock_extent_buffer_for_io(struct extent_buffer *eb,
				3542	struct btrfs_fs_info *fs_info,
				3543	struct extent_page_data *epd)
				3544	{
				3545	unsigned long i, num_pages;
				3546	int flush = 0;
				3547	int ret = 0;
				3548
				3549	if (!btrfs_try_tree_write_lock(eb)) {
				3550	flush = 1;
				3551	flush_write_bio(epd);
				3552	btrfs_tree_lock(eb);
				3553	}
				3554
				3555	if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
				3556	btrfs_tree_unlock(eb);
				3557	if (!epd->sync_io)
				3558	return 0;
				3559	if (!flush) {
				3560	flush_write_bio(epd);
				3561	flush = 1;
				3562	}
				3563	while (1) {
				3564	wait_on_extent_buffer_writeback(eb);
				3565	btrfs_tree_lock(eb);
				3566	if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
				3567	break;
				3568	btrfs_tree_unlock(eb);
				3569	}
				3570	}
				3571
				3572	/*
				3573	* We need to do this to prevent races in people who check if the eb is
				3574	* under IO since we can end up having no IO bits set for a short period
				3575	* of time.
				3576	*/
				3577	spin_lock(&eb->refs_lock);
				3578	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
				3579	set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
				3580	spin_unlock(&eb->refs_lock);
				3581	btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
				3582	percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
				3583	-eb->len,
				3584	fs_info->dirty_metadata_batch);
				3585	ret = 1;
				3586	} else {
				3587	spin_unlock(&eb->refs_lock);
				3588	}
				3589
				3590	btrfs_tree_unlock(eb);
				3591
				3592	if (!ret)
				3593	return ret;
				3594
				3595	num_pages = num_extent_pages(eb->start, eb->len);
				3596	for (i = 0; i < num_pages; i++) {
				3597	struct page *p = eb->pages[i];
				3598
				3599	if (!trylock_page(p)) {
				3600	if (!flush) {
				3601	flush_write_bio(epd);
				3602	flush = 1;
				3603	}
				3604	lock_page(p);
				3605	}
				3606	}
				3607
				3608	return ret;
				3609	}
				3610
				3611	static void end_extent_buffer_writeback(struct extent_buffer *eb)
				3612	{
				3613	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
				3614	smp_mb__after_atomic();
				3615	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
				3616	}
				3617
				3618	static void set_btree_ioerr(struct page *page)
				3619	{
				3620	struct extent_buffer eb = (struct extent_buffer )page->private;
				3621
				3622	SetPageError(page);
				3623	if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
				3624	return;
				3625
				3626	/*
				3627	* If writeback for a btree extent that doesn't belong to a log tree
				3628	* failed, increment the counter transaction->eb_write_errors.
				3629	* We do this because while the transaction is running and before it's
				3630	* committing (when we call filemap_fdata[write\|wait]_range against
				3631	* the btree inode), we might have
				3632	* btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
				3633	* returns an error or an error happens during writeback, when we're
				3634	* committing the transaction we wouldn't know about it, since the pages
				3635	* can be no longer dirty nor marked anymore for writeback (if a
				3636	* subsequent modification to the extent buffer didn't happen before the
				3637	* transaction commit), which makes filemap_fdata[write\|wait]_range not
				3638	* able to find the pages tagged with SetPageError at transaction
				3639	* commit time. So if this happens we must abort the transaction,
				3640	* otherwise we commit a super block with btree roots that point to
				3641	* btree nodes/leafs whose content on disk is invalid - either garbage
				3642	* or the content of some node/leaf from a past generation that got
				3643	* cowed or deleted and is no longer valid.
				3644	*
				3645	* Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
				3646	* not be enough - we need to distinguish between log tree extents vs
				3647	* non-log tree extents, and the next filemap_fdatawait_range() call
				3648	* will catch and clear such errors in the mapping - and that call might
				3649	* be from a log sync and not from a transaction commit. Also, checking
				3650	* for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
				3651	* not done and would not be reliable - the eb might have been released
				3652	* from memory and reading it back again means that flag would not be
				3653	* set (since it's a runtime flag, not persisted on disk).
				3654	*
				3655	* Using the flags below in the btree inode also makes us achieve the
				3656	* goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
				3657	* writeback for all dirty pages and before filemap_fdatawait_range()
				3658	* is called, the writeback for all dirty pages had already finished
				3659	* with errors - because we were not using AS_EIO/AS_ENOSPC,
				3660	* filemap_fdatawait_range() would return success, as it could not know
				3661	* that writeback errors happened (the pages were no longer tagged for
				3662	* writeback).
				3663	*/
				3664	switch (eb->log_index) {
				3665	case -1:
				3666	set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags);
				3667	break;
				3668	case 0:
				3669	set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags);
				3670	break;
				3671	case 1:
				3672	set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags);
				3673	break;
				3674	default:
				3675	BUG(); /* unexpected, logic error */
				3676	}
				3677	}
				3678
				3679	static void end_bio_extent_buffer_writepage(struct bio *bio)
				3680	{
				3681	struct bio_vec *bvec;
				3682	struct extent_buffer *eb;
				3683	int i, done;
				3684
				3685	ASSERT(!bio_flagged(bio, BIO_CLONED));
				3686	bio_for_each_segment_all(bvec, bio, i) {
				3687	struct page *page = bvec->bv_page;
				3688
				3689	eb = (struct extent_buffer *)page->private;
				3690	BUG_ON(!eb);
				3691	done = atomic_dec_and_test(&eb->io_pages);
				3692
				3693	if (bio->bi_status \|\|
				3694	test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
				3695	ClearPageUptodate(page);
				3696	set_btree_ioerr(page);
				3697	}
				3698
				3699	end_page_writeback(page);
				3700
				3701	if (!done)
				3702	continue;
				3703
				3704	end_extent_buffer_writeback(eb);
				3705	}
				3706
				3707	bio_put(bio);
				3708	}
				3709
				3710	static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
				3711	struct btrfs_fs_info *fs_info,
				3712	struct writeback_control *wbc,
				3713	struct extent_page_data *epd)
				3714	{
				3715	struct block_device *bdev = fs_info->fs_devices->latest_bdev;
				3716	struct extent_io_tree *tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
				3717	u64 offset = eb->start;
				3718	u32 nritems;
				3719	unsigned long i, num_pages;
				3720	unsigned long bio_flags = 0;
				3721	unsigned long start, end;
				3722	unsigned int write_flags = wbc_to_write_flags(wbc) \| REQ_META;
				3723	int ret = 0;
				3724
				3725	clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
				3726	num_pages = num_extent_pages(eb->start, eb->len);
				3727	atomic_set(&eb->io_pages, num_pages);
				3728	if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
				3729	bio_flags = EXTENT_BIO_TREE_LOG;
				3730
				3731	/* set btree blocks beyond nritems with 0 to avoid stale content. */
				3732	nritems = btrfs_header_nritems(eb);
				3733	if (btrfs_header_level(eb) > 0) {
				3734	end = btrfs_node_key_ptr_offset(nritems);
				3735
				3736	memzero_extent_buffer(eb, end, eb->len - end);
				3737	} else {
				3738	/*
				3739	* leaf:
				3740	* header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
				3741	*/
				3742	start = btrfs_item_nr_offset(nritems);
				3743	end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, eb);
				3744	memzero_extent_buffer(eb, start, end - start);
				3745	}
				3746
				3747	for (i = 0; i < num_pages; i++) {
				3748	struct page *p = eb->pages[i];
				3749
				3750	clear_page_dirty_for_io(p);
				3751	set_page_writeback(p);
				3752	ret = submit_extent_page(REQ_OP_WRITE \| write_flags, tree, wbc,
				3753	p, offset >> 9, PAGE_SIZE, 0, bdev,
				3754	&epd->bio,
				3755	end_bio_extent_buffer_writepage,
				3756	0, epd->bio_flags, bio_flags, false);
				3757	epd->bio_flags = bio_flags;
				3758	if (ret) {
				3759	set_btree_ioerr(p);
				3760	if (PageWriteback(p))
				3761	end_page_writeback(p);
				3762	if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
				3763	end_extent_buffer_writeback(eb);
				3764	ret = -EIO;
				3765	break;
				3766	}
				3767	offset += PAGE_SIZE;
				3768	update_nr_written(wbc, 1);
				3769	unlock_page(p);
				3770	}
				3771
				3772	if (unlikely(ret)) {
				3773	for (; i < num_pages; i++) {
				3774	struct page *p = eb->pages[i];
				3775	clear_page_dirty_for_io(p);
				3776	unlock_page(p);
				3777	}
				3778	}
				3779
				3780	return ret;
				3781	}
				3782
				3783	int btree_write_cache_pages(struct address_space *mapping,
				3784	struct writeback_control *wbc)
				3785	{
				3786	struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
				3787	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
				3788	struct extent_buffer eb, prev_eb = NULL;
				3789	struct extent_page_data epd = {
				3790	.bio = NULL,
				3791	.tree = tree,
				3792	.extent_locked = 0,
				3793	.sync_io = wbc->sync_mode == WB_SYNC_ALL,
				3794	.bio_flags = 0,
				3795	};
				3796	int ret = 0;
				3797	int done = 0;
				3798	int nr_to_write_done = 0;
				3799	struct pagevec pvec;
				3800	int nr_pages;
				3801	pgoff_t index;
				3802	pgoff_t end; /* Inclusive */
				3803	int scanned = 0;
				3804	int tag;
				3805
				3806	pagevec_init(&pvec, 0);
				3807	if (wbc->range_cyclic) {
				3808	index = mapping->writeback_index; /* Start from prev offset */
				3809	end = -1;
				3810	} else {
				3811	index = wbc->range_start >> PAGE_SHIFT;
				3812	end = wbc->range_end >> PAGE_SHIFT;
				3813	scanned = 1;
				3814	}
				3815	if (wbc->sync_mode == WB_SYNC_ALL)
				3816	tag = PAGECACHE_TAG_TOWRITE;
				3817	else
				3818	tag = PAGECACHE_TAG_DIRTY;
				3819	retry:
				3820	if (wbc->sync_mode == WB_SYNC_ALL)
				3821	tag_pages_for_writeback(mapping, index, end);
				3822	while (!done && !nr_to_write_done && (index <= end) &&
				3823	(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
				3824	min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
				3825	unsigned i;
				3826
				3827	scanned = 1;
				3828	for (i = 0; i < nr_pages; i++) {
				3829	struct page *page = pvec.pages[i];
				3830
				3831	if (!PagePrivate(page))
				3832	continue;
				3833
				3834	if (!wbc->range_cyclic && page->index > end) {
				3835	done = 1;
				3836	break;
				3837	}
				3838
				3839	spin_lock(&mapping->private_lock);
				3840	if (!PagePrivate(page)) {
				3841	spin_unlock(&mapping->private_lock);
				3842	continue;
				3843	}
				3844
				3845	eb = (struct extent_buffer *)page->private;
				3846
				3847	/*
				3848	* Shouldn't happen and normally this would be a BUG_ON
				3849	* but no sense in crashing the users box for something
				3850	* we can survive anyway.
				3851	*/
				3852	if (WARN_ON(!eb)) {
				3853	spin_unlock(&mapping->private_lock);
				3854	continue;
				3855	}
				3856
				3857	if (eb == prev_eb) {
				3858	spin_unlock(&mapping->private_lock);
				3859	continue;
				3860	}
				3861
				3862	ret = atomic_inc_not_zero(&eb->refs);
				3863	spin_unlock(&mapping->private_lock);
				3864	if (!ret)
				3865	continue;
				3866
				3867	prev_eb = eb;
				3868	ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
				3869	if (!ret) {
				3870	free_extent_buffer(eb);
				3871	continue;
				3872	}
				3873
				3874	ret = write_one_eb(eb, fs_info, wbc, &epd);
				3875	if (ret) {
				3876	done = 1;
				3877	free_extent_buffer(eb);
				3878	break;
				3879	}
				3880	free_extent_buffer(eb);
				3881
				3882	/*
				3883	* the filesystem may choose to bump up nr_to_write.
				3884	* We have to make sure to honor the new nr_to_write
				3885	* at any time
				3886	*/
				3887	nr_to_write_done = wbc->nr_to_write <= 0;
				3888	}
				3889	pagevec_release(&pvec);
				3890	cond_resched();
				3891	}
				3892	if (!scanned && !done) {
				3893	/*
				3894	* We hit the last page and there is more work to be done: wrap
				3895	* back to the start of the file
				3896	*/
				3897	scanned = 1;
				3898	index = 0;
				3899	goto retry;
				3900	}
				3901	flush_write_bio(&epd);
				3902	return ret;
				3903	}
				3904
				3905	/**
				3906	* write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
				3907	* @mapping: address space structure to write
				3908	* @wbc: subtract the number of written pages from *@wbc->nr_to_write
				3909	* @writepage: function called for each page
				3910	* @data: data passed to writepage function
				3911	*
				3912	* If a page is already under I/O, write_cache_pages() skips it, even
				3913	* if it's dirty. This is desirable behaviour for memory-cleaning writeback,
				3914	* but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
				3915	* and msync() need to guarantee that all the data which was dirty at the time
				3916	* the call was made get new I/O started against them. If wbc->sync_mode is
				3917	* WB_SYNC_ALL then we were called for data integrity and we must wait for
				3918	* existing IO to complete.
				3919	*/
				3920	static int extent_write_cache_pages(struct address_space *mapping,
				3921	struct writeback_control *wbc,
				3922	writepage_t writepage, void *data,
				3923	void (flush_fn)(void ))
				3924	{
				3925	struct inode *inode = mapping->host;
				3926	int ret = 0;
				3927	int done = 0;
				3928	int nr_to_write_done = 0;
				3929	struct pagevec pvec;
				3930	int nr_pages;
				3931	pgoff_t index;
				3932	pgoff_t end; /* Inclusive */
				3933	pgoff_t done_index;
				3934	int range_whole = 0;
				3935	int scanned = 0;
				3936	int tag;
				3937
				3938	/*
				3939	* We have to hold onto the inode so that ordered extents can do their
				3940	* work when the IO finishes. The alternative to this is failing to add
				3941	* an ordered extent if the igrab() fails there and that is a huge pain
				3942	* to deal with, so instead just hold onto the inode throughout the
				3943	* writepages operation. If it fails here we are freeing up the inode
				3944	* anyway and we'd rather not waste our time writing out stuff that is
				3945	* going to be truncated anyway.
				3946	*/
				3947	if (!igrab(inode))
				3948	return 0;
				3949
				3950	pagevec_init(&pvec, 0);
				3951	if (wbc->range_cyclic) {
				3952	index = mapping->writeback_index; /* Start from prev offset */
				3953	end = -1;
				3954	} else {
				3955	index = wbc->range_start >> PAGE_SHIFT;
				3956	end = wbc->range_end >> PAGE_SHIFT;
				3957	if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
				3958	range_whole = 1;
				3959	scanned = 1;
				3960	}
				3961	if (wbc->sync_mode == WB_SYNC_ALL)
				3962	tag = PAGECACHE_TAG_TOWRITE;
				3963	else
				3964	tag = PAGECACHE_TAG_DIRTY;
				3965	retry:
				3966	if (wbc->sync_mode == WB_SYNC_ALL)
				3967	tag_pages_for_writeback(mapping, index, end);
				3968	done_index = index;
				3969	while (!done && !nr_to_write_done && (index <= end) &&
				3970	(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
				3971	min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
				3972	unsigned i;
				3973
				3974	scanned = 1;
				3975	for (i = 0; i < nr_pages; i++) {
				3976	struct page *page = pvec.pages[i];
				3977
				3978	done_index = page->index;
				3979	/*
				3980	* At this point we hold neither mapping->tree_lock nor
				3981	* lock on the page itself: the page may be truncated or
				3982	* invalidated (changing page->mapping to NULL), or even
				3983	* swizzled back from swapper_space to tmpfs file
				3984	* mapping
				3985	*/
				3986	if (!trylock_page(page)) {
				3987	flush_fn(data);
				3988	lock_page(page);
				3989	}
				3990
				3991	if (unlikely(page->mapping != mapping)) {
				3992	unlock_page(page);
				3993	continue;
				3994	}
				3995
				3996	if (!wbc->range_cyclic && page->index > end) {
				3997	done = 1;
				3998	unlock_page(page);
				3999	continue;
				4000	}
				4001
				4002	if (wbc->sync_mode != WB_SYNC_NONE) {
				4003	if (PageWriteback(page))
				4004	flush_fn(data);
				4005	wait_on_page_writeback(page);
				4006	}
				4007
				4008	if (PageWriteback(page) \|\|
				4009	!clear_page_dirty_for_io(page)) {
				4010	unlock_page(page);
				4011	continue;
				4012	}
				4013
				4014	ret = (*writepage)(page, wbc, data);
				4015
				4016	if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
				4017	unlock_page(page);
				4018	ret = 0;
				4019	}
				4020	if (ret < 0) {
				4021	/*
				4022	* done_index is set past this page,
				4023	* so media errors will not choke
				4024	* background writeout for the entire
				4025	* file. This has consequences for
				4026	* range_cyclic semantics (ie. it may
				4027	* not be suitable for data integrity
				4028	* writeout).
				4029	*/
				4030	done_index = page->index + 1;
				4031	done = 1;
				4032	break;
				4033	}
				4034
				4035	/*
				4036	* the filesystem may choose to bump up nr_to_write.
				4037	* We have to make sure to honor the new nr_to_write
				4038	* at any time
				4039	*/
				4040	nr_to_write_done = wbc->nr_to_write <= 0;
				4041	}
				4042	pagevec_release(&pvec);
				4043	cond_resched();
				4044	}
				4045	if (!scanned && !done) {
				4046	/*
				4047	* We hit the last page and there is more work to be done: wrap
				4048	* back to the start of the file
				4049	*/
				4050	scanned = 1;
				4051	index = 0;
				4052
				4053	/*
				4054	* If we're looping we could run into a page that is locked by a
				4055	* writer and that writer could be waiting on writeback for a
				4056	* page in our current bio, and thus deadlock, so flush the
				4057	* write bio here.
				4058	*/
				4059	flush_write_bio(data);
				4060	goto retry;
				4061	}
				4062
				4063	if (wbc->range_cyclic \|\| (wbc->nr_to_write > 0 && range_whole))
				4064	mapping->writeback_index = done_index;
				4065
				4066	btrfs_add_delayed_iput(inode);
				4067	return ret;
				4068	}
				4069
				4070	static void flush_epd_write_bio(struct extent_page_data *epd)
				4071	{
				4072	if (epd->bio) {
				4073	int ret;
				4074
				4075	ret = submit_one_bio(epd->bio, 0, epd->bio_flags);
				4076	BUG_ON(ret < 0); /* -ENOMEM */
				4077	epd->bio = NULL;
				4078	}
				4079	}
				4080
				4081	static noinline void flush_write_bio(void *data)
				4082	{
				4083	struct extent_page_data *epd = data;
				4084	flush_epd_write_bio(epd);
				4085	}
				4086
				4087	int extent_write_full_page(struct extent_io_tree tree, struct page page,
				4088	get_extent_t *get_extent,
				4089	struct writeback_control *wbc)
				4090	{
				4091	int ret;
				4092	struct extent_page_data epd = {
				4093	.bio = NULL,
				4094	.tree = tree,
				4095	.get_extent = get_extent,
				4096	.extent_locked = 0,
				4097	.sync_io = wbc->sync_mode == WB_SYNC_ALL,
				4098	.bio_flags = 0,
				4099	};
				4100
				4101	ret = __extent_writepage(page, wbc, &epd);
				4102
				4103	flush_epd_write_bio(&epd);
				4104	return ret;
				4105	}
				4106
				4107	int extent_write_locked_range(struct extent_io_tree tree, struct inode inode,
				4108	u64 start, u64 end, get_extent_t *get_extent,
				4109	int mode)
				4110	{
				4111	int ret = 0;
				4112	struct address_space *mapping = inode->i_mapping;
				4113	struct page *page;
				4114	unsigned long nr_pages = (end - start + PAGE_SIZE) >>
				4115	PAGE_SHIFT;
				4116
				4117	struct extent_page_data epd = {
				4118	.bio = NULL,
				4119	.tree = tree,
				4120	.get_extent = get_extent,
				4121	.extent_locked = 1,
				4122	.sync_io = mode == WB_SYNC_ALL,
				4123	.bio_flags = 0,
				4124	};
				4125	struct writeback_control wbc_writepages = {
				4126	.sync_mode = mode,
				4127	.nr_to_write = nr_pages * 2,
				4128	.range_start = start,
				4129	.range_end = end + 1,
				4130	};
				4131
				4132	while (start <= end) {
				4133	page = find_get_page(mapping, start >> PAGE_SHIFT);
				4134	if (clear_page_dirty_for_io(page))
				4135	ret = __extent_writepage(page, &wbc_writepages, &epd);
				4136	else {
				4137	if (tree->ops && tree->ops->writepage_end_io_hook)
				4138	tree->ops->writepage_end_io_hook(page, start,
				4139	start + PAGE_SIZE - 1,
				4140	NULL, 1);
				4141	unlock_page(page);
				4142	}
				4143	put_page(page);
				4144	start += PAGE_SIZE;
				4145	}
				4146
				4147	flush_epd_write_bio(&epd);
				4148	return ret;
				4149	}
				4150
				4151	int extent_writepages(struct extent_io_tree *tree,
				4152	struct address_space *mapping,
				4153	get_extent_t *get_extent,
				4154	struct writeback_control *wbc)
				4155	{
				4156	int ret = 0;
				4157	struct extent_page_data epd = {
				4158	.bio = NULL,
				4159	.tree = tree,
				4160	.get_extent = get_extent,
				4161	.extent_locked = 0,
				4162	.sync_io = wbc->sync_mode == WB_SYNC_ALL,
				4163	.bio_flags = 0,
				4164	};
				4165
				4166	ret = extent_write_cache_pages(mapping, wbc, __extent_writepage, &epd,
				4167	flush_write_bio);
				4168	flush_epd_write_bio(&epd);
				4169	return ret;
				4170	}
				4171
				4172	int extent_readpages(struct extent_io_tree *tree,
				4173	struct address_space *mapping,
				4174	struct list_head *pages, unsigned nr_pages,
				4175	get_extent_t get_extent)
				4176	{
				4177	struct bio *bio = NULL;
				4178	unsigned page_idx;
				4179	unsigned long bio_flags = 0;
				4180	struct page *pagepool[16];
				4181	struct page *page;
				4182	struct extent_map *em_cached = NULL;
				4183	int nr = 0;
				4184	u64 prev_em_start = (u64)-1;
				4185
				4186	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
				4187	page = list_entry(pages->prev, struct page, lru);
				4188
				4189	prefetchw(&page->flags);
				4190	list_del(&page->lru);
				4191	if (add_to_page_cache_lru(page, mapping,
				4192	page->index,
				4193	readahead_gfp_mask(mapping))) {
				4194	put_page(page);
				4195	continue;
				4196	}
				4197
				4198	pagepool[nr++] = page;
				4199	if (nr < ARRAY_SIZE(pagepool))
				4200	continue;
				4201	__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
				4202	&bio, 0, &bio_flags, &prev_em_start);
				4203	nr = 0;
				4204	}
				4205	if (nr)
				4206	__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
				4207	&bio, 0, &bio_flags, &prev_em_start);
				4208
				4209	if (em_cached)
				4210	free_extent_map(em_cached);
				4211
				4212	BUG_ON(!list_empty(pages));
				4213	if (bio)
				4214	return submit_one_bio(bio, 0, bio_flags);
				4215	return 0;
				4216	}
				4217
				4218	/*
				4219	* basic invalidatepage code, this waits on any locked or writeback
				4220	* ranges corresponding to the page, and then deletes any extent state
				4221	* records from the tree
				4222	*/
				4223	int extent_invalidatepage(struct extent_io_tree *tree,
				4224	struct page *page, unsigned long offset)
				4225	{
				4226	struct extent_state *cached_state = NULL;
				4227	u64 start = page_offset(page);
				4228	u64 end = start + PAGE_SIZE - 1;
				4229	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
				4230
				4231	start += ALIGN(offset, blocksize);
				4232	if (start > end)
				4233	return 0;
				4234
				4235	lock_extent_bits(tree, start, end, &cached_state);
				4236	wait_on_page_writeback(page);
				4237	clear_extent_bit(tree, start, end,
				4238	EXTENT_LOCKED \| EXTENT_DIRTY \| EXTENT_DELALLOC \|
				4239	EXTENT_DO_ACCOUNTING,
				4240	1, 1, &cached_state, GFP_NOFS);
				4241	return 0;
				4242	}
				4243
				4244	/*
				4245	* a helper for releasepage, this tests for areas of the page that
				4246	* are locked or under IO and drops the related state bits if it is safe
				4247	* to drop the page.
				4248	*/
				4249	static int try_release_extent_state(struct extent_map_tree *map,
				4250	struct extent_io_tree *tree,
				4251	struct page *page, gfp_t mask)
				4252	{
				4253	u64 start = page_offset(page);
				4254	u64 end = start + PAGE_SIZE - 1;
				4255	int ret = 1;
				4256
				4257	if (test_range_bit(tree, start, end,
				4258	EXTENT_IOBITS, 0, NULL))
				4259	ret = 0;
				4260	else {
				4261	/*
				4262	* at this point we can safely clear everything except the
				4263	* locked bit and the nodatasum bit
				4264	*/
				4265	ret = clear_extent_bit(tree, start, end,
				4266	~(EXTENT_LOCKED \| EXTENT_NODATASUM),
				4267	0, 0, NULL, mask);
				4268
				4269	/* if clear_extent_bit failed for enomem reasons,
				4270	* we can't allow the release to continue.
				4271	*/
				4272	if (ret < 0)
				4273	ret = 0;
				4274	else
				4275	ret = 1;
				4276	}
				4277	return ret;
				4278	}
				4279
				4280	/*
				4281	* a helper for releasepage. As long as there are no locked extents
				4282	* in the range corresponding to the page, both state records and extent
				4283	* map records are removed
				4284	*/
				4285	int try_release_extent_mapping(struct extent_map_tree *map,
				4286	struct extent_io_tree tree, struct page page,
				4287	gfp_t mask)
				4288	{
				4289	struct extent_map *em;
				4290	u64 start = page_offset(page);
				4291	u64 end = start + PAGE_SIZE - 1;
				4292	struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
				4293
				4294	if (gfpflags_allow_blocking(mask) &&
				4295	page->mapping->host->i_size > SZ_16M) {
				4296	u64 len;
				4297	while (start <= end) {
				4298	len = end - start + 1;
				4299	write_lock(&map->lock);
				4300	em = lookup_extent_mapping(map, start, len);
				4301	if (!em) {
				4302	write_unlock(&map->lock);
				4303	break;
				4304	}
				4305	if (test_bit(EXTENT_FLAG_PINNED, &em->flags) \|\|
				4306	em->start != start) {
				4307	write_unlock(&map->lock);
				4308	free_extent_map(em);
				4309	break;
				4310	}
				4311	if (!test_range_bit(tree, em->start,
				4312	extent_map_end(em) - 1,
				4313	EXTENT_LOCKED \| EXTENT_WRITEBACK,
				4314	0, NULL)) {
				4315	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
				4316	&btrfs_inode->runtime_flags);
				4317	remove_extent_mapping(map, em);
				4318	/* once for the rb tree */
				4319	free_extent_map(em);
				4320	}
				4321	start = extent_map_end(em);
				4322	write_unlock(&map->lock);
				4323
				4324	/* once for us */
				4325	free_extent_map(em);
				4326
				4327	cond_resched(); /* Allow large-extent preemption. */
				4328	}
				4329	}
				4330	return try_release_extent_state(map, tree, page, mask);
				4331	}
				4332
				4333	/*
				4334	* helper function for fiemap, which doesn't want to see any holes.
				4335	* This maps until we find something past 'last'
				4336	*/
				4337	static struct extent_map get_extent_skip_holes(struct inode inode,
				4338	u64 offset,
				4339	u64 last,
				4340	get_extent_t *get_extent)
				4341	{
				4342	u64 sectorsize = btrfs_inode_sectorsize(inode);
				4343	struct extent_map *em;
				4344	u64 len;
				4345
				4346	if (offset >= last)
				4347	return NULL;
				4348
				4349	while (1) {
				4350	len = last - offset;
				4351	if (len == 0)
				4352	break;
				4353	len = ALIGN(len, sectorsize);
				4354	em = get_extent(BTRFS_I(inode), NULL, 0, offset, len, 0);
				4355	if (IS_ERR_OR_NULL(em))
				4356	return em;
				4357
				4358	/* if this isn't a hole return it */
				4359	if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
				4360	em->block_start != EXTENT_MAP_HOLE) {
				4361	return em;
				4362	}
				4363
				4364	/* this is a hole, advance to the next extent */
				4365	offset = extent_map_end(em);
				4366	free_extent_map(em);
				4367	if (offset >= last)
				4368	break;
				4369	}
				4370	return NULL;
				4371	}
				4372
				4373	/*
				4374	* To cache previous fiemap extent
				4375	*
				4376	* Will be used for merging fiemap extent
				4377	*/
				4378	struct fiemap_cache {
				4379	u64 offset;
				4380	u64 phys;
				4381	u64 len;
				4382	u32 flags;
				4383	bool cached;
				4384	};
				4385
				4386	/*
				4387	* Helper to submit fiemap extent.
				4388	*
				4389	* Will try to merge current fiemap extent specified by @offset, @phys,
				4390	* @len and @flags with cached one.
				4391	* And only when we fails to merge, cached one will be submitted as
				4392	* fiemap extent.
				4393	*
				4394	* Return value is the same as fiemap_fill_next_extent().
				4395	*/
				4396	static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
				4397	struct fiemap_cache *cache,
				4398	u64 offset, u64 phys, u64 len, u32 flags)
				4399	{
				4400	int ret = 0;
				4401
				4402	if (!cache->cached)
				4403	goto assign;
				4404
				4405	/*
				4406	* Sanity check, extent_fiemap() should have ensured that new
				4407	* fiemap extent won't overlap with cahced one.
				4408	* Not recoverable.
				4409	*
				4410	* NOTE: Physical address can overlap, due to compression
				4411	*/
				4412	if (cache->offset + cache->len > offset) {
				4413	WARN_ON(1);
				4414	return -EINVAL;
				4415	}
				4416
				4417	/*
				4418	* Only merges fiemap extents if
				4419	* 1) Their logical addresses are continuous
				4420	*
				4421	* 2) Their physical addresses are continuous
				4422	* So truly compressed (physical size smaller than logical size)
				4423	* extents won't get merged with each other
				4424	*
				4425	* 3) Share same flags except FIEMAP_EXTENT_LAST
				4426	* So regular extent won't get merged with prealloc extent
				4427	*/
				4428	if (cache->offset + cache->len == offset &&
				4429	cache->phys + cache->len == phys &&
				4430	(cache->flags & ~FIEMAP_EXTENT_LAST) ==
				4431	(flags & ~FIEMAP_EXTENT_LAST)) {
				4432	cache->len += len;
				4433	cache->flags \|= flags;
				4434	goto try_submit_last;
				4435	}
				4436
				4437	/* Not mergeable, need to submit cached one */
				4438	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
				4439	cache->len, cache->flags);
				4440	cache->cached = false;
				4441	if (ret)
				4442	return ret;
				4443	assign:
				4444	cache->cached = true;
				4445	cache->offset = offset;
				4446	cache->phys = phys;
				4447	cache->len = len;
				4448	cache->flags = flags;
				4449	try_submit_last:
				4450	if (cache->flags & FIEMAP_EXTENT_LAST) {
				4451	ret = fiemap_fill_next_extent(fieinfo, cache->offset,
				4452	cache->phys, cache->len, cache->flags);
				4453	cache->cached = false;
				4454	}
				4455	return ret;
				4456	}
				4457
				4458	/*
				4459	* Emit last fiemap cache
				4460	*
				4461	* The last fiemap cache may still be cached in the following case:
				4462	* 0 4k 8k
				4463	* \|<- Fiemap range ->\|
				4464	* \|<------------ First extent ----------->\|
				4465	*
				4466	* In this case, the first extent range will be cached but not emitted.
				4467	* So we must emit it before ending extent_fiemap().
				4468	*/
				4469	static int emit_last_fiemap_cache(struct btrfs_fs_info *fs_info,
				4470	struct fiemap_extent_info *fieinfo,
				4471	struct fiemap_cache *cache)
				4472	{
				4473	int ret;
				4474
				4475	if (!cache->cached)
				4476	return 0;
				4477
				4478	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
				4479	cache->len, cache->flags);
				4480	cache->cached = false;
				4481	if (ret > 0)
				4482	ret = 0;
				4483	return ret;
				4484	}
				4485
				4486	int extent_fiemap(struct inode inode, struct fiemap_extent_info fieinfo,
				4487	__u64 start, __u64 len, get_extent_t *get_extent)
				4488	{
				4489	int ret = 0;
				4490	u64 off = start;
				4491	u64 max = start + len;
				4492	u32 flags = 0;
				4493	u32 found_type;
				4494	u64 last;
				4495	u64 last_for_get_extent = 0;
				4496	u64 disko = 0;
				4497	u64 isize = i_size_read(inode);
				4498	struct btrfs_key found_key;
				4499	struct extent_map *em = NULL;
				4500	struct extent_state *cached_state = NULL;
				4501	struct btrfs_path *path;
				4502	struct btrfs_root *root = BTRFS_I(inode)->root;
				4503	struct fiemap_cache cache = { 0 };
				4504	int end = 0;
				4505	u64 em_start = 0;
				4506	u64 em_len = 0;
				4507	u64 em_end = 0;
				4508
				4509	if (len == 0)
				4510	return -EINVAL;
				4511
				4512	path = btrfs_alloc_path();
				4513	if (!path)
				4514	return -ENOMEM;
				4515	path->leave_spinning = 1;
				4516
				4517	start = round_down(start, btrfs_inode_sectorsize(inode));
				4518	len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
				4519
				4520	/*
				4521	* lookup the last file extent. We're not using i_size here
				4522	* because there might be preallocation past i_size
				4523	*/
				4524	ret = btrfs_lookup_file_extent(NULL, root, path,
				4525	btrfs_ino(BTRFS_I(inode)), -1, 0);
				4526	if (ret < 0) {
				4527	btrfs_free_path(path);
				4528	return ret;
				4529	} else {
				4530	WARN_ON(!ret);
				4531	if (ret == 1)
				4532	ret = 0;
				4533	}
				4534
				4535	path->slots[0]--;
				4536	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
				4537	found_type = found_key.type;
				4538
				4539	/* No extents, but there might be delalloc bits */
				4540	if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) \|\|
				4541	found_type != BTRFS_EXTENT_DATA_KEY) {
				4542	/* have to trust i_size as the end */
				4543	last = (u64)-1;
				4544	last_for_get_extent = isize;
				4545	} else {
				4546	/*
				4547	* remember the start of the last extent. There are a
				4548	* bunch of different factors that go into the length of the
				4549	* extent, so its much less complex to remember where it started
				4550	*/
				4551	last = found_key.offset;
				4552	last_for_get_extent = last + 1;
				4553	}
				4554	btrfs_release_path(path);
				4555
				4556	/*
				4557	* we might have some extents allocated but more delalloc past those
				4558	* extents. so, we trust isize unless the start of the last extent is
				4559	* beyond isize
				4560	*/
				4561	if (last < isize) {
				4562	last = (u64)-1;
				4563	last_for_get_extent = isize;
				4564	}
				4565
				4566	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1,
				4567	&cached_state);
				4568
				4569	em = get_extent_skip_holes(inode, start, last_for_get_extent,
				4570	get_extent);
				4571	if (!em)
				4572	goto out;
				4573	if (IS_ERR(em)) {
				4574	ret = PTR_ERR(em);
				4575	goto out;
				4576	}
				4577
				4578	while (!end) {
				4579	u64 offset_in_extent = 0;
				4580
				4581	/* break if the extent we found is outside the range */
				4582	if (em->start >= max \|\| extent_map_end(em) < off)
				4583	break;
				4584
				4585	/*
				4586	* get_extent may return an extent that starts before our
				4587	* requested range. We have to make sure the ranges
				4588	* we return to fiemap always move forward and don't
				4589	* overlap, so adjust the offsets here
				4590	*/
				4591	em_start = max(em->start, off);
				4592
				4593	/*
				4594	* record the offset from the start of the extent
				4595	* for adjusting the disk offset below. Only do this if the
				4596	* extent isn't compressed since our in ram offset may be past
				4597	* what we have actually allocated on disk.
				4598	*/
				4599	if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
				4600	offset_in_extent = em_start - em->start;
				4601	em_end = extent_map_end(em);
				4602	em_len = em_end - em_start;
				4603	disko = 0;
				4604	flags = 0;
				4605
				4606	/*
				4607	* bump off for our next call to get_extent
				4608	*/
				4609	off = extent_map_end(em);
				4610	if (off >= max)
				4611	end = 1;
				4612
				4613	if (em->block_start == EXTENT_MAP_LAST_BYTE) {
				4614	end = 1;
				4615	flags \|= FIEMAP_EXTENT_LAST;
				4616	} else if (em->block_start == EXTENT_MAP_INLINE) {
				4617	flags \|= (FIEMAP_EXTENT_DATA_INLINE \|
				4618	FIEMAP_EXTENT_NOT_ALIGNED);
				4619	} else if (em->block_start == EXTENT_MAP_DELALLOC) {
				4620	flags \|= (FIEMAP_EXTENT_DELALLOC \|
				4621	FIEMAP_EXTENT_UNKNOWN);
				4622	} else if (fieinfo->fi_extents_max) {
				4623	u64 bytenr = em->block_start -
				4624	(em->start - em->orig_start);
				4625
				4626	disko = em->block_start + offset_in_extent;
				4627
				4628	/*
				4629	* As btrfs supports shared space, this information
				4630	* can be exported to userspace tools via
				4631	* flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0
				4632	* then we're just getting a count and we can skip the
				4633	* lookup stuff.
				4634	*/
				4635	ret = btrfs_check_shared(root,
				4636	btrfs_ino(BTRFS_I(inode)),
				4637	bytenr);
				4638	if (ret < 0)
				4639	goto out_free;
				4640	if (ret)
				4641	flags \|= FIEMAP_EXTENT_SHARED;
				4642	ret = 0;
				4643	}
				4644	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
				4645	flags \|= FIEMAP_EXTENT_ENCODED;
				4646	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
				4647	flags \|= FIEMAP_EXTENT_UNWRITTEN;
				4648
				4649	free_extent_map(em);
				4650	em = NULL;
				4651	if ((em_start >= last) \|\| em_len == (u64)-1 \|\|
				4652	(last == (u64)-1 && isize <= em_end)) {
				4653	flags \|= FIEMAP_EXTENT_LAST;
				4654	end = 1;
				4655	}
				4656
				4657	/* now scan forward to see if this is really the last extent. */
				4658	em = get_extent_skip_holes(inode, off, last_for_get_extent,
				4659	get_extent);
				4660	if (IS_ERR(em)) {
				4661	ret = PTR_ERR(em);
				4662	goto out;
				4663	}
				4664	if (!em) {
				4665	flags \|= FIEMAP_EXTENT_LAST;
				4666	end = 1;
				4667	}
				4668	ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
				4669	em_len, flags);
				4670	if (ret) {
				4671	if (ret == 1)
				4672	ret = 0;
				4673	goto out_free;
				4674	}
				4675	}
				4676	out_free:
				4677	if (!ret)
				4678	ret = emit_last_fiemap_cache(root->fs_info, fieinfo, &cache);
				4679	free_extent_map(em);
				4680	out:
				4681	btrfs_free_path(path);
				4682	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
				4683	&cached_state, GFP_NOFS);
				4684	return ret;
				4685	}
				4686
				4687	static void __free_extent_buffer(struct extent_buffer *eb)
				4688	{
				4689	btrfs_leak_debug_del(&eb->leak_list);
				4690	kmem_cache_free(extent_buffer_cache, eb);
				4691	}
				4692
				4693	int extent_buffer_under_io(struct extent_buffer *eb)
				4694	{
				4695	return (atomic_read(&eb->io_pages) \|\|
				4696	test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) \|\|
				4697	test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
				4698	}
				4699
				4700	/*
				4701	* Helper for releasing extent buffer page.
				4702	*/
				4703	static void btrfs_release_extent_buffer_page(struct extent_buffer *eb)
				4704	{
				4705	unsigned long index;
				4706	struct page *page;
				4707	int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
				4708
				4709	BUG_ON(extent_buffer_under_io(eb));
				4710
				4711	index = num_extent_pages(eb->start, eb->len);
				4712	if (index == 0)
				4713	return;
				4714
				4715	do {
				4716	index--;
				4717	page = eb->pages[index];
				4718	if (!page)
				4719	continue;
				4720	if (mapped)
				4721	spin_lock(&page->mapping->private_lock);
				4722	/*
				4723	* We do this since we'll remove the pages after we've
				4724	* removed the eb from the radix tree, so we could race
				4725	* and have this page now attached to the new eb. So
				4726	* only clear page_private if it's still connected to
				4727	* this eb.
				4728	*/
				4729	if (PagePrivate(page) &&
				4730	page->private == (unsigned long)eb) {
				4731	BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
				4732	BUG_ON(PageDirty(page));
				4733	BUG_ON(PageWriteback(page));
				4734	/*
				4735	* We need to make sure we haven't be attached
				4736	* to a new eb.
				4737	*/
				4738	ClearPagePrivate(page);
				4739	set_page_private(page, 0);
				4740	/* One for the page private */
				4741	put_page(page);
				4742	}
				4743
				4744	if (mapped)
				4745	spin_unlock(&page->mapping->private_lock);
				4746
				4747	/* One for when we allocated the page */
				4748	put_page(page);
				4749	} while (index != 0);
				4750	}
				4751
				4752	/*
				4753	* Helper for releasing the extent buffer.
				4754	*/
				4755	static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
				4756	{
				4757	btrfs_release_extent_buffer_page(eb);
				4758	__free_extent_buffer(eb);
				4759	}
				4760
				4761	static struct extent_buffer *
				4762	__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
				4763	unsigned long len)
				4764	{
				4765	struct extent_buffer *eb = NULL;
				4766
				4767	eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS\|__GFP_NOFAIL);
				4768	eb->start = start;
				4769	eb->len = len;
				4770	eb->fs_info = fs_info;
				4771	eb->bflags = 0;
				4772	rwlock_init(&eb->lock);
				4773	atomic_set(&eb->write_locks, 0);
				4774	atomic_set(&eb->read_locks, 0);
				4775	atomic_set(&eb->blocking_readers, 0);
				4776	atomic_set(&eb->blocking_writers, 0);
				4777	atomic_set(&eb->spinning_readers, 0);
				4778	atomic_set(&eb->spinning_writers, 0);
				4779	eb->lock_nested = 0;
				4780	init_waitqueue_head(&eb->write_lock_wq);
				4781	init_waitqueue_head(&eb->read_lock_wq);
				4782
				4783	btrfs_leak_debug_add(&eb->leak_list, &buffers);
				4784
				4785	spin_lock_init(&eb->refs_lock);
				4786	atomic_set(&eb->refs, 1);
				4787	atomic_set(&eb->io_pages, 0);
				4788
				4789	/*
				4790	* Sanity checks, currently the maximum is 64k covered by 16x 4k pages
				4791	*/
				4792	BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
				4793	> MAX_INLINE_EXTENT_BUFFER_SIZE);
				4794	BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
				4795
				4796	return eb;
				4797	}
				4798
				4799	struct extent_buffer btrfs_clone_extent_buffer(struct extent_buffer src)
				4800	{
				4801	unsigned long i;
				4802	struct page *p;
				4803	struct extent_buffer *new;
				4804	unsigned long num_pages = num_extent_pages(src->start, src->len);
				4805
				4806	new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
				4807	if (new == NULL)
				4808	return NULL;
				4809
				4810	for (i = 0; i < num_pages; i++) {
				4811	p = alloc_page(GFP_NOFS);
				4812	if (!p) {
				4813	btrfs_release_extent_buffer(new);
				4814	return NULL;
				4815	}
				4816	attach_extent_buffer_page(new, p);
				4817	WARN_ON(PageDirty(p));
				4818	SetPageUptodate(p);
				4819	new->pages[i] = p;
				4820	copy_page(page_address(p), page_address(src->pages[i]));
				4821	}
				4822
				4823	set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
				4824	set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);
				4825
				4826	return new;
				4827	}
				4828
				4829	struct extent_buffer __alloc_dummy_extent_buffer(struct btrfs_fs_info fs_info,
				4830	u64 start, unsigned long len)
				4831	{
				4832	struct extent_buffer *eb;
				4833	unsigned long num_pages;
				4834	unsigned long i;
				4835
				4836	num_pages = num_extent_pages(start, len);
				4837
				4838	eb = __alloc_extent_buffer(fs_info, start, len);
				4839	if (!eb)
				4840	return NULL;
				4841
				4842	for (i = 0; i < num_pages; i++) {
				4843	eb->pages[i] = alloc_page(GFP_NOFS);
				4844	if (!eb->pages[i])
				4845	goto err;
				4846	}
				4847	set_extent_buffer_uptodate(eb);
				4848	btrfs_set_header_nritems(eb, 0);
				4849	set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
				4850
				4851	return eb;
				4852	err:
				4853	for (; i > 0; i--)
				4854	__free_page(eb->pages[i - 1]);
				4855	__free_extent_buffer(eb);
				4856	return NULL;
				4857	}
				4858
				4859	struct extent_buffer alloc_dummy_extent_buffer(struct btrfs_fs_info fs_info,
				4860	u64 start)
				4861	{
				4862	return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
				4863	}
				4864
				4865	static void check_buffer_tree_ref(struct extent_buffer *eb)
				4866	{
				4867	int refs;
				4868	/*
				4869	* The TREE_REF bit is first set when the extent_buffer is added
				4870	* to the radix tree. It is also reset, if unset, when a new reference
				4871	* is created by find_extent_buffer.
				4872	*
				4873	* It is only cleared in two cases: freeing the last non-tree
				4874	* reference to the extent_buffer when its STALE bit is set or
				4875	* calling releasepage when the tree reference is the only reference.
				4876	*
				4877	* In both cases, care is taken to ensure that the extent_buffer's
				4878	* pages are not under io. However, releasepage can be concurrently
				4879	* called with creating new references, which is prone to race
				4880	* conditions between the calls to check_buffer_tree_ref in those
				4881	* codepaths and clearing TREE_REF in try_release_extent_buffer.
				4882	*
				4883	* The actual lifetime of the extent_buffer in the radix tree is
				4884	* adequately protected by the refcount, but the TREE_REF bit and
				4885	* its corresponding reference are not. To protect against this
				4886	* class of races, we call check_buffer_tree_ref from the codepaths
				4887	* which trigger io after they set eb->io_pages. Note that once io is
				4888	* initiated, TREE_REF can no longer be cleared, so that is the
				4889	* moment at which any such race is best fixed.
				4890	*/
				4891	refs = atomic_read(&eb->refs);
				4892	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
				4893	return;
				4894
				4895	spin_lock(&eb->refs_lock);
				4896	if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
				4897	atomic_inc(&eb->refs);
				4898	spin_unlock(&eb->refs_lock);
				4899	}
				4900
				4901	static void mark_extent_buffer_accessed(struct extent_buffer *eb,
				4902	struct page *accessed)
				4903	{
				4904	unsigned long num_pages, i;
				4905
				4906	check_buffer_tree_ref(eb);
				4907
				4908	num_pages = num_extent_pages(eb->start, eb->len);
				4909	for (i = 0; i < num_pages; i++) {
				4910	struct page *p = eb->pages[i];
				4911
				4912	if (p != accessed)
				4913	mark_page_accessed(p);
				4914	}
				4915	}
				4916
				4917	struct extent_buffer find_extent_buffer(struct btrfs_fs_info fs_info,
				4918	u64 start)
				4919	{
				4920	struct extent_buffer *eb;
				4921
				4922	rcu_read_lock();
				4923	eb = radix_tree_lookup(&fs_info->buffer_radix,
				4924	start >> PAGE_SHIFT);
				4925	if (eb && atomic_inc_not_zero(&eb->refs)) {
				4926	rcu_read_unlock();
				4927	/*
				4928	* Lock our eb's refs_lock to avoid races with
				4929	* free_extent_buffer. When we get our eb it might be flagged
				4930	* with EXTENT_BUFFER_STALE and another task running
				4931	* free_extent_buffer might have seen that flag set,
				4932	* eb->refs == 2, that the buffer isn't under IO (dirty and
				4933	* writeback flags not set) and it's still in the tree (flag
				4934	* EXTENT_BUFFER_TREE_REF set), therefore being in the process
				4935	* of decrementing the extent buffer's reference count twice.
				4936	* So here we could race and increment the eb's reference count,
				4937	* clear its stale flag, mark it as dirty and drop our reference
				4938	* before the other task finishes executing free_extent_buffer,
				4939	* which would later result in an attempt to free an extent
				4940	* buffer that is dirty.
				4941	*/
				4942	if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
				4943	spin_lock(&eb->refs_lock);
				4944	spin_unlock(&eb->refs_lock);
				4945	}
				4946	mark_extent_buffer_accessed(eb, NULL);
				4947	return eb;
				4948	}
				4949	rcu_read_unlock();
				4950
				4951	return NULL;
				4952	}
				4953
				4954	#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
				4955	struct extent_buffer alloc_test_extent_buffer(struct btrfs_fs_info fs_info,
				4956	u64 start)
				4957	{
				4958	struct extent_buffer eb, exists = NULL;
				4959	int ret;
				4960
				4961	eb = find_extent_buffer(fs_info, start);
				4962	if (eb)
				4963	return eb;
				4964	eb = alloc_dummy_extent_buffer(fs_info, start);
				4965	if (!eb)
				4966	return ERR_PTR(-ENOMEM);
				4967	eb->fs_info = fs_info;
				4968	again:
				4969	ret = radix_tree_preload(GFP_NOFS);
				4970	if (ret) {
				4971	exists = ERR_PTR(ret);
				4972	goto free_eb;
				4973	}
				4974	spin_lock(&fs_info->buffer_lock);
				4975	ret = radix_tree_insert(&fs_info->buffer_radix,
				4976	start >> PAGE_SHIFT, eb);
				4977	spin_unlock(&fs_info->buffer_lock);
				4978	radix_tree_preload_end();
				4979	if (ret == -EEXIST) {
				4980	exists = find_extent_buffer(fs_info, start);
				4981	if (exists)
				4982	goto free_eb;
				4983	else
				4984	goto again;
				4985	}
				4986	check_buffer_tree_ref(eb);
				4987	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
				4988
				4989	/*
				4990	* We will free dummy extent buffer's if they come into
				4991	* free_extent_buffer with a ref count of 2, but if we are using this we
				4992	* want the buffers to stay in memory until we're done with them, so
				4993	* bump the ref count again.
				4994	*/
				4995	atomic_inc(&eb->refs);
				4996	return eb;
				4997	free_eb:
				4998	btrfs_release_extent_buffer(eb);
				4999	return exists;
				5000	}
				5001	#endif
				5002
				5003	struct extent_buffer alloc_extent_buffer(struct btrfs_fs_info fs_info,
				5004	u64 start)
				5005	{
				5006	unsigned long len = fs_info->nodesize;
				5007	unsigned long num_pages = num_extent_pages(start, len);
				5008	unsigned long i;
				5009	unsigned long index = start >> PAGE_SHIFT;
				5010	struct extent_buffer *eb;
				5011	struct extent_buffer *exists = NULL;
				5012	struct page *p;
				5013	struct address_space *mapping = fs_info->btree_inode->i_mapping;
				5014	int uptodate = 1;
				5015	int ret;
				5016
				5017	if (!IS_ALIGNED(start, fs_info->sectorsize)) {
				5018	btrfs_err(fs_info, "bad tree block start %llu", start);
				5019	return ERR_PTR(-EINVAL);
				5020	}
				5021
				5022	eb = find_extent_buffer(fs_info, start);
				5023	if (eb)
				5024	return eb;
				5025
				5026	eb = __alloc_extent_buffer(fs_info, start, len);
				5027	if (!eb)
				5028	return ERR_PTR(-ENOMEM);
				5029
				5030	for (i = 0; i < num_pages; i++, index++) {
				5031	p = find_or_create_page(mapping, index, GFP_NOFS\|__GFP_NOFAIL);
				5032	if (!p) {
				5033	exists = ERR_PTR(-ENOMEM);
				5034	goto free_eb;
				5035	}
				5036
				5037	spin_lock(&mapping->private_lock);
				5038	if (PagePrivate(p)) {
				5039	/*
				5040	* We could have already allocated an eb for this page
				5041	* and attached one so lets see if we can get a ref on
				5042	* the existing eb, and if we can we know it's good and
				5043	* we can just return that one, else we know we can just
				5044	* overwrite page->private.
				5045	*/
				5046	exists = (struct extent_buffer *)p->private;
				5047	if (atomic_inc_not_zero(&exists->refs)) {
				5048	spin_unlock(&mapping->private_lock);
				5049	unlock_page(p);
				5050	put_page(p);
				5051	mark_extent_buffer_accessed(exists, p);
				5052	goto free_eb;
				5053	}
				5054	exists = NULL;
				5055
				5056	/*
				5057	* Do this so attach doesn't complain and we need to
				5058	* drop the ref the old guy had.
				5059	*/
				5060	ClearPagePrivate(p);
				5061	WARN_ON(PageDirty(p));
				5062	put_page(p);
				5063	}
				5064	attach_extent_buffer_page(eb, p);
				5065	spin_unlock(&mapping->private_lock);
				5066	WARN_ON(PageDirty(p));
				5067	eb->pages[i] = p;
				5068	if (!PageUptodate(p))
				5069	uptodate = 0;
				5070
				5071	/*
				5072	* see below about how we avoid a nasty race with release page
				5073	* and why we unlock later
				5074	*/
				5075	}
				5076	if (uptodate)
				5077	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
				5078	again:
				5079	ret = radix_tree_preload(GFP_NOFS);
				5080	if (ret) {
				5081	exists = ERR_PTR(ret);
				5082	goto free_eb;
				5083	}
				5084
				5085	spin_lock(&fs_info->buffer_lock);
				5086	ret = radix_tree_insert(&fs_info->buffer_radix,
				5087	start >> PAGE_SHIFT, eb);
				5088	spin_unlock(&fs_info->buffer_lock);
				5089	radix_tree_preload_end();
				5090	if (ret == -EEXIST) {
				5091	exists = find_extent_buffer(fs_info, start);
				5092	if (exists)
				5093	goto free_eb;
				5094	else
				5095	goto again;
				5096	}
				5097	/* add one reference for the tree */
				5098	check_buffer_tree_ref(eb);
				5099	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
				5100
				5101	/*
				5102	* there is a race where release page may have
				5103	* tried to find this extent buffer in the radix
				5104	* but failed. It will tell the VM it is safe to
				5105	* reclaim the, and it will clear the page private bit.
				5106	* We must make sure to set the page private bit properly
				5107	* after the extent buffer is in the radix tree so
				5108	* it doesn't get lost
				5109	*/
				5110	SetPageChecked(eb->pages[0]);
				5111	for (i = 1; i < num_pages; i++) {
				5112	p = eb->pages[i];
				5113	ClearPageChecked(p);
				5114	unlock_page(p);
				5115	}
				5116	unlock_page(eb->pages[0]);
				5117	return eb;
				5118
				5119	free_eb:
				5120	WARN_ON(!atomic_dec_and_test(&eb->refs));
				5121	for (i = 0; i < num_pages; i++) {
				5122	if (eb->pages[i])
				5123	unlock_page(eb->pages[i]);
				5124	}
				5125
				5126	btrfs_release_extent_buffer(eb);
				5127	return exists;
				5128	}
				5129
				5130	static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
				5131	{
				5132	struct extent_buffer *eb =
				5133	container_of(head, struct extent_buffer, rcu_head);
				5134
				5135	__free_extent_buffer(eb);
				5136	}
				5137
				5138	/* Expects to have eb->eb_lock already held */
				5139	static int release_extent_buffer(struct extent_buffer *eb)
				5140	{
				5141	WARN_ON(atomic_read(&eb->refs) == 0);
				5142	if (atomic_dec_and_test(&eb->refs)) {
				5143	if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
				5144	struct btrfs_fs_info *fs_info = eb->fs_info;
				5145
				5146	spin_unlock(&eb->refs_lock);
				5147
				5148	spin_lock(&fs_info->buffer_lock);
				5149	radix_tree_delete(&fs_info->buffer_radix,
				5150	eb->start >> PAGE_SHIFT);
				5151	spin_unlock(&fs_info->buffer_lock);
				5152	} else {
				5153	spin_unlock(&eb->refs_lock);
				5154	}
				5155
				5156	/* Should be safe to release our pages at this point */
				5157	btrfs_release_extent_buffer_page(eb);
				5158	#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
				5159	if (unlikely(test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))) {
				5160	__free_extent_buffer(eb);
				5161	return 1;
				5162	}
				5163	#endif
				5164	call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
				5165	return 1;
				5166	}
				5167	spin_unlock(&eb->refs_lock);
				5168
				5169	return 0;
				5170	}
				5171
				5172	void free_extent_buffer(struct extent_buffer *eb)
				5173	{
				5174	int refs;
				5175	int old;
				5176	if (!eb)
				5177	return;
				5178
				5179	while (1) {
				5180	refs = atomic_read(&eb->refs);
				5181	if (refs <= 3)
				5182	break;
				5183	old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
				5184	if (old == refs)
				5185	return;
				5186	}
				5187
				5188	spin_lock(&eb->refs_lock);
				5189	if (atomic_read(&eb->refs) == 2 &&
				5190	test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
				5191	atomic_dec(&eb->refs);
				5192
				5193	if (atomic_read(&eb->refs) == 2 &&
				5194	test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
				5195	!extent_buffer_under_io(eb) &&
				5196	test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
				5197	atomic_dec(&eb->refs);
				5198
				5199	/*
				5200	* I know this is terrible, but it's temporary until we stop tracking
				5201	* the uptodate bits and such for the extent buffers.
				5202	*/
				5203	release_extent_buffer(eb);
				5204	}
				5205
				5206	void free_extent_buffer_stale(struct extent_buffer *eb)
				5207	{
				5208	if (!eb)
				5209	return;
				5210
				5211	spin_lock(&eb->refs_lock);
				5212	set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
				5213
				5214	if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
				5215	test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
				5216	atomic_dec(&eb->refs);
				5217	release_extent_buffer(eb);
				5218	}
				5219
				5220	void clear_extent_buffer_dirty(struct extent_buffer *eb)
				5221	{
				5222	unsigned long i;
				5223	unsigned long num_pages;
				5224	struct page *page;
				5225
				5226	num_pages = num_extent_pages(eb->start, eb->len);
				5227
				5228	for (i = 0; i < num_pages; i++) {
				5229	page = eb->pages[i];
				5230	if (!PageDirty(page))
				5231	continue;
				5232
				5233	lock_page(page);
				5234	WARN_ON(!PagePrivate(page));
				5235
				5236	clear_page_dirty_for_io(page);
				5237	spin_lock_irq(&page->mapping->tree_lock);
				5238	if (!PageDirty(page)) {
				5239	radix_tree_tag_clear(&page->mapping->page_tree,
				5240	page_index(page),
				5241	PAGECACHE_TAG_DIRTY);
				5242	}
				5243	spin_unlock_irq(&page->mapping->tree_lock);
				5244	ClearPageError(page);
				5245	unlock_page(page);
				5246	}
				5247	WARN_ON(atomic_read(&eb->refs) == 0);
				5248	}
				5249
				5250	int set_extent_buffer_dirty(struct extent_buffer *eb)
				5251	{
				5252	unsigned long i;
				5253	unsigned long num_pages;
				5254	int was_dirty = 0;
				5255
				5256	check_buffer_tree_ref(eb);
				5257
				5258	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
				5259
				5260	num_pages = num_extent_pages(eb->start, eb->len);
				5261	WARN_ON(atomic_read(&eb->refs) == 0);
				5262	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
				5263
				5264	for (i = 0; i < num_pages; i++)
				5265	set_page_dirty(eb->pages[i]);
				5266	return was_dirty;
				5267	}
				5268
				5269	void clear_extent_buffer_uptodate(struct extent_buffer *eb)
				5270	{
				5271	unsigned long i;
				5272	struct page *page;
				5273	unsigned long num_pages;
				5274
				5275	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
				5276	num_pages = num_extent_pages(eb->start, eb->len);
				5277	for (i = 0; i < num_pages; i++) {
				5278	page = eb->pages[i];
				5279	if (page)
				5280	ClearPageUptodate(page);
				5281	}
				5282	}
				5283
				5284	void set_extent_buffer_uptodate(struct extent_buffer *eb)
				5285	{
				5286	unsigned long i;
				5287	struct page *page;
				5288	unsigned long num_pages;
				5289
				5290	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
				5291	num_pages = num_extent_pages(eb->start, eb->len);
				5292	for (i = 0; i < num_pages; i++) {
				5293	page = eb->pages[i];
				5294	SetPageUptodate(page);
				5295	}
				5296	}
				5297
				5298	int extent_buffer_uptodate(struct extent_buffer *eb)
				5299	{
				5300	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
				5301	}
				5302
				5303	int read_extent_buffer_pages(struct extent_io_tree *tree,
				5304	struct extent_buffer *eb, int wait,
				5305	get_extent_t *get_extent, int mirror_num)
				5306	{
				5307	unsigned long i;
				5308	struct page *page;
				5309	int err;
				5310	int ret = 0;
				5311	int locked_pages = 0;
				5312	int all_uptodate = 1;
				5313	unsigned long num_pages;
				5314	unsigned long num_reads = 0;
				5315	struct bio *bio = NULL;
				5316	unsigned long bio_flags = 0;
				5317
				5318	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
				5319	return 0;
				5320
				5321	num_pages = num_extent_pages(eb->start, eb->len);
				5322	for (i = 0; i < num_pages; i++) {
				5323	page = eb->pages[i];
				5324	if (wait == WAIT_NONE) {
				5325	if (!trylock_page(page))
				5326	goto unlock_exit;
				5327	} else {
				5328	lock_page(page);
				5329	}
				5330	locked_pages++;
				5331	}
				5332	/*
				5333	* We need to firstly lock all pages to make sure that
				5334	* the uptodate bit of our pages won't be affected by
				5335	* clear_extent_buffer_uptodate().
				5336	*/
				5337	for (i = 0; i < num_pages; i++) {
				5338	page = eb->pages[i];
				5339	if (!PageUptodate(page)) {
				5340	num_reads++;
				5341	all_uptodate = 0;
				5342	}
				5343	}
				5344
				5345	if (all_uptodate) {
				5346	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
				5347	goto unlock_exit;
				5348	}
				5349
				5350	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
				5351	eb->read_mirror = 0;
				5352	atomic_set(&eb->io_pages, num_reads);
				5353	/*
				5354	* It is possible for releasepage to clear the TREE_REF bit before we
				5355	* set io_pages. See check_buffer_tree_ref for a more detailed comment.
				5356	*/
				5357	check_buffer_tree_ref(eb);
				5358	for (i = 0; i < num_pages; i++) {
				5359	page = eb->pages[i];
				5360
				5361	if (!PageUptodate(page)) {
				5362	if (ret) {
				5363	atomic_dec(&eb->io_pages);
				5364	unlock_page(page);
				5365	continue;
				5366	}
				5367
				5368	ClearPageError(page);
				5369	err = __extent_read_full_page(tree, page,
				5370	get_extent, &bio,
				5371	mirror_num, &bio_flags,
				5372	REQ_META);
				5373	if (err) {
				5374	ret = err;
				5375	/*
				5376	* We use &bio in above __extent_read_full_page,
				5377	* so we ensure that if it returns error, the
				5378	* current page fails to add itself to bio and
				5379	* it's been unlocked.
				5380	*
				5381	* We must dec io_pages by ourselves.
				5382	*/
				5383	atomic_dec(&eb->io_pages);
				5384	}
				5385	} else {
				5386	unlock_page(page);
				5387	}
				5388	}
				5389
				5390	if (bio) {
				5391	err = submit_one_bio(bio, mirror_num, bio_flags);
				5392	if (err)
				5393	return err;
				5394	}
				5395
				5396	if (ret \|\| wait != WAIT_COMPLETE)
				5397	return ret;
				5398
				5399	for (i = 0; i < num_pages; i++) {
				5400	page = eb->pages[i];
				5401	wait_on_page_locked(page);
				5402	if (!PageUptodate(page))
				5403	ret = -EIO;
				5404	}
				5405
				5406	return ret;
				5407
				5408	unlock_exit:
				5409	while (locked_pages > 0) {
				5410	locked_pages--;
				5411	page = eb->pages[locked_pages];
				5412	unlock_page(page);
				5413	}
				5414	return ret;
				5415	}
				5416
				5417	void read_extent_buffer(const struct extent_buffer eb, void dstv,
				5418	unsigned long start, unsigned long len)
				5419	{
				5420	size_t cur;
				5421	size_t offset;
				5422	struct page *page;
				5423	char *kaddr;
				5424	char dst = (char )dstv;
				5425	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
				5426	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
				5427
				5428	if (start + len > eb->len) {
				5429	WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
				5430	eb->start, eb->len, start, len);
				5431	memset(dst, 0, len);
				5432	return;
				5433	}
				5434
				5435	offset = (start_offset + start) & (PAGE_SIZE - 1);
				5436
				5437	while (len > 0) {
				5438	page = eb->pages[i];
				5439
				5440	cur = min(len, (PAGE_SIZE - offset));
				5441	kaddr = page_address(page);
				5442	memcpy(dst, kaddr + offset, cur);
				5443
				5444	dst += cur;
				5445	len -= cur;
				5446	offset = 0;
				5447	i++;
				5448	}
				5449	}
				5450
				5451	int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
				5452	void __user *dstv,
				5453	unsigned long start, unsigned long len)
				5454	{
				5455	size_t cur;
				5456	size_t offset;
				5457	struct page *page;
				5458	char *kaddr;
				5459	char __user dst = (char __user )dstv;
				5460	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
				5461	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
				5462	int ret = 0;
				5463
				5464	WARN_ON(start > eb->len);
				5465	WARN_ON(start + len > eb->start + eb->len);
				5466
				5467	offset = (start_offset + start) & (PAGE_SIZE - 1);
				5468
				5469	while (len > 0) {
				5470	page = eb->pages[i];
				5471
				5472	cur = min(len, (PAGE_SIZE - offset));
				5473	kaddr = page_address(page);
				5474	if (probe_user_write(dst, kaddr + offset, cur)) {
				5475	ret = -EFAULT;
				5476	break;
				5477	}
				5478
				5479	dst += cur;
				5480	len -= cur;
				5481	offset = 0;
				5482	i++;
				5483	}
				5484
				5485	return ret;
				5486	}
				5487
				5488	/*
				5489	* return 0 if the item is found within a page.
				5490	* return 1 if the item spans two pages.
				5491	* return -EINVAL otherwise.
				5492	*/
				5493	int map_private_extent_buffer(const struct extent_buffer *eb,
				5494	unsigned long start, unsigned long min_len,
				5495	char *map, unsigned long map_start,
				5496	unsigned long *map_len)
				5497	{
				5498	size_t offset = start & (PAGE_SIZE - 1);
				5499	char *kaddr;
				5500	struct page *p;
				5501	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
				5502	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
				5503	unsigned long end_i = (start_offset + start + min_len - 1) >>
				5504	PAGE_SHIFT;
				5505
				5506	if (start + min_len > eb->len) {
				5507	WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n",
				5508	eb->start, eb->len, start, min_len);
				5509	return -EINVAL;
				5510	}
				5511
				5512	if (i != end_i)
				5513	return 1;
				5514
				5515	if (i == 0) {
				5516	offset = start_offset;
				5517	*map_start = 0;
				5518	} else {
				5519	offset = 0;
				5520	*map_start = ((u64)i << PAGE_SHIFT) - start_offset;
				5521	}
				5522
				5523	p = eb->pages[i];
				5524	kaddr = page_address(p);
				5525	*map = kaddr + offset;
				5526	*map_len = PAGE_SIZE - offset;
				5527	return 0;
				5528	}
				5529
				5530	int memcmp_extent_buffer(const struct extent_buffer eb, const void ptrv,
				5531	unsigned long start, unsigned long len)
				5532	{
				5533	size_t cur;
				5534	size_t offset;
				5535	struct page *page;
				5536	char *kaddr;
				5537	char ptr = (char )ptrv;
				5538	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
				5539	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
				5540	int ret = 0;
				5541
				5542	WARN_ON(start > eb->len);
				5543	WARN_ON(start + len > eb->start + eb->len);
				5544
				5545	offset = (start_offset + start) & (PAGE_SIZE - 1);
				5546
				5547	while (len > 0) {
				5548	page = eb->pages[i];
				5549
				5550	cur = min(len, (PAGE_SIZE - offset));
				5551
				5552	kaddr = page_address(page);
				5553	ret = memcmp(ptr, kaddr + offset, cur);
				5554	if (ret)
				5555	break;
				5556
				5557	ptr += cur;
				5558	len -= cur;
				5559	offset = 0;
				5560	i++;
				5561	}
				5562	return ret;
				5563	}
				5564
				5565	void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb,
				5566	const void *srcv)
				5567	{
				5568	char *kaddr;
				5569
				5570	WARN_ON(!PageUptodate(eb->pages[0]));
				5571	kaddr = page_address(eb->pages[0]);
				5572	memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv,
				5573	BTRFS_FSID_SIZE);
				5574	}
				5575
				5576	void write_extent_buffer_fsid(struct extent_buffer eb, const void srcv)
				5577	{
				5578	char *kaddr;
				5579
				5580	WARN_ON(!PageUptodate(eb->pages[0]));
				5581	kaddr = page_address(eb->pages[0]);
				5582	memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv,
				5583	BTRFS_FSID_SIZE);
				5584	}
				5585
				5586	void write_extent_buffer(struct extent_buffer eb, const void srcv,
				5587	unsigned long start, unsigned long len)
				5588	{
				5589	size_t cur;
				5590	size_t offset;
				5591	struct page *page;
				5592	char *kaddr;
				5593	char src = (char )srcv;
				5594	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
				5595	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
				5596
				5597	WARN_ON(start > eb->len);
				5598	WARN_ON(start + len > eb->start + eb->len);
				5599
				5600	offset = (start_offset + start) & (PAGE_SIZE - 1);
				5601
				5602	while (len > 0) {
				5603	page = eb->pages[i];
				5604	WARN_ON(!PageUptodate(page));
				5605
				5606	cur = min(len, PAGE_SIZE - offset);
				5607	kaddr = page_address(page);
				5608	memcpy(kaddr + offset, src, cur);
				5609
				5610	src += cur;
				5611	len -= cur;
				5612	offset = 0;
				5613	i++;
				5614	}
				5615	}
				5616
				5617	void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start,
				5618	unsigned long len)
				5619	{
				5620	size_t cur;
				5621	size_t offset;
				5622	struct page *page;
				5623	char *kaddr;
				5624	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
				5625	unsigned long i = (start_offset + start) >> PAGE_SHIFT;
				5626
				5627	WARN_ON(start > eb->len);
				5628	WARN_ON(start + len > eb->start + eb->len);
				5629
				5630	offset = (start_offset + start) & (PAGE_SIZE - 1);
				5631
				5632	while (len > 0) {
				5633	page = eb->pages[i];
				5634	WARN_ON(!PageUptodate(page));
				5635
				5636	cur = min(len, PAGE_SIZE - offset);
				5637	kaddr = page_address(page);
				5638	memset(kaddr + offset, 0, cur);
				5639
				5640	len -= cur;
				5641	offset = 0;
				5642	i++;
				5643	}
				5644	}
				5645
				5646	void copy_extent_buffer_full(struct extent_buffer *dst,
				5647	struct extent_buffer *src)
				5648	{
				5649	int i;
				5650	unsigned num_pages;
				5651
				5652	ASSERT(dst->len == src->len);
				5653
				5654	num_pages = num_extent_pages(dst->start, dst->len);
				5655	for (i = 0; i < num_pages; i++)
				5656	copy_page(page_address(dst->pages[i]),
				5657	page_address(src->pages[i]));
				5658	}
				5659
				5660	void copy_extent_buffer(struct extent_buffer dst, struct extent_buffer src,
				5661	unsigned long dst_offset, unsigned long src_offset,
				5662	unsigned long len)
				5663	{
				5664	u64 dst_len = dst->len;
				5665	size_t cur;
				5666	size_t offset;
				5667	struct page *page;
				5668	char *kaddr;
				5669	size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
				5670	unsigned long i = (start_offset + dst_offset) >> PAGE_SHIFT;
				5671
				5672	WARN_ON(src->len != dst_len);
				5673
				5674	offset = (start_offset + dst_offset) &
				5675	(PAGE_SIZE - 1);
				5676
				5677	while (len > 0) {
				5678	page = dst->pages[i];
				5679	WARN_ON(!PageUptodate(page));
				5680
				5681	cur = min(len, (unsigned long)(PAGE_SIZE - offset));
				5682
				5683	kaddr = page_address(page);
				5684	read_extent_buffer(src, kaddr + offset, src_offset, cur);
				5685
				5686	src_offset += cur;
				5687	len -= cur;
				5688	offset = 0;
				5689	i++;
				5690	}
				5691	}
				5692
				5693	void le_bitmap_set(u8 *map, unsigned int start, int len)
				5694	{
				5695	u8 *p = map + BIT_BYTE(start);
				5696	const unsigned int size = start + len;
				5697	int bits_to_set = BITS_PER_BYTE - (start % BITS_PER_BYTE);
				5698	u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(start);
				5699
				5700	while (len - bits_to_set >= 0) {
				5701	*p \|= mask_to_set;
				5702	len -= bits_to_set;
				5703	bits_to_set = BITS_PER_BYTE;
				5704	mask_to_set = ~0;
				5705	p++;
				5706	}
				5707	if (len) {
				5708	mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
				5709	*p \|= mask_to_set;
				5710	}
				5711	}
				5712
				5713	void le_bitmap_clear(u8 *map, unsigned int start, int len)
				5714	{
				5715	u8 *p = map + BIT_BYTE(start);
				5716	const unsigned int size = start + len;
				5717	int bits_to_clear = BITS_PER_BYTE - (start % BITS_PER_BYTE);
				5718	u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(start);
				5719
				5720	while (len - bits_to_clear >= 0) {
				5721	*p &= ~mask_to_clear;
				5722	len -= bits_to_clear;
				5723	bits_to_clear = BITS_PER_BYTE;
				5724	mask_to_clear = ~0;
				5725	p++;
				5726	}
				5727	if (len) {
				5728	mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
				5729	*p &= ~mask_to_clear;
				5730	}
				5731	}
				5732
				5733	/*
				5734	* eb_bitmap_offset() - calculate the page and offset of the byte containing the
				5735	* given bit number
				5736	* @eb: the extent buffer
				5737	* @start: offset of the bitmap item in the extent buffer
				5738	* @nr: bit number
				5739	* @page_index: return index of the page in the extent buffer that contains the
				5740	* given bit number
				5741	* @page_offset: return offset into the page given by page_index
				5742	*
				5743	* This helper hides the ugliness of finding the byte in an extent buffer which
				5744	* contains a given bit.
				5745	*/
				5746	static inline void eb_bitmap_offset(struct extent_buffer *eb,
				5747	unsigned long start, unsigned long nr,
				5748	unsigned long *page_index,
				5749	size_t *page_offset)
				5750	{
				5751	size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1);
				5752	size_t byte_offset = BIT_BYTE(nr);
				5753	size_t offset;
				5754
				5755	/*
				5756	* The byte we want is the offset of the extent buffer + the offset of
				5757	* the bitmap item in the extent buffer + the offset of the byte in the
				5758	* bitmap item.
				5759	*/
				5760	offset = start_offset + start + byte_offset;
				5761
				5762	*page_index = offset >> PAGE_SHIFT;
				5763	*page_offset = offset & (PAGE_SIZE - 1);
				5764	}
				5765
				5766	/**
				5767	* extent_buffer_test_bit - determine whether a bit in a bitmap item is set
				5768	* @eb: the extent buffer
				5769	* @start: offset of the bitmap item in the extent buffer
				5770	* @nr: bit number to test
				5771	*/
				5772	int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
				5773	unsigned long nr)
				5774	{
				5775	u8 *kaddr;
				5776	struct page *page;
				5777	unsigned long i;
				5778	size_t offset;
				5779
				5780	eb_bitmap_offset(eb, start, nr, &i, &offset);
				5781	page = eb->pages[i];
				5782	WARN_ON(!PageUptodate(page));
				5783	kaddr = page_address(page);
				5784	return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
				5785	}
				5786
				5787	/**
				5788	* extent_buffer_bitmap_set - set an area of a bitmap
				5789	* @eb: the extent buffer
				5790	* @start: offset of the bitmap item in the extent buffer
				5791	* @pos: bit number of the first bit
				5792	* @len: number of bits to set
				5793	*/
				5794	void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
				5795	unsigned long pos, unsigned long len)
				5796	{
				5797	u8 *kaddr;
				5798	struct page *page;
				5799	unsigned long i;
				5800	size_t offset;
				5801	const unsigned int size = pos + len;
				5802	int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
				5803	u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
				5804
				5805	eb_bitmap_offset(eb, start, pos, &i, &offset);
				5806	page = eb->pages[i];
				5807	WARN_ON(!PageUptodate(page));
				5808	kaddr = page_address(page);
				5809
				5810	while (len >= bits_to_set) {
				5811	kaddr[offset] \|= mask_to_set;
				5812	len -= bits_to_set;
				5813	bits_to_set = BITS_PER_BYTE;
				5814	mask_to_set = ~0;
				5815	if (++offset >= PAGE_SIZE && len > 0) {
				5816	offset = 0;
				5817	page = eb->pages[++i];
				5818	WARN_ON(!PageUptodate(page));
				5819	kaddr = page_address(page);
				5820	}
				5821	}
				5822	if (len) {
				5823	mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
				5824	kaddr[offset] \|= mask_to_set;
				5825	}
				5826	}
				5827
				5828
				5829	/**
				5830	* extent_buffer_bitmap_clear - clear an area of a bitmap
				5831	* @eb: the extent buffer
				5832	* @start: offset of the bitmap item in the extent buffer
				5833	* @pos: bit number of the first bit
				5834	* @len: number of bits to clear
				5835	*/
				5836	void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
				5837	unsigned long pos, unsigned long len)
				5838	{
				5839	u8 *kaddr;
				5840	struct page *page;
				5841	unsigned long i;
				5842	size_t offset;
				5843	const unsigned int size = pos + len;
				5844	int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
				5845	u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
				5846
				5847	eb_bitmap_offset(eb, start, pos, &i, &offset);
				5848	page = eb->pages[i];
				5849	WARN_ON(!PageUptodate(page));
				5850	kaddr = page_address(page);
				5851
				5852	while (len >= bits_to_clear) {
				5853	kaddr[offset] &= ~mask_to_clear;
				5854	len -= bits_to_clear;
				5855	bits_to_clear = BITS_PER_BYTE;
				5856	mask_to_clear = ~0;
				5857	if (++offset >= PAGE_SIZE && len > 0) {
				5858	offset = 0;
				5859	page = eb->pages[++i];
				5860	WARN_ON(!PageUptodate(page));
				5861	kaddr = page_address(page);
				5862	}
				5863	}
				5864	if (len) {
				5865	mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
				5866	kaddr[offset] &= ~mask_to_clear;
				5867	}
				5868	}
				5869
				/*
				 * Tell whether two ranges of length @len that begin at @src and @dst overlap,
				 * i.e. whether their starting offsets are less than @len apart.
				 */
				static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
				{
					unsigned long gap;

					if (src > dst)
						gap = src - dst;
					else
						gap = dst - src;

					return gap < len;
				}
				5875
				/*
				 * Copy @len bytes from offset @src_off of @src_page to offset @dst_off of
				 * @dst_page.
				 *
				 * Both pages are taken by pointer: they are handed to page_address() and
				 * compared for identity, and callers pass entries of an extent buffer's
				 * pages[] array.
				 *
				 * memmove() is used only when source and destination are the same page and
				 * the two ranges overlap; in every other case memcpy() is used.
				 */
				static void copy_pages(struct page *dst_page, struct page *src_page,
						       unsigned long dst_off, unsigned long src_off,
						       unsigned long len)
				{
					char *dst_kaddr = page_address(dst_page);
					char *src_kaddr;

					/* Same page: reuse the mapping already looked up for the destination. */
					if (dst_page == src_page)
						src_kaddr = dst_kaddr;
					else
						src_kaddr = page_address(src_page);

					if (dst_page == src_page && areas_overlap(src_off, dst_off, len))
						memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
					else
						memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
				}
				5897
				5898	void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
				5899	unsigned long src_offset, unsigned long len)
				5900	{
				5901	struct btrfs_fs_info *fs_info = dst->fs_info;
				5902	size_t cur;
				5903	size_t dst_off_in_page;
				5904	size_t src_off_in_page;
				5905	size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
				5906	unsigned long dst_i;
				5907	unsigned long src_i;
				5908
				5909	if (src_offset + len > dst->len) {
				5910	btrfs_err(fs_info,
				5911	"memmove bogus src_offset %lu move len %lu dst len %lu",
				5912	src_offset, len, dst->len);
				5913	BUG_ON(1);
				5914	}
				5915	if (dst_offset + len > dst->len) {
				5916	btrfs_err(fs_info,
				5917	"memmove bogus dst_offset %lu move len %lu dst len %lu",
				5918	dst_offset, len, dst->len);
				5919	BUG_ON(1);
				5920	}
				5921
				5922	while (len > 0) {
				5923	dst_off_in_page = (start_offset + dst_offset) &
				5924	(PAGE_SIZE - 1);
				5925	src_off_in_page = (start_offset + src_offset) &
				5926	(PAGE_SIZE - 1);
				5927
				5928	dst_i = (start_offset + dst_offset) >> PAGE_SHIFT;
				5929	src_i = (start_offset + src_offset) >> PAGE_SHIFT;
				5930
				5931	cur = min(len, (unsigned long)(PAGE_SIZE -
				5932	src_off_in_page));
				5933	cur = min_t(unsigned long, cur,
				5934	(unsigned long)(PAGE_SIZE - dst_off_in_page));
				5935
				5936	copy_pages(dst->pages[dst_i], dst->pages[src_i],
				5937	dst_off_in_page, src_off_in_page, cur);
				5938
				5939	src_offset += cur;
				5940	dst_offset += cur;
				5941	len -= cur;
				5942	}
				5943	}
				5944
				5945	void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
				5946	unsigned long src_offset, unsigned long len)
				5947	{
				5948	struct btrfs_fs_info *fs_info = dst->fs_info;
				5949	size_t cur;
				5950	size_t dst_off_in_page;
				5951	size_t src_off_in_page;
				5952	unsigned long dst_end = dst_offset + len - 1;
				5953	unsigned long src_end = src_offset + len - 1;
				5954	size_t start_offset = dst->start & ((u64)PAGE_SIZE - 1);
				5955	unsigned long dst_i;
				5956	unsigned long src_i;
				5957
				5958	if (src_offset + len > dst->len) {
				5959	btrfs_err(fs_info,
				5960	"memmove bogus src_offset %lu move len %lu len %lu",
				5961	src_offset, len, dst->len);
				5962	BUG_ON(1);
				5963	}
				5964	if (dst_offset + len > dst->len) {
				5965	btrfs_err(fs_info,
				5966	"memmove bogus dst_offset %lu move len %lu len %lu",
				5967	dst_offset, len, dst->len);
				5968	BUG_ON(1);
				5969	}
				5970	if (dst_offset < src_offset) {
				5971	memcpy_extent_buffer(dst, dst_offset, src_offset, len);
				5972	return;
				5973	}
				5974	while (len > 0) {
				5975	dst_i = (start_offset + dst_end) >> PAGE_SHIFT;
				5976	src_i = (start_offset + src_end) >> PAGE_SHIFT;
				5977
				5978	dst_off_in_page = (start_offset + dst_end) &
				5979	(PAGE_SIZE - 1);
				5980	src_off_in_page = (start_offset + src_end) &
				5981	(PAGE_SIZE - 1);
				5982
				5983	cur = min_t(unsigned long, len, src_off_in_page + 1);
				5984	cur = min(cur, dst_off_in_page + 1);
				5985	copy_pages(dst->pages[dst_i], dst->pages[src_i],
				5986	dst_off_in_page - cur + 1,
				5987	src_off_in_page - cur + 1, cur);
				5988
				5989	dst_end -= cur;
				5990	src_end -= cur;
				5991	len -= cur;
				5992	}
				5993	}
				5994
				5995	int try_release_extent_buffer(struct page *page)
				5996	{
				5997	struct extent_buffer *eb;
				5998
				5999	/*
				6000	* We need to make sure nobody is attaching this page to an eb right
				6001	* now.
				6002	*/
				6003	spin_lock(&page->mapping->private_lock);
				6004	if (!PagePrivate(page)) {
				6005	spin_unlock(&page->mapping->private_lock);
				6006	return 1;
				6007	}
				6008
				6009	eb = (struct extent_buffer *)page->private;
				6010	BUG_ON(!eb);
				6011
				6012	/*
				6013	* This is a little awful but should be ok, we need to make sure that
				6014	* the eb doesn't disappear out from under us while we're looking at
				6015	* this page.
				6016	*/
				6017	spin_lock(&eb->refs_lock);
				6018	if (atomic_read(&eb->refs) != 1 \|\| extent_buffer_under_io(eb)) {
				6019	spin_unlock(&eb->refs_lock);
				6020	spin_unlock(&page->mapping->private_lock);
				6021	return 0;
				6022	}
				6023	spin_unlock(&page->mapping->private_lock);
				6024
				6025	/*
				6026	* If tree ref isn't set then we know the ref on this eb is a real ref,
				6027	* so just return, this page will likely be freed soon anyway.
				6028	*/
				6029	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
				6030	spin_unlock(&eb->refs_lock);
				6031	return 0;
				6032	}
				6033
				6034	return release_extent_buffer(eb);
				6035	}