Blame - ap/os/linux/linux-3.4.x/fs/btrfs/extent_io.c - R306

blob: 24b58c7f01ef2635c388575b2480504f46f83d9b [file] [log] [blame]

yuezonghe	824eb0c	2024-06-27 02:32:26 -0700	[diff] [blame]	1	#include <linux/bitops.h>
				2	#include <linux/slab.h>
				3	#include <linux/bio.h>
				4	#include <linux/mm.h>
				5	#include <linux/pagemap.h>
				6	#include <linux/page-flags.h>
				7	#include <linux/module.h>
				8	#include <linux/spinlock.h>
				9	#include <linux/blkdev.h>
				10	#include <linux/swap.h>
				11	#include <linux/writeback.h>
				12	#include <linux/pagevec.h>
				13	#include <linux/prefetch.h>
				14	#include <linux/cleancache.h>
				15	#include "extent_io.h"
				16	#include "extent_map.h"
				17	#include "compat.h"
				18	#include "ctree.h"
				19	#include "btrfs_inode.h"
				20	#include "volumes.h"
				21	#include "check-integrity.h"
				22	#include "locking.h"
				23
				/* Slab caches created in extent_io_init() and torn down in extent_io_exit(). */
				static struct kmem_cache *extent_buffer_cache;
				static struct kmem_cache *extent_state_cache;

				/*
				 * Lists that extent_io_exit() drains and reports as leaked objects.
				 * alloc_extent_state() only links states here when LEAK_DEBUG is set.
				 */
				static LIST_HEAD(states);
				static LIST_HEAD(buffers);

				/* Set to 1 to track live extent states on the list above under leak_lock. */
				#define LEAK_DEBUG 0
				#if LEAK_DEBUG
				static DEFINE_SPINLOCK(leak_lock);
				#endif

				/* NOTE(review): no user of this limit is visible in this part of the file. */
				#define BUFFER_LRU_MAX 64
				36
				37	struct tree_entry {
				38	u64 start;
				39	u64 end;
				40	struct rb_node rb_node;
				41	};
				42
				43	struct extent_page_data {
				44	struct bio *bio;
				45	struct extent_io_tree *tree;
				46	get_extent_t *get_extent;
				47
				48	/* tells writepage not to lock the state bits for this range
				49	* it still does the unlocking
				50	*/
				51	unsigned int extent_locked:1;
				52
				53	/* tells the submit_bio code to use a WRITE_SYNC */
				54	unsigned int sync_io:1;
				55	};
				56
				57	static noinline void flush_write_bio(void *data);
				58	static inline struct btrfs_fs_info *
				59	tree_fs_info(struct extent_io_tree *tree)
				60	{
				61	return btrfs_sb(tree->mapping->host->i_sb);
				62	}
				63
				64	int __init extent_io_init(void)
				65	{
				66	extent_state_cache = kmem_cache_create("extent_state",
				67	sizeof(struct extent_state), 0,
				68	SLAB_RECLAIM_ACCOUNT \| SLAB_MEM_SPREAD, NULL);
				69	if (!extent_state_cache)
				70	return -ENOMEM;
				71
				72	extent_buffer_cache = kmem_cache_create("extent_buffers",
				73	sizeof(struct extent_buffer), 0,
				74	SLAB_RECLAIM_ACCOUNT \| SLAB_MEM_SPREAD, NULL);
				75	if (!extent_buffer_cache)
				76	goto free_state_cache;
				77	return 0;
				78
				79	free_state_cache:
				80	kmem_cache_destroy(extent_state_cache);
				81	return -ENOMEM;
				82	}
				83
				84	void extent_io_exit(void)
				85	{
				86	struct extent_state *state;
				87	struct extent_buffer *eb;
				88
				89	while (!list_empty(&states)) {
				90	state = list_entry(states.next, struct extent_state, leak_list);
				91	printk(KERN_ERR "btrfs state leak: start %llu end %llu "
				92	"state %lu in tree %p refs %d\n",
				93	(unsigned long long)state->start,
				94	(unsigned long long)state->end,
				95	state->state, state->tree, atomic_read(&state->refs));
				96	list_del(&state->leak_list);
				97	kmem_cache_free(extent_state_cache, state);
				98
				99	}
				100
				101	while (!list_empty(&buffers)) {
				102	eb = list_entry(buffers.next, struct extent_buffer, leak_list);
				103	printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
				104	"refs %d\n", (unsigned long long)eb->start,
				105	eb->len, atomic_read(&eb->refs));
				106	list_del(&eb->leak_list);
				107	kmem_cache_free(extent_buffer_cache, eb);
				108	}
				109	if (extent_state_cache)
				110	kmem_cache_destroy(extent_state_cache);
				111	if (extent_buffer_cache)
				112	kmem_cache_destroy(extent_buffer_cache);
				113	}
				114
				115	void extent_io_tree_init(struct extent_io_tree *tree,
				116	struct address_space *mapping)
				117	{
				118	tree->state = RB_ROOT;
				119	INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
				120	tree->ops = NULL;
				121	tree->dirty_bytes = 0;
				122	spin_lock_init(&tree->lock);
				123	spin_lock_init(&tree->buffer_lock);
				124	tree->mapping = mapping;
				125	}
				126
				127	static struct extent_state *alloc_extent_state(gfp_t mask)
				128	{
				129	struct extent_state *state;
				130	#if LEAK_DEBUG
				131	unsigned long flags;
				132	#endif
				133
				134	state = kmem_cache_alloc(extent_state_cache, mask);
				135	if (!state)
				136	return state;
				137	state->state = 0;
				138	state->private = 0;
				139	state->tree = NULL;
				140	#if LEAK_DEBUG
				141	spin_lock_irqsave(&leak_lock, flags);
				142	list_add(&state->leak_list, &states);
				143	spin_unlock_irqrestore(&leak_lock, flags);
				144	#endif
				145	atomic_set(&state->refs, 1);
				146	init_waitqueue_head(&state->wq);
				147	trace_alloc_extent_state(state, mask, _RET_IP_);
				148	return state;
				149	}
				150
				151	void free_extent_state(struct extent_state *state)
				152	{
				153	if (!state)
				154	return;
				155	if (atomic_dec_and_test(&state->refs)) {
				156	#if LEAK_DEBUG
				157	unsigned long flags;
				158	#endif
				159	WARN_ON(state->tree);
				160	#if LEAK_DEBUG
				161	spin_lock_irqsave(&leak_lock, flags);
				162	list_del(&state->leak_list);
				163	spin_unlock_irqrestore(&leak_lock, flags);
				164	#endif
				165	trace_free_extent_state(state, _RET_IP_);
				166	kmem_cache_free(extent_state_cache, state);
				167	}
				168	}
				169
				170	static struct rb_node tree_insert(struct rb_root root, u64 offset,
				171	struct rb_node *node)
				172	{
				173	struct rb_node **p = &root->rb_node;
				174	struct rb_node *parent = NULL;
				175	struct tree_entry *entry;
				176
				177	while (*p) {
				178	parent = *p;
				179	entry = rb_entry(parent, struct tree_entry, rb_node);
				180
				181	if (offset < entry->start)
				182	p = &(*p)->rb_left;
				183	else if (offset > entry->end)
				184	p = &(*p)->rb_right;
				185	else
				186	return parent;
				187	}
				188
				189	entry = rb_entry(node, struct tree_entry, rb_node);
				190	rb_link_node(node, parent, p);
				191	rb_insert_color(node, root);
				192	return NULL;
				193	}
				194
				195	static struct rb_node __etree_search(struct extent_io_tree tree, u64 offset,
				196	struct rb_node **prev_ret,
				197	struct rb_node **next_ret)
				198	{
				199	struct rb_root *root = &tree->state;
				200	struct rb_node *n = root->rb_node;
				201	struct rb_node *prev = NULL;
				202	struct rb_node *orig_prev = NULL;
				203	struct tree_entry *entry;
				204	struct tree_entry *prev_entry = NULL;
				205
				206	while (n) {
				207	entry = rb_entry(n, struct tree_entry, rb_node);
				208	prev = n;
				209	prev_entry = entry;
				210
				211	if (offset < entry->start)
				212	n = n->rb_left;
				213	else if (offset > entry->end)
				214	n = n->rb_right;
				215	else
				216	return n;
				217	}
				218
				219	if (prev_ret) {
				220	orig_prev = prev;
				221	while (prev && offset > prev_entry->end) {
				222	prev = rb_next(prev);
				223	prev_entry = rb_entry(prev, struct tree_entry, rb_node);
				224	}
				225	*prev_ret = prev;
				226	prev = orig_prev;
				227	}
				228
				229	if (next_ret) {
				230	prev_entry = rb_entry(prev, struct tree_entry, rb_node);
				231	while (prev && offset < prev_entry->start) {
				232	prev = rb_prev(prev);
				233	prev_entry = rb_entry(prev, struct tree_entry, rb_node);
				234	}
				235	*next_ret = prev;
				236	}
				237	return NULL;
				238	}
				239
				240	static inline struct rb_node tree_search(struct extent_io_tree tree,
				241	u64 offset)
				242	{
				243	struct rb_node *prev = NULL;
				244	struct rb_node *ret;
				245
				246	ret = __etree_search(tree, offset, &prev, NULL);
				247	if (!ret)
				248	return prev;
				249	return ret;
				250	}
				251
				252	static void merge_cb(struct extent_io_tree tree, struct extent_state new,
				253	struct extent_state *other)
				254	{
				255	if (tree->ops && tree->ops->merge_extent_hook)
				256	tree->ops->merge_extent_hook(tree->mapping->host, new,
				257	other);
				258	}
				259
				260	/*
				261	* utility function to look for merge candidates inside a given range.
				262	* Any extents with matching state are merged together into a single
				263	* extent in the tree. Extents with EXTENT_IO in their state field
				264	* are not merged because the end_io handlers need to be able to do
				265	* operations on them without sleeping (or doing allocations/splits).
				266	*
				267	* This should be called with the tree lock held.
				268	*/
				269	static void merge_state(struct extent_io_tree *tree,
				270	struct extent_state *state)
				271	{
				272	struct extent_state *other;
				273	struct rb_node *other_node;
				274
				275	if (state->state & (EXTENT_IOBITS \| EXTENT_BOUNDARY))
				276	return;
				277
				278	other_node = rb_prev(&state->rb_node);
				279	if (other_node) {
				280	other = rb_entry(other_node, struct extent_state, rb_node);
				281	if (other->end == state->start - 1 &&
				282	other->state == state->state) {
				283	merge_cb(tree, state, other);
				284	state->start = other->start;
				285	other->tree = NULL;
				286	rb_erase(&other->rb_node, &tree->state);
				287	free_extent_state(other);
				288	}
				289	}
				290	other_node = rb_next(&state->rb_node);
				291	if (other_node) {
				292	other = rb_entry(other_node, struct extent_state, rb_node);
				293	if (other->start == state->end + 1 &&
				294	other->state == state->state) {
				295	merge_cb(tree, state, other);
				296	state->end = other->end;
				297	other->tree = NULL;
				298	rb_erase(&other->rb_node, &tree->state);
				299	free_extent_state(other);
				300	}
				301	}
				302	}
				303
				304	static void set_state_cb(struct extent_io_tree *tree,
				305	struct extent_state state, int bits)
				306	{
				307	if (tree->ops && tree->ops->set_bit_hook)
				308	tree->ops->set_bit_hook(tree->mapping->host, state, bits);
				309	}
				310
				311	static void clear_state_cb(struct extent_io_tree *tree,
				312	struct extent_state state, int bits)
				313	{
				314	if (tree->ops && tree->ops->clear_bit_hook)
				315	tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
				316	}
				317
				/* Forward declaration: insert_state() needs this before its definition. */
				static void set_state_bits(struct extent_io_tree *tree,
							   struct extent_state *state, int *bits);
				320
				321	/*
				322	* insert an extent_state struct into the tree. 'bits' are set on the
				323	* struct before it is inserted.
				324	*
				325	* This may return -EEXIST if the extent is already there, in which case the
				326	* state struct is freed.
				327	*
				328	* The tree lock is not taken internally. This is a utility function and
				329	* probably isn't what you want to call (see set/clear_extent_bit).
				330	*/
				331	static int insert_state(struct extent_io_tree *tree,
				332	struct extent_state *state, u64 start, u64 end,
				333	int *bits)
				334	{
				335	struct rb_node *node;
				336
				337	if (end < start) {
				338	printk(KERN_ERR "btrfs end < start %llu %llu\n",
				339	(unsigned long long)end,
				340	(unsigned long long)start);
				341	WARN_ON(1);
				342	}
				343	state->start = start;
				344	state->end = end;
				345
				346	set_state_bits(tree, state, bits);
				347
				348	node = tree_insert(&tree->state, end, &state->rb_node);
				349	if (node) {
				350	struct extent_state *found;
				351	found = rb_entry(node, struct extent_state, rb_node);
				352	printk(KERN_ERR "btrfs found node %llu %llu on insert of "
				353	"%llu %llu\n", (unsigned long long)found->start,
				354	(unsigned long long)found->end,
				355	(unsigned long long)start, (unsigned long long)end);
				356	return -EEXIST;
				357	}
				358	state->tree = tree;
				359	merge_state(tree, state);
				360	return 0;
				361	}
				362
				363	static void split_cb(struct extent_io_tree tree, struct extent_state orig,
				364	u64 split)
				365	{
				366	if (tree->ops && tree->ops->split_extent_hook)
				367	tree->ops->split_extent_hook(tree->mapping->host, orig, split);
				368	}
				369
				370	/*
				371	* split a given extent state struct in two, inserting the preallocated
				372	* struct 'prealloc' as the newly created second half. 'split' indicates an
				373	* offset inside 'orig' where it should be split.
				374	*
				375	* Before calling,
				376	* the tree has 'orig' at [orig->start, orig->end]. After calling, there
				377	* are two extent state structs in the tree:
				378	* prealloc: [orig->start, split - 1]
				379	* orig: [ split, orig->end ]
				380	*
				381	* The tree locks are not taken by this function. They need to be held
				382	* by the caller.
				383	*/
				384	static int split_state(struct extent_io_tree tree, struct extent_state orig,
				385	struct extent_state *prealloc, u64 split)
				386	{
				387	struct rb_node *node;
				388
				389	split_cb(tree, orig, split);
				390
				391	prealloc->start = orig->start;
				392	prealloc->end = split - 1;
				393	prealloc->state = orig->state;
				394	orig->start = split;
				395
				396	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
				397	if (node) {
				398	free_extent_state(prealloc);
				399	return -EEXIST;
				400	}
				401	prealloc->tree = tree;
				402	return 0;
				403	}
				404
				405	static struct extent_state next_state(struct extent_state state)
				406	{
				407	struct rb_node *next = rb_next(&state->rb_node);
				408	if (next)
				409	return rb_entry(next, struct extent_state, rb_node);
				410	else
				411	return NULL;
				412	}
				413
				414	/*
				415	* utility function to clear some bits in an extent state struct.
				416	* it will optionally wake up any one waiting on this state (wake == 1)
				417	*
				418	* If no bits are set on the state struct after clearing things, the
				419	* struct is freed and removed from the tree
				420	*/
				421	static struct extent_state clear_state_bit(struct extent_io_tree tree,
				422	struct extent_state *state,
				423	int *bits, int wake)
				424	{
				425	struct extent_state *next;
				426	int bits_to_clear = *bits & ~EXTENT_CTLBITS;
				427
				428	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
				429	u64 range = state->end - state->start + 1;
				430	WARN_ON(range > tree->dirty_bytes);
				431	tree->dirty_bytes -= range;
				432	}
				433	clear_state_cb(tree, state, bits);
				434	state->state &= ~bits_to_clear;
				435	if (wake)
				436	wake_up(&state->wq);
				437	if (state->state == 0) {
				438	next = next_state(state);
				439	if (state->tree) {
				440	rb_erase(&state->rb_node, &tree->state);
				441	state->tree = NULL;
				442	free_extent_state(state);
				443	} else {
				444	WARN_ON(1);
				445	}
				446	} else {
				447	merge_state(tree, state);
				448	next = next_state(state);
				449	}
				450	return next;
				451	}
				452
				453	static struct extent_state *
				454	alloc_extent_state_atomic(struct extent_state *prealloc)
				455	{
				456	if (!prealloc)
				457	prealloc = alloc_extent_state(GFP_ATOMIC);
				458
				459	return prealloc;
				460	}
				461
				/*
				 * Report, via btrfs_panic(), that the tree changed underneath a caller
				 * that believed it held the range locked.  'err' is passed through as the
				 * error code.
				 */
				void extent_io_tree_panic(struct extent_io_tree *tree, int err)
				{
					struct btrfs_fs_info *fs_info = tree_fs_info(tree);

					btrfs_panic(fs_info, err,
						    "Locking error: "
						    "Extent tree was modified by another "
						    "thread while locked.");
				}
				468
				469	/*
				470	* clear some bits on a range in the tree. This may require splitting
				471	* or inserting elements in the tree, so the gfp mask is used to
				472	* indicate which allocations or sleeping are allowed.
				473	*
				474	* pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
				475	* the given range from the tree regardless of state (ie for truncate).
				476	*
				477	* the range [start, end] is inclusive.
				478	*
				479	* This takes the tree lock, and returns 0 on success and < 0 on error.
				480	*/
				481	int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
				482	int bits, int wake, int delete,
				483	struct extent_state **cached_state,
				484	gfp_t mask)
				485	{
				486	struct extent_state *state;
				487	struct extent_state *cached;
				488	struct extent_state *prealloc = NULL;
				489	struct rb_node *node;
				490	u64 last_end;
				491	int err;
				492	int clear = 0;
				493
				494	if (delete)
				495	bits \|= ~EXTENT_CTLBITS;
				496	bits \|= EXTENT_FIRST_DELALLOC;
				497
				498	if (bits & (EXTENT_IOBITS \| EXTENT_BOUNDARY))
				499	clear = 1;
				500	again:
				501	if (!prealloc && (mask & __GFP_WAIT)) {
				502	prealloc = alloc_extent_state(mask);
				503	if (!prealloc)
				504	return -ENOMEM;
				505	}
				506
				507	spin_lock(&tree->lock);
				508	if (cached_state) {
				509	cached = *cached_state;
				510
				511	if (clear) {
				512	*cached_state = NULL;
				513	cached_state = NULL;
				514	}
				515
				516	if (cached && cached->tree && cached->start <= start &&
				517	cached->end > start) {
				518	if (clear)
				519	atomic_dec(&cached->refs);
				520	state = cached;
				521	goto hit_next;
				522	}
				523	if (clear)
				524	free_extent_state(cached);
				525	}
				526	/*
				527	* this search will find the extents that end after
				528	* our range starts
				529	*/
				530	node = tree_search(tree, start);
				531	if (!node)
				532	goto out;
				533	state = rb_entry(node, struct extent_state, rb_node);
				534	hit_next:
				535	if (state->start > end)
				536	goto out;
				537	WARN_ON(state->end < start);
				538	last_end = state->end;
				539
				540	/* the state doesn't have the wanted bits, go ahead */
				541	if (!(state->state & bits)) {
				542	state = next_state(state);
				543	goto next;
				544	}
				545
				546	/*
				547	* \| ---- desired range ---- \|
				548	* \| state \| or
				549	* \| ------------- state -------------- \|
				550	*
				551	* We need to split the extent we found, and may flip
				552	* bits on second half.
				553	*
				554	* If the extent we found extends past our range, we
				555	* just split and search again. It'll get split again
				556	* the next time though.
				557	*
				558	* If the extent we found is inside our range, we clear
				559	* the desired bit on it.
				560	*/
				561
				562	if (state->start < start) {
				563	prealloc = alloc_extent_state_atomic(prealloc);
				564	BUG_ON(!prealloc);
				565	err = split_state(tree, state, prealloc, start);
				566	if (err)
				567	extent_io_tree_panic(tree, err);
				568
				569	prealloc = NULL;
				570	if (err)
				571	goto out;
				572	if (state->end <= end) {
				573	clear_state_bit(tree, state, &bits, wake);
				574	if (last_end == (u64)-1)
				575	goto out;
				576	start = last_end + 1;
				577	}
				578	goto search_again;
				579	}
				580	/*
				581	* \| ---- desired range ---- \|
				582	* \| state \|
				583	* We need to split the extent, and clear the bit
				584	* on the first half
				585	*/
				586	if (state->start <= end && state->end > end) {
				587	prealloc = alloc_extent_state_atomic(prealloc);
				588	BUG_ON(!prealloc);
				589	err = split_state(tree, state, prealloc, end + 1);
				590	if (err)
				591	extent_io_tree_panic(tree, err);
				592
				593	if (wake)
				594	wake_up(&state->wq);
				595
				596	clear_state_bit(tree, prealloc, &bits, wake);
				597
				598	prealloc = NULL;
				599	goto out;
				600	}
				601
				602	state = clear_state_bit(tree, state, &bits, wake);
				603	next:
				604	if (last_end == (u64)-1)
				605	goto out;
				606	start = last_end + 1;
				607	if (start <= end && state && !need_resched())
				608	goto hit_next;
				609	goto search_again;
				610
				611	out:
				612	spin_unlock(&tree->lock);
				613	if (prealloc)
				614	free_extent_state(prealloc);
				615
				616	return 0;
				617
				618	search_again:
				619	if (start > end)
				620	goto out;
				621	spin_unlock(&tree->lock);
				622	if (mask & __GFP_WAIT)
				623	cond_resched();
				624	goto again;
				625	}
				626
				627	static void wait_on_state(struct extent_io_tree *tree,
				628	struct extent_state *state)
				629	__releases(tree->lock)
				630	__acquires(tree->lock)
				631	{
				632	DEFINE_WAIT(wait);
				633	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
				634	spin_unlock(&tree->lock);
				635	schedule();
				636	spin_lock(&tree->lock);
				637	finish_wait(&state->wq, &wait);
				638	}
				639
				640	/*
				641	* waits for one or more bits to clear on a range in the state tree.
				642	* The range [start, end] is inclusive.
				643	* The tree lock is taken by this function
				644	*/
				645	void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
				646	{
				647	struct extent_state *state;
				648	struct rb_node *node;
				649
				650	spin_lock(&tree->lock);
				651	again:
				652	while (1) {
				653	/*
				654	* this search will find all the extents that end after
				655	* our range starts
				656	*/
				657	node = tree_search(tree, start);
				658	if (!node)
				659	break;
				660
				661	state = rb_entry(node, struct extent_state, rb_node);
				662
				663	if (state->start > end)
				664	goto out;
				665
				666	if (state->state & bits) {
				667	start = state->start;
				668	atomic_inc(&state->refs);
				669	wait_on_state(tree, state);
				670	free_extent_state(state);
				671	goto again;
				672	}
				673	start = state->end + 1;
				674
				675	if (start > end)
				676	break;
				677
				678	cond_resched_lock(&tree->lock);
				679	}
				680	out:
				681	spin_unlock(&tree->lock);
				682	}
				683
				684	static void set_state_bits(struct extent_io_tree *tree,
				685	struct extent_state *state,
				686	int *bits)
				687	{
				688	int bits_to_set = *bits & ~EXTENT_CTLBITS;
				689
				690	set_state_cb(tree, state, bits);
				691	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
				692	u64 range = state->end - state->start + 1;
				693	tree->dirty_bytes += range;
				694	}
				695	state->state \|= bits_to_set;
				696	}
				697
				698	static void cache_state(struct extent_state *state,
				699	struct extent_state **cached_ptr)
				700	{
				701	if (cached_ptr && !(*cached_ptr)) {
				702	if (state->state & (EXTENT_IOBITS \| EXTENT_BOUNDARY)) {
				703	*cached_ptr = state;
				704	atomic_inc(&state->refs);
				705	}
				706	}
				707	}
				708
				709	static void uncache_state(struct extent_state **cached_ptr)
				710	{
				711	if (cached_ptr && (*cached_ptr)) {
				712	struct extent_state state = cached_ptr;
				713	*cached_ptr = NULL;
				714	free_extent_state(state);
				715	}
				716	}
				717
				718	/*
				719	* set some bits on a range in the tree. This may require allocations or
				720	* sleeping, so the gfp mask is used to indicate what is allowed.
				721	*
				722	* If any of the exclusive bits are set, this will fail with -EEXIST if some
				723	* part of the range already has the desired bits set. The start of the
				724	* existing range is returned in failed_start in this case.
				725	*
				726	* [start, end] is inclusive This takes the tree lock.
				727	*/
				728
				729	static int __must_check
				730	__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
				731	int bits, int exclusive_bits, u64 *failed_start,
				732	struct extent_state **cached_state, gfp_t mask)
				733	{
				734	struct extent_state *state;
				735	struct extent_state *prealloc = NULL;
				736	struct rb_node *node;
				737	int err = 0;
				738	u64 last_start;
				739	u64 last_end;
				740
				741	bits \|= EXTENT_FIRST_DELALLOC;
				742	again:
				743	if (!prealloc && (mask & __GFP_WAIT)) {
				744	prealloc = alloc_extent_state(mask);
				745	BUG_ON(!prealloc);
				746	}
				747
				748	spin_lock(&tree->lock);
				749	if (cached_state && *cached_state) {
				750	state = *cached_state;
				751	if (state->start <= start && state->end > start &&
				752	state->tree) {
				753	node = &state->rb_node;
				754	goto hit_next;
				755	}
				756	}
				757	/*
				758	* this search will find all the extents that end after
				759	* our range starts.
				760	*/
				761	node = tree_search(tree, start);
				762	if (!node) {
				763	prealloc = alloc_extent_state_atomic(prealloc);
				764	BUG_ON(!prealloc);
				765	err = insert_state(tree, prealloc, start, end, &bits);
				766	if (err)
				767	extent_io_tree_panic(tree, err);
				768
				769	prealloc = NULL;
				770	goto out;
				771	}
				772	state = rb_entry(node, struct extent_state, rb_node);
				773	hit_next:
				774	last_start = state->start;
				775	last_end = state->end;
				776
				777	/*
				778	* \| ---- desired range ---- \|
				779	* \| state \|
				780	*
				781	* Just lock what we found and keep going
				782	*/
				783	if (state->start == start && state->end <= end) {
				784	struct rb_node *next_node;
				785	if (state->state & exclusive_bits) {
				786	*failed_start = state->start;
				787	err = -EEXIST;
				788	goto out;
				789	}
				790
				791	set_state_bits(tree, state, &bits);
				792
				793	cache_state(state, cached_state);
				794	merge_state(tree, state);
				795	if (last_end == (u64)-1)
				796	goto out;
				797
				798	start = last_end + 1;
				799	next_node = rb_next(&state->rb_node);
				800	if (next_node && start < end && prealloc && !need_resched()) {
				801	state = rb_entry(next_node, struct extent_state,
				802	rb_node);
				803	if (state->start == start)
				804	goto hit_next;
				805	}
				806	goto search_again;
				807	}
				808
				809	/*
				810	* \| ---- desired range ---- \|
				811	* \| state \|
				812	* or
				813	* \| ------------- state -------------- \|
				814	*
				815	* We need to split the extent we found, and may flip bits on
				816	* second half.
				817	*
				818	* If the extent we found extends past our
				819	* range, we just split and search again. It'll get split
				820	* again the next time though.
				821	*
				822	* If the extent we found is inside our range, we set the
				823	* desired bit on it.
				824	*/
				825	if (state->start < start) {
				826	if (state->state & exclusive_bits) {
				827	*failed_start = start;
				828	err = -EEXIST;
				829	goto out;
				830	}
				831
				832	prealloc = alloc_extent_state_atomic(prealloc);
				833	BUG_ON(!prealloc);
				834	err = split_state(tree, state, prealloc, start);
				835	if (err)
				836	extent_io_tree_panic(tree, err);
				837
				838	prealloc = NULL;
				839	if (err)
				840	goto out;
				841	if (state->end <= end) {
				842	set_state_bits(tree, state, &bits);
				843	cache_state(state, cached_state);
				844	merge_state(tree, state);
				845	if (last_end == (u64)-1)
				846	goto out;
				847	start = last_end + 1;
				848	}
				849	goto search_again;
				850	}
				851	/*
				852	* \| ---- desired range ---- \|
				853	* \| state \| or \| state \|
				854	*
				855	* There's a hole, we need to insert something in it and
				856	* ignore the extent we found.
				857	*/
				858	if (state->start > start) {
				859	u64 this_end;
				860	if (end < last_start)
				861	this_end = end;
				862	else
				863	this_end = last_start - 1;
				864
				865	prealloc = alloc_extent_state_atomic(prealloc);
				866	BUG_ON(!prealloc);
				867
				868	/*
				869	* Avoid to free 'prealloc' if it can be merged with
				870	* the later extent.
				871	*/
				872	err = insert_state(tree, prealloc, start, this_end,
				873	&bits);
				874	if (err)
				875	extent_io_tree_panic(tree, err);
				876
				877	cache_state(prealloc, cached_state);
				878	prealloc = NULL;
				879	start = this_end + 1;
				880	goto search_again;
				881	}
				882	/*
				883	* \| ---- desired range ---- \|
				884	* \| state \|
				885	* We need to split the extent, and set the bit
				886	* on the first half
				887	*/
				888	if (state->start <= end && state->end > end) {
				889	if (state->state & exclusive_bits) {
				890	*failed_start = start;
				891	err = -EEXIST;
				892	goto out;
				893	}
				894
				895	prealloc = alloc_extent_state_atomic(prealloc);
				896	BUG_ON(!prealloc);
				897	err = split_state(tree, state, prealloc, end + 1);
				898	if (err)
				899	extent_io_tree_panic(tree, err);
				900
				901	set_state_bits(tree, prealloc, &bits);
				902	cache_state(prealloc, cached_state);
				903	merge_state(tree, prealloc);
				904	prealloc = NULL;
				905	goto out;
				906	}
				907
				908	goto search_again;
				909
				910	out:
				911	spin_unlock(&tree->lock);
				912	if (prealloc)
				913	free_extent_state(prealloc);
				914
				915	return err;
				916
				917	search_again:
				918	if (start > end)
				919	goto out;
				920	spin_unlock(&tree->lock);
				921	if (mask & __GFP_WAIT)
				922	cond_resched();
				923	goto again;
				924	}
				925
				926	int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
				927	u64 failed_start, struct extent_state *cached_state,
				928	gfp_t mask)
				929	{
				930	return __set_extent_bit(tree, start, end, bits, 0, failed_start,
				931	cached_state, mask);
				932	}
				933
				934
				935	/**
				936	* convert_extent_bit - convert all bits in a given range from one bit to another
				937	* @tree: the io tree to search
				938	* @start: the start offset in bytes
				939	* @end: the end offset in bytes (inclusive)
				940	* @bits: the bits to set in this range
				941	* @clear_bits: the bits to clear in this range
				942	* @mask: the allocation mask
				943	*
				944	* This will go through and set bits for the given range. If any states exist
				945	* already in this range they are set with the given bit and cleared of the
				946	* clear_bits. This is only meant to be used by things that are mergeable, ie
				947	* converting from say DELALLOC to DIRTY. This is not meant to be used with
				948	* boundary bits like LOCK.
				949	*/
				950	int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
				951	int bits, int clear_bits, gfp_t mask)
				952	{
				953	struct extent_state *state;
				954	struct extent_state *prealloc = NULL;
				955	struct rb_node *node;
				956	int err = 0;
				957	u64 last_start;
				958	u64 last_end;
				959
				960	again:
				961	if (!prealloc && (mask & __GFP_WAIT)) {
				962	prealloc = alloc_extent_state(mask);
				963	if (!prealloc)
				964	return -ENOMEM;
				965	}
				966
				967	spin_lock(&tree->lock);
				968	/*
				969	* this search will find all the extents that end after
				970	* our range starts.
				971	*/
				972	node = tree_search(tree, start);
				973	if (!node) {
				974	prealloc = alloc_extent_state_atomic(prealloc);
				975	if (!prealloc) {
				976	err = -ENOMEM;
				977	goto out;
				978	}
				979	err = insert_state(tree, prealloc, start, end, &bits);
				980	prealloc = NULL;
				981	if (err)
				982	extent_io_tree_panic(tree, err);
				983	goto out;
				984	}
				985	state = rb_entry(node, struct extent_state, rb_node);
				986	hit_next:
				987	last_start = state->start;
				988	last_end = state->end;
				989
				990	/*
				991	* \| ---- desired range ---- \|
				992	* \| state \|
				993	*
				994	* Just lock what we found and keep going
				995	*/
				996	if (state->start == start && state->end <= end) {
				997	struct rb_node *next_node;
				998
				999	set_state_bits(tree, state, &bits);
				1000	clear_state_bit(tree, state, &clear_bits, 0);
				1001	if (last_end == (u64)-1)
				1002	goto out;
				1003
				1004	start = last_end + 1;
				1005	next_node = rb_next(&state->rb_node);
				1006	if (next_node && start < end && prealloc && !need_resched()) {
				1007	state = rb_entry(next_node, struct extent_state,
				1008	rb_node);
				1009	if (state->start == start)
				1010	goto hit_next;
				1011	}
				1012	goto search_again;
				1013	}
				1014
				1015	/*
				1016	* \| ---- desired range ---- \|
				1017	* \| state \|
				1018	* or
				1019	* \| ------------- state -------------- \|
				1020	*
				1021	* We need to split the extent we found, and may flip bits on
				1022	* second half.
				1023	*
				1024	* If the extent we found extends past our
				1025	* range, we just split and search again. It'll get split
				1026	* again the next time though.
				1027	*
				1028	* If the extent we found is inside our range, we set the
				1029	* desired bit on it.
				1030	*/
				1031	if (state->start < start) {
				1032	prealloc = alloc_extent_state_atomic(prealloc);
				1033	if (!prealloc) {
				1034	err = -ENOMEM;
				1035	goto out;
				1036	}
				1037	err = split_state(tree, state, prealloc, start);
				1038	if (err)
				1039	extent_io_tree_panic(tree, err);
				1040	prealloc = NULL;
				1041	if (err)
				1042	goto out;
				1043	if (state->end <= end) {
				1044	set_state_bits(tree, state, &bits);
				1045	clear_state_bit(tree, state, &clear_bits, 0);
				1046	if (last_end == (u64)-1)
				1047	goto out;
				1048	start = last_end + 1;
				1049	}
				1050	goto search_again;
				1051	}
				1052	/*
				1053	* \| ---- desired range ---- \|
				1054	* \| state \| or \| state \|
				1055	*
				1056	* There's a hole, we need to insert something in it and
				1057	* ignore the extent we found.
				1058	*/
				1059	if (state->start > start) {
				1060	u64 this_end;
				1061	if (end < last_start)
				1062	this_end = end;
				1063	else
				1064	this_end = last_start - 1;
				1065
				1066	prealloc = alloc_extent_state_atomic(prealloc);
				1067	if (!prealloc) {
				1068	err = -ENOMEM;
				1069	goto out;
				1070	}
				1071
				1072	/*
				1073	* Avoid to free 'prealloc' if it can be merged with
				1074	* the later extent.
				1075	*/
				1076	err = insert_state(tree, prealloc, start, this_end,
				1077	&bits);
				1078	if (err)
				1079	extent_io_tree_panic(tree, err);
				1080	prealloc = NULL;
				1081	start = this_end + 1;
				1082	goto search_again;
				1083	}
				1084	/*
				1085	* \| ---- desired range ---- \|
				1086	* \| state \|
				1087	* We need to split the extent, and set the bit
				1088	* on the first half
				1089	*/
				1090	if (state->start <= end && state->end > end) {
				1091	prealloc = alloc_extent_state_atomic(prealloc);
				1092	if (!prealloc) {
				1093	err = -ENOMEM;
				1094	goto out;
				1095	}
				1096
				1097	err = split_state(tree, state, prealloc, end + 1);
				1098	if (err)
				1099	extent_io_tree_panic(tree, err);
				1100
				1101	set_state_bits(tree, prealloc, &bits);
				1102	clear_state_bit(tree, prealloc, &clear_bits, 0);
				1103	prealloc = NULL;
				1104	goto out;
				1105	}
				1106
				1107	goto search_again;
				1108
				1109	out:
				1110	spin_unlock(&tree->lock);
				1111	if (prealloc)
				1112	free_extent_state(prealloc);
				1113
				1114	return err;
				1115
				1116	search_again:
				1117	if (start > end)
				1118	goto out;
				1119	spin_unlock(&tree->lock);
				1120	if (mask & __GFP_WAIT)
				1121	cond_resched();
				1122	goto again;
				1123	}
				1124
				1125	/* wrappers around set/clear extent bit */
				1126	int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
				1127	gfp_t mask)
				1128	{
				1129	return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
				1130	NULL, mask);
				1131	}
				1132
				1133	int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
				1134	int bits, gfp_t mask)
				1135	{
				1136	return set_extent_bit(tree, start, end, bits, NULL,
				1137	NULL, mask);
				1138	}
				1139
				1140	int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
				1141	int bits, gfp_t mask)
				1142	{
				1143	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
				1144	}
				1145
				1146	int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
				1147	struct extent_state **cached_state, gfp_t mask)
				1148	{
				1149	return set_extent_bit(tree, start, end,
				1150	EXTENT_DELALLOC \| EXTENT_UPTODATE,
				1151	NULL, cached_state, mask);
				1152	}
				1153
				1154	int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
				1155	gfp_t mask)
				1156	{
				1157	return clear_extent_bit(tree, start, end,
				1158	EXTENT_DIRTY \| EXTENT_DELALLOC \|
				1159	EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
				1160	}
				1161
				1162	int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
				1163	gfp_t mask)
				1164	{
				1165	return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
				1166	NULL, mask);
				1167	}
				1168
				1169	int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
				1170	struct extent_state **cached_state, gfp_t mask)
				1171	{
				1172	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
				1173	cached_state, mask);
				1174	}
				1175
				1176	static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
				1177	u64 end, struct extent_state **cached_state,
				1178	gfp_t mask)
				1179	{
				1180	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
				1181	cached_state, mask);
				1182	}
				1183
				1184	/*
				1185	* either insert or lock state struct between start and end use mask to tell
				1186	* us if waiting is desired.
				1187	*/
				1188	int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
				1189	int bits, struct extent_state **cached_state)
				1190	{
				1191	int err;
				1192	u64 failed_start;
				1193	while (1) {
				1194	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED \| bits,
				1195	EXTENT_LOCKED, &failed_start,
				1196	cached_state, GFP_NOFS);
				1197	if (err == -EEXIST) {
				1198	wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
				1199	start = failed_start;
				1200	} else
				1201	break;
				1202	WARN_ON(start > end);
				1203	}
				1204	return err;
				1205	}
				1206
				1207	int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
				1208	{
				1209	return lock_extent_bits(tree, start, end, 0, NULL);
				1210	}
				1211
				1212	int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
				1213	{
				1214	int err;
				1215	u64 failed_start;
				1216
				1217	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
				1218	&failed_start, NULL, GFP_NOFS);
				1219	if (err == -EEXIST) {
				1220	if (failed_start > start)
				1221	clear_extent_bit(tree, start, failed_start - 1,
				1222	EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
				1223	return 0;
				1224	}
				1225	return 1;
				1226	}
				1227
				1228	int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
				1229	struct extent_state **cached, gfp_t mask)
				1230	{
				1231	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
				1232	mask);
				1233	}
				1234
				1235	int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
				1236	{
				1237	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
				1238	GFP_NOFS);
				1239	}
				1240
				1241	int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
				1242	{
				1243	unsigned long index = start >> PAGE_CACHE_SHIFT;
				1244	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
				1245	struct page *page;
				1246
				1247	while (index <= end_index) {
				1248	page = find_get_page(inode->i_mapping, index);
				1249	BUG_ON(!page); /* Pages should be in the extent_io_tree */
				1250	clear_page_dirty_for_io(page);
				1251	page_cache_release(page);
				1252	index++;
				1253	}
				1254	return 0;
				1255	}
				1256
				1257	int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
				1258	{
				1259	unsigned long index = start >> PAGE_CACHE_SHIFT;
				1260	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
				1261	struct page *page;
				1262
				1263	while (index <= end_index) {
				1264	page = find_get_page(inode->i_mapping, index);
				1265	BUG_ON(!page); /* Pages should be in the extent_io_tree */
				1266	account_page_redirty(page);
				1267	__set_page_dirty_nobuffers(page);
				1268	page_cache_release(page);
				1269	index++;
				1270	}
				1271	return 0;
				1272	}
				1273
				1274	/*
				1275	* helper function to set both pages and extents in the tree writeback
				1276	*/
				1277	static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
				1278	{
				1279	unsigned long index = start >> PAGE_CACHE_SHIFT;
				1280	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
				1281	struct page *page;
				1282
				1283	while (index <= end_index) {
				1284	page = find_get_page(tree->mapping, index);
				1285	BUG_ON(!page); /* Pages should be in the extent_io_tree */
				1286	set_page_writeback(page);
				1287	page_cache_release(page);
				1288	index++;
				1289	}
				1290	return 0;
				1291	}
				1292
				1293	/* find the first state struct with 'bits' set after 'start', and
				1294	* return it. tree->lock must be held. NULL will returned if
				1295	* nothing was found after 'start'
				1296	*/
				1297	struct extent_state find_first_extent_bit_state(struct extent_io_tree tree,
				1298	u64 start, int bits)
				1299	{
				1300	struct rb_node *node;
				1301	struct extent_state *state;
				1302
				1303	/*
				1304	* this search will find all the extents that end after
				1305	* our range starts.
				1306	*/
				1307	node = tree_search(tree, start);
				1308	if (!node)
				1309	goto out;
				1310
				1311	while (1) {
				1312	state = rb_entry(node, struct extent_state, rb_node);
				1313	if (state->end >= start && (state->state & bits))
				1314	return state;
				1315
				1316	node = rb_next(node);
				1317	if (!node)
				1318	break;
				1319	}
				1320	out:
				1321	return NULL;
				1322	}
				1323
				1324	/*
				1325	* find the first offset in the io tree with 'bits' set. zero is
				1326	* returned if we find something, and start_ret and end_ret are
				1327	* set to reflect the state struct that was found.
				1328	*
				1329	* If nothing was found, 1 is returned, < 0 on error
				1330	*/
				1331	int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
				1332	u64 start_ret, u64 end_ret, int bits)
				1333	{
				1334	struct extent_state *state;
				1335	int ret = 1;
				1336
				1337	spin_lock(&tree->lock);
				1338	state = find_first_extent_bit_state(tree, start, bits);
				1339	if (state) {
				1340	*start_ret = state->start;
				1341	*end_ret = state->end;
				1342	ret = 0;
				1343	}
				1344	spin_unlock(&tree->lock);
				1345	return ret;
				1346	}
				1347
				1348	/*
				1349	* find a contiguous range of bytes in the file marked as delalloc, not
				1350	* more than 'max_bytes'. start and end are used to return the range,
				1351	*
				1352	* 1 is returned if we find something, 0 if nothing was in the tree
				1353	*/
				1354	static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
				1355	u64 start, u64 end, u64 max_bytes,
				1356	struct extent_state **cached_state)
				1357	{
				1358	struct rb_node *node;
				1359	struct extent_state *state;
				1360	u64 cur_start = *start;
				1361	u64 found = 0;
				1362	u64 total_bytes = 0;
				1363
				1364	spin_lock(&tree->lock);
				1365
				1366	/*
				1367	* this search will find all the extents that end after
				1368	* our range starts.
				1369	*/
				1370	node = tree_search(tree, cur_start);
				1371	if (!node) {
				1372	if (!found)
				1373	*end = (u64)-1;
				1374	goto out;
				1375	}
				1376
				1377	while (1) {
				1378	state = rb_entry(node, struct extent_state, rb_node);
				1379	if (found && (state->start != cur_start \|\|
				1380	(state->state & EXTENT_BOUNDARY))) {
				1381	goto out;
				1382	}
				1383	if (!(state->state & EXTENT_DELALLOC)) {
				1384	if (!found)
				1385	*end = state->end;
				1386	goto out;
				1387	}
				1388	if (!found) {
				1389	*start = state->start;
				1390	*cached_state = state;
				1391	atomic_inc(&state->refs);
				1392	}
				1393	found++;
				1394	*end = state->end;
				1395	cur_start = state->end + 1;
				1396	node = rb_next(node);
				1397	if (!node)
				1398	break;
				1399	total_bytes += state->end - state->start + 1;
				1400	if (total_bytes >= max_bytes)
				1401	break;
				1402	}
				1403	out:
				1404	spin_unlock(&tree->lock);
				1405	return found;
				1406	}
				1407
				1408	static noinline void __unlock_for_delalloc(struct inode *inode,
				1409	struct page *locked_page,
				1410	u64 start, u64 end)
				1411	{
				1412	int ret;
				1413	struct page *pages[16];
				1414	unsigned long index = start >> PAGE_CACHE_SHIFT;
				1415	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
				1416	unsigned long nr_pages = end_index - index + 1;
				1417	int i;
				1418
				1419	if (index == locked_page->index && end_index == index)
				1420	return;
				1421
				1422	while (nr_pages > 0) {
				1423	ret = find_get_pages_contig(inode->i_mapping, index,
				1424	min_t(unsigned long, nr_pages,
				1425	ARRAY_SIZE(pages)), pages);
				1426	for (i = 0; i < ret; i++) {
				1427	if (pages[i] != locked_page)
				1428	unlock_page(pages[i]);
				1429	page_cache_release(pages[i]);
				1430	}
				1431	nr_pages -= ret;
				1432	index += ret;
				1433	cond_resched();
				1434	}
				1435	}
				1436
				1437	static noinline int lock_delalloc_pages(struct inode *inode,
				1438	struct page *locked_page,
				1439	u64 delalloc_start,
				1440	u64 delalloc_end)
				1441	{
				1442	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
				1443	unsigned long start_index = index;
				1444	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
				1445	unsigned long pages_locked = 0;
				1446	struct page *pages[16];
				1447	unsigned long nrpages;
				1448	int ret;
				1449	int i;
				1450
				1451	/* the caller is responsible for locking the start index */
				1452	if (index == locked_page->index && index == end_index)
				1453	return 0;
				1454
				1455	/* skip the page at the start index */
				1456	nrpages = end_index - index + 1;
				1457	while (nrpages > 0) {
				1458	ret = find_get_pages_contig(inode->i_mapping, index,
				1459	min_t(unsigned long,
				1460	nrpages, ARRAY_SIZE(pages)), pages);
				1461	if (ret == 0) {
				1462	ret = -EAGAIN;
				1463	goto done;
				1464	}
				1465	/* now we have an array of pages, lock them all */
				1466	for (i = 0; i < ret; i++) {
				1467	/*
				1468	* the caller is taking responsibility for
				1469	* locked_page
				1470	*/
				1471	if (pages[i] != locked_page) {
				1472	lock_page(pages[i]);
				1473	if (!PageDirty(pages[i]) \|\|
				1474	pages[i]->mapping != inode->i_mapping) {
				1475	ret = -EAGAIN;
				1476	unlock_page(pages[i]);
				1477	page_cache_release(pages[i]);
				1478	goto done;
				1479	}
				1480	}
				1481	page_cache_release(pages[i]);
				1482	pages_locked++;
				1483	}
				1484	nrpages -= ret;
				1485	index += ret;
				1486	cond_resched();
				1487	}
				1488	ret = 0;
				1489	done:
				1490	if (ret && pages_locked) {
				1491	__unlock_for_delalloc(inode, locked_page,
				1492	delalloc_start,
				1493	((u64)(start_index + pages_locked - 1)) <<
				1494	PAGE_CACHE_SHIFT);
				1495	}
				1496	return ret;
				1497	}
				1498
				1499	/*
				1500	* find a contiguous range of bytes in the file marked as delalloc, not
				1501	* more than 'max_bytes'. start and end are used to return the range,
				1502	*
				1503	* 1 is returned if we find something, 0 if nothing was in the tree
				1504	*/
				1505	static noinline u64 find_lock_delalloc_range(struct inode *inode,
				1506	struct extent_io_tree *tree,
				1507	struct page *locked_page,
				1508	u64 start, u64 end,
				1509	u64 max_bytes)
				1510	{
				1511	u64 delalloc_start;
				1512	u64 delalloc_end;
				1513	u64 found;
				1514	struct extent_state *cached_state = NULL;
				1515	int ret;
				1516	int loops = 0;
				1517
				1518	again:
				1519	/* step one, find a bunch of delalloc bytes starting at start */
				1520	delalloc_start = *start;
				1521	delalloc_end = 0;
				1522	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
				1523	max_bytes, &cached_state);
				1524	if (!found \|\| delalloc_end <= *start) {
				1525	*start = delalloc_start;
				1526	*end = delalloc_end;
				1527	free_extent_state(cached_state);
				1528	return found;
				1529	}
				1530
				1531	/*
				1532	* start comes from the offset of locked_page. We have to lock
				1533	* pages in order, so we can't process delalloc bytes before
				1534	* locked_page
				1535	*/
				1536	if (delalloc_start < *start)
				1537	delalloc_start = *start;
				1538
				1539	/*
				1540	* make sure to limit the number of pages we try to lock down
				1541	* if we're looping.
				1542	*/
				1543	if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
				1544	delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
				1545
				1546	/* step two, lock all the pages after the page that has start */
				1547	ret = lock_delalloc_pages(inode, locked_page,
				1548	delalloc_start, delalloc_end);
				1549	if (ret == -EAGAIN) {
				1550	/* some of the pages are gone, lets avoid looping by
				1551	* shortening the size of the delalloc range we're searching
				1552	*/
				1553	free_extent_state(cached_state);
				1554	cached_state = NULL;
				1555	if (!loops) {
				1556	unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
				1557	max_bytes = PAGE_CACHE_SIZE - offset;
				1558	loops = 1;
				1559	goto again;
				1560	} else {
				1561	found = 0;
				1562	goto out_failed;
				1563	}
				1564	}
				1565	BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
				1566
				1567	/* step three, lock the state bits for the whole range */
				1568	lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);
				1569
				1570	/* then test to make sure it is all still delalloc */
				1571	ret = test_range_bit(tree, delalloc_start, delalloc_end,
				1572	EXTENT_DELALLOC, 1, cached_state);
				1573	if (!ret) {
				1574	unlock_extent_cached(tree, delalloc_start, delalloc_end,
				1575	&cached_state, GFP_NOFS);
				1576	__unlock_for_delalloc(inode, locked_page,
				1577	delalloc_start, delalloc_end);
				1578	cond_resched();
				1579	goto again;
				1580	}
				1581	free_extent_state(cached_state);
				1582	*start = delalloc_start;
				1583	*end = delalloc_end;
				1584	out_failed:
				1585	return found;
				1586	}
				1587
				1588	int extent_clear_unlock_delalloc(struct inode *inode,
				1589	struct extent_io_tree *tree,
				1590	u64 start, u64 end, struct page *locked_page,
				1591	unsigned long op)
				1592	{
				1593	int ret;
				1594	struct page *pages[16];
				1595	unsigned long index = start >> PAGE_CACHE_SHIFT;
				1596	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
				1597	unsigned long nr_pages = end_index - index + 1;
				1598	int i;
				1599	int clear_bits = 0;
				1600
				1601	if (op & EXTENT_CLEAR_UNLOCK)
				1602	clear_bits \|= EXTENT_LOCKED;
				1603	if (op & EXTENT_CLEAR_DIRTY)
				1604	clear_bits \|= EXTENT_DIRTY;
				1605
				1606	if (op & EXTENT_CLEAR_DELALLOC)
				1607	clear_bits \|= EXTENT_DELALLOC;
				1608
				1609	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
				1610	if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE \| EXTENT_CLEAR_DIRTY \|
				1611	EXTENT_SET_WRITEBACK \| EXTENT_END_WRITEBACK \|
				1612	EXTENT_SET_PRIVATE2)))
				1613	return 0;
				1614
				1615	while (nr_pages > 0) {
				1616	ret = find_get_pages_contig(inode->i_mapping, index,
				1617	min_t(unsigned long,
				1618	nr_pages, ARRAY_SIZE(pages)), pages);
				1619	for (i = 0; i < ret; i++) {
				1620
				1621	if (op & EXTENT_SET_PRIVATE2)
				1622	SetPagePrivate2(pages[i]);
				1623
				1624	if (pages[i] == locked_page) {
				1625	page_cache_release(pages[i]);
				1626	continue;
				1627	}
				1628	if (op & EXTENT_CLEAR_DIRTY)
				1629	clear_page_dirty_for_io(pages[i]);
				1630	if (op & EXTENT_SET_WRITEBACK)
				1631	set_page_writeback(pages[i]);
				1632	if (op & EXTENT_END_WRITEBACK)
				1633	end_page_writeback(pages[i]);
				1634	if (op & EXTENT_CLEAR_UNLOCK_PAGE)
				1635	unlock_page(pages[i]);
				1636	page_cache_release(pages[i]);
				1637	}
				1638	nr_pages -= ret;
				1639	index += ret;
				1640	cond_resched();
				1641	}
				1642	return 0;
				1643	}
				1644
				1645	/*
				1646	* count the number of bytes in the tree that have a given bit(s)
				1647	* set. This can be fairly slow, except for EXTENT_DIRTY which is
				1648	* cached. The total number found is returned.
				1649	*/
				1650	u64 count_range_bits(struct extent_io_tree *tree,
				1651	u64 *start, u64 search_end, u64 max_bytes,
				1652	unsigned long bits, int contig)
				1653	{
				1654	struct rb_node *node;
				1655	struct extent_state *state;
				1656	u64 cur_start = *start;
				1657	u64 total_bytes = 0;
				1658	u64 last = 0;
				1659	int found = 0;
				1660
				1661	if (search_end <= cur_start) {
				1662	WARN_ON(1);
				1663	return 0;
				1664	}
				1665
				1666	spin_lock(&tree->lock);
				1667	if (cur_start == 0 && bits == EXTENT_DIRTY) {
				1668	total_bytes = tree->dirty_bytes;
				1669	goto out;
				1670	}
				1671	/*
				1672	* this search will find all the extents that end after
				1673	* our range starts.
				1674	*/
				1675	node = tree_search(tree, cur_start);
				1676	if (!node)
				1677	goto out;
				1678
				1679	while (1) {
				1680	state = rb_entry(node, struct extent_state, rb_node);
				1681	if (state->start > search_end)
				1682	break;
				1683	if (contig && found && state->start > last + 1)
				1684	break;
				1685	if (state->end >= cur_start && (state->state & bits) == bits) {
				1686	total_bytes += min(search_end, state->end) + 1 -
				1687	max(cur_start, state->start);
				1688	if (total_bytes >= max_bytes)
				1689	break;
				1690	if (!found) {
				1691	*start = max(cur_start, state->start);
				1692	found = 1;
				1693	}
				1694	last = state->end;
				1695	} else if (contig && found) {
				1696	break;
				1697	}
				1698	node = rb_next(node);
				1699	if (!node)
				1700	break;
				1701	}
				1702	out:
				1703	spin_unlock(&tree->lock);
				1704	return total_bytes;
				1705	}
				1706
				1707	/*
				1708	* set the private field for a given byte offset in the tree. If there isn't
				1709	* an extent_state there already, this does nothing.
				1710	*/
				1711	int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
				1712	{
				1713	struct rb_node *node;
				1714	struct extent_state *state;
				1715	int ret = 0;
				1716
				1717	spin_lock(&tree->lock);
				1718	/*
				1719	* this search will find all the extents that end after
				1720	* our range starts.
				1721	*/
				1722	node = tree_search(tree, start);
				1723	if (!node) {
				1724	ret = -ENOENT;
				1725	goto out;
				1726	}
				1727	state = rb_entry(node, struct extent_state, rb_node);
				1728	if (state->start != start) {
				1729	ret = -ENOENT;
				1730	goto out;
				1731	}
				1732	state->private = private;
				1733	out:
				1734	spin_unlock(&tree->lock);
				1735	return ret;
				1736	}
				1737
				1738	int get_state_private(struct extent_io_tree tree, u64 start, u64 private)
				1739	{
				1740	struct rb_node *node;
				1741	struct extent_state *state;
				1742	int ret = 0;
				1743
				1744	spin_lock(&tree->lock);
				1745	/*
				1746	* this search will find all the extents that end after
				1747	* our range starts.
				1748	*/
				1749	node = tree_search(tree, start);
				1750	if (!node) {
				1751	ret = -ENOENT;
				1752	goto out;
				1753	}
				1754	state = rb_entry(node, struct extent_state, rb_node);
				1755	if (state->start != start) {
				1756	ret = -ENOENT;
				1757	goto out;
				1758	}
				1759	*private = state->private;
				1760	out:
				1761	spin_unlock(&tree->lock);
				1762	return ret;
				1763	}
				1764
				1765	/*
				1766	* searches a range in the state tree for a given mask.
				1767	* If 'filled' == 1, this returns 1 only if every extent in the tree
				1768	* has the bits set. Otherwise, 1 is returned if any bit in the
				1769	* range is found set.
				1770	*/
				1771	int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
				1772	int bits, int filled, struct extent_state *cached)
				1773	{
				1774	struct extent_state *state = NULL;
				1775	struct rb_node *node;
				1776	int bitset = 0;
				1777
				1778	spin_lock(&tree->lock);
				1779	if (cached && cached->tree && cached->start <= start &&
				1780	cached->end > start)
				1781	node = &cached->rb_node;
				1782	else
				1783	node = tree_search(tree, start);
				1784	while (node && start <= end) {
				1785	state = rb_entry(node, struct extent_state, rb_node);
				1786
				1787	if (filled && state->start > start) {
				1788	bitset = 0;
				1789	break;
				1790	}
				1791
				1792	if (state->start > end)
				1793	break;
				1794
				1795	if (state->state & bits) {
				1796	bitset = 1;
				1797	if (!filled)
				1798	break;
				1799	} else if (filled) {
				1800	bitset = 0;
				1801	break;
				1802	}
				1803
				1804	if (state->end == (u64)-1)
				1805	break;
				1806
				1807	start = state->end + 1;
				1808	if (start > end)
				1809	break;
				1810	node = rb_next(node);
				1811	if (!node) {
				1812	if (filled)
				1813	bitset = 0;
				1814	break;
				1815	}
				1816	}
				1817	spin_unlock(&tree->lock);
				1818	return bitset;
				1819	}
				1820
				1821	/*
				1822	* helper function to set a given page up to date if all the
				1823	* extents in the tree for that page are up to date
				1824	*/
				1825	static void check_page_uptodate(struct extent_io_tree tree, struct page page)
				1826	{
				1827	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
				1828	u64 end = start + PAGE_CACHE_SIZE - 1;
				1829	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
				1830	SetPageUptodate(page);
				1831	}
				1832
				1833	/*
				1834	* helper function to unlock a page if all the extents in the tree
				1835	* for that page are unlocked
				1836	*/
				1837	static void check_page_locked(struct extent_io_tree tree, struct page page)
				1838	{
				1839	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
				1840	u64 end = start + PAGE_CACHE_SIZE - 1;
				1841	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
				1842	unlock_page(page);
				1843	}
				1844
				/*
				 * helper function to end writeback on a page.
				 *
				 * 'tree' is accepted but not consulted: writeback is ended on 'page'
				 * unconditionally.
				 */
				static void check_page_writeback(struct extent_io_tree *tree,
								 struct page *page)
				{
					end_page_writeback(page);
				}
				1854
				1855	/*
				1856	* When IO fails, either with EIO or csum verification fails, we
				1857	* try other mirrors that might have a good copy of the data. This
				1858	* io_failure_record is used to record state as we go through all the
				1859	* mirrors. If another mirror has good data, the page is set up to date
				1860	* and things continue. If a good mirror can't be found, the original
				1861	* bio end_io callback is called to indicate things have failed.
				1862	*/
				1863	struct io_failure_record {
				1864	struct page *page;
				1865	u64 start;
				1866	u64 len;
				1867	u64 logical;
				1868	unsigned long bio_flags;
				1869	int this_mirror;
				1870	int failed_mirror;
				1871	int in_validation;
				1872	};
				1873
				1874	static int free_io_failure(struct inode inode, struct io_failure_record rec,
				1875	int did_repair)
				1876	{
				1877	int ret;
				1878	int err = 0;
				1879	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
				1880
				1881	set_state_private(failure_tree, rec->start, 0);
				1882	ret = clear_extent_bits(failure_tree, rec->start,
				1883	rec->start + rec->len - 1,
				1884	EXTENT_LOCKED \| EXTENT_DIRTY, GFP_NOFS);
				1885	if (ret)
				1886	err = ret;
				1887
				1888	if (did_repair) {
				1889	ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
				1890	rec->start + rec->len - 1,
				1891	EXTENT_DAMAGED, GFP_NOFS);
				1892	if (ret && !err)
				1893	err = ret;
				1894	}
				1895
				1896	kfree(rec);
				1897	return err;
				1898	}
				1899
				1900	static void repair_io_failure_callback(struct bio *bio, int err)
				1901	{
				1902	complete(bio->bi_private);
				1903	}
				1904
				1905	/*
				1906	* this bypasses the standard btrfs submit functions deliberately, as
				1907	* the standard behavior is to write all copies in a raid setup. here we only
				1908	* want to write the one bad copy. so we do the mapping for ourselves and issue
				1909	* submit_bio directly.
				1910	* to avoid any synchonization issues, wait for the data after writing, which
				1911	* actually prevents the read that triggered the error from finishing.
				1912	* currently, there can be no more than two copies of every data bit. thus,
				1913	* exactly one rewrite is required.
				1914	*/
				1915	int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
				1916	u64 length, u64 logical, struct page *page,
				1917	int mirror_num)
				1918	{
				1919	struct bio *bio;
				1920	struct btrfs_device *dev;
				1921	DECLARE_COMPLETION_ONSTACK(compl);
				1922	u64 map_length = 0;
				1923	u64 sector;
				1924	struct btrfs_bio *bbio = NULL;
				1925	int ret;
				1926
				1927	BUG_ON(!mirror_num);
				1928
				1929	bio = bio_alloc(GFP_NOFS, 1);
				1930	if (!bio)
				1931	return -EIO;
				1932	bio->bi_private = &compl;
				1933	bio->bi_end_io = repair_io_failure_callback;
				1934	bio->bi_size = 0;
				1935	map_length = length;
				1936
				1937	ret = btrfs_map_block(map_tree, WRITE, logical,
				1938	&map_length, &bbio, mirror_num);
				1939	if (ret) {
				1940	bio_put(bio);
				1941	return -EIO;
				1942	}
				1943	BUG_ON(mirror_num != bbio->mirror_num);
				1944	sector = bbio->stripes[mirror_num-1].physical >> 9;
				1945	bio->bi_sector = sector;
				1946	dev = bbio->stripes[mirror_num-1].dev;
				1947	kfree(bbio);
				1948	if (!dev \|\| !dev->bdev \|\| !dev->writeable) {
				1949	bio_put(bio);
				1950	return -EIO;
				1951	}
				1952	bio->bi_bdev = dev->bdev;
				1953	bio_add_page(bio, page, length, start-page_offset(page));
				1954	btrfsic_submit_bio(WRITE_SYNC, bio);
				1955	wait_for_completion(&compl);
				1956
				1957	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
				1958	/* try to remap that extent elsewhere? */
				1959	bio_put(bio);
				1960	return -EIO;
				1961	}
				1962
				1963	printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
				1964	"sector %llu)\n", page->mapping->host->i_ino, start,
				1965	dev->name, sector);
				1966
				1967	bio_put(bio);
				1968	return 0;
				1969	}
				1970
				1971	int repair_eb_io_failure(struct btrfs_root root, struct extent_buffer eb,
				1972	int mirror_num)
				1973	{
				1974	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
				1975	u64 start = eb->start;
				1976	unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
				1977	int ret = 0;
				1978
				1979	for (i = 0; i < num_pages; i++) {
				1980	struct page *p = extent_buffer_page(eb, i);
				1981	ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE,
				1982	start, p, mirror_num);
				1983	if (ret)
				1984	break;
				1985	start += PAGE_CACHE_SIZE;
				1986	}
				1987
				1988	return ret;
				1989	}
				1990
				1991	/*
				1992	* each time an IO finishes, we do a fast check in the IO failure tree
				1993	* to see if we need to process or clean up an io_failure_record
				1994	*/
				1995	static int clean_io_failure(u64 start, struct page *page)
				1996	{
				1997	u64 private;
				1998	u64 private_failure;
				1999	struct io_failure_record *failrec;
				2000	struct btrfs_mapping_tree *map_tree;
				2001	struct extent_state *state;
				2002	int num_copies;
				2003	int did_repair = 0;
				2004	int ret;
				2005	struct inode *inode = page->mapping->host;
				2006
				2007	private = 0;
				2008	ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
				2009	(u64)-1, 1, EXTENT_DIRTY, 0);
				2010	if (!ret)
				2011	return 0;
				2012
				2013	ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
				2014	&private_failure);
				2015	if (ret)
				2016	return 0;
				2017
				2018	failrec = (struct io_failure_record *)(unsigned long) private_failure;
				2019	BUG_ON(!failrec->this_mirror);
				2020
				2021	if (failrec->in_validation) {
				2022	/* there was no real error, just free the record */
				2023	pr_debug("clean_io_failure: freeing dummy error at %llu\n",
				2024	failrec->start);
				2025	did_repair = 1;
				2026	goto out;
				2027	}
				2028
				2029	spin_lock(&BTRFS_I(inode)->io_tree.lock);
				2030	state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
				2031	failrec->start,
				2032	EXTENT_LOCKED);
				2033	spin_unlock(&BTRFS_I(inode)->io_tree.lock);
				2034
				2035	if (state && state->start == failrec->start) {
				2036	map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
				2037	num_copies = btrfs_num_copies(map_tree, failrec->logical,
				2038	failrec->len);
				2039	if (num_copies > 1) {
				2040	ret = repair_io_failure(map_tree, start, failrec->len,
				2041	failrec->logical, page,
				2042	failrec->failed_mirror);
				2043	did_repair = !ret;
				2044	}
				2045	}
				2046
				2047	out:
				2048	if (!ret)
				2049	ret = free_io_failure(inode, failrec, did_repair);
				2050
				2051	return ret;
				2052	}
				2053
				2054	/*
				2055	* this is a generic handler for readpage errors (default
				2056	* readpage_io_failed_hook). if other copies exist, read those and write back
				2057	* good data to the failed position. does not investigate in remapping the
				2058	* failed extent elsewhere, hoping the device will be smart enough to do this as
				2059	* needed
				2060	*/
				2061
				2062	static int bio_readpage_error(struct bio failed_bio, struct page page,
				2063	u64 start, u64 end, int failed_mirror,
				2064	struct extent_state *state)
				2065	{
				2066	struct io_failure_record *failrec = NULL;
				2067	u64 private;
				2068	struct extent_map *em;
				2069	struct inode *inode = page->mapping->host;
				2070	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
				2071	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
				2072	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
				2073	struct bio *bio;
				2074	int num_copies;
				2075	int ret;
				2076	int read_mode;
				2077	u64 logical;
				2078
				2079	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
				2080
				2081	ret = get_state_private(failure_tree, start, &private);
				2082	if (ret) {
				2083	failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
				2084	if (!failrec)
				2085	return -ENOMEM;
				2086	failrec->start = start;
				2087	failrec->len = end - start + 1;
				2088	failrec->this_mirror = 0;
				2089	failrec->bio_flags = 0;
				2090	failrec->in_validation = 0;
				2091
				2092	read_lock(&em_tree->lock);
				2093	em = lookup_extent_mapping(em_tree, start, failrec->len);
				2094	if (!em) {
				2095	read_unlock(&em_tree->lock);
				2096	kfree(failrec);
				2097	return -EIO;
				2098	}
				2099
				2100	if (em->start > start \|\| em->start + em->len < start) {
				2101	free_extent_map(em);
				2102	em = NULL;
				2103	}
				2104	read_unlock(&em_tree->lock);
				2105
				2106	if (!em \|\| IS_ERR(em)) {
				2107	kfree(failrec);
				2108	return -EIO;
				2109	}
				2110	logical = start - em->start;
				2111	logical = em->block_start + logical;
				2112	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
				2113	logical = em->block_start;
				2114	failrec->bio_flags = EXTENT_BIO_COMPRESSED;
				2115	extent_set_compress_type(&failrec->bio_flags,
				2116	em->compress_type);
				2117	}
				2118	pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
				2119	"len=%llu\n", logical, start, failrec->len);
				2120	failrec->logical = logical;
				2121	free_extent_map(em);
				2122
				2123	/* set the bits in the private failure tree */
				2124	ret = set_extent_bits(failure_tree, start, end,
				2125	EXTENT_LOCKED \| EXTENT_DIRTY, GFP_NOFS);
				2126	if (ret >= 0)
				2127	ret = set_state_private(failure_tree, start,
				2128	(u64)(unsigned long)failrec);
				2129	/* set the bits in the inode's tree */
				2130	if (ret >= 0)
				2131	ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
				2132	GFP_NOFS);
				2133	if (ret < 0) {
				2134	kfree(failrec);
				2135	return ret;
				2136	}
				2137	} else {
				2138	failrec = (struct io_failure_record *)(unsigned long)private;
				2139	pr_debug("bio_readpage_error: (found) logical=%llu, "
				2140	"start=%llu, len=%llu, validation=%d\n",
				2141	failrec->logical, failrec->start, failrec->len,
				2142	failrec->in_validation);
				2143	/*
				2144	* when data can be on disk more than twice, add to failrec here
				2145	* (e.g. with a list for failed_mirror) to make
				2146	* clean_io_failure() clean all those errors at once.
				2147	*/
				2148	}
				2149	num_copies = btrfs_num_copies(
				2150	&BTRFS_I(inode)->root->fs_info->mapping_tree,
				2151	failrec->logical, failrec->len);
				2152	if (num_copies == 1) {
				2153	/*
				2154	* we only have a single copy of the data, so don't bother with
				2155	* all the retry and error correction code that follows. no
				2156	* matter what the error is, it is very likely to persist.
				2157	*/
				2158	pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
				2159	"state=%p, num_copies=%d, next_mirror %d, "
				2160	"failed_mirror %d\n", state, num_copies,
				2161	failrec->this_mirror, failed_mirror);
				2162	free_io_failure(inode, failrec, 0);
				2163	return -EIO;
				2164	}
				2165
				2166	if (!state) {
				2167	spin_lock(&tree->lock);
				2168	state = find_first_extent_bit_state(tree, failrec->start,
				2169	EXTENT_LOCKED);
				2170	if (state && state->start != failrec->start)
				2171	state = NULL;
				2172	spin_unlock(&tree->lock);
				2173	}
				2174
				2175	/*
				2176	* there are two premises:
				2177	* a) deliver good data to the caller
				2178	* b) correct the bad sectors on disk
				2179	*/
				2180	if (failed_bio->bi_vcnt > 1) {
				2181	/*
				2182	* to fulfill b), we need to know the exact failing sectors, as
				2183	* we don't want to rewrite any more than the failed ones. thus,
				2184	* we need separate read requests for the failed bio
				2185	*
				2186	* if the following BUG_ON triggers, our validation request got
				2187	* merged. we need separate requests for our algorithm to work.
				2188	*/
				2189	BUG_ON(failrec->in_validation);
				2190	failrec->in_validation = 1;
				2191	failrec->this_mirror = failed_mirror;
				2192	read_mode = READ_SYNC \| REQ_FAILFAST_DEV;
				2193	} else {
				2194	/*
				2195	* we're ready to fulfill a) and b) alongside. get a good copy
				2196	* of the failed sector and if we succeed, we have setup
				2197	* everything for repair_io_failure to do the rest for us.
				2198	*/
				2199	if (failrec->in_validation) {
				2200	BUG_ON(failrec->this_mirror != failed_mirror);
				2201	failrec->in_validation = 0;
				2202	failrec->this_mirror = 0;
				2203	}
				2204	failrec->failed_mirror = failed_mirror;
				2205	failrec->this_mirror++;
				2206	if (failrec->this_mirror == failed_mirror)
				2207	failrec->this_mirror++;
				2208	read_mode = READ_SYNC;
				2209	}
				2210
				2211	if (!state \|\| failrec->this_mirror > num_copies) {
				2212	pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
				2213	"next_mirror %d, failed_mirror %d\n", state,
				2214	num_copies, failrec->this_mirror, failed_mirror);
				2215	free_io_failure(inode, failrec, 0);
				2216	return -EIO;
				2217	}
				2218
				2219	bio = bio_alloc(GFP_NOFS, 1);
				2220	if (!bio) {
				2221	free_io_failure(inode, failrec, 0);
				2222	return -EIO;
				2223	}
				2224	bio->bi_private = state;
				2225	bio->bi_end_io = failed_bio->bi_end_io;
				2226	bio->bi_sector = failrec->logical >> 9;
				2227	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
				2228	bio->bi_size = 0;
				2229
				2230	bio_add_page(bio, page, failrec->len, start - page_offset(page));
				2231
				2232	pr_debug("bio_readpage_error: submitting new read[%#x] to "
				2233	"this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
				2234	failrec->this_mirror, num_copies, failrec->in_validation);
				2235
				2236	ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
				2237	failrec->this_mirror,
				2238	failrec->bio_flags, 0);
				2239	return ret;
				2240	}
				2241
				2242	/* lots and lots of room for performance fixes in the end_bio funcs */
				2243
				2244	int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
				2245	{
				2246	int uptodate = (err == 0);
				2247	struct extent_io_tree *tree;
				2248	int ret = 0;
				2249
				2250	tree = &BTRFS_I(page->mapping->host)->io_tree;
				2251
				2252	if (tree->ops && tree->ops->writepage_end_io_hook) {
				2253	ret = tree->ops->writepage_end_io_hook(page, start,
				2254	end, NULL, uptodate);
				2255	if (ret)
				2256	uptodate = 0;
				2257	}
				2258
				2259	if (!uptodate && tree->ops &&
				2260	tree->ops->writepage_io_failed_hook) {
				2261	ret = tree->ops->writepage_io_failed_hook(NULL, page,
				2262	start, end, NULL);
				2263	/* Writeback already completed */
				2264	if (ret == 0)
				2265	return 1;
				2266	}
				2267
				2268	if (!uptodate) {
				2269	clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
				2270	ClearPageUptodate(page);
				2271	SetPageError(page);
				2272	}
				2273	return 0;
				2274	}
				2275
				2276	/*
				2277	* after a writepage IO is done, we need to:
				2278	* clear the uptodate bits on error
				2279	* clear the writeback bits in the extent tree for this IO
				2280	* end_page_writeback if the page has no more pending IO
				2281	*
				2282	* Scheduling is not allowed, so the extent state tree is expected
				2283	* to have one and only one object corresponding to this IO.
				2284	*/
				2285	static void end_bio_extent_writepage(struct bio *bio, int err)
				2286	{
				2287	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
				2288	struct extent_io_tree *tree;
				2289	u64 start;
				2290	u64 end;
				2291	int whole_page;
				2292
				2293	do {
				2294	struct page *page = bvec->bv_page;
				2295	tree = &BTRFS_I(page->mapping->host)->io_tree;
				2296
				2297	start = ((u64)page->index << PAGE_CACHE_SHIFT) +
				2298	bvec->bv_offset;
				2299	end = start + bvec->bv_len - 1;
				2300
				2301	if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
				2302	whole_page = 1;
				2303	else
				2304	whole_page = 0;
				2305
				2306	if (--bvec >= bio->bi_io_vec)
				2307	prefetchw(&bvec->bv_page->flags);
				2308
				2309	if (end_extent_writepage(page, err, start, end))
				2310	continue;
				2311
				2312	if (whole_page)
				2313	end_page_writeback(page);
				2314	else
				2315	check_page_writeback(tree, page);
				2316	} while (bvec >= bio->bi_io_vec);
				2317
				2318	bio_put(bio);
				2319	}
				2320
				2321	/*
				2322	* after a readpage IO is done, we need to:
				2323	* clear the uptodate bits on error
				2324	* set the uptodate bits if things worked
				2325	* set the page up to date if all extents in the tree are uptodate
				2326	* clear the lock bit in the extent tree
				2327	* unlock the page if there are no other extents locked for it
				2328	*
				2329	* Scheduling is not allowed, so the extent state tree is expected
				2330	* to have one and only one object corresponding to this IO.
				2331	*/
				2332	static void end_bio_extent_readpage(struct bio *bio, int err)
				2333	{
				2334	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
				2335	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
				2336	struct bio_vec *bvec = bio->bi_io_vec;
				2337	struct extent_io_tree *tree;
				2338	u64 start;
				2339	u64 end;
				2340	int whole_page;
				2341	int mirror;
				2342	int ret;
				2343
				2344	if (err)
				2345	uptodate = 0;
				2346
				2347	do {
				2348	struct page *page = bvec->bv_page;
				2349	struct extent_state *cached = NULL;
				2350	struct extent_state *state;
				2351
				2352	pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
				2353	"mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
				2354	(long int)bio->bi_bdev);
				2355	tree = &BTRFS_I(page->mapping->host)->io_tree;
				2356
				2357	start = ((u64)page->index << PAGE_CACHE_SHIFT) +
				2358	bvec->bv_offset;
				2359	end = start + bvec->bv_len - 1;
				2360
				2361	if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
				2362	whole_page = 1;
				2363	else
				2364	whole_page = 0;
				2365
				2366	if (++bvec <= bvec_end)
				2367	prefetchw(&bvec->bv_page->flags);
				2368
				2369	spin_lock(&tree->lock);
				2370	state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
				2371	if (state && state->start == start) {
				2372	/*
				2373	* take a reference on the state, unlock will drop
				2374	* the ref
				2375	*/
				2376	cache_state(state, &cached);
				2377	}
				2378	spin_unlock(&tree->lock);
				2379
				2380	mirror = (int)(unsigned long)bio->bi_bdev;
				2381	if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
				2382	ret = tree->ops->readpage_end_io_hook(page, start, end,
				2383	state, mirror);
				2384	if (ret)
				2385	uptodate = 0;
				2386	else
				2387	clean_io_failure(start, page);
				2388	}
				2389
				2390	if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
				2391	ret = tree->ops->readpage_io_failed_hook(page, mirror);
				2392	if (!ret && !err &&
				2393	test_bit(BIO_UPTODATE, &bio->bi_flags))
				2394	uptodate = 1;
				2395	} else if (!uptodate) {
				2396	/*
				2397	* The generic bio_readpage_error handles errors the
				2398	* following way: If possible, new read requests are
				2399	* created and submitted and will end up in
				2400	* end_bio_extent_readpage as well (if we're lucky, not
				2401	* in the !uptodate case). In that case it returns 0 and
				2402	* we just go on with the next page in our bio. If it
				2403	* can't handle the error it will return -EIO and we
				2404	* remain responsible for that page.
				2405	*/
				2406	ret = bio_readpage_error(bio, page, start, end, mirror, NULL);
				2407	if (ret == 0) {
				2408	uptodate =
				2409	test_bit(BIO_UPTODATE, &bio->bi_flags);
				2410	if (err)
				2411	uptodate = 0;
				2412	uncache_state(&cached);
				2413	continue;
				2414	}
				2415	}
				2416
				2417	if (uptodate && tree->track_uptodate) {
				2418	set_extent_uptodate(tree, start, end, &cached,
				2419	GFP_ATOMIC);
				2420	}
				2421	unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
				2422
				2423	if (whole_page) {
				2424	if (uptodate) {
				2425	SetPageUptodate(page);
				2426	} else {
				2427	ClearPageUptodate(page);
				2428	SetPageError(page);
				2429	}
				2430	unlock_page(page);
				2431	} else {
				2432	if (uptodate) {
				2433	check_page_uptodate(tree, page);
				2434	} else {
				2435	ClearPageUptodate(page);
				2436	SetPageError(page);
				2437	}
				2438	check_page_locked(tree, page);
				2439	}
				2440	} while (bvec <= bvec_end);
				2441
				2442	bio_put(bio);
				2443	}
				2444
				2445	struct bio *
				2446	btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
				2447	gfp_t gfp_flags)
				2448	{
				2449	struct bio *bio;
				2450
				2451	bio = bio_alloc(gfp_flags, nr_vecs);
				2452
				2453	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
				2454	while (!bio && (nr_vecs /= 2))
				2455	bio = bio_alloc(gfp_flags, nr_vecs);
				2456	}
				2457
				2458	if (bio) {
				2459	bio->bi_size = 0;
				2460	bio->bi_bdev = bdev;
				2461	bio->bi_sector = first_sector;
				2462	}
				2463	return bio;
				2464	}
				2465
				2466	/*
				2467	* Since writes are async, they will only return -ENOMEM.
				2468	* Reads can return the full range of I/O error conditions.
				2469	*/
				2470	static int __must_check submit_one_bio(int rw, struct bio *bio,
				2471	int mirror_num, unsigned long bio_flags)
				2472	{
				2473	int ret = 0;
				2474	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
				2475	struct page *page = bvec->bv_page;
				2476	struct extent_io_tree *tree = bio->bi_private;
				2477	u64 start;
				2478
				2479	start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
				2480
				2481	bio->bi_private = NULL;
				2482
				2483	bio_get(bio);
				2484
				2485	if (tree->ops && tree->ops->submit_bio_hook)
				2486	ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
				2487	mirror_num, bio_flags, start);
				2488	else
				2489	btrfsic_submit_bio(rw, bio);
				2490
				2491	if (bio_flagged(bio, BIO_EOPNOTSUPP))
				2492	ret = -EOPNOTSUPP;
				2493	bio_put(bio);
				2494	return ret;
				2495	}
				2496
				2497	static int merge_bio(struct extent_io_tree tree, struct page page,
				2498	unsigned long offset, size_t size, struct bio *bio,
				2499	unsigned long bio_flags)
				2500	{
				2501	int ret = 0;
				2502	if (tree->ops && tree->ops->merge_bio_hook)
				2503	ret = tree->ops->merge_bio_hook(page, offset, size, bio,
				2504	bio_flags);
				2505	BUG_ON(ret < 0);
				2506	return ret;
				2507
				2508	}
				2509
				2510	static int submit_extent_page(int rw, struct extent_io_tree *tree,
				2511	struct page *page, sector_t sector,
				2512	size_t size, unsigned long offset,
				2513	struct block_device *bdev,
				2514	struct bio **bio_ret,
				2515	unsigned long max_pages,
				2516	bio_end_io_t end_io_func,
				2517	int mirror_num,
				2518	unsigned long prev_bio_flags,
				2519	unsigned long bio_flags)
				2520	{
				2521	int ret = 0;
				2522	struct bio *bio;
				2523	int nr;
				2524	int contig = 0;
				2525	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
				2526	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
				2527	size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
				2528
				2529	if (bio_ret && *bio_ret) {
				2530	bio = *bio_ret;
				2531	if (old_compressed)
				2532	contig = bio->bi_sector == sector;
				2533	else
				2534	contig = bio->bi_sector + (bio->bi_size >> 9) ==
				2535	sector;
				2536
				2537	if (prev_bio_flags != bio_flags \|\| !contig \|\|
				2538	merge_bio(tree, page, offset, page_size, bio, bio_flags) \|\|
				2539	bio_add_page(bio, page, page_size, offset) < page_size) {
				2540	ret = submit_one_bio(rw, bio, mirror_num,
				2541	prev_bio_flags);
				2542	if (ret < 0)
				2543	return ret;
				2544	bio = NULL;
				2545	} else {
				2546	return 0;
				2547	}
				2548	}
				2549	if (this_compressed)
				2550	nr = BIO_MAX_PAGES;
				2551	else
				2552	nr = bio_get_nr_vecs(bdev);
				2553
				2554	bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS \| __GFP_HIGH);
				2555	if (!bio)
				2556	return -ENOMEM;
				2557
				2558	bio_add_page(bio, page, page_size, offset);
				2559	bio->bi_end_io = end_io_func;
				2560	bio->bi_private = tree;
				2561
				2562	if (bio_ret)
				2563	*bio_ret = bio;
				2564	else
				2565	ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
				2566
				2567	return ret;
				2568	}
				2569
				2570	void attach_extent_buffer_page(struct extent_buffer eb, struct page page)
				2571	{
				2572	if (!PagePrivate(page)) {
				2573	SetPagePrivate(page);
				2574	page_cache_get(page);
				2575	set_page_private(page, (unsigned long)eb);
				2576	} else {
				2577	WARN_ON(page->private != (unsigned long)eb);
				2578	}
				2579	}
				2580
				2581	void set_page_extent_mapped(struct page *page)
				2582	{
				2583	if (!PagePrivate(page)) {
				2584	SetPagePrivate(page);
				2585	page_cache_get(page);
				2586	set_page_private(page, EXTENT_PAGE_PRIVATE);
				2587	}
				2588	}
				2589
				2590	/*
				2591	* basic readpage implementation. Locked extent state structs are inserted
				2592	* into the tree that are removed when the IO is done (by the end_io
				2593	* handlers)
				2594	* XXX JDM: This needs looking at to ensure proper page locking
				2595	*/
				2596	static int __extent_read_full_page(struct extent_io_tree *tree,
				2597	struct page *page,
				2598	get_extent_t *get_extent,
				2599	struct bio **bio, int mirror_num,
				2600	unsigned long *bio_flags)
				2601	{
				2602	struct inode *inode = page->mapping->host;
				2603	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
				2604	u64 page_end = start + PAGE_CACHE_SIZE - 1;
				2605	u64 end;
				2606	u64 cur = start;
				2607	u64 extent_offset;
				2608	u64 last_byte = i_size_read(inode);
				2609	u64 block_start;
				2610	u64 cur_end;
				2611	sector_t sector;
				2612	struct extent_map *em;
				2613	struct block_device *bdev;
				2614	struct btrfs_ordered_extent *ordered;
				2615	int ret;
				2616	int nr = 0;
				2617	size_t pg_offset = 0;
				2618	size_t iosize;
				2619	size_t disk_io_size;
				2620	size_t blocksize = inode->i_sb->s_blocksize;
				2621	unsigned long this_bio_flag = 0;
				2622
				2623	set_page_extent_mapped(page);
				2624
				2625	if (!PageUptodate(page)) {
				2626	if (cleancache_get_page(page) == 0) {
				2627	BUG_ON(blocksize != PAGE_SIZE);
				2628	goto out;
				2629	}
				2630	}
				2631
				2632	end = page_end;
				2633	while (1) {
				2634	lock_extent(tree, start, end);
				2635	ordered = btrfs_lookup_ordered_extent(inode, start);
				2636	if (!ordered)
				2637	break;
				2638	unlock_extent(tree, start, end);
				2639	btrfs_start_ordered_extent(inode, ordered, 1);
				2640	btrfs_put_ordered_extent(ordered);
				2641	}
				2642
				2643	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
				2644	char *userpage;
				2645	size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
				2646
				2647	if (zero_offset) {
				2648	iosize = PAGE_CACHE_SIZE - zero_offset;
				2649	userpage = kmap_atomic(page);
				2650	memset(userpage + zero_offset, 0, iosize);
				2651	flush_dcache_page(page);
				2652	kunmap_atomic(userpage);
				2653	}
				2654	}
				2655	while (cur <= end) {
				2656	if (cur >= last_byte) {
				2657	char *userpage;
				2658	struct extent_state *cached = NULL;
				2659
				2660	iosize = PAGE_CACHE_SIZE - pg_offset;
				2661	userpage = kmap_atomic(page);
				2662	memset(userpage + pg_offset, 0, iosize);
				2663	flush_dcache_page(page);
				2664	kunmap_atomic(userpage);
				2665	set_extent_uptodate(tree, cur, cur + iosize - 1,
				2666	&cached, GFP_NOFS);
				2667	unlock_extent_cached(tree, cur, cur + iosize - 1,
				2668	&cached, GFP_NOFS);
				2669	break;
				2670	}
				2671	em = get_extent(inode, page, pg_offset, cur,
				2672	end - cur + 1, 0);
				2673	if (IS_ERR_OR_NULL(em)) {
				2674	SetPageError(page);
				2675	unlock_extent(tree, cur, end);
				2676	break;
				2677	}
				2678	extent_offset = cur - em->start;
				2679	BUG_ON(extent_map_end(em) <= cur);
				2680	BUG_ON(end < cur);
				2681
				2682	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
				2683	this_bio_flag = EXTENT_BIO_COMPRESSED;
				2684	extent_set_compress_type(&this_bio_flag,
				2685	em->compress_type);
				2686	}
				2687
				2688	iosize = min(extent_map_end(em) - cur, end - cur + 1);
				2689	cur_end = min(extent_map_end(em) - 1, end);
				2690	iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
				2691	if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
				2692	disk_io_size = em->block_len;
				2693	sector = em->block_start >> 9;
				2694	} else {
				2695	sector = (em->block_start + extent_offset) >> 9;
				2696	disk_io_size = iosize;
				2697	}
				2698	bdev = em->bdev;
				2699	block_start = em->block_start;
				2700	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
				2701	block_start = EXTENT_MAP_HOLE;
				2702	free_extent_map(em);
				2703	em = NULL;
				2704
				2705	/* we've found a hole, just zero and go on */
				2706	if (block_start == EXTENT_MAP_HOLE) {
				2707	char *userpage;
				2708	struct extent_state *cached = NULL;
				2709
				2710	userpage = kmap_atomic(page);
				2711	memset(userpage + pg_offset, 0, iosize);
				2712	flush_dcache_page(page);
				2713	kunmap_atomic(userpage);
				2714
				2715	set_extent_uptodate(tree, cur, cur + iosize - 1,
				2716	&cached, GFP_NOFS);
				2717	unlock_extent_cached(tree, cur, cur + iosize - 1,
				2718	&cached, GFP_NOFS);
				2719	cur = cur + iosize;
				2720	pg_offset += iosize;
				2721	continue;
				2722	}
				2723	/* the get_extent function already copied into the page */
				2724	if (test_range_bit(tree, cur, cur_end,
				2725	EXTENT_UPTODATE, 1, NULL)) {
				2726	check_page_uptodate(tree, page);
				2727	unlock_extent(tree, cur, cur + iosize - 1);
				2728	cur = cur + iosize;
				2729	pg_offset += iosize;
				2730	continue;
				2731	}
				2732	/* we have an inline extent but it didn't get marked up
				2733	* to date. Error out
				2734	*/
				2735	if (block_start == EXTENT_MAP_INLINE) {
				2736	SetPageError(page);
				2737	unlock_extent(tree, cur, cur + iosize - 1);
				2738	cur = cur + iosize;
				2739	pg_offset += iosize;
				2740	continue;
				2741	}
				2742
				2743	ret = 0;
				2744	if (tree->ops && tree->ops->readpage_io_hook) {
				2745	ret = tree->ops->readpage_io_hook(page, cur,
				2746	cur + iosize - 1);
				2747	}
				2748	if (!ret) {
				2749	unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
				2750	pnr -= page->index;
				2751	ret = submit_extent_page(READ, tree, page,
				2752	sector, disk_io_size, pg_offset,
				2753	bdev, bio, pnr,
				2754	end_bio_extent_readpage, mirror_num,
				2755	*bio_flags,
				2756	this_bio_flag);
				2757	BUG_ON(ret == -ENOMEM);
				2758	nr++;
				2759	*bio_flags = this_bio_flag;
				2760	}
				2761	if (ret)
				2762	SetPageError(page);
				2763	cur = cur + iosize;
				2764	pg_offset += iosize;
				2765	}
				2766	out:
				2767	if (!nr) {
				2768	if (!PageError(page))
				2769	SetPageUptodate(page);
				2770	unlock_page(page);
				2771	}
				2772	return 0;
				2773	}
				2774
				2775	int extent_read_full_page(struct extent_io_tree tree, struct page page,
				2776	get_extent_t *get_extent, int mirror_num)
				2777	{
				2778	struct bio *bio = NULL;
				2779	unsigned long bio_flags = 0;
				2780	int ret;
				2781
				2782	ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
				2783	&bio_flags);
				2784	if (bio)
				2785	ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
				2786	return ret;
				2787	}
				2788
				2789	static noinline void update_nr_written(struct page *page,
				2790	struct writeback_control *wbc,
				2791	unsigned long nr_written)
				2792	{
				2793	wbc->nr_to_write -= nr_written;
				2794	if (wbc->range_cyclic \|\| (wbc->nr_to_write > 0 &&
				2795	wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
				2796	page->mapping->writeback_index = page->index + nr_written;
				2797	}
				2798
				2799	/*
				2800	* the writepage semantics are similar to regular writepage. extent
				2801	* records are inserted to lock ranges in the tree, and as dirty areas
				2802	* are found, they are marked writeback. Then the lock bits are removed
				2803	* and the end_io handler clears the writeback ranges
				2804	*/
				2805	static int __extent_writepage(struct page page, struct writeback_control wbc,
				2806	void *data)
				2807	{
				2808	struct inode *inode = page->mapping->host;
				2809	struct extent_page_data *epd = data;
				2810	struct extent_io_tree *tree = epd->tree;
				2811	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
				2812	u64 delalloc_start;
				2813	u64 page_end = start + PAGE_CACHE_SIZE - 1;
				2814	u64 end;
				2815	u64 cur = start;
				2816	u64 extent_offset;
				2817	u64 last_byte = i_size_read(inode);
				2818	u64 block_start;
				2819	u64 iosize;
				2820	sector_t sector;
				2821	struct extent_state *cached_state = NULL;
				2822	struct extent_map *em;
				2823	struct block_device *bdev;
				2824	int ret;
				2825	int nr = 0;
				2826	size_t pg_offset = 0;
				2827	size_t blocksize;
				2828	loff_t i_size = i_size_read(inode);
				2829	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
				2830	u64 nr_delalloc;
				2831	u64 delalloc_end;
				2832	int page_started;
				2833	int compressed;
				2834	int write_flags;
				2835	unsigned long nr_written = 0;
				2836	bool fill_delalloc = true;
				2837
				2838	if (wbc->sync_mode == WB_SYNC_ALL)
				2839	write_flags = WRITE_SYNC;
				2840	else
				2841	write_flags = WRITE;
				2842
				2843	trace___extent_writepage(page, inode, wbc);
				2844
				2845	WARN_ON(!PageLocked(page));
				2846
				2847	ClearPageError(page);
				2848
				2849	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
				2850	if (page->index > end_index \|\|
				2851	(page->index == end_index && !pg_offset)) {
				2852	page->mapping->a_ops->invalidatepage(page, 0);
				2853	unlock_page(page);
				2854	return 0;
				2855	}
				2856
				2857	if (page->index == end_index) {
				2858	char *userpage;
				2859
				2860	userpage = kmap_atomic(page);
				2861	memset(userpage + pg_offset, 0,
				2862	PAGE_CACHE_SIZE - pg_offset);
				2863	kunmap_atomic(userpage);
				2864	flush_dcache_page(page);
				2865	}
				2866	pg_offset = 0;
				2867
				2868	set_page_extent_mapped(page);
				2869
				2870	if (!tree->ops \|\| !tree->ops->fill_delalloc)
				2871	fill_delalloc = false;
				2872
				2873	delalloc_start = start;
				2874	delalloc_end = 0;
				2875	page_started = 0;
				2876	if (!epd->extent_locked && fill_delalloc) {
				2877	u64 delalloc_to_write = 0;
				2878	/*
				2879	* make sure the wbc mapping index is at least updated
				2880	* to this page.
				2881	*/
				2882	update_nr_written(page, wbc, 0);
				2883
				2884	while (delalloc_end < page_end) {
				2885	nr_delalloc = find_lock_delalloc_range(inode, tree,
				2886	page,
				2887	&delalloc_start,
				2888	&delalloc_end,
				2889	128 * 1024 * 1024);
				2890	if (nr_delalloc == 0) {
				2891	delalloc_start = delalloc_end + 1;
				2892	continue;
				2893	}
				2894	ret = tree->ops->fill_delalloc(inode, page,
				2895	delalloc_start,
				2896	delalloc_end,
				2897	&page_started,
				2898	&nr_written);
				2899	/* File system has been set read-only */
				2900	if (ret) {
				2901	SetPageError(page);
				2902	goto done;
				2903	}
				2904	/*
				2905	* delalloc_end is already one less than the total
				2906	* length, so we don't subtract one from
				2907	* PAGE_CACHE_SIZE
				2908	*/
				2909	delalloc_to_write += (delalloc_end - delalloc_start +
				2910	PAGE_CACHE_SIZE) >>
				2911	PAGE_CACHE_SHIFT;
				2912	delalloc_start = delalloc_end + 1;
				2913	}
				2914	if (wbc->nr_to_write < delalloc_to_write) {
				2915	int thresh = 8192;
				2916
				2917	if (delalloc_to_write < thresh * 2)
				2918	thresh = delalloc_to_write;
				2919	wbc->nr_to_write = min_t(u64, delalloc_to_write,
				2920	thresh);
				2921	}
				2922
				2923	/* did the fill delalloc function already unlock and start
				2924	* the IO?
				2925	*/
				2926	if (page_started) {
				2927	ret = 0;
				2928	/*
				2929	* we've unlocked the page, so we can't update
				2930	* the mapping's writeback index, just update
				2931	* nr_to_write.
				2932	*/
				2933	wbc->nr_to_write -= nr_written;
				2934	goto done_unlocked;
				2935	}
				2936	}
				2937	if (tree->ops && tree->ops->writepage_start_hook) {
				2938	ret = tree->ops->writepage_start_hook(page, start,
				2939	page_end);
				2940	if (ret) {
				2941	/* Fixup worker will requeue */
				2942	if (ret == -EBUSY)
				2943	wbc->pages_skipped++;
				2944	else
				2945	redirty_page_for_writepage(wbc, page);
				2946	update_nr_written(page, wbc, nr_written);
				2947	unlock_page(page);
				2948	ret = 0;
				2949	goto done_unlocked;
				2950	}
				2951	}
				2952
				2953	/*
				2954	* we don't want to touch the inode after unlocking the page,
				2955	* so we update the mapping writeback index now
				2956	*/
				2957	update_nr_written(page, wbc, nr_written + 1);
				2958
				2959	end = page_end;
				2960	if (last_byte <= start) {
				2961	if (tree->ops && tree->ops->writepage_end_io_hook)
				2962	tree->ops->writepage_end_io_hook(page, start,
				2963	page_end, NULL, 1);
				2964	goto done;
				2965	}
				2966
				2967	blocksize = inode->i_sb->s_blocksize;
				2968
				2969	while (cur <= end) {
				2970	if (cur >= last_byte) {
				2971	if (tree->ops && tree->ops->writepage_end_io_hook)
				2972	tree->ops->writepage_end_io_hook(page, cur,
				2973	page_end, NULL, 1);
				2974	break;
				2975	}
				2976	em = epd->get_extent(inode, page, pg_offset, cur,
				2977	end - cur + 1, 1);
				2978	if (IS_ERR_OR_NULL(em)) {
				2979	SetPageError(page);
				2980	break;
				2981	}
				2982
				2983	extent_offset = cur - em->start;
				2984	BUG_ON(extent_map_end(em) <= cur);
				2985	BUG_ON(end < cur);
				2986	iosize = min(extent_map_end(em) - cur, end - cur + 1);
				2987	iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
				2988	sector = (em->block_start + extent_offset) >> 9;
				2989	bdev = em->bdev;
				2990	block_start = em->block_start;
				2991	compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
				2992	free_extent_map(em);
				2993	em = NULL;
				2994
				2995	/*
				2996	* compressed and inline extents are written through other
				2997	* paths in the FS
				2998	*/
				2999	if (compressed \|\| block_start == EXTENT_MAP_HOLE \|\|
				3000	block_start == EXTENT_MAP_INLINE) {
				3001	/*
				3002	* end_io notification does not happen here for
				3003	* compressed extents
				3004	*/
				3005	if (!compressed && tree->ops &&
				3006	tree->ops->writepage_end_io_hook)
				3007	tree->ops->writepage_end_io_hook(page, cur,
				3008	cur + iosize - 1,
				3009	NULL, 1);
				3010	else if (compressed) {
				3011	/* we don't want to end_page_writeback on
				3012	* a compressed extent. this happens
				3013	* elsewhere
				3014	*/
				3015	nr++;
				3016	}
				3017
				3018	cur += iosize;
				3019	pg_offset += iosize;
				3020	continue;
				3021	}
				3022	/* leave this out until we have a page_mkwrite call */
				3023	if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
				3024	EXTENT_DIRTY, 0, NULL)) {
				3025	cur = cur + iosize;
				3026	pg_offset += iosize;
				3027	continue;
				3028	}
				3029
				3030	if (tree->ops && tree->ops->writepage_io_hook) {
				3031	ret = tree->ops->writepage_io_hook(page, cur,
				3032	cur + iosize - 1);
				3033	} else {
				3034	ret = 0;
				3035	}
				3036	if (ret) {
				3037	SetPageError(page);
				3038	} else {
				3039	unsigned long max_nr = end_index + 1;
				3040
				3041	set_range_writeback(tree, cur, cur + iosize - 1);
				3042	if (!PageWriteback(page)) {
				3043	printk(KERN_ERR "btrfs warning page %lu not "
				3044	"writeback, cur %llu end %llu\n",
				3045	page->index, (unsigned long long)cur,
				3046	(unsigned long long)end);
				3047	}
				3048
				3049	ret = submit_extent_page(write_flags, tree, page,
				3050	sector, iosize, pg_offset,
				3051	bdev, &epd->bio, max_nr,
				3052	end_bio_extent_writepage,
				3053	0, 0, 0);
				3054	if (ret)
				3055	SetPageError(page);
				3056	}
				3057	cur = cur + iosize;
				3058	pg_offset += iosize;
				3059	nr++;
				3060	}
				3061	done:
				3062	if (nr == 0) {
				3063	/* make sure the mapping tag for page dirty gets cleared */
				3064	set_page_writeback(page);
				3065	end_page_writeback(page);
				3066	}
				3067	unlock_page(page);
				3068
				3069	done_unlocked:
				3070
				3071	/* drop our reference on any cached states */
				3072	free_extent_state(cached_state);
				3073	return 0;
				3074	}
				3075
				/*
				 * Sleep action handed to wait_on_bit(): gives up the CPU through
				 * io_schedule() and always reports 0.  @word is not looked at.
				 */
				static int eb_wait(void *word)
				{
					io_schedule();

					return 0;
				}
				3081
				3082	static void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
				3083	{
				3084	wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait,
				3085	TASK_UNINTERRUPTIBLE);
				3086	}
				3087
				3088	static int lock_extent_buffer_for_io(struct extent_buffer *eb,
				3089	struct btrfs_fs_info *fs_info,
				3090	struct extent_page_data *epd)
				3091	{
				3092	unsigned long i, num_pages;
				3093	int flush = 0;
				3094	int ret = 0;
				3095
				3096	if (!btrfs_try_tree_write_lock(eb)) {
				3097	flush = 1;
				3098	flush_write_bio(epd);
				3099	btrfs_tree_lock(eb);
				3100	}
				3101
				3102	if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
				3103	btrfs_tree_unlock(eb);
				3104	if (!epd->sync_io)
				3105	return 0;
				3106	if (!flush) {
				3107	flush_write_bio(epd);
				3108	flush = 1;
				3109	}
				3110	while (1) {
				3111	wait_on_extent_buffer_writeback(eb);
				3112	btrfs_tree_lock(eb);
				3113	if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
				3114	break;
				3115	btrfs_tree_unlock(eb);
				3116	}
				3117	}
				3118
				3119	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
				3120	set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
				3121	btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
				3122	spin_lock(&fs_info->delalloc_lock);
				3123	if (fs_info->dirty_metadata_bytes >= eb->len)
				3124	fs_info->dirty_metadata_bytes -= eb->len;
				3125	else
				3126	WARN_ON(1);
				3127	spin_unlock(&fs_info->delalloc_lock);
				3128	ret = 1;
				3129	}
				3130
				3131	btrfs_tree_unlock(eb);
				3132
				3133	if (!ret)
				3134	return ret;
				3135
				3136	num_pages = num_extent_pages(eb->start, eb->len);
				3137	for (i = 0; i < num_pages; i++) {
				3138	struct page *p = extent_buffer_page(eb, i);
				3139
				3140	if (!trylock_page(p)) {
				3141	if (!flush) {
				3142	flush_write_bio(epd);
				3143	flush = 1;
				3144	}
				3145	lock_page(p);
				3146	}
				3147	}
				3148
				3149	return ret;
				3150	}
				3151
				3152	static void end_extent_buffer_writeback(struct extent_buffer *eb)
				3153	{
				3154	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
				3155	smp_mb__after_clear_bit();
				3156	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
				3157	}
				3158
				3159	static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
				3160	{
				3161	int uptodate = err == 0;
				3162	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
				3163	struct extent_buffer *eb;
				3164	int done;
				3165
				3166	do {
				3167	struct page *page = bvec->bv_page;
				3168
				3169	bvec--;
				3170	eb = (struct extent_buffer *)page->private;
				3171	BUG_ON(!eb);
				3172	done = atomic_dec_and_test(&eb->io_pages);
				3173
				3174	if (!uptodate \|\| test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
				3175	set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
				3176	ClearPageUptodate(page);
				3177	SetPageError(page);
				3178	}
				3179
				3180	end_page_writeback(page);
				3181
				3182	if (!done)
				3183	continue;
				3184
				3185	end_extent_buffer_writeback(eb);
				3186	} while (bvec >= bio->bi_io_vec);
				3187
				3188	bio_put(bio);
				3189
				3190	}
				3191
				3192	static int write_one_eb(struct extent_buffer *eb,
				3193	struct btrfs_fs_info *fs_info,
				3194	struct writeback_control *wbc,
				3195	struct extent_page_data *epd)
				3196	{
				3197	struct block_device *bdev = fs_info->fs_devices->latest_bdev;
				3198	u64 offset = eb->start;
				3199	unsigned long i, num_pages;
				3200	int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
				3201	int ret;
				3202
				3203	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
				3204	num_pages = num_extent_pages(eb->start, eb->len);
				3205	atomic_set(&eb->io_pages, num_pages);
				3206	for (i = 0; i < num_pages; i++) {
				3207	struct page *p = extent_buffer_page(eb, i);
				3208
				3209	clear_page_dirty_for_io(p);
				3210	set_page_writeback(p);
				3211	ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
				3212	PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
				3213	-1, end_bio_extent_buffer_writepage,
				3214	0, 0, 0);
				3215	if (ret) {
				3216	set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
				3217	SetPageError(p);
				3218	if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
				3219	end_extent_buffer_writeback(eb);
				3220	ret = -EIO;
				3221	break;
				3222	}
				3223	offset += PAGE_CACHE_SIZE;
				3224	update_nr_written(p, wbc, 1);
				3225	unlock_page(p);
				3226	}
				3227
				3228	if (unlikely(ret)) {
				3229	for (; i < num_pages; i++) {
				3230	struct page *p = extent_buffer_page(eb, i);
				3231	unlock_page(p);
				3232	}
				3233	}
				3234
				3235	return ret;
				3236	}
				3237
				3238	int btree_write_cache_pages(struct address_space *mapping,
				3239	struct writeback_control *wbc)
				3240	{
				3241	struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
				3242	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
				3243	struct extent_buffer eb, prev_eb = NULL;
				3244	struct extent_page_data epd = {
				3245	.bio = NULL,
				3246	.tree = tree,
				3247	.extent_locked = 0,
				3248	.sync_io = wbc->sync_mode == WB_SYNC_ALL,
				3249	};
				3250	int ret = 0;
				3251	int done = 0;
				3252	int nr_to_write_done = 0;
				3253	struct pagevec pvec;
				3254	int nr_pages;
				3255	pgoff_t index;
				3256	pgoff_t end; /* Inclusive */
				3257	int scanned = 0;
				3258	int tag;
				3259
				3260	pagevec_init(&pvec, 0);
				3261	if (wbc->range_cyclic) {
				3262	index = mapping->writeback_index; /* Start from prev offset */
				3263	end = -1;
				3264	} else {
				3265	index = wbc->range_start >> PAGE_CACHE_SHIFT;
				3266	end = wbc->range_end >> PAGE_CACHE_SHIFT;
				3267	scanned = 1;
				3268	}
				3269	if (wbc->sync_mode == WB_SYNC_ALL)
				3270	tag = PAGECACHE_TAG_TOWRITE;
				3271	else
				3272	tag = PAGECACHE_TAG_DIRTY;
				3273	retry:
				3274	if (wbc->sync_mode == WB_SYNC_ALL)
				3275	tag_pages_for_writeback(mapping, index, end);
				3276	while (!done && !nr_to_write_done && (index <= end) &&
				3277	(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
				3278	min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
				3279	unsigned i;
				3280
				3281	scanned = 1;
				3282	for (i = 0; i < nr_pages; i++) {
				3283	struct page *page = pvec.pages[i];
				3284
				3285	if (!PagePrivate(page))
				3286	continue;
				3287
				3288	if (!wbc->range_cyclic && page->index > end) {
				3289	done = 1;
				3290	break;
				3291	}
				3292
				3293	eb = (struct extent_buffer *)page->private;
				3294	if (!eb) {
				3295	WARN_ON(1);
				3296	continue;
				3297	}
				3298
				3299	if (eb == prev_eb)
				3300	continue;
				3301
				3302	if (!atomic_inc_not_zero(&eb->refs)) {
				3303	WARN_ON(1);
				3304	continue;
				3305	}
				3306
				3307	prev_eb = eb;
				3308	ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
				3309	if (!ret) {
				3310	free_extent_buffer(eb);
				3311	continue;
				3312	}
				3313
				3314	ret = write_one_eb(eb, fs_info, wbc, &epd);
				3315	if (ret) {
				3316	done = 1;
				3317	free_extent_buffer(eb);
				3318	break;
				3319	}
				3320	free_extent_buffer(eb);
				3321
				3322	/*
				3323	* the filesystem may choose to bump up nr_to_write.
				3324	* We have to make sure to honor the new nr_to_write
				3325	* at any time
				3326	*/
				3327	nr_to_write_done = wbc->nr_to_write <= 0;
				3328	}
				3329	pagevec_release(&pvec);
				3330	cond_resched();
				3331	}
				3332	if (!scanned && !done) {
				3333	/*
				3334	* We hit the last page and there is more work to be done: wrap
				3335	* back to the start of the file
				3336	*/
				3337	scanned = 1;
				3338	index = 0;
				3339	goto retry;
				3340	}
				3341	flush_write_bio(&epd);
				3342	return ret;
				3343	}
				3344
				3345	/**
				3346	* write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
				3347	* @mapping: address space structure to write
				3348	* @wbc: subtract the number of written pages from *@wbc->nr_to_write
				3349	* @writepage: function called for each page
				3350	* @data: data passed to writepage function
				3351	*
				3352	* If a page is already under I/O, write_cache_pages() skips it, even
				3353	* if it's dirty. This is desirable behaviour for memory-cleaning writeback,
				3354	* but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
				3355	* and msync() need to guarantee that all the data which was dirty at the time
				3356	* the call was made get new I/O started against them. If wbc->sync_mode is
				3357	* WB_SYNC_ALL then we were called for data integrity and we must wait for
				3358	* existing IO to complete.
				3359	*/
				3360	static int extent_write_cache_pages(struct extent_io_tree *tree,
				3361	struct address_space *mapping,
				3362	struct writeback_control *wbc,
				3363	writepage_t writepage, void *data,
				3364	void (flush_fn)(void ))
				3365	{
				3366	int ret = 0;
				3367	int done = 0;
				3368	int nr_to_write_done = 0;
				3369	struct pagevec pvec;
				3370	int nr_pages;
				3371	pgoff_t index;
				3372	pgoff_t end; /* Inclusive */
				3373	int scanned = 0;
				3374	int tag;
				3375
				3376	pagevec_init(&pvec, 0);
				3377	if (wbc->range_cyclic) {
				3378	index = mapping->writeback_index; /* Start from prev offset */
				3379	end = -1;
				3380	} else {
				3381	index = wbc->range_start >> PAGE_CACHE_SHIFT;
				3382	end = wbc->range_end >> PAGE_CACHE_SHIFT;
				3383	scanned = 1;
				3384	}
				3385	if (wbc->sync_mode == WB_SYNC_ALL)
				3386	tag = PAGECACHE_TAG_TOWRITE;
				3387	else
				3388	tag = PAGECACHE_TAG_DIRTY;
				3389	retry:
				3390	if (wbc->sync_mode == WB_SYNC_ALL)
				3391	tag_pages_for_writeback(mapping, index, end);
				3392	while (!done && !nr_to_write_done && (index <= end) &&
				3393	(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
				3394	min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
				3395	unsigned i;
				3396
				3397	scanned = 1;
				3398	for (i = 0; i < nr_pages; i++) {
				3399	struct page *page = pvec.pages[i];
				3400
				3401	/*
				3402	* At this point we hold neither mapping->tree_lock nor
				3403	* lock on the page itself: the page may be truncated or
				3404	* invalidated (changing page->mapping to NULL), or even
				3405	* swizzled back from swapper_space to tmpfs file
				3406	* mapping
				3407	*/
				3408	if (tree->ops &&
				3409	tree->ops->write_cache_pages_lock_hook) {
				3410	tree->ops->write_cache_pages_lock_hook(page,
				3411	data, flush_fn);
				3412	} else {
				3413	if (!trylock_page(page)) {
				3414	flush_fn(data);
				3415	lock_page(page);
				3416	}
				3417	}
				3418
				3419	if (unlikely(page->mapping != mapping)) {
				3420	unlock_page(page);
				3421	continue;
				3422	}
				3423
				3424	if (!wbc->range_cyclic && page->index > end) {
				3425	done = 1;
				3426	unlock_page(page);
				3427	continue;
				3428	}
				3429
				3430	if (wbc->sync_mode != WB_SYNC_NONE) {
				3431	if (PageWriteback(page))
				3432	flush_fn(data);
				3433	wait_on_page_writeback(page);
				3434	}
				3435
				3436	if (PageWriteback(page) \|\|
				3437	!clear_page_dirty_for_io(page)) {
				3438	unlock_page(page);
				3439	continue;
				3440	}
				3441
				3442	ret = (*writepage)(page, wbc, data);
				3443
				3444	if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
				3445	unlock_page(page);
				3446	ret = 0;
				3447	}
				3448	if (ret)
				3449	done = 1;
				3450
				3451	/*
				3452	* the filesystem may choose to bump up nr_to_write.
				3453	* We have to make sure to honor the new nr_to_write
				3454	* at any time
				3455	*/
				3456	nr_to_write_done = wbc->nr_to_write <= 0;
				3457	}
				3458	pagevec_release(&pvec);
				3459	cond_resched();
				3460	}
				3461	if (!scanned && !done) {
				3462	/*
				3463	* We hit the last page and there is more work to be done: wrap
				3464	* back to the start of the file
				3465	*/
				3466	scanned = 1;
				3467	index = 0;
				3468	goto retry;
				3469	}
				3470	return ret;
				3471	}
				3472
				3473	static void flush_epd_write_bio(struct extent_page_data *epd)
				3474	{
				3475	if (epd->bio) {
				3476	int rw = WRITE;
				3477	int ret;
				3478
				3479	if (epd->sync_io)
				3480	rw = WRITE_SYNC;
				3481
				3482	ret = submit_one_bio(rw, epd->bio, 0, 0);
				3483	BUG_ON(ret < 0); /* -ENOMEM */
				3484	epd->bio = NULL;
				3485	}
				3486	}
				3487
				3488	static noinline void flush_write_bio(void *data)
				3489	{
				3490	struct extent_page_data *epd = data;
				3491	flush_epd_write_bio(epd);
				3492	}
				3493
				3494	int extent_write_full_page(struct extent_io_tree tree, struct page page,
				3495	get_extent_t *get_extent,
				3496	struct writeback_control *wbc)
				3497	{
				3498	int ret;
				3499	struct extent_page_data epd = {
				3500	.bio = NULL,
				3501	.tree = tree,
				3502	.get_extent = get_extent,
				3503	.extent_locked = 0,
				3504	.sync_io = wbc->sync_mode == WB_SYNC_ALL,
				3505	};
				3506
				3507	ret = __extent_writepage(page, wbc, &epd);
				3508
				3509	flush_epd_write_bio(&epd);
				3510	return ret;
				3511	}
				3512
				3513	int extent_write_locked_range(struct extent_io_tree tree, struct inode inode,
				3514	u64 start, u64 end, get_extent_t *get_extent,
				3515	int mode)
				3516	{
				3517	int ret = 0;
				3518	struct address_space *mapping = inode->i_mapping;
				3519	struct page *page;
				3520	unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
				3521	PAGE_CACHE_SHIFT;
				3522
				3523	struct extent_page_data epd = {
				3524	.bio = NULL,
				3525	.tree = tree,
				3526	.get_extent = get_extent,
				3527	.extent_locked = 1,
				3528	.sync_io = mode == WB_SYNC_ALL,
				3529	};
				3530	struct writeback_control wbc_writepages = {
				3531	.sync_mode = mode,
				3532	.nr_to_write = nr_pages * 2,
				3533	.range_start = start,
				3534	.range_end = end + 1,
				3535	};
				3536
				3537	while (start <= end) {
				3538	page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
				3539	if (clear_page_dirty_for_io(page))
				3540	ret = __extent_writepage(page, &wbc_writepages, &epd);
				3541	else {
				3542	if (tree->ops && tree->ops->writepage_end_io_hook)
				3543	tree->ops->writepage_end_io_hook(page, start,
				3544	start + PAGE_CACHE_SIZE - 1,
				3545	NULL, 1);
				3546	unlock_page(page);
				3547	}
				3548	page_cache_release(page);
				3549	start += PAGE_CACHE_SIZE;
				3550	}
				3551
				3552	flush_epd_write_bio(&epd);
				3553	return ret;
				3554	}
				3555
				3556	int extent_writepages(struct extent_io_tree *tree,
				3557	struct address_space *mapping,
				3558	get_extent_t *get_extent,
				3559	struct writeback_control *wbc)
				3560	{
				3561	int ret = 0;
				3562	struct extent_page_data epd = {
				3563	.bio = NULL,
				3564	.tree = tree,
				3565	.get_extent = get_extent,
				3566	.extent_locked = 0,
				3567	.sync_io = wbc->sync_mode == WB_SYNC_ALL,
				3568	};
				3569
				3570	ret = extent_write_cache_pages(tree, mapping, wbc,
				3571	__extent_writepage, &epd,
				3572	flush_write_bio);
				3573	flush_epd_write_bio(&epd);
				3574	return ret;
				3575	}
				3576
				3577	int extent_readpages(struct extent_io_tree *tree,
				3578	struct address_space *mapping,
				3579	struct list_head *pages, unsigned nr_pages,
				3580	get_extent_t get_extent)
				3581	{
				3582	struct bio *bio = NULL;
				3583	unsigned page_idx;
				3584	unsigned long bio_flags = 0;
				3585
				3586	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
				3587	struct page *page = list_entry(pages->prev, struct page, lru);
				3588
				3589	prefetchw(&page->flags);
				3590	list_del(&page->lru);
				3591	if (!add_to_page_cache_lru(page, mapping,
				3592	page->index, GFP_NOFS)) {
				3593	__extent_read_full_page(tree, page, get_extent,
				3594	&bio, 0, &bio_flags);
				3595	}
				3596	page_cache_release(page);
				3597	}
				3598	BUG_ON(!list_empty(pages));
				3599	if (bio)
				3600	return submit_one_bio(READ, bio, 0, bio_flags);
				3601	return 0;
				3602	}
				3603
				3604	/*
				3605	* basic invalidatepage code, this waits on any locked or writeback
				3606	* ranges corresponding to the page, and then deletes any extent state
				3607	* records from the tree
				3608	*/
				3609	int extent_invalidatepage(struct extent_io_tree *tree,
				3610	struct page *page, unsigned long offset)
				3611	{
				3612	struct extent_state *cached_state = NULL;
				3613	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
				3614	u64 end = start + PAGE_CACHE_SIZE - 1;
				3615	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
				3616
				3617	start += (offset + blocksize - 1) & ~(blocksize - 1);
				3618	if (start > end)
				3619	return 0;
				3620
				3621	lock_extent_bits(tree, start, end, 0, &cached_state);
				3622	wait_on_page_writeback(page);
				3623	clear_extent_bit(tree, start, end,
				3624	EXTENT_LOCKED \| EXTENT_DIRTY \| EXTENT_DELALLOC \|
				3625	EXTENT_DO_ACCOUNTING,
				3626	1, 1, &cached_state, GFP_NOFS);
				3627	return 0;
				3628	}
				3629
				3630	/*
				3631	* a helper for releasepage, this tests for areas of the page that
				3632	* are locked or under IO and drops the related state bits if it is safe
				3633	* to drop the page.
				3634	*/
				3635	int try_release_extent_state(struct extent_map_tree *map,
				3636	struct extent_io_tree tree, struct page page,
				3637	gfp_t mask)
				3638	{
				3639	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
				3640	u64 end = start + PAGE_CACHE_SIZE - 1;
				3641	int ret = 1;
				3642
				3643	if (test_range_bit(tree, start, end,
				3644	EXTENT_IOBITS, 0, NULL))
				3645	ret = 0;
				3646	else {
				3647	if ((mask & GFP_NOFS) == GFP_NOFS)
				3648	mask = GFP_NOFS;
				3649	/*
				3650	* at this point we can safely clear everything except the
				3651	* locked bit and the nodatasum bit
				3652	*/
				3653	ret = clear_extent_bit(tree, start, end,
				3654	~(EXTENT_LOCKED \| EXTENT_NODATASUM),
				3655	0, 0, NULL, mask);
				3656
				3657	/* if clear_extent_bit failed for enomem reasons,
				3658	* we can't allow the release to continue.
				3659	*/
				3660	if (ret < 0)
				3661	ret = 0;
				3662	else
				3663	ret = 1;
				3664	}
				3665	return ret;
				3666	}
				3667
				3668	/*
				3669	* a helper for releasepage. As long as there are no locked extents
				3670	* in the range corresponding to the page, both state records and extent
				3671	* map records are removed
				3672	*/
				3673	int try_release_extent_mapping(struct extent_map_tree *map,
				3674	struct extent_io_tree tree, struct page page,
				3675	gfp_t mask)
				3676	{
				3677	struct extent_map *em;
				3678	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
				3679	u64 end = start + PAGE_CACHE_SIZE - 1;
				3680
				3681	if ((mask & __GFP_WAIT) &&
				3682	page->mapping->host->i_size > 16 * 1024 * 1024) {
				3683	u64 len;
				3684	while (start <= end) {
				3685	len = end - start + 1;
				3686	write_lock(&map->lock);
				3687	em = lookup_extent_mapping(map, start, len);
				3688	if (!em) {
				3689	write_unlock(&map->lock);
				3690	break;
				3691	}
				3692	if (test_bit(EXTENT_FLAG_PINNED, &em->flags) \|\|
				3693	em->start != start) {
				3694	write_unlock(&map->lock);
				3695	free_extent_map(em);
				3696	break;
				3697	}
				3698	if (!test_range_bit(tree, em->start,
				3699	extent_map_end(em) - 1,
				3700	EXTENT_LOCKED \| EXTENT_WRITEBACK,
				3701	0, NULL)) {
				3702	remove_extent_mapping(map, em);
				3703	/* once for the rb tree */
				3704	free_extent_map(em);
				3705	}
				3706	start = extent_map_end(em);
				3707	write_unlock(&map->lock);
				3708
				3709	/* once for us */
				3710	free_extent_map(em);
				3711	}
				3712	}
				3713	return try_release_extent_state(map, tree, page, mask);
				3714	}
				3715
				3716	/*
				3717	* helper function for fiemap, which doesn't want to see any holes.
				3718	* This maps until we find something past 'last'
				3719	*/
				3720	static struct extent_map get_extent_skip_holes(struct inode inode,
				3721	u64 offset,
				3722	u64 last,
				3723	get_extent_t *get_extent)
				3724	{
				3725	u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
				3726	struct extent_map *em;
				3727	u64 len;
				3728
				3729	if (offset >= last)
				3730	return NULL;
				3731
				3732	while(1) {
				3733	len = last - offset;
				3734	if (len == 0)
				3735	break;
				3736	len = (len + sectorsize - 1) & ~(sectorsize - 1);
				3737	em = get_extent(inode, NULL, 0, offset, len, 0);
				3738	if (IS_ERR_OR_NULL(em))
				3739	return em;
				3740
				3741	/* if this isn't a hole return it */
				3742	if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
				3743	em->block_start != EXTENT_MAP_HOLE) {
				3744	return em;
				3745	}
				3746
				3747	/* this is a hole, advance to the next extent */
				3748	offset = extent_map_end(em);
				3749	free_extent_map(em);
				3750	if (offset >= last)
				3751	break;
				3752	}
				3753	return NULL;
				3754	}
				3755
				3756	int extent_fiemap(struct inode inode, struct fiemap_extent_info fieinfo,
				3757	__u64 start, __u64 len, get_extent_t *get_extent)
				3758	{
				3759	int ret = 0;
				3760	u64 off = start;
				3761	u64 max = start + len;
				3762	u32 flags = 0;
				3763	u32 found_type;
				3764	u64 last;
				3765	u64 last_for_get_extent = 0;
				3766	u64 disko = 0;
				3767	u64 isize = i_size_read(inode);
				3768	struct btrfs_key found_key;
				3769	struct extent_map *em = NULL;
				3770	struct extent_state *cached_state = NULL;
				3771	struct btrfs_path *path;
				3772	struct btrfs_file_extent_item *item;
				3773	int end = 0;
				3774	u64 em_start = 0;
				3775	u64 em_len = 0;
				3776	u64 em_end = 0;
				3777	unsigned long emflags;
				3778
				3779	if (len == 0)
				3780	return -EINVAL;
				3781
				3782	path = btrfs_alloc_path();
				3783	if (!path)
				3784	return -ENOMEM;
				3785	path->leave_spinning = 1;
				3786
				3787	start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
				3788	len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
				3789
				3790	/*
				3791	* lookup the last file extent. We're not using i_size here
				3792	* because there might be preallocation past i_size
				3793	*/
				3794	ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
				3795	path, btrfs_ino(inode), -1, 0);
				3796	if (ret < 0) {
				3797	btrfs_free_path(path);
				3798	return ret;
				3799	}
				3800	WARN_ON(!ret);
				3801	path->slots[0]--;
				3802	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				3803	struct btrfs_file_extent_item);
				3804	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
				3805	found_type = btrfs_key_type(&found_key);
				3806
				3807	/* No extents, but there might be delalloc bits */
				3808	if (found_key.objectid != btrfs_ino(inode) \|\|
				3809	found_type != BTRFS_EXTENT_DATA_KEY) {
				3810	/* have to trust i_size as the end */
				3811	last = (u64)-1;
				3812	last_for_get_extent = isize;
				3813	} else {
				3814	/*
				3815	* remember the start of the last extent. There are a
				3816	* bunch of different factors that go into the length of the
				3817	* extent, so its much less complex to remember where it started
				3818	*/
				3819	last = found_key.offset;
				3820	last_for_get_extent = last + 1;
				3821	}
				3822	btrfs_free_path(path);
				3823
				3824	/*
				3825	* we might have some extents allocated but more delalloc past those
				3826	* extents. so, we trust isize unless the start of the last extent is
				3827	* beyond isize
				3828	*/
				3829	if (last < isize) {
				3830	last = (u64)-1;
				3831	last_for_get_extent = isize;
				3832	}
				3833
				3834	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
				3835	&cached_state);
				3836
				3837	em = get_extent_skip_holes(inode, start, last_for_get_extent,
				3838	get_extent);
				3839	if (!em)
				3840	goto out;
				3841	if (IS_ERR(em)) {
				3842	ret = PTR_ERR(em);
				3843	goto out;
				3844	}
				3845
				3846	while (!end) {
				3847	u64 offset_in_extent;
				3848
				3849	/* break if the extent we found is outside the range */
				3850	if (em->start >= max \|\| extent_map_end(em) < off)
				3851	break;
				3852
				3853	/*
				3854	* get_extent may return an extent that starts before our
				3855	* requested range. We have to make sure the ranges
				3856	* we return to fiemap always move forward and don't
				3857	* overlap, so adjust the offsets here
				3858	*/
				3859	em_start = max(em->start, off);
				3860
				3861	/*
				3862	* record the offset from the start of the extent
				3863	* for adjusting the disk offset below
				3864	*/
				3865	offset_in_extent = em_start - em->start;
				3866	em_end = extent_map_end(em);
				3867	em_len = em_end - em_start;
				3868	emflags = em->flags;
				3869	disko = 0;
				3870	flags = 0;
				3871
				3872	/*
				3873	* bump off for our next call to get_extent
				3874	*/
				3875	off = extent_map_end(em);
				3876	if (off >= max)
				3877	end = 1;
				3878
				3879	if (em->block_start == EXTENT_MAP_LAST_BYTE) {
				3880	end = 1;
				3881	flags \|= FIEMAP_EXTENT_LAST;
				3882	} else if (em->block_start == EXTENT_MAP_INLINE) {
				3883	flags \|= (FIEMAP_EXTENT_DATA_INLINE \|
				3884	FIEMAP_EXTENT_NOT_ALIGNED);
				3885	} else if (em->block_start == EXTENT_MAP_DELALLOC) {
				3886	flags \|= (FIEMAP_EXTENT_DELALLOC \|
				3887	FIEMAP_EXTENT_UNKNOWN);
				3888	} else {
				3889	disko = em->block_start + offset_in_extent;
				3890	}
				3891	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
				3892	flags \|= FIEMAP_EXTENT_ENCODED;
				3893
				3894	free_extent_map(em);
				3895	em = NULL;
				3896	if ((em_start >= last) \|\| em_len == (u64)-1 \|\|
				3897	(last == (u64)-1 && isize <= em_end)) {
				3898	flags \|= FIEMAP_EXTENT_LAST;
				3899	end = 1;
				3900	}
				3901
				3902	/* now scan forward to see if this is really the last extent. */
				3903	em = get_extent_skip_holes(inode, off, last_for_get_extent,
				3904	get_extent);
				3905	if (IS_ERR(em)) {
				3906	ret = PTR_ERR(em);
				3907	goto out;
				3908	}
				3909	if (!em) {
				3910	flags \|= FIEMAP_EXTENT_LAST;
				3911	end = 1;
				3912	}
				3913	ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
				3914	em_len, flags);
				3915	if (ret)
				3916	goto out_free;
				3917	}
				3918	out_free:
				3919	free_extent_map(em);
				3920	out:
				3921	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
				3922	&cached_state, GFP_NOFS);
				3923	return ret;
				3924	}
				3925
				3926	inline struct page extent_buffer_page(struct extent_buffer eb,
				3927	unsigned long i)
				3928	{
				3929	return eb->pages[i];
				3930	}
				3931
				3932	inline unsigned long num_extent_pages(u64 start, u64 len)
				3933	{
				3934	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
				3935	(start >> PAGE_CACHE_SHIFT);
				3936	}
				3937
				3938	static void __free_extent_buffer(struct extent_buffer *eb)
				3939	{
				3940	#if LEAK_DEBUG
				3941	unsigned long flags;
				3942	spin_lock_irqsave(&leak_lock, flags);
				3943	list_del(&eb->leak_list);
				3944	spin_unlock_irqrestore(&leak_lock, flags);
				3945	#endif
				3946	if (eb->pages && eb->pages != eb->inline_pages)
				3947	kfree(eb->pages);
				3948	kmem_cache_free(extent_buffer_cache, eb);
				3949	}
				3950
				3951	static struct extent_buffer __alloc_extent_buffer(struct extent_io_tree tree,
				3952	u64 start,
				3953	unsigned long len,
				3954	gfp_t mask)
				3955	{
				3956	struct extent_buffer *eb = NULL;
				3957	#if LEAK_DEBUG
				3958	unsigned long flags;
				3959	#endif
				3960
				3961	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
				3962	if (eb == NULL)
				3963	return NULL;
				3964	eb->start = start;
				3965	eb->len = len;
				3966	eb->tree = tree;
				3967	rwlock_init(&eb->lock);
				3968	atomic_set(&eb->write_locks, 0);
				3969	atomic_set(&eb->read_locks, 0);
				3970	atomic_set(&eb->blocking_readers, 0);
				3971	atomic_set(&eb->blocking_writers, 0);
				3972	atomic_set(&eb->spinning_readers, 0);
				3973	atomic_set(&eb->spinning_writers, 0);
				3974	eb->lock_nested = 0;
				3975	init_waitqueue_head(&eb->write_lock_wq);
				3976	init_waitqueue_head(&eb->read_lock_wq);
				3977
				3978	#if LEAK_DEBUG
				3979	spin_lock_irqsave(&leak_lock, flags);
				3980	list_add(&eb->leak_list, &buffers);
				3981	spin_unlock_irqrestore(&leak_lock, flags);
				3982	#endif
				3983	spin_lock_init(&eb->refs_lock);
				3984	atomic_set(&eb->refs, 1);
				3985	atomic_set(&eb->io_pages, 0);
				3986
				3987	if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) {
				3988	struct page **pages;
				3989	int num_pages = (len + PAGE_CACHE_SIZE - 1) >>
				3990	PAGE_CACHE_SHIFT;
				3991	pages = kzalloc(num_pages, mask);
				3992	if (!pages) {
				3993	__free_extent_buffer(eb);
				3994	return NULL;
				3995	}
				3996	eb->pages = pages;
				3997	} else {
				3998	eb->pages = eb->inline_pages;
				3999	}
				4000
				4001	return eb;
				4002	}
				4003
				4004	static int extent_buffer_under_io(struct extent_buffer *eb)
				4005	{
				4006	return (atomic_read(&eb->io_pages) \|\|
				4007	test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) \|\|
				4008	test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
				4009	}
				4010
				4011	/*
				4012	* Helper for releasing extent buffer page.
				4013	*/
				4014	static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
				4015	unsigned long start_idx)
				4016	{
				4017	unsigned long index;
				4018	struct page *page;
				4019
				4020	BUG_ON(extent_buffer_under_io(eb));
				4021
				4022	index = num_extent_pages(eb->start, eb->len);
				4023	if (start_idx >= index)
				4024	return;
				4025
				4026	do {
				4027	index--;
				4028	page = extent_buffer_page(eb, index);
				4029	if (page) {
				4030	spin_lock(&page->mapping->private_lock);
				4031	/*
				4032	* We do this since we'll remove the pages after we've
				4033	* removed the eb from the radix tree, so we could race
				4034	* and have this page now attached to the new eb. So
				4035	* only clear page_private if it's still connected to
				4036	* this eb.
				4037	*/
				4038	if (PagePrivate(page) &&
				4039	page->private == (unsigned long)eb) {
				4040	BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
				4041	BUG_ON(PageDirty(page));
				4042	BUG_ON(PageWriteback(page));
				4043	/*
				4044	* We need to make sure we haven't be attached
				4045	* to a new eb.
				4046	*/
				4047	ClearPagePrivate(page);
				4048	set_page_private(page, 0);
				4049	/* One for the page private */
				4050	page_cache_release(page);
				4051	}
				4052	spin_unlock(&page->mapping->private_lock);
				4053
				4054	/* One for when we alloced the page */
				4055	page_cache_release(page);
				4056	}
				4057	} while (index != start_idx);
				4058	}
				4059
				/*
				 * Tear down an extent buffer completely: release every page it holds
				 * (from index 0 upwards) and then free the buffer itself.
				 */
				static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
				{
					/* All pages first, then the structure. */
					btrfs_release_extent_buffer_page(eb, 0);
					__free_extent_buffer(eb);
				}
				4068
				4069	static void check_buffer_tree_ref(struct extent_buffer *eb)
				4070	{
				4071	/* the ref bit is tricky. We have to make sure it is set
				4072	* if we have the buffer dirty. Otherwise the
				4073	* code to free a buffer can end up dropping a dirty
				4074	* page
				4075	*
				4076	* Once the ref bit is set, it won't go away while the
				4077	* buffer is dirty or in writeback, and it also won't
				4078	* go away while we have the reference count on the
				4079	* eb bumped.
				4080	*
				4081	* We can't just set the ref bit without bumping the
				4082	* ref on the eb because free_extent_buffer might
				4083	* see the ref bit and try to clear it. If this happens
				4084	* free_extent_buffer might end up dropping our original
				4085	* ref by mistake and freeing the page before we are able
				4086	* to add one more ref.
				4087	*
				4088	* So bump the ref count first, then set the bit. If someone
				4089	* beat us to it, drop the ref we added.
				4090	*/
				4091	if (!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
				4092	atomic_inc(&eb->refs);
				4093	if (test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
				4094	atomic_dec(&eb->refs);
				4095	}
				4096	}
				4097
				4098	static void mark_extent_buffer_accessed(struct extent_buffer *eb)
				4099	{
				4100	unsigned long num_pages, i;
				4101
				4102	check_buffer_tree_ref(eb);
				4103
				4104	num_pages = num_extent_pages(eb->start, eb->len);
				4105	for (i = 0; i < num_pages; i++) {
				4106	struct page *p = extent_buffer_page(eb, i);
				4107	mark_page_accessed(p);
				4108	}
				4109	}
				4110
				4111	struct extent_buffer alloc_extent_buffer(struct extent_io_tree tree,
				4112	u64 start, unsigned long len)
				4113	{
				4114	unsigned long num_pages = num_extent_pages(start, len);
				4115	unsigned long i;
				4116	unsigned long index = start >> PAGE_CACHE_SHIFT;
				4117	struct extent_buffer *eb;
				4118	struct extent_buffer *exists = NULL;
				4119	struct page *p;
				4120	struct address_space *mapping = tree->mapping;
				4121	int uptodate = 1;
				4122	int ret;
				4123
				4124	rcu_read_lock();
				4125	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
				4126	if (eb && atomic_inc_not_zero(&eb->refs)) {
				4127	rcu_read_unlock();
				4128	mark_extent_buffer_accessed(eb);
				4129	return eb;
				4130	}
				4131	rcu_read_unlock();
				4132
				4133	eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
				4134	if (!eb)
				4135	return NULL;
				4136
				4137	for (i = 0; i < num_pages; i++, index++) {
				4138	p = find_or_create_page(mapping, index, GFP_NOFS);
				4139	if (!p) {
				4140	WARN_ON(1);
				4141	goto free_eb;
				4142	}
				4143
				4144	spin_lock(&mapping->private_lock);
				4145	if (PagePrivate(p)) {
				4146	/*
				4147	* We could have already allocated an eb for this page
				4148	* and attached one so lets see if we can get a ref on
				4149	* the existing eb, and if we can we know it's good and
				4150	* we can just return that one, else we know we can just
				4151	* overwrite page->private.
				4152	*/
				4153	exists = (struct extent_buffer *)p->private;
				4154	if (atomic_inc_not_zero(&exists->refs)) {
				4155	spin_unlock(&mapping->private_lock);
				4156	unlock_page(p);
				4157	page_cache_release(p);
				4158	mark_extent_buffer_accessed(exists);
				4159	goto free_eb;
				4160	}
				4161
				4162	/*
				4163	* Do this so attach doesn't complain and we need to
				4164	* drop the ref the old guy had.
				4165	*/
				4166	ClearPagePrivate(p);
				4167	WARN_ON(PageDirty(p));
				4168	page_cache_release(p);
				4169	}
				4170	attach_extent_buffer_page(eb, p);
				4171	spin_unlock(&mapping->private_lock);
				4172	WARN_ON(PageDirty(p));
				4173	mark_page_accessed(p);
				4174	eb->pages[i] = p;
				4175	if (!PageUptodate(p))
				4176	uptodate = 0;
				4177
				4178	/*
				4179	* see below about how we avoid a nasty race with release page
				4180	* and why we unlock later
				4181	*/
				4182	}
				4183	if (uptodate)
				4184	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
				4185	again:
				4186	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
				4187	if (ret)
				4188	goto free_eb;
				4189
				4190	spin_lock(&tree->buffer_lock);
				4191	ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
				4192	if (ret == -EEXIST) {
				4193	exists = radix_tree_lookup(&tree->buffer,
				4194	start >> PAGE_CACHE_SHIFT);
				4195	if (!atomic_inc_not_zero(&exists->refs)) {
				4196	spin_unlock(&tree->buffer_lock);
				4197	radix_tree_preload_end();
				4198	exists = NULL;
				4199	goto again;
				4200	}
				4201	spin_unlock(&tree->buffer_lock);
				4202	radix_tree_preload_end();
				4203	mark_extent_buffer_accessed(exists);
				4204	goto free_eb;
				4205	}
				4206	/* add one reference for the tree */
				4207	spin_lock(&eb->refs_lock);
				4208	check_buffer_tree_ref(eb);
				4209	spin_unlock(&eb->refs_lock);
				4210	spin_unlock(&tree->buffer_lock);
				4211	radix_tree_preload_end();
				4212
				4213	/*
				4214	* there is a race where release page may have
				4215	* tried to find this extent buffer in the radix
				4216	* but failed. It will tell the VM it is safe to
				4217	* reclaim the, and it will clear the page private bit.
				4218	* We must make sure to set the page private bit properly
				4219	* after the extent buffer is in the radix tree so
				4220	* it doesn't get lost
				4221	*/
				4222	SetPageChecked(eb->pages[0]);
				4223	for (i = 1; i < num_pages; i++) {
				4224	p = extent_buffer_page(eb, i);
				4225	ClearPageChecked(p);
				4226	unlock_page(p);
				4227	}
				4228	unlock_page(eb->pages[0]);
				4229	return eb;
				4230
				4231	free_eb:
				4232	for (i = 0; i < num_pages; i++) {
				4233	if (eb->pages[i])
				4234	unlock_page(eb->pages[i]);
				4235	}
				4236
				4237	WARN_ON(!atomic_dec_and_test(&eb->refs));
				4238	btrfs_release_extent_buffer(eb);
				4239	return exists;
				4240	}
				4241
				4242	struct extent_buffer find_extent_buffer(struct extent_io_tree tree,
				4243	u64 start, unsigned long len)
				4244	{
				4245	struct extent_buffer *eb;
				4246
				4247	rcu_read_lock();
				4248	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
				4249	if (eb && atomic_inc_not_zero(&eb->refs)) {
				4250	rcu_read_unlock();
				4251	mark_extent_buffer_accessed(eb);
				4252	return eb;
				4253	}
				4254	rcu_read_unlock();
				4255
				4256	return NULL;
				4257	}
				4258
				4259	static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
				4260	{
				4261	struct extent_buffer *eb =
				4262	container_of(head, struct extent_buffer, rcu_head);
				4263
				4264	__free_extent_buffer(eb);
				4265	}
				4266
				4267	/* Expects to have eb->eb_lock already held */
				4268	static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
				4269	{
				4270	WARN_ON(atomic_read(&eb->refs) == 0);
				4271	if (atomic_dec_and_test(&eb->refs)) {
				4272	struct extent_io_tree *tree = eb->tree;
				4273
				4274	spin_unlock(&eb->refs_lock);
				4275
				4276	spin_lock(&tree->buffer_lock);
				4277	radix_tree_delete(&tree->buffer,
				4278	eb->start >> PAGE_CACHE_SHIFT);
				4279	spin_unlock(&tree->buffer_lock);
				4280
				4281	/* Should be safe to release our pages at this point */
				4282	btrfs_release_extent_buffer_page(eb, 0);
				4283
				4284	call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
				4285	return;
				4286	}
				4287	spin_unlock(&eb->refs_lock);
				4288	}
				4289
				4290	void free_extent_buffer(struct extent_buffer *eb)
				4291	{
				4292	if (!eb)
				4293	return;
				4294
				4295	spin_lock(&eb->refs_lock);
				4296	if (atomic_read(&eb->refs) == 2 &&
				4297	test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
				4298	!extent_buffer_under_io(eb) &&
				4299	test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
				4300	atomic_dec(&eb->refs);
				4301
				4302	/*
				4303	* I know this is terrible, but it's temporary until we stop tracking
				4304	* the uptodate bits and such for the extent buffers.
				4305	*/
				4306	release_extent_buffer(eb, GFP_ATOMIC);
				4307	}
				4308
				4309	void free_extent_buffer_stale(struct extent_buffer *eb)
				4310	{
				4311	if (!eb)
				4312	return;
				4313
				4314	spin_lock(&eb->refs_lock);
				4315	set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
				4316
				4317	if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
				4318	test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
				4319	atomic_dec(&eb->refs);
				4320	release_extent_buffer(eb, GFP_NOFS);
				4321	}
				4322
				4323	void clear_extent_buffer_dirty(struct extent_buffer *eb)
				4324	{
				4325	unsigned long i;
				4326	unsigned long num_pages;
				4327	struct page *page;
				4328
				4329	num_pages = num_extent_pages(eb->start, eb->len);
				4330
				4331	for (i = 0; i < num_pages; i++) {
				4332	page = extent_buffer_page(eb, i);
				4333	if (!PageDirty(page))
				4334	continue;
				4335
				4336	lock_page(page);
				4337	WARN_ON(!PagePrivate(page));
				4338
				4339	clear_page_dirty_for_io(page);
				4340	spin_lock_irq(&page->mapping->tree_lock);
				4341	if (!PageDirty(page)) {
				4342	radix_tree_tag_clear(&page->mapping->page_tree,
				4343	page_index(page),
				4344	PAGECACHE_TAG_DIRTY);
				4345	}
				4346	spin_unlock_irq(&page->mapping->tree_lock);
				4347	ClearPageError(page);
				4348	unlock_page(page);
				4349	}
				4350	WARN_ON(atomic_read(&eb->refs) == 0);
				4351	}
				4352
				4353	int set_extent_buffer_dirty(struct extent_buffer *eb)
				4354	{
				4355	unsigned long i;
				4356	unsigned long num_pages;
				4357	int was_dirty = 0;
				4358
				4359	check_buffer_tree_ref(eb);
				4360
				4361	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
				4362
				4363	num_pages = num_extent_pages(eb->start, eb->len);
				4364	WARN_ON(atomic_read(&eb->refs) == 0);
				4365	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
				4366
				4367	for (i = 0; i < num_pages; i++)
				4368	set_page_dirty(extent_buffer_page(eb, i));
				4369	return was_dirty;
				4370	}
				4371
				4372	static int range_straddles_pages(u64 start, u64 len)
				4373	{
				4374	if (len < PAGE_CACHE_SIZE)
				4375	return 1;
				4376	if (start & (PAGE_CACHE_SIZE - 1))
				4377	return 1;
				4378	if ((start + len) & (PAGE_CACHE_SIZE - 1))
				4379	return 1;
				4380	return 0;
				4381	}
				4382
				4383	int clear_extent_buffer_uptodate(struct extent_buffer *eb)
				4384	{
				4385	unsigned long i;
				4386	struct page *page;
				4387	unsigned long num_pages;
				4388
				4389	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
				4390	num_pages = num_extent_pages(eb->start, eb->len);
				4391	for (i = 0; i < num_pages; i++) {
				4392	page = extent_buffer_page(eb, i);
				4393	if (page)
				4394	ClearPageUptodate(page);
				4395	}
				4396	return 0;
				4397	}
				4398
				4399	int set_extent_buffer_uptodate(struct extent_buffer *eb)
				4400	{
				4401	unsigned long i;
				4402	struct page *page;
				4403	unsigned long num_pages;
				4404
				4405	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
				4406	num_pages = num_extent_pages(eb->start, eb->len);
				4407	for (i = 0; i < num_pages; i++) {
				4408	page = extent_buffer_page(eb, i);
				4409	SetPageUptodate(page);
				4410	}
				4411	return 0;
				4412	}
				4413
				4414	int extent_range_uptodate(struct extent_io_tree *tree,
				4415	u64 start, u64 end)
				4416	{
				4417	struct page *page;
				4418	int ret;
				4419	int pg_uptodate = 1;
				4420	int uptodate;
				4421	unsigned long index;
				4422
				4423	if (range_straddles_pages(start, end - start + 1)) {
				4424	ret = test_range_bit(tree, start, end,
				4425	EXTENT_UPTODATE, 1, NULL);
				4426	if (ret)
				4427	return 1;
				4428	}
				4429	while (start <= end) {
				4430	index = start >> PAGE_CACHE_SHIFT;
				4431	page = find_get_page(tree->mapping, index);
				4432	if (!page)
				4433	return 1;
				4434	uptodate = PageUptodate(page);
				4435	page_cache_release(page);
				4436	if (!uptodate) {
				4437	pg_uptodate = 0;
				4438	break;
				4439	}
				4440	start += PAGE_CACHE_SIZE;
				4441	}
				4442	return pg_uptodate;
				4443	}
				4444
				4445	int extent_buffer_uptodate(struct extent_buffer *eb)
				4446	{
				4447	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
				4448	}
				4449
				4450	int read_extent_buffer_pages(struct extent_io_tree *tree,
				4451	struct extent_buffer *eb, u64 start, int wait,
				4452	get_extent_t *get_extent, int mirror_num)
				4453	{
				4454	unsigned long i;
				4455	unsigned long start_i;
				4456	struct page *page;
				4457	int err;
				4458	int ret = 0;
				4459	int locked_pages = 0;
				4460	int all_uptodate = 1;
				4461	unsigned long num_pages;
				4462	unsigned long num_reads = 0;
				4463	struct bio *bio = NULL;
				4464	unsigned long bio_flags = 0;
				4465
				4466	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
				4467	return 0;
				4468
				4469	if (start) {
				4470	WARN_ON(start < eb->start);
				4471	start_i = (start >> PAGE_CACHE_SHIFT) -
				4472	(eb->start >> PAGE_CACHE_SHIFT);
				4473	} else {
				4474	start_i = 0;
				4475	}
				4476
				4477	num_pages = num_extent_pages(eb->start, eb->len);
				4478	for (i = start_i; i < num_pages; i++) {
				4479	page = extent_buffer_page(eb, i);
				4480	if (wait == WAIT_NONE) {
				4481	if (!trylock_page(page))
				4482	goto unlock_exit;
				4483	} else {
				4484	lock_page(page);
				4485	}
				4486	locked_pages++;
				4487	if (!PageUptodate(page)) {
				4488	num_reads++;
				4489	all_uptodate = 0;
				4490	}
				4491	}
				4492	if (all_uptodate) {
				4493	if (start_i == 0)
				4494	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
				4495	goto unlock_exit;
				4496	}
				4497
				4498	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
				4499	eb->read_mirror = 0;
				4500	atomic_set(&eb->io_pages, num_reads);
				4501	for (i = start_i; i < num_pages; i++) {
				4502	page = extent_buffer_page(eb, i);
				4503	if (!PageUptodate(page)) {
				4504	ClearPageError(page);
				4505	err = __extent_read_full_page(tree, page,
				4506	get_extent, &bio,
				4507	mirror_num, &bio_flags);
				4508	if (err)
				4509	ret = err;
				4510	} else {
				4511	unlock_page(page);
				4512	}
				4513	}
				4514
				4515	if (bio) {
				4516	err = submit_one_bio(READ, bio, mirror_num, bio_flags);
				4517	if (err)
				4518	return err;
				4519	}
				4520
				4521	if (ret \|\| wait != WAIT_COMPLETE)
				4522	return ret;
				4523
				4524	for (i = start_i; i < num_pages; i++) {
				4525	page = extent_buffer_page(eb, i);
				4526	wait_on_page_locked(page);
				4527	if (!PageUptodate(page))
				4528	ret = -EIO;
				4529	}
				4530
				4531	return ret;
				4532
				4533	unlock_exit:
				4534	i = start_i;
				4535	while (locked_pages > 0) {
				4536	page = extent_buffer_page(eb, i);
				4537	i++;
				4538	unlock_page(page);
				4539	locked_pages--;
				4540	}
				4541	return ret;
				4542	}
				4543
				4544	void read_extent_buffer(struct extent_buffer eb, void dstv,
				4545	unsigned long start,
				4546	unsigned long len)
				4547	{
				4548	size_t cur;
				4549	size_t offset;
				4550	struct page *page;
				4551	char *kaddr;
				4552	char dst = (char )dstv;
				4553	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
				4554	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
				4555
				4556	WARN_ON(start > eb->len);
				4557	WARN_ON(start + len > eb->start + eb->len);
				4558
				4559	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
				4560
				4561	while (len > 0) {
				4562	page = extent_buffer_page(eb, i);
				4563
				4564	cur = min(len, (PAGE_CACHE_SIZE - offset));
				4565	kaddr = page_address(page);
				4566	memcpy(dst, kaddr + offset, cur);
				4567
				4568	dst += cur;
				4569	len -= cur;
				4570	offset = 0;
				4571	i++;
				4572	}
				4573	}
				4574
				4575	int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
				4576	unsigned long min_len, char **map,
				4577	unsigned long *map_start,
				4578	unsigned long *map_len)
				4579	{
				4580	size_t offset = start & (PAGE_CACHE_SIZE - 1);
				4581	char *kaddr;
				4582	struct page *p;
				4583	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
				4584	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
				4585	unsigned long end_i = (start_offset + start + min_len - 1) >>
				4586	PAGE_CACHE_SHIFT;
				4587
				4588	if (i != end_i)
				4589	return -EINVAL;
				4590
				4591	if (i == 0) {
				4592	offset = start_offset;
				4593	*map_start = 0;
				4594	} else {
				4595	offset = 0;
				4596	*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
				4597	}
				4598
				4599	if (start + min_len > eb->len) {
				4600	printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
				4601	"wanted %lu %lu\n", (unsigned long long)eb->start,
				4602	eb->len, start, min_len);
				4603	WARN_ON(1);
				4604	return -EINVAL;
				4605	}
				4606
				4607	p = extent_buffer_page(eb, i);
				4608	kaddr = page_address(p);
				4609	*map = kaddr + offset;
				4610	*map_len = PAGE_CACHE_SIZE - offset;
				4611	return 0;
				4612	}
				4613
				4614	int memcmp_extent_buffer(struct extent_buffer eb, const void ptrv,
				4615	unsigned long start,
				4616	unsigned long len)
				4617	{
				4618	size_t cur;
				4619	size_t offset;
				4620	struct page *page;
				4621	char *kaddr;
				4622	char ptr = (char )ptrv;
				4623	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
				4624	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
				4625	int ret = 0;
				4626
				4627	WARN_ON(start > eb->len);
				4628	WARN_ON(start + len > eb->start + eb->len);
				4629
				4630	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
				4631
				4632	while (len > 0) {
				4633	page = extent_buffer_page(eb, i);
				4634
				4635	cur = min(len, (PAGE_CACHE_SIZE - offset));
				4636
				4637	kaddr = page_address(page);
				4638	ret = memcmp(ptr, kaddr + offset, cur);
				4639	if (ret)
				4640	break;
				4641
				4642	ptr += cur;
				4643	len -= cur;
				4644	offset = 0;
				4645	i++;
				4646	}
				4647	return ret;
				4648	}
				4649
				4650	void write_extent_buffer(struct extent_buffer eb, const void srcv,
				4651	unsigned long start, unsigned long len)
				4652	{
				4653	size_t cur;
				4654	size_t offset;
				4655	struct page *page;
				4656	char *kaddr;
				4657	char src = (char )srcv;
				4658	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
				4659	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
				4660
				4661	WARN_ON(start > eb->len);
				4662	WARN_ON(start + len > eb->start + eb->len);
				4663
				4664	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
				4665
				4666	while (len > 0) {
				4667	page = extent_buffer_page(eb, i);
				4668	WARN_ON(!PageUptodate(page));
				4669
				4670	cur = min(len, PAGE_CACHE_SIZE - offset);
				4671	kaddr = page_address(page);
				4672	memcpy(kaddr + offset, src, cur);
				4673
				4674	src += cur;
				4675	len -= cur;
				4676	offset = 0;
				4677	i++;
				4678	}
				4679	}
				4680
				4681	void memset_extent_buffer(struct extent_buffer *eb, char c,
				4682	unsigned long start, unsigned long len)
				4683	{
				4684	size_t cur;
				4685	size_t offset;
				4686	struct page *page;
				4687	char *kaddr;
				4688	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
				4689	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
				4690
				4691	WARN_ON(start > eb->len);
				4692	WARN_ON(start + len > eb->start + eb->len);
				4693
				4694	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
				4695
				4696	while (len > 0) {
				4697	page = extent_buffer_page(eb, i);
				4698	WARN_ON(!PageUptodate(page));
				4699
				4700	cur = min(len, PAGE_CACHE_SIZE - offset);
				4701	kaddr = page_address(page);
				4702	memset(kaddr + offset, c, cur);
				4703
				4704	len -= cur;
				4705	offset = 0;
				4706	i++;
				4707	}
				4708	}
				4709
				4710	void copy_extent_buffer(struct extent_buffer dst, struct extent_buffer src,
				4711	unsigned long dst_offset, unsigned long src_offset,
				4712	unsigned long len)
				4713	{
				4714	u64 dst_len = dst->len;
				4715	size_t cur;
				4716	size_t offset;
				4717	struct page *page;
				4718	char *kaddr;
				4719	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
				4720	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
				4721
				4722	WARN_ON(src->len != dst_len);
				4723
				4724	offset = (start_offset + dst_offset) &
				4725	((unsigned long)PAGE_CACHE_SIZE - 1);
				4726
				4727	while (len > 0) {
				4728	page = extent_buffer_page(dst, i);
				4729	WARN_ON(!PageUptodate(page));
				4730
				4731	cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
				4732
				4733	kaddr = page_address(page);
				4734	read_extent_buffer(src, kaddr + offset, src_offset, cur);
				4735
				4736	src_offset += cur;
				4737	len -= cur;
				4738	offset = 0;
				4739	i++;
				4740	}
				4741	}
				4742
				/*
				 * Copy @len bytes from @src_page at @src_off to @dst_page at @dst_off.
				 *
				 * Within a single page memmove() is used, so overlapping ranges are safe.
				 * Between two different pages the bytes are copied one at a time from the
				 * last byte down to the first.
				 */
				static void move_pages(struct page *dst_page, struct page *src_page,
						       unsigned long dst_off, unsigned long src_off,
						       unsigned long len)
				{
					char *dst_kaddr = page_address(dst_page);
					if (dst_page == src_page) {
						memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
					} else {
						char *src_kaddr = page_address(src_page);
						/* Both cursors start one past the end of their range. */
						char *p = dst_kaddr + dst_off + len;
						char *s = src_kaddr + src_off + len;

						while (len--)
							*--p = *--s;
					}
				}
				4759
				/*
				 * Returns true when two ranges of @len bytes starting at @src and @dst
				 * overlap, i.e. when the distance between their starts is less than @len.
				 */
				static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
				{
					if (src > dst)
						return src - dst < len;
					return dst - src < len;
				}
				4765
				/*
				 * Copy @len bytes from @src_page at @src_off to @dst_page at @dst_off.
				 *
				 * memcpy() is used unless source and destination are in the same page and
				 * the two ranges overlap, in which case memmove() is used instead.
				 */
				static void copy_pages(struct page *dst_page, struct page *src_page,
						       unsigned long dst_off, unsigned long src_off,
						       unsigned long len)
				{
					char *dst_kaddr = page_address(dst_page);
					char *src_kaddr;
					int must_memmove = 0;

					if (dst_page != src_page) {
						src_kaddr = page_address(src_page);
					} else {
						/* Same page: overlap is only possible in this case. */
						src_kaddr = dst_kaddr;
						if (areas_overlap(src_off, dst_off, len))
							must_memmove = 1;
					}

					if (must_memmove)
						memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
					else
						memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
				}
				4787
				4788	void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
				4789	unsigned long src_offset, unsigned long len)
				4790	{
				4791	size_t cur;
				4792	size_t dst_off_in_page;
				4793	size_t src_off_in_page;
				4794	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
				4795	unsigned long dst_i;
				4796	unsigned long src_i;
				4797
				4798	if (src_offset + len > dst->len) {
				4799	printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
				4800	"len %lu dst len %lu\n", src_offset, len, dst->len);
				4801	BUG_ON(1);
				4802	}
				4803	if (dst_offset + len > dst->len) {
				4804	printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
				4805	"len %lu dst len %lu\n", dst_offset, len, dst->len);
				4806	BUG_ON(1);
				4807	}
				4808
				4809	while (len > 0) {
				4810	dst_off_in_page = (start_offset + dst_offset) &
				4811	((unsigned long)PAGE_CACHE_SIZE - 1);
				4812	src_off_in_page = (start_offset + src_offset) &
				4813	((unsigned long)PAGE_CACHE_SIZE - 1);
				4814
				4815	dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
				4816	src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
				4817
				4818	cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
				4819	src_off_in_page));
				4820	cur = min_t(unsigned long, cur,
				4821	(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
				4822
				4823	copy_pages(extent_buffer_page(dst, dst_i),
				4824	extent_buffer_page(dst, src_i),
				4825	dst_off_in_page, src_off_in_page, cur);
				4826
				4827	src_offset += cur;
				4828	dst_offset += cur;
				4829	len -= cur;
				4830	}
				4831	}
				4832
				4833	void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
				4834	unsigned long src_offset, unsigned long len)
				4835	{
				4836	size_t cur;
				4837	size_t dst_off_in_page;
				4838	size_t src_off_in_page;
				4839	unsigned long dst_end = dst_offset + len - 1;
				4840	unsigned long src_end = src_offset + len - 1;
				4841	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
				4842	unsigned long dst_i;
				4843	unsigned long src_i;
				4844
				4845	if (src_offset + len > dst->len) {
				4846	printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
				4847	"len %lu len %lu\n", src_offset, len, dst->len);
				4848	BUG_ON(1);
				4849	}
				4850	if (dst_offset + len > dst->len) {
				4851	printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
				4852	"len %lu len %lu\n", dst_offset, len, dst->len);
				4853	BUG_ON(1);
				4854	}
				4855	if (dst_offset < src_offset) {
				4856	memcpy_extent_buffer(dst, dst_offset, src_offset, len);
				4857	return;
				4858	}
				4859	while (len > 0) {
				4860	dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
				4861	src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
				4862
				4863	dst_off_in_page = (start_offset + dst_end) &
				4864	((unsigned long)PAGE_CACHE_SIZE - 1);
				4865	src_off_in_page = (start_offset + src_end) &
				4866	((unsigned long)PAGE_CACHE_SIZE - 1);
				4867
				4868	cur = min_t(unsigned long, len, src_off_in_page + 1);
				4869	cur = min(cur, dst_off_in_page + 1);
				4870	move_pages(extent_buffer_page(dst, dst_i),
				4871	extent_buffer_page(dst, src_i),
				4872	dst_off_in_page - cur + 1,
				4873	src_off_in_page - cur + 1, cur);
				4874
				4875	dst_end -= cur;
				4876	src_end -= cur;
				4877	len -= cur;
				4878	}
				4879	}
				4880
				4881	int try_release_extent_buffer(struct page *page, gfp_t mask)
				4882	{
				4883	struct extent_buffer *eb;
				4884
				4885	/*
				4886	* We need to make sure noboody is attaching this page to an eb right
				4887	* now.
				4888	*/
				4889	spin_lock(&page->mapping->private_lock);
				4890	if (!PagePrivate(page)) {
				4891	spin_unlock(&page->mapping->private_lock);
				4892	return 1;
				4893	}
				4894
				4895	eb = (struct extent_buffer *)page->private;
				4896	BUG_ON(!eb);
				4897
				4898	/*
				4899	* This is a little awful but should be ok, we need to make sure that
				4900	* the eb doesn't disappear out from under us while we're looking at
				4901	* this page.
				4902	*/
				4903	spin_lock(&eb->refs_lock);
				4904	if (atomic_read(&eb->refs) != 1 \|\| extent_buffer_under_io(eb)) {
				4905	spin_unlock(&eb->refs_lock);
				4906	spin_unlock(&page->mapping->private_lock);
				4907	return 0;
				4908	}
				4909	spin_unlock(&page->mapping->private_lock);
				4910
				4911	if ((mask & GFP_NOFS) == GFP_NOFS)
				4912	mask = GFP_NOFS;
				4913
				4914	/*
				4915	* If tree ref isn't set then we know the ref on this eb is a real ref,
				4916	* so just return, this page will likely be freed soon anyway.
				4917	*/
				4918	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
				4919	spin_unlock(&eb->refs_lock);
				4920	return 0;
				4921	}
				4922	release_extent_buffer(eb, mask);
				4923
				4924	return 1;
				4925	}