Blame - src/kernel/linux/v4.14/fs/btrfs/tree-log.c - T103

blob: ec8706a6e9c668d0a157acf7a595b0b374c648fe [file] [log] [blame]

rjw	1f88458	2022-01-06 17:20:42 +0800	[diff] [blame^]	1	/*
				2	* Copyright (C) 2008 Oracle. All rights reserved.
				3	*
				4	* This program is free software; you can redistribute it and/or
				5	* modify it under the terms of the GNU General Public
				6	* License v2 as published by the Free Software Foundation.
				7	*
				8	* This program is distributed in the hope that it will be useful,
				9	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				11	* General Public License for more details.
				12	*
				13	* You should have received a copy of the GNU General Public
				14	* License along with this program; if not, write to the
				15	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				16	* Boston, MA 021110-1307, USA.
				17	*/
				18
				19	#include <linux/sched.h>
				20	#include <linux/slab.h>
				21	#include <linux/blkdev.h>
				22	#include <linux/list_sort.h>
				23	#include "tree-log.h"
				24	#include "disk-io.h"
				25	#include "locking.h"
				26	#include "print-tree.h"
				27	#include "backref.h"
				28	#include "hash.h"
				29	#include "compression.h"
				30	#include "qgroup.h"
				31	#include "inode-map.h"
				32
				33	/* magic values for the inode_only field in btrfs_log_inode:
				34	*
				35	* LOG_INODE_ALL means to log everything
				36	* LOG_INODE_EXISTS means to log just enough to recreate the inode
				37	* during log replay
				38	*/
				39	#define LOG_INODE_ALL 0 /* log everything for the inode */
				40	#define LOG_INODE_EXISTS 1 /* log just enough to recreate the inode during log replay */
				41	#define LOG_OTHER_INODE 2 /* NOTE(review): not described by the comment above - confirm meaning against btrfs_log_inode() */
				42
				43	/*
				44	* directory trouble cases
				45	*
				46	* 1) on rename or unlink, if the inode being unlinked isn't in the fsync
				47	* log, we must force a full commit before doing an fsync of the directory
				48	* where the unlink was done.
				49	* ---> record transid of last unlink/rename per directory
				50	*
				51	* mkdir foo/some_dir
				52	* normal commit
				53	* rename foo/some_dir foo2/some_dir
				54	* mkdir foo/some_dir
				55	* fsync foo/some_dir/some_file
				56	*
				57	* The fsync above will unlink the original some_dir without recording
				58	* it in its new location (foo2). After a crash, some_dir will be gone
				59	* unless the fsync of some_file forces a full commit
				60	*
				61	* 2) we must log any new names for any file or dir that is in the fsync
				62	* log. ---> check inode while renaming/linking.
				63	*
				64	* 2a) we must log any new names for any file or dir during rename
				65	* when the directory they are being removed from was logged.
				66	* ---> check inode and old parent dir during rename
				67	*
				68	* 2a is actually the more important variant. Without the extra logging
				69	* a crash might unlink the old name without recreating the new one
				70	*
				71	* 3) after a crash, we must go through any directories with a link count
				72	* of zero and redo the rm -rf
				73	*
				74	* mkdir f1/foo
				75	* normal commit
				76	* rm -rf f1/foo
				77	* fsync(f1)
				78	*
				79	* The directory f1/foo was fully removed from the FS, but fsync was never
				80	* called on f1/foo, only its parent dir (f1). After a crash the rm -rf must
				81	* be replayed. This must be able to recurse down the entire
				82	* directory tree. The inode link count fixup code takes care of the
				83	* ugly details.
				84	*/
				85
				86	/*
				87	* stages for the tree walking. The first
				88	* stage (0) is to only pin down the blocks we find
				89	* the second stage (1) is to make sure that all the inodes
				90	* we find in the log are created in the subvolume.
				91	*
				92	* The last stage is to deal with directories and links and extents
				93	* and all the other fun semantics
				94	*/
				95	#define LOG_WALK_PIN_ONLY 0 /* stage 0: only pin down the blocks we find */
				96	#define LOG_WALK_REPLAY_INODES 1 /* stage 1: make sure all inodes found in the log exist in the subvolume */
				97	#define LOG_WALK_REPLAY_DIR_INDEX 2 /* NOTE(review): not described in the stage comment above - presumably replays dir index items; confirm */
				98	#define LOG_WALK_REPLAY_ALL 3 /* last stage: directories, links, extents and all other items */
				99
				100	static int btrfs_log_inode(struct btrfs_trans_handle *trans,
				101	struct btrfs_root root, struct btrfs_inode inode,
				102	int inode_only,
				103	const loff_t start,
				104	const loff_t end,
				105	struct btrfs_log_ctx *ctx);
				106	static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				107	struct btrfs_root *root,
				108	struct btrfs_path *path, u64 objectid);
				109	static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				110	struct btrfs_root *root,
				111	struct btrfs_root *log,
				112	struct btrfs_path *path,
				113	u64 dirid, int del_all);
				114
				115	/*
				116	* tree logging is a special write ahead log used to make sure that
				117	* fsyncs and O_SYNCs can happen without doing full tree commits.
				118	*
				119	* Full tree commits are expensive because they require commonly
				120	* modified blocks to be recowed, creating many dirty pages in the
				121	* extent tree and a 4x-6x higher write load than ext3.
				122	*
				123	* Instead of doing a tree commit on every fsync, we use the
				124	* key ranges and transaction ids to find items for a given file or directory
				125	* that have changed in this transaction. Those items are copied into
				126	* a special tree (one per subvolume root), that tree is written to disk
				127	* and then the fsync is considered complete.
				128	*
				129	* After a crash, items are copied out of the log-tree back into the
				130	* subvolume tree. Any file data extents found are recorded in the extent
				131	* allocation tree, and the log-tree freed.
				132	*
				133	* The log tree is read three times: once to pin down all the extents it is
				134	* using in ram, once to create all the inodes logged in the tree,
				135	* and once to do all the other items.
				136	*/
				137
				138	/*
				139	* start a sub transaction and setup the log tree
				140	* this increments the log tree writer count to make the people
				141	* syncing the tree wait for us to finish
				142	*/
				143	static int start_log_trans(struct btrfs_trans_handle *trans,
				144	struct btrfs_root *root,
				145	struct btrfs_log_ctx *ctx)
				146	{
				147	struct btrfs_fs_info *fs_info = root->fs_info;
				148	int ret = 0;
				149
				150	mutex_lock(&root->log_mutex);
				151
				152	if (root->log_root) {
				153	if (btrfs_need_log_full_commit(fs_info, trans)) {
				154	ret = -EAGAIN;
				155	goto out;
				156	}
				157
				158	if (!root->log_start_pid) {
				159	clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
				160	root->log_start_pid = current->pid;
				161	} else if (root->log_start_pid != current->pid) {
				162	set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
				163	}
				164	} else {
				165	mutex_lock(&fs_info->tree_log_mutex);
				166	if (!fs_info->log_root_tree)
				167	ret = btrfs_init_log_root_tree(trans, fs_info);
				168	mutex_unlock(&fs_info->tree_log_mutex);
				169	if (ret)
				170	goto out;
				171
				172	ret = btrfs_add_log_tree(trans, root);
				173	if (ret)
				174	goto out;
				175
				176	clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
				177	root->log_start_pid = current->pid;
				178	}
				179
				180	atomic_inc(&root->log_batch);
				181	atomic_inc(&root->log_writers);
				182	if (ctx) {
				183	int index = root->log_transid % 2;
				184	list_add_tail(&ctx->list, &root->log_ctxs[index]);
				185	ctx->log_transid = root->log_transid;
				186	}
				187
				188	out:
				189	mutex_unlock(&root->log_mutex);
				190	return ret;
				191	}
				192
				193	/*
				194	* returns 0 if there was a log transaction running and we were able
				195	* to join, or returns -ENOENT if there were not transactions
				196	* in progress
				197	*/
				198	static int join_running_log_trans(struct btrfs_root *root)
				199	{
				200	int ret = -ENOENT;
				201
				202	smp_mb();
				203	if (!root->log_root)
				204	return -ENOENT;
				205
				206	mutex_lock(&root->log_mutex);
				207	if (root->log_root) {
				208	ret = 0;
				209	atomic_inc(&root->log_writers);
				210	}
				211	mutex_unlock(&root->log_mutex);
				212	return ret;
				213	}
				214
				215	/*
				216	* This either makes the current running log transaction wait
				217	* until you call btrfs_end_log_trans() or it makes any future
				218	* log transactions wait until you call btrfs_end_log_trans()
				219	*/
				220	int btrfs_pin_log_trans(struct btrfs_root *root)
				221	{
				222	int ret = -ENOENT;
				223
				224	mutex_lock(&root->log_mutex);
				225	atomic_inc(&root->log_writers);
				226	mutex_unlock(&root->log_mutex);
				227	return ret;
				228	}
				229
				230	/*
				231	* indicate we're done making changes to the log tree
				232	* and wake up anyone waiting to do a sync
				233	*/
				234	void btrfs_end_log_trans(struct btrfs_root *root)
				235	{
				236	if (atomic_dec_and_test(&root->log_writers)) {
				237	/*
				238	* Implicit memory barrier after atomic_dec_and_test
				239	*/
				240	if (waitqueue_active(&root->log_writer_wait))
				241	wake_up(&root->log_writer_wait);
				242	}
				243	}
				244
				245
				246	/*
				247	* the walk control struct is used to pass state down the chain when
				248	* processing the log tree. The stage field tells us which part
				249	* of the log tree processing we are currently doing. The others
				250	* are state fields used for that specific part
				251	*/
				252	struct walk_control {
				253	/* should we free the extent on disk when done? This is used
				254	* at transaction commit time while freeing a log tree
				255	*/
				256	int free;
				257
				258	/* should we write out the extent buffer? This is used
				259	* while flushing the log tree to disk during a sync
				260	*/
				261	int write;
				262
				263	/* should we wait for the extent buffer io to finish? Also used
				264	* while flushing the log tree to disk for a sync
				265	*/
				266	int wait;
				267
				268	/* pin only walk, we record which extents on disk belong to the
				269	* log trees
				270	*/
				271	int pin;
				272
				273	/* what stage of the replay code we're currently in */
				274	int stage;
				275
				276	/*
				277	* Ignore any items from the inode currently being processed. Needs
				278	* to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
				279	* the LOG_WALK_REPLAY_INODES stage.
				280	*/
				281	bool ignore_cur_inode;
				282
				283	/* the root we are currently replaying */
				284	struct btrfs_root *replay_dest;
				285
				286	/* the trans handle for the current replay */
				287	struct btrfs_trans_handle *trans;
				288
				289	/* the function that gets used to process blocks we find in the
				290	* tree. Note the extent_buffer might not be up to date when it is
				291	* passed in, and it must be checked or read if you need the data
				292	* inside it
				293	*/
				294	int (process_func)(struct btrfs_root log, struct extent_buffer *eb,
				295	struct walk_control *wc, u64 gen);
				296	};
				297
				298	/*
				299	* process_func used to pin down extents, write them or wait on them
				300	*/
				301	static int process_one_buffer(struct btrfs_root *log,
				302	struct extent_buffer *eb,
				303	struct walk_control *wc, u64 gen)
				304	{
				305	struct btrfs_fs_info *fs_info = log->fs_info;
				306	int ret = 0;
				307
				308	/*
				309	* If this fs is mixed then we need to be able to process the leaves to
				310	* pin down any logged extents, so we have to read the block.
				311	*/
				312	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
				313	ret = btrfs_read_buffer(eb, gen);
				314	if (ret)
				315	return ret;
				316	}
				317
				318	if (wc->pin)
				319	ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
				320	eb->len);
				321
				322	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
				323	if (wc->pin && btrfs_header_level(eb) == 0)
				324	ret = btrfs_exclude_logged_extents(fs_info, eb);
				325	if (wc->write)
				326	btrfs_write_tree_block(eb);
				327	if (wc->wait)
				328	btrfs_wait_tree_block_writeback(eb);
				329	}
				330	return ret;
				331	}
				332
				333	/*
				334	* Item overwrite used by replay and tree logging. eb, slot and key all refer
				335	* to the src data we are copying out.
				336	*
				337	* root is the tree we are copying into, and path is a scratch
				338	* path for use in this function (it should be released on entry and
				339	* will be released on exit).
				340	*
				341	* If the key is already in the destination tree the existing item is
				342	* overwritten. If the existing item isn't big enough, it is extended.
				343	* If it is too large, it is truncated.
				344	*
				345	* If the key isn't in the destination yet, a new item is inserted.
				346	*/
				347	static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				348	struct btrfs_root *root,
				349	struct btrfs_path *path,
				350	struct extent_buffer *eb, int slot,
				351	struct btrfs_key *key)
				352	{
				353	struct btrfs_fs_info *fs_info = root->fs_info;
				354	int ret;
				355	u32 item_size;
				356	u64 saved_i_size = 0;
				357	int save_old_i_size = 0;
				358	unsigned long src_ptr;
				359	unsigned long dst_ptr;
				360	int overwrite_root = 0;
				361	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
				362
				363	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
				364	overwrite_root = 1;
				365
				366	item_size = btrfs_item_size_nr(eb, slot);
				367	src_ptr = btrfs_item_ptr_offset(eb, slot);
				368
				369	/* look for the key in the destination tree */
				370	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
				371	if (ret < 0)
				372	return ret;
				373
				374	if (ret == 0) {
				375	char *src_copy;
				376	char *dst_copy;
				377	u32 dst_size = btrfs_item_size_nr(path->nodes[0],
				378	path->slots[0]);
				379	if (dst_size != item_size)
				380	goto insert;
				381
				382	if (item_size == 0) {
				383	btrfs_release_path(path);
				384	return 0;
				385	}
				386	dst_copy = kmalloc(item_size, GFP_NOFS);
				387	src_copy = kmalloc(item_size, GFP_NOFS);
				388	if (!dst_copy \|\| !src_copy) {
				389	btrfs_release_path(path);
				390	kfree(dst_copy);
				391	kfree(src_copy);
				392	return -ENOMEM;
				393	}
				394
				395	read_extent_buffer(eb, src_copy, src_ptr, item_size);
				396
				397	dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
				398	read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				399	item_size);
				400	ret = memcmp(dst_copy, src_copy, item_size);
				401
				402	kfree(dst_copy);
				403	kfree(src_copy);
				404	/*
				405	* they have the same contents, just return, this saves
				406	* us from cowing blocks in the destination tree and doing
				407	* extra writes that may not have been done by a previous
				408	* sync
				409	*/
				410	if (ret == 0) {
				411	btrfs_release_path(path);
				412	return 0;
				413	}
				414
				415	/*
				416	* We need to load the old nbytes into the inode so when we
				417	* replay the extents we've logged we get the right nbytes.
				418	*/
				419	if (inode_item) {
				420	struct btrfs_inode_item *item;
				421	u64 nbytes;
				422	u32 mode;
				423
				424	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				425	struct btrfs_inode_item);
				426	nbytes = btrfs_inode_nbytes(path->nodes[0], item);
				427	item = btrfs_item_ptr(eb, slot,
				428	struct btrfs_inode_item);
				429	btrfs_set_inode_nbytes(eb, item, nbytes);
				430
				431	/*
				432	* If this is a directory we need to reset the i_size to
				433	* 0 so that we can set it up properly when replaying
				434	* the rest of the items in this log.
				435	*/
				436	mode = btrfs_inode_mode(eb, item);
				437	if (S_ISDIR(mode))
				438	btrfs_set_inode_size(eb, item, 0);
				439	}
				440	} else if (inode_item) {
				441	struct btrfs_inode_item *item;
				442	u32 mode;
				443
				444	/*
				445	* New inode, set nbytes to 0 so that the nbytes comes out
				446	* properly when we replay the extents.
				447	*/
				448	item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
				449	btrfs_set_inode_nbytes(eb, item, 0);
				450
				451	/*
				452	* If this is a directory we need to reset the i_size to 0 so
				453	* that we can set it up properly when replaying the rest of
				454	* the items in this log.
				455	*/
				456	mode = btrfs_inode_mode(eb, item);
				457	if (S_ISDIR(mode))
				458	btrfs_set_inode_size(eb, item, 0);
				459	}
				460	insert:
				461	btrfs_release_path(path);
				462	/* try to insert the key into the destination tree */
				463	path->skip_release_on_error = 1;
				464	ret = btrfs_insert_empty_item(trans, root, path,
				465	key, item_size);
				466	path->skip_release_on_error = 0;
				467
				468	/* make sure any existing item is the correct size */
				469	if (ret == -EEXIST \|\| ret == -EOVERFLOW) {
				470	u32 found_size;
				471	found_size = btrfs_item_size_nr(path->nodes[0],
				472	path->slots[0]);
				473	if (found_size > item_size)
				474	btrfs_truncate_item(fs_info, path, item_size, 1);
				475	else if (found_size < item_size)
				476	btrfs_extend_item(fs_info, path,
				477	item_size - found_size);
				478	} else if (ret) {
				479	return ret;
				480	}
				481	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
				482	path->slots[0]);
				483
				484	/* don't overwrite an existing inode if the generation number
				485	* was logged as zero. This is done when the tree logging code
				486	* is just logging an inode to make sure it exists after recovery.
				487	*
				488	* Also, don't overwrite i_size on directories during replay.
				489	* log replay inserts and removes directory items based on the
				490	* state of the tree found in the subvolume, and i_size is modified
				491	* as it goes
				492	*/
				493	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
				494	struct btrfs_inode_item *src_item;
				495	struct btrfs_inode_item *dst_item;
				496
				497	src_item = (struct btrfs_inode_item *)src_ptr;
				498	dst_item = (struct btrfs_inode_item *)dst_ptr;
				499
				500	if (btrfs_inode_generation(eb, src_item) == 0) {
				501	struct extent_buffer *dst_eb = path->nodes[0];
				502	const u64 ino_size = btrfs_inode_size(eb, src_item);
				503
				504	/*
				505	* For regular files an ino_size == 0 is used only when
				506	* logging that an inode exists, as part of a directory
				507	* fsync, and the inode wasn't fsynced before. In this
				508	* case don't set the size of the inode in the fs/subvol
				509	* tree, otherwise we would be throwing valid data away.
				510	*/
				511	if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
				512	S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
				513	ino_size != 0) {
				514	struct btrfs_map_token token;
				515
				516	btrfs_init_map_token(&token);
				517	btrfs_set_token_inode_size(dst_eb, dst_item,
				518	ino_size, &token);
				519	}
				520	goto no_copy;
				521	}
				522
				523	if (overwrite_root &&
				524	S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
				525	S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
				526	save_old_i_size = 1;
				527	saved_i_size = btrfs_inode_size(path->nodes[0],
				528	dst_item);
				529	}
				530	}
				531
				532	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
				533	src_ptr, item_size);
				534
				535	if (save_old_i_size) {
				536	struct btrfs_inode_item *dst_item;
				537	dst_item = (struct btrfs_inode_item *)dst_ptr;
				538	btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
				539	}
				540
				541	/* make sure the generation is filled in */
				542	if (key->type == BTRFS_INODE_ITEM_KEY) {
				543	struct btrfs_inode_item *dst_item;
				544	dst_item = (struct btrfs_inode_item *)dst_ptr;
				545	if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
				546	btrfs_set_inode_generation(path->nodes[0], dst_item,
				547	trans->transid);
				548	}
				549	}
				550	no_copy:
				551	btrfs_mark_buffer_dirty(path->nodes[0]);
				552	btrfs_release_path(path);
				553	return 0;
				554	}
				555
				556	/*
				557	* simple helper to read an inode off the disk from a given root
				558	* This can only be called for subvolume roots and not for the log
				559	*/
				560	static noinline struct inode read_one_inode(struct btrfs_root root,
				561	u64 objectid)
				562	{
				563	struct btrfs_key key;
				564	struct inode *inode;
				565
				566	key.objectid = objectid;
				567	key.type = BTRFS_INODE_ITEM_KEY;
				568	key.offset = 0;
				569	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
				570	if (IS_ERR(inode)) {
				571	inode = NULL;
				572	} else if (is_bad_inode(inode)) {
				573	iput(inode);
				574	inode = NULL;
				575	}
				576	return inode;
				577	}
				578
				579	/* replays a single extent in 'eb' at 'slot' with 'key' into the
				580	* subvolume 'root'. path is released on entry and should be released
				581	* on exit.
				582	*
				583	* extents in the log tree have not been allocated out of the extent
				584	* tree yet. So, this completes the allocation, taking a reference
				585	* as required if the extent already exists or creating a new extent
				586	* if it isn't in the extent allocation tree yet.
				587	*
				588	* The extent is inserted into the file, dropping any existing extents
				589	* from the file that overlap the new one.
				590	*/
				591	static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				592	struct btrfs_root *root,
				593	struct btrfs_path *path,
				594	struct extent_buffer *eb, int slot,
				595	struct btrfs_key *key)
				596	{
				597	struct btrfs_fs_info *fs_info = root->fs_info;
				598	int found_type;
				599	u64 extent_end;
				600	u64 start = key->offset;
				601	u64 nbytes = 0;
				602	struct btrfs_file_extent_item *item;
				603	struct inode *inode = NULL;
				604	unsigned long size;
				605	int ret = 0;
				606
				607	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
				608	found_type = btrfs_file_extent_type(eb, item);
				609
				610	if (found_type == BTRFS_FILE_EXTENT_REG \|\|
				611	found_type == BTRFS_FILE_EXTENT_PREALLOC) {
				612	nbytes = btrfs_file_extent_num_bytes(eb, item);
				613	extent_end = start + nbytes;
				614
				615	/*
				616	* We don't add to the inodes nbytes if we are prealloc or a
				617	* hole.
				618	*/
				619	if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
				620	nbytes = 0;
				621	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
				622	size = btrfs_file_extent_ram_bytes(eb, item);
				623	nbytes = btrfs_file_extent_ram_bytes(eb, item);
				624	extent_end = ALIGN(start + size,
				625	fs_info->sectorsize);
				626	} else {
				627	ret = 0;
				628	goto out;
				629	}
				630
				631	inode = read_one_inode(root, key->objectid);
				632	if (!inode) {
				633	ret = -EIO;
				634	goto out;
				635	}
				636
				637	/*
				638	* first check to see if we already have this extent in the
				639	* file. This must be done before the btrfs_drop_extents run
				640	* so we don't try to drop this extent.
				641	*/
				642	ret = btrfs_lookup_file_extent(trans, root, path,
				643	btrfs_ino(BTRFS_I(inode)), start, 0);
				644
				645	if (ret == 0 &&
				646	(found_type == BTRFS_FILE_EXTENT_REG \|\|
				647	found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
				648	struct btrfs_file_extent_item cmp1;
				649	struct btrfs_file_extent_item cmp2;
				650	struct btrfs_file_extent_item *existing;
				651	struct extent_buffer *leaf;
				652
				653	leaf = path->nodes[0];
				654	existing = btrfs_item_ptr(leaf, path->slots[0],
				655	struct btrfs_file_extent_item);
				656
				657	read_extent_buffer(eb, &cmp1, (unsigned long)item,
				658	sizeof(cmp1));
				659	read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				660	sizeof(cmp2));
				661
				662	/*
				663	* we already have a pointer to this exact extent,
				664	* we don't have to do anything
				665	*/
				666	if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
				667	btrfs_release_path(path);
				668	goto out;
				669	}
				670	}
				671	btrfs_release_path(path);
				672
				673	/* drop any overlapping extents */
				674	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
				675	if (ret)
				676	goto out;
				677
				678	if (found_type == BTRFS_FILE_EXTENT_REG \|\|
				679	found_type == BTRFS_FILE_EXTENT_PREALLOC) {
				680	u64 offset;
				681	unsigned long dest_offset;
				682	struct btrfs_key ins;
				683
				684	if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
				685	btrfs_fs_incompat(fs_info, NO_HOLES))
				686	goto update_inode;
				687
				688	ret = btrfs_insert_empty_item(trans, root, path, key,
				689	sizeof(*item));
				690	if (ret)
				691	goto out;
				692	dest_offset = btrfs_item_ptr_offset(path->nodes[0],
				693	path->slots[0]);
				694	copy_extent_buffer(path->nodes[0], eb, dest_offset,
				695	(unsigned long)item, sizeof(*item));
				696
				697	ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
				698	ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
				699	ins.type = BTRFS_EXTENT_ITEM_KEY;
				700	offset = key->offset - btrfs_file_extent_offset(eb, item);
				701
				702	/*
				703	* Manually record dirty extent, as here we did a shallow
				704	* file extent item copy and skip normal backref update,
				705	* but modifying extent tree all by ourselves.
				706	* So need to manually record dirty extent for qgroup,
				707	* as the owner of the file extent changed from log tree
				708	* (doesn't affect qgroup) to fs/file tree(affects qgroup)
				709	*/
				710	ret = btrfs_qgroup_trace_extent(trans, fs_info,
				711	btrfs_file_extent_disk_bytenr(eb, item),
				712	btrfs_file_extent_disk_num_bytes(eb, item),
				713	GFP_NOFS);
				714	if (ret < 0)
				715	goto out;
				716
				717	if (ins.objectid > 0) {
				718	u64 csum_start;
				719	u64 csum_end;
				720	LIST_HEAD(ordered_sums);
				721	/*
				722	* is this extent already allocated in the extent
				723	* allocation tree? If so, just add a reference
				724	*/
				725	ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
				726	ins.offset);
				727	if (ret == 0) {
				728	ret = btrfs_inc_extent_ref(trans, fs_info,
				729	ins.objectid, ins.offset,
				730	0, root->root_key.objectid,
				731	key->objectid, offset);
				732	if (ret)
				733	goto out;
				734	} else {
				735	/*
				736	* insert the extent pointer in the extent
				737	* allocation tree
				738	*/
				739	ret = btrfs_alloc_logged_file_extent(trans,
				740	fs_info,
				741	root->root_key.objectid,
				742	key->objectid, offset, &ins);
				743	if (ret)
				744	goto out;
				745	}
				746	btrfs_release_path(path);
				747
				748	if (btrfs_file_extent_compression(eb, item)) {
				749	csum_start = ins.objectid;
				750	csum_end = csum_start + ins.offset;
				751	} else {
				752	csum_start = ins.objectid +
				753	btrfs_file_extent_offset(eb, item);
				754	csum_end = csum_start +
				755	btrfs_file_extent_num_bytes(eb, item);
				756	}
				757
				758	ret = btrfs_lookup_csums_range(root->log_root,
				759	csum_start, csum_end - 1,
				760	&ordered_sums, 0);
				761	if (ret)
				762	goto out;
				763	/*
				764	* Now delete all existing cums in the csum root that
				765	* cover our range. We do this because we can have an
				766	* extent that is completely referenced by one file
				767	* extent item and partially referenced by another
				768	* file extent item (like after using the clone or
				769	* extent_same ioctls). In this case if we end up doing
				770	* the replay of the one that partially references the
				771	* extent first, and we do not do the csum deletion
				772	* below, we can get 2 csum items in the csum tree that
				773	* overlap each other. For example, imagine our log has
				774	* the two following file extent items:
				775	*
				776	* key (257 EXTENT_DATA 409600)
				777	* extent data disk byte 12845056 nr 102400
				778	* extent data offset 20480 nr 20480 ram 102400
				779	*
				780	* key (257 EXTENT_DATA 819200)
				781	* extent data disk byte 12845056 nr 102400
				782	* extent data offset 0 nr 102400 ram 102400
				783	*
				784	* Where the second one fully references the 100K extent
				785	* that starts at disk byte 12845056, and the log tree
				786	* has a single csum item that covers the entire range
				787	* of the extent:
				788	*
				789	* key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
				790	*
				791	* After the first file extent item is replayed, the
				792	* csum tree gets the following csum item:
				793	*
				794	* key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
				795	*
				796	* Which covers the 20K sub-range starting at offset 20K
				797	* of our extent. Now when we replay the second file
				798	* extent item, if we do not delete existing csum items
				799	* that cover any of its blocks, we end up getting two
				800	* csum items in our csum tree that overlap each other:
				801	*
				802	* key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
				803	* key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
				804	*
				805	* Which is a problem, because after this anyone trying
				806	* to lookup up for the checksum of any block of our
				807	* extent starting at an offset of 40K or higher, will
				808	* end up looking at the second csum item only, which
				809	* does not contain the checksum for any block starting
				810	* at offset 40K or higher of our extent.
				811	*/
				812	while (!list_empty(&ordered_sums)) {
				813	struct btrfs_ordered_sum *sums;
				814	sums = list_entry(ordered_sums.next,
				815	struct btrfs_ordered_sum,
				816	list);
				817	if (!ret)
				818	ret = btrfs_del_csums(trans, fs_info,
				819	sums->bytenr,
				820	sums->len);
				821	if (!ret)
				822	ret = btrfs_csum_file_blocks(trans,
				823	fs_info->csum_root, sums);
				824	list_del(&sums->list);
				825	kfree(sums);
				826	}
				827	if (ret)
				828	goto out;
				829	} else {
				830	btrfs_release_path(path);
				831	}
				832	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
				833	/* inline extents are easy, we just overwrite them */
				834	ret = overwrite_item(trans, root, path, eb, slot, key);
				835	if (ret)
				836	goto out;
				837	}
				838
				839	inode_add_bytes(inode, nbytes);
				840	update_inode:
				841	ret = btrfs_update_inode(trans, root, inode);
				842	out:
				843	if (inode)
				844	iput(inode);
				845	return ret;
				846	}
				847
				848	/*
				849	* when cleaning up conflicts between the directory names in the
				850	* subvolume, directory names in the log and directory names in the
				851	* inode back references, we may have to unlink inodes from directories.
				852	*
				853	* This is a helper function to do the unlink of a specific directory
				854	* item
				855	*/
				856	static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				857	struct btrfs_root *root,
				858	struct btrfs_path *path,
				859	struct btrfs_inode *dir,
				860	struct btrfs_dir_item *di)
				861	{
				862	struct btrfs_fs_info *fs_info = root->fs_info;
				863	struct inode *inode;
				864	char *name;
				865	int name_len;
				866	struct extent_buffer *leaf;
				867	struct btrfs_key location;
				868	int ret;
				869
				870	leaf = path->nodes[0];
				871
				872	btrfs_dir_item_key_to_cpu(leaf, di, &location);
				873	name_len = btrfs_dir_name_len(leaf, di);
				874	name = kmalloc(name_len, GFP_NOFS);
				875	if (!name)
				876	return -ENOMEM;
				877
				878	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
				879	btrfs_release_path(path);
				880
				881	inode = read_one_inode(root, location.objectid);
				882	if (!inode) {
				883	ret = -EIO;
				884	goto out;
				885	}
				886
				887	ret = link_to_fixup_dir(trans, root, path, location.objectid);
				888	if (ret)
				889	goto out;
				890
				891	ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
				892	name_len);
				893	if (ret)
				894	goto out;
				895	else
				896	ret = btrfs_run_delayed_items(trans, fs_info);
				897	out:
				898	kfree(name);
				899	iput(inode);
				900	return ret;
				901	}
				902
				903	/*
				904	* helper function to see if a given name and sequence number found
				905	* in an inode back reference are already in a directory and correctly
				906	* point to this inode
				907	*/
				908	static noinline int inode_in_dir(struct btrfs_root *root,
				909	struct btrfs_path *path,
				910	u64 dirid, u64 objectid, u64 index,
				911	const char *name, int name_len)
				912	{
				913	struct btrfs_dir_item *di;
				914	struct btrfs_key location;
				915	int match = 0;
				916
				917	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
				918	index, name, name_len, 0);
				919	if (di && !IS_ERR(di)) {
				920	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
				921	if (location.objectid != objectid)
				922	goto out;
				923	} else
				924	goto out;
				925	btrfs_release_path(path);
				926
				927	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
				928	if (di && !IS_ERR(di)) {
				929	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
				930	if (location.objectid != objectid)
				931	goto out;
				932	} else
				933	goto out;
				934	match = 1;
				935	out:
				936	btrfs_release_path(path);
				937	return match;
				938	}
				939
				940	/*
				941	* helper function to check a log tree for a named back reference in
				942	* an inode. This is used to decide if a back reference that is
				943	* found in the subvolume conflicts with what we find in the log.
				944	*
				945	* inode backreferences may have multiple refs in a single item,
				946	* during replay we process one reference at a time, and we don't
				947	* want to delete valid links to a file from the subvolume if that
				948	* link is also in the log.
				949	*/
				950	static noinline int backref_in_log(struct btrfs_root *log,
				951	struct btrfs_key *key,
				952	u64 ref_objectid,
				953	const char *name, int namelen)
				954	{
				955	struct btrfs_path *path;
				956	struct btrfs_inode_ref *ref;
				957	unsigned long ptr;
				958	unsigned long ptr_end;
				959	unsigned long name_ptr;
				960	int found_name_len;
				961	int item_size;
				962	int ret;
				963	int match = 0;
				964
				965	path = btrfs_alloc_path();
				966	if (!path)
				967	return -ENOMEM;
				968
				969	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
				970	if (ret != 0)
				971	goto out;
				972
				973	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
				974
				975	if (key->type == BTRFS_INODE_EXTREF_KEY) {
				976	if (btrfs_find_name_in_ext_backref(path, ref_objectid,
				977	name, namelen, NULL))
				978	match = 1;
				979
				980	goto out;
				981	}
				982
				983	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
				984	ptr_end = ptr + item_size;
				985	while (ptr < ptr_end) {
				986	ref = (struct btrfs_inode_ref *)ptr;
				987	found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
				988	if (found_name_len == namelen) {
				989	name_ptr = (unsigned long)(ref + 1);
				990	ret = memcmp_extent_buffer(path->nodes[0], name,
				991	name_ptr, namelen);
				992	if (ret == 0) {
				993	match = 1;
				994	goto out;
				995	}
				996	}
				997	ptr = (unsigned long)(ref + 1) + found_name_len;
				998	}
				999	out:
				1000	btrfs_free_path(path);
				1001	return match;
				1002	}
				1003
				1004	static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				1005	struct btrfs_root *root,
				1006	struct btrfs_path *path,
				1007	struct btrfs_root *log_root,
				1008	struct btrfs_inode *dir,
				1009	struct btrfs_inode *inode,
				1010	u64 inode_objectid, u64 parent_objectid,
				1011	u64 ref_index, char *name, int namelen,
				1012	int *search_done)
				1013	{
				1014	struct btrfs_fs_info *fs_info = root->fs_info;
				1015	int ret;
				1016	char *victim_name;
				1017	int victim_name_len;
				1018	struct extent_buffer *leaf;
				1019	struct btrfs_dir_item *di;
				1020	struct btrfs_key search_key;
				1021	struct btrfs_inode_extref *extref;
				1022
				1023	again:
				1024	/* Search old style refs */
				1025	search_key.objectid = inode_objectid;
				1026	search_key.type = BTRFS_INODE_REF_KEY;
				1027	search_key.offset = parent_objectid;
				1028	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
				1029	if (ret == 0) {
				1030	struct btrfs_inode_ref *victim_ref;
				1031	unsigned long ptr;
				1032	unsigned long ptr_end;
				1033
				1034	leaf = path->nodes[0];
				1035
				1036	/* are we trying to overwrite a back ref for the root directory
				1037	* if so, just jump out, we're done
				1038	*/
				1039	if (search_key.objectid == search_key.offset)
				1040	return 1;
				1041
				1042	/* check all the names in this back reference to see
				1043	* if they are in the log. if so, we allow them to stay
				1044	* otherwise they must be unlinked as a conflict
				1045	*/
				1046	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
				1047	ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
				1048	while (ptr < ptr_end) {
				1049	victim_ref = (struct btrfs_inode_ref *)ptr;
				1050	victim_name_len = btrfs_inode_ref_name_len(leaf,
				1051	victim_ref);
				1052	victim_name = kmalloc(victim_name_len, GFP_NOFS);
				1053	if (!victim_name)
				1054	return -ENOMEM;
				1055
				1056	read_extent_buffer(leaf, victim_name,
				1057	(unsigned long)(victim_ref + 1),
				1058	victim_name_len);
				1059
				1060	if (!backref_in_log(log_root, &search_key,
				1061	parent_objectid,
				1062	victim_name,
				1063	victim_name_len)) {
				1064	inc_nlink(&inode->vfs_inode);
				1065	btrfs_release_path(path);
				1066
				1067	ret = btrfs_unlink_inode(trans, root, dir, inode,
				1068	victim_name, victim_name_len);
				1069	kfree(victim_name);
				1070	if (ret)
				1071	return ret;
				1072	ret = btrfs_run_delayed_items(trans, fs_info);
				1073	if (ret)
				1074	return ret;
				1075	*search_done = 1;
				1076	goto again;
				1077	}
				1078	kfree(victim_name);
				1079
				1080	ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
				1081	}
				1082
				1083	/*
				1084	* NOTE: we have searched root tree and checked the
				1085	* corresponding ref, it does not need to check again.
				1086	*/
				1087	*search_done = 1;
				1088	}
				1089	btrfs_release_path(path);
				1090
				1091	/* Same search but for extended refs */
				1092	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
				1093	inode_objectid, parent_objectid, 0,
				1094	0);
				1095	if (!IS_ERR_OR_NULL(extref)) {
				1096	u32 item_size;
				1097	u32 cur_offset = 0;
				1098	unsigned long base;
				1099	struct inode *victim_parent;
				1100
				1101	leaf = path->nodes[0];
				1102
				1103	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
				1104	base = btrfs_item_ptr_offset(leaf, path->slots[0]);
				1105
				1106	while (cur_offset < item_size) {
				1107	extref = (struct btrfs_inode_extref *)(base + cur_offset);
				1108
				1109	victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
				1110
				1111	if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				1112	goto next;
				1113
				1114	victim_name = kmalloc(victim_name_len, GFP_NOFS);
				1115	if (!victim_name)
				1116	return -ENOMEM;
				1117	read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
				1118	victim_name_len);
				1119
				1120	search_key.objectid = inode_objectid;
				1121	search_key.type = BTRFS_INODE_EXTREF_KEY;
				1122	search_key.offset = btrfs_extref_hash(parent_objectid,
				1123	victim_name,
				1124	victim_name_len);
				1125	ret = 0;
				1126	if (!backref_in_log(log_root, &search_key,
				1127	parent_objectid, victim_name,
				1128	victim_name_len)) {
				1129	ret = -ENOENT;
				1130	victim_parent = read_one_inode(root,
				1131	parent_objectid);
				1132	if (victim_parent) {
				1133	inc_nlink(&inode->vfs_inode);
				1134	btrfs_release_path(path);
				1135
				1136	ret = btrfs_unlink_inode(trans, root,
				1137	BTRFS_I(victim_parent),
				1138	inode,
				1139	victim_name,
				1140	victim_name_len);
				1141	if (!ret)
				1142	ret = btrfs_run_delayed_items(
				1143	trans,
				1144	fs_info);
				1145	}
				1146	iput(victim_parent);
				1147	kfree(victim_name);
				1148	if (ret)
				1149	return ret;
				1150	*search_done = 1;
				1151	goto again;
				1152	}
				1153	kfree(victim_name);
				1154	next:
				1155	cur_offset += victim_name_len + sizeof(*extref);
				1156	}
				1157	*search_done = 1;
				1158	}
				1159	btrfs_release_path(path);
				1160
				1161	/* look for a conflicting sequence number */
				1162	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
				1163	ref_index, name, namelen, 0);
				1164	if (di && !IS_ERR(di)) {
				1165	ret = drop_one_dir_item(trans, root, path, dir, di);
				1166	if (ret)
				1167	return ret;
				1168	}
				1169	btrfs_release_path(path);
				1170
				1171	/* look for a conflicing name */
				1172	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				1173	name, namelen, 0);
				1174	if (di && !IS_ERR(di)) {
				1175	ret = drop_one_dir_item(trans, root, path, dir, di);
				1176	if (ret)
				1177	return ret;
				1178	}
				1179	btrfs_release_path(path);
				1180
				1181	return 0;
				1182	}
				1183
				1184	static int extref_get_fields(struct extent_buffer *eb, int slot,
				1185	unsigned long ref_ptr, u32 namelen, char *name,
				1186	u64 index, u64 parent_objectid)
				1187	{
				1188	struct btrfs_inode_extref *extref;
				1189
				1190	extref = (struct btrfs_inode_extref *)ref_ptr;
				1191
				1192	*namelen = btrfs_inode_extref_name_len(eb, extref);
				1193	if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)&extref->name,
				1194	*namelen))
				1195	return -EIO;
				1196
				1197	name = kmalloc(namelen, GFP_NOFS);
				1198	if (*name == NULL)
				1199	return -ENOMEM;
				1200
				1201	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
				1202	*namelen);
				1203
				1204	*index = btrfs_inode_extref_index(eb, extref);
				1205	if (parent_objectid)
				1206	*parent_objectid = btrfs_inode_extref_parent(eb, extref);
				1207
				1208	return 0;
				1209	}
				1210
				1211	static int ref_get_fields(struct extent_buffer *eb, int slot,
				1212	unsigned long ref_ptr, u32 namelen, char *name,
				1213	u64 *index)
				1214	{
				1215	struct btrfs_inode_ref *ref;
				1216
				1217	ref = (struct btrfs_inode_ref *)ref_ptr;
				1218
				1219	*namelen = btrfs_inode_ref_name_len(eb, ref);
				1220	if (!btrfs_is_name_len_valid(eb, slot, (unsigned long)(ref + 1),
				1221	*namelen))
				1222	return -EIO;
				1223
				1224	name = kmalloc(namelen, GFP_NOFS);
				1225	if (*name == NULL)
				1226	return -ENOMEM;
				1227
				1228	read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
				1229
				1230	*index = btrfs_inode_ref_index(eb, ref);
				1231
				1232	return 0;
				1233	}
				1234
				1235	/*
				1236	* replay one inode back reference item found in the log tree.
				1237	* eb, slot and key refer to the buffer and key found in the log tree.
				1238	* root is the destination we are replaying into, and path is for temp
				1239	* use by this function. (it should be released on return).
				1240	*/
				1241	static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				1242	struct btrfs_root *root,
				1243	struct btrfs_root *log,
				1244	struct btrfs_path *path,
				1245	struct extent_buffer *eb, int slot,
				1246	struct btrfs_key *key)
				1247	{
				1248	struct inode *dir = NULL;
				1249	struct inode *inode = NULL;
				1250	unsigned long ref_ptr;
				1251	unsigned long ref_end;
				1252	char *name = NULL;
				1253	int namelen;
				1254	int ret;
				1255	int search_done = 0;
				1256	int log_ref_ver = 0;
				1257	u64 parent_objectid;
				1258	u64 inode_objectid;
				1259	u64 ref_index = 0;
				1260	int ref_struct_size;
				1261
				1262	ref_ptr = btrfs_item_ptr_offset(eb, slot);
				1263	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
				1264
				1265	if (key->type == BTRFS_INODE_EXTREF_KEY) {
				1266	struct btrfs_inode_extref *r;
				1267
				1268	ref_struct_size = sizeof(struct btrfs_inode_extref);
				1269	log_ref_ver = 1;
				1270	r = (struct btrfs_inode_extref *)ref_ptr;
				1271	parent_objectid = btrfs_inode_extref_parent(eb, r);
				1272	} else {
				1273	ref_struct_size = sizeof(struct btrfs_inode_ref);
				1274	parent_objectid = key->offset;
				1275	}
				1276	inode_objectid = key->objectid;
				1277
				1278	/*
				1279	* it is possible that we didn't log all the parent directories
				1280	* for a given inode. If we don't find the dir, just don't
				1281	* copy the back ref in. The link count fixup code will take
				1282	* care of the rest
				1283	*/
				1284	dir = read_one_inode(root, parent_objectid);
				1285	if (!dir) {
				1286	ret = -ENOENT;
				1287	goto out;
				1288	}
				1289
				1290	inode = read_one_inode(root, inode_objectid);
				1291	if (!inode) {
				1292	ret = -EIO;
				1293	goto out;
				1294	}
				1295
				1296	while (ref_ptr < ref_end) {
				1297	if (log_ref_ver) {
				1298	ret = extref_get_fields(eb, slot, ref_ptr, &namelen,
				1299	&name, &ref_index, &parent_objectid);
				1300	/*
				1301	* parent object can change from one array
				1302	* item to another.
				1303	*/
				1304	if (!dir)
				1305	dir = read_one_inode(root, parent_objectid);
				1306	if (!dir) {
				1307	ret = -ENOENT;
				1308	goto out;
				1309	}
				1310	} else {
				1311	ret = ref_get_fields(eb, slot, ref_ptr, &namelen,
				1312	&name, &ref_index);
				1313	}
				1314	if (ret)
				1315	goto out;
				1316
				1317	/* if we already have a perfect match, we're done */
				1318	if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
				1319	btrfs_ino(BTRFS_I(inode)), ref_index,
				1320	name, namelen)) {
				1321	/*
				1322	* look for a conflicting back reference in the
				1323	* metadata. if we find one we have to unlink that name
				1324	* of the file before we add our new link. Later on, we
				1325	* overwrite any existing back reference, and we don't
				1326	* want to create dangling pointers in the directory.
				1327	*/
				1328
				1329	if (!search_done) {
				1330	ret = __add_inode_ref(trans, root, path, log,
				1331	BTRFS_I(dir),
				1332	BTRFS_I(inode),
				1333	inode_objectid,
				1334	parent_objectid,
				1335	ref_index, name, namelen,
				1336	&search_done);
				1337	if (ret) {
				1338	if (ret == 1)
				1339	ret = 0;
				1340	goto out;
				1341	}
				1342	}
				1343
				1344	/* insert our name */
				1345	ret = btrfs_add_link(trans, BTRFS_I(dir),
				1346	BTRFS_I(inode),
				1347	name, namelen, 0, ref_index);
				1348	if (ret)
				1349	goto out;
				1350
				1351	btrfs_update_inode(trans, root, inode);
				1352	}
				1353
				1354	ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
				1355	kfree(name);
				1356	name = NULL;
				1357	if (log_ref_ver) {
				1358	iput(dir);
				1359	dir = NULL;
				1360	}
				1361	}
				1362
				1363	/* finally write the back reference in the inode */
				1364	ret = overwrite_item(trans, root, path, eb, slot, key);
				1365	out:
				1366	btrfs_release_path(path);
				1367	kfree(name);
				1368	iput(dir);
				1369	iput(inode);
				1370	return ret;
				1371	}
				1372
				1373	static int insert_orphan_item(struct btrfs_trans_handle *trans,
				1374	struct btrfs_root *root, u64 ino)
				1375	{
				1376	int ret;
				1377
				1378	ret = btrfs_insert_orphan_item(trans, root, ino);
				1379	if (ret == -EEXIST)
				1380	ret = 0;
				1381
				1382	return ret;
				1383	}
				1384
				1385	static int count_inode_extrefs(struct btrfs_root *root,
				1386	struct btrfs_inode inode, struct btrfs_path path)
				1387	{
				1388	int ret = 0;
				1389	int name_len;
				1390	unsigned int nlink = 0;
				1391	u32 item_size;
				1392	u32 cur_offset = 0;
				1393	u64 inode_objectid = btrfs_ino(inode);
				1394	u64 offset = 0;
				1395	unsigned long ptr;
				1396	struct btrfs_inode_extref *extref;
				1397	struct extent_buffer *leaf;
				1398
				1399	while (1) {
				1400	ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
				1401	&extref, &offset);
				1402	if (ret)
				1403	break;
				1404
				1405	leaf = path->nodes[0];
				1406	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
				1407	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
				1408	cur_offset = 0;
				1409
				1410	while (cur_offset < item_size) {
				1411	extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
				1412	name_len = btrfs_inode_extref_name_len(leaf, extref);
				1413
				1414	nlink++;
				1415
				1416	cur_offset += name_len + sizeof(*extref);
				1417	}
				1418
				1419	offset++;
				1420	btrfs_release_path(path);
				1421	}
				1422	btrfs_release_path(path);
				1423
				1424	if (ret < 0 && ret != -ENOENT)
				1425	return ret;
				1426	return nlink;
				1427	}
				1428
				1429	static int count_inode_refs(struct btrfs_root *root,
				1430	struct btrfs_inode inode, struct btrfs_path path)
				1431	{
				1432	int ret;
				1433	struct btrfs_key key;
				1434	unsigned int nlink = 0;
				1435	unsigned long ptr;
				1436	unsigned long ptr_end;
				1437	int name_len;
				1438	u64 ino = btrfs_ino(inode);
				1439
				1440	key.objectid = ino;
				1441	key.type = BTRFS_INODE_REF_KEY;
				1442	key.offset = (u64)-1;
				1443
				1444	while (1) {
				1445	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				1446	if (ret < 0)
				1447	break;
				1448	if (ret > 0) {
				1449	if (path->slots[0] == 0)
				1450	break;
				1451	path->slots[0]--;
				1452	}
				1453	process_slot:
				1454	btrfs_item_key_to_cpu(path->nodes[0], &key,
				1455	path->slots[0]);
				1456	if (key.objectid != ino \|\|
				1457	key.type != BTRFS_INODE_REF_KEY)
				1458	break;
				1459	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
				1460	ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
				1461	path->slots[0]);
				1462	while (ptr < ptr_end) {
				1463	struct btrfs_inode_ref *ref;
				1464
				1465	ref = (struct btrfs_inode_ref *)ptr;
				1466	name_len = btrfs_inode_ref_name_len(path->nodes[0],
				1467	ref);
				1468	ptr = (unsigned long)(ref + 1) + name_len;
				1469	nlink++;
				1470	}
				1471
				1472	if (key.offset == 0)
				1473	break;
				1474	if (path->slots[0] > 0) {
				1475	path->slots[0]--;
				1476	goto process_slot;
				1477	}
				1478	key.offset--;
				1479	btrfs_release_path(path);
				1480	}
				1481	btrfs_release_path(path);
				1482
				1483	return nlink;
				1484	}
				1485
				1486	/*
				1487	* There are a few corners where the link count of the file can't
				1488	* be properly maintained during replay. So, instead of adding
				1489	* lots of complexity to the log code, we just scan the backrefs
				1490	* for any file that has been through replay.
				1491	*
				1492	* The scan will update the link count on the inode to reflect the
				1493	* number of back refs found. If it goes down to zero, the iput
				1494	* will free the inode.
				1495	*/
				1496	static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
				1497	struct btrfs_root *root,
				1498	struct inode *inode)
				1499	{
				1500	struct btrfs_path *path;
				1501	int ret;
				1502	u64 nlink = 0;
				1503	u64 ino = btrfs_ino(BTRFS_I(inode));
				1504
				1505	path = btrfs_alloc_path();
				1506	if (!path)
				1507	return -ENOMEM;
				1508
				1509	ret = count_inode_refs(root, BTRFS_I(inode), path);
				1510	if (ret < 0)
				1511	goto out;
				1512
				1513	nlink = ret;
				1514
				1515	ret = count_inode_extrefs(root, BTRFS_I(inode), path);
				1516	if (ret < 0)
				1517	goto out;
				1518
				1519	nlink += ret;
				1520
				1521	ret = 0;
				1522
				1523	if (nlink != inode->i_nlink) {
				1524	set_nlink(inode, nlink);
				1525	btrfs_update_inode(trans, root, inode);
				1526	}
				1527	BTRFS_I(inode)->index_cnt = (u64)-1;
				1528
				1529	if (inode->i_nlink == 0) {
				1530	if (S_ISDIR(inode->i_mode)) {
				1531	ret = replay_dir_deletes(trans, root, NULL, path,
				1532	ino, 1);
				1533	if (ret)
				1534	goto out;
				1535	}
				1536	ret = insert_orphan_item(trans, root, ino);
				1537	}
				1538
				1539	out:
				1540	btrfs_free_path(path);
				1541	return ret;
				1542	}
				1543
				1544	static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
				1545	struct btrfs_root *root,
				1546	struct btrfs_path *path)
				1547	{
				1548	int ret;
				1549	struct btrfs_key key;
				1550	struct inode *inode;
				1551
				1552	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
				1553	key.type = BTRFS_ORPHAN_ITEM_KEY;
				1554	key.offset = (u64)-1;
				1555	while (1) {
				1556	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				1557	if (ret < 0)
				1558	break;
				1559
				1560	if (ret == 1) {
				1561	if (path->slots[0] == 0)
				1562	break;
				1563	path->slots[0]--;
				1564	}
				1565
				1566	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				1567	if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID \|\|
				1568	key.type != BTRFS_ORPHAN_ITEM_KEY)
				1569	break;
				1570
				1571	ret = btrfs_del_item(trans, root, path);
				1572	if (ret)
				1573	goto out;
				1574
				1575	btrfs_release_path(path);
				1576	inode = read_one_inode(root, key.offset);
				1577	if (!inode)
				1578	return -EIO;
				1579
				1580	ret = fixup_inode_link_count(trans, root, inode);
				1581	iput(inode);
				1582	if (ret)
				1583	goto out;
				1584
				1585	/*
				1586	* fixup on a directory may create new entries,
				1587	* make sure we always look for the highset possible
				1588	* offset
				1589	*/
				1590	key.offset = (u64)-1;
				1591	}
				1592	ret = 0;
				1593	out:
				1594	btrfs_release_path(path);
				1595	return ret;
				1596	}
				1597
				1598
				1599	/*
				1600	* record a given inode in the fixup dir so we can check its link
				1601	* count when replay is done. The link count is incremented here
				1602	* so the inode won't go away until we check it
				1603	*/
				1604	static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				1605	struct btrfs_root *root,
				1606	struct btrfs_path *path,
				1607	u64 objectid)
				1608	{
				1609	struct btrfs_key key;
				1610	int ret = 0;
				1611	struct inode *inode;
				1612
				1613	inode = read_one_inode(root, objectid);
				1614	if (!inode)
				1615	return -EIO;
				1616
				1617	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
				1618	key.type = BTRFS_ORPHAN_ITEM_KEY;
				1619	key.offset = objectid;
				1620
				1621	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
				1622
				1623	btrfs_release_path(path);
				1624	if (ret == 0) {
				1625	if (!inode->i_nlink)
				1626	set_nlink(inode, 1);
				1627	else
				1628	inc_nlink(inode);
				1629	ret = btrfs_update_inode(trans, root, inode);
				1630	} else if (ret == -EEXIST) {
				1631	ret = 0;
				1632	} else {
				1633	BUG(); /* Logic Error */
				1634	}
				1635	iput(inode);
				1636
				1637	return ret;
				1638	}
				1639
				1640	/*
				1641	* when replaying the log for a directory, we only insert names
				1642	* for inodes that actually exist. This means an fsync on a directory
				1643	* does not implicitly fsync all the new files in it
				1644	*/
				1645	static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				1646	struct btrfs_root *root,
				1647	u64 dirid, u64 index,
				1648	char *name, int name_len,
				1649	struct btrfs_key *location)
				1650	{
				1651	struct inode *inode;
				1652	struct inode *dir;
				1653	int ret;
				1654
				1655	inode = read_one_inode(root, location->objectid);
				1656	if (!inode)
				1657	return -ENOENT;
				1658
				1659	dir = read_one_inode(root, dirid);
				1660	if (!dir) {
				1661	iput(inode);
				1662	return -EIO;
				1663	}
				1664
				1665	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
				1666	name_len, 1, index);
				1667
				1668	/* FIXME, put inode into FIXUP list */
				1669
				1670	iput(inode);
				1671	iput(dir);
				1672	return ret;
				1673	}
				1674
				1675	/*
				1676	* Return true if an inode reference exists in the log for the given name,
				1677	* inode and parent inode.
				1678	*/
				1679	static bool name_in_log_ref(struct btrfs_root *log_root,
				1680	const char *name, const int name_len,
				1681	const u64 dirid, const u64 ino)
				1682	{
				1683	struct btrfs_key search_key;
				1684
				1685	search_key.objectid = ino;
				1686	search_key.type = BTRFS_INODE_REF_KEY;
				1687	search_key.offset = dirid;
				1688	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
				1689	return true;
				1690
				1691	search_key.type = BTRFS_INODE_EXTREF_KEY;
				1692	search_key.offset = btrfs_extref_hash(dirid, name, name_len);
				1693	if (backref_in_log(log_root, &search_key, dirid, name, name_len))
				1694	return true;
				1695
				1696	return false;
				1697	}
				1698
				1699	/*
				1700	* take a single entry in a log directory item and replay it into
				1701	* the subvolume.
				1702	*
				1703	* if a conflicting item exists in the subdirectory already,
				1704	* the inode it points to is unlinked and put into the link count
				1705	* fix up tree.
				1706	*
				1707	* If a name from the log points to a file or directory that does
				1708	* not exist in the FS, it is skipped. fsyncs on directories
				1709	* do not force down inodes inside that directory, just changes to the
				1710	* names or unlinks in a directory.
				1711	*
				1712	* Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
				1713	* non-existing inode) and 1 if the name was replayed.
				1714	*/
				1715	static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				1716	struct btrfs_root *root,
				1717	struct btrfs_path *path,
				1718	struct extent_buffer *eb,
				1719	struct btrfs_dir_item *di,
				1720	struct btrfs_key *key)
				1721	{
				1722	char *name;
				1723	int name_len;
				1724	struct btrfs_dir_item *dst_di;
				1725	struct btrfs_key found_key;
				1726	struct btrfs_key log_key;
				1727	struct inode *dir;
				1728	u8 log_type;
				1729	int exists;
				1730	int ret = 0;
				1731	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
				1732	bool name_added = false;
				1733
				1734	dir = read_one_inode(root, key->objectid);
				1735	if (!dir)
				1736	return -EIO;
				1737
				1738	name_len = btrfs_dir_name_len(eb, di);
				1739	name = kmalloc(name_len, GFP_NOFS);
				1740	if (!name) {
				1741	ret = -ENOMEM;
				1742	goto out;
				1743	}
				1744
				1745	log_type = btrfs_dir_type(eb, di);
				1746	read_extent_buffer(eb, name, (unsigned long)(di + 1),
				1747	name_len);
				1748
				1749	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
				1750	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
				1751	if (exists == 0)
				1752	exists = 1;
				1753	else
				1754	exists = 0;
				1755	btrfs_release_path(path);
				1756
				1757	if (key->type == BTRFS_DIR_ITEM_KEY) {
				1758	dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
				1759	name, name_len, 1);
				1760	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
				1761	dst_di = btrfs_lookup_dir_index_item(trans, root, path,
				1762	key->objectid,
				1763	key->offset, name,
				1764	name_len, 1);
				1765	} else {
				1766	/* Corruption */
				1767	ret = -EINVAL;
				1768	goto out;
				1769	}
				1770	if (IS_ERR_OR_NULL(dst_di)) {
				1771	/* we need a sequence number to insert, so we only
				1772	* do inserts for the BTRFS_DIR_INDEX_KEY types
				1773	*/
				1774	if (key->type != BTRFS_DIR_INDEX_KEY)
				1775	goto out;
				1776	goto insert;
				1777	}
				1778
				1779	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
				1780	/* the existing item matches the logged item */
				1781	if (found_key.objectid == log_key.objectid &&
				1782	found_key.type == log_key.type &&
				1783	found_key.offset == log_key.offset &&
				1784	btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
				1785	update_size = false;
				1786	goto out;
				1787	}
				1788
				1789	/*
				1790	* don't drop the conflicting directory entry if the inode
				1791	* for the new entry doesn't exist
				1792	*/
				1793	if (!exists)
				1794	goto out;
				1795
				1796	ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
				1797	if (ret)
				1798	goto out;
				1799
				1800	if (key->type == BTRFS_DIR_INDEX_KEY)
				1801	goto insert;
				1802	out:
				1803	btrfs_release_path(path);
				1804	if (!ret && update_size) {
				1805	btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
				1806	ret = btrfs_update_inode(trans, root, dir);
				1807	}
				1808	kfree(name);
				1809	iput(dir);
				1810	if (!ret && name_added)
				1811	ret = 1;
				1812	return ret;
				1813
				1814	insert:
				1815	if (name_in_log_ref(root->log_root, name, name_len,
				1816	key->objectid, log_key.objectid)) {
				1817	/* The dentry will be added later. */
				1818	ret = 0;
				1819	update_size = false;
				1820	goto out;
				1821	}
				1822	btrfs_release_path(path);
				1823	ret = insert_one_name(trans, root, key->objectid, key->offset,
				1824	name, name_len, &log_key);
				1825	if (ret && ret != -ENOENT && ret != -EEXIST)
				1826	goto out;
				1827	if (!ret)
				1828	name_added = true;
				1829	update_size = false;
				1830	ret = 0;
				1831	goto out;
				1832	}
				1833
				1834	/*
				1835	* find all the names in a directory item and reconcile them into
				1836	* the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
				1837	* one name in a directory item, but the same code gets used for
				1838	* both directory index types
				1839	*/
				1840	static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
				1841	struct btrfs_root *root,
				1842	struct btrfs_path *path,
				1843	struct extent_buffer *eb, int slot,
				1844	struct btrfs_key *key)
				1845	{
				1846	struct btrfs_fs_info *fs_info = root->fs_info;
				1847	int ret = 0;
				1848	u32 item_size = btrfs_item_size_nr(eb, slot);
				1849	struct btrfs_dir_item *di;
				1850	int name_len;
				1851	unsigned long ptr;
				1852	unsigned long ptr_end;
				1853	struct btrfs_path *fixup_path = NULL;
				1854
				1855	ptr = btrfs_item_ptr_offset(eb, slot);
				1856	ptr_end = ptr + item_size;
				1857	while (ptr < ptr_end) {
				1858	di = (struct btrfs_dir_item *)ptr;
				1859	if (verify_dir_item(fs_info, eb, slot, di))
				1860	return -EIO;
				1861	name_len = btrfs_dir_name_len(eb, di);
				1862	ret = replay_one_name(trans, root, path, eb, di, key);
				1863	if (ret < 0)
				1864	break;
				1865	ptr = (unsigned long)(di + 1);
				1866	ptr += name_len;
				1867
				1868	/*
				1869	* If this entry refers to a non-directory (directories can not
				1870	* have a link count > 1) and it was added in the transaction
				1871	* that was not committed, make sure we fixup the link count of
				1872	* the inode it the entry points to. Otherwise something like
				1873	* the following would result in a directory pointing to an
				1874	* inode with a wrong link that does not account for this dir
				1875	* entry:
				1876	*
				1877	* mkdir testdir
				1878	* touch testdir/foo
				1879	* touch testdir/bar
				1880	* sync
				1881	*
				1882	* ln testdir/bar testdir/bar_link
				1883	* ln testdir/foo testdir/foo_link
				1884	* xfs_io -c "fsync" testdir/bar
				1885	*
				1886	* <power failure>
				1887	*
				1888	* mount fs, log replay happens
				1889	*
				1890	* File foo would remain with a link count of 1 when it has two
				1891	* entries pointing to it in the directory testdir. This would
				1892	* make it impossible to ever delete the parent directory has
				1893	* it would result in stale dentries that can never be deleted.
				1894	*/
				1895	if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
				1896	struct btrfs_key di_key;
				1897
				1898	if (!fixup_path) {
				1899	fixup_path = btrfs_alloc_path();
				1900	if (!fixup_path) {
				1901	ret = -ENOMEM;
				1902	break;
				1903	}
				1904	}
				1905
				1906	btrfs_dir_item_key_to_cpu(eb, di, &di_key);
				1907	ret = link_to_fixup_dir(trans, root, fixup_path,
				1908	di_key.objectid);
				1909	if (ret)
				1910	break;
				1911	}
				1912	ret = 0;
				1913	}
				1914	btrfs_free_path(fixup_path);
				1915	return ret;
				1916	}
				1917
				1918	/*
				1919	* directory replay has two parts. There are the standard directory
				1920	* items in the log copied from the subvolume, and range items
				1921	* created in the log while the subvolume was logged.
				1922	*
				1923	* The range items tell us which parts of the key space the log
				1924	* is authoritative for. During replay, if a key in the subvolume
				1925	* directory is in a logged range item, but not actually in the log
				1926	* that means it was deleted from the directory before the fsync
				1927	* and should be removed.
				1928	*/
				1929	static noinline int find_dir_range(struct btrfs_root *root,
				1930	struct btrfs_path *path,
				1931	u64 dirid, int key_type,
				1932	u64 start_ret, u64 end_ret)
				1933	{
				1934	struct btrfs_key key;
				1935	u64 found_end;
				1936	struct btrfs_dir_log_item *item;
				1937	int ret;
				1938	int nritems;
				1939
				1940	if (*start_ret == (u64)-1)
				1941	return 1;
				1942
				1943	key.objectid = dirid;
				1944	key.type = key_type;
				1945	key.offset = *start_ret;
				1946
				1947	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				1948	if (ret < 0)
				1949	goto out;
				1950	if (ret > 0) {
				1951	if (path->slots[0] == 0)
				1952	goto out;
				1953	path->slots[0]--;
				1954	}
				1955	if (ret != 0)
				1956	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				1957
				1958	if (key.type != key_type \|\| key.objectid != dirid) {
				1959	ret = 1;
				1960	goto next;
				1961	}
				1962	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				1963	struct btrfs_dir_log_item);
				1964	found_end = btrfs_dir_log_end(path->nodes[0], item);
				1965
				1966	if (start_ret >= key.offset && start_ret <= found_end) {
				1967	ret = 0;
				1968	*start_ret = key.offset;
				1969	*end_ret = found_end;
				1970	goto out;
				1971	}
				1972	ret = 1;
				1973	next:
				1974	/* check the next slot in the tree to see if it is a valid item */
				1975	nritems = btrfs_header_nritems(path->nodes[0]);
				1976	path->slots[0]++;
				1977	if (path->slots[0] >= nritems) {
				1978	ret = btrfs_next_leaf(root, path);
				1979	if (ret)
				1980	goto out;
				1981	}
				1982
				1983	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				1984
				1985	if (key.type != key_type \|\| key.objectid != dirid) {
				1986	ret = 1;
				1987	goto out;
				1988	}
				1989	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				1990	struct btrfs_dir_log_item);
				1991	found_end = btrfs_dir_log_end(path->nodes[0], item);
				1992	*start_ret = key.offset;
				1993	*end_ret = found_end;
				1994	ret = 0;
				1995	out:
				1996	btrfs_release_path(path);
				1997	return ret;
				1998	}
				1999
				2000	/*
				2001	* this looks for a given directory item in the log. If the directory
				2002	* item is not in the log, the item is removed and the inode it points
				2003	* to is unlinked
				2004	*/
				2005	static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
				2006	struct btrfs_root *root,
				2007	struct btrfs_root *log,
				2008	struct btrfs_path *path,
				2009	struct btrfs_path *log_path,
				2010	struct inode *dir,
				2011	struct btrfs_key *dir_key)
				2012	{
				2013	struct btrfs_fs_info *fs_info = root->fs_info;
				2014	int ret;
				2015	struct extent_buffer *eb;
				2016	int slot;
				2017	u32 item_size;
				2018	struct btrfs_dir_item *di;
				2019	struct btrfs_dir_item *log_di;
				2020	int name_len;
				2021	unsigned long ptr;
				2022	unsigned long ptr_end;
				2023	char *name;
				2024	struct inode *inode;
				2025	struct btrfs_key location;
				2026
				2027	again:
				2028	eb = path->nodes[0];
				2029	slot = path->slots[0];
				2030	item_size = btrfs_item_size_nr(eb, slot);
				2031	ptr = btrfs_item_ptr_offset(eb, slot);
				2032	ptr_end = ptr + item_size;
				2033	while (ptr < ptr_end) {
				2034	di = (struct btrfs_dir_item *)ptr;
				2035	if (verify_dir_item(fs_info, eb, slot, di)) {
				2036	ret = -EIO;
				2037	goto out;
				2038	}
				2039
				2040	name_len = btrfs_dir_name_len(eb, di);
				2041	name = kmalloc(name_len, GFP_NOFS);
				2042	if (!name) {
				2043	ret = -ENOMEM;
				2044	goto out;
				2045	}
				2046	read_extent_buffer(eb, name, (unsigned long)(di + 1),
				2047	name_len);
				2048	log_di = NULL;
				2049	if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
				2050	log_di = btrfs_lookup_dir_item(trans, log, log_path,
				2051	dir_key->objectid,
				2052	name, name_len, 0);
				2053	} else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
				2054	log_di = btrfs_lookup_dir_index_item(trans, log,
				2055	log_path,
				2056	dir_key->objectid,
				2057	dir_key->offset,
				2058	name, name_len, 0);
				2059	}
				2060	if (!log_di \|\| (IS_ERR(log_di) && PTR_ERR(log_di) == -ENOENT)) {
				2061	btrfs_dir_item_key_to_cpu(eb, di, &location);
				2062	btrfs_release_path(path);
				2063	btrfs_release_path(log_path);
				2064	inode = read_one_inode(root, location.objectid);
				2065	if (!inode) {
				2066	kfree(name);
				2067	return -EIO;
				2068	}
				2069
				2070	ret = link_to_fixup_dir(trans, root,
				2071	path, location.objectid);
				2072	if (ret) {
				2073	kfree(name);
				2074	iput(inode);
				2075	goto out;
				2076	}
				2077
				2078	inc_nlink(inode);
				2079	ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
				2080	BTRFS_I(inode), name, name_len);
				2081	if (!ret)
				2082	ret = btrfs_run_delayed_items(trans, fs_info);
				2083	kfree(name);
				2084	iput(inode);
				2085	if (ret)
				2086	goto out;
				2087
				2088	/* there might still be more names under this key
				2089	* check and repeat if required
				2090	*/
				2091	ret = btrfs_search_slot(NULL, root, dir_key, path,
				2092	0, 0);
				2093	if (ret == 0)
				2094	goto again;
				2095	ret = 0;
				2096	goto out;
				2097	} else if (IS_ERR(log_di)) {
				2098	kfree(name);
				2099	return PTR_ERR(log_di);
				2100	}
				2101	btrfs_release_path(log_path);
				2102	kfree(name);
				2103
				2104	ptr = (unsigned long)(di + 1);
				2105	ptr += name_len;
				2106	}
				2107	ret = 0;
				2108	out:
				2109	btrfs_release_path(path);
				2110	btrfs_release_path(log_path);
				2111	return ret;
				2112	}
				2113
				2114	static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
				2115	struct btrfs_root *root,
				2116	struct btrfs_root *log,
				2117	struct btrfs_path *path,
				2118	const u64 ino)
				2119	{
				2120	struct btrfs_fs_info *fs_info = root->fs_info;
				2121	struct btrfs_key search_key;
				2122	struct btrfs_path *log_path;
				2123	int i;
				2124	int nritems;
				2125	int ret;
				2126
				2127	log_path = btrfs_alloc_path();
				2128	if (!log_path)
				2129	return -ENOMEM;
				2130
				2131	search_key.objectid = ino;
				2132	search_key.type = BTRFS_XATTR_ITEM_KEY;
				2133	search_key.offset = 0;
				2134	again:
				2135	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
				2136	if (ret < 0)
				2137	goto out;
				2138	process_leaf:
				2139	nritems = btrfs_header_nritems(path->nodes[0]);
				2140	for (i = path->slots[0]; i < nritems; i++) {
				2141	struct btrfs_key key;
				2142	struct btrfs_dir_item *di;
				2143	struct btrfs_dir_item *log_di;
				2144	u32 total_size;
				2145	u32 cur;
				2146
				2147	btrfs_item_key_to_cpu(path->nodes[0], &key, i);
				2148	if (key.objectid != ino \|\| key.type != BTRFS_XATTR_ITEM_KEY) {
				2149	ret = 0;
				2150	goto out;
				2151	}
				2152
				2153	di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
				2154	total_size = btrfs_item_size_nr(path->nodes[0], i);
				2155	cur = 0;
				2156	while (cur < total_size) {
				2157	u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
				2158	u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
				2159	u32 this_len = sizeof(*di) + name_len + data_len;
				2160	char *name;
				2161
				2162	ret = verify_dir_item(fs_info, path->nodes[0], i, di);
				2163	if (ret) {
				2164	ret = -EIO;
				2165	goto out;
				2166	}
				2167	name = kmalloc(name_len, GFP_NOFS);
				2168	if (!name) {
				2169	ret = -ENOMEM;
				2170	goto out;
				2171	}
				2172	read_extent_buffer(path->nodes[0], name,
				2173	(unsigned long)(di + 1), name_len);
				2174
				2175	log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
				2176	name, name_len, 0);
				2177	btrfs_release_path(log_path);
				2178	if (!log_di) {
				2179	/* Doesn't exist in log tree, so delete it. */
				2180	btrfs_release_path(path);
				2181	di = btrfs_lookup_xattr(trans, root, path, ino,
				2182	name, name_len, -1);
				2183	kfree(name);
				2184	if (IS_ERR(di)) {
				2185	ret = PTR_ERR(di);
				2186	goto out;
				2187	}
				2188	ASSERT(di);
				2189	ret = btrfs_delete_one_dir_name(trans, root,
				2190	path, di);
				2191	if (ret)
				2192	goto out;
				2193	btrfs_release_path(path);
				2194	search_key = key;
				2195	goto again;
				2196	}
				2197	kfree(name);
				2198	if (IS_ERR(log_di)) {
				2199	ret = PTR_ERR(log_di);
				2200	goto out;
				2201	}
				2202	cur += this_len;
				2203	di = (struct btrfs_dir_item )((char )di + this_len);
				2204	}
				2205	}
				2206	ret = btrfs_next_leaf(root, path);
				2207	if (ret > 0)
				2208	ret = 0;
				2209	else if (ret == 0)
				2210	goto process_leaf;
				2211	out:
				2212	btrfs_free_path(log_path);
				2213	btrfs_release_path(path);
				2214	return ret;
				2215	}
				2216
				2217
				2218	/*
				2219	* deletion replay happens before we copy any new directory items
				2220	* out of the log or out of backreferences from inodes. It
				2221	* scans the log to find ranges of keys that log is authoritative for,
				2222	* and then scans the directory to find items in those ranges that are
				2223	* not present in the log.
				2224	*
				2225	* Anything we don't find in the log is unlinked and removed from the
				2226	* directory.
				2227	*/
				2228	static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				2229	struct btrfs_root *root,
				2230	struct btrfs_root *log,
				2231	struct btrfs_path *path,
				2232	u64 dirid, int del_all)
				2233	{
				2234	u64 range_start;
				2235	u64 range_end;
				2236	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
				2237	int ret = 0;
				2238	struct btrfs_key dir_key;
				2239	struct btrfs_key found_key;
				2240	struct btrfs_path *log_path;
				2241	struct inode *dir;
				2242
				2243	dir_key.objectid = dirid;
				2244	dir_key.type = BTRFS_DIR_ITEM_KEY;
				2245	log_path = btrfs_alloc_path();
				2246	if (!log_path)
				2247	return -ENOMEM;
				2248
				2249	dir = read_one_inode(root, dirid);
				2250	/* it isn't an error if the inode isn't there, that can happen
				2251	* because we replay the deletes before we copy in the inode item
				2252	* from the log
				2253	*/
				2254	if (!dir) {
				2255	btrfs_free_path(log_path);
				2256	return 0;
				2257	}
				2258	again:
				2259	range_start = 0;
				2260	range_end = 0;
				2261	while (1) {
				2262	if (del_all)
				2263	range_end = (u64)-1;
				2264	else {
				2265	ret = find_dir_range(log, path, dirid, key_type,
				2266	&range_start, &range_end);
				2267	if (ret != 0)
				2268	break;
				2269	}
				2270
				2271	dir_key.offset = range_start;
				2272	while (1) {
				2273	int nritems;
				2274	ret = btrfs_search_slot(NULL, root, &dir_key, path,
				2275	0, 0);
				2276	if (ret < 0)
				2277	goto out;
				2278
				2279	nritems = btrfs_header_nritems(path->nodes[0]);
				2280	if (path->slots[0] >= nritems) {
				2281	ret = btrfs_next_leaf(root, path);
				2282	if (ret == 1)
				2283	break;
				2284	else if (ret < 0)
				2285	goto out;
				2286	}
				2287	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				2288	path->slots[0]);
				2289	if (found_key.objectid != dirid \|\|
				2290	found_key.type != dir_key.type)
				2291	goto next_type;
				2292
				2293	if (found_key.offset > range_end)
				2294	break;
				2295
				2296	ret = check_item_in_log(trans, root, log, path,
				2297	log_path, dir,
				2298	&found_key);
				2299	if (ret)
				2300	goto out;
				2301	if (found_key.offset == (u64)-1)
				2302	break;
				2303	dir_key.offset = found_key.offset + 1;
				2304	}
				2305	btrfs_release_path(path);
				2306	if (range_end == (u64)-1)
				2307	break;
				2308	range_start = range_end + 1;
				2309	}
				2310
				2311	next_type:
				2312	ret = 0;
				2313	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
				2314	key_type = BTRFS_DIR_LOG_INDEX_KEY;
				2315	dir_key.type = BTRFS_DIR_INDEX_KEY;
				2316	btrfs_release_path(path);
				2317	goto again;
				2318	}
				2319	out:
				2320	btrfs_release_path(path);
				2321	btrfs_free_path(log_path);
				2322	iput(dir);
				2323	return ret;
				2324	}
				2325
				2326	/*
				2327	* the process_func used to replay items from the log tree. This
				2328	* gets called in two different stages. The first stage just looks
				2329	* for inodes and makes sure they are all copied into the subvolume.
				2330	*
				2331	* The second stage copies all the other item types from the log into
				2332	* the subvolume. The two stage approach is slower, but gets rid of
				2333	* lots of complexity around inodes referencing other inodes that exist
				2334	* only in the log (references come from either directory items or inode
				2335	* back refs).
				2336	*/
				2337	static int replay_one_buffer(struct btrfs_root log, struct extent_buffer eb,
				2338	struct walk_control *wc, u64 gen)
				2339	{
				2340	int nritems;
				2341	struct btrfs_path *path;
				2342	struct btrfs_root *root = wc->replay_dest;
				2343	struct btrfs_key key;
				2344	int level;
				2345	int i;
				2346	int ret;
				2347
				2348	ret = btrfs_read_buffer(eb, gen);
				2349	if (ret)
				2350	return ret;
				2351
				2352	level = btrfs_header_level(eb);
				2353
				2354	if (level != 0)
				2355	return 0;
				2356
				2357	path = btrfs_alloc_path();
				2358	if (!path)
				2359	return -ENOMEM;
				2360
				2361	nritems = btrfs_header_nritems(eb);
				2362	for (i = 0; i < nritems; i++) {
				2363	btrfs_item_key_to_cpu(eb, &key, i);
				2364
				2365	/* inode keys are done during the first stage */
				2366	if (key.type == BTRFS_INODE_ITEM_KEY &&
				2367	wc->stage == LOG_WALK_REPLAY_INODES) {
				2368	struct btrfs_inode_item *inode_item;
				2369	u32 mode;
				2370
				2371	inode_item = btrfs_item_ptr(eb, i,
				2372	struct btrfs_inode_item);
				2373	/*
				2374	* If we have a tmpfile (O_TMPFILE) that got fsync'ed
				2375	* and never got linked before the fsync, skip it, as
				2376	* replaying it is pointless since it would be deleted
				2377	* later. We skip logging tmpfiles, but it's always
				2378	* possible we are replaying a log created with a kernel
				2379	* that used to log tmpfiles.
				2380	*/
				2381	if (btrfs_inode_nlink(eb, inode_item) == 0) {
				2382	wc->ignore_cur_inode = true;
				2383	continue;
				2384	} else {
				2385	wc->ignore_cur_inode = false;
				2386	}
				2387	ret = replay_xattr_deletes(wc->trans, root, log,
				2388	path, key.objectid);
				2389	if (ret)
				2390	break;
				2391	mode = btrfs_inode_mode(eb, inode_item);
				2392	if (S_ISDIR(mode)) {
				2393	ret = replay_dir_deletes(wc->trans,
				2394	root, log, path, key.objectid, 0);
				2395	if (ret)
				2396	break;
				2397	}
				2398	ret = overwrite_item(wc->trans, root, path,
				2399	eb, i, &key);
				2400	if (ret)
				2401	break;
				2402
				2403	/*
				2404	* Before replaying extents, truncate the inode to its
				2405	* size. We need to do it now and not after log replay
				2406	* because before an fsync we can have prealloc extents
				2407	* added beyond the inode's i_size. If we did it after,
				2408	* through orphan cleanup for example, we would drop
				2409	* those prealloc extents just after replaying them.
				2410	*/
				2411	if (S_ISREG(mode)) {
				2412	struct inode *inode;
				2413	u64 from;
				2414
				2415	inode = read_one_inode(root, key.objectid);
				2416	if (!inode) {
				2417	ret = -EIO;
				2418	break;
				2419	}
				2420	from = ALIGN(i_size_read(inode),
				2421	root->fs_info->sectorsize);
				2422	ret = btrfs_drop_extents(wc->trans, root, inode,
				2423	from, (u64)-1, 1);
				2424	if (!ret) {
				2425	/* Update the inode's nbytes. */
				2426	ret = btrfs_update_inode(wc->trans,
				2427	root, inode);
				2428	}
				2429	iput(inode);
				2430	if (ret)
				2431	break;
				2432	}
				2433
				2434	ret = link_to_fixup_dir(wc->trans, root,
				2435	path, key.objectid);
				2436	if (ret)
				2437	break;
				2438	}
				2439
				2440	if (wc->ignore_cur_inode)
				2441	continue;
				2442
				2443	if (key.type == BTRFS_DIR_INDEX_KEY &&
				2444	wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
				2445	ret = replay_one_dir_item(wc->trans, root, path,
				2446	eb, i, &key);
				2447	if (ret)
				2448	break;
				2449	}
				2450
				2451	if (wc->stage < LOG_WALK_REPLAY_ALL)
				2452	continue;
				2453
				2454	/* these keys are simply copied */
				2455	if (key.type == BTRFS_XATTR_ITEM_KEY) {
				2456	ret = overwrite_item(wc->trans, root, path,
				2457	eb, i, &key);
				2458	if (ret)
				2459	break;
				2460	} else if (key.type == BTRFS_INODE_REF_KEY \|\|
				2461	key.type == BTRFS_INODE_EXTREF_KEY) {
				2462	ret = add_inode_ref(wc->trans, root, log, path,
				2463	eb, i, &key);
				2464	if (ret && ret != -ENOENT)
				2465	break;
				2466	ret = 0;
				2467	} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
				2468	ret = replay_one_extent(wc->trans, root, path,
				2469	eb, i, &key);
				2470	if (ret)
				2471	break;
				2472	} else if (key.type == BTRFS_DIR_ITEM_KEY) {
				2473	ret = replay_one_dir_item(wc->trans, root, path,
				2474	eb, i, &key);
				2475	if (ret)
				2476	break;
				2477	}
				2478	}
				2479	btrfs_free_path(path);
				2480	return ret;
				2481	}
				2482
				2483	static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
				2484	struct btrfs_root *root,
				2485	struct btrfs_path path, int level,
				2486	struct walk_control *wc)
				2487	{
				2488	struct btrfs_fs_info *fs_info = root->fs_info;
				2489	u64 root_owner;
				2490	u64 bytenr;
				2491	u64 ptr_gen;
				2492	struct extent_buffer *next;
				2493	struct extent_buffer *cur;
				2494	struct extent_buffer *parent;
				2495	u32 blocksize;
				2496	int ret = 0;
				2497
				2498	WARN_ON(*level < 0);
				2499	WARN_ON(*level >= BTRFS_MAX_LEVEL);
				2500
				2501	while (*level > 0) {
				2502	WARN_ON(*level < 0);
				2503	WARN_ON(*level >= BTRFS_MAX_LEVEL);
				2504	cur = path->nodes[*level];
				2505
				2506	WARN_ON(btrfs_header_level(cur) != *level);
				2507
				2508	if (path->slots[*level] >=
				2509	btrfs_header_nritems(cur))
				2510	break;
				2511
				2512	bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
				2513	ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
				2514	blocksize = fs_info->nodesize;
				2515
				2516	parent = path->nodes[*level];
				2517	root_owner = btrfs_header_owner(parent);
				2518
				2519	next = btrfs_find_create_tree_block(fs_info, bytenr);
				2520	if (IS_ERR(next))
				2521	return PTR_ERR(next);
				2522
				2523	if (*level == 1) {
				2524	ret = wc->process_func(root, next, wc, ptr_gen);
				2525	if (ret) {
				2526	free_extent_buffer(next);
				2527	return ret;
				2528	}
				2529
				2530	path->slots[*level]++;
				2531	if (wc->free) {
				2532	ret = btrfs_read_buffer(next, ptr_gen);
				2533	if (ret) {
				2534	free_extent_buffer(next);
				2535	return ret;
				2536	}
				2537
				2538	if (trans) {
				2539	btrfs_tree_lock(next);
				2540	btrfs_set_lock_blocking(next);
				2541	clean_tree_block(fs_info, next);
				2542	btrfs_wait_tree_block_writeback(next);
				2543	btrfs_tree_unlock(next);
				2544	} else {
				2545	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
				2546	clear_extent_buffer_dirty(next);
				2547	}
				2548
				2549	WARN_ON(root_owner !=
				2550	BTRFS_TREE_LOG_OBJECTID);
				2551	ret = btrfs_free_and_pin_reserved_extent(
				2552	fs_info, bytenr,
				2553	blocksize);
				2554	if (ret) {
				2555	free_extent_buffer(next);
				2556	return ret;
				2557	}
				2558	}
				2559	free_extent_buffer(next);
				2560	continue;
				2561	}
				2562	ret = btrfs_read_buffer(next, ptr_gen);
				2563	if (ret) {
				2564	free_extent_buffer(next);
				2565	return ret;
				2566	}
				2567
				2568	WARN_ON(*level <= 0);
				2569	if (path->nodes[*level-1])
				2570	free_extent_buffer(path->nodes[*level-1]);
				2571	path->nodes[*level-1] = next;
				2572	*level = btrfs_header_level(next);
				2573	path->slots[*level] = 0;
				2574	cond_resched();
				2575	}
				2576	WARN_ON(*level < 0);
				2577	WARN_ON(*level >= BTRFS_MAX_LEVEL);
				2578
				2579	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
				2580
				2581	cond_resched();
				2582	return 0;
				2583	}
				2584
				2585	static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
				2586	struct btrfs_root *root,
				2587	struct btrfs_path path, int level,
				2588	struct walk_control *wc)
				2589	{
				2590	struct btrfs_fs_info *fs_info = root->fs_info;
				2591	u64 root_owner;
				2592	int i;
				2593	int slot;
				2594	int ret;
				2595
				2596	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
				2597	slot = path->slots[i];
				2598	if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
				2599	path->slots[i]++;
				2600	*level = i;
				2601	WARN_ON(*level == 0);
				2602	return 0;
				2603	} else {
				2604	struct extent_buffer *parent;
				2605	if (path->nodes[*level] == root->node)
				2606	parent = path->nodes[*level];
				2607	else
				2608	parent = path->nodes[*level + 1];
				2609
				2610	root_owner = btrfs_header_owner(parent);
				2611	ret = wc->process_func(root, path->nodes[*level], wc,
				2612	btrfs_header_generation(path->nodes[*level]));
				2613	if (ret)
				2614	return ret;
				2615
				2616	if (wc->free) {
				2617	struct extent_buffer *next;
				2618
				2619	next = path->nodes[*level];
				2620
				2621	if (trans) {
				2622	btrfs_tree_lock(next);
				2623	btrfs_set_lock_blocking(next);
				2624	clean_tree_block(fs_info, next);
				2625	btrfs_wait_tree_block_writeback(next);
				2626	btrfs_tree_unlock(next);
				2627	} else {
				2628	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
				2629	clear_extent_buffer_dirty(next);
				2630	}
				2631
				2632	WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
				2633	ret = btrfs_free_and_pin_reserved_extent(
				2634	fs_info,
				2635	path->nodes[*level]->start,
				2636	path->nodes[*level]->len);
				2637	if (ret)
				2638	return ret;
				2639	}
				2640	free_extent_buffer(path->nodes[*level]);
				2641	path->nodes[*level] = NULL;
				2642	*level = i + 1;
				2643	}
				2644	}
				2645	return 1;
				2646	}
				2647
				2648	/*
				2649	* drop the reference count on the tree rooted at 'snap'. This traverses
				2650	* the tree freeing any blocks that have a ref count of zero after being
				2651	* decremented.
				2652	*/
				2653	static int walk_log_tree(struct btrfs_trans_handle *trans,
				2654	struct btrfs_root log, struct walk_control wc)
				2655	{
				2656	struct btrfs_fs_info *fs_info = log->fs_info;
				2657	int ret = 0;
				2658	int wret;
				2659	int level;
				2660	struct btrfs_path *path;
				2661	int orig_level;
				2662
				2663	path = btrfs_alloc_path();
				2664	if (!path)
				2665	return -ENOMEM;
				2666
				2667	level = btrfs_header_level(log->node);
				2668	orig_level = level;
				2669	path->nodes[level] = log->node;
				2670	extent_buffer_get(log->node);
				2671	path->slots[level] = 0;
				2672
				2673	while (1) {
				2674	wret = walk_down_log_tree(trans, log, path, &level, wc);
				2675	if (wret > 0)
				2676	break;
				2677	if (wret < 0) {
				2678	ret = wret;
				2679	goto out;
				2680	}
				2681
				2682	wret = walk_up_log_tree(trans, log, path, &level, wc);
				2683	if (wret > 0)
				2684	break;
				2685	if (wret < 0) {
				2686	ret = wret;
				2687	goto out;
				2688	}
				2689	}
				2690
				2691	/* was the root node processed? if not, catch it here */
				2692	if (path->nodes[orig_level]) {
				2693	ret = wc->process_func(log, path->nodes[orig_level], wc,
				2694	btrfs_header_generation(path->nodes[orig_level]));
				2695	if (ret)
				2696	goto out;
				2697	if (wc->free) {
				2698	struct extent_buffer *next;
				2699
				2700	next = path->nodes[orig_level];
				2701
				2702	if (trans) {
				2703	btrfs_tree_lock(next);
				2704	btrfs_set_lock_blocking(next);
				2705	clean_tree_block(fs_info, next);
				2706	btrfs_wait_tree_block_writeback(next);
				2707	btrfs_tree_unlock(next);
				2708	} else {
				2709	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
				2710	clear_extent_buffer_dirty(next);
				2711	}
				2712
				2713	WARN_ON(log->root_key.objectid !=
				2714	BTRFS_TREE_LOG_OBJECTID);
				2715	ret = btrfs_free_and_pin_reserved_extent(fs_info,
				2716	next->start, next->len);
				2717	if (ret)
				2718	goto out;
				2719	}
				2720	}
				2721
				2722	out:
				2723	btrfs_free_path(path);
				2724	return ret;
				2725	}
				2726
				2727	/*
				2728	* helper function to update the item for a given subvolumes log root
				2729	* in the tree of log roots
				2730	*/
				2731	static int update_log_root(struct btrfs_trans_handle *trans,
				2732	struct btrfs_root *log,
				2733	struct btrfs_root_item *root_item)
				2734	{
				2735	struct btrfs_fs_info *fs_info = log->fs_info;
				2736	int ret;
				2737
				2738	if (log->log_transid == 1) {
				2739	/* insert root item on the first sync */
				2740	ret = btrfs_insert_root(trans, fs_info->log_root_tree,
				2741	&log->root_key, root_item);
				2742	} else {
				2743	ret = btrfs_update_root(trans, fs_info->log_root_tree,
				2744	&log->root_key, root_item);
				2745	}
				2746	return ret;
				2747	}
				2748
				2749	static void wait_log_commit(struct btrfs_root *root, int transid)
				2750	{
				2751	DEFINE_WAIT(wait);
				2752	int index = transid % 2;
				2753
				2754	/*
				2755	* we only allow two pending log transactions at a time,
				2756	* so we know that if ours is more than 2 older than the
				2757	* current transaction, we're done
				2758	*/
				2759	do {
				2760	prepare_to_wait(&root->log_commit_wait[index],
				2761	&wait, TASK_UNINTERRUPTIBLE);
				2762	mutex_unlock(&root->log_mutex);
				2763
				2764	if (root->log_transid_committed < transid &&
				2765	atomic_read(&root->log_commit[index]))
				2766	schedule();
				2767
				2768	finish_wait(&root->log_commit_wait[index], &wait);
				2769	mutex_lock(&root->log_mutex);
				2770	} while (root->log_transid_committed < transid &&
				2771	atomic_read(&root->log_commit[index]));
				2772	}
				2773
				2774	static void wait_for_writer(struct btrfs_root *root)
				2775	{
				2776	DEFINE_WAIT(wait);
				2777
				2778	while (atomic_read(&root->log_writers)) {
				2779	prepare_to_wait(&root->log_writer_wait,
				2780	&wait, TASK_UNINTERRUPTIBLE);
				2781	mutex_unlock(&root->log_mutex);
				2782	if (atomic_read(&root->log_writers))
				2783	schedule();
				2784	finish_wait(&root->log_writer_wait, &wait);
				2785	mutex_lock(&root->log_mutex);
				2786	}
				2787	}
				2788
				2789	static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
				2790	struct btrfs_log_ctx *ctx)
				2791	{
				2792	if (!ctx)
				2793	return;
				2794
				2795	mutex_lock(&root->log_mutex);
				2796	list_del_init(&ctx->list);
				2797	mutex_unlock(&root->log_mutex);
				2798	}
				2799
				2800	/*
				2801	* Invoked in log mutex context, or be sure there is no other task which
				2802	* can access the list.
				2803	*/
				2804	static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
				2805	int index, int error)
				2806	{
				2807	struct btrfs_log_ctx *ctx;
				2808	struct btrfs_log_ctx *safe;
				2809
				2810	list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
				2811	list_del_init(&ctx->list);
				2812	ctx->log_ret = error;
				2813	}
				2814
				2815	INIT_LIST_HEAD(&root->log_ctxs[index]);
				2816	}
				2817
				2818	/*
				2819	* btrfs_sync_log sends a given tree log down to the disk and
				2820	* updates the super blocks to record it. When this call is done,
				2821	* you know that any inodes previously logged are safely on disk only
				2822	* if it returns 0.
				2823	*
				2824	* Any other return value means you need to call btrfs_commit_transaction.
				2825	* Some of the edge cases for fsyncing directories that have had unlinks
				2826	* or renames done in the past mean that sometimes the only safe
				2827	* fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
				2828	* that has happened.
				2829	*/
				2830	int btrfs_sync_log(struct btrfs_trans_handle *trans,
				2831	struct btrfs_root root, struct btrfs_log_ctx ctx)
				2832	{
				2833	int index1;
				2834	int index2;
				2835	int mark;
				2836	int ret;
				2837	struct btrfs_fs_info *fs_info = root->fs_info;
				2838	struct btrfs_root *log = root->log_root;
				2839	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
				2840	struct btrfs_root_item new_root_item;
				2841	int log_transid = 0;
				2842	struct btrfs_log_ctx root_log_ctx;
				2843	struct blk_plug plug;
				2844
				2845	mutex_lock(&root->log_mutex);
				2846	log_transid = ctx->log_transid;
				2847	if (root->log_transid_committed >= log_transid) {
				2848	mutex_unlock(&root->log_mutex);
				2849	return ctx->log_ret;
				2850	}
				2851
				2852	index1 = log_transid % 2;
				2853	if (atomic_read(&root->log_commit[index1])) {
				2854	wait_log_commit(root, log_transid);
				2855	mutex_unlock(&root->log_mutex);
				2856	return ctx->log_ret;
				2857	}
				2858	ASSERT(log_transid == root->log_transid);
				2859	atomic_set(&root->log_commit[index1], 1);
				2860
				2861	/* wait for previous tree log sync to complete */
				2862	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
				2863	wait_log_commit(root, log_transid - 1);
				2864
				2865	while (1) {
				2866	int batch = atomic_read(&root->log_batch);
				2867	/* when we're on an ssd, just kick the log commit out */
				2868	if (!btrfs_test_opt(fs_info, SSD) &&
				2869	test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
				2870	mutex_unlock(&root->log_mutex);
				2871	schedule_timeout_uninterruptible(1);
				2872	mutex_lock(&root->log_mutex);
				2873	}
				2874	wait_for_writer(root);
				2875	if (batch == atomic_read(&root->log_batch))
				2876	break;
				2877	}
				2878
				2879	/* bail out if we need to do a full commit */
				2880	if (btrfs_need_log_full_commit(fs_info, trans)) {
				2881	ret = -EAGAIN;
				2882	btrfs_free_logged_extents(log, log_transid);
				2883	mutex_unlock(&root->log_mutex);
				2884	goto out;
				2885	}
				2886
				2887	if (log_transid % 2 == 0)
				2888	mark = EXTENT_DIRTY;
				2889	else
				2890	mark = EXTENT_NEW;
				2891
				2892	/* we start IO on all the marked extents here, but we don't actually
				2893	* wait for them until later.
				2894	*/
				2895	blk_start_plug(&plug);
				2896	ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
				2897	if (ret) {
				2898	blk_finish_plug(&plug);
				2899	btrfs_abort_transaction(trans, ret);
				2900	btrfs_free_logged_extents(log, log_transid);
				2901	btrfs_set_log_full_commit(fs_info, trans);
				2902	mutex_unlock(&root->log_mutex);
				2903	goto out;
				2904	}
				2905
				2906	/*
				2907	* We _must_ update under the root->log_mutex in order to make sure we
				2908	* have a consistent view of the log root we are trying to commit at
				2909	* this moment.
				2910	*
				2911	* We _must_ copy this into a local copy, because we are not holding the
				2912	* log_root_tree->log_mutex yet. This is important because when we
				2913	* commit the log_root_tree we must have a consistent view of the
				2914	* log_root_tree when we update the super block to point at the
				2915	* log_root_tree bytenr. If we update the log_root_tree here we'll race
				2916	* with the commit and possibly point at the new block which we may not
				2917	* have written out.
				2918	*/
				2919	btrfs_set_root_node(&log->root_item, log->node);
				2920	memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
				2921
				2922	root->log_transid++;
				2923	log->log_transid = root->log_transid;
				2924	root->log_start_pid = 0;
				2925	/*
				2926	* IO has been started, blocks of the log tree have WRITTEN flag set
				2927	* in their headers. new modifications of the log will be written to
				2928	* new positions. so it's safe to allow log writers to go in.
				2929	*/
				2930	mutex_unlock(&root->log_mutex);
				2931
				2932	btrfs_init_log_ctx(&root_log_ctx, NULL);
				2933
				2934	mutex_lock(&log_root_tree->log_mutex);
				2935	atomic_inc(&log_root_tree->log_batch);
				2936	atomic_inc(&log_root_tree->log_writers);
				2937
				2938	index2 = log_root_tree->log_transid % 2;
				2939	list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
				2940	root_log_ctx.log_transid = log_root_tree->log_transid;
				2941
				2942	mutex_unlock(&log_root_tree->log_mutex);
				2943
				2944	mutex_lock(&log_root_tree->log_mutex);
				2945
				2946	/*
				2947	* Now we are safe to update the log_root_tree because we're under the
				2948	* log_mutex, and we're a current writer so we're holding the commit
				2949	* open until we drop the log_mutex.
				2950	*/
				2951	ret = update_log_root(trans, log, &new_root_item);
				2952
				2953	if (atomic_dec_and_test(&log_root_tree->log_writers)) {
				2954	/*
				2955	* Implicit memory barrier after atomic_dec_and_test
				2956	*/
				2957	if (waitqueue_active(&log_root_tree->log_writer_wait))
				2958	wake_up(&log_root_tree->log_writer_wait);
				2959	}
				2960
				2961	if (ret) {
				2962	if (!list_empty(&root_log_ctx.list))
				2963	list_del_init(&root_log_ctx.list);
				2964
				2965	blk_finish_plug(&plug);
				2966	btrfs_set_log_full_commit(fs_info, trans);
				2967
				2968	if (ret != -ENOSPC) {
				2969	btrfs_abort_transaction(trans, ret);
				2970	mutex_unlock(&log_root_tree->log_mutex);
				2971	goto out;
				2972	}
				2973	btrfs_wait_tree_log_extents(log, mark);
				2974	btrfs_free_logged_extents(log, log_transid);
				2975	mutex_unlock(&log_root_tree->log_mutex);
				2976	ret = -EAGAIN;
				2977	goto out;
				2978	}
				2979
				2980	if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
				2981	blk_finish_plug(&plug);
				2982	list_del_init(&root_log_ctx.list);
				2983	mutex_unlock(&log_root_tree->log_mutex);
				2984	ret = root_log_ctx.log_ret;
				2985	goto out;
				2986	}
				2987
				2988	index2 = root_log_ctx.log_transid % 2;
				2989	if (atomic_read(&log_root_tree->log_commit[index2])) {
				2990	blk_finish_plug(&plug);
				2991	ret = btrfs_wait_tree_log_extents(log, mark);
				2992	btrfs_wait_logged_extents(trans, log, log_transid);
				2993	wait_log_commit(log_root_tree,
				2994	root_log_ctx.log_transid);
				2995	mutex_unlock(&log_root_tree->log_mutex);
				2996	if (!ret)
				2997	ret = root_log_ctx.log_ret;
				2998	goto out;
				2999	}
				3000	ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
				3001	atomic_set(&log_root_tree->log_commit[index2], 1);
				3002
				3003	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
				3004	wait_log_commit(log_root_tree,
				3005	root_log_ctx.log_transid - 1);
				3006	}
				3007
				3008	wait_for_writer(log_root_tree);
				3009
				3010	/*
				3011	* now that we've moved on to the tree of log tree roots,
				3012	* check the full commit flag again
				3013	*/
				3014	if (btrfs_need_log_full_commit(fs_info, trans)) {
				3015	blk_finish_plug(&plug);
				3016	btrfs_wait_tree_log_extents(log, mark);
				3017	btrfs_free_logged_extents(log, log_transid);
				3018	mutex_unlock(&log_root_tree->log_mutex);
				3019	ret = -EAGAIN;
				3020	goto out_wake_log_root;
				3021	}
				3022
				3023	ret = btrfs_write_marked_extents(fs_info,
				3024	&log_root_tree->dirty_log_pages,
				3025	EXTENT_DIRTY \| EXTENT_NEW);
				3026	blk_finish_plug(&plug);
				3027	if (ret) {
				3028	btrfs_set_log_full_commit(fs_info, trans);
				3029	btrfs_abort_transaction(trans, ret);
				3030	btrfs_free_logged_extents(log, log_transid);
				3031	mutex_unlock(&log_root_tree->log_mutex);
				3032	goto out_wake_log_root;
				3033	}
				3034	ret = btrfs_wait_tree_log_extents(log, mark);
				3035	if (!ret)
				3036	ret = btrfs_wait_tree_log_extents(log_root_tree,
				3037	EXTENT_NEW \| EXTENT_DIRTY);
				3038	if (ret) {
				3039	btrfs_set_log_full_commit(fs_info, trans);
				3040	btrfs_free_logged_extents(log, log_transid);
				3041	mutex_unlock(&log_root_tree->log_mutex);
				3042	goto out_wake_log_root;
				3043	}
				3044	btrfs_wait_logged_extents(trans, log, log_transid);
				3045
				3046	btrfs_set_super_log_root(fs_info->super_for_commit,
				3047	log_root_tree->node->start);
				3048	btrfs_set_super_log_root_level(fs_info->super_for_commit,
				3049	btrfs_header_level(log_root_tree->node));
				3050
				3051	log_root_tree->log_transid++;
				3052	mutex_unlock(&log_root_tree->log_mutex);
				3053
				3054	/*
				3055	* nobody else is going to jump in and write the ctree
				3056	* super here because the log_commit atomic below is protecting
				3057	* us. We must be called with a transaction handle pinning
				3058	* the running transaction open, so a full commit can't hop
				3059	* in and cause problems either.
				3060	*/
				3061	ret = write_all_supers(fs_info, 1);
				3062	if (ret) {
				3063	btrfs_set_log_full_commit(fs_info, trans);
				3064	btrfs_abort_transaction(trans, ret);
				3065	goto out_wake_log_root;
				3066	}
				3067
				3068	mutex_lock(&root->log_mutex);
				3069	if (root->last_log_commit < log_transid)
				3070	root->last_log_commit = log_transid;
				3071	mutex_unlock(&root->log_mutex);
				3072
				3073	out_wake_log_root:
				3074	mutex_lock(&log_root_tree->log_mutex);
				3075	btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
				3076
				3077	log_root_tree->log_transid_committed++;
				3078	atomic_set(&log_root_tree->log_commit[index2], 0);
				3079	mutex_unlock(&log_root_tree->log_mutex);
				3080
				3081	/*
				3082	* The barrier before waitqueue_active is needed so all the updates
				3083	* above are seen by the woken threads. It might not be necessary, but
				3084	* proving that seems to be hard.
				3085	*/
				3086	smp_mb();
				3087	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
				3088	wake_up(&log_root_tree->log_commit_wait[index2]);
				3089	out:
				3090	mutex_lock(&root->log_mutex);
				3091	btrfs_remove_all_log_ctxs(root, index1, ret);
				3092	root->log_transid_committed++;
				3093	atomic_set(&root->log_commit[index1], 0);
				3094	mutex_unlock(&root->log_mutex);
				3095
				3096	/*
				3097	* The barrier before waitqueue_active is needed so all the updates
				3098	* above are seen by the woken threads. It might not be necessary, but
				3099	* proving that seems to be hard.
				3100	*/
				3101	smp_mb();
				3102	if (waitqueue_active(&root->log_commit_wait[index1]))
				3103	wake_up(&root->log_commit_wait[index1]);
				3104	return ret;
				3105	}
				3106
				3107	static void free_log_tree(struct btrfs_trans_handle *trans,
				3108	struct btrfs_root *log)
				3109	{
				3110	int ret;
				3111	u64 start;
				3112	u64 end;
				3113	struct walk_control wc = {
				3114	.free = 1,
				3115	.process_func = process_one_buffer
				3116	};
				3117
				3118	ret = walk_log_tree(trans, log, &wc);
				3119	if (ret) {
				3120	if (trans)
				3121	btrfs_abort_transaction(trans, ret);
				3122	else
				3123	btrfs_handle_fs_error(log->fs_info, ret, NULL);
				3124	}
				3125
				3126	while (1) {
				3127	ret = find_first_extent_bit(&log->dirty_log_pages,
				3128	0, &start, &end,
				3129	EXTENT_DIRTY \| EXTENT_NEW \| EXTENT_NEED_WAIT,
				3130	NULL);
				3131	if (ret)
				3132	break;
				3133
				3134	clear_extent_bits(&log->dirty_log_pages, start, end,
				3135	EXTENT_DIRTY \| EXTENT_NEW \| EXTENT_NEED_WAIT);
				3136	}
				3137
				3138	/*
				3139	* We may have short-circuited the log tree with the full commit logic
				3140	* and left ordered extents on our list, so clear these out to keep us
				3141	* from leaking inodes and memory.
				3142	*/
				3143	btrfs_free_logged_extents(log, 0);
				3144	btrfs_free_logged_extents(log, 1);
				3145
				3146	free_extent_buffer(log->node);
				3147	kfree(log);
				3148	}
				3149
				3150	/*
				3151	* free all the extents used by the tree log. This should be called
				3152	* at commit time of the full transaction
				3153	*/
				3154	int btrfs_free_log(struct btrfs_trans_handle trans, struct btrfs_root root)
				3155	{
				3156	if (root->log_root) {
				3157	free_log_tree(trans, root->log_root);
				3158	root->log_root = NULL;
				3159	}
				3160	return 0;
				3161	}
				3162
				3163	int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
				3164	struct btrfs_fs_info *fs_info)
				3165	{
				3166	if (fs_info->log_root_tree) {
				3167	free_log_tree(trans, fs_info->log_root_tree);
				3168	fs_info->log_root_tree = NULL;
				3169	}
				3170	return 0;
				3171	}
				3172
				3173	/*
				3174	* Check if an inode was logged in the current transaction. We can't always rely
				3175	* on an inode's logged_trans value, because it's an in-memory only field and
				3176	* therefore not persisted. This means that its value is lost if the inode gets
				3177	* evicted and loaded again from disk (in which case it has a value of 0, and
				3178	* certainly it is smaller than any possible transaction ID), when that happens
				3179	* the full_sync flag is set in the inode's runtime flags, so on that case we
				3180	* assume eviction happened and ignore the logged_trans value, assuming the
				3181	* worst case, that the inode was logged before in the current transaction.
				3182	*/
				3183	static bool inode_logged(struct btrfs_trans_handle *trans,
				3184	struct btrfs_inode *inode)
				3185	{
				3186	if (inode->logged_trans == trans->transid)
				3187	return true;
				3188
				3189	if (inode->last_trans == trans->transid &&
				3190	test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
				3191	!test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
				3192	return true;
				3193
				3194	return false;
				3195	}
				3196
				3197	/*
				3198	* If both a file and directory are logged, and unlinks or renames are
				3199	* mixed in, we have a few interesting corners:
				3200	*
				3201	* create file X in dir Y
				3202	* link file X to X.link in dir Y
				3203	* fsync file X
				3204	* unlink file X but leave X.link
				3205	* fsync dir Y
				3206	*
				3207	* After a crash we would expect only X.link to exist. But file X
				3208	* didn't get fsync'd again so the log has back refs for X and X.link.
				3209	*
				3210	* We solve this by removing directory entries and inode backrefs from the
				3211	* log when a file that was logged in the current transaction is
				3212	* unlinked. Any later fsync will include the updated log entries, and
				3213	* we'll be able to reconstruct the proper directory items from backrefs.
				3214	*
				3215	* This optimization allows us to avoid relogging the entire inode
				3216	* or the entire directory.
				3217	*/
				3218	int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
				3219	struct btrfs_root *root,
				3220	const char *name, int name_len,
				3221	struct btrfs_inode *dir, u64 index)
				3222	{
				3223	struct btrfs_root *log;
				3224	struct btrfs_dir_item *di;
				3225	struct btrfs_path *path;
				3226	int ret;
				3227	int err = 0;
				3228	int bytes_del = 0;
				3229	u64 dir_ino = btrfs_ino(dir);
				3230
				3231	if (!inode_logged(trans, dir))
				3232	return 0;
				3233
				3234	ret = join_running_log_trans(root);
				3235	if (ret)
				3236	return 0;
				3237
				3238	mutex_lock(&dir->log_mutex);
				3239
				3240	log = root->log_root;
				3241	path = btrfs_alloc_path();
				3242	if (!path) {
				3243	err = -ENOMEM;
				3244	goto out_unlock;
				3245	}
				3246
				3247	di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
				3248	name, name_len, -1);
				3249	if (IS_ERR(di)) {
				3250	err = PTR_ERR(di);
				3251	goto fail;
				3252	}
				3253	if (di) {
				3254	ret = btrfs_delete_one_dir_name(trans, log, path, di);
				3255	bytes_del += name_len;
				3256	if (ret) {
				3257	err = ret;
				3258	goto fail;
				3259	}
				3260	}
				3261	btrfs_release_path(path);
				3262	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
				3263	index, name, name_len, -1);
				3264	if (IS_ERR(di)) {
				3265	err = PTR_ERR(di);
				3266	goto fail;
				3267	}
				3268	if (di) {
				3269	ret = btrfs_delete_one_dir_name(trans, log, path, di);
				3270	bytes_del += name_len;
				3271	if (ret) {
				3272	err = ret;
				3273	goto fail;
				3274	}
				3275	}
				3276
				3277	/* update the directory size in the log to reflect the names
				3278	* we have removed
				3279	*/
				3280	if (bytes_del) {
				3281	struct btrfs_key key;
				3282
				3283	key.objectid = dir_ino;
				3284	key.offset = 0;
				3285	key.type = BTRFS_INODE_ITEM_KEY;
				3286	btrfs_release_path(path);
				3287
				3288	ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
				3289	if (ret < 0) {
				3290	err = ret;
				3291	goto fail;
				3292	}
				3293	if (ret == 0) {
				3294	struct btrfs_inode_item *item;
				3295	u64 i_size;
				3296
				3297	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				3298	struct btrfs_inode_item);
				3299	i_size = btrfs_inode_size(path->nodes[0], item);
				3300	if (i_size > bytes_del)
				3301	i_size -= bytes_del;
				3302	else
				3303	i_size = 0;
				3304	btrfs_set_inode_size(path->nodes[0], item, i_size);
				3305	btrfs_mark_buffer_dirty(path->nodes[0]);
				3306	} else
				3307	ret = 0;
				3308	btrfs_release_path(path);
				3309	}
				3310	fail:
				3311	btrfs_free_path(path);
				3312	out_unlock:
				3313	mutex_unlock(&dir->log_mutex);
				3314	if (err == -ENOSPC) {
				3315	btrfs_set_log_full_commit(root->fs_info, trans);
				3316	err = 0;
				3317	} else if (err < 0 && err != -ENOENT) {
				3318	/* ENOENT can be returned if the entry hasn't been fsynced yet */
				3319	btrfs_abort_transaction(trans, err);
				3320	}
				3321
				3322	btrfs_end_log_trans(root);
				3323
				3324	return err;
				3325	}
				3326
				3327	/* see comments for btrfs_del_dir_entries_in_log */
				3328	int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
				3329	struct btrfs_root *root,
				3330	const char *name, int name_len,
				3331	struct btrfs_inode *inode, u64 dirid)
				3332	{
				3333	struct btrfs_fs_info *fs_info = root->fs_info;
				3334	struct btrfs_root *log;
				3335	u64 index;
				3336	int ret;
				3337
				3338	if (!inode_logged(trans, inode))
				3339	return 0;
				3340
				3341	ret = join_running_log_trans(root);
				3342	if (ret)
				3343	return 0;
				3344	log = root->log_root;
				3345	mutex_lock(&inode->log_mutex);
				3346
				3347	ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
				3348	dirid, &index);
				3349	mutex_unlock(&inode->log_mutex);
				3350	if (ret == -ENOSPC) {
				3351	btrfs_set_log_full_commit(fs_info, trans);
				3352	ret = 0;
				3353	} else if (ret < 0 && ret != -ENOENT)
				3354	btrfs_abort_transaction(trans, ret);
				3355	btrfs_end_log_trans(root);
				3356
				3357	return ret;
				3358	}
				3359
				3360	/*
				3361	* creates a range item in the log for 'dirid'. first_offset and
				3362	* last_offset tell us which parts of the key space the log should
				3363	* be considered authoritative for.
				3364	*/
				3365	static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
				3366	struct btrfs_root *log,
				3367	struct btrfs_path *path,
				3368	int key_type, u64 dirid,
				3369	u64 first_offset, u64 last_offset)
				3370	{
				3371	int ret;
				3372	struct btrfs_key key;
				3373	struct btrfs_dir_log_item *item;
				3374
				3375	key.objectid = dirid;
				3376	key.offset = first_offset;
				3377	if (key_type == BTRFS_DIR_ITEM_KEY)
				3378	key.type = BTRFS_DIR_LOG_ITEM_KEY;
				3379	else
				3380	key.type = BTRFS_DIR_LOG_INDEX_KEY;
				3381	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
				3382	if (ret)
				3383	return ret;
				3384
				3385	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				3386	struct btrfs_dir_log_item);
				3387	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
				3388	btrfs_mark_buffer_dirty(path->nodes[0]);
				3389	btrfs_release_path(path);
				3390	return 0;
				3391	}
				3392
				3393	/*
				3394	* log all the items included in the current transaction for a given
				3395	* directory. This also creates the range items in the log tree required
				3396	* to replay anything deleted before the fsync
				3397	*/
				3398	static noinline int log_dir_items(struct btrfs_trans_handle *trans,
				3399	struct btrfs_root root, struct btrfs_inode inode,
				3400	struct btrfs_path *path,
				3401	struct btrfs_path *dst_path, int key_type,
				3402	struct btrfs_log_ctx *ctx,
				3403	u64 min_offset, u64 *last_offset_ret)
				3404	{
				3405	struct btrfs_key min_key;
				3406	struct btrfs_root *log = root->log_root;
				3407	struct extent_buffer *src;
				3408	int err = 0;
				3409	int ret;
				3410	int i;
				3411	int nritems;
				3412	u64 first_offset = min_offset;
				3413	u64 last_offset = (u64)-1;
				3414	u64 ino = btrfs_ino(inode);
				3415
				3416	log = root->log_root;
				3417
				3418	min_key.objectid = ino;
				3419	min_key.type = key_type;
				3420	min_key.offset = min_offset;
				3421
				3422	ret = btrfs_search_forward(root, &min_key, path, trans->transid);
				3423
				3424	/*
				3425	* we didn't find anything from this transaction, see if there
				3426	* is anything at all
				3427	*/
				3428	if (ret != 0 \|\| min_key.objectid != ino \|\| min_key.type != key_type) {
				3429	min_key.objectid = ino;
				3430	min_key.type = key_type;
				3431	min_key.offset = (u64)-1;
				3432	btrfs_release_path(path);
				3433	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
				3434	if (ret < 0) {
				3435	btrfs_release_path(path);
				3436	return ret;
				3437	}
				3438	ret = btrfs_previous_item(root, path, ino, key_type);
				3439
				3440	/* if ret == 0 there are items for this type,
				3441	* create a range to tell us the last key of this type.
				3442	* otherwise, there are no items in this directory after
				3443	* *min_offset, and we create a range to indicate that.
				3444	*/
				3445	if (ret == 0) {
				3446	struct btrfs_key tmp;
				3447	btrfs_item_key_to_cpu(path->nodes[0], &tmp,
				3448	path->slots[0]);
				3449	if (key_type == tmp.type)
				3450	first_offset = max(min_offset, tmp.offset) + 1;
				3451	}
				3452	goto done;
				3453	}
				3454
				3455	/* go backward to find any previous key */
				3456	ret = btrfs_previous_item(root, path, ino, key_type);
				3457	if (ret == 0) {
				3458	struct btrfs_key tmp;
				3459	btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
				3460	if (key_type == tmp.type) {
				3461	first_offset = tmp.offset;
				3462	ret = overwrite_item(trans, log, dst_path,
				3463	path->nodes[0], path->slots[0],
				3464	&tmp);
				3465	if (ret) {
				3466	err = ret;
				3467	goto done;
				3468	}
				3469	}
				3470	}
				3471	btrfs_release_path(path);
				3472
				3473	/*
				3474	* Find the first key from this transaction again. See the note for
				3475	* log_new_dir_dentries, if we're logging a directory recursively we
				3476	* won't be holding its i_mutex, which means we can modify the directory
				3477	* while we're logging it. If we remove an entry between our first
				3478	* search and this search we'll not find the key again and can just
				3479	* bail.
				3480	*/
				3481	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
				3482	if (ret != 0)
				3483	goto done;
				3484
				3485	/*
				3486	* we have a block from this transaction, log every item in it
				3487	* from our directory
				3488	*/
				3489	while (1) {
				3490	struct btrfs_key tmp;
				3491	src = path->nodes[0];
				3492	nritems = btrfs_header_nritems(src);
				3493	for (i = path->slots[0]; i < nritems; i++) {
				3494	struct btrfs_dir_item *di;
				3495
				3496	btrfs_item_key_to_cpu(src, &min_key, i);
				3497
				3498	if (min_key.objectid != ino \|\| min_key.type != key_type)
				3499	goto done;
				3500	ret = overwrite_item(trans, log, dst_path, src, i,
				3501	&min_key);
				3502	if (ret) {
				3503	err = ret;
				3504	goto done;
				3505	}
				3506
				3507	/*
				3508	* We must make sure that when we log a directory entry,
				3509	* the corresponding inode, after log replay, has a
				3510	* matching link count. For example:
				3511	*
				3512	* touch foo
				3513	* mkdir mydir
				3514	* sync
				3515	* ln foo mydir/bar
				3516	* xfs_io -c "fsync" mydir
				3517	* <crash>
				3518	* <mount fs and log replay>
				3519	*
				3520	* Would result in a fsync log that when replayed, our
				3521	* file inode would have a link count of 1, but we get
				3522	* two directory entries pointing to the same inode.
				3523	* After removing one of the names, it would not be
				3524	* possible to remove the other name, which resulted
				3525	* always in stale file handle errors, and would not
				3526	* be possible to rmdir the parent directory, since
				3527	* its i_size could never decrement to the value
				3528	* BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
				3529	*/
				3530	di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
				3531	btrfs_dir_item_key_to_cpu(src, di, &tmp);
				3532	if (ctx &&
				3533	(btrfs_dir_transid(src, di) == trans->transid \|\|
				3534	btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
				3535	tmp.type != BTRFS_ROOT_ITEM_KEY)
				3536	ctx->log_new_dentries = true;
				3537	}
				3538	path->slots[0] = nritems;
				3539
				3540	/*
				3541	* look ahead to the next item and see if it is also
				3542	* from this directory and from this transaction
				3543	*/
				3544	ret = btrfs_next_leaf(root, path);
				3545	if (ret) {
				3546	if (ret == 1)
				3547	last_offset = (u64)-1;
				3548	else
				3549	err = ret;
				3550	goto done;
				3551	}
				3552	btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
				3553	if (tmp.objectid != ino \|\| tmp.type != key_type) {
				3554	last_offset = (u64)-1;
				3555	goto done;
				3556	}
				3557	if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
				3558	ret = overwrite_item(trans, log, dst_path,
				3559	path->nodes[0], path->slots[0],
				3560	&tmp);
				3561	if (ret)
				3562	err = ret;
				3563	else
				3564	last_offset = tmp.offset;
				3565	goto done;
				3566	}
				3567	}
				3568	done:
				3569	btrfs_release_path(path);
				3570	btrfs_release_path(dst_path);
				3571
				3572	if (err == 0) {
				3573	*last_offset_ret = last_offset;
				3574	/*
				3575	* insert the log range keys to indicate where the log
				3576	* is valid
				3577	*/
				3578	ret = insert_dir_log_key(trans, log, path, key_type,
				3579	ino, first_offset, last_offset);
				3580	if (ret)
				3581	err = ret;
				3582	}
				3583	return err;
				3584	}
				3585
				3586	/*
				3587	* logging directories is very similar to logging inodes, We find all the items
				3588	* from the current transaction and write them to the log.
				3589	*
				3590	* The recovery code scans the directory in the subvolume, and if it finds a
				3591	* key in the range logged that is not present in the log tree, then it means
				3592	* that dir entry was unlinked during the transaction.
				3593	*
				3594	* In order for that scan to work, we must include one key smaller than
				3595	* the smallest logged by this transaction and one key larger than the largest
				3596	* key logged by this transaction.
				3597	*/
				3598	static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
				3599	struct btrfs_root root, struct btrfs_inode inode,
				3600	struct btrfs_path *path,
				3601	struct btrfs_path *dst_path,
				3602	struct btrfs_log_ctx *ctx)
				3603	{
				3604	u64 min_key;
				3605	u64 max_key;
				3606	int ret;
				3607	int key_type = BTRFS_DIR_ITEM_KEY;
				3608
				3609	again:
				3610	min_key = 0;
				3611	max_key = 0;
				3612	while (1) {
				3613	ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
				3614	ctx, min_key, &max_key);
				3615	if (ret)
				3616	return ret;
				3617	if (max_key == (u64)-1)
				3618	break;
				3619	min_key = max_key + 1;
				3620	}
				3621
				3622	if (key_type == BTRFS_DIR_ITEM_KEY) {
				3623	key_type = BTRFS_DIR_INDEX_KEY;
				3624	goto again;
				3625	}
				3626	return 0;
				3627	}
				3628
				3629	/*
				3630	* a helper function to drop items from the log before we relog an
				3631	* inode. max_key_type indicates the highest item type to remove.
				3632	* This cannot be run for file data extents because it does not
				3633	* free the extents they point to.
				3634	*/
				3635	static int drop_objectid_items(struct btrfs_trans_handle *trans,
				3636	struct btrfs_root *log,
				3637	struct btrfs_path *path,
				3638	u64 objectid, int max_key_type)
				3639	{
				3640	int ret;
				3641	struct btrfs_key key;
				3642	struct btrfs_key found_key;
				3643	int start_slot;
				3644
				3645	key.objectid = objectid;
				3646	key.type = max_key_type;
				3647	key.offset = (u64)-1;
				3648
				3649	while (1) {
				3650	ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
				3651	BUG_ON(ret == 0); /* Logic error */
				3652	if (ret < 0)
				3653	break;
				3654
				3655	if (path->slots[0] == 0)
				3656	break;
				3657
				3658	path->slots[0]--;
				3659	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				3660	path->slots[0]);
				3661
				3662	if (found_key.objectid != objectid)
				3663	break;
				3664
				3665	found_key.offset = 0;
				3666	found_key.type = 0;
				3667	ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
				3668	&start_slot);
				3669
				3670	ret = btrfs_del_items(trans, log, path, start_slot,
				3671	path->slots[0] - start_slot + 1);
				3672	/*
				3673	* If start slot isn't 0 then we don't need to re-search, we've
				3674	* found the last guy with the objectid in this tree.
				3675	*/
				3676	if (ret \|\| start_slot != 0)
				3677	break;
				3678	btrfs_release_path(path);
				3679	}
				3680	btrfs_release_path(path);
				3681	if (ret > 0)
				3682	ret = 0;
				3683	return ret;
				3684	}
				3685
				3686	static void fill_inode_item(struct btrfs_trans_handle *trans,
				3687	struct extent_buffer *leaf,
				3688	struct btrfs_inode_item *item,
				3689	struct inode *inode, int log_inode_only,
				3690	u64 logged_isize)
				3691	{
				3692	struct btrfs_map_token token;
				3693
				3694	btrfs_init_map_token(&token);
				3695
				3696	if (log_inode_only) {
				3697	/* set the generation to zero so the recover code
				3698	* can tell the difference between an logging
				3699	* just to say 'this inode exists' and a logging
				3700	* to say 'update this inode with these values'
				3701	*/
				3702	btrfs_set_token_inode_generation(leaf, item, 0, &token);
				3703	btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
				3704	} else {
				3705	btrfs_set_token_inode_generation(leaf, item,
				3706	BTRFS_I(inode)->generation,
				3707	&token);
				3708	btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
				3709	}
				3710
				3711	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
				3712	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
				3713	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
				3714	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
				3715
				3716	btrfs_set_token_timespec_sec(leaf, &item->atime,
				3717	inode->i_atime.tv_sec, &token);
				3718	btrfs_set_token_timespec_nsec(leaf, &item->atime,
				3719	inode->i_atime.tv_nsec, &token);
				3720
				3721	btrfs_set_token_timespec_sec(leaf, &item->mtime,
				3722	inode->i_mtime.tv_sec, &token);
				3723	btrfs_set_token_timespec_nsec(leaf, &item->mtime,
				3724	inode->i_mtime.tv_nsec, &token);
				3725
				3726	btrfs_set_token_timespec_sec(leaf, &item->ctime,
				3727	inode->i_ctime.tv_sec, &token);
				3728	btrfs_set_token_timespec_nsec(leaf, &item->ctime,
				3729	inode->i_ctime.tv_nsec, &token);
				3730
				3731	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
				3732	&token);
				3733
				3734	btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
				3735	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
				3736	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
				3737	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
				3738	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
				3739	}
				3740
				3741	static int log_inode_item(struct btrfs_trans_handle *trans,
				3742	struct btrfs_root log, struct btrfs_path path,
				3743	struct btrfs_inode *inode)
				3744	{
				3745	struct btrfs_inode_item *inode_item;
				3746	int ret;
				3747
				3748	ret = btrfs_insert_empty_item(trans, log, path,
				3749	&inode->location, sizeof(*inode_item));
				3750	if (ret && ret != -EEXIST)
				3751	return ret;
				3752	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				3753	struct btrfs_inode_item);
				3754	fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
				3755	0, 0);
				3756	btrfs_release_path(path);
				3757	return 0;
				3758	}
				3759
				3760	static noinline int copy_items(struct btrfs_trans_handle *trans,
				3761	struct btrfs_inode *inode,
				3762	struct btrfs_path *dst_path,
				3763	struct btrfs_path *src_path,
				3764	int start_slot, int nr, int inode_only,
				3765	u64 logged_isize)
				3766	{
				3767	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
				3768	unsigned long src_offset;
				3769	unsigned long dst_offset;
				3770	struct btrfs_root *log = inode->root->log_root;
				3771	struct btrfs_file_extent_item *extent;
				3772	struct btrfs_inode_item *inode_item;
				3773	struct extent_buffer *src = src_path->nodes[0];
				3774	int ret;
				3775	struct btrfs_key *ins_keys;
				3776	u32 *ins_sizes;
				3777	char *ins_data;
				3778	int i;
				3779	struct list_head ordered_sums;
				3780	int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
				3781
				3782	INIT_LIST_HEAD(&ordered_sums);
				3783
				3784	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
				3785	nr * sizeof(u32), GFP_NOFS);
				3786	if (!ins_data)
				3787	return -ENOMEM;
				3788
				3789	ins_sizes = (u32 *)ins_data;
				3790	ins_keys = (struct btrfs_key )(ins_data + nr sizeof(u32));
				3791
				3792	for (i = 0; i < nr; i++) {
				3793	ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
				3794	btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
				3795	}
				3796	ret = btrfs_insert_empty_items(trans, log, dst_path,
				3797	ins_keys, ins_sizes, nr);
				3798	if (ret) {
				3799	kfree(ins_data);
				3800	return ret;
				3801	}
				3802
				3803	for (i = 0; i < nr; i++, dst_path->slots[0]++) {
				3804	dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
				3805	dst_path->slots[0]);
				3806
				3807	src_offset = btrfs_item_ptr_offset(src, start_slot + i);
				3808
				3809	if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
				3810	inode_item = btrfs_item_ptr(dst_path->nodes[0],
				3811	dst_path->slots[0],
				3812	struct btrfs_inode_item);
				3813	fill_inode_item(trans, dst_path->nodes[0], inode_item,
				3814	&inode->vfs_inode,
				3815	inode_only == LOG_INODE_EXISTS,
				3816	logged_isize);
				3817	} else {
				3818	copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
				3819	src_offset, ins_sizes[i]);
				3820	}
				3821
				3822	/* take a reference on file data extents so that truncates
				3823	* or deletes of this inode don't have to relog the inode
				3824	* again
				3825	*/
				3826	if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
				3827	!skip_csum) {
				3828	int found_type;
				3829	extent = btrfs_item_ptr(src, start_slot + i,
				3830	struct btrfs_file_extent_item);
				3831
				3832	if (btrfs_file_extent_generation(src, extent) < trans->transid)
				3833	continue;
				3834
				3835	found_type = btrfs_file_extent_type(src, extent);
				3836	if (found_type == BTRFS_FILE_EXTENT_REG) {
				3837	u64 ds, dl, cs, cl;
				3838	ds = btrfs_file_extent_disk_bytenr(src,
				3839	extent);
				3840	/* ds == 0 is a hole */
				3841	if (ds == 0)
				3842	continue;
				3843
				3844	dl = btrfs_file_extent_disk_num_bytes(src,
				3845	extent);
				3846	cs = btrfs_file_extent_offset(src, extent);
				3847	cl = btrfs_file_extent_num_bytes(src,
				3848	extent);
				3849	if (btrfs_file_extent_compression(src,
				3850	extent)) {
				3851	cs = 0;
				3852	cl = dl;
				3853	}
				3854
				3855	ret = btrfs_lookup_csums_range(
				3856	fs_info->csum_root,
				3857	ds + cs, ds + cs + cl - 1,
				3858	&ordered_sums, 0);
				3859	if (ret)
				3860	break;
				3861	}
				3862	}
				3863	}
				3864
				3865	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
				3866	btrfs_release_path(dst_path);
				3867	kfree(ins_data);
				3868
				3869	/*
				3870	* we have to do this after the loop above to avoid changing the
				3871	* log tree while trying to change the log tree.
				3872	*/
				3873	while (!list_empty(&ordered_sums)) {
				3874	struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
				3875	struct btrfs_ordered_sum,
				3876	list);
				3877	if (!ret)
				3878	ret = btrfs_csum_file_blocks(trans, log, sums);
				3879	list_del(&sums->list);
				3880	kfree(sums);
				3881	}
				3882
				3883	return ret;
				3884	}
				3885
				3886	static int extent_cmp(void priv, struct list_head a, struct list_head *b)
				3887	{
				3888	struct extent_map em1, em2;
				3889
				3890	em1 = list_entry(a, struct extent_map, list);
				3891	em2 = list_entry(b, struct extent_map, list);
				3892
				3893	if (em1->start < em2->start)
				3894	return -1;
				3895	else if (em1->start > em2->start)
				3896	return 1;
				3897	return 0;
				3898	}
				3899
				3900	static int wait_ordered_extents(struct btrfs_trans_handle *trans,
				3901	struct inode *inode,
				3902	struct btrfs_root *root,
				3903	const struct extent_map *em,
				3904	const struct list_head *logged_list,
				3905	bool *ordered_io_error)
				3906	{
				3907	struct btrfs_fs_info *fs_info = root->fs_info;
				3908	struct btrfs_ordered_extent *ordered;
				3909	struct btrfs_root *log = root->log_root;
				3910	u64 mod_start = em->mod_start;
				3911	u64 mod_len = em->mod_len;
				3912	const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
				3913	u64 csum_offset;
				3914	u64 csum_len;
				3915	LIST_HEAD(ordered_sums);
				3916	int ret = 0;
				3917
				3918	*ordered_io_error = false;
				3919
				3920	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) \|\|
				3921	em->block_start == EXTENT_MAP_HOLE)
				3922	return 0;
				3923
				3924	/*
				3925	* Wait far any ordered extent that covers our extent map. If it
				3926	* finishes without an error, first check and see if our csums are on
				3927	* our outstanding ordered extents.
				3928	*/
				3929	list_for_each_entry(ordered, logged_list, log_list) {
				3930	struct btrfs_ordered_sum *sum;
				3931
				3932	if (!mod_len)
				3933	break;
				3934
				3935	if (ordered->file_offset + ordered->len <= mod_start \|\|
				3936	mod_start + mod_len <= ordered->file_offset)
				3937	continue;
				3938
				3939	if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
				3940	!test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
				3941	!test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
				3942	const u64 start = ordered->file_offset;
				3943	const u64 end = ordered->file_offset + ordered->len - 1;
				3944
				3945	WARN_ON(ordered->inode != inode);
				3946	filemap_fdatawrite_range(inode->i_mapping, start, end);
				3947	}
				3948
				3949	wait_event(ordered->wait,
				3950	(test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) \|\|
				3951	test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
				3952
				3953	if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
				3954	/*
				3955	* Clear the AS_EIO/AS_ENOSPC flags from the inode's
				3956	* i_mapping flags, so that the next fsync won't get
				3957	* an outdated io error too.
				3958	*/
				3959	filemap_check_errors(inode->i_mapping);
				3960	*ordered_io_error = true;
				3961	break;
				3962	}
				3963	/*
				3964	* We are going to copy all the csums on this ordered extent, so
				3965	* go ahead and adjust mod_start and mod_len in case this
				3966	* ordered extent has already been logged.
				3967	*/
				3968	if (ordered->file_offset > mod_start) {
				3969	if (ordered->file_offset + ordered->len >=
				3970	mod_start + mod_len)
				3971	mod_len = ordered->file_offset - mod_start;
				3972	/*
				3973	* If we have this case
				3974	*
				3975	* \|--------- logged extent ---------\|
				3976	* \|----- ordered extent ----\|
				3977	*
				3978	* Just don't mess with mod_start and mod_len, we'll
				3979	* just end up logging more csums than we need and it
				3980	* will be ok.
				3981	*/
				3982	} else {
				3983	if (ordered->file_offset + ordered->len <
				3984	mod_start + mod_len) {
				3985	mod_len = (mod_start + mod_len) -
				3986	(ordered->file_offset + ordered->len);
				3987	mod_start = ordered->file_offset +
				3988	ordered->len;
				3989	} else {
				3990	mod_len = 0;
				3991	}
				3992	}
				3993
				3994	if (skip_csum)
				3995	continue;
				3996
				3997	/*
				3998	* To keep us from looping for the above case of an ordered
				3999	* extent that falls inside of the logged extent.
				4000	*/
				4001	if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
				4002	&ordered->flags))
				4003	continue;
				4004
				4005	list_for_each_entry(sum, &ordered->list, list) {
				4006	ret = btrfs_csum_file_blocks(trans, log, sum);
				4007	if (ret)
				4008	break;
				4009	}
				4010	}
				4011
				4012	if (*ordered_io_error \|\| !mod_len \|\| ret \|\| skip_csum)
				4013	return ret;
				4014
				4015	if (em->compress_type) {
				4016	csum_offset = 0;
				4017	csum_len = max(em->block_len, em->orig_block_len);
				4018	} else {
				4019	csum_offset = mod_start - em->start;
				4020	csum_len = mod_len;
				4021	}
				4022
				4023	/* block start is already adjusted for the file extent offset. */
				4024	ret = btrfs_lookup_csums_range(fs_info->csum_root,
				4025	em->block_start + csum_offset,
				4026	em->block_start + csum_offset +
				4027	csum_len - 1, &ordered_sums, 0);
				4028	if (ret)
				4029	return ret;
				4030
				4031	while (!list_empty(&ordered_sums)) {
				4032	struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
				4033	struct btrfs_ordered_sum,
				4034	list);
				4035	if (!ret)
				4036	ret = btrfs_csum_file_blocks(trans, log, sums);
				4037	list_del(&sums->list);
				4038	kfree(sums);
				4039	}
				4040
				4041	return ret;
				4042	}
				4043
				4044	static int log_one_extent(struct btrfs_trans_handle *trans,
				4045	struct btrfs_inode inode, struct btrfs_root root,
				4046	const struct extent_map *em,
				4047	struct btrfs_path *path,
				4048	const struct list_head *logged_list,
				4049	struct btrfs_log_ctx *ctx)
				4050	{
				4051	struct btrfs_root *log = root->log_root;
				4052	struct btrfs_file_extent_item *fi;
				4053	struct extent_buffer *leaf;
				4054	struct btrfs_map_token token;
				4055	struct btrfs_key key;
				4056	u64 extent_offset = em->start - em->orig_start;
				4057	u64 block_len;
				4058	int ret;
				4059	int extent_inserted = 0;
				4060	bool ordered_io_err = false;
				4061
				4062	ret = wait_ordered_extents(trans, &inode->vfs_inode, root, em,
				4063	logged_list, &ordered_io_err);
				4064	if (ret)
				4065	return ret;
				4066
				4067	if (ordered_io_err) {
				4068	ctx->io_err = -EIO;
				4069	return ctx->io_err;
				4070	}
				4071
				4072	btrfs_init_map_token(&token);
				4073
				4074	ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start,
				4075	em->start + em->len, NULL, 0, 1,
				4076	sizeof(*fi), &extent_inserted);
				4077	if (ret)
				4078	return ret;
				4079
				4080	if (!extent_inserted) {
				4081	key.objectid = btrfs_ino(inode);
				4082	key.type = BTRFS_EXTENT_DATA_KEY;
				4083	key.offset = em->start;
				4084
				4085	ret = btrfs_insert_empty_item(trans, log, path, &key,
				4086	sizeof(*fi));
				4087	if (ret)
				4088	return ret;
				4089	}
				4090	leaf = path->nodes[0];
				4091	fi = btrfs_item_ptr(leaf, path->slots[0],
				4092	struct btrfs_file_extent_item);
				4093
				4094	btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
				4095	&token);
				4096	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
				4097	btrfs_set_token_file_extent_type(leaf, fi,
				4098	BTRFS_FILE_EXTENT_PREALLOC,
				4099	&token);
				4100	else
				4101	btrfs_set_token_file_extent_type(leaf, fi,
				4102	BTRFS_FILE_EXTENT_REG,
				4103	&token);
				4104
				4105	block_len = max(em->block_len, em->orig_block_len);
				4106	if (em->compress_type != BTRFS_COMPRESS_NONE) {
				4107	btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
				4108	em->block_start,
				4109	&token);
				4110	btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
				4111	&token);
				4112	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
				4113	btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
				4114	em->block_start -
				4115	extent_offset, &token);
				4116	btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
				4117	&token);
				4118	} else {
				4119	btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
				4120	btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
				4121	&token);
				4122	}
				4123
				4124	btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
				4125	btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
				4126	btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
				4127	btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
				4128	&token);
				4129	btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
				4130	btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
				4131	btrfs_mark_buffer_dirty(leaf);
				4132
				4133	btrfs_release_path(path);
				4134
				4135	return ret;
				4136	}
				4137
				4138	/*
				4139	* Log all prealloc extents beyond the inode's i_size to make sure we do not
				4140	* lose them after doing a fast fsync and replaying the log. We scan the
				4141	* subvolume's root instead of iterating the inode's extent map tree because
				4142	* otherwise we can log incorrect extent items based on extent map conversion.
				4143	* That can happen due to the fact that extent maps are merged when they
				4144	* are not in the extent map tree's list of modified extents.
				4145	*/
				4146	static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
				4147	struct btrfs_inode *inode,
				4148	struct btrfs_path *path)
				4149	{
				4150	struct btrfs_root *root = inode->root;
				4151	struct btrfs_key key;
				4152	const u64 i_size = i_size_read(&inode->vfs_inode);
				4153	const u64 ino = btrfs_ino(inode);
				4154	struct btrfs_path *dst_path = NULL;
				4155	bool dropped_extents = false;
				4156	u64 truncate_offset = i_size;
				4157	struct extent_buffer *leaf;
				4158	int slot;
				4159	int ins_nr = 0;
				4160	int start_slot;
				4161	int ret;
				4162
				4163	if (!(inode->flags & BTRFS_INODE_PREALLOC))
				4164	return 0;
				4165
				4166	key.objectid = ino;
				4167	key.type = BTRFS_EXTENT_DATA_KEY;
				4168	key.offset = i_size;
				4169	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				4170	if (ret < 0)
				4171	goto out;
				4172
				4173	/*
				4174	* We must check if there is a prealloc extent that starts before the
				4175	* i_size and crosses the i_size boundary. This is to ensure later we
				4176	* truncate down to the end of that extent and not to the i_size, as
				4177	* otherwise we end up losing part of the prealloc extent after a log
				4178	* replay and with an implicit hole if there is another prealloc extent
				4179	* that starts at an offset beyond i_size.
				4180	*/
				4181	ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
				4182	if (ret < 0)
				4183	goto out;
				4184
				4185	if (ret == 0) {
				4186	struct btrfs_file_extent_item *ei;
				4187
				4188	leaf = path->nodes[0];
				4189	slot = path->slots[0];
				4190	ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
				4191
				4192	if (btrfs_file_extent_type(leaf, ei) ==
				4193	BTRFS_FILE_EXTENT_PREALLOC) {
				4194	u64 extent_end;
				4195
				4196	btrfs_item_key_to_cpu(leaf, &key, slot);
				4197	extent_end = key.offset +
				4198	btrfs_file_extent_num_bytes(leaf, ei);
				4199
				4200	if (extent_end > i_size)
				4201	truncate_offset = extent_end;
				4202	}
				4203	} else {
				4204	ret = 0;
				4205	}
				4206
				4207	while (true) {
				4208	leaf = path->nodes[0];
				4209	slot = path->slots[0];
				4210
				4211	if (slot >= btrfs_header_nritems(leaf)) {
				4212	if (ins_nr > 0) {
				4213	ret = copy_items(trans, inode, dst_path, path,
				4214	start_slot, ins_nr, 1, 0);
				4215	if (ret < 0)
				4216	goto out;
				4217	ins_nr = 0;
				4218	}
				4219	ret = btrfs_next_leaf(root, path);
				4220	if (ret < 0)
				4221	goto out;
				4222	if (ret > 0) {
				4223	ret = 0;
				4224	break;
				4225	}
				4226	continue;
				4227	}
				4228
				4229	btrfs_item_key_to_cpu(leaf, &key, slot);
				4230	if (key.objectid > ino)
				4231	break;
				4232	if (WARN_ON_ONCE(key.objectid < ino) \|\|
				4233	key.type < BTRFS_EXTENT_DATA_KEY \|\|
				4234	key.offset < i_size) {
				4235	path->slots[0]++;
				4236	continue;
				4237	}
				4238	if (!dropped_extents) {
				4239	/*
				4240	* Avoid logging extent items logged in past fsync calls
				4241	* and leading to duplicate keys in the log tree.
				4242	*/
				4243	do {
				4244	ret = btrfs_truncate_inode_items(trans,
				4245	root->log_root,
				4246	&inode->vfs_inode,
				4247	truncate_offset,
				4248	BTRFS_EXTENT_DATA_KEY);
				4249	} while (ret == -EAGAIN);
				4250	if (ret)
				4251	goto out;
				4252	dropped_extents = true;
				4253	}
				4254	if (ins_nr == 0)
				4255	start_slot = slot;
				4256	ins_nr++;
				4257	path->slots[0]++;
				4258	if (!dst_path) {
				4259	dst_path = btrfs_alloc_path();
				4260	if (!dst_path) {
				4261	ret = -ENOMEM;
				4262	goto out;
				4263	}
				4264	}
				4265	}
				4266	if (ins_nr > 0) {
				4267	ret = copy_items(trans, inode, dst_path, path,
				4268	start_slot, ins_nr, 1, 0);
				4269	if (ret > 0)
				4270	ret = 0;
				4271	}
				4272	out:
				4273	btrfs_release_path(path);
				4274	btrfs_free_path(dst_path);
				4275	return ret;
				4276	}
				4277
				4278	static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
				4279	struct btrfs_root *root,
				4280	struct btrfs_inode *inode,
				4281	struct btrfs_path *path,
				4282	struct list_head *logged_list,
				4283	struct btrfs_log_ctx *ctx,
				4284	const u64 start,
				4285	const u64 end)
				4286	{
				4287	struct extent_map em, n;
				4288	struct list_head extents;
				4289	struct extent_map_tree *tree = &inode->extent_tree;
				4290	u64 logged_start, logged_end;
				4291	u64 test_gen;
				4292	int ret = 0;
				4293	int num = 0;
				4294
				4295	INIT_LIST_HEAD(&extents);
				4296
				4297	write_lock(&tree->lock);
				4298	test_gen = root->fs_info->last_trans_committed;
				4299	logged_start = start;
				4300	logged_end = end;
				4301
				4302	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
				4303	list_del_init(&em->list);
				4304	/*
				4305	* Just an arbitrary number, this can be really CPU intensive
				4306	* once we start getting a lot of extents, and really once we
				4307	* have a bunch of extents we just want to commit since it will
				4308	* be faster.
				4309	*/
				4310	if (++num > 32768) {
				4311	list_del_init(&tree->modified_extents);
				4312	ret = -EFBIG;
				4313	goto process;
				4314	}
				4315
				4316	if (em->generation <= test_gen)
				4317	continue;
				4318
				4319	/* We log prealloc extents beyond eof later. */
				4320	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
				4321	em->start >= i_size_read(&inode->vfs_inode))
				4322	continue;
				4323
				4324	if (em->start < logged_start)
				4325	logged_start = em->start;
				4326	if ((em->start + em->len - 1) > logged_end)
				4327	logged_end = em->start + em->len - 1;
				4328
				4329	/* Need a ref to keep it from getting evicted from cache */
				4330	refcount_inc(&em->refs);
				4331	set_bit(EXTENT_FLAG_LOGGING, &em->flags);
				4332	list_add_tail(&em->list, &extents);
				4333	num++;
				4334	}
				4335
				4336	list_sort(NULL, &extents, extent_cmp);
				4337	btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end);
				4338	/*
				4339	* Some ordered extents started by fsync might have completed
				4340	* before we could collect them into the list logged_list, which
				4341	* means they're gone, not in our logged_list nor in the inode's
				4342	* ordered tree. We want the application/user space to know an
				4343	* error happened while attempting to persist file data so that
				4344	* it can take proper action. If such error happened, we leave
				4345	* without writing to the log tree and the fsync must report the
				4346	* file data write error and not commit the current transaction.
				4347	*/
				4348	ret = filemap_check_errors(inode->vfs_inode.i_mapping);
				4349	if (ret)
				4350	ctx->io_err = ret;
				4351	process:
				4352	while (!list_empty(&extents)) {
				4353	em = list_entry(extents.next, struct extent_map, list);
				4354
				4355	list_del_init(&em->list);
				4356
				4357	/*
				4358	* If we had an error we just need to delete everybody from our
				4359	* private list.
				4360	*/
				4361	if (ret) {
				4362	clear_em_logging(tree, em);
				4363	free_extent_map(em);
				4364	continue;
				4365	}
				4366
				4367	write_unlock(&tree->lock);
				4368
				4369	ret = log_one_extent(trans, inode, root, em, path, logged_list,
				4370	ctx);
				4371	write_lock(&tree->lock);
				4372	clear_em_logging(tree, em);
				4373	free_extent_map(em);
				4374	}
				4375	WARN_ON(!list_empty(&extents));
				4376	write_unlock(&tree->lock);
				4377
				4378	btrfs_release_path(path);
				4379	if (!ret)
				4380	ret = btrfs_log_prealloc_extents(trans, inode, path);
				4381
				4382	return ret;
				4383	}
				4384
				4385	static int logged_inode_size(struct btrfs_root log, struct btrfs_inode inode,
				4386	struct btrfs_path path, u64 size_ret)
				4387	{
				4388	struct btrfs_key key;
				4389	int ret;
				4390
				4391	key.objectid = btrfs_ino(inode);
				4392	key.type = BTRFS_INODE_ITEM_KEY;
				4393	key.offset = 0;
				4394
				4395	ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
				4396	if (ret < 0) {
				4397	return ret;
				4398	} else if (ret > 0) {
				4399	*size_ret = 0;
				4400	} else {
				4401	struct btrfs_inode_item *item;
				4402
				4403	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				4404	struct btrfs_inode_item);
				4405	*size_ret = btrfs_inode_size(path->nodes[0], item);
				4406	/*
				4407	* If the in-memory inode's i_size is smaller then the inode
				4408	* size stored in the btree, return the inode's i_size, so
				4409	* that we get a correct inode size after replaying the log
				4410	* when before a power failure we had a shrinking truncate
				4411	* followed by addition of a new name (rename / new hard link).
				4412	* Otherwise return the inode size from the btree, to avoid
				4413	* data loss when replaying a log due to previously doing a
				4414	* write that expands the inode's size and logging a new name
				4415	* immediately after.
				4416	*/
				4417	if (*size_ret > inode->vfs_inode.i_size)
				4418	*size_ret = inode->vfs_inode.i_size;
				4419	}
				4420
				4421	btrfs_release_path(path);
				4422	return 0;
				4423	}
				4424
				4425	/*
				4426	* At the moment we always log all xattrs. This is to figure out at log replay
				4427	* time which xattrs must have their deletion replayed. If a xattr is missing
				4428	* in the log tree and exists in the fs/subvol tree, we delete it. This is
				4429	* because if a xattr is deleted, the inode is fsynced and a power failure
				4430	* happens, causing the log to be replayed the next time the fs is mounted,
				4431	* we want the xattr to not exist anymore (same behaviour as other filesystems
				4432	* with a journal, ext3/4, xfs, f2fs, etc).
				4433	*/
				4434	static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
				4435	struct btrfs_root *root,
				4436	struct btrfs_inode *inode,
				4437	struct btrfs_path *path,
				4438	struct btrfs_path *dst_path)
				4439	{
				4440	int ret;
				4441	struct btrfs_key key;
				4442	const u64 ino = btrfs_ino(inode);
				4443	int ins_nr = 0;
				4444	int start_slot = 0;
				4445
				4446	key.objectid = ino;
				4447	key.type = BTRFS_XATTR_ITEM_KEY;
				4448	key.offset = 0;
				4449
				4450	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				4451	if (ret < 0)
				4452	return ret;
				4453
				4454	while (true) {
				4455	int slot = path->slots[0];
				4456	struct extent_buffer *leaf = path->nodes[0];
				4457	int nritems = btrfs_header_nritems(leaf);
				4458
				4459	if (slot >= nritems) {
				4460	if (ins_nr > 0) {
				4461	ret = copy_items(trans, inode, dst_path, path,
				4462	start_slot, ins_nr, 1, 0);
				4463	if (ret < 0)
				4464	return ret;
				4465	ins_nr = 0;
				4466	}
				4467	ret = btrfs_next_leaf(root, path);
				4468	if (ret < 0)
				4469	return ret;
				4470	else if (ret > 0)
				4471	break;
				4472	continue;
				4473	}
				4474
				4475	btrfs_item_key_to_cpu(leaf, &key, slot);
				4476	if (key.objectid != ino \|\| key.type != BTRFS_XATTR_ITEM_KEY)
				4477	break;
				4478
				4479	if (ins_nr == 0)
				4480	start_slot = slot;
				4481	ins_nr++;
				4482	path->slots[0]++;
				4483	cond_resched();
				4484	}
				4485	if (ins_nr > 0) {
				4486	ret = copy_items(trans, inode, dst_path, path,
				4487	start_slot, ins_nr, 1, 0);
				4488	if (ret < 0)
				4489	return ret;
				4490	}
				4491
				4492	return 0;
				4493	}
				4494
				4495	/*
				4496	* When using the NO_HOLES feature if we punched a hole that causes the
				4497	* deletion of entire leafs or all the extent items of the first leaf (the one
				4498	* that contains the inode item and references) we may end up not processing
				4499	* any extents, because there are no leafs with a generation matching the
				4500	* current transaction that have extent items for our inode. So we need to find
				4501	* if any holes exist and then log them. We also need to log holes after any
				4502	* truncate operation that changes the inode's size.
				4503	*/
				4504	static int btrfs_log_holes(struct btrfs_trans_handle *trans,
				4505	struct btrfs_root *root,
				4506	struct btrfs_inode *inode,
				4507	struct btrfs_path *path)
				4508	{
				4509	struct btrfs_fs_info *fs_info = root->fs_info;
				4510	struct btrfs_key key;
				4511	const u64 ino = btrfs_ino(inode);
				4512	const u64 i_size = i_size_read(&inode->vfs_inode);
				4513	u64 prev_extent_end = 0;
				4514	int ret;
				4515
				4516	if (!btrfs_fs_incompat(fs_info, NO_HOLES) \|\| i_size == 0)
				4517	return 0;
				4518
				4519	key.objectid = ino;
				4520	key.type = BTRFS_EXTENT_DATA_KEY;
				4521	key.offset = 0;
				4522
				4523	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				4524	if (ret < 0)
				4525	return ret;
				4526
				4527	while (true) {
				4528	struct btrfs_file_extent_item *extent;
				4529	struct extent_buffer *leaf = path->nodes[0];
				4530	u64 len;
				4531
				4532	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
				4533	ret = btrfs_next_leaf(root, path);
				4534	if (ret < 0)
				4535	return ret;
				4536	if (ret > 0) {
				4537	ret = 0;
				4538	break;
				4539	}
				4540	leaf = path->nodes[0];
				4541	}
				4542
				4543	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				4544	if (key.objectid != ino \|\| key.type != BTRFS_EXTENT_DATA_KEY)
				4545	break;
				4546
				4547	/* We have a hole, log it. */
				4548	if (prev_extent_end < key.offset) {
				4549	const u64 hole_len = key.offset - prev_extent_end;
				4550
				4551	/*
				4552	* Release the path to avoid deadlocks with other code
				4553	* paths that search the root while holding locks on
				4554	* leafs from the log root.
				4555	*/
				4556	btrfs_release_path(path);
				4557	ret = btrfs_insert_file_extent(trans, root->log_root,
				4558	ino, prev_extent_end, 0,
				4559	0, hole_len, 0, hole_len,
				4560	0, 0, 0);
				4561	if (ret < 0)
				4562	return ret;
				4563
				4564	/*
				4565	* Search for the same key again in the root. Since it's
				4566	* an extent item and we are holding the inode lock, the
				4567	* key must still exist. If it doesn't just emit warning
				4568	* and return an error to fall back to a transaction
				4569	* commit.
				4570	*/
				4571	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				4572	if (ret < 0)
				4573	return ret;
				4574	if (WARN_ON(ret > 0))
				4575	return -ENOENT;
				4576	leaf = path->nodes[0];
				4577	}
				4578
				4579	extent = btrfs_item_ptr(leaf, path->slots[0],
				4580	struct btrfs_file_extent_item);
				4581	if (btrfs_file_extent_type(leaf, extent) ==
				4582	BTRFS_FILE_EXTENT_INLINE) {
				4583	len = btrfs_file_extent_ram_bytes(leaf, extent);
				4584	prev_extent_end = ALIGN(key.offset + len,
				4585	fs_info->sectorsize);
				4586	} else {
				4587	len = btrfs_file_extent_num_bytes(leaf, extent);
				4588	prev_extent_end = key.offset + len;
				4589	}
				4590
				4591	path->slots[0]++;
				4592	cond_resched();
				4593	}
				4594
				4595	if (prev_extent_end < i_size) {
				4596	u64 hole_len;
				4597
				4598	btrfs_release_path(path);
				4599	hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
				4600	ret = btrfs_insert_file_extent(trans, root->log_root,
				4601	ino, prev_extent_end, 0, 0,
				4602	hole_len, 0, hole_len,
				4603	0, 0, 0);
				4604	if (ret < 0)
				4605	return ret;
				4606	}
				4607
				4608	return 0;
				4609	}
				4610
				4611	/*
				4612	* When we are logging a new inode X, check if it doesn't have a reference that
				4613	* matches the reference from some other inode Y created in a past transaction
				4614	* and that was renamed in the current transaction. If we don't do this, then at
				4615	* log replay time we can lose inode Y (and all its files if it's a directory):
				4616	*
				4617	* mkdir /mnt/x
				4618	* echo "hello world" > /mnt/x/foobar
				4619	* sync
				4620	* mv /mnt/x /mnt/y
				4621	* mkdir /mnt/x # or touch /mnt/x
				4622	* xfs_io -c fsync /mnt/x
				4623	* <power fail>
				4624	* mount fs, trigger log replay
				4625	*
				4626	* After the log replay procedure, we would lose the first directory and all its
				4627	* files (file foobar).
				4628	* For the case where inode Y is not a directory we simply end up losing it:
				4629	*
				4630	* echo "123" > /mnt/foo
				4631	* sync
				4632	* mv /mnt/foo /mnt/bar
				4633	* echo "abc" > /mnt/foo
				4634	* xfs_io -c fsync /mnt/foo
				4635	* <power fail>
				4636	*
				4637	* We also need this for cases where a snapshot entry is replaced by some other
				4638	* entry (file or directory) otherwise we end up with an unreplayable log due to
				4639	* attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
				4640	* if it were a regular entry:
				4641	*
				4642	* mkdir /mnt/x
				4643	* btrfs subvolume snapshot /mnt /mnt/x/snap
				4644	* btrfs subvolume delete /mnt/x/snap
				4645	* rmdir /mnt/x
				4646	* mkdir /mnt/x
				4647	* fsync /mnt/x or fsync some new file inside it
				4648	* <power fail>
				4649	*
				4650	* The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
				4651	* the same transaction.
				4652	*/
				4653	static int btrfs_check_ref_name_override(struct extent_buffer *eb,
				4654	const int slot,
				4655	const struct btrfs_key *key,
				4656	struct btrfs_inode *inode,
				4657	u64 *other_ino)
				4658	{
				4659	int ret;
				4660	struct btrfs_path *search_path;
				4661	char *name = NULL;
				4662	u32 name_len = 0;
				4663	u32 item_size = btrfs_item_size_nr(eb, slot);
				4664	u32 cur_offset = 0;
				4665	unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
				4666
				4667	search_path = btrfs_alloc_path();
				4668	if (!search_path)
				4669	return -ENOMEM;
				4670	search_path->search_commit_root = 1;
				4671	search_path->skip_locking = 1;
				4672
				4673	while (cur_offset < item_size) {
				4674	u64 parent;
				4675	u32 this_name_len;
				4676	u32 this_len;
				4677	unsigned long name_ptr;
				4678	struct btrfs_dir_item *di;
				4679
				4680	if (key->type == BTRFS_INODE_REF_KEY) {
				4681	struct btrfs_inode_ref *iref;
				4682
				4683	iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
				4684	parent = key->offset;
				4685	this_name_len = btrfs_inode_ref_name_len(eb, iref);
				4686	name_ptr = (unsigned long)(iref + 1);
				4687	this_len = sizeof(*iref) + this_name_len;
				4688	} else {
				4689	struct btrfs_inode_extref *extref;
				4690
				4691	extref = (struct btrfs_inode_extref *)(ptr +
				4692	cur_offset);
				4693	parent = btrfs_inode_extref_parent(eb, extref);
				4694	this_name_len = btrfs_inode_extref_name_len(eb, extref);
				4695	name_ptr = (unsigned long)&extref->name;
				4696	this_len = sizeof(*extref) + this_name_len;
				4697	}
				4698
				4699	ret = btrfs_is_name_len_valid(eb, slot, name_ptr,
				4700	this_name_len);
				4701	if (!ret) {
				4702	ret = -EIO;
				4703	goto out;
				4704	}
				4705	if (this_name_len > name_len) {
				4706	char *new_name;
				4707
				4708	new_name = krealloc(name, this_name_len, GFP_NOFS);
				4709	if (!new_name) {
				4710	ret = -ENOMEM;
				4711	goto out;
				4712	}
				4713	name_len = this_name_len;
				4714	name = new_name;
				4715	}
				4716
				4717	read_extent_buffer(eb, name, name_ptr, this_name_len);
				4718	di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
				4719	parent, name, this_name_len, 0);
				4720	if (di && !IS_ERR(di)) {
				4721	struct btrfs_key di_key;
				4722
				4723	btrfs_dir_item_key_to_cpu(search_path->nodes[0],
				4724	di, &di_key);
				4725	if (di_key.type == BTRFS_INODE_ITEM_KEY) {
				4726	ret = 1;
				4727	*other_ino = di_key.objectid;
				4728	} else {
				4729	ret = -EAGAIN;
				4730	}
				4731	goto out;
				4732	} else if (IS_ERR(di)) {
				4733	ret = PTR_ERR(di);
				4734	goto out;
				4735	}
				4736	btrfs_release_path(search_path);
				4737
				4738	cur_offset += this_len;
				4739	}
				4740	ret = 0;
				4741	out:
				4742	btrfs_free_path(search_path);
				4743	kfree(name);
				4744	return ret;
				4745	}
				4746
				4747	/* log a single inode in the tree log.
				4748	* At least one parent directory for this inode must exist in the tree
				4749	* or be logged already.
				4750	*
				4751	* Any items from this inode changed by the current transaction are copied
				4752	* to the log tree. An extra reference is taken on any extents in this
				4753	* file, allowing us to avoid a whole pile of corner cases around logging
				4754	* blocks that have been removed from the tree.
				4755	*
				4756	* See LOG_INODE_ALL and related defines for a description of what inode_only
				4757	* does.
				4758	*
				4759	* This handles both files and directories.
				4760	*/
				4761	static int btrfs_log_inode(struct btrfs_trans_handle *trans,
				4762	struct btrfs_root root, struct btrfs_inode inode,
				4763	int inode_only,
				4764	const loff_t start,
				4765	const loff_t end,
				4766	struct btrfs_log_ctx *ctx)
				4767	{
				4768	struct btrfs_fs_info *fs_info = root->fs_info;
				4769	struct btrfs_path *path;
				4770	struct btrfs_path *dst_path;
				4771	struct btrfs_key min_key;
				4772	struct btrfs_key max_key;
				4773	struct btrfs_root *log = root->log_root;
				4774	struct extent_buffer *src = NULL;
				4775	LIST_HEAD(logged_list);
				4776	int err = 0;
				4777	int ret;
				4778	int nritems;
				4779	int ins_start_slot = 0;
				4780	int ins_nr;
				4781	bool fast_search = false;
				4782	u64 ino = btrfs_ino(inode);
				4783	struct extent_map_tree *em_tree = &inode->extent_tree;
				4784	u64 logged_isize = 0;
				4785	bool need_log_inode_item = true;
				4786	bool xattrs_logged = false;
				4787
				4788	path = btrfs_alloc_path();
				4789	if (!path)
				4790	return -ENOMEM;
				4791	dst_path = btrfs_alloc_path();
				4792	if (!dst_path) {
				4793	btrfs_free_path(path);
				4794	return -ENOMEM;
				4795	}
				4796
				4797	min_key.objectid = ino;
				4798	min_key.type = BTRFS_INODE_ITEM_KEY;
				4799	min_key.offset = 0;
				4800
				4801	max_key.objectid = ino;
				4802
				4803
				4804	/* today the code can only do partial logging of directories */
				4805	if (S_ISDIR(inode->vfs_inode.i_mode) \|\|
				4806	(!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
				4807	&inode->runtime_flags) &&
				4808	inode_only >= LOG_INODE_EXISTS))
				4809	max_key.type = BTRFS_XATTR_ITEM_KEY;
				4810	else
				4811	max_key.type = (u8)-1;
				4812	max_key.offset = (u64)-1;
				4813
				4814	/*
				4815	* Only run delayed items if we are a dir or a new file.
				4816	* Otherwise commit the delayed inode only, which is needed in
				4817	* order for the log replay code to mark inodes for link count
				4818	* fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
				4819	*/
				4820	if (S_ISDIR(inode->vfs_inode.i_mode) \|\|
				4821	inode->generation > fs_info->last_trans_committed)
				4822	ret = btrfs_commit_inode_delayed_items(trans, inode);
				4823	else
				4824	ret = btrfs_commit_inode_delayed_inode(inode);
				4825
				4826	if (ret) {
				4827	btrfs_free_path(path);
				4828	btrfs_free_path(dst_path);
				4829	return ret;
				4830	}
				4831
				4832	if (inode_only == LOG_OTHER_INODE) {
				4833	inode_only = LOG_INODE_EXISTS;
				4834	mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
				4835	} else {
				4836	mutex_lock(&inode->log_mutex);
				4837	}
				4838
				4839	/*
				4840	* a brute force approach to making sure we get the most uptodate
				4841	* copies of everything.
				4842	*/
				4843	if (S_ISDIR(inode->vfs_inode.i_mode)) {
				4844	int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
				4845
				4846	if (inode_only == LOG_INODE_EXISTS)
				4847	max_key_type = BTRFS_XATTR_ITEM_KEY;
				4848	ret = drop_objectid_items(trans, log, path, ino, max_key_type);
				4849	} else {
				4850	if (inode_only == LOG_INODE_EXISTS) {
				4851	/*
				4852	* Make sure the new inode item we write to the log has
				4853	* the same isize as the current one (if it exists).
				4854	* This is necessary to prevent data loss after log
				4855	* replay, and also to prevent doing a wrong expanding
				4856	* truncate - for e.g. create file, write 4K into offset
				4857	* 0, fsync, write 4K into offset 4096, add hard link,
				4858	* fsync some other file (to sync log), power fail - if
				4859	* we use the inode's current i_size, after log replay
				4860	* we get a 8Kb file, with the last 4Kb extent as a hole
				4861	* (zeroes), as if an expanding truncate happened,
				4862	* instead of getting a file of 4Kb only.
				4863	*/
				4864	err = logged_inode_size(log, inode, path, &logged_isize);
				4865	if (err)
				4866	goto out_unlock;
				4867	}
				4868	if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
				4869	&inode->runtime_flags)) {
				4870	if (inode_only == LOG_INODE_EXISTS) {
				4871	max_key.type = BTRFS_XATTR_ITEM_KEY;
				4872	ret = drop_objectid_items(trans, log, path, ino,
				4873	max_key.type);
				4874	} else {
				4875	clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
				4876	&inode->runtime_flags);
				4877	clear_bit(BTRFS_INODE_COPY_EVERYTHING,
				4878	&inode->runtime_flags);
				4879	while(1) {
				4880	ret = btrfs_truncate_inode_items(trans,
				4881	log, &inode->vfs_inode, 0, 0);
				4882	if (ret != -EAGAIN)
				4883	break;
				4884	}
				4885	}
				4886	} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
				4887	&inode->runtime_flags) \|\|
				4888	inode_only == LOG_INODE_EXISTS) {
				4889	if (inode_only == LOG_INODE_ALL)
				4890	fast_search = true;
				4891	max_key.type = BTRFS_XATTR_ITEM_KEY;
				4892	ret = drop_objectid_items(trans, log, path, ino,
				4893	max_key.type);
				4894	} else {
				4895	if (inode_only == LOG_INODE_ALL)
				4896	fast_search = true;
				4897	goto log_extents;
				4898	}
				4899
				4900	}
				4901	if (ret) {
				4902	err = ret;
				4903	goto out_unlock;
				4904	}
				4905
				4906	while (1) {
				4907	ins_nr = 0;
				4908	ret = btrfs_search_forward(root, &min_key,
				4909	path, trans->transid);
				4910	if (ret < 0) {
				4911	err = ret;
				4912	goto out_unlock;
				4913	}
				4914	if (ret != 0)
				4915	break;
				4916	again:
				4917	/* note, ins_nr might be > 0 here, cleanup outside the loop */
				4918	if (min_key.objectid != ino)
				4919	break;
				4920	if (min_key.type > max_key.type)
				4921	break;
				4922
				4923	if (min_key.type == BTRFS_INODE_ITEM_KEY)
				4924	need_log_inode_item = false;
				4925
				4926	if ((min_key.type == BTRFS_INODE_REF_KEY \|\|
				4927	min_key.type == BTRFS_INODE_EXTREF_KEY) &&
				4928	inode->generation == trans->transid) {
				4929	u64 other_ino = 0;
				4930
				4931	ret = btrfs_check_ref_name_override(path->nodes[0],
				4932	path->slots[0], &min_key, inode,
				4933	&other_ino);
				4934	if (ret < 0) {
				4935	err = ret;
				4936	goto out_unlock;
				4937	} else if (ret > 0 && ctx &&
				4938	other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
				4939	struct btrfs_key inode_key;
				4940	struct inode *other_inode;
				4941
				4942	if (ins_nr > 0) {
				4943	ins_nr++;
				4944	} else {
				4945	ins_nr = 1;
				4946	ins_start_slot = path->slots[0];
				4947	}
				4948	ret = copy_items(trans, inode, dst_path, path,
				4949	ins_start_slot,
				4950	ins_nr, inode_only,
				4951	logged_isize);
				4952	if (ret < 0) {
				4953	err = ret;
				4954	goto out_unlock;
				4955	}
				4956	ins_nr = 0;
				4957	btrfs_release_path(path);
				4958	inode_key.objectid = other_ino;
				4959	inode_key.type = BTRFS_INODE_ITEM_KEY;
				4960	inode_key.offset = 0;
				4961	other_inode = btrfs_iget(fs_info->sb,
				4962	&inode_key, root,
				4963	NULL);
				4964	/*
				4965	* If the other inode that had a conflicting dir
				4966	* entry was deleted in the current transaction,
				4967	* we don't need to do more work nor fallback to
				4968	* a transaction commit.
				4969	*/
				4970	if (IS_ERR(other_inode) &&
				4971	PTR_ERR(other_inode) == -ENOENT) {
				4972	goto next_key;
				4973	} else if (IS_ERR(other_inode)) {
				4974	err = PTR_ERR(other_inode);
				4975	goto out_unlock;
				4976	}
				4977	/*
				4978	* We are safe logging the other inode without
				4979	* acquiring its i_mutex as long as we log with
				4980	* the LOG_INODE_EXISTS mode. We're safe against
				4981	* concurrent renames of the other inode as well
				4982	* because during a rename we pin the log and
				4983	* update the log with the new name before we
				4984	* unpin it.
				4985	*/
				4986	err = btrfs_log_inode(trans, root,
				4987	BTRFS_I(other_inode),
				4988	LOG_OTHER_INODE, 0, LLONG_MAX,
				4989	ctx);
				4990	btrfs_add_delayed_iput(other_inode);
				4991	if (err)
				4992	goto out_unlock;
				4993	else
				4994	goto next_key;
				4995	}
				4996	}
				4997
				4998	/* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
				4999	if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
				5000	if (ins_nr == 0)
				5001	goto next_slot;
				5002	ret = copy_items(trans, inode, dst_path, path,
				5003	ins_start_slot,
				5004	ins_nr, inode_only, logged_isize);
				5005	if (ret < 0) {
				5006	err = ret;
				5007	goto out_unlock;
				5008	}
				5009	ins_nr = 0;
				5010	goto next_slot;
				5011	}
				5012
				5013	src = path->nodes[0];
				5014	if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
				5015	ins_nr++;
				5016	goto next_slot;
				5017	} else if (!ins_nr) {
				5018	ins_start_slot = path->slots[0];
				5019	ins_nr = 1;
				5020	goto next_slot;
				5021	}
				5022
				5023	ret = copy_items(trans, inode, dst_path, path,
				5024	ins_start_slot, ins_nr, inode_only,
				5025	logged_isize);
				5026	if (ret < 0) {
				5027	err = ret;
				5028	goto out_unlock;
				5029	}
				5030	ins_nr = 1;
				5031	ins_start_slot = path->slots[0];
				5032	next_slot:
				5033
				5034	nritems = btrfs_header_nritems(path->nodes[0]);
				5035	path->slots[0]++;
				5036	if (path->slots[0] < nritems) {
				5037	btrfs_item_key_to_cpu(path->nodes[0], &min_key,
				5038	path->slots[0]);
				5039	goto again;
				5040	}
				5041	if (ins_nr) {
				5042	ret = copy_items(trans, inode, dst_path, path,
				5043	ins_start_slot,
				5044	ins_nr, inode_only, logged_isize);
				5045	if (ret < 0) {
				5046	err = ret;
				5047	goto out_unlock;
				5048	}
				5049	ins_nr = 0;
				5050	}
				5051	btrfs_release_path(path);
				5052	next_key:
				5053	if (min_key.offset < (u64)-1) {
				5054	min_key.offset++;
				5055	} else if (min_key.type < max_key.type) {
				5056	min_key.type++;
				5057	min_key.offset = 0;
				5058	} else {
				5059	break;
				5060	}
				5061	}
				5062	if (ins_nr) {
				5063	ret = copy_items(trans, inode, dst_path, path,
				5064	ins_start_slot, ins_nr, inode_only,
				5065	logged_isize);
				5066	if (ret < 0) {
				5067	err = ret;
				5068	goto out_unlock;
				5069	}
				5070	ins_nr = 0;
				5071	}
				5072
				5073	btrfs_release_path(path);
				5074	btrfs_release_path(dst_path);
				5075	err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
				5076	if (err)
				5077	goto out_unlock;
				5078	xattrs_logged = true;
				5079	if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
				5080	btrfs_release_path(path);
				5081	btrfs_release_path(dst_path);
				5082	err = btrfs_log_holes(trans, root, inode, path);
				5083	if (err)
				5084	goto out_unlock;
				5085	}
				5086	log_extents:
				5087	btrfs_release_path(path);
				5088	btrfs_release_path(dst_path);
				5089	if (need_log_inode_item) {
				5090	err = log_inode_item(trans, log, dst_path, inode);
				5091	if (!err && !xattrs_logged) {
				5092	err = btrfs_log_all_xattrs(trans, root, inode, path,
				5093	dst_path);
				5094	btrfs_release_path(path);
				5095	}
				5096	if (err)
				5097	goto out_unlock;
				5098	}
				5099	if (fast_search) {
				5100	ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
				5101	&logged_list, ctx, start, end);
				5102	if (ret) {
				5103	err = ret;
				5104	goto out_unlock;
				5105	}
				5106	} else if (inode_only == LOG_INODE_ALL) {
				5107	struct extent_map em, n;
				5108
				5109	write_lock(&em_tree->lock);
				5110	/*
				5111	* We can't just remove every em if we're called for a ranged
				5112	* fsync - that is, one that doesn't cover the whole possible
				5113	* file range (0 to LLONG_MAX). This is because we can have
				5114	* em's that fall outside the range we're logging and therefore
				5115	* their ordered operations haven't completed yet
				5116	* (btrfs_finish_ordered_io() not invoked yet). This means we
				5117	* didn't get their respective file extent item in the fs/subvol
				5118	* tree yet, and need to let the next fast fsync (one which
				5119	* consults the list of modified extent maps) find the em so
				5120	* that it logs a matching file extent item and waits for the
				5121	* respective ordered operation to complete (if it's still
				5122	* running).
				5123	*
				5124	* Removing every em outside the range we're logging would make
				5125	* the next fast fsync not log their matching file extent items,
				5126	* therefore making us lose data after a log replay.
				5127	*/
				5128	list_for_each_entry_safe(em, n, &em_tree->modified_extents,
				5129	list) {
				5130	const u64 mod_end = em->mod_start + em->mod_len - 1;
				5131
				5132	if (em->mod_start >= start && mod_end <= end)
				5133	list_del_init(&em->list);
				5134	}
				5135	write_unlock(&em_tree->lock);
				5136	}
				5137
				5138	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
				5139	ret = log_directory_changes(trans, root, inode, path, dst_path,
				5140	ctx);
				5141	if (ret) {
				5142	err = ret;
				5143	goto out_unlock;
				5144	}
				5145	}
				5146
				5147	/*
				5148	* Don't update last_log_commit if we logged that an inode exists after
				5149	* it was loaded to memory (full_sync bit set).
				5150	* This is to prevent data loss when we do a write to the inode, then
				5151	* the inode gets evicted after all delalloc was flushed, then we log
				5152	* it exists (due to a rename for example) and then fsync it. This last
				5153	* fsync would do nothing (not logging the extents previously written).
				5154	*/
				5155	spin_lock(&inode->lock);
				5156	inode->logged_trans = trans->transid;
				5157	if (inode_only != LOG_INODE_EXISTS \|\|
				5158	!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
				5159	inode->last_log_commit = inode->last_sub_trans;
				5160	spin_unlock(&inode->lock);
				5161	out_unlock:
				5162	if (unlikely(err))
				5163	btrfs_put_logged_extents(&logged_list);
				5164	else
				5165	btrfs_submit_logged_extents(&logged_list, log);
				5166	mutex_unlock(&inode->log_mutex);
				5167
				5168	btrfs_free_path(path);
				5169	btrfs_free_path(dst_path);
				5170	return err;
				5171	}
				5172
				5173	/*
				5174	* Check if we must fallback to a transaction commit when logging an inode.
				5175	* This must be called after logging the inode and is used only in the context
				5176	* when fsyncing an inode requires the need to log some other inode - in which
				5177	* case we can't lock the i_mutex of each other inode we need to log as that
				5178	* can lead to deadlocks with concurrent fsync against other inodes (as we can
				5179	* log inodes up or down in the hierarchy) or rename operations for example. So
				5180	* we take the log_mutex of the inode after we have logged it and then check for
				5181	* its last_unlink_trans value - this is safe because any task setting
				5182	* last_unlink_trans must take the log_mutex and it must do this before it does
				5183	* the actual unlink operation, so if we do this check before a concurrent task
				5184	* sets last_unlink_trans it means we've logged a consistent version/state of
				5185	* all the inode items, otherwise we are not sure and must do a transaction
				5186	* commit (the concurrent task might have only updated last_unlink_trans before
				5187	* we logged the inode or it might have also done the unlink).
				5188	*/
				5189	static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
				5190	struct btrfs_inode *inode)
				5191	{
				5192	struct btrfs_fs_info *fs_info = inode->root->fs_info;
				5193	bool ret = false;
				5194
				5195	mutex_lock(&inode->log_mutex);
				5196	if (inode->last_unlink_trans > fs_info->last_trans_committed) {
				5197	/*
				5198	* Make sure any commits to the log are forced to be full
				5199	* commits.
				5200	*/
				5201	btrfs_set_log_full_commit(fs_info, trans);
				5202	ret = true;
				5203	}
				5204	mutex_unlock(&inode->log_mutex);
				5205
				5206	return ret;
				5207	}
				5208
				5209	/*
				5210	* follow the dentry parent pointers up the chain and see if any
				5211	* of the directories in it require a full commit before they can
				5212	* be logged. Returns zero if nothing special needs to be done or 1 if
				5213	* a full commit is required.
				5214	*/
				5215	static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
				5216	struct btrfs_inode *inode,
				5217	struct dentry *parent,
				5218	struct super_block *sb,
				5219	u64 last_committed)
				5220	{
				5221	int ret = 0;
				5222	struct dentry *old_parent = NULL;
				5223
				5224	/*
				5225	* for regular files, if its inode is already on disk, we don't
				5226	* have to worry about the parents at all. This is because
				5227	* we can use the last_unlink_trans field to record renames
				5228	* and other fun in this file.
				5229	*/
				5230	if (S_ISREG(inode->vfs_inode.i_mode) &&
				5231	inode->generation <= last_committed &&
				5232	inode->last_unlink_trans <= last_committed)
				5233	goto out;
				5234
				5235	if (!S_ISDIR(inode->vfs_inode.i_mode)) {
				5236	if (!parent \|\| d_really_is_negative(parent) \|\| sb != parent->d_sb)
				5237	goto out;
				5238	inode = BTRFS_I(d_inode(parent));
				5239	}
				5240
				5241	while (1) {
				5242	if (btrfs_must_commit_transaction(trans, inode)) {
				5243	ret = 1;
				5244	break;
				5245	}
				5246
				5247	if (!parent \|\| d_really_is_negative(parent) \|\| sb != parent->d_sb)
				5248	break;
				5249
				5250	if (IS_ROOT(parent)) {
				5251	inode = BTRFS_I(d_inode(parent));
				5252	if (btrfs_must_commit_transaction(trans, inode))
				5253	ret = 1;
				5254	break;
				5255	}
				5256
				5257	parent = dget_parent(parent);
				5258	dput(old_parent);
				5259	old_parent = parent;
				5260	inode = BTRFS_I(d_inode(parent));
				5261
				5262	}
				5263	dput(old_parent);
				5264	out:
				5265	return ret;
				5266	}
				5267
				5268	struct btrfs_dir_list {
				5269	u64 ino;
				5270	struct list_head list;
				5271	};
				5272
				5273	/*
				5274	* Log the inodes of the new dentries of a directory. See log_dir_items() for
				5275	* details about the why it is needed.
				5276	* This is a recursive operation - if an existing dentry corresponds to a
				5277	* directory, that directory's new entries are logged too (same behaviour as
				5278	* ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
				5279	* the dentries point to we do not lock their i_mutex, otherwise lockdep
				5280	* complains about the following circular lock dependency / possible deadlock:
				5281	*
				5282	* CPU0 CPU1
				5283	* ---- ----
				5284	* lock(&type->i_mutex_dir_key#3/2);
				5285	* lock(sb_internal#2);
				5286	* lock(&type->i_mutex_dir_key#3/2);
				5287	* lock(&sb->s_type->i_mutex_key#14);
				5288	*
				5289	* Where sb_internal is the lock (a counter that works as a lock) acquired by
				5290	* sb_start_intwrite() in btrfs_start_transaction().
				5291	* Not locking i_mutex of the inodes is still safe because:
				5292	*
				5293	* 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
				5294	* that while logging the inode new references (names) are added or removed
				5295	* from the inode, leaving the logged inode item with a link count that does
				5296	* not match the number of logged inode reference items. This is fine because
				5297	* at log replay time we compute the real number of links and correct the
				5298	* link count in the inode item (see replay_one_buffer() and
				5299	* link_to_fixup_dir());
				5300	*
				5301	* 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
				5302	* while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
				5303	* BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
				5304	* has a size that doesn't match the sum of the lengths of all the logged
				5305	* names. This does not result in a problem because if a dir_item key is
				5306	* logged but its matching dir_index key is not logged, at log replay time we
				5307	* don't use it to replay the respective name (see replay_one_name()). On the
				5308	* other hand if only the dir_index key ends up being logged, the respective
				5309	* name is added to the fs/subvol tree with both the dir_item and dir_index
				5310	* keys created (see replay_one_name()).
				5311	* The directory's inode item with a wrong i_size is not a problem as well,
				5312	* since we don't use it at log replay time to set the i_size in the inode
				5313	* item of the fs/subvol tree (see overwrite_item()).
				5314	*/
				5315	static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
				5316	struct btrfs_root *root,
				5317	struct btrfs_inode *start_inode,
				5318	struct btrfs_log_ctx *ctx)
				5319	{
				5320	struct btrfs_fs_info *fs_info = root->fs_info;
				5321	struct btrfs_root *log = root->log_root;
				5322	struct btrfs_path *path;
				5323	LIST_HEAD(dir_list);
				5324	struct btrfs_dir_list *dir_elem;
				5325	int ret = 0;
				5326
				5327	path = btrfs_alloc_path();
				5328	if (!path)
				5329	return -ENOMEM;
				5330
				5331	dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
				5332	if (!dir_elem) {
				5333	btrfs_free_path(path);
				5334	return -ENOMEM;
				5335	}
				5336	dir_elem->ino = btrfs_ino(start_inode);
				5337	list_add_tail(&dir_elem->list, &dir_list);
				5338
				5339	while (!list_empty(&dir_list)) {
				5340	struct extent_buffer *leaf;
				5341	struct btrfs_key min_key;
				5342	int nritems;
				5343	int i;
				5344
				5345	dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
				5346	list);
				5347	if (ret)
				5348	goto next_dir_inode;
				5349
				5350	min_key.objectid = dir_elem->ino;
				5351	min_key.type = BTRFS_DIR_ITEM_KEY;
				5352	min_key.offset = 0;
				5353	again:
				5354	btrfs_release_path(path);
				5355	ret = btrfs_search_forward(log, &min_key, path, trans->transid);
				5356	if (ret < 0) {
				5357	goto next_dir_inode;
				5358	} else if (ret > 0) {
				5359	ret = 0;
				5360	goto next_dir_inode;
				5361	}
				5362
				5363	process_leaf:
				5364	leaf = path->nodes[0];
				5365	nritems = btrfs_header_nritems(leaf);
				5366	for (i = path->slots[0]; i < nritems; i++) {
				5367	struct btrfs_dir_item *di;
				5368	struct btrfs_key di_key;
				5369	struct inode *di_inode;
				5370	struct btrfs_dir_list *new_dir_elem;
				5371	int log_mode = LOG_INODE_EXISTS;
				5372	int type;
				5373
				5374	btrfs_item_key_to_cpu(leaf, &min_key, i);
				5375	if (min_key.objectid != dir_elem->ino \|\|
				5376	min_key.type != BTRFS_DIR_ITEM_KEY)
				5377	goto next_dir_inode;
				5378
				5379	di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
				5380	type = btrfs_dir_type(leaf, di);
				5381	if (btrfs_dir_transid(leaf, di) < trans->transid &&
				5382	type != BTRFS_FT_DIR)
				5383	continue;
				5384	btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
				5385	if (di_key.type == BTRFS_ROOT_ITEM_KEY)
				5386	continue;
				5387
				5388	btrfs_release_path(path);
				5389	di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL);
				5390	if (IS_ERR(di_inode)) {
				5391	ret = PTR_ERR(di_inode);
				5392	goto next_dir_inode;
				5393	}
				5394
				5395	if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) {
				5396	btrfs_add_delayed_iput(di_inode);
				5397	break;
				5398	}
				5399
				5400	ctx->log_new_dentries = false;
				5401	if (type == BTRFS_FT_DIR \|\| type == BTRFS_FT_SYMLINK)
				5402	log_mode = LOG_INODE_ALL;
				5403	ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
				5404	log_mode, 0, LLONG_MAX, ctx);
				5405	if (!ret &&
				5406	btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
				5407	ret = 1;
				5408	btrfs_add_delayed_iput(di_inode);
				5409	if (ret)
				5410	goto next_dir_inode;
				5411	if (ctx->log_new_dentries) {
				5412	new_dir_elem = kmalloc(sizeof(*new_dir_elem),
				5413	GFP_NOFS);
				5414	if (!new_dir_elem) {
				5415	ret = -ENOMEM;
				5416	goto next_dir_inode;
				5417	}
				5418	new_dir_elem->ino = di_key.objectid;
				5419	list_add_tail(&new_dir_elem->list, &dir_list);
				5420	}
				5421	break;
				5422	}
				5423	if (i == nritems) {
				5424	ret = btrfs_next_leaf(log, path);
				5425	if (ret < 0) {
				5426	goto next_dir_inode;
				5427	} else if (ret > 0) {
				5428	ret = 0;
				5429	goto next_dir_inode;
				5430	}
				5431	goto process_leaf;
				5432	}
				5433	if (min_key.offset < (u64)-1) {
				5434	min_key.offset++;
				5435	goto again;
				5436	}
				5437	next_dir_inode:
				5438	list_del(&dir_elem->list);
				5439	kfree(dir_elem);
				5440	}
				5441
				5442	btrfs_free_path(path);
				5443	return ret;
				5444	}
				5445
				5446	static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
				5447	struct btrfs_inode *inode,
				5448	struct btrfs_log_ctx *ctx)
				5449	{
				5450	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
				5451	int ret;
				5452	struct btrfs_path *path;
				5453	struct btrfs_key key;
				5454	struct btrfs_root *root = inode->root;
				5455	const u64 ino = btrfs_ino(inode);
				5456
				5457	path = btrfs_alloc_path();
				5458	if (!path)
				5459	return -ENOMEM;
				5460	path->skip_locking = 1;
				5461	path->search_commit_root = 1;
				5462
				5463	key.objectid = ino;
				5464	key.type = BTRFS_INODE_REF_KEY;
				5465	key.offset = 0;
				5466	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				5467	if (ret < 0)
				5468	goto out;
				5469
				5470	while (true) {
				5471	struct extent_buffer *leaf = path->nodes[0];
				5472	int slot = path->slots[0];
				5473	u32 cur_offset = 0;
				5474	u32 item_size;
				5475	unsigned long ptr;
				5476
				5477	if (slot >= btrfs_header_nritems(leaf)) {
				5478	ret = btrfs_next_leaf(root, path);
				5479	if (ret < 0)
				5480	goto out;
				5481	else if (ret > 0)
				5482	break;
				5483	continue;
				5484	}
				5485
				5486	btrfs_item_key_to_cpu(leaf, &key, slot);
				5487	/* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
				5488	if (key.objectid != ino \|\| key.type > BTRFS_INODE_EXTREF_KEY)
				5489	break;
				5490
				5491	item_size = btrfs_item_size_nr(leaf, slot);
				5492	ptr = btrfs_item_ptr_offset(leaf, slot);
				5493	while (cur_offset < item_size) {
				5494	struct btrfs_key inode_key;
				5495	struct inode *dir_inode;
				5496
				5497	inode_key.type = BTRFS_INODE_ITEM_KEY;
				5498	inode_key.offset = 0;
				5499
				5500	if (key.type == BTRFS_INODE_EXTREF_KEY) {
				5501	struct btrfs_inode_extref *extref;
				5502
				5503	extref = (struct btrfs_inode_extref *)
				5504	(ptr + cur_offset);
				5505	inode_key.objectid = btrfs_inode_extref_parent(
				5506	leaf, extref);
				5507	cur_offset += sizeof(*extref);
				5508	cur_offset += btrfs_inode_extref_name_len(leaf,
				5509	extref);
				5510	} else {
				5511	inode_key.objectid = key.offset;
				5512	cur_offset = item_size;
				5513	}
				5514
				5515	dir_inode = btrfs_iget(fs_info->sb, &inode_key,
				5516	root, NULL);
				5517	/*
				5518	* If the parent inode was deleted, return an error to
				5519	* fallback to a transaction commit. This is to prevent
				5520	* getting an inode that was moved from one parent A to
				5521	* a parent B, got its former parent A deleted and then
				5522	* it got fsync'ed, from existing at both parents after
				5523	* a log replay (and the old parent still existing).
				5524	* Example:
				5525	*
				5526	* mkdir /mnt/A
				5527	* mkdir /mnt/B
				5528	* touch /mnt/B/bar
				5529	* sync
				5530	* mv /mnt/B/bar /mnt/A/bar
				5531	* mv -T /mnt/A /mnt/B
				5532	* fsync /mnt/B/bar
				5533	* <power fail>
				5534	*
				5535	* If we ignore the old parent B which got deleted,
				5536	* after a log replay we would have file bar linked
				5537	* at both parents and the old parent B would still
				5538	* exist.
				5539	*/
				5540	if (IS_ERR(dir_inode)) {
				5541	ret = PTR_ERR(dir_inode);
				5542	goto out;
				5543	}
				5544
				5545	if (ctx)
				5546	ctx->log_new_dentries = false;
				5547	ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
				5548	LOG_INODE_ALL, 0, LLONG_MAX, ctx);
				5549	if (!ret &&
				5550	btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
				5551	ret = 1;
				5552	if (!ret && ctx && ctx->log_new_dentries)
				5553	ret = log_new_dir_dentries(trans, root,
				5554	BTRFS_I(dir_inode), ctx);
				5555	btrfs_add_delayed_iput(dir_inode);
				5556	if (ret)
				5557	goto out;
				5558	}
				5559	path->slots[0]++;
				5560	}
				5561	ret = 0;
				5562	out:
				5563	btrfs_free_path(path);
				5564	return ret;
				5565	}
				5566
				5567	/*
				5568	* helper function around btrfs_log_inode to make sure newly created
				5569	* parent directories also end up in the log. A minimal inode and backref
				5570	* only logging is done of any parent directories that are older than
				5571	* the last committed transaction
				5572	*/
				5573	static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
				5574	struct btrfs_root *root,
				5575	struct btrfs_inode *inode,
				5576	struct dentry *parent,
				5577	const loff_t start,
				5578	const loff_t end,
				5579	int exists_only,
				5580	struct btrfs_log_ctx *ctx)
				5581	{
				5582	struct btrfs_fs_info *fs_info = root->fs_info;
				5583	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
				5584	struct super_block *sb;
				5585	struct dentry *old_parent = NULL;
				5586	int ret = 0;
				5587	u64 last_committed = fs_info->last_trans_committed;
				5588	bool log_dentries = false;
				5589	struct btrfs_inode *orig_inode = inode;
				5590
				5591	sb = inode->vfs_inode.i_sb;
				5592
				5593	if (btrfs_test_opt(fs_info, NOTREELOG)) {
				5594	ret = 1;
				5595	goto end_no_trans;
				5596	}
				5597
				5598	/*
				5599	* The prev transaction commit doesn't complete, we need do
				5600	* full commit by ourselves.
				5601	*/
				5602	if (fs_info->last_trans_log_full_commit >
				5603	fs_info->last_trans_committed) {
				5604	ret = 1;
				5605	goto end_no_trans;
				5606	}
				5607
				5608	if (root != inode->root \|\| btrfs_root_refs(&root->root_item) == 0) {
				5609	ret = 1;
				5610	goto end_no_trans;
				5611	}
				5612
				5613	ret = check_parent_dirs_for_sync(trans, inode, parent, sb,
				5614	last_committed);
				5615	if (ret)
				5616	goto end_no_trans;
				5617
				5618	/*
				5619	* Skip already logged inodes or inodes corresponding to tmpfiles
				5620	* (since logging them is pointless, a link count of 0 means they
				5621	* will never be accessible).
				5622	*/
				5623	if (btrfs_inode_in_log(inode, trans->transid) \|\|
				5624	inode->vfs_inode.i_nlink == 0) {
				5625	ret = BTRFS_NO_LOG_SYNC;
				5626	goto end_no_trans;
				5627	}
				5628
				5629	ret = start_log_trans(trans, root, ctx);
				5630	if (ret)
				5631	goto end_no_trans;
				5632
				5633	ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
				5634	if (ret)
				5635	goto end_trans;
				5636
				5637	/*
				5638	* for regular files, if its inode is already on disk, we don't
				5639	* have to worry about the parents at all. This is because
				5640	* we can use the last_unlink_trans field to record renames
				5641	* and other fun in this file.
				5642	*/
				5643	if (S_ISREG(inode->vfs_inode.i_mode) &&
				5644	inode->generation <= last_committed &&
				5645	inode->last_unlink_trans <= last_committed) {
				5646	ret = 0;
				5647	goto end_trans;
				5648	}
				5649
				5650	if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
				5651	log_dentries = true;
				5652
				5653	/*
				5654	* On unlink we must make sure all our current and old parent directory
				5655	* inodes are fully logged. This is to prevent leaving dangling
				5656	* directory index entries in directories that were our parents but are
				5657	* not anymore. Not doing this results in old parent directory being
				5658	* impossible to delete after log replay (rmdir will always fail with
				5659	* error -ENOTEMPTY).
				5660	*
				5661	* Example 1:
				5662	*
				5663	* mkdir testdir
				5664	* touch testdir/foo
				5665	* ln testdir/foo testdir/bar
				5666	* sync
				5667	* unlink testdir/bar
				5668	* xfs_io -c fsync testdir/foo
				5669	* <power failure>
				5670	* mount fs, triggers log replay
				5671	*
				5672	* If we don't log the parent directory (testdir), after log replay the
				5673	* directory still has an entry pointing to the file inode using the bar
				5674	* name, but a matching BTRFS_INODE_[REF\|EXTREF]_KEY does not exist and
				5675	* the file inode has a link count of 1.
				5676	*
				5677	* Example 2:
				5678	*
				5679	* mkdir testdir
				5680	* touch foo
				5681	* ln foo testdir/foo2
				5682	* ln foo testdir/foo3
				5683	* sync
				5684	* unlink testdir/foo3
				5685	* xfs_io -c fsync foo
				5686	* <power failure>
				5687	* mount fs, triggers log replay
				5688	*
				5689	* Similar as the first example, after log replay the parent directory
				5690	* testdir still has an entry pointing to the inode file with name foo3
				5691	* but the file inode does not have a matching BTRFS_INODE_REF_KEY item
				5692	* and has a link count of 2.
				5693	*/
				5694	if (inode->last_unlink_trans > last_committed) {
				5695	ret = btrfs_log_all_parents(trans, orig_inode, ctx);
				5696	if (ret)
				5697	goto end_trans;
				5698	}
				5699
				5700	/*
				5701	* If a new hard link was added to the inode in the current transaction
				5702	* and its link count is now greater than 1, we need to fallback to a
				5703	* transaction commit, otherwise we can end up not logging all its new
				5704	* parents for all the hard links. Here just from the dentry used to
				5705	* fsync, we can not visit the ancestor inodes for all the other hard
				5706	* links to figure out if any is new, so we fallback to a transaction
				5707	* commit (instead of adding a lot of complexity of scanning a btree,
				5708	* since this scenario is not a common use case).
				5709	*/
				5710	if (inode->vfs_inode.i_nlink > 1 &&
				5711	inode->last_link_trans > last_committed) {
				5712	ret = -EMLINK;
				5713	goto end_trans;
				5714	}
				5715
				5716	while (1) {
				5717	if (!parent \|\| d_really_is_negative(parent) \|\| sb != parent->d_sb)
				5718	break;
				5719
				5720	inode = BTRFS_I(d_inode(parent));
				5721	if (root != inode->root)
				5722	break;
				5723
				5724	if (inode->generation > last_committed) {
				5725	ret = btrfs_log_inode(trans, root, inode,
				5726	LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
				5727	if (ret)
				5728	goto end_trans;
				5729	}
				5730	if (IS_ROOT(parent))
				5731	break;
				5732
				5733	parent = dget_parent(parent);
				5734	dput(old_parent);
				5735	old_parent = parent;
				5736	}
				5737	if (log_dentries)
				5738	ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
				5739	else
				5740	ret = 0;
				5741	end_trans:
				5742	dput(old_parent);
				5743	if (ret < 0) {
				5744	btrfs_set_log_full_commit(fs_info, trans);
				5745	ret = 1;
				5746	}
				5747
				5748	if (ret)
				5749	btrfs_remove_log_ctx(root, ctx);
				5750	btrfs_end_log_trans(root);
				5751	end_no_trans:
				5752	return ret;
				5753	}
				5754
				5755	/*
				5756	* it is not safe to log dentry if the chunk root has added new
				5757	* chunks. This returns 0 if the dentry was logged, and 1 otherwise.
				5758	* If this returns 1, you must commit the transaction to safely get your
				5759	* data on disk.
				5760	*/
				5761	int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
				5762	struct btrfs_root root, struct dentry dentry,
				5763	const loff_t start,
				5764	const loff_t end,
				5765	struct btrfs_log_ctx *ctx)
				5766	{
				5767	struct dentry *parent = dget_parent(dentry);
				5768	int ret;
				5769
				5770	ret = btrfs_log_inode_parent(trans, root, BTRFS_I(d_inode(dentry)),
				5771	parent, start, end, 0, ctx);
				5772	dput(parent);
				5773
				5774	return ret;
				5775	}
				5776
				5777	/*
				5778	* should be called during mount to recover any replay any log trees
				5779	* from the FS
				5780	*/
				5781	int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
				5782	{
				5783	int ret;
				5784	struct btrfs_path *path;
				5785	struct btrfs_trans_handle *trans;
				5786	struct btrfs_key key;
				5787	struct btrfs_key found_key;
				5788	struct btrfs_key tmp_key;
				5789	struct btrfs_root *log;
				5790	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
				5791	struct walk_control wc = {
				5792	.process_func = process_one_buffer,
				5793	.stage = 0,
				5794	};
				5795
				5796	path = btrfs_alloc_path();
				5797	if (!path)
				5798	return -ENOMEM;
				5799
				5800	set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
				5801
				5802	trans = btrfs_start_transaction(fs_info->tree_root, 0);
				5803	if (IS_ERR(trans)) {
				5804	ret = PTR_ERR(trans);
				5805	goto error;
				5806	}
				5807
				5808	wc.trans = trans;
				5809	wc.pin = 1;
				5810
				5811	ret = walk_log_tree(trans, log_root_tree, &wc);
				5812	if (ret) {
				5813	btrfs_handle_fs_error(fs_info, ret,
				5814	"Failed to pin buffers while recovering log root tree.");
				5815	goto error;
				5816	}
				5817
				5818	again:
				5819	key.objectid = BTRFS_TREE_LOG_OBJECTID;
				5820	key.offset = (u64)-1;
				5821	key.type = BTRFS_ROOT_ITEM_KEY;
				5822
				5823	while (1) {
				5824	ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
				5825
				5826	if (ret < 0) {
				5827	btrfs_handle_fs_error(fs_info, ret,
				5828	"Couldn't find tree log root.");
				5829	goto error;
				5830	}
				5831	if (ret > 0) {
				5832	if (path->slots[0] == 0)
				5833	break;
				5834	path->slots[0]--;
				5835	}
				5836	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				5837	path->slots[0]);
				5838	btrfs_release_path(path);
				5839	if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
				5840	break;
				5841
				5842	log = btrfs_read_fs_root(log_root_tree, &found_key);
				5843	if (IS_ERR(log)) {
				5844	ret = PTR_ERR(log);
				5845	btrfs_handle_fs_error(fs_info, ret,
				5846	"Couldn't read tree log root.");
				5847	goto error;
				5848	}
				5849
				5850	tmp_key.objectid = found_key.offset;
				5851	tmp_key.type = BTRFS_ROOT_ITEM_KEY;
				5852	tmp_key.offset = (u64)-1;
				5853
				5854	wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
				5855	if (IS_ERR(wc.replay_dest)) {
				5856	ret = PTR_ERR(wc.replay_dest);
				5857
				5858	/*
				5859	* We didn't find the subvol, likely because it was
				5860	* deleted. This is ok, simply skip this log and go to
				5861	* the next one.
				5862	*
				5863	* We need to exclude the root because we can't have
				5864	* other log replays overwriting this log as we'll read
				5865	* it back in a few more times. This will keep our
				5866	* block from being modified, and we'll just bail for
				5867	* each subsequent pass.
				5868	*/
				5869	if (ret == -ENOENT)
				5870	ret = btrfs_pin_extent_for_log_replay(fs_info,
				5871	log->node->start,
				5872	log->node->len);
				5873	free_extent_buffer(log->node);
				5874	free_extent_buffer(log->commit_root);
				5875	kfree(log);
				5876
				5877	if (!ret)
				5878	goto next;
				5879	btrfs_handle_fs_error(fs_info, ret,
				5880	"Couldn't read target root for tree log recovery.");
				5881	goto error;
				5882	}
				5883
				5884	wc.replay_dest->log_root = log;
				5885	btrfs_record_root_in_trans(trans, wc.replay_dest);
				5886	ret = walk_log_tree(trans, log, &wc);
				5887
				5888	if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
				5889	ret = fixup_inode_link_counts(trans, wc.replay_dest,
				5890	path);
				5891	}
				5892
				5893	if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
				5894	struct btrfs_root *root = wc.replay_dest;
				5895
				5896	btrfs_release_path(path);
				5897
				5898	/*
				5899	* We have just replayed everything, and the highest
				5900	* objectid of fs roots probably has changed in case
				5901	* some inode_item's got replayed.
				5902	*
				5903	* root->objectid_mutex is not acquired as log replay
				5904	* could only happen during mount.
				5905	*/
				5906	ret = btrfs_find_highest_objectid(root,
				5907	&root->highest_objectid);
				5908	}
				5909
				5910	wc.replay_dest->log_root = NULL;
				5911	free_extent_buffer(log->node);
				5912	free_extent_buffer(log->commit_root);
				5913	kfree(log);
				5914
				5915	if (ret)
				5916	goto error;
				5917	next:
				5918	if (found_key.offset == 0)
				5919	break;
				5920	key.offset = found_key.offset - 1;
				5921	}
				5922	btrfs_release_path(path);
				5923
				5924	/* step one is to pin it all, step two is to replay just inodes */
				5925	if (wc.pin) {
				5926	wc.pin = 0;
				5927	wc.process_func = replay_one_buffer;
				5928	wc.stage = LOG_WALK_REPLAY_INODES;
				5929	goto again;
				5930	}
				5931	/* step three is to replay everything */
				5932	if (wc.stage < LOG_WALK_REPLAY_ALL) {
				5933	wc.stage++;
				5934	goto again;
				5935	}
				5936
				5937	btrfs_free_path(path);
				5938
				5939	/* step 4: commit the transaction, which also unpins the blocks */
				5940	ret = btrfs_commit_transaction(trans);
				5941	if (ret)
				5942	return ret;
				5943
				5944	free_extent_buffer(log_root_tree->node);
				5945	log_root_tree->log_root = NULL;
				5946	clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
				5947	kfree(log_root_tree);
				5948
				5949	return 0;
				5950	error:
				5951	if (wc.trans)
				5952	btrfs_end_transaction(wc.trans);
				5953	btrfs_free_path(path);
				5954	return ret;
				5955	}
				5956
				5957	/*
				5958	* there are some corner cases where we want to force a full
				5959	* commit instead of allowing a directory to be logged.
				5960	*
				5961	* They revolve around files there were unlinked from the directory, and
				5962	* this function updates the parent directory so that a full commit is
				5963	* properly done if it is fsync'd later after the unlinks are done.
				5964	*
				5965	* Must be called before the unlink operations (updates to the subvolume tree,
				5966	* inodes, etc) are done.
				5967	*/
				5968	void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
				5969	struct btrfs_inode dir, struct btrfs_inode inode,
				5970	int for_rename)
				5971	{
				5972	/*
				5973	* when we're logging a file, if it hasn't been renamed
				5974	* or unlinked, and its inode is fully committed on disk,
				5975	* we don't have to worry about walking up the directory chain
				5976	* to log its parents.
				5977	*
				5978	* So, we use the last_unlink_trans field to put this transid
				5979	* into the file. When the file is logged we check it and
				5980	* don't log the parents if the file is fully on disk.
				5981	*/
				5982	mutex_lock(&inode->log_mutex);
				5983	inode->last_unlink_trans = trans->transid;
				5984	mutex_unlock(&inode->log_mutex);
				5985
				5986	/*
				5987	* if this directory was already logged any new
				5988	* names for this file/dir will get recorded
				5989	*/
				5990	if (dir->logged_trans == trans->transid)
				5991	return;
				5992
				5993	/*
				5994	* if the inode we're about to unlink was logged,
				5995	* the log will be properly updated for any new names
				5996	*/
				5997	if (inode->logged_trans == trans->transid)
				5998	return;
				5999
				6000	/*
				6001	* when renaming files across directories, if the directory
				6002	* there we're unlinking from gets fsync'd later on, there's
				6003	* no way to find the destination directory later and fsync it
				6004	* properly. So, we have to be conservative and force commits
				6005	* so the new name gets discovered.
				6006	*/
				6007	if (for_rename)
				6008	goto record;
				6009
				6010	/* we can safely do the unlink without any special recording */
				6011	return;
				6012
				6013	record:
				6014	mutex_lock(&dir->log_mutex);
				6015	dir->last_unlink_trans = trans->transid;
				6016	mutex_unlock(&dir->log_mutex);
				6017	}
				6018
				6019	/*
				6020	* Make sure that if someone attempts to fsync the parent directory of a deleted
				6021	* snapshot, it ends up triggering a transaction commit. This is to guarantee
				6022	* that after replaying the log tree of the parent directory's root we will not
				6023	* see the snapshot anymore and at log replay time we will not see any log tree
				6024	* corresponding to the deleted snapshot's root, which could lead to replaying
				6025	* it after replaying the log tree of the parent directory (which would replay
				6026	* the snapshot delete operation).
				6027	*
				6028	* Must be called before the actual snapshot destroy operation (updates to the
				6029	* parent root and tree of tree roots trees, etc) are done.
				6030	*/
				6031	void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
				6032	struct btrfs_inode *dir)
				6033	{
				6034	mutex_lock(&dir->log_mutex);
				6035	dir->last_unlink_trans = trans->transid;
				6036	mutex_unlock(&dir->log_mutex);
				6037	}
				6038
				6039	/*
				6040	* Call this after adding a new name for a file and it will properly
				6041	* update the log to reflect the new name.
				6042	*
				6043	* It will return zero if all goes well, and it will return 1 if a
				6044	* full transaction commit is required.
				6045	*/
				6046	int btrfs_log_new_name(struct btrfs_trans_handle *trans,
				6047	struct btrfs_inode inode, struct btrfs_inode old_dir,
				6048	struct dentry *parent)
				6049	{
				6050	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
				6051	struct btrfs_root *root = inode->root;
				6052
				6053	/*
				6054	* this will force the logging code to walk the dentry chain
				6055	* up for the file
				6056	*/
				6057	if (!S_ISDIR(inode->vfs_inode.i_mode))
				6058	inode->last_unlink_trans = trans->transid;
				6059
				6060	/*
				6061	* if this inode hasn't been logged and directory we're renaming it
				6062	* from hasn't been logged, we don't need to log it
				6063	*/
				6064	if (inode->logged_trans <= fs_info->last_trans_committed &&
				6065	(!old_dir \|\| old_dir->logged_trans <= fs_info->last_trans_committed))
				6066	return 0;
				6067
				6068	return btrfs_log_inode_parent(trans, root, inode, parent, 0,
				6069	LLONG_MAX, 1, NULL);
				6070	}
				6071