// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_fs_info *fs_info,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);

struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/*
 * set all locked nodes in the path to blocking locks. This should
 * be done before scheduling
 */
noinline void btrfs_set_path_blocking(struct btrfs_path *p)
{
	int i;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (!p->nodes[i] || !p->locks[i])
			continue;
		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
		if (p->locks[i] == BTRFS_READ_LOCK)
			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
		else if (p->locks[i] == BTRFS_WRITE_LOCK)
			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
	}
}

/*
 * reset all the locked nodes in the path to spinning locks.
 *
 * held is used to keep lockdep happy, when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path. You can safely use NULL
 * for held
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held, int held_rw)
{
	int i;

	if (held) {
		btrfs_set_lock_blocking_rw(held, held_rw);
		if (held_rw == BTRFS_WRITE_LOCK)
			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
		else if (held_rw == BTRFS_READ_LOCK)
			held_rw = BTRFS_READ_LOCK_BLOCKING;
	}
	btrfs_set_path_blocking(p);

	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i]) {
			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
				p->locks[i] = BTRFS_WRITE_LOCK;
			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
				p->locks[i] = BTRFS_READ_LOCK;
		}
	}

	if (held)
		btrfs_clear_lock_blocking_rw(held, held_rw);
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}

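/*
 * Typical caller pattern (an added illustrative sketch, not a function
 * in this file; 'root' and 'key' stand for whatever the caller has):
 * allocate a path, search with it, then free it, which also drops any
 * locks and extent buffer references the path still holds:
 *
 *	struct btrfs_path *path;
 *	int ret;
 *
 *	path = btrfs_alloc_path();
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	...
 *	btrfs_free_path(path);
 */
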
/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that hold no locks or extent buffers.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * safely gets a reference on the root node of a tree. A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree. See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear. It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}

/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root. A locked buffer
 * is returned, with a reference held.
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* loop around taking references on and read locking the root node of
 * the tree until you end up with a read lock on the root. A read-locked
 * buffer is returned, with a reference held.
 */
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		btrfs_tree_read_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/* cow-only roots (everything that is not a reference counted cow
 * subvolume) just get put onto a simple dirty list. transaction.c walks
 * this to make sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid. The buffer with the new root node is returned in
 * cow_ret, and this function returns zero on success or a negative
 * error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
			&disk_key, level, buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fsid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};

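/*
 * Note on the ops above (added summary, not from the original file):
 * the rewind code in __tree_mod_log_rewind() applies the *inverse* of
 * each logged op when walking back in time, e.g. a logged
 * MOD_LOG_KEY_ADD is undone by dropping the item again, while the
 * MOD_LOG_KEY_REMOVE* variants are undone by re-inserting the saved
 * key, blockptr and generation.
 */
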
struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 logical;
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};

/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it must make sure elem->seq is zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}

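/*
 * Illustrative pairing (an added sketch modeled on external callers
 * such as the backref walking code, not a caller in this file; it
 * assumes the SEQ_LIST_INIT initializer from ctree.h): a reader that
 * wants a stable view of past tree state registers a blocker and drops
 * it when done:
 *
 *	struct seq_list elem = SEQ_LIST_INIT(elem);
 *
 *	btrfs_get_tree_mod_seq(fs_info, &elem);
 *	... walk rewound tree state at sequence elem.seq ...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 *
 * While the blocker is registered, log entries with seq >= elem.seq
 * cannot be removed by btrfs_put_tree_mod_seq().
 */
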
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	spin_lock(&fs_info->tree_mod_seq_lock);
	list_del(&elem->list);
	elem->seq = 0;

	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				spin_unlock(&fs_info->tree_mod_seq_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	write_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}

/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 *
 * Note: must be called with write lock for fs_info::tree_mod_log_lock.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}

/*
 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * write unlock fs_info::tree_mod_log_lock.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&fs_info->tree_mod_seq_list))
		return 1;
	if (eb && btrfs_header_level(eb) == 0)
		return 1;

	write_lock(&fs_info->tree_mod_log_lock);
	if (list_empty(&fs_info->tree_mod_seq_list)) {
		write_unlock(&fs_info->tree_mod_log_lock);
		return 1;
	}

	return 0;
}

/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&fs_info->tree_mod_seq_list))
		return 0;
	if (eb && btrfs_header_level(eb) == 0)
		return 0;

	return 1;
}

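/*
 * Note (added summary): the tree_mod_log_insert_* helpers below use
 * tree_mod_need_log() as a cheap, unlocked pre-check before allocating
 * log entries, then call tree_mod_dont_log() to re-check under
 * fs_info->tree_mod_log_lock right before inserting. The second check
 * is what makes the check-alloc-insert sequence safe against the last
 * blocker going away in between; the allocations are simply freed in
 * that case.
 */
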
static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
		enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}

static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
		int dst_slot, int src_slot, int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we overwrite something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;
free_tms:
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}

static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}

static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
			 struct extent_buffer *new_root, int log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}

static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}

/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}

/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}

static noinline int
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}

static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(eb->fs_info, NULL))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;

	ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared. If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;

	return 0;
}

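/*
 * Worked example (added, not from the original file): in a subvolume
 * whose root_item records last_snapshot == 12, a block with header
 * generation 10 was written before that snapshot was taken, so the
 * snapshot may still point at it and btrfs_block_can_be_shared()
 * returns 1. A block with generation 13 was COWed after the snapshot
 * and, unless it carries the RELOC header flag, cannot be shared.
 */
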
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans, fs_info,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		clean_tree_block(fs_info, buf);
		*last_ref = 1;
	}
	return 0;
}

static struct extent_buffer *alloc_tree_block_no_bg_flush(
					  struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  u64 parent_start,
					  const struct btrfs_disk_key *disk_key,
					  int level,
					  u64 hint,
					  u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *ret;

	/*
	 * If we are COWing a node/leaf from the extent, chunk, device or free
	 * space trees, make sure that we do not finish block group creation of
	 * pending block groups. We do this to avoid a deadlock.
	 * COWing can result in allocation of a new chunk, and flushing pending
	 * block groups (btrfs_create_pending_block_groups()) can be triggered
	 * when finishing allocation of a new chunk. Creation of a pending block
	 * group modifies the extent, chunk, device and free space trees,
	 * therefore we could deadlock with ourselves since we are holding a
	 * lock on an extent buffer that btrfs_create_pending_block_groups() may
	 * try to COW later.
	 * For similar reasons, we also need to delay flushing pending block
	 * groups when splitting a leaf or node, from one of those trees, since
	 * we are holding a write lock on it and its parent or when inserting a
	 * new root node for one of those trees.
	 */
	if (root == fs_info->extent_root ||
	    root == fs_info->chunk_root ||
	    root == fs_info->dev_root ||
	    root == fs_info->free_space_root)
		trans->can_flush_pending_bgs = false;

	ret = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, disk_key, level,
				     hint, empty_size);
	trans->can_flush_pending_bgs = true;

	return ret;
}

/*
 * does the dirty work in cow of a single block. The parent block (if
 * supplied) is updated to point to the new cow copy. The new buffer is marked
 * dirty and returned locked. If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow. This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
					   level, search_start, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fsid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		extent_buffer_get(cow);
		ret = tree_mod_log_insert_root(root->node, cow, 1);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(parent, parent_slot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

/*
 * returns the logical address of the oldest predecessor of the given root.
 * entries older than time_seq are ignored.
 */
static struct tree_mod_elem *__tree_mod_log_oldest_root(
		struct extent_buffer *eb_root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	int looped = 0;

	if (!time_seq)
		return NULL;

	/*
	 * the very last operation that's logged for a root is the
	 * replacement operation (if it is replaced at all). this has
	 * the logical address of the *new* root, making it the very
	 * first operation that's logged for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * if there are no tree operations for the oldest root, we
		 * simply return it. this should only happen if that (old)
		 * root is at level 0.
		 */
		if (!tm)
			break;

		/*
		 * if there's an operation that's not a root replacement, we
		 * found the oldest version of our root. normally, we'll find a
		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = 1;
	}

	/* if there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}

/*
 * tm is a pointer to the first operation to rewind within eb. then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification. as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			/* Fallthrough */
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special. for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. in the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is not a root. we simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}

/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		    struct extent_buffer *eb, u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	btrfs_set_path_blocking(path);
	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
	btrfs_tree_read_unlock_blocking(eb);
	free_extent_buffer(eb);

	extent_buffer_get(eb_rewin);
	btrfs_tree_read_lock(eb_rewin);
	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}

/*
 * get_old_root() rewinds the state of @root's root node to the given @time_seq
 * value. If there are no changes, the current root->node is returned. If
 * anything changed in between, there's a fresh buffer allocated on which the
 * rewind operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	u64 eb_root_owner = 0;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		old = read_tree_block(fs_info, logical, 0, level, NULL);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			eb = btrfs_clone_extent_buffer(old);
			free_extent_buffer(old);
		}
	} else if (old_root) {
		eb_root_owner = btrfs_header_owner(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock_blocking(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	extent_buffer_get(eb);
	btrfs_tree_read_lock(eb);
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, eb_root_owner);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	if (tm)
		__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
		level = tm->old_root.level;
	} else {
		level = btrfs_header_level(eb_root);
	}
	free_extent_buffer(eb_root);

	return level;
}

static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create a snapshot during committing the transaction,
	 *    after we've finished copying the src root, we must COW the shared
	 *    block to ensure metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}

/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	if (trans->transaction != fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid,
		       fs_info->running_transaction->transid);

	if (trans->transid != fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid, fs_info->generation);

	if (!should_cow_block(trans, root, buf)) {
		trans->dirty = true;
		*cow_ret = buf;
		return 0;
	}

	search_start = buf->start & ~((u64)SZ_1G - 1);

	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}

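/*
 * Note on search_start above (added summary): masking buf->start with
 * ~(SZ_1G - 1) rounds the old block's logical address down to a 1GiB
 * boundary, e.g. a block at logical 3.5GiB yields a hint of 3GiB. The
 * COW copy is thus hinted to land in the same 1GiB region as the block
 * it replaces, which tends to keep blocks that were close in the old
 * layout close in the new one; it is only a hint for the allocator.
 */
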
/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}

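/*
 * Worked example (added, not from the original file): with a 16KiB
 * nodesize, blocks at logical 0 and 40000 have a gap of
 * 40000 - (0 + 16384) = 23616 bytes, which is below the 32768 byte
 * threshold, so close_blocks() returns 1; a second block at 65536
 * leaves a gap of 49152 bytes and is not considered close.
 */
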
/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(const struct btrfs_disk_key *disk,
		     const struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}

/*
 * same as comp_keys only with two btrfs_key's
 */
int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}

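/*
 * Example of the resulting order (added, not from the original file):
 * keys sort lexicographically by (objectid, type, offset), so
 * (256, BTRFS_DIR_ITEM_KEY, 123) sorts before
 * (257, BTRFS_INODE_ITEM_KEY, 0) on objectid alone, and two
 * EXTENT_DATA keys for the same inode sort by file offset:
 * (257, BTRFS_EXTENT_DATA_KEY, 0) < (257, BTRFS_EXTENT_DATA_KEY, 4096).
 */
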
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);

	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	if (parent_nritems <= 1)
		return 0;

	btrfs_set_lock_blocking(parent);

	for (i = start_slot; i <= end_slot; i++) {
		struct btrfs_key first_key;
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		btrfs_node_key_to_cpu(parent, &first_key, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = find_extent_buffer(fs_info, blocknr);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (!cur) {
				cur = read_tree_block(fs_info, blocknr, gen,
						      parent_level - 1,
						      &first_key);
				if (IS_ERR(cur)) {
					return PTR_ERR(cur);
				} else if (!extent_buffer_uptodate(cur)) {
					free_extent_buffer(cur);
					return -EIO;
				}
			} else if (!uptodate) {
				err = btrfs_read_buffer(cur, gen,
						parent_level - 1, &first_key);
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
			}
		}
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize));
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}

/*
 * search for key in the extent_buffer. The items start at offset p,
 * and they are item_size apart. There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p, int item_size,
				       const struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
	struct btrfs_disk_key *tmp = NULL;
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
	int err;

	if (low > high) {
		btrfs_err(eb->fs_info,
		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	while (low < high) {
		mid = (low + high) / 2;
		offset = p + mid * item_size;

		if (!kaddr || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {

			err = map_private_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&kaddr, &map_start, &map_len);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else if (err == 1) {
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			} else {
				return err;
			}

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

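/*
 * Return value example (added, not from the original file): searching
 * for key K in a block whose keys are [A, B, D] returns 0 with
 * *slot == 1 if K == B; if B < K < D it returns 1 with *slot == 2, the
 * slot where K would have to be inserted; and if K is larger than D it
 * returns 1 with *slot == 3, i.e. max, one past the last item.
 */
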
/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
		     int level, int *slot)
{
	if (level == 0)
		return generic_bin_search(eb,
					  offsetof(struct btrfs_leaf, items),
					  sizeof(struct btrfs_item),
					  key, btrfs_header_nritems(eb),
					  slot);
	else
		return generic_bin_search(eb,
					  offsetof(struct btrfs_node, ptrs),
					  sizeof(struct btrfs_key_ptr),
					  key, btrfs_header_nritems(eb),
					  slot);
}

static void root_add_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + size);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - size);
	spin_unlock(&root->accounting_lock);
}

/* given a node and slot number, this reads the block it points to. The
 * extent buffer is returned with a reference taken (but unlocked).
 */
static noinline struct extent_buffer *
read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent,
	       int slot)
{
	int level = btrfs_header_level(parent);
	struct extent_buffer *eb;
	struct btrfs_key first_key;

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	BUG_ON(level == 0);

	btrfs_node_key_to_cpu(parent, &first_key, slot);
	eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot),
			     btrfs_node_ptr_generation(parent, slot),
			     level - 1, &first_key);
	if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		eb = ERR_PTR(-EIO);
	}

	return eb;
}

1841/*
1842 * node level balancing, used to make sure nodes are in proper order for
1843 * item deletion. We balance from the top down, so we have to make sure
1844 * that a deletion won't leave an node completely empty later on.
1845 */
1846static noinline int balance_level(struct btrfs_trans_handle *trans,
1847 struct btrfs_root *root,
1848 struct btrfs_path *path, int level)
1849{
1850 struct btrfs_fs_info *fs_info = root->fs_info;
1851 struct extent_buffer *right = NULL;
1852 struct extent_buffer *mid;
1853 struct extent_buffer *left = NULL;
1854 struct extent_buffer *parent = NULL;
1855 int ret = 0;
1856 int wret;
1857 int pslot;
1858 int orig_slot = path->slots[level];
1859 u64 orig_ptr;
1860
1861 if (level == 0)
1862 return 0;
1863
1864 mid = path->nodes[level];
1865
1866 WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1867 path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1868 WARN_ON(btrfs_header_generation(mid) != trans->transid);
1869
1870 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1871
1872 if (level < BTRFS_MAX_LEVEL - 1) {
1873 parent = path->nodes[level + 1];
1874 pslot = path->slots[level + 1];
1875 }
1876
1877 /*
1878 * deal with the case where there is only one pointer in the root
1879 * by promoting the node below to a root
1880 */
1881 if (!parent) {
1882 struct extent_buffer *child;
1883
1884 if (btrfs_header_nritems(mid) != 1)
1885 return 0;
1886
1887 /* promote the child to a root */
1888 child = read_node_slot(fs_info, mid, 0);
1889 if (IS_ERR(child)) {
1890 ret = PTR_ERR(child);
1891 btrfs_handle_fs_error(fs_info, ret, NULL);
1892 goto enospc;
1893 }
1894
1895 btrfs_tree_lock(child);
1896 btrfs_set_lock_blocking(child);
1897 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1898 if (ret) {
1899 btrfs_tree_unlock(child);
1900 free_extent_buffer(child);
1901 goto enospc;
1902 }
1903
1904 ret = tree_mod_log_insert_root(root->node, child, 1);
1905 BUG_ON(ret < 0);
1906 rcu_assign_pointer(root->node, child);
1907
1908 add_root_to_dirty_list(root);
1909 btrfs_tree_unlock(child);
1910
1911 path->locks[level] = 0;
1912 path->nodes[level] = NULL;
1913 clean_tree_block(fs_info, mid);
1914 btrfs_tree_unlock(mid);
1915 /* once for the path */
1916 free_extent_buffer(mid);
1917
1918 root_sub_used(root, mid->len);
1919 btrfs_free_tree_block(trans, root, mid, 0, 1);
1920 /* once for the root ptr */
1921 free_extent_buffer_stale(mid);
1922 return 0;
1923 }
1924 if (btrfs_header_nritems(mid) >
1925 BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
1926 return 0;
1927
1928 left = read_node_slot(fs_info, parent, pslot - 1);
1929 if (IS_ERR(left))
1930 left = NULL;
1931
1932 if (left) {
1933 btrfs_tree_lock(left);
1934 btrfs_set_lock_blocking(left);
1935 wret = btrfs_cow_block(trans, root, left,
1936 parent, pslot - 1, &left);
1937 if (wret) {
1938 ret = wret;
1939 goto enospc;
1940 }
1941 }
1942
1943 right = read_node_slot(fs_info, parent, pslot + 1);
1944 if (IS_ERR(right))
1945 right = NULL;
1946
1947 if (right) {
1948 btrfs_tree_lock(right);
1949 btrfs_set_lock_blocking(right);
1950 wret = btrfs_cow_block(trans, root, right,
1951 parent, pslot + 1, &right);
1952 if (wret) {
1953 ret = wret;
1954 goto enospc;
1955 }
1956 }
1957
1958 /* first, try to make some room in the middle buffer */
1959 if (left) {
1960 orig_slot += btrfs_header_nritems(left);
1961 wret = push_node_left(trans, fs_info, left, mid, 1);
1962 if (wret < 0)
1963 ret = wret;
1964 }
1965
1966 /*
1967 * then try to empty the rightmost buffer into the middle
1968 */
1969 if (right) {
1970 wret = push_node_left(trans, fs_info, mid, right, 1);
1971 if (wret < 0 && wret != -ENOSPC)
1972 ret = wret;
1973 if (btrfs_header_nritems(right) == 0) {
1974 clean_tree_block(fs_info, right);
1975 btrfs_tree_unlock(right);
1976 del_ptr(root, path, level + 1, pslot + 1);
1977 root_sub_used(root, right->len);
1978 btrfs_free_tree_block(trans, root, right, 0, 1);
1979 free_extent_buffer_stale(right);
1980 right = NULL;
1981 } else {
1982 struct btrfs_disk_key right_key;
1983 btrfs_node_key(right, &right_key, 0);
1984 ret = tree_mod_log_insert_key(parent, pslot + 1,
1985 MOD_LOG_KEY_REPLACE, GFP_NOFS);
1986 BUG_ON(ret < 0);
1987 btrfs_set_node_key(parent, &right_key, pslot + 1);
1988 btrfs_mark_buffer_dirty(parent);
1989 }
1990 }
1991 if (btrfs_header_nritems(mid) == 1) {
1992 /*
1993 * we're not allowed to leave a node with one item in the
1994 * tree during a delete. A deletion from lower in the tree
1995 * could try to delete the only pointer in this node.
1996 * So, pull some keys from the left.
1997 * There has to be a left pointer at this point because
1998 * otherwise we would have pulled some pointers from the
1999 * right
2000 */
2001 if (!left) {
2002 ret = -EROFS;
2003 btrfs_handle_fs_error(fs_info, ret, NULL);
2004 goto enospc;
2005 }
2006 wret = balance_node_right(trans, fs_info, mid, left);
2007 if (wret < 0) {
2008 ret = wret;
2009 goto enospc;
2010 }
2011 if (wret == 1) {
2012 wret = push_node_left(trans, fs_info, left, mid, 1);
2013 if (wret < 0)
2014 ret = wret;
2015 }
2016 BUG_ON(wret == 1);
2017 }
2018 if (btrfs_header_nritems(mid) == 0) {
2019 clean_tree_block(fs_info, mid);
2020 btrfs_tree_unlock(mid);
2021 del_ptr(root, path, level + 1, pslot);
2022 root_sub_used(root, mid->len);
2023 btrfs_free_tree_block(trans, root, mid, 0, 1);
2024 free_extent_buffer_stale(mid);
2025 mid = NULL;
2026 } else {
2027 /* update the parent key to reflect our changes */
2028 struct btrfs_disk_key mid_key;
2029 btrfs_node_key(mid, &mid_key, 0);
2030 ret = tree_mod_log_insert_key(parent, pslot,
2031 MOD_LOG_KEY_REPLACE, GFP_NOFS);
2032 BUG_ON(ret < 0);
2033 btrfs_set_node_key(parent, &mid_key, pslot);
2034 btrfs_mark_buffer_dirty(parent);
2035 }
2036
2037 /* update the path */
2038 if (left) {
2039 if (btrfs_header_nritems(left) > orig_slot) {
2040 extent_buffer_get(left);
2041 /* left was locked after cow */
2042 path->nodes[level] = left;
2043 path->slots[level + 1] -= 1;
2044 path->slots[level] = orig_slot;
2045 if (mid) {
2046 btrfs_tree_unlock(mid);
2047 free_extent_buffer(mid);
2048 }
2049 } else {
2050 orig_slot -= btrfs_header_nritems(left);
2051 path->slots[level] = orig_slot;
2052 }
2053 }
2054 /* double check we haven't messed things up */
2055 if (orig_ptr !=
2056 btrfs_node_blockptr(path->nodes[level], path->slots[level]))
2057 BUG();
2058enospc:
2059 if (right) {
2060 btrfs_tree_unlock(right);
2061 free_extent_buffer(right);
2062 }
2063 if (left) {
2064 if (path->nodes[level] != left)
2065 btrfs_tree_unlock(left);
2066 free_extent_buffer(left);
2067 }
2068 return ret;
2069}
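
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the rebalancing cases above, assuming 16-pointer nodes so that the 1/4
 * threshold is 4:
 *
 *	parent [ .. L(10) M(3) R(9) .. ]      mid dropped to 3 items
 *
 *	1) M pushed into L:  [ L(13) R(9) ]   mid emptied, its parent
 *	                                      pointer deleted
 *	2) R pushed into M:  [ L(10) M+R ]    if step 1 could not empty mid
 *	3) root holding a single pointer: the child is promoted and the
 *	   tree shrinks by one level
 *
 * Real capacities come from BTRFS_NODEPTRS_PER_BLOCK(fs_info).
 */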
2070
2071/* Node balancing for insertion. Here we only split or push nodes around
2072 * when they are completely full. This is also done top down, so we
2073 * have to be pessimistic.
2074 */
2075static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2076 struct btrfs_root *root,
2077 struct btrfs_path *path, int level)
2078{
2079 struct btrfs_fs_info *fs_info = root->fs_info;
2080 struct extent_buffer *right = NULL;
2081 struct extent_buffer *mid;
2082 struct extent_buffer *left = NULL;
2083 struct extent_buffer *parent = NULL;
2084 int ret = 0;
2085 int wret;
2086 int pslot;
2087 int orig_slot = path->slots[level];
2088
2089 if (level == 0)
2090 return 1;
2091
2092 mid = path->nodes[level];
2093 WARN_ON(btrfs_header_generation(mid) != trans->transid);
2094
2095 if (level < BTRFS_MAX_LEVEL - 1) {
2096 parent = path->nodes[level + 1];
2097 pslot = path->slots[level + 1];
2098 }
2099
2100 if (!parent)
2101 return 1;
2102
2103 left = read_node_slot(fs_info, parent, pslot - 1);
2104 if (IS_ERR(left))
2105 left = NULL;
2106
2107 /* first, try to make some room in the middle buffer */
2108 if (left) {
2109 u32 left_nr;
2110
2111 btrfs_tree_lock(left);
2112 btrfs_set_lock_blocking(left);
2113
2114 left_nr = btrfs_header_nritems(left);
2115 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2116 wret = 1;
2117 } else {
2118 ret = btrfs_cow_block(trans, root, left, parent,
2119 pslot - 1, &left);
2120 if (ret)
2121 wret = 1;
2122 else {
2123 wret = push_node_left(trans, fs_info,
2124 left, mid, 0);
2125 }
2126 }
2127 if (wret < 0)
2128 ret = wret;
2129 if (wret == 0) {
2130 struct btrfs_disk_key disk_key;
2131 orig_slot += left_nr;
2132 btrfs_node_key(mid, &disk_key, 0);
2133 ret = tree_mod_log_insert_key(parent, pslot,
2134 MOD_LOG_KEY_REPLACE, GFP_NOFS);
2135 BUG_ON(ret < 0);
2136 btrfs_set_node_key(parent, &disk_key, pslot);
2137 btrfs_mark_buffer_dirty(parent);
2138 if (btrfs_header_nritems(left) > orig_slot) {
2139 path->nodes[level] = left;
2140 path->slots[level + 1] -= 1;
2141 path->slots[level] = orig_slot;
2142 btrfs_tree_unlock(mid);
2143 free_extent_buffer(mid);
2144 } else {
2145 orig_slot -=
2146 btrfs_header_nritems(left);
2147 path->slots[level] = orig_slot;
2148 btrfs_tree_unlock(left);
2149 free_extent_buffer(left);
2150 }
2151 return 0;
2152 }
2153 btrfs_tree_unlock(left);
2154 free_extent_buffer(left);
2155 }
2156 right = read_node_slot(fs_info, parent, pslot + 1);
2157 if (IS_ERR(right))
2158 right = NULL;
2159
2160 /*
2161 * then try to push part of the middle buffer into the right one
2162 */
2163 if (right) {
2164 u32 right_nr;
2165
2166 btrfs_tree_lock(right);
2167 btrfs_set_lock_blocking(right);
2168
2169 right_nr = btrfs_header_nritems(right);
2170 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2171 wret = 1;
2172 } else {
2173 ret = btrfs_cow_block(trans, root, right,
2174 parent, pslot + 1,
2175 &right);
2176 if (ret)
2177 wret = 1;
2178 else {
2179 wret = balance_node_right(trans, fs_info,
2180 right, mid);
2181 }
2182 }
2183 if (wret < 0)
2184 ret = wret;
2185 if (wret == 0) {
2186 struct btrfs_disk_key disk_key;
2187
2188 btrfs_node_key(right, &disk_key, 0);
2189 ret = tree_mod_log_insert_key(parent, pslot + 1,
2190 MOD_LOG_KEY_REPLACE, GFP_NOFS);
2191 BUG_ON(ret < 0);
2192 btrfs_set_node_key(parent, &disk_key, pslot + 1);
2193 btrfs_mark_buffer_dirty(parent);
2194
2195 if (btrfs_header_nritems(mid) <= orig_slot) {
2196 path->nodes[level] = right;
2197 path->slots[level + 1] += 1;
2198 path->slots[level] = orig_slot -
2199 btrfs_header_nritems(mid);
2200 btrfs_tree_unlock(mid);
2201 free_extent_buffer(mid);
2202 } else {
2203 btrfs_tree_unlock(right);
2204 free_extent_buffer(right);
2205 }
2206 return 0;
2207 }
2208 btrfs_tree_unlock(right);
2209 free_extent_buffer(right);
2210 }
2211 return 1;
2212}
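
/*
 * Illustrative sketch (editor's addition): this path only helps when
 * 'mid' is completely full. Assuming 16-pointer nodes:
 *
 *	before: parent [ L(10) M(16) R(16) ]
 *	after:  parent [ L(16) M(10) R(16) ]
 *
 * push_node_left() keeps at least 8 pointers in the source, so here it
 * moves min(16 - 8, 6 free) = 6 of mid's pointers and the parent key for
 * mid is updated. If the left sibling is full too, up to half of mid is
 * pushed right instead; if both are full, 1 is returned and the caller
 * falls back to split_node().
 */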
2213
2214/*
2215 * readahead one full node of leaves, finding things that are close
2216 * to the block in 'slot', and triggering readahead on them.
2217 */
2218static void reada_for_search(struct btrfs_fs_info *fs_info,
2219 struct btrfs_path *path,
2220 int level, int slot, u64 objectid)
2221{
2222 struct extent_buffer *node;
2223 struct btrfs_disk_key disk_key;
2224 u32 nritems;
2225 u64 search;
2226 u64 target;
2227 u64 nread = 0;
2228 struct extent_buffer *eb;
2229 u32 nr;
2230 u32 blocksize;
2231 u32 nscan = 0;
2232
2233 if (level != 1)
2234 return;
2235
2236 if (!path->nodes[level])
2237 return;
2238
2239 node = path->nodes[level];
2240
2241 search = btrfs_node_blockptr(node, slot);
2242 blocksize = fs_info->nodesize;
2243 eb = find_extent_buffer(fs_info, search);
2244 if (eb) {
2245 free_extent_buffer(eb);
2246 return;
2247 }
2248
2249 target = search;
2250
2251 nritems = btrfs_header_nritems(node);
2252 nr = slot;
2253
2254 while (1) {
2255 if (path->reada == READA_BACK) {
2256 if (nr == 0)
2257 break;
2258 nr--;
2259 } else if (path->reada == READA_FORWARD) {
2260 nr++;
2261 if (nr >= nritems)
2262 break;
2263 }
2264 if (path->reada == READA_BACK && objectid) {
2265 btrfs_node_key(node, &disk_key, nr);
2266 if (btrfs_disk_key_objectid(&disk_key) != objectid)
2267 break;
2268 }
2269 search = btrfs_node_blockptr(node, nr);
2270 if ((search <= target && target - search <= 65536) ||
2271 (search > target && search - target <= 65536)) {
2272 readahead_tree_block(fs_info, search);
2273 nread += blocksize;
2274 }
2275 nscan++;
2276 if ((nread > 65536 || nscan > 32))
2277 break;
2278 }
2279}
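
/*
 * Example (editor's addition, illustrative only): callers opt in by
 * setting path->reada before the search:
 *
 *	path->reada = READA_FORWARD;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *
 * read_block_for_search() then calls this helper, which issues readahead
 * only for blocks within 64KiB of the target and stops once 64KiB has
 * been submitted or 32 slots have been scanned.
 */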
2280
2281static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2282 struct btrfs_path *path, int level)
2283{
2284 int slot;
2285 int nritems;
2286 struct extent_buffer *parent;
2287 struct extent_buffer *eb;
2288 u64 gen;
2289 u64 block1 = 0;
2290 u64 block2 = 0;
2291
2292 parent = path->nodes[level + 1];
2293 if (!parent)
2294 return;
2295
2296 nritems = btrfs_header_nritems(parent);
2297 slot = path->slots[level + 1];
2298
2299 if (slot > 0) {
2300 block1 = btrfs_node_blockptr(parent, slot - 1);
2301 gen = btrfs_node_ptr_generation(parent, slot - 1);
2302 eb = find_extent_buffer(fs_info, block1);
2303 /*
2304 * if we get -EAGAIN from btrfs_buffer_uptodate, we
2305 * don't want to return -EAGAIN here. That would loop
2306 * forever
2307 */
2308 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2309 block1 = 0;
2310 free_extent_buffer(eb);
2311 }
2312 if (slot + 1 < nritems) {
2313 block2 = btrfs_node_blockptr(parent, slot + 1);
2314 gen = btrfs_node_ptr_generation(parent, slot + 1);
2315 eb = find_extent_buffer(fs_info, block2);
2316 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2317 block2 = 0;
2318 free_extent_buffer(eb);
2319 }
2320
2321 if (block1)
2322 readahead_tree_block(fs_info, block1);
2323 if (block2)
2324 readahead_tree_block(fs_info, block2);
2325}
2326
2327
2328/*
2329 * when we walk down the tree, it is usually safe to unlock the higher layers
2330 * in the tree. The exceptions are when our path goes through slot 0, because
2331 * operations on the tree might require changing key pointers higher up in the
2332 * tree.
2333 *
2334 * callers might also have set path->keep_locks, which tells this code to keep
2335 * the lock if the path points to the last slot in the block. This is part of
2336 * walking through the tree, and selecting the next slot in the higher block.
2337 *
2338 * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so
2339 * if lowest_unlock is 1, level 0 won't be unlocked
2340 */
2341static noinline void unlock_up(struct btrfs_path *path, int level,
2342 int lowest_unlock, int min_write_lock_level,
2343 int *write_lock_level)
2344{
2345 int i;
2346 int skip_level = level;
2347 int no_skips = 0;
2348 struct extent_buffer *t;
2349
2350 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2351 if (!path->nodes[i])
2352 break;
2353 if (!path->locks[i])
2354 break;
2355 if (!no_skips && path->slots[i] == 0) {
2356 skip_level = i + 1;
2357 continue;
2358 }
2359 if (!no_skips && path->keep_locks) {
2360 u32 nritems;
2361 t = path->nodes[i];
2362 nritems = btrfs_header_nritems(t);
2363 if (nritems < 1 || path->slots[i] >= nritems - 1) {
2364 skip_level = i + 1;
2365 continue;
2366 }
2367 }
2368 if (skip_level < i && i >= lowest_unlock)
2369 no_skips = 1;
2370
2371 t = path->nodes[i];
2372 if (i >= lowest_unlock && i > skip_level) {
2373 btrfs_tree_unlock_rw(t, path->locks[i]);
2374 path->locks[i] = 0;
2375 if (write_lock_level &&
2376 i > min_write_lock_level &&
2377 i <= *write_lock_level) {
2378 *write_lock_level = i - 1;
2379 }
2380 }
2381 }
2382}
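
/*
 * Worked example (editor's addition): with lowest_unlock == 1 and
 * path->slots == { L0: 5, L1: 0, L2: 2, L3: 1 }, the slot-0 entry at
 * level 1 sets skip_level to 2, so levels 1 and 2 stay locked (a key
 * change in slot 0 has to be propagated into the parent), while level 3
 * and everything above it are unlocked and write_lock_level is lowered
 * accordingly.
 */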
2383
2384/*
2385 * This releases any locks held in the path starting at level and
2386 * going all the way up to the root.
2387 *
2388 * btrfs_search_slot will keep the lock held on higher nodes in a few
2389 * corner cases, such as COW of the block at slot zero in the node. This
2390 * ignores those rules, and it should only be called when there are no
2391 * more updates to be done higher up in the tree.
2392 */
2393noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2394{
2395 int i;
2396
2397 if (path->keep_locks)
2398 return;
2399
2400 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2401 if (!path->nodes[i])
2402 continue;
2403 if (!path->locks[i])
2404 continue;
2405 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2406 path->locks[i] = 0;
2407 }
2408}
2409
2410/*
2411 * helper function for btrfs_search_slot. The goal is to find a block
2412 * in cache without setting the path to blocking. If we find the block
2413 * we return zero and the path is unchanged.
2414 *
2415 * If we can't find the block, we set the path blocking and do some
2416 * reada. -EAGAIN is returned and the search must be repeated.
2417 */
2418static int
2419read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
2420 struct extent_buffer **eb_ret, int level, int slot,
2421 const struct btrfs_key *key)
2422{
2423 struct btrfs_fs_info *fs_info = root->fs_info;
2424 u64 blocknr;
2425 u64 gen;
2426 struct extent_buffer *b = *eb_ret;
2427 struct extent_buffer *tmp;
2428 struct btrfs_key first_key;
2429 int ret;
2430 int parent_level;
2431
2432 blocknr = btrfs_node_blockptr(b, slot);
2433 gen = btrfs_node_ptr_generation(b, slot);
2434 parent_level = btrfs_header_level(b);
2435 btrfs_node_key_to_cpu(b, &first_key, slot);
2436
2437 tmp = find_extent_buffer(fs_info, blocknr);
2438 if (tmp) {
2439 /* first we do an atomic uptodate check */
2440 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2441 /*
2442 * Do an extra check on first_key: the eb can be stale
2443 * due to being cached, read from scrub, or having
2444 * multiple parents (shared tree blocks).
2445 */
2446 if (btrfs_verify_level_key(fs_info, tmp,
2447 parent_level - 1, &first_key, gen)) {
2448 free_extent_buffer(tmp);
2449 return -EUCLEAN;
2450 }
2451 *eb_ret = tmp;
2452 return 0;
2453 }
2454
2455 /* the pages were up to date, but we failed
2456 * the generation number check. Do a full
2457 * read for the generation number that is correct.
2458 * We must do this without dropping locks so
2459 * we can trust our generation number
2460 */
2461 btrfs_set_path_blocking(p);
2462
2463 /* now we're allowed to do a blocking uptodate check */
2464 ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
2465 if (!ret) {
2466 *eb_ret = tmp;
2467 return 0;
2468 }
2469 free_extent_buffer(tmp);
2470 btrfs_release_path(p);
2471 return -EIO;
2472 }
2473
2474 /*
2475 * reduce lock contention at high levels
2476 * of the btree by dropping locks before
2477 * we read. Don't release the lock on the current
2478 * level because we need to walk this node to figure
2479 * out which blocks to read.
2480 */
2481 btrfs_unlock_up_safe(p, level + 1);
2482 btrfs_set_path_blocking(p);
2483
2484 if (p->reada != READA_NONE)
2485 reada_for_search(fs_info, p, level, slot, key->objectid);
2486
2487 ret = -EAGAIN;
2488 tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
2489 &first_key);
2490 if (!IS_ERR(tmp)) {
2491 /*
2492 * If the read above didn't mark this buffer up to date,
2493 * it will never end up being up to date. Set ret to EIO now
2494 * and give up so that our caller doesn't loop forever
2495 * on our EAGAINs.
2496 */
2497 if (!extent_buffer_uptodate(tmp))
2498 ret = -EIO;
2499 free_extent_buffer(tmp);
2500 } else {
2501 ret = PTR_ERR(tmp);
2502 }
2503
2504 btrfs_release_path(p);
2505 return ret;
2506}
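
/*
 * Note (editor's addition): the -EAGAIN contract is what keeps the
 * search lock-light; only when a block must actually be read from disk
 * is the path dropped and the caller asked to restart from the root:
 *
 *	err = read_block_for_search(root, p, &b, level, slot, key);
 *	if (err == -EAGAIN)
 *		goto again;
 */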
2507
2508/*
2509 * helper function for btrfs_search_slot. This does all of the checks
2510 * for node-level blocks and does any balancing required based on
2511 * the ins_len.
2512 *
2513 * If no extra work was required, zero is returned. If we had to
2514 * drop the path, -EAGAIN is returned and btrfs_search_slot must
2515 * start over
2516 */
2517static int
2518setup_nodes_for_search(struct btrfs_trans_handle *trans,
2519 struct btrfs_root *root, struct btrfs_path *p,
2520 struct extent_buffer *b, int level, int ins_len,
2521 int *write_lock_level)
2522{
2523 struct btrfs_fs_info *fs_info = root->fs_info;
2524 int ret;
2525
2526 if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2527 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
2528 int sret;
2529
2530 if (*write_lock_level < level + 1) {
2531 *write_lock_level = level + 1;
2532 btrfs_release_path(p);
2533 goto again;
2534 }
2535
2536 btrfs_set_path_blocking(p);
2537 reada_for_balance(fs_info, p, level);
2538 sret = split_node(trans, root, p, level);
2539 btrfs_clear_path_blocking(p, NULL, 0);
2540
2541 BUG_ON(sret > 0);
2542 if (sret) {
2543 ret = sret;
2544 goto done;
2545 }
2546 b = p->nodes[level];
2547 } else if (ins_len < 0 && btrfs_header_nritems(b) <
2548 BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
2549 int sret;
2550
2551 if (*write_lock_level < level + 1) {
2552 *write_lock_level = level + 1;
2553 btrfs_release_path(p);
2554 goto again;
2555 }
2556
2557 btrfs_set_path_blocking(p);
2558 reada_for_balance(fs_info, p, level);
2559 sret = balance_level(trans, root, p, level);
2560 btrfs_clear_path_blocking(p, NULL, 0);
2561
2562 if (sret) {
2563 ret = sret;
2564 goto done;
2565 }
2566 b = p->nodes[level];
2567 if (!b) {
2568 btrfs_release_path(p);
2569 goto again;
2570 }
2571 BUG_ON(btrfs_header_nritems(b) == 1);
2572 }
2573 return 0;
2574
2575again:
2576 ret = -EAGAIN;
2577done:
2578 return ret;
2579}
2580
2581static void key_search_validate(struct extent_buffer *b,
2582 const struct btrfs_key *key,
2583 int level)
2584{
2585#ifdef CONFIG_BTRFS_ASSERT
2586 struct btrfs_disk_key disk_key;
2587
2588 btrfs_cpu_key_to_disk(&disk_key, key);
2589
2590 if (level == 0)
2591 ASSERT(!memcmp_extent_buffer(b, &disk_key,
2592 offsetof(struct btrfs_leaf, items[0].key),
2593 sizeof(disk_key)));
2594 else
2595 ASSERT(!memcmp_extent_buffer(b, &disk_key,
2596 offsetof(struct btrfs_node, ptrs[0].key),
2597 sizeof(disk_key)));
2598#endif
2599}
2600
2601static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
2602 int level, int *prev_cmp, int *slot)
2603{
2604 if (*prev_cmp != 0) {
2605 *prev_cmp = btrfs_bin_search(b, key, level, slot);
2606 return *prev_cmp;
2607 }
2608
2609 key_search_validate(b, key, level);
2610 *slot = 0;
2611
2612 return 0;
2613}
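
/*
 * Note (editor's addition): the *prev_cmp shortcut relies on node keys
 * being the first key of the child they point to. Once a bin search at
 * some level matched the key exactly, e.g. (256 INODE_ITEM 0), every
 * block reached through that pointer must start with the very same key,
 * so lower levels can take slot 0 without searching.
 */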
2614
2615int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2616 u64 iobjectid, u64 ioff, u8 key_type,
2617 struct btrfs_key *found_key)
2618{
2619 int ret;
2620 struct btrfs_key key;
2621 struct extent_buffer *eb;
2622
2623 ASSERT(path);
2624 ASSERT(found_key);
2625
2626 key.type = key_type;
2627 key.objectid = iobjectid;
2628 key.offset = ioff;
2629
2630 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2631 if (ret < 0)
2632 return ret;
2633
2634 eb = path->nodes[0];
2635 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2636 ret = btrfs_next_leaf(fs_root, path);
2637 if (ret)
2638 return ret;
2639 eb = path->nodes[0];
2640 }
2641
2642 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2643 if (found_key->type != key.type ||
2644 found_key->objectid != key.objectid)
2645 return 1;
2646
2647 return 0;
2648}
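
/*
 * Usage sketch (editor's addition; 'ino' is illustrative): look up the
 * inode item of 'ino', treating ioff only as a starting offset:
 *
 *	struct btrfs_key found;
 *	struct btrfs_path *path = btrfs_alloc_path();
 *
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_find_item(fs_root, path, ino, 0,
 *			      BTRFS_INODE_ITEM_KEY, &found);
 *	btrfs_free_path(path);
 *
 * ret == 0 means 'found' has a matching objectid and type (the offset
 * may differ from ioff); 1 means no such item, < 0 is an error.
 */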
2649
2650static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
2651 struct btrfs_path *p,
2652 int write_lock_level)
2653{
2654 struct btrfs_fs_info *fs_info = root->fs_info;
2655 struct extent_buffer *b;
2656 int root_lock;
2657 int level = 0;
2658
2659 /* We try very hard to do read locks on the root */
2660 root_lock = BTRFS_READ_LOCK;
2661
2662 if (p->search_commit_root) {
2663 /*
2664 * The commit roots are read only so we always do read locks,
2665 * and we always must hold the commit_root_sem when doing
2666 * searches on them, the only exception is send where we don't
2667 * want to block transaction commits for a long time, so
2668 * we need to clone the commit root in order to avoid races
2669 * with transaction commits that create a snapshot of one of
2670 * the roots used by a send operation.
2671 */
2672 if (p->need_commit_sem) {
2673 down_read(&fs_info->commit_root_sem);
2674 b = btrfs_clone_extent_buffer(root->commit_root);
2675 up_read(&fs_info->commit_root_sem);
2676 if (!b)
2677 return ERR_PTR(-ENOMEM);
2678
2679 } else {
2680 b = root->commit_root;
2681 extent_buffer_get(b);
2682 }
2683 level = btrfs_header_level(b);
2684 /*
2685 * Ensure that all callers have set skip_locking when
2686 * p->search_commit_root = 1.
2687 */
2688 ASSERT(p->skip_locking == 1);
2689
2690 goto out;
2691 }
2692
2693 if (p->skip_locking) {
2694 b = btrfs_root_node(root);
2695 level = btrfs_header_level(b);
2696 goto out;
2697 }
2698
2699 /*
2700 * If the level is set to maximum, we can skip trying to get the read
2701 * lock.
2702 */
2703 if (write_lock_level < BTRFS_MAX_LEVEL) {
2704 /*
2705 * We don't know the level of the root node until we actually
2706 * have it read locked
2707 */
2708 b = btrfs_read_lock_root_node(root);
2709 level = btrfs_header_level(b);
2710 if (level > write_lock_level)
2711 goto out;
2712
2713 /* Whoops, must trade for write lock */
2714 btrfs_tree_read_unlock(b);
2715 free_extent_buffer(b);
2716 }
2717
2718 b = btrfs_lock_root_node(root);
2719 root_lock = BTRFS_WRITE_LOCK;
2720
2721 /* The level might have changed, check again */
2722 level = btrfs_header_level(b);
2723
2724out:
2725 p->nodes[level] = b;
2726 if (!p->skip_locking)
2727 p->locks[level] = root_lock;
2728 /*
2729 * Callers are responsible for dropping b's references.
2730 */
2731 return b;
2732}
2733
2734
2735/*
2736 * btrfs_search_slot - look for a key in a tree and perform necessary
2737 * modifications to preserve tree invariants.
2738 *
2739 * @trans:	Handle of transaction, used when modifying the tree
2740 * @root:	The root of the tree we are searching
2741 * @key:	The key we are looking for
2742 * @p:		Holds all btree nodes along the search path
2743 * @ins_len:	Indicates purpose of search: for inserts it is the total size
2744 *		of the item to insert, for deletions it's -1, 0 for plain searches
2745 * @cow:	boolean indicating whether CoW operations should be performed.
2746 *		Must always be 1 when modifying the tree.
2747 *
2748 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2749 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2750 *
2751 * If @key is found, 0 is returned and you can find the item in the leaf level
2752 * of the path (level 0)
2753 *
2754 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2755 * points to the slot where it should be inserted
2756 *
2757 * If an error is encountered while searching the tree a negative error number
2758 * is returned
2759 */
2760int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2761 const struct btrfs_key *key, struct btrfs_path *p,
2762 int ins_len, int cow)
2763{
2764 struct btrfs_fs_info *fs_info = root->fs_info;
2765 struct extent_buffer *b;
2766 int slot;
2767 int ret;
2768 int err;
2769 int level;
2770 int lowest_unlock = 1;
2771 /* everything at write_lock_level or lower must be write locked */
2772 int write_lock_level = 0;
2773 u8 lowest_level = 0;
2774 int min_write_lock_level;
2775 int prev_cmp;
2776
2777 lowest_level = p->lowest_level;
2778 WARN_ON(lowest_level && ins_len > 0);
2779 WARN_ON(p->nodes[0] != NULL);
2780 BUG_ON(!cow && ins_len);
2781
2782 if (ins_len < 0) {
2783 lowest_unlock = 2;
2784
2785 /* when we are removing items, we might have to go up to level
2786 * two as we update tree pointers. Make sure we keep the write
2787 * locks on those levels as well
2788 */
2789 write_lock_level = 2;
2790 } else if (ins_len > 0) {
2791 /*
2792 * for inserting items, make sure we have a write lock on
2793 * level 1 so we can update keys
2794 */
2795 write_lock_level = 1;
2796 }
2797
2798 if (!cow)
2799 write_lock_level = -1;
2800
2801 if (cow && (p->keep_locks || p->lowest_level))
2802 write_lock_level = BTRFS_MAX_LEVEL;
2803
2804 min_write_lock_level = write_lock_level;
2805
2806again:
2807 prev_cmp = -1;
2808 b = btrfs_search_slot_get_root(root, p, write_lock_level);
2809 if (IS_ERR(b)) {
2810 ret = PTR_ERR(b);
2811 goto done;
2812 }
2813
2814 while (b) {
2815 level = btrfs_header_level(b);
2816
2817 /*
2818 * setup the path here so we can release it under lock
2819 * contention with the cow code
2820 */
2821 if (cow) {
2822 bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2823
2824 /*
2825 * if we don't really need to cow this block
2826 * then we don't want to set the path blocking,
2827 * so we test it here
2828 */
2829 if (!should_cow_block(trans, root, b)) {
2830 trans->dirty = true;
2831 goto cow_done;
2832 }
2833
2834 /*
2835 * must have write locks on this node and the
2836 * parent
2837 */
2838 if (level > write_lock_level ||
2839 (level + 1 > write_lock_level &&
2840 level + 1 < BTRFS_MAX_LEVEL &&
2841 p->nodes[level + 1])) {
2842 write_lock_level = level + 1;
2843 btrfs_release_path(p);
2844 goto again;
2845 }
2846
2847 btrfs_set_path_blocking(p);
2848 if (last_level)
2849 err = btrfs_cow_block(trans, root, b, NULL, 0,
2850 &b);
2851 else
2852 err = btrfs_cow_block(trans, root, b,
2853 p->nodes[level + 1],
2854 p->slots[level + 1], &b);
2855 if (err) {
2856 ret = err;
2857 goto done;
2858 }
2859 }
2860cow_done:
2861 p->nodes[level] = b;
2862 btrfs_clear_path_blocking(p, NULL, 0);
2863
2864 /*
2865 * we have a lock on b and as long as we aren't changing
2866 * the tree, there is no way for the items in b to change.
2867 * It is safe to drop the lock on our parent before we
2868 * go through the expensive btree search on b.
2869 *
2870 * If we're inserting or deleting (ins_len != 0), then we might
2871 * be changing slot zero, which may require changing the parent.
2872 * So, we can't drop the lock until after we know which slot
2873 * we're operating on.
2874 */
2875 if (!ins_len && !p->keep_locks) {
2876 int u = level + 1;
2877
2878 if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2879 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2880 p->locks[u] = 0;
2881 }
2882 }
2883
2884 ret = key_search(b, key, level, &prev_cmp, &slot);
2885 if (ret < 0)
2886 goto done;
2887
2888 if (level != 0) {
2889 int dec = 0;
2890 if (ret && slot > 0) {
2891 dec = 1;
2892 slot -= 1;
2893 }
2894 p->slots[level] = slot;
2895 err = setup_nodes_for_search(trans, root, p, b, level,
2896 ins_len, &write_lock_level);
2897 if (err == -EAGAIN)
2898 goto again;
2899 if (err) {
2900 ret = err;
2901 goto done;
2902 }
2903 b = p->nodes[level];
2904 slot = p->slots[level];
2905
2906 /*
2907 * slot 0 is special, if we change the key
2908 * we have to update the parent pointer
2909 * which means we must have a write lock
2910 * on the parent
2911 */
2912 if (slot == 0 && ins_len &&
2913 write_lock_level < level + 1) {
2914 write_lock_level = level + 1;
2915 btrfs_release_path(p);
2916 goto again;
2917 }
2918
2919 unlock_up(p, level, lowest_unlock,
2920 min_write_lock_level, &write_lock_level);
2921
2922 if (level == lowest_level) {
2923 if (dec)
2924 p->slots[level]++;
2925 goto done;
2926 }
2927
2928 err = read_block_for_search(root, p, &b, level,
2929 slot, key);
2930 if (err == -EAGAIN)
2931 goto again;
2932 if (err) {
2933 ret = err;
2934 goto done;
2935 }
2936
2937 if (!p->skip_locking) {
2938 level = btrfs_header_level(b);
2939 if (level <= write_lock_level) {
2940 err = btrfs_try_tree_write_lock(b);
2941 if (!err) {
2942 btrfs_set_path_blocking(p);
2943 btrfs_tree_lock(b);
2944 btrfs_clear_path_blocking(p, b,
2945 BTRFS_WRITE_LOCK);
2946 }
2947 p->locks[level] = BTRFS_WRITE_LOCK;
2948 } else {
2949 err = btrfs_tree_read_lock_atomic(b);
2950 if (!err) {
2951 btrfs_set_path_blocking(p);
2952 btrfs_tree_read_lock(b);
2953 btrfs_clear_path_blocking(p, b,
2954 BTRFS_READ_LOCK);
2955 }
2956 p->locks[level] = BTRFS_READ_LOCK;
2957 }
2958 p->nodes[level] = b;
2959 }
2960 } else {
2961 p->slots[level] = slot;
2962 if (ins_len > 0 &&
2963 btrfs_leaf_free_space(fs_info, b) < ins_len) {
2964 if (write_lock_level < 1) {
2965 write_lock_level = 1;
2966 btrfs_release_path(p);
2967 goto again;
2968 }
2969
2970 btrfs_set_path_blocking(p);
2971 err = split_leaf(trans, root, key,
2972 p, ins_len, ret == 0);
2973 btrfs_clear_path_blocking(p, NULL, 0);
2974
2975 BUG_ON(err > 0);
2976 if (err) {
2977 ret = err;
2978 goto done;
2979 }
2980 }
2981 if (!p->search_for_split)
2982 unlock_up(p, level, lowest_unlock,
2983 min_write_lock_level, &write_lock_level);
2984 goto done;
2985 }
2986 }
2987 ret = 1;
2988done:
2989 /*
2990 * we don't really know what they plan on doing with the path
2991 * from here on, so for now just mark it as blocking
2992 */
2993 if (!p->leave_spinning)
2994 btrfs_set_path_blocking(p);
2995 if (ret < 0 && !p->skip_release_on_error)
2996 btrfs_release_path(p);
2997 return ret;
2998}
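
/*
 * Usage sketch (editor's addition, error handling trimmed): a read-only
 * lookup followed by the shape of an insertion search:
 *
 *	struct btrfs_key key = { .objectid = ino,
 *				 .type = BTRFS_INODE_ITEM_KEY,
 *				 .offset = 0 };
 *	struct btrfs_path *path = btrfs_alloc_path();
 *
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret == 0)
 *		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 *				      struct btrfs_inode_item);
 *	btrfs_free_path(path);
 *
 * For an insertion, pass the transaction handle, ins_len = item size +
 * sizeof(struct btrfs_item) and cow = 1; a return of 1 then means the
 * tree was split/COWed as needed and path->slots[0] is the insert slot.
 */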
2999
3000/*
3001 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
3002 * current state of the tree together with the operations recorded in the tree
3003 * modification log to search for the key in a previous version of this tree, as
3004 * denoted by the time_seq parameter.
3005 *
3006 * Naturally, there is no support for insert, delete or cow operations.
3007 *
3008 * The resulting path and return value will be set up as if we called
3009 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
3010 */
3011int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
3012 struct btrfs_path *p, u64 time_seq)
3013{
3014 struct btrfs_fs_info *fs_info = root->fs_info;
3015 struct extent_buffer *b;
3016 int slot;
3017 int ret;
3018 int err;
3019 int level;
3020 int lowest_unlock = 1;
3021 u8 lowest_level = 0;
3022 int prev_cmp = -1;
3023
3024 lowest_level = p->lowest_level;
3025 WARN_ON(p->nodes[0] != NULL);
3026
3027 if (p->search_commit_root) {
3028 BUG_ON(time_seq);
3029 return btrfs_search_slot(NULL, root, key, p, 0, 0);
3030 }
3031
3032again:
3033 b = get_old_root(root, time_seq);
3034 if (!b) {
3035 ret = -EIO;
3036 goto done;
3037 }
3038 level = btrfs_header_level(b);
3039 p->locks[level] = BTRFS_READ_LOCK;
3040
3041 while (b) {
3042 level = btrfs_header_level(b);
3043 p->nodes[level] = b;
3044 btrfs_clear_path_blocking(p, NULL, 0);
3045
3046 /*
3047 * we have a lock on b and as long as we aren't changing
3048 * the tree, there is no way for the items in b to change.
3049 * It is safe to drop the lock on our parent before we
3050 * go through the expensive btree search on b.
3051 */
3052 btrfs_unlock_up_safe(p, level + 1);
3053
3054 /*
3055 * Since we can unwind ebs we want to do a real search every
3056 * time.
3057 */
3058 prev_cmp = -1;
3059 ret = key_search(b, key, level, &prev_cmp, &slot);
3060
3061 if (level != 0) {
3062 int dec = 0;
3063 if (ret && slot > 0) {
3064 dec = 1;
3065 slot -= 1;
3066 }
3067 p->slots[level] = slot;
3068 unlock_up(p, level, lowest_unlock, 0, NULL);
3069
3070 if (level == lowest_level) {
3071 if (dec)
3072 p->slots[level]++;
3073 goto done;
3074 }
3075
3076 err = read_block_for_search(root, p, &b, level,
3077 slot, key);
3078 if (err == -EAGAIN)
3079 goto again;
3080 if (err) {
3081 ret = err;
3082 goto done;
3083 }
3084
3085 level = btrfs_header_level(b);
3086 err = btrfs_tree_read_lock_atomic(b);
3087 if (!err) {
3088 btrfs_set_path_blocking(p);
3089 btrfs_tree_read_lock(b);
3090 btrfs_clear_path_blocking(p, b,
3091 BTRFS_READ_LOCK);
3092 }
3093 b = tree_mod_log_rewind(fs_info, p, b, time_seq);
3094 if (!b) {
3095 ret = -ENOMEM;
3096 goto done;
3097 }
3098 p->locks[level] = BTRFS_READ_LOCK;
3099 p->nodes[level] = b;
3100 } else {
3101 p->slots[level] = slot;
3102 unlock_up(p, level, lowest_unlock, 0, NULL);
3103 goto done;
3104 }
3105 }
3106 ret = 1;
3107done:
3108 if (!p->leave_spinning)
3109 btrfs_set_path_blocking(p);
3110 if (ret < 0)
3111 btrfs_release_path(p);
3112
3113 return ret;
3114}
3115
3116/*
3117 * helper to use instead of btrfs_search_slot() if no exact match is
3118 * needed; the next or previous item is returned instead.
3119 * When find_higher is true, the next higher item is returned, the next lower
3120 * otherwise.
3121 * When return_any and find_higher are both true, and no higher item is found,
3122 * return the next lower instead.
3123 * When return_any is true and find_higher is false, and no lower item is found,
3124 * return the next higher instead.
3125 * It returns 0 if any item is found, 1 if none is found (tree empty), and
3126 * < 0 on error
3127 */
3128int btrfs_search_slot_for_read(struct btrfs_root *root,
3129 const struct btrfs_key *key,
3130 struct btrfs_path *p, int find_higher,
3131 int return_any)
3132{
3133 int ret;
3134 struct extent_buffer *leaf;
3135
3136again:
3137 ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3138 if (ret <= 0)
3139 return ret;
3140 /*
3141 * a return value of 1 means the path is at the position where the
3142 * item should be inserted. Normally this is the next bigger item,
3143 * but in case the previous item is the last in a leaf, path points
3144 * to the first free slot in the previous leaf, i.e. at an invalid
3145 * item.
3146 */
3147 leaf = p->nodes[0];
3148
3149 if (find_higher) {
3150 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3151 ret = btrfs_next_leaf(root, p);
3152 if (ret <= 0)
3153 return ret;
3154 if (!return_any)
3155 return 1;
3156 /*
3157 * no higher item found, return the next
3158 * lower instead
3159 */
3160 return_any = 0;
3161 find_higher = 0;
3162 btrfs_release_path(p);
3163 goto again;
3164 }
3165 } else {
3166 if (p->slots[0] == 0) {
3167 ret = btrfs_prev_leaf(root, p);
3168 if (ret < 0)
3169 return ret;
3170 if (!ret) {
3171 leaf = p->nodes[0];
3172 if (p->slots[0] == btrfs_header_nritems(leaf))
3173 p->slots[0]--;
3174 return 0;
3175 }
3176 if (!return_any)
3177 return 1;
3178 /*
3179 * no lower item found, return the next
3180 * higher instead
3181 */
3182 return_any = 0;
3183 find_higher = 1;
3184 btrfs_release_path(p);
3185 goto again;
3186 } else {
3187 --p->slots[0];
3188 }
3189 }
3190 return 0;
3191}
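
/*
 * Example (editor's addition): if the tree holds keys K10 and K20 and
 * we search for K15:
 *
 *	find_higher == 1  ->  the path lands on K20
 *	find_higher == 0  ->  the path lands on K10
 *
 * Searching for K25 with find_higher == 1 finds nothing and returns 1,
 * unless return_any == 1, in which case the search flips direction and
 * lands on K20 instead.
 */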
3192
3193/*
3194 * adjust the pointers going up the tree, starting at level
3195 * making sure the right key of each node points to 'key'.
3196 * This is used after shifting pointers to the left, so it stops
3197 * fixing up pointers when a given leaf/node is not in slot 0 of the
3198 * higher levels
3199 *
3200 */
3201static void fixup_low_keys(struct btrfs_path *path,
3202 struct btrfs_disk_key *key, int level)
3203{
3204 int i;
3205 struct extent_buffer *t;
3206 int ret;
3207
3208 for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3209 int tslot = path->slots[i];
3210
3211 if (!path->nodes[i])
3212 break;
3213 t = path->nodes[i];
3214 ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
3215 GFP_ATOMIC);
3216 BUG_ON(ret < 0);
3217 btrfs_set_node_key(t, key, tslot);
3218 btrfs_mark_buffer_dirty(path->nodes[i]);
3219 if (tslot != 0)
3220 break;
3221 }
3222}
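
/*
 * Illustrative sketch (editor's addition): after the lowest key of a
 * block changes from A to B, every ancestor that reaches it through
 * slot 0 must advertise B as well:
 *
 *	level 3: [ .. X B .. ]   <- slot 2, updated, walk stops here
 *	level 2: [ B .. ]        <- slot 0, updated, keep going up
 *	level 1: [ B .. ]        <- starting level, updated
 *
 * Levels above the first non-zero slot keep their separator keys.
 */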
3223
3224/*
3225 * update item key.
3226 *
3227 * This function isn't completely safe. It's the caller's responsibility
3228 * to ensure that the new key won't break the key ordering
3229 */
3230void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3231 struct btrfs_path *path,
3232 const struct btrfs_key *new_key)
3233{
3234 struct btrfs_disk_key disk_key;
3235 struct extent_buffer *eb;
3236 int slot;
3237
3238 eb = path->nodes[0];
3239 slot = path->slots[0];
3240 if (slot > 0) {
3241 btrfs_item_key(eb, &disk_key, slot - 1);
3242 BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3243 }
3244 if (slot < btrfs_header_nritems(eb) - 1) {
3245 btrfs_item_key(eb, &disk_key, slot + 1);
3246 BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3247 }
3248
3249 btrfs_cpu_key_to_disk(&disk_key, new_key);
3250 btrfs_set_item_key(eb, &disk_key, slot);
3251 btrfs_mark_buffer_dirty(eb);
3252 if (slot == 0)
3253 fixup_low_keys(path, &disk_key, 1);
3254}
3255
3256/*
3257 * try to push data from one node into the next node left in the
3258 * tree.
3259 *
3260 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3261 * error, and > 0 if there was no room in the left hand block.
3262 */
3263static int push_node_left(struct btrfs_trans_handle *trans,
3264 struct btrfs_fs_info *fs_info,
3265 struct extent_buffer *dst,
3266 struct extent_buffer *src, int empty)
3267{
3268 int push_items = 0;
3269 int src_nritems;
3270 int dst_nritems;
3271 int ret = 0;
3272
3273 src_nritems = btrfs_header_nritems(src);
3274 dst_nritems = btrfs_header_nritems(dst);
3275 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3276 WARN_ON(btrfs_header_generation(src) != trans->transid);
3277 WARN_ON(btrfs_header_generation(dst) != trans->transid);
3278
3279 if (!empty && src_nritems <= 8)
3280 return 1;
3281
3282 if (push_items <= 0)
3283 return 1;
3284
3285 if (empty) {
3286 push_items = min(src_nritems, push_items);
3287 if (push_items < src_nritems) {
3288 /* leave at least 8 pointers in the node if
3289 * we aren't going to empty it
3290 */
3291 if (src_nritems - push_items < 8) {
3292 if (push_items <= 8)
3293 return 1;
3294 push_items -= 8;
3295 }
3296 }
3297 } else
3298 push_items = min(src_nritems - 8, push_items);
3299
3300 ret = tree_mod_log_eb_copy(fs_info, dst, src, dst_nritems, 0,
3301 push_items);
3302 if (ret) {
3303 btrfs_abort_transaction(trans, ret);
3304 return ret;
3305 }
3306 copy_extent_buffer(dst, src,
3307 btrfs_node_key_ptr_offset(dst_nritems),
3308 btrfs_node_key_ptr_offset(0),
3309 push_items * sizeof(struct btrfs_key_ptr));
3310
3311 if (push_items < src_nritems) {
3312 /*
3313 * Don't call tree_mod_log_insert_move here, key removal was
3314 * already fully logged by tree_mod_log_eb_copy above.
3315 */
3316 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3317 btrfs_node_key_ptr_offset(push_items),
3318 (src_nritems - push_items) *
3319 sizeof(struct btrfs_key_ptr));
3320 }
3321 btrfs_set_header_nritems(src, src_nritems - push_items);
3322 btrfs_set_header_nritems(dst, dst_nritems + push_items);
3323 btrfs_mark_buffer_dirty(src);
3324 btrfs_mark_buffer_dirty(dst);
3325
3326 return ret;
3327}
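
/*
 * Worked example (editor's addition, counts are made up): with empty ==
 * 0, src holding 20 pointers and dst having room for 30, the "leave at
 * least 8 behind" rule caps the push at min(20 - 8, 30) = 12, so src
 * keeps 8 and dst gains 12. With empty == 1 all 20 may move, provided
 * dst has room for every one of them.
 */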
3328
3329/*
3330 * try to push data from one node into the next node right in the
3331 * tree.
3332 *
3333 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3334 * error, and > 0 if there was no room in the right hand block.
3335 *
3336 * this will only push up to 1/2 the contents of the left node over
3337 */
3338static int balance_node_right(struct btrfs_trans_handle *trans,
3339 struct btrfs_fs_info *fs_info,
3340 struct extent_buffer *dst,
3341 struct extent_buffer *src)
3342{
3343 int push_items = 0;
3344 int max_push;
3345 int src_nritems;
3346 int dst_nritems;
3347 int ret = 0;
3348
3349 WARN_ON(btrfs_header_generation(src) != trans->transid);
3350 WARN_ON(btrfs_header_generation(dst) != trans->transid);
3351
3352 src_nritems = btrfs_header_nritems(src);
3353 dst_nritems = btrfs_header_nritems(dst);
3354 push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3355 if (push_items <= 0)
3356 return 1;
3357
3358 if (src_nritems < 4)
3359 return 1;
3360
3361 max_push = src_nritems / 2 + 1;
3362 /* don't try to empty the node */
3363 if (max_push >= src_nritems)
3364 return 1;
3365
3366 if (max_push < push_items)
3367 push_items = max_push;
3368
3369 ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
3370 BUG_ON(ret < 0);
3371 memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3372 btrfs_node_key_ptr_offset(0),
3373 (dst_nritems) *
3374 sizeof(struct btrfs_key_ptr));
3375
3376 ret = tree_mod_log_eb_copy(fs_info, dst, src, 0,
3377 src_nritems - push_items, push_items);
3378 if (ret) {
3379 btrfs_abort_transaction(trans, ret);
3380 return ret;
3381 }
3382 copy_extent_buffer(dst, src,
3383 btrfs_node_key_ptr_offset(0),
3384 btrfs_node_key_ptr_offset(src_nritems - push_items),
3385 push_items * sizeof(struct btrfs_key_ptr));
3386
3387 btrfs_set_header_nritems(src, src_nritems - push_items);
3388 btrfs_set_header_nritems(dst, dst_nritems + push_items);
3389
3390 btrfs_mark_buffer_dirty(src);
3391 btrfs_mark_buffer_dirty(dst);
3392
3393 return ret;
3394}
3395
3396/*
3397 * helper function to insert a new root level in the tree.
3398 * A new node is allocated, and a single item is inserted to
3399 * point to the existing root
3400 *
3401 * returns zero on success or < 0 on failure.
3402 */
3403static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3404 struct btrfs_root *root,
3405 struct btrfs_path *path, int level)
3406{
3407 struct btrfs_fs_info *fs_info = root->fs_info;
3408 u64 lower_gen;
3409 struct extent_buffer *lower;
3410 struct extent_buffer *c;
3411 struct extent_buffer *old;
3412 struct btrfs_disk_key lower_key;
3413 int ret;
3414
3415 BUG_ON(path->nodes[level]);
3416 BUG_ON(path->nodes[level-1] != root->node);
3417
3418 lower = path->nodes[level-1];
3419 if (level == 1)
3420 btrfs_item_key(lower, &lower_key, 0);
3421 else
3422 btrfs_node_key(lower, &lower_key, 0);
3423
3424 c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
3425 root->node->start, 0);
3426 if (IS_ERR(c))
3427 return PTR_ERR(c);
3428
3429 root_add_used(root, fs_info->nodesize);
3430
3431 btrfs_set_header_nritems(c, 1);
3432 btrfs_set_node_key(c, &lower_key, 0);
3433 btrfs_set_node_blockptr(c, 0, lower->start);
3434 lower_gen = btrfs_header_generation(lower);
3435 WARN_ON(lower_gen != trans->transid);
3436
3437 btrfs_set_node_ptr_generation(c, 0, lower_gen);
3438
3439 btrfs_mark_buffer_dirty(c);
3440
3441 old = root->node;
3442 ret = tree_mod_log_insert_root(root->node, c, 0);
3443 BUG_ON(ret < 0);
3444 rcu_assign_pointer(root->node, c);
3445
3446 /* the super has an extra ref to root->node */
3447 free_extent_buffer(old);
3448
3449 add_root_to_dirty_list(root);
3450 extent_buffer_get(c);
3451 path->nodes[level] = c;
3452 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
3453 path->slots[level] = 0;
3454 return 0;
3455}
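
/*
 * Illustrative sketch (editor's addition): growing the tree by one
 * level. The old root keeps its contents and simply gains a parent:
 *
 *	before:  root->node -> [ old ]
 *	after:   root->node -> [ c ] -> [ old ]
 *
 * c carries a single pointer whose key is the lowest key of the old
 * root, so every existing search path keeps working unchanged.
 */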
3456
3457/*
3458 * worker function to insert a single pointer in a node.
3459 * the node should have enough room for the pointer already
3460 *
3461 * slot and level indicate where you want the key to go, and
3462 * bytenr is the block the key points to.
3463 */
3464static void insert_ptr(struct btrfs_trans_handle *trans,
3465 struct btrfs_fs_info *fs_info, struct btrfs_path *path,
3466 struct btrfs_disk_key *key, u64 bytenr,
3467 int slot, int level)
3468{
3469 struct extent_buffer *lower;
3470 int nritems;
3471 int ret;
3472
3473 BUG_ON(!path->nodes[level]);
3474 btrfs_assert_tree_locked(path->nodes[level]);
3475 lower = path->nodes[level];
3476 nritems = btrfs_header_nritems(lower);
3477 BUG_ON(slot > nritems);
3478 BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info));
3479 if (slot != nritems) {
3480 if (level) {
3481 ret = tree_mod_log_insert_move(lower, slot + 1, slot,
3482 nritems - slot);
3483 BUG_ON(ret < 0);
3484 }
3485 memmove_extent_buffer(lower,
3486 btrfs_node_key_ptr_offset(slot + 1),
3487 btrfs_node_key_ptr_offset(slot),
3488 (nritems - slot) * sizeof(struct btrfs_key_ptr));
3489 }
3490 if (level) {
3491 ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
3492 GFP_NOFS);
3493 BUG_ON(ret < 0);
3494 }
3495 btrfs_set_node_key(lower, key, slot);
3496 btrfs_set_node_blockptr(lower, slot, bytenr);
3497 WARN_ON(trans->transid == 0);
3498 btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3499 btrfs_set_header_nritems(lower, nritems + 1);
3500 btrfs_mark_buffer_dirty(lower);
3501}
3502
3503/*
3504 * split the node at the specified level in path in two.
3505 * The path is corrected to point to the appropriate node after the split
3506 *
3507 * Before splitting this tries to make some room in the node by pushing
3508 * left and right, if either one works, it returns right away.
3509 *
3510 * returns 0 on success and < 0 on failure
3511 */
3512static noinline int split_node(struct btrfs_trans_handle *trans,
3513 struct btrfs_root *root,
3514 struct btrfs_path *path, int level)
3515{
3516 struct btrfs_fs_info *fs_info = root->fs_info;
3517 struct extent_buffer *c;
3518 struct extent_buffer *split;
3519 struct btrfs_disk_key disk_key;
3520 int mid;
3521 int ret;
3522 u32 c_nritems;
3523
3524 c = path->nodes[level];
3525 WARN_ON(btrfs_header_generation(c) != trans->transid);
3526 if (c == root->node) {
3527 /*
3528 * trying to split the root, let's make a new one
3529 *
3530 * tree mod log: we don't log the removal of the old root in
3531 * insert_new_root, because that root buffer will be kept as a
3532 * normal node. We are going to log removal of half of the
3533 * elements below with tree_mod_log_eb_copy. We're holding a
3534 * tree lock on the buffer, which is why we cannot race with
3535 * other tree_mod_log users.
3536 */
3537 ret = insert_new_root(trans, root, path, level + 1);
3538 if (ret)
3539 return ret;
3540 } else {
3541 ret = push_nodes_for_insert(trans, root, path, level);
3542 c = path->nodes[level];
3543 if (!ret && btrfs_header_nritems(c) <
3544 BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
3545 return 0;
3546 if (ret < 0)
3547 return ret;
3548 }
3549
3550 c_nritems = btrfs_header_nritems(c);
3551 mid = (c_nritems + 1) / 2;
3552 btrfs_node_key(c, &disk_key, mid);
3553
3554 split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
3555 c->start, 0);
3556 if (IS_ERR(split))
3557 return PTR_ERR(split);
3558
3559 root_add_used(root, fs_info->nodesize);
3560 ASSERT(btrfs_header_level(c) == level);
3561
3562 ret = tree_mod_log_eb_copy(fs_info, split, c, 0, mid, c_nritems - mid);
3563 if (ret) {
3564 btrfs_abort_transaction(trans, ret);
3565 return ret;
3566 }
3567 copy_extent_buffer(split, c,
3568 btrfs_node_key_ptr_offset(0),
3569 btrfs_node_key_ptr_offset(mid),
3570 (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3571 btrfs_set_header_nritems(split, c_nritems - mid);
3572 btrfs_set_header_nritems(c, mid);
3573 ret = 0;
3574
3575 btrfs_mark_buffer_dirty(c);
3576 btrfs_mark_buffer_dirty(split);
3577
3578 insert_ptr(trans, fs_info, path, &disk_key, split->start,
3579 path->slots[level + 1] + 1, level + 1);
3580
3581 if (path->slots[level] >= mid) {
3582 path->slots[level] -= mid;
3583 btrfs_tree_unlock(c);
3584 free_extent_buffer(c);
3585 path->nodes[level] = split;
3586 path->slots[level + 1] += 1;
3587 } else {
3588 btrfs_tree_unlock(split);
3589 free_extent_buffer(split);
3590 }
3591 return ret;
3592}
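
/*
 * Worked example (editor's addition): with c_nritems == 121, mid ==
 * (121 + 1) / 2 == 61, so c keeps slots [0, 60] and 'split' receives
 * the 60 pointers from slots [61, 120]. The parent gains a pointer to
 * 'split' right after the one to c, and if path->slots[level] was >= 61
 * the path is moved over to 'split' with its slot renumbered.
 */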
3593
3594/*
3595 * how many bytes are required to store the items in a leaf. start
3596 * and nr indicate which items in the leaf to check. This totals up the
3597 * space used both by the item structs and the item data
3598 */
3599static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3600{
3601 struct btrfs_item *start_item;
3602 struct btrfs_item *end_item;
3603 struct btrfs_map_token token;
3604 int data_len;
3605 int nritems = btrfs_header_nritems(l);
3606 int end = min(nritems, start + nr) - 1;
3607
3608 if (!nr)
3609 return 0;
3610 btrfs_init_map_token(&token);
3611 start_item = btrfs_item_nr(start);
3612 end_item = btrfs_item_nr(end);
3613 data_len = btrfs_token_item_offset(l, start_item, &token) +
3614 btrfs_token_item_size(l, start_item, &token);
3615 data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3616 data_len += sizeof(struct btrfs_item) * nr;
3617 WARN_ON(data_len < 0);
3618 return data_len;
3619}
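
/*
 * Worked example (editor's addition, offsets are made up): item data
 * grows downward from the end of the leaf, so items start..end span
 * from offset(end) up to offset(start) + size(start). With
 * offset(start) == 3800, size(start) == 100 and offset(end) == 3500,
 * the data portion is 3800 + 100 - 3500 == 400 bytes, plus nr item
 * headers of sizeof(struct btrfs_item) each.
 */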
3620
3621/*
3622 * The space between the end of the leaf items and
3623 * the start of the leaf data. IOW, how much room
3624 * the leaf has left for both items and data
3625 */
3626noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info,
3627 struct extent_buffer *leaf)
3628{
3629 int nritems = btrfs_header_nritems(leaf);
3630 int ret;
3631
3632 ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3633 if (ret < 0) {
3634 btrfs_crit(fs_info,
3635 "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3636 ret,
3637 (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3638 leaf_space_used(leaf, 0, nritems), nritems);
3639 }
3640 return ret;
3641}
3642
3643/*
3644 * min slot controls the lowest index we're willing to push to the
3645 * right. We'll push up to and including min_slot, but no lower
3646 */
3647static noinline int __push_leaf_right(struct btrfs_fs_info *fs_info,
3648 struct btrfs_path *path,
3649 int data_size, int empty,
3650 struct extent_buffer *right,
3651 int free_space, u32 left_nritems,
3652 u32 min_slot)
3653{
3654 struct extent_buffer *left = path->nodes[0];
3655 struct extent_buffer *upper = path->nodes[1];
3656 struct btrfs_map_token token;
3657 struct btrfs_disk_key disk_key;
3658 int slot;
3659 u32 i;
3660 int push_space = 0;
3661 int push_items = 0;
3662 struct btrfs_item *item;
3663 u32 nr;
3664 u32 right_nritems;
3665 u32 data_end;
3666 u32 this_item_size;
3667
3668 btrfs_init_map_token(&token);
3669
3670 if (empty)
3671 nr = 0;
3672 else
3673 nr = max_t(u32, 1, min_slot);
3674
3675 if (path->slots[0] >= left_nritems)
3676 push_space += data_size;
3677
3678 slot = path->slots[1];
3679 i = left_nritems - 1;
3680 while (i >= nr) {
3681 item = btrfs_item_nr(i);
3682
3683 if (!empty && push_items > 0) {
3684 if (path->slots[0] > i)
3685 break;
3686 if (path->slots[0] == i) {
3687 int space = btrfs_leaf_free_space(fs_info, left);
3688 if (space + push_space * 2 > free_space)
3689 break;
3690 }
3691 }
3692
3693 if (path->slots[0] == i)
3694 push_space += data_size;
3695
3696 this_item_size = btrfs_item_size(left, item);
3697 if (this_item_size + sizeof(*item) + push_space > free_space)
3698 break;
3699
3700 push_items++;
3701 push_space += this_item_size + sizeof(*item);
3702 if (i == 0)
3703 break;
3704 i--;
3705 }
3706
3707 if (push_items == 0)
3708 goto out_unlock;
3709
3710 WARN_ON(!empty && push_items == left_nritems);
3711
3712 /* push left to right */
3713 right_nritems = btrfs_header_nritems(right);
3714
3715 push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3716 push_space -= leaf_data_end(fs_info, left);
3717
3718 /* make room in the right data area */
3719 data_end = leaf_data_end(fs_info, right);
3720 memmove_extent_buffer(right,
3721 BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
3722 BTRFS_LEAF_DATA_OFFSET + data_end,
3723 BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
3724
3725 /* copy from the left data area */
3726 copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
3727 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3728 BTRFS_LEAF_DATA_OFFSET + leaf_data_end(fs_info, left),
3729 push_space);
3730
3731 memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3732 btrfs_item_nr_offset(0),
3733 right_nritems * sizeof(struct btrfs_item));
3734
3735 /* copy the items from left to right */
3736 copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3737 btrfs_item_nr_offset(left_nritems - push_items),
3738 push_items * sizeof(struct btrfs_item));
3739
3740 /* update the item pointers */
3741 right_nritems += push_items;
3742 btrfs_set_header_nritems(right, right_nritems);
3743 push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3744 for (i = 0; i < right_nritems; i++) {
3745 item = btrfs_item_nr(i);
3746 push_space -= btrfs_token_item_size(right, item, &token);
3747 btrfs_set_token_item_offset(right, item, push_space, &token);
3748 }
3749
3750 left_nritems -= push_items;
3751 btrfs_set_header_nritems(left, left_nritems);
3752
3753 if (left_nritems)
3754 btrfs_mark_buffer_dirty(left);
3755 else
3756 clean_tree_block(fs_info, left);
3757
3758 btrfs_mark_buffer_dirty(right);
3759
3760 btrfs_item_key(right, &disk_key, 0);
3761 btrfs_set_node_key(upper, &disk_key, slot + 1);
3762 btrfs_mark_buffer_dirty(upper);
3763
3764 /* then fixup the leaf pointer in the path */
3765 if (path->slots[0] >= left_nritems) {
3766 path->slots[0] -= left_nritems;
3767 if (btrfs_header_nritems(path->nodes[0]) == 0)
3768 clean_tree_block(fs_info, path->nodes[0]);
3769 btrfs_tree_unlock(path->nodes[0]);
3770 free_extent_buffer(path->nodes[0]);
3771 path->nodes[0] = right;
3772 path->slots[1] += 1;
3773 } else {
3774 btrfs_tree_unlock(right);
3775 free_extent_buffer(right);
3776 }
3777 return 0;
3778
3779out_unlock:
3780 btrfs_tree_unlock(right);
3781 free_extent_buffer(right);
3782 return 1;
3783}
3784
3785/*
3786 * push some data in the path leaf to the right, trying to free up at
3787 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3788 *
3789 * returns 1 if the push failed because the other node didn't have enough
3790 * room, 0 if everything worked out and < 0 if there were major errors.
3791 *
3792 * this will push starting from min_slot to the end of the leaf. It won't
3793 * push any slot lower than min_slot
3794 */
3795static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3796 *root, struct btrfs_path *path,
3797 int min_data_size, int data_size,
3798 int empty, u32 min_slot)
3799{
3800 struct btrfs_fs_info *fs_info = root->fs_info;
3801 struct extent_buffer *left = path->nodes[0];
3802 struct extent_buffer *right;
3803 struct extent_buffer *upper;
3804 int slot;
3805 int free_space;
3806 u32 left_nritems;
3807 int ret;
3808
3809 if (!path->nodes[1])
3810 return 1;
3811
3812 slot = path->slots[1];
3813 upper = path->nodes[1];
3814 if (slot >= btrfs_header_nritems(upper) - 1)
3815 return 1;
3816
3817 btrfs_assert_tree_locked(path->nodes[1]);
3818
3819 right = read_node_slot(fs_info, upper, slot + 1);
3820 /*
3821 * slot + 1 is not valid or we fail to read the right node,
3822 * no big deal, just return.
3823 */
3824 if (IS_ERR(right))
3825 return 1;
3826
3827 btrfs_tree_lock(right);
3828 btrfs_set_lock_blocking(right);
3829
3830 free_space = btrfs_leaf_free_space(fs_info, right);
3831 if (free_space < data_size)
3832 goto out_unlock;
3833
3834 /* cow and double check */
3835 ret = btrfs_cow_block(trans, root, right, upper,
3836 slot + 1, &right);
3837 if (ret)
3838 goto out_unlock;
3839
3840 free_space = btrfs_leaf_free_space(fs_info, right);
3841 if (free_space < data_size)
3842 goto out_unlock;
3843
3844 left_nritems = btrfs_header_nritems(left);
3845 if (left_nritems == 0)
3846 goto out_unlock;
3847
3848 if (path->slots[0] == left_nritems && !empty) {
3849 /* Key greater than all keys in the leaf, right neighbor has
3850 * enough room for it and we're not emptying our leaf to delete
3851 * it, therefore use right neighbor to insert the new item and
3852 * no need to touch/dirty our left leaf. */
3853 btrfs_tree_unlock(left);
3854 free_extent_buffer(left);
3855 path->nodes[0] = right;
3856 path->slots[0] = 0;
3857 path->slots[1]++;
3858 return 0;
3859 }
3860
3861 return __push_leaf_right(fs_info, path, min_data_size, empty,
3862 right, free_space, left_nritems, min_slot);
3863out_unlock:
3864 btrfs_tree_unlock(right);
3865 free_extent_buffer(right);
3866 return 1;
3867}
3868
3869/*
3870 * push some data in the path leaf to the left, trying to free up at
3871 * least data_size bytes. returns zero if the push worked, nonzero otherwise
3872 *
3873 * max_slot can put a limit on how far into the leaf we'll push items. The
3874 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
3875 * items
3876 */
3877static noinline int __push_leaf_left(struct btrfs_fs_info *fs_info,
3878 struct btrfs_path *path, int data_size,
3879 int empty, struct extent_buffer *left,
3880 int free_space, u32 right_nritems,
3881 u32 max_slot)
3882{
3883 struct btrfs_disk_key disk_key;
3884 struct extent_buffer *right = path->nodes[0];
3885 int i;
3886 int push_space = 0;
3887 int push_items = 0;
3888 struct btrfs_item *item;
3889 u32 old_left_nritems;
3890 u32 nr;
3891 int ret = 0;
3892 u32 this_item_size;
3893 u32 old_left_item_size;
3894 struct btrfs_map_token token;
3895
3896 btrfs_init_map_token(&token);
3897
3898 if (empty)
3899 nr = min(right_nritems, max_slot);
3900 else
3901 nr = min(right_nritems - 1, max_slot);
3902
3903 for (i = 0; i < nr; i++) {
3904 item = btrfs_item_nr(i);
3905
3906 if (!empty && push_items > 0) {
3907 if (path->slots[0] < i)
3908 break;
3909 if (path->slots[0] == i) {
3910 int space = btrfs_leaf_free_space(fs_info, right);
3911 if (space + push_space * 2 > free_space)
3912 break;
3913 }
3914 }
3915
3916 if (path->slots[0] == i)
3917 push_space += data_size;
3918
3919 this_item_size = btrfs_item_size(right, item);
3920 if (this_item_size + sizeof(*item) + push_space > free_space)
3921 break;
3922
3923 push_items++;
3924 push_space += this_item_size + sizeof(*item);
3925 }
3926
3927 if (push_items == 0) {
3928 ret = 1;
3929 goto out;
3930 }
3931 WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3932
3933 /* push data from right to left */
3934 copy_extent_buffer(left, right,
3935 btrfs_item_nr_offset(btrfs_header_nritems(left)),
3936 btrfs_item_nr_offset(0),
3937 push_items * sizeof(struct btrfs_item));
3938
3939 push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3940 btrfs_item_offset_nr(right, push_items - 1);
3941
3942 copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
3943 leaf_data_end(fs_info, left) - push_space,
3944 BTRFS_LEAF_DATA_OFFSET +
3945 btrfs_item_offset_nr(right, push_items - 1),
3946 push_space);
3947 old_left_nritems = btrfs_header_nritems(left);
3948 BUG_ON(old_left_nritems <= 0);
3949
3950 old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3951 for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3952 u32 ioff;
3953
3954 item = btrfs_item_nr(i);
3955
3956 ioff = btrfs_token_item_offset(left, item, &token);
3957 btrfs_set_token_item_offset(left, item,
3958 ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size),
3959 &token);
3960 }
3961 btrfs_set_header_nritems(left, old_left_nritems + push_items);
3962
3963 /* fixup right node */
3964 if (push_items > right_nritems)
3965 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3966 right_nritems);
3967
3968 if (push_items < right_nritems) {
3969 push_space = btrfs_item_offset_nr(right, push_items - 1) -
3970 leaf_data_end(fs_info, right);
3971 memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
3972 BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3973 BTRFS_LEAF_DATA_OFFSET +
3974 leaf_data_end(fs_info, right), push_space);
3975
3976 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3977 btrfs_item_nr_offset(push_items),
3978 (btrfs_header_nritems(right) - push_items) *
3979 sizeof(struct btrfs_item));
3980 }
3981 right_nritems -= push_items;
3982 btrfs_set_header_nritems(right, right_nritems);
3983 push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3984 for (i = 0; i < right_nritems; i++) {
3985 item = btrfs_item_nr(i);
3986
3987 push_space = push_space - btrfs_token_item_size(right,
3988 item, &token);
3989 btrfs_set_token_item_offset(right, item, push_space, &token);
3990 }
3991
3992 btrfs_mark_buffer_dirty(left);
3993 if (right_nritems)
3994 btrfs_mark_buffer_dirty(right);
3995 else
3996 clean_tree_block(fs_info, right);
3997
3998 btrfs_item_key(right, &disk_key, 0);
3999 fixup_low_keys(path, &disk_key, 1);
4000
4001 /* then fixup the leaf pointer in the path */
4002 if (path->slots[0] < push_items) {
4003 path->slots[0] += old_left_nritems;
4004 btrfs_tree_unlock(path->nodes[0]);
4005 free_extent_buffer(path->nodes[0]);
4006 path->nodes[0] = left;
4007 path->slots[1] -= 1;
4008 } else {
4009 btrfs_tree_unlock(left);
4010 free_extent_buffer(left);
4011 path->slots[0] -= push_items;
4012 }
4013 BUG_ON(path->slots[0] < 0);
4014 return ret;
4015out:
4016 btrfs_tree_unlock(left);
4017 free_extent_buffer(left);
4018 return ret;
4019}
4020
4021/*
4022 * push some data in the path leaf to the left, trying to free up at
4023 * least data_size bytes. returns zero if the push worked, nonzero otherwise
4024 *
4025 * max_slot can put a limit on how far into the leaf we'll push items. The
4026 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
4027 * items
4028 */
4029static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
4030 *root, struct btrfs_path *path, int min_data_size,
4031 int data_size, int empty, u32 max_slot)
4032{
4033 struct btrfs_fs_info *fs_info = root->fs_info;
4034 struct extent_buffer *right = path->nodes[0];
4035 struct extent_buffer *left;
4036 int slot;
4037 int free_space;
4038 u32 right_nritems;
4039 int ret = 0;
4040
4041 slot = path->slots[1];
4042 if (slot == 0)
4043 return 1;
4044 if (!path->nodes[1])
4045 return 1;
4046
4047 right_nritems = btrfs_header_nritems(right);
4048 if (right_nritems == 0)
4049 return 1;
4050
4051 btrfs_assert_tree_locked(path->nodes[1]);
4052
4053 left = read_node_slot(fs_info, path->nodes[1], slot - 1);
4054 /*
4055	 * If slot - 1 is not valid or we failed to read the left node,
4056	 * it's no big deal, just return.
4057 */
4058 if (IS_ERR(left))
4059 return 1;
4060
4061 btrfs_tree_lock(left);
4062 btrfs_set_lock_blocking(left);
4063
4064 free_space = btrfs_leaf_free_space(fs_info, left);
4065 if (free_space < data_size) {
4066 ret = 1;
4067 goto out;
4068 }
4069
4070 /* cow and double check */
4071 ret = btrfs_cow_block(trans, root, left,
4072 path->nodes[1], slot - 1, &left);
4073 if (ret) {
4074 /* we hit -ENOSPC, but it isn't fatal here */
4075 if (ret == -ENOSPC)
4076 ret = 1;
4077 goto out;
4078 }
4079
4080 free_space = btrfs_leaf_free_space(fs_info, left);
4081 if (free_space < data_size) {
4082 ret = 1;
4083 goto out;
4084 }
4085
4086 return __push_leaf_left(fs_info, path, min_data_size,
4087 empty, left, free_space, right_nritems,
4088 max_slot);
4089out:
4090 btrfs_tree_unlock(left);
4091 free_extent_buffer(left);
4092 return ret;
4093}
4094
4095/*
4096 * split the path's leaf in two, making sure there is at least data_size
4097 * available for the resulting leaf level of the path.
4098 */
4099static noinline void copy_for_split(struct btrfs_trans_handle *trans,
4100 struct btrfs_fs_info *fs_info,
4101 struct btrfs_path *path,
4102 struct extent_buffer *l,
4103 struct extent_buffer *right,
4104 int slot, int mid, int nritems)
4105{
4106 int data_copy_size;
4107 int rt_data_off;
4108 int i;
4109 struct btrfs_disk_key disk_key;
4110 struct btrfs_map_token token;
4111
4112 btrfs_init_map_token(&token);
4113
4114 nritems = nritems - mid;
4115 btrfs_set_header_nritems(right, nritems);
4116 data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(fs_info, l);
4117
4118 copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
4119 btrfs_item_nr_offset(mid),
4120 nritems * sizeof(struct btrfs_item));
4121
4122 copy_extent_buffer(right, l,
4123 BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
4124 data_copy_size, BTRFS_LEAF_DATA_OFFSET +
4125 leaf_data_end(fs_info, l), data_copy_size);
4126
4127 rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
4128
4129 for (i = 0; i < nritems; i++) {
4130 struct btrfs_item *item = btrfs_item_nr(i);
4131 u32 ioff;
4132
4133 ioff = btrfs_token_item_offset(right, item, &token);
4134 btrfs_set_token_item_offset(right, item,
4135 ioff + rt_data_off, &token);
4136 }
4137
4138 btrfs_set_header_nritems(l, mid);
4139 btrfs_item_key(right, &disk_key, 0);
4140 insert_ptr(trans, fs_info, path, &disk_key, right->start,
4141 path->slots[1] + 1, 1);
4142
4143 btrfs_mark_buffer_dirty(right);
4144 btrfs_mark_buffer_dirty(l);
4145 BUG_ON(path->slots[0] != slot);
4146
4147 if (mid <= slot) {
4148 btrfs_tree_unlock(path->nodes[0]);
4149 free_extent_buffer(path->nodes[0]);
4150 path->nodes[0] = right;
4151 path->slots[0] -= mid;
4152 path->slots[1] += 1;
4153 } else {
4154 btrfs_tree_unlock(right);
4155 free_extent_buffer(right);
4156 }
4157
4158 BUG_ON(path->slots[0] < 0);
4159}
4160
4161/*
4162 * double splits happen when we need to insert a big item in the middle
4163 * of a leaf. A double split can leave us with 3 mostly empty leaves:
4164 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4165 * A B C
4166 *
4167 * We avoid this by trying to push the items on either side of our target
4168 * into the adjacent leaves. If all goes well we can avoid the double split
4169 * completely.
4170 */
4171static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4172 struct btrfs_root *root,
4173 struct btrfs_path *path,
4174 int data_size)
4175{
4176 struct btrfs_fs_info *fs_info = root->fs_info;
4177 int ret;
4178 int progress = 0;
4179 int slot;
4180 u32 nritems;
4181 int space_needed = data_size;
4182
4183 slot = path->slots[0];
4184 if (slot < btrfs_header_nritems(path->nodes[0]))
4185 space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
4186
4187 /*
4188 * try to push all the items after our slot into the
4189 * right leaf
4190 */
4191 ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4192 if (ret < 0)
4193 return ret;
4194
4195 if (ret == 0)
4196 progress++;
4197
4198 nritems = btrfs_header_nritems(path->nodes[0]);
4199 /*
4200 * our goal is to get our slot at the start or end of a leaf. If
4201 * we've done so we're done
4202 */
4203 if (path->slots[0] == 0 || path->slots[0] == nritems)
4204 return 0;
4205
4206 if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
4207 return 0;
4208
4209	/* try to push all the items before our slot into the left leaf */
4210 slot = path->slots[0];
4211 space_needed = data_size;
4212 if (slot > 0)
4213 space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]);
4214 ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4215 if (ret < 0)
4216 return ret;
4217
4218 if (ret == 0)
4219 progress++;
4220
4221 if (progress)
4222 return 0;
4223 return 1;
4224}
4225
4226/*
4227 * split the path's leaf in two, making sure there is at least data_size
4228 * available for the resulting leaf level of the path.
4229 *
4230 * returns 0 if all went well and < 0 on failure.
4231 */
4232static noinline int split_leaf(struct btrfs_trans_handle *trans,
4233 struct btrfs_root *root,
4234 const struct btrfs_key *ins_key,
4235 struct btrfs_path *path, int data_size,
4236 int extend)
4237{
4238 struct btrfs_disk_key disk_key;
4239 struct extent_buffer *l;
4240 u32 nritems;
4241 int mid;
4242 int slot;
4243 struct extent_buffer *right;
4244 struct btrfs_fs_info *fs_info = root->fs_info;
4245 int ret = 0;
4246 int wret;
4247 int split;
4248 int num_doubles = 0;
4249 int tried_avoid_double = 0;
4250
4251 l = path->nodes[0];
4252 slot = path->slots[0];
4253 if (extend && data_size + btrfs_item_size_nr(l, slot) +
4254 sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
4255 return -EOVERFLOW;
4256
4257 /* first try to make some room by pushing left and right */
4258 if (data_size && path->nodes[1]) {
4259 int space_needed = data_size;
4260
4261 if (slot < btrfs_header_nritems(l))
4262 space_needed -= btrfs_leaf_free_space(fs_info, l);
4263
4264 wret = push_leaf_right(trans, root, path, space_needed,
4265 space_needed, 0, 0);
4266 if (wret < 0)
4267 return wret;
4268 if (wret) {
4269 space_needed = data_size;
4270 if (slot > 0)
4271 space_needed -= btrfs_leaf_free_space(fs_info,
4272 l);
4273 wret = push_leaf_left(trans, root, path, space_needed,
4274 space_needed, 0, (u32)-1);
4275 if (wret < 0)
4276 return wret;
4277 }
4278 l = path->nodes[0];
4279
4280 /* did the pushes work? */
4281 if (btrfs_leaf_free_space(fs_info, l) >= data_size)
4282 return 0;
4283 }
4284
4285 if (!path->nodes[1]) {
4286 ret = insert_new_root(trans, root, path, 1);
4287 if (ret)
4288 return ret;
4289 }
4290again:
4291 split = 1;
4292 l = path->nodes[0];
4293 slot = path->slots[0];
4294 nritems = btrfs_header_nritems(l);
4295 mid = (nritems + 1) / 2;
4296
4297 if (mid <= slot) {
4298 if (nritems == 1 ||
4299 leaf_space_used(l, mid, nritems - mid) + data_size >
4300 BTRFS_LEAF_DATA_SIZE(fs_info)) {
4301 if (slot >= nritems) {
4302 split = 0;
4303 } else {
4304 mid = slot;
4305 if (mid != nritems &&
4306 leaf_space_used(l, mid, nritems - mid) +
4307 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4308 if (data_size && !tried_avoid_double)
4309 goto push_for_double;
4310 split = 2;
4311 }
4312 }
4313 }
4314 } else {
4315 if (leaf_space_used(l, 0, mid) + data_size >
4316 BTRFS_LEAF_DATA_SIZE(fs_info)) {
4317 if (!extend && data_size && slot == 0) {
4318 split = 0;
4319 } else if ((extend || !data_size) && slot == 0) {
4320 mid = 1;
4321 } else {
4322 mid = slot;
4323 if (mid != nritems &&
4324 leaf_space_used(l, mid, nritems - mid) +
4325 data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4326 if (data_size && !tried_avoid_double)
4327 goto push_for_double;
4328 split = 2;
4329 }
4330 }
4331 }
4332 }
4333
4334 if (split == 0)
4335 btrfs_cpu_key_to_disk(&disk_key, ins_key);
4336 else
4337 btrfs_item_key(l, &disk_key, mid);
4338
4339 right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
4340 l->start, 0);
4341 if (IS_ERR(right))
4342 return PTR_ERR(right);
4343
4344 root_add_used(root, fs_info->nodesize);
4345
4346 if (split == 0) {
4347 if (mid <= slot) {
4348 btrfs_set_header_nritems(right, 0);
4349 insert_ptr(trans, fs_info, path, &disk_key,
4350 right->start, path->slots[1] + 1, 1);
4351 btrfs_tree_unlock(path->nodes[0]);
4352 free_extent_buffer(path->nodes[0]);
4353 path->nodes[0] = right;
4354 path->slots[0] = 0;
4355 path->slots[1] += 1;
4356 } else {
4357 btrfs_set_header_nritems(right, 0);
4358 insert_ptr(trans, fs_info, path, &disk_key,
4359 right->start, path->slots[1], 1);
4360 btrfs_tree_unlock(path->nodes[0]);
4361 free_extent_buffer(path->nodes[0]);
4362 path->nodes[0] = right;
4363 path->slots[0] = 0;
4364 if (path->slots[1] == 0)
4365 fixup_low_keys(path, &disk_key, 1);
4366 }
4367 /*
4368		 * We create a new leaf 'right' with room for the required ins_len
4369		 * and we'll do btrfs_mark_buffer_dirty() on this leaf after copying
4370		 * the new item's content into 'right'.
4371 */
4372 return ret;
4373 }
4374
4375 copy_for_split(trans, fs_info, path, l, right, slot, mid, nritems);
4376
4377 if (split == 2) {
4378 BUG_ON(num_doubles != 0);
4379 num_doubles++;
4380 goto again;
4381 }
4382
4383 return 0;
4384
4385push_for_double:
4386 push_for_double_split(trans, root, path, data_size);
4387 tried_avoid_double = 1;
4388 if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size)
4389 return 0;
4390 goto again;
4391}
4392
4393static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4394 struct btrfs_root *root,
4395 struct btrfs_path *path, int ins_len)
4396{
4397 struct btrfs_fs_info *fs_info = root->fs_info;
4398 struct btrfs_key key;
4399 struct extent_buffer *leaf;
4400 struct btrfs_file_extent_item *fi;
4401 u64 extent_len = 0;
4402 u32 item_size;
4403 int ret;
4404
4405 leaf = path->nodes[0];
4406 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4407
4408 BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4409 key.type != BTRFS_EXTENT_CSUM_KEY);
4410
4411 if (btrfs_leaf_free_space(fs_info, leaf) >= ins_len)
4412 return 0;
4413
4414 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4415 if (key.type == BTRFS_EXTENT_DATA_KEY) {
4416 fi = btrfs_item_ptr(leaf, path->slots[0],
4417 struct btrfs_file_extent_item);
4418 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4419 }
4420 btrfs_release_path(path);
4421
4422 path->keep_locks = 1;
4423 path->search_for_split = 1;
4424 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4425 path->search_for_split = 0;
4426 if (ret > 0)
4427 ret = -EAGAIN;
4428 if (ret < 0)
4429 goto err;
4430
4431 ret = -EAGAIN;
4432 leaf = path->nodes[0];
4433 /* if our item isn't there, return now */
4434 if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4435 goto err;
4436
4437 /* the leaf has changed, it now has room. return now */
4438 if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= ins_len)
4439 goto err;
4440
4441 if (key.type == BTRFS_EXTENT_DATA_KEY) {
4442 fi = btrfs_item_ptr(leaf, path->slots[0],
4443 struct btrfs_file_extent_item);
4444 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4445 goto err;
4446 }
4447
4448 btrfs_set_path_blocking(path);
4449 ret = split_leaf(trans, root, &key, path, ins_len, 1);
4450 if (ret)
4451 goto err;
4452
4453 path->keep_locks = 0;
4454 btrfs_unlock_up_safe(path, 1);
4455 return 0;
4456err:
4457 path->keep_locks = 0;
4458 return ret;
4459}
4460
4461static noinline int split_item(struct btrfs_fs_info *fs_info,
4462 struct btrfs_path *path,
4463 const struct btrfs_key *new_key,
4464 unsigned long split_offset)
4465{
4466 struct extent_buffer *leaf;
4467 struct btrfs_item *item;
4468 struct btrfs_item *new_item;
4469 int slot;
4470 char *buf;
4471 u32 nritems;
4472 u32 item_size;
4473 u32 orig_offset;
4474 struct btrfs_disk_key disk_key;
4475
4476 leaf = path->nodes[0];
4477 BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < sizeof(struct btrfs_item));
4478
4479 btrfs_set_path_blocking(path);
4480
4481 item = btrfs_item_nr(path->slots[0]);
4482 orig_offset = btrfs_item_offset(leaf, item);
4483 item_size = btrfs_item_size(leaf, item);
4484
4485 buf = kmalloc(item_size, GFP_NOFS);
4486 if (!buf)
4487 return -ENOMEM;
4488
4489 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4490 path->slots[0]), item_size);
4491
4492 slot = path->slots[0] + 1;
4493 nritems = btrfs_header_nritems(leaf);
4494 if (slot != nritems) {
4495 /* shift the items */
4496 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4497 btrfs_item_nr_offset(slot),
4498 (nritems - slot) * sizeof(struct btrfs_item));
4499 }
4500
4501 btrfs_cpu_key_to_disk(&disk_key, new_key);
4502 btrfs_set_item_key(leaf, &disk_key, slot);
4503
4504 new_item = btrfs_item_nr(slot);
4505
4506 btrfs_set_item_offset(leaf, new_item, orig_offset);
4507 btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4508
4509 btrfs_set_item_offset(leaf, item,
4510 orig_offset + item_size - split_offset);
4511 btrfs_set_item_size(leaf, item, split_offset);
4512
4513 btrfs_set_header_nritems(leaf, nritems + 1);
4514
4515 /* write the data for the start of the original item */
4516 write_extent_buffer(leaf, buf,
4517 btrfs_item_ptr_offset(leaf, path->slots[0]),
4518 split_offset);
4519
4520 /* write the data for the new item */
4521 write_extent_buffer(leaf, buf + split_offset,
4522 btrfs_item_ptr_offset(leaf, slot),
4523 item_size - split_offset);
4524 btrfs_mark_buffer_dirty(leaf);
4525
4526 BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < 0);
4527 kfree(buf);
4528 return 0;
4529}
4530
4531/*
4532 * This function splits a single item into two items,
4533 * giving 'new_key' to the new item and splitting the
4534 * old one at split_offset (from the start of the item).
4535 *
4536 * The path may be released by this operation. After
4537 * the split, the path is pointing to the old item. The
4538 * new item is going to be in the same node as the old one.
4539 *
4540 * Note, the item being split must be small enough to live alone on
4541 * a tree block with room for one extra struct btrfs_item
4542 *
4543 * This allows us to split the item in place, keeping a lock on the
4544 * leaf the entire time.
4545 */
4546int btrfs_split_item(struct btrfs_trans_handle *trans,
4547 struct btrfs_root *root,
4548 struct btrfs_path *path,
4549 const struct btrfs_key *new_key,
4550 unsigned long split_offset)
4551{
4552 int ret;
4553 ret = setup_leaf_for_split(trans, root, path,
4554 sizeof(struct btrfs_item));
4555 if (ret)
4556 return ret;
4557
4558 ret = split_item(root->fs_info, path, new_key, split_offset);
4559 return ret;
4560}
4561
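/*
 * Editorial sketch (not part of the original file): one way a caller
 * might drive btrfs_split_item().  The helper name and split point are
 * hypothetical; setup_leaf_for_split() restricts this to EXTENT_DATA
 * and EXTENT_CSUM items, and the path is assumed to already point at
 * such an item.  new_key must sort between the old key and its successor.
 */
static int __maybe_unused example_split_at(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   u64 new_offset,
					   unsigned long split_bytes)
{
	struct btrfs_key new_key;

	/* keep objectid/type, give the tail of the item a new offset */
	btrfs_item_key_to_cpu(path->nodes[0], &new_key, path->slots[0]);
	new_key.offset = new_offset;

	/* the first split_bytes stay in the old item, the rest moves over */
	return btrfs_split_item(trans, root, path, &new_key, split_bytes);
}
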
4562/*
4563 * This function duplicates an item, giving 'new_key' to the new item.
4564 * It guarantees both items live in the same tree leaf and the new item
4565 * is contiguous with the original item.
4566 *
4567 * This allows us to split file extent in place, keeping a lock on the
4568 * leaf the entire time.
4569 */
4570int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4571 struct btrfs_root *root,
4572 struct btrfs_path *path,
4573 const struct btrfs_key *new_key)
4574{
4575 struct extent_buffer *leaf;
4576 int ret;
4577 u32 item_size;
4578
4579 leaf = path->nodes[0];
4580 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4581 ret = setup_leaf_for_split(trans, root, path,
4582 item_size + sizeof(struct btrfs_item));
4583 if (ret)
4584 return ret;
4585
4586 path->slots[0]++;
4587 setup_items_for_insert(root, path, new_key, &item_size,
4588 item_size, item_size +
4589 sizeof(struct btrfs_item), 1);
4590 leaf = path->nodes[0];
4591 memcpy_extent_buffer(leaf,
4592 btrfs_item_ptr_offset(leaf, path->slots[0]),
4593 btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4594 item_size);
4595 return 0;
4596}
4597
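/*
 * Editorial sketch (not part of the original file): duplicating a file
 * extent item, roughly the way __btrfs_drop_extents() uses this when it
 * has to split an extent in the middle.  The helper name and key layout
 * are illustrative assumptions.
 */
static int __maybe_unused example_dup_file_extent(struct btrfs_trans_handle *trans,
						  struct btrfs_root *root,
						  struct btrfs_path *path,
						  u64 ino, u64 split_start)
{
	struct btrfs_key new_key;

	new_key.objectid = ino;
	new_key.type = BTRFS_EXTENT_DATA_KEY;
	new_key.offset = split_start;

	/*
	 * On success the path points at the new (second) copy; both copies
	 * still describe the full original extent and must be trimmed by
	 * the caller afterwards.
	 */
	return btrfs_duplicate_item(trans, root, path, &new_key);
}
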
4598/*
4599 * make the item pointed to by the path smaller. new_size indicates
4600 * how small to make it, and from_end tells us if we just chop bytes
4601 * off the end of the item or if we shift the item to chop bytes off
4602 * the front.
4603 */
4604void btrfs_truncate_item(struct btrfs_fs_info *fs_info,
4605 struct btrfs_path *path, u32 new_size, int from_end)
4606{
4607 int slot;
4608 struct extent_buffer *leaf;
4609 struct btrfs_item *item;
4610 u32 nritems;
4611 unsigned int data_end;
4612 unsigned int old_data_start;
4613 unsigned int old_size;
4614 unsigned int size_diff;
4615 int i;
4616 struct btrfs_map_token token;
4617
4618 btrfs_init_map_token(&token);
4619
4620 leaf = path->nodes[0];
4621 slot = path->slots[0];
4622
4623 old_size = btrfs_item_size_nr(leaf, slot);
4624 if (old_size == new_size)
4625 return;
4626
4627 nritems = btrfs_header_nritems(leaf);
4628 data_end = leaf_data_end(fs_info, leaf);
4629
4630 old_data_start = btrfs_item_offset_nr(leaf, slot);
4631
4632 size_diff = old_size - new_size;
4633
4634 BUG_ON(slot < 0);
4635 BUG_ON(slot >= nritems);
4636
4637 /*
4638 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4639 */
4640 /* first correct the data pointers */
4641 for (i = slot; i < nritems; i++) {
4642 u32 ioff;
4643 item = btrfs_item_nr(i);
4644
4645 ioff = btrfs_token_item_offset(leaf, item, &token);
4646 btrfs_set_token_item_offset(leaf, item,
4647 ioff + size_diff, &token);
4648 }
4649
4650 /* shift the data */
4651 if (from_end) {
4652 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4653 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4654 data_end, old_data_start + new_size - data_end);
4655 } else {
4656 struct btrfs_disk_key disk_key;
4657 u64 offset;
4658
4659 btrfs_item_key(leaf, &disk_key, slot);
4660
4661 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4662 unsigned long ptr;
4663 struct btrfs_file_extent_item *fi;
4664
4665 fi = btrfs_item_ptr(leaf, slot,
4666 struct btrfs_file_extent_item);
4667 fi = (struct btrfs_file_extent_item *)(
4668 (unsigned long)fi - size_diff);
4669
4670 if (btrfs_file_extent_type(leaf, fi) ==
4671 BTRFS_FILE_EXTENT_INLINE) {
4672 ptr = btrfs_item_ptr_offset(leaf, slot);
4673 memmove_extent_buffer(leaf, ptr,
4674 (unsigned long)fi,
4675 BTRFS_FILE_EXTENT_INLINE_DATA_START);
4676 }
4677 }
4678
4679 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4680 data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4681 data_end, old_data_start - data_end);
4682
4683 offset = btrfs_disk_key_offset(&disk_key);
4684 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4685 btrfs_set_item_key(leaf, &disk_key, slot);
4686 if (slot == 0)
4687 fixup_low_keys(path, &disk_key, 1);
4688 }
4689
4690 item = btrfs_item_nr(slot);
4691 btrfs_set_item_size(leaf, item, new_size);
4692 btrfs_mark_buffer_dirty(leaf);
4693
4694 if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4695 btrfs_print_leaf(leaf);
4696 BUG();
4697 }
4698}
4699
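/*
 * Editorial sketch (not part of the original file): shrinking the item
 * the path points at.  A hypothetical caller dropping trailing bytes,
 * similar to what the csum code does when truncating a checksum item;
 * drop_bytes is assumed to be smaller than the item size.
 */
static void __maybe_unused example_chop_tail(struct btrfs_fs_info *fs_info,
					     struct btrfs_path *path,
					     u32 drop_bytes)
{
	u32 size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);

	/*
	 * from_end == 1 removes data from the end and keeps the key;
	 * from_end == 0 would instead chop from the front and shift
	 * key.offset up by the difference.
	 */
	btrfs_truncate_item(fs_info, path, size - drop_bytes, 1);
}
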
4700/*
4701 * make the item pointed to by the path bigger, data_size is the added size.
4702 */
4703void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
4704 u32 data_size)
4705{
4706 int slot;
4707 struct extent_buffer *leaf;
4708 struct btrfs_item *item;
4709 u32 nritems;
4710 unsigned int data_end;
4711 unsigned int old_data;
4712 unsigned int old_size;
4713 int i;
4714 struct btrfs_map_token token;
4715
4716 btrfs_init_map_token(&token);
4717
4718 leaf = path->nodes[0];
4719
4720 nritems = btrfs_header_nritems(leaf);
4721 data_end = leaf_data_end(fs_info, leaf);
4722
4723 if (btrfs_leaf_free_space(fs_info, leaf) < data_size) {
4724 btrfs_print_leaf(leaf);
4725 BUG();
4726 }
4727 slot = path->slots[0];
4728 old_data = btrfs_item_end_nr(leaf, slot);
4729
4730 BUG_ON(slot < 0);
4731 if (slot >= nritems) {
4732 btrfs_print_leaf(leaf);
4733 btrfs_crit(fs_info, "slot %d too large, nritems %d",
4734 slot, nritems);
4735 BUG_ON(1);
4736 }
4737
4738 /*
4739 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4740 */
4741 /* first correct the data pointers */
4742 for (i = slot; i < nritems; i++) {
4743 u32 ioff;
4744 item = btrfs_item_nr(i);
4745
4746 ioff = btrfs_token_item_offset(leaf, item, &token);
4747 btrfs_set_token_item_offset(leaf, item,
4748 ioff - data_size, &token);
4749 }
4750
4751 /* shift the data */
4752 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4753 data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
4754 data_end, old_data - data_end);
4755
4756 data_end = old_data;
4757 old_size = btrfs_item_size_nr(leaf, slot);
4758 item = btrfs_item_nr(slot);
4759 btrfs_set_item_size(leaf, item, old_size + data_size);
4760 btrfs_mark_buffer_dirty(leaf);
4761
4762 if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4763 btrfs_print_leaf(leaf);
4764 BUG();
4765 }
4766}
4767
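/*
 * Editorial sketch (not part of the original file): growing the item in
 * place and filling the new tail.  The caller must know the leaf has
 * room, which is why callers normally search with ins_len set so
 * btrfs_search_slot() has already made the space.  The helper below is
 * a hypothetical illustration.
 */
static void __maybe_unused example_append_bytes(struct btrfs_fs_info *fs_info,
						struct btrfs_path *path,
						const void *data, u32 len)
{
	struct extent_buffer *leaf = path->nodes[0];
	unsigned long ptr;
	u32 old_size = btrfs_item_size_nr(leaf, path->slots[0]);

	btrfs_extend_item(fs_info, path, len);

	/* the old bytes keep their relative position; new space is at the end */
	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
	write_extent_buffer(leaf, data, ptr + old_size, len);
	btrfs_mark_buffer_dirty(leaf);
}
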
4768/*
4769 * this is a helper for btrfs_insert_empty_items, the main goal here is
4770 * to save stack depth by doing the bulk of the work in a function
4771 * that doesn't call btrfs_search_slot
4772 */
4773void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4774 const struct btrfs_key *cpu_key, u32 *data_size,
4775 u32 total_data, u32 total_size, int nr)
4776{
4777 struct btrfs_fs_info *fs_info = root->fs_info;
4778 struct btrfs_item *item;
4779 int i;
4780 u32 nritems;
4781 unsigned int data_end;
4782 struct btrfs_disk_key disk_key;
4783 struct extent_buffer *leaf;
4784 int slot;
4785 struct btrfs_map_token token;
4786
4787 if (path->slots[0] == 0) {
4788 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4789 fixup_low_keys(path, &disk_key, 1);
4790 }
4791 btrfs_unlock_up_safe(path, 1);
4792
4793 btrfs_init_map_token(&token);
4794
4795 leaf = path->nodes[0];
4796 slot = path->slots[0];
4797
4798 nritems = btrfs_header_nritems(leaf);
4799 data_end = leaf_data_end(fs_info, leaf);
4800
4801 if (btrfs_leaf_free_space(fs_info, leaf) < total_size) {
4802 btrfs_print_leaf(leaf);
4803 btrfs_crit(fs_info, "not enough freespace need %u have %d",
4804 total_size, btrfs_leaf_free_space(fs_info, leaf));
4805 BUG();
4806 }
4807
4808 if (slot != nritems) {
4809 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4810
4811 if (old_data < data_end) {
4812 btrfs_print_leaf(leaf);
4813 btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
4814 slot, old_data, data_end);
4815 BUG_ON(1);
4816 }
4817 /*
4818 * item0..itemN ... dataN.offset..dataN.size .. data0.size
4819 */
4820 /* first correct the data pointers */
4821 for (i = slot; i < nritems; i++) {
4822 u32 ioff;
4823
4824 item = btrfs_item_nr(i);
4825 ioff = btrfs_token_item_offset(leaf, item, &token);
4826 btrfs_set_token_item_offset(leaf, item,
4827 ioff - total_data, &token);
4828 }
4829 /* shift the items */
4830 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4831 btrfs_item_nr_offset(slot),
4832 (nritems - slot) * sizeof(struct btrfs_item));
4833
4834 /* shift the data */
4835 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4836 data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
4837 data_end, old_data - data_end);
4838 data_end = old_data;
4839 }
4840
4841 /* setup the item for the new data */
4842 for (i = 0; i < nr; i++) {
4843 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4844 btrfs_set_item_key(leaf, &disk_key, slot + i);
4845 item = btrfs_item_nr(slot + i);
4846 btrfs_set_token_item_offset(leaf, item,
4847 data_end - data_size[i], &token);
4848 data_end -= data_size[i];
4849 btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4850 }
4851
4852 btrfs_set_header_nritems(leaf, nritems + nr);
4853 btrfs_mark_buffer_dirty(leaf);
4854
4855 if (btrfs_leaf_free_space(fs_info, leaf) < 0) {
4856 btrfs_print_leaf(leaf);
4857 BUG();
4858 }
4859}
4860
4861/*
4862 * Given a key and some data, insert items into the tree.
4863 * This does all the path init required, making room in the tree if needed.
4864 */
4865int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4866 struct btrfs_root *root,
4867 struct btrfs_path *path,
4868 const struct btrfs_key *cpu_key, u32 *data_size,
4869 int nr)
4870{
4871 int ret = 0;
4872 int slot;
4873 int i;
4874 u32 total_size = 0;
4875 u32 total_data = 0;
4876
4877 for (i = 0; i < nr; i++)
4878 total_data += data_size[i];
4879
4880 total_size = total_data + (nr * sizeof(struct btrfs_item));
4881 ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4882 if (ret == 0)
4883 return -EEXIST;
4884 if (ret < 0)
4885 return ret;
4886
4887 slot = path->slots[0];
4888 BUG_ON(slot < 0);
4889
4890 setup_items_for_insert(root, path, cpu_key, data_size,
4891 total_data, total_size, nr);
4892 return 0;
4893}
4894
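/*
 * Editorial sketch (not part of the original file): reserving room for
 * two items with a single search.  Keys, sizes and the xattr item type
 * are illustrative assumptions; the keys must be given in sorted order
 * and end up contiguous in one leaf.
 */
static int __maybe_unused example_insert_pair(struct btrfs_trans_handle *trans,
					      struct btrfs_root *root,
					      struct btrfs_path *path,
					      u64 objectid)
{
	struct btrfs_key keys[2];
	u32 sizes[2] = { 16, 32 };
	int ret;

	keys[0].objectid = objectid;
	keys[0].type = BTRFS_XATTR_ITEM_KEY;
	keys[0].offset = 0;
	keys[1] = keys[0];
	keys[1].offset = 1;

	ret = btrfs_insert_empty_items(trans, root, path, keys, sizes, 2);
	if (ret)
		return ret;

	/*
	 * path->slots[0] now points at keys[0]; the reserved item data is
	 * uninitialized and should be filled with write_extent_buffer()
	 * before the path is released.
	 */
	btrfs_release_path(path);
	return 0;
}
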
4895/*
4896 * Given a key and some data, insert an item into the tree.
4897 * This does all the path init required, making room in the tree if needed.
4898 */
4899int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4900 const struct btrfs_key *cpu_key, void *data,
4901 u32 data_size)
4902{
4903 int ret = 0;
4904 struct btrfs_path *path;
4905 struct extent_buffer *leaf;
4906 unsigned long ptr;
4907
4908 path = btrfs_alloc_path();
4909 if (!path)
4910 return -ENOMEM;
4911 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4912 if (!ret) {
4913 leaf = path->nodes[0];
4914 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4915 write_extent_buffer(leaf, data, ptr, data_size);
4916 btrfs_mark_buffer_dirty(leaf);
4917 }
4918 btrfs_free_path(path);
4919 return ret;
4920}
4921
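/*
 * Editorial sketch (not part of the original file): the one-call insert
 * path.  The key type and payload are made-up values for illustration.
 */
static int __maybe_unused example_insert_one(struct btrfs_trans_handle *trans,
					     struct btrfs_root *root,
					     u64 objectid)
{
	struct btrfs_key key;
	char payload[8] = "example";

	key.objectid = objectid;
	key.type = BTRFS_XATTR_ITEM_KEY;	/* hypothetical choice */
	key.offset = 0;

	return btrfs_insert_item(trans, root, &key, payload, sizeof(payload));
}
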
4922/*
4923 * delete the pointer from a given node.
4924 *
4925 * the tree should have been previously balanced so the deletion does not
4926 * empty a node.
4927 */
4928static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4929 int level, int slot)
4930{
4931 struct extent_buffer *parent = path->nodes[level];
4932 u32 nritems;
4933 int ret;
4934
4935 nritems = btrfs_header_nritems(parent);
4936 if (slot != nritems - 1) {
4937 if (level) {
4938 ret = tree_mod_log_insert_move(parent, slot, slot + 1,
4939 nritems - slot - 1);
4940 BUG_ON(ret < 0);
4941 }
4942 memmove_extent_buffer(parent,
4943 btrfs_node_key_ptr_offset(slot),
4944 btrfs_node_key_ptr_offset(slot + 1),
4945 sizeof(struct btrfs_key_ptr) *
4946 (nritems - slot - 1));
4947 } else if (level) {
4948 ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
4949 GFP_NOFS);
4950 BUG_ON(ret < 0);
4951 }
4952
4953 nritems--;
4954 btrfs_set_header_nritems(parent, nritems);
4955 if (nritems == 0 && parent == root->node) {
4956 BUG_ON(btrfs_header_level(root->node) != 1);
4957 /* just turn the root into a leaf and break */
4958 btrfs_set_header_level(root->node, 0);
4959 } else if (slot == 0) {
4960 struct btrfs_disk_key disk_key;
4961
4962 btrfs_node_key(parent, &disk_key, 0);
4963 fixup_low_keys(path, &disk_key, level + 1);
4964 }
4965 btrfs_mark_buffer_dirty(parent);
4966}
4967
4968/*
4969 * a helper function to delete the leaf pointed to by path->slots[1] and
4970 * path->nodes[1].
4971 *
4972 * This deletes the pointer in path->nodes[1] and frees the leaf
4973 * block extent. zero is returned if it all worked out, < 0 otherwise.
4974 *
4975 * The path must have already been setup for deleting the leaf, including
4976 * all the proper balancing. path->nodes[1] must be locked.
4977 */
4978static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4979 struct btrfs_root *root,
4980 struct btrfs_path *path,
4981 struct extent_buffer *leaf)
4982{
4983 WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4984 del_ptr(root, path, 1, path->slots[1]);
4985
4986 /*
4987 * btrfs_free_extent is expensive, we want to make sure we
4988 * aren't holding any locks when we call it
4989 */
4990 btrfs_unlock_up_safe(path, 0);
4991
4992 root_sub_used(root, leaf->len);
4993
4994 extent_buffer_get(leaf);
4995 btrfs_free_tree_block(trans, root, leaf, 0, 1);
4996 free_extent_buffer_stale(leaf);
4997}
4998/*
4999 * delete the item at the leaf level in path. If that empties
5000 * the leaf, remove it from the tree
5001 */
5002int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5003 struct btrfs_path *path, int slot, int nr)
5004{
5005 struct btrfs_fs_info *fs_info = root->fs_info;
5006 struct extent_buffer *leaf;
5007 struct btrfs_item *item;
5008 u32 last_off;
5009 u32 dsize = 0;
5010 int ret = 0;
5011 int wret;
5012 int i;
5013 u32 nritems;
5014 struct btrfs_map_token token;
5015
5016 btrfs_init_map_token(&token);
5017
5018 leaf = path->nodes[0];
5019 last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
5020
5021 for (i = 0; i < nr; i++)
5022 dsize += btrfs_item_size_nr(leaf, slot + i);
5023
5024 nritems = btrfs_header_nritems(leaf);
5025
5026 if (slot + nr != nritems) {
5027 int data_end = leaf_data_end(fs_info, leaf);
5028
5029 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
5030 data_end + dsize,
5031 BTRFS_LEAF_DATA_OFFSET + data_end,
5032 last_off - data_end);
5033
5034 for (i = slot + nr; i < nritems; i++) {
5035 u32 ioff;
5036
5037 item = btrfs_item_nr(i);
5038 ioff = btrfs_token_item_offset(leaf, item, &token);
5039 btrfs_set_token_item_offset(leaf, item,
5040 ioff + dsize, &token);
5041 }
5042
5043 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
5044 btrfs_item_nr_offset(slot + nr),
5045 sizeof(struct btrfs_item) *
5046 (nritems - slot - nr));
5047 }
5048 btrfs_set_header_nritems(leaf, nritems - nr);
5049 nritems -= nr;
5050
5051 /* delete the leaf if we've emptied it */
5052 if (nritems == 0) {
5053 if (leaf == root->node) {
5054 btrfs_set_header_level(leaf, 0);
5055 } else {
5056 btrfs_set_path_blocking(path);
5057 clean_tree_block(fs_info, leaf);
5058 btrfs_del_leaf(trans, root, path, leaf);
5059 }
5060 } else {
5061 int used = leaf_space_used(leaf, 0, nritems);
5062 if (slot == 0) {
5063 struct btrfs_disk_key disk_key;
5064
5065 btrfs_item_key(leaf, &disk_key, 0);
5066 fixup_low_keys(path, &disk_key, 1);
5067 }
5068
5069 /* delete the leaf if it is mostly empty */
5070 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
5071 /* push_leaf_left fixes the path.
5072 * make sure the path still points to our leaf
5073 * for possible call to del_ptr below
5074 */
5075 slot = path->slots[1];
5076 extent_buffer_get(leaf);
5077
5078 btrfs_set_path_blocking(path);
5079 wret = push_leaf_left(trans, root, path, 1, 1,
5080 1, (u32)-1);
5081 if (wret < 0 && wret != -ENOSPC)
5082 ret = wret;
5083
5084 if (path->nodes[0] == leaf &&
5085 btrfs_header_nritems(leaf)) {
5086 wret = push_leaf_right(trans, root, path, 1,
5087 1, 1, 0);
5088 if (wret < 0 && wret != -ENOSPC)
5089 ret = wret;
5090 }
5091
5092 if (btrfs_header_nritems(leaf) == 0) {
5093 path->slots[1] = slot;
5094 btrfs_del_leaf(trans, root, path, leaf);
5095 free_extent_buffer(leaf);
5096 ret = 0;
5097 } else {
5098 /* if we're still in the path, make sure
5099 * we're dirty. Otherwise, one of the
5100 * push_leaf functions must have already
5101 * dirtied this buffer
5102 */
5103 if (path->nodes[0] == leaf)
5104 btrfs_mark_buffer_dirty(leaf);
5105 free_extent_buffer(leaf);
5106 }
5107 } else {
5108 btrfs_mark_buffer_dirty(leaf);
5109 }
5110 }
5111 return ret;
5112}
5113
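/*
 * Editorial sketch (not part of the original file): deleting the item a
 * prior btrfs_search_slot() left the path pointing at.  Single-item
 * deletes normally go through the btrfs_del_item() wrapper in ctree.h,
 * which is just btrfs_del_items() with nr == 1.
 */
static int __maybe_unused example_del_current(struct btrfs_trans_handle *trans,
					      struct btrfs_root *root,
					      struct btrfs_path *path)
{
	return btrfs_del_items(trans, root, path, path->slots[0], 1);
}
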
5114/*
5115 * search the tree again to find a leaf with lesser keys
5116 * returns 0 if it found something or 1 if there are no lesser leaves.
5117 * returns < 0 on io errors.
5118 *
5119 * This may release the path, and so you may lose any locks held at the
5120 * time you call it.
5121 */
5122int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5123{
5124 struct btrfs_key key;
5125 struct btrfs_disk_key found_key;
5126 int ret;
5127
5128 btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5129
5130 if (key.offset > 0) {
5131 key.offset--;
5132 } else if (key.type > 0) {
5133 key.type--;
5134 key.offset = (u64)-1;
5135 } else if (key.objectid > 0) {
5136 key.objectid--;
5137 key.type = (u8)-1;
5138 key.offset = (u64)-1;
5139 } else {
5140 return 1;
5141 }
5142
5143 btrfs_release_path(path);
5144 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5145 if (ret < 0)
5146 return ret;
5147 btrfs_item_key(path->nodes[0], &found_key, 0);
5148 ret = comp_keys(&found_key, &key);
5149 /*
5150 * We might have had an item with the previous key in the tree right
5151 * before we released our path. And after we released our path, that
5152 * item might have been pushed to the first slot (0) of the leaf we
5153 * were holding due to a tree balance. Alternatively, an item with the
5154 * previous key can exist as the only element of a leaf (big fat item).
5155 * Therefore account for these 2 cases, so that our callers (like
5156 * btrfs_previous_item) don't miss an existing item with a key matching
5157 * the previous key we computed above.
5158 */
5159 if (ret <= 0)
5160 return 0;
5161 return 1;
5162}
5163
5164/*
5165 * A helper function to walk down the tree starting at min_key, and looking
5166 * for nodes or leaves that have a minimum transaction id.
5167 * This is used by the btree defrag code, and tree logging
5168 *
5169 * This does not cow, but it does stuff the starting key it finds back
5170 * into min_key, so you can call btrfs_search_slot with cow=1 on the
5171 * key and get a writable path.
5172 *
5173 * This honors path->lowest_level to prevent descent past a given level
5174 * of the tree.
5175 *
5176 * min_trans indicates the oldest transaction that you are interested
5177 * in walking through. Any nodes or leaves older than min_trans are
5178 * skipped over (without reading them).
5179 *
5180 * returns zero if something useful was found, < 0 on error and 1 if there
5181 * was nothing in the tree that matched the search criteria.
5182 */
5183int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5184 struct btrfs_path *path,
5185 u64 min_trans)
5186{
5187 struct btrfs_fs_info *fs_info = root->fs_info;
5188 struct extent_buffer *cur;
5189 struct btrfs_key found_key;
5190 int slot;
5191 int sret;
5192 u32 nritems;
5193 int level;
5194 int ret = 1;
5195 int keep_locks = path->keep_locks;
5196
5197 path->keep_locks = 1;
5198again:
5199 cur = btrfs_read_lock_root_node(root);
5200 level = btrfs_header_level(cur);
5201 WARN_ON(path->nodes[level]);
5202 path->nodes[level] = cur;
5203 path->locks[level] = BTRFS_READ_LOCK;
5204
5205 if (btrfs_header_generation(cur) < min_trans) {
5206 ret = 1;
5207 goto out;
5208 }
5209 while (1) {
5210 nritems = btrfs_header_nritems(cur);
5211 level = btrfs_header_level(cur);
5212 sret = btrfs_bin_search(cur, min_key, level, &slot);
5213
5214 /* at the lowest level, we're done, setup the path and exit */
5215 if (level == path->lowest_level) {
5216 if (slot >= nritems)
5217 goto find_next_key;
5218 ret = 0;
5219 path->slots[level] = slot;
5220 btrfs_item_key_to_cpu(cur, &found_key, slot);
5221 goto out;
5222 }
5223 if (sret && slot > 0)
5224 slot--;
5225 /*
5226		 * check this node pointer against the min_trans parameter.
5227		 * If it is too old, skip to the next one.
5228 */
5229 while (slot < nritems) {
5230 u64 gen;
5231
5232 gen = btrfs_node_ptr_generation(cur, slot);
5233 if (gen < min_trans) {
5234 slot++;
5235 continue;
5236 }
5237 break;
5238 }
5239find_next_key:
5240 /*
5241 * we didn't find a candidate key in this node, walk forward
5242 * and find another one
5243 */
5244 if (slot >= nritems) {
5245 path->slots[level] = slot;
5246 btrfs_set_path_blocking(path);
5247 sret = btrfs_find_next_key(root, path, min_key, level,
5248 min_trans);
5249 if (sret == 0) {
5250 btrfs_release_path(path);
5251 goto again;
5252 } else {
5253 goto out;
5254 }
5255 }
5256 /* save our key for returning back */
5257 btrfs_node_key_to_cpu(cur, &found_key, slot);
5258 path->slots[level] = slot;
5259 if (level == path->lowest_level) {
5260 ret = 0;
5261 goto out;
5262 }
5263 btrfs_set_path_blocking(path);
5264 cur = read_node_slot(fs_info, cur, slot);
5265 if (IS_ERR(cur)) {
5266 ret = PTR_ERR(cur);
5267 goto out;
5268 }
5269
5270 btrfs_tree_read_lock(cur);
5271
5272 path->locks[level - 1] = BTRFS_READ_LOCK;
5273 path->nodes[level - 1] = cur;
5274 unlock_up(path, level, 1, 0, NULL);
5275 btrfs_clear_path_blocking(path, NULL, 0);
5276 }
5277out:
5278 path->keep_locks = keep_locks;
5279 if (ret == 0) {
5280 btrfs_unlock_up_safe(path, path->lowest_level + 1);
5281 btrfs_set_path_blocking(path);
5282 memcpy(min_key, &found_key, sizeof(found_key));
5283 }
5284 return ret;
5285}
5286
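/*
 * Editorial sketch (not part of the original file): scanning every leaf
 * item modified in or after a given transaction, the way the defrag and
 * tree-log code drive btrfs_search_forward().  The key advancement is
 * the usual pattern: bump offset and carry into type/objectid.
 */
static int __maybe_unused example_scan_newer(struct btrfs_root *root,
					     u64 min_trans)
{
	struct btrfs_path *path;
	struct btrfs_key key = { 0 };
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	while (1) {
		ret = btrfs_search_forward(root, &key, path, min_trans);
		if (ret) {
			if (ret > 0)	/* no more matches */
				ret = 0;
			break;
		}
		/* ... process path->nodes[0] / path->slots[0] here ... */
		btrfs_release_path(path);

		/* step past the key btrfs_search_forward stuffed back */
		if (key.offset < (u64)-1) {
			key.offset++;
		} else if (key.type < (u8)-1) {
			key.offset = 0;
			key.type++;
		} else if (key.objectid < (u64)-1) {
			key.offset = 0;
			key.type = 0;
			key.objectid++;
		} else {
			break;
		}
	}
	btrfs_free_path(path);
	return ret;
}
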
5287static int tree_move_down(struct btrfs_fs_info *fs_info,
5288 struct btrfs_path *path,
5289 int *level)
5290{
5291 struct extent_buffer *eb;
5292
5293 BUG_ON(*level == 0);
5294 eb = read_node_slot(fs_info, path->nodes[*level], path->slots[*level]);
5295 if (IS_ERR(eb))
5296 return PTR_ERR(eb);
5297
5298 path->nodes[*level - 1] = eb;
5299 path->slots[*level - 1] = 0;
5300 (*level)--;
5301 return 0;
5302}
5303
5304static int tree_move_next_or_upnext(struct btrfs_path *path,
5305 int *level, int root_level)
5306{
5307 int ret = 0;
5308 int nritems;
5309 nritems = btrfs_header_nritems(path->nodes[*level]);
5310
5311 path->slots[*level]++;
5312
5313 while (path->slots[*level] >= nritems) {
5314 if (*level == root_level)
5315 return -1;
5316
5317 /* move upnext */
5318 path->slots[*level] = 0;
5319 free_extent_buffer(path->nodes[*level]);
5320 path->nodes[*level] = NULL;
5321 (*level)++;
5322 path->slots[*level]++;
5323
5324 nritems = btrfs_header_nritems(path->nodes[*level]);
5325 ret = 1;
5326 }
5327 return ret;
5328}
5329
5330/*
5331 * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5332 * or down.
5333 */
5334static int tree_advance(struct btrfs_fs_info *fs_info,
5335 struct btrfs_path *path,
5336 int *level, int root_level,
5337 int allow_down,
5338 struct btrfs_key *key)
5339{
5340 int ret;
5341
5342 if (*level == 0 || !allow_down) {
5343 ret = tree_move_next_or_upnext(path, level, root_level);
5344 } else {
5345 ret = tree_move_down(fs_info, path, level);
5346 }
5347 if (ret >= 0) {
5348 if (*level == 0)
5349 btrfs_item_key_to_cpu(path->nodes[*level], key,
5350 path->slots[*level]);
5351 else
5352 btrfs_node_key_to_cpu(path->nodes[*level], key,
5353 path->slots[*level]);
5354 }
5355 return ret;
5356}
5357
5358static int tree_compare_item(struct btrfs_path *left_path,
5359 struct btrfs_path *right_path,
5360 char *tmp_buf)
5361{
5362 int cmp;
5363 int len1, len2;
5364 unsigned long off1, off2;
5365
5366 len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5367 len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5368 if (len1 != len2)
5369 return 1;
5370
5371 off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5372 off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5373 right_path->slots[0]);
5374
5375 read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5376
5377 cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5378 if (cmp)
5379 return 1;
5380 return 0;
5381}
5382
5383#define ADVANCE 1
5384#define ADVANCE_ONLY_NEXT -1
5385
5386/*
5387 * This function compares two trees and calls the provided callback for
5388 * every changed/new/deleted item it finds.
5389 * If shared tree blocks are encountered, whole subtrees are skipped, making
5390 * the compare pretty fast on snapshotted subvolumes.
5391 *
5392 * This currently works on commit roots only. As commit roots are read only,
5393 * we don't do any locking. The commit roots are protected with transactions.
5394 * Transactions are ended and rejoined when a commit is tried in between.
5395 *
5396 * This function checks for modifications done to the trees while comparing.
5397 * If it detects a change, it aborts immediately.
5398 */
5399int btrfs_compare_trees(struct btrfs_root *left_root,
5400 struct btrfs_root *right_root,
5401 btrfs_changed_cb_t changed_cb, void *ctx)
5402{
5403 struct btrfs_fs_info *fs_info = left_root->fs_info;
5404 int ret;
5405 int cmp;
5406 struct btrfs_path *left_path = NULL;
5407 struct btrfs_path *right_path = NULL;
5408 struct btrfs_key left_key;
5409 struct btrfs_key right_key;
5410 char *tmp_buf = NULL;
5411 int left_root_level;
5412 int right_root_level;
5413 int left_level;
5414 int right_level;
5415 int left_end_reached;
5416 int right_end_reached;
5417 int advance_left;
5418 int advance_right;
5419 u64 left_blockptr;
5420 u64 right_blockptr;
5421 u64 left_gen;
5422 u64 right_gen;
5423
5424 left_path = btrfs_alloc_path();
5425 if (!left_path) {
5426 ret = -ENOMEM;
5427 goto out;
5428 }
5429 right_path = btrfs_alloc_path();
5430 if (!right_path) {
5431 ret = -ENOMEM;
5432 goto out;
5433 }
5434
5435 tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
5436 if (!tmp_buf) {
5437 ret = -ENOMEM;
5438 goto out;
5439 }
5440
5441 left_path->search_commit_root = 1;
5442 left_path->skip_locking = 1;
5443 right_path->search_commit_root = 1;
5444 right_path->skip_locking = 1;
5445
5446 /*
5447 * Strategy: Go to the first items of both trees. Then do
5448 *
5449 * If both trees are at level 0
5450 * Compare keys of current items
5451 * If left < right treat left item as new, advance left tree
5452 * and repeat
5453 * If left > right treat right item as deleted, advance right tree
5454 * and repeat
5455 * If left == right do deep compare of items, treat as changed if
5456 * needed, advance both trees and repeat
5457 * If both trees are at the same level but not at level 0
5458	 *   Compare keys of current nodes/leaves
5459 * If left < right advance left tree and repeat
5460 * If left > right advance right tree and repeat
5461	 *   If left == right compare blockptrs of the next nodes/leaves
5462 * If they match advance both trees but stay at the same level
5463 * and repeat
5464 * If they don't match advance both trees while allowing to go
5465 * deeper and repeat
5466 * If tree levels are different
5467 * Advance the tree that needs it and repeat
5468 *
5469 * Advancing a tree means:
5470 * If we are at level 0, try to go to the next slot. If that's not
5471 * possible, go one level up and repeat. Stop when we found a level
5472 * where we could go to the next slot. We may at this point be on a
5473 * node or a leaf.
5474 *
5475 * If we are not at level 0 and not on shared tree blocks, go one
5476 * level deeper.
5477 *
5478 * If we are not at level 0 and on shared tree blocks, go one slot to
5479 * the right if possible or go up and right.
5480 */
5481
5482 down_read(&fs_info->commit_root_sem);
5483 left_level = btrfs_header_level(left_root->commit_root);
5484 left_root_level = left_level;
5485 left_path->nodes[left_level] =
5486 btrfs_clone_extent_buffer(left_root->commit_root);
5487 if (!left_path->nodes[left_level]) {
5488 up_read(&fs_info->commit_root_sem);
5489 ret = -ENOMEM;
5490 goto out;
5491 }
5492 extent_buffer_get(left_path->nodes[left_level]);
5493
5494 right_level = btrfs_header_level(right_root->commit_root);
5495 right_root_level = right_level;
5496 right_path->nodes[right_level] =
5497 btrfs_clone_extent_buffer(right_root->commit_root);
5498 if (!right_path->nodes[right_level]) {
5499 up_read(&fs_info->commit_root_sem);
5500 ret = -ENOMEM;
5501 goto out;
5502 }
5503 extent_buffer_get(right_path->nodes[right_level]);
5504 up_read(&fs_info->commit_root_sem);
5505
5506 if (left_level == 0)
5507 btrfs_item_key_to_cpu(left_path->nodes[left_level],
5508 &left_key, left_path->slots[left_level]);
5509 else
5510 btrfs_node_key_to_cpu(left_path->nodes[left_level],
5511 &left_key, left_path->slots[left_level]);
5512 if (right_level == 0)
5513 btrfs_item_key_to_cpu(right_path->nodes[right_level],
5514 &right_key, right_path->slots[right_level]);
5515 else
5516 btrfs_node_key_to_cpu(right_path->nodes[right_level],
5517 &right_key, right_path->slots[right_level]);
5518
5519 left_end_reached = right_end_reached = 0;
5520 advance_left = advance_right = 0;
5521
5522 while (1) {
5523 cond_resched();
5524 if (advance_left && !left_end_reached) {
5525 ret = tree_advance(fs_info, left_path, &left_level,
5526 left_root_level,
5527 advance_left != ADVANCE_ONLY_NEXT,
5528 &left_key);
5529 if (ret == -1)
5530 left_end_reached = ADVANCE;
5531 else if (ret < 0)
5532 goto out;
5533 advance_left = 0;
5534 }
5535 if (advance_right && !right_end_reached) {
5536 ret = tree_advance(fs_info, right_path, &right_level,
5537 right_root_level,
5538 advance_right != ADVANCE_ONLY_NEXT,
5539 &right_key);
5540 if (ret == -1)
5541 right_end_reached = ADVANCE;
5542 else if (ret < 0)
5543 goto out;
5544 advance_right = 0;
5545 }
5546
5547 if (left_end_reached && right_end_reached) {
5548 ret = 0;
5549 goto out;
5550 } else if (left_end_reached) {
5551 if (right_level == 0) {
5552 ret = changed_cb(left_path, right_path,
5553 &right_key,
5554 BTRFS_COMPARE_TREE_DELETED,
5555 ctx);
5556 if (ret < 0)
5557 goto out;
5558 }
5559 advance_right = ADVANCE;
5560 continue;
5561 } else if (right_end_reached) {
5562 if (left_level == 0) {
5563 ret = changed_cb(left_path, right_path,
5564 &left_key,
5565 BTRFS_COMPARE_TREE_NEW,
5566 ctx);
5567 if (ret < 0)
5568 goto out;
5569 }
5570 advance_left = ADVANCE;
5571 continue;
5572 }
5573
5574 if (left_level == 0 && right_level == 0) {
5575 cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5576 if (cmp < 0) {
5577 ret = changed_cb(left_path, right_path,
5578 &left_key,
5579 BTRFS_COMPARE_TREE_NEW,
5580 ctx);
5581 if (ret < 0)
5582 goto out;
5583 advance_left = ADVANCE;
5584 } else if (cmp > 0) {
5585 ret = changed_cb(left_path, right_path,
5586 &right_key,
5587 BTRFS_COMPARE_TREE_DELETED,
5588 ctx);
5589 if (ret < 0)
5590 goto out;
5591 advance_right = ADVANCE;
5592 } else {
5593 enum btrfs_compare_tree_result result;
5594
5595 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5596 ret = tree_compare_item(left_path, right_path,
5597 tmp_buf);
5598 if (ret)
5599 result = BTRFS_COMPARE_TREE_CHANGED;
5600 else
5601 result = BTRFS_COMPARE_TREE_SAME;
5602 ret = changed_cb(left_path, right_path,
5603 &left_key, result, ctx);
5604 if (ret < 0)
5605 goto out;
5606 advance_left = ADVANCE;
5607 advance_right = ADVANCE;
5608 }
5609 } else if (left_level == right_level) {
5610 cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5611 if (cmp < 0) {
5612 advance_left = ADVANCE;
5613 } else if (cmp > 0) {
5614 advance_right = ADVANCE;
5615 } else {
5616 left_blockptr = btrfs_node_blockptr(
5617 left_path->nodes[left_level],
5618 left_path->slots[left_level]);
5619 right_blockptr = btrfs_node_blockptr(
5620 right_path->nodes[right_level],
5621 right_path->slots[right_level]);
5622 left_gen = btrfs_node_ptr_generation(
5623 left_path->nodes[left_level],
5624 left_path->slots[left_level]);
5625 right_gen = btrfs_node_ptr_generation(
5626 right_path->nodes[right_level],
5627 right_path->slots[right_level]);
5628 if (left_blockptr == right_blockptr &&
5629 left_gen == right_gen) {
5630 /*
5631 * As we're on a shared block, don't
5632 * allow to go deeper.
5633 */
5634 advance_left = ADVANCE_ONLY_NEXT;
5635 advance_right = ADVANCE_ONLY_NEXT;
5636 } else {
5637 advance_left = ADVANCE;
5638 advance_right = ADVANCE;
5639 }
5640 }
5641 } else if (left_level < right_level) {
5642 advance_right = ADVANCE;
5643 } else {
5644 advance_left = ADVANCE;
5645 }
5646 }
5647
5648out:
5649 btrfs_free_path(left_path);
5650 btrfs_free_path(right_path);
5651 kvfree(tmp_buf);
5652 return ret;
5653}
5654
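/*
 * Editorial sketch (not part of the original file): a minimal
 * btrfs_changed_cb_t in the shape the send code uses, here just counting
 * the four result classes.  The stats struct is a made-up assumption;
 * wire it up with btrfs_compare_trees(left_root, right_root,
 * example_changed_cb, &stats).
 */
struct example_diff_stats {
	u64 nr_new;
	u64 nr_deleted;
	u64 nr_changed;
	u64 nr_same;
};

static int __maybe_unused example_changed_cb(struct btrfs_path *left_path,
					     struct btrfs_path *right_path,
					     struct btrfs_key *key,
					     enum btrfs_compare_tree_result result,
					     void *ctx)
{
	struct example_diff_stats *stats = ctx;

	switch (result) {
	case BTRFS_COMPARE_TREE_NEW:
		stats->nr_new++;
		break;
	case BTRFS_COMPARE_TREE_DELETED:
		stats->nr_deleted++;
		break;
	case BTRFS_COMPARE_TREE_CHANGED:
		stats->nr_changed++;
		break;
	case BTRFS_COMPARE_TREE_SAME:
		stats->nr_same++;
		break;
	}
	return 0;	/* nonzero aborts the compare */
}
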
5655/*
5656 * this is similar to btrfs_next_leaf, but does not try to preserve
5657 * and fixup the path. It looks for and returns the next key in the
5658 * tree based on the current path and the min_trans parameters.
5659 *
5660 * 0 is returned if another key is found, < 0 if there are any errors
5661 * and 1 is returned if there are no higher keys in the tree
5662 *
5663 * path->keep_locks should be set to 1 on the search made before
5664 * calling this function.
5665 */
5666int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5667 struct btrfs_key *key, int level, u64 min_trans)
5668{
5669 int slot;
5670 struct extent_buffer *c;
5671
5672 WARN_ON(!path->keep_locks);
5673 while (level < BTRFS_MAX_LEVEL) {
5674 if (!path->nodes[level])
5675 return 1;
5676
5677 slot = path->slots[level] + 1;
5678 c = path->nodes[level];
5679next:
5680 if (slot >= btrfs_header_nritems(c)) {
5681 int ret;
5682 int orig_lowest;
5683 struct btrfs_key cur_key;
5684 if (level + 1 >= BTRFS_MAX_LEVEL ||
5685 !path->nodes[level + 1])
5686 return 1;
5687
5688 if (path->locks[level + 1]) {
5689 level++;
5690 continue;
5691 }
5692
5693 slot = btrfs_header_nritems(c) - 1;
5694 if (level == 0)
5695 btrfs_item_key_to_cpu(c, &cur_key, slot);
5696 else
5697 btrfs_node_key_to_cpu(c, &cur_key, slot);
5698
5699 orig_lowest = path->lowest_level;
5700 btrfs_release_path(path);
5701 path->lowest_level = level;
5702 ret = btrfs_search_slot(NULL, root, &cur_key, path,
5703 0, 0);
5704 path->lowest_level = orig_lowest;
5705 if (ret < 0)
5706 return ret;
5707
5708 c = path->nodes[level];
5709 slot = path->slots[level];
5710 if (ret == 0)
5711 slot++;
5712 goto next;
5713 }
5714
5715 if (level == 0)
5716 btrfs_item_key_to_cpu(c, key, slot);
5717 else {
5718 u64 gen = btrfs_node_ptr_generation(c, slot);
5719
5720 if (gen < min_trans) {
5721 slot++;
5722 goto next;
5723 }
5724 btrfs_node_key_to_cpu(c, key, slot);
5725 }
5726 return 0;
5727 }
5728 return 1;
5729}
5730
5731/*
5732 * search the tree again to find a leaf with greater keys
5733 * returns 0 if it found something or 1 if there are no greater leaves.
5734 * returns < 0 on io errors.
5735 */
5736int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5737{
5738 return btrfs_next_old_leaf(root, path, 0);
5739}
5740
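/*
 * Editorial sketch (not part of the original file): the standard forward
 * iteration loop built on btrfs_next_leaf().  The objectid filter and
 * helper name are assumptions for illustration.
 */
static int __maybe_unused example_walk_items(struct btrfs_root *root,
					     u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = objectid;
	key.type = 0;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		struct extent_buffer *leaf = path->nodes[0];

		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			if (ret > 0)
				break;	/* ran off the end of the tree */
			continue;
		}
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != objectid)
			break;
		/* ... process the item at path->slots[0] ... */
		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
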
5741int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5742 u64 time_seq)
5743{
5744 int slot;
5745 int level;
5746 struct extent_buffer *c;
5747 struct extent_buffer *next;
5748 struct btrfs_key key;
5749 u32 nritems;
5750 int ret;
5751 int old_spinning = path->leave_spinning;
5752 int next_rw_lock = 0;
5753
5754 nritems = btrfs_header_nritems(path->nodes[0]);
5755 if (nritems == 0)
5756 return 1;
5757
5758 btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5759again:
5760 level = 1;
5761 next = NULL;
5762 next_rw_lock = 0;
5763 btrfs_release_path(path);
5764
5765 path->keep_locks = 1;
5766 path->leave_spinning = 1;
5767
5768 if (time_seq)
5769 ret = btrfs_search_old_slot(root, &key, path, time_seq);
5770 else
5771 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5772 path->keep_locks = 0;
5773
5774 if (ret < 0)
5775 return ret;
5776
5777 nritems = btrfs_header_nritems(path->nodes[0]);
5778 /*
5779 * by releasing the path above we dropped all our locks. A balance
5780 * could have added more items next to the key that used to be
5781 * at the very end of the block. So, check again here and
5782 * advance the path if there are now more items available.
5783 */
5784 if (nritems > 0 && path->slots[0] < nritems - 1) {
5785 if (ret == 0)
5786 path->slots[0]++;
5787 ret = 0;
5788 goto done;
5789 }
5790 /*
5791 * So the above check misses one case:
5792 * - after releasing the path above, someone has removed the item that
5793	 *   used to be at the very end of the block, and balance between leaves
5794 * gets another one with bigger key.offset to replace it.
5795 *
5796 * This one should be returned as well, or we can get leaf corruption
5797	 * later (esp. in __btrfs_drop_extents()).
5798 *
5799 * And a bit more explanation about this check,
5800 * with ret > 0, the key isn't found, the path points to the slot
5801 * where it should be inserted, so the path->slots[0] item must be the
5802 * bigger one.
5803 */
5804 if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
5805 ret = 0;
5806 goto done;
5807 }
5808
5809 while (level < BTRFS_MAX_LEVEL) {
5810 if (!path->nodes[level]) {
5811 ret = 1;
5812 goto done;
5813 }
5814
5815 slot = path->slots[level] + 1;
5816 c = path->nodes[level];
5817 if (slot >= btrfs_header_nritems(c)) {
5818 level++;
5819 if (level == BTRFS_MAX_LEVEL) {
5820 ret = 1;
5821 goto done;
5822 }
5823 continue;
5824 }
5825
5826 if (next) {
5827 btrfs_tree_unlock_rw(next, next_rw_lock);
5828 free_extent_buffer(next);
5829 }
5830
5831 next = c;
5832 next_rw_lock = path->locks[level];
5833 ret = read_block_for_search(root, path, &next, level,
5834 slot, &key);
5835 if (ret == -EAGAIN)
5836 goto again;
5837
5838 if (ret < 0) {
5839 btrfs_release_path(path);
5840 goto done;
5841 }
5842
5843 if (!path->skip_locking) {
5844 ret = btrfs_try_tree_read_lock(next);
5845 if (!ret && time_seq) {
5846 /*
5847 * If we don't get the lock, we may be racing
5848 * with push_leaf_left, holding that lock while
5849 * itself waiting for the leaf we've currently
5850 * locked. To solve this situation, we give up
5851 * on our lock and cycle.
5852 */
5853 free_extent_buffer(next);
5854 btrfs_release_path(path);
5855 cond_resched();
5856 goto again;
5857 }
5858 if (!ret) {
5859 btrfs_set_path_blocking(path);
5860 btrfs_tree_read_lock(next);
5861 btrfs_clear_path_blocking(path, next,
5862 BTRFS_READ_LOCK);
5863 }
5864 next_rw_lock = BTRFS_READ_LOCK;
5865 }
5866 break;
5867 }
5868 path->slots[level] = slot;
5869 while (1) {
5870 level--;
5871 c = path->nodes[level];
5872 if (path->locks[level])
5873 btrfs_tree_unlock_rw(c, path->locks[level]);
5874
5875 free_extent_buffer(c);
5876 path->nodes[level] = next;
5877 path->slots[level] = 0;
5878 if (!path->skip_locking)
5879 path->locks[level] = next_rw_lock;
5880 if (!level)
5881 break;
5882
5883 ret = read_block_for_search(root, path, &next, level,
5884 0, &key);
5885 if (ret == -EAGAIN)
5886 goto again;
5887
5888 if (ret < 0) {
5889 btrfs_release_path(path);
5890 goto done;
5891 }
5892
5893 if (!path->skip_locking) {
5894 ret = btrfs_try_tree_read_lock(next);
5895 if (!ret) {
5896 btrfs_set_path_blocking(path);
5897 btrfs_tree_read_lock(next);
5898 btrfs_clear_path_blocking(path, next,
5899 BTRFS_READ_LOCK);
5900 }
5901 next_rw_lock = BTRFS_READ_LOCK;
5902 }
5903 }
5904 ret = 0;
5905done:
5906 unlock_up(path, 0, 1, 0, NULL);
5907 path->leave_spinning = old_spinning;
5908 if (!old_spinning)
5909 btrfs_set_path_blocking(path);
5910
5911 return ret;
5912}
5913
5914/*
5915 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5916 * searching until it gets past min_objectid or finds an item of 'type'
5917 *
5918 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5919 */
5920int btrfs_previous_item(struct btrfs_root *root,
5921 struct btrfs_path *path, u64 min_objectid,
5922 int type)
5923{
5924 struct btrfs_key found_key;
5925 struct extent_buffer *leaf;
5926 u32 nritems;
5927 int ret;
5928
5929 while (1) {
5930 if (path->slots[0] == 0) {
5931 btrfs_set_path_blocking(path);
5932 ret = btrfs_prev_leaf(root, path);
5933 if (ret != 0)
5934 return ret;
5935 } else {
5936 path->slots[0]--;
5937 }
5938 leaf = path->nodes[0];
5939 nritems = btrfs_header_nritems(leaf);
5940 if (nritems == 0)
5941 return 1;
5942 if (path->slots[0] == nritems)
5943 path->slots[0]--;
5944
5945 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5946 if (found_key.objectid < min_objectid)
5947 break;
5948 if (found_key.type == type)
5949 return 0;
5950 if (found_key.objectid == min_objectid &&
5951 found_key.type < type)
5952 break;
5953 }
5954 return 1;
5955}
5956
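/*
 * Editorial sketch (not part of the original file): walking backwards to
 * the most recent item of a given type for an inode, a common use of
 * btrfs_previous_item().  The search key values are illustrative.
 */
static int __maybe_unused example_find_prev(struct btrfs_root *root,
					    struct btrfs_path *path, u64 ino)
{
	struct btrfs_key key;
	int ret;

	/* start just past everything for this objectid */
	key.objectid = ino;
	key.type = (u8)-1;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	/* the path lands after the last item <= key; step back to an EXTENT_DATA */
	return btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
}
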
5957/*
5958 * search in extent tree to find a previous Metadata/Data extent item with
5959 * min objectid.
5960 *
5961 * returns 0 if something is found, 1 if nothing was found and < 0 on error
5962 */
5963int btrfs_previous_extent_item(struct btrfs_root *root,
5964 struct btrfs_path *path, u64 min_objectid)
5965{
5966 struct btrfs_key found_key;
5967 struct extent_buffer *leaf;
5968 u32 nritems;
5969 int ret;
5970
5971 while (1) {
5972 if (path->slots[0] == 0) {
5973 btrfs_set_path_blocking(path);
5974 ret = btrfs_prev_leaf(root, path);
5975 if (ret != 0)
5976 return ret;
5977 } else {
5978 path->slots[0]--;
5979 }
5980 leaf = path->nodes[0];
5981 nritems = btrfs_header_nritems(leaf);
5982 if (nritems == 0)
5983 return 1;
5984 if (path->slots[0] == nritems)
5985 path->slots[0]--;
5986
5987 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5988 if (found_key.objectid < min_objectid)
5989 break;
5990 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5991 found_key.type == BTRFS_METADATA_ITEM_KEY)
5992 return 0;
5993 if (found_key.objectid == min_objectid &&
5994 found_key.type < BTRFS_EXTENT_ITEM_KEY)
5995 break;
5996 }
5997 return 1;
5998}