// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (C) 2012 Alexander Block. All rights reserved.
4 */
5
6#include <linux/bsearch.h>
7#include <linux/fs.h>
8#include <linux/file.h>
9#include <linux/sort.h>
10#include <linux/mount.h>
11#include <linux/xattr.h>
12#include <linux/posix_acl_xattr.h>
13#include <linux/radix-tree.h>
14#include <linux/vmalloc.h>
15#include <linux/string.h>
16#include <linux/compat.h>
17#include <linux/crc32c.h>
18
19#include "send.h"
20#include "backref.h"
21#include "locking.h"
22#include "disk-io.h"
23#include "btrfs_inode.h"
24#include "transaction.h"
25#include "compression.h"
26
/*
 * Upper bound on the number of references an extent may have for send to try
 * a clone operation rather than a write operation. The limit exists to stay
 * clear of the limitations of the backreference walking code, which takes a
 * lot of time and memory for extents with a large number of references.
 */
#define SEND_MAX_EXTENT_REFS	64
34
/*
 * An fs_path is a growable buffer used to build path names whose final length
 * is not known up front; the backing buffer is reallocated on demand.
 *
 * Two layouts are supported: a normal path, where elements are appended on
 * the right, and a reversed path, where elements are prepended on the left.
 * A reversed path can be converted into a normal one when required.
 */
struct fs_path {
	union {
		struct {
			/* First byte of the path string inside buf. */
			char *start;
			/* Location of the NUL that terminates the path. */
			char *end;

			/* Backing storage: inline_buf or a heap buffer. */
			char *buf;
			/* Size of buf in bytes. */
			unsigned short buf_len:15;
			/* Non-zero when the path is built right to left. */
			unsigned short reversed:1;
			/* Storage embedded in the structure itself. */
			char inline_buf[];
		};
		/*
		 * The average path length does not exceed 200 bytes. Padding
		 * the structure to 256 bytes gives better packing in the slab
		 * and a higher chance of satisfying an allocation later
		 * during send.
		 */
		char pad[256];
	};
};

/* Number of bytes available in fs_path::inline_buf. */
#define FS_PATH_INLINE_SIZE \
	(sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf))
63
64
65/* reused for each extent */
66struct clone_root {
67 struct btrfs_root *root;
68 u64 ino;
69 u64 offset;
70
71 u64 found_refs;
72};
73
/*
 * Name cache sizing. The "clean" size is twice the maximum size.
 * NOTE(review): the code that consumes these limits is outside this view.
 */
#define SEND_CTX_MAX_NAME_CACHE_SIZE	128
#define SEND_CTX_NAME_CACHE_CLEAN_SIZE	(SEND_CTX_MAX_NAME_CACHE_SIZE * 2)
76
77struct send_ctx {
78 struct file *send_filp;
79 loff_t send_off;
80 char *send_buf;
81 u32 send_size;
82 u32 send_max_size;
83 u64 total_send_size;
84 u64 cmd_send_size[BTRFS_SEND_C_MAX + 1];
85 u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */
86
87 struct btrfs_root *send_root;
88 struct btrfs_root *parent_root;
89 struct clone_root *clone_roots;
90 int clone_roots_cnt;
91
92 /* current state of the compare_tree call */
93 struct btrfs_path *left_path;
94 struct btrfs_path *right_path;
95 struct btrfs_key *cmp_key;
96
97 /*
98 * infos of the currently processed inode. In case of deleted inodes,
99 * these are the values from the deleted inode.
100 */
101 u64 cur_ino;
102 u64 cur_inode_gen;
103 int cur_inode_new;
104 int cur_inode_new_gen;
105 int cur_inode_deleted;
106 u64 cur_inode_size;
107 u64 cur_inode_mode;
108 u64 cur_inode_rdev;
109 u64 cur_inode_last_extent;
110 u64 cur_inode_next_write_offset;
111 bool ignore_cur_inode;
112
113 u64 send_progress;
114
115 struct list_head new_refs;
116 struct list_head deleted_refs;
117
118 struct radix_tree_root name_cache;
119 struct list_head name_cache_list;
120 int name_cache_size;
121
122 struct file_ra_state ra;
123
124 char *read_buf;
125
126 /*
127 * We process inodes by their increasing order, so if before an
128 * incremental send we reverse the parent/child relationship of
129 * directories such that a directory with a lower inode number was
130 * the parent of a directory with a higher inode number, and the one
131 * becoming the new parent got renamed too, we can't rename/move the
132 * directory with lower inode number when we finish processing it - we
133 * must process the directory with higher inode number first, then
134 * rename/move it and then rename/move the directory with lower inode
135 * number. Example follows.
136 *
137 * Tree state when the first send was performed:
138 *
139 * .
140 * |-- a (ino 257)
141 * |-- b (ino 258)
142 * |
143 * |
144 * |-- c (ino 259)
145 * | |-- d (ino 260)
146 * |
147 * |-- c2 (ino 261)
148 *
149 * Tree state when the second (incremental) send is performed:
150 *
151 * .
152 * |-- a (ino 257)
153 * |-- b (ino 258)
154 * |-- c2 (ino 261)
155 * |-- d2 (ino 260)
156 * |-- cc (ino 259)
157 *
158 * The sequence of steps that lead to the second state was:
159 *
160 * mv /a/b/c/d /a/b/c2/d2
161 * mv /a/b/c /a/b/c2/d2/cc
162 *
163 * "c" has lower inode number, but we can't move it (2nd mv operation)
164 * before we move "d", which has higher inode number.
165 *
166 * So we just memorize which move/rename operations must be performed
167 * later when their respective parent is processed and moved/renamed.
168 */
169
170 /* Indexed by parent directory inode number. */
171 struct rb_root pending_dir_moves;
172
173 /*
174 * Reverse index, indexed by the inode number of a directory that
175 * is waiting for the move/rename of its immediate parent before its
176 * own move/rename can be performed.
177 */
178 struct rb_root waiting_dir_moves;
179
180 /*
181 * A directory that is going to be rm'ed might have a child directory
182 * which is in the pending directory moves index above. In this case,
183 * the directory can only be removed after the move/rename of its child
184 * is performed. Example:
185 *
186 * Parent snapshot:
187 *
188 * . (ino 256)
189 * |-- a/ (ino 257)
190 * |-- b/ (ino 258)
191 * |-- c/ (ino 259)
192 * | |-- x/ (ino 260)
193 * |
194 * |-- y/ (ino 261)
195 *
196 * Send snapshot:
197 *
198 * . (ino 256)
199 * |-- a/ (ino 257)
200 * |-- b/ (ino 258)
201 * |-- YY/ (ino 261)
202 * |-- x/ (ino 260)
203 *
204 * Sequence of steps that lead to the send snapshot:
205 * rm -f /a/b/c/foo.txt
206 * mv /a/b/y /a/b/YY
207 * mv /a/b/c/x /a/b/YY
208 * rmdir /a/b/c
209 *
210 * When the child is processed, its move/rename is delayed until its
211 * parent is processed (as explained above), but all other operations
212 * like update utimes, chown, chgrp, etc, are performed and the paths
213 * that it uses for those operations must use the orphanized name of
214 * its parent (the directory we're going to rm later), so we need to
215 * memorize that name.
216 *
217 * Indexed by the inode number of the directory to be deleted.
218 */
219 struct rb_root orphan_dirs;
220};
221
222struct pending_dir_move {
223 struct rb_node node;
224 struct list_head list;
225 u64 parent_ino;
226 u64 ino;
227 u64 gen;
228 struct list_head update_refs;
229};
230
231struct waiting_dir_move {
232 struct rb_node node;
233 u64 ino;
234 /*
235 * There might be some directory that could not be removed because it
236 * was waiting for this directory inode to be moved first. Therefore
237 * after this directory is moved, we can try to rmdir the ino rmdir_ino.
238 */
239 u64 rmdir_ino;
240 bool orphanized;
241};
242
243struct orphan_dir_info {
244 struct rb_node node;
245 u64 ino;
246 u64 gen;
247 u64 last_dir_index_offset;
248};
249
250struct name_cache_entry {
251 struct list_head list;
252 /*
253 * radix_tree has only 32bit entries but we need to handle 64bit inums.
254 * We use the lower 32bit of the 64bit inum to store it in the tree. If
255 * more then one inum would fall into the same entry, we use radix_list
256 * to store the additional entries. radix_list is also used to store
257 * entries where two entries have the same inum but different
258 * generations.
259 */
260 struct list_head radix_list;
261 u64 ino;
262 u64 gen;
263 u64 parent_ino;
264 u64 parent_gen;
265 int ret;
266 int need_later_update;
267 int name_len;
268 char name[];
269};
270
271__cold
272static void inconsistent_snapshot_error(struct send_ctx *sctx,
273 enum btrfs_compare_tree_result result,
274 const char *what)
275{
276 const char *result_string;
277
278 switch (result) {
279 case BTRFS_COMPARE_TREE_NEW:
280 result_string = "new";
281 break;
282 case BTRFS_COMPARE_TREE_DELETED:
283 result_string = "deleted";
284 break;
285 case BTRFS_COMPARE_TREE_CHANGED:
286 result_string = "updated";
287 break;
288 case BTRFS_COMPARE_TREE_SAME:
289 ASSERT(0);
290 result_string = "unchanged";
291 break;
292 default:
293 ASSERT(0);
294 result_string = "unexpected";
295 }
296
297 btrfs_err(sctx->send_root->fs_info,
298 "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu",
299 result_string, what, sctx->cmp_key->objectid,
300 sctx->send_root->root_key.objectid,
301 (sctx->parent_root ?
302 sctx->parent_root->root_key.objectid : 0));
303}
304
305static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);
306
307static struct waiting_dir_move *
308get_waiting_dir_move(struct send_ctx *sctx, u64 ino);
309
310static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino);
311
312static int need_send_hole(struct send_ctx *sctx)
313{
314 return (sctx->parent_root && !sctx->cur_inode_new &&
315 !sctx->cur_inode_new_gen && !sctx->cur_inode_deleted &&
316 S_ISREG(sctx->cur_inode_mode));
317}
318
319static void fs_path_reset(struct fs_path *p)
320{
321 if (p->reversed) {
322 p->start = p->buf + p->buf_len - 1;
323 p->end = p->start;
324 *p->start = 0;
325 } else {
326 p->start = p->buf;
327 p->end = p->start;
328 *p->start = 0;
329 }
330}
331
332static struct fs_path *fs_path_alloc(void)
333{
334 struct fs_path *p;
335
336 p = kmalloc(sizeof(*p), GFP_KERNEL);
337 if (!p)
338 return NULL;
339 p->reversed = 0;
340 p->buf = p->inline_buf;
341 p->buf_len = FS_PATH_INLINE_SIZE;
342 fs_path_reset(p);
343 return p;
344}
345
346static struct fs_path *fs_path_alloc_reversed(void)
347{
348 struct fs_path *p;
349
350 p = fs_path_alloc();
351 if (!p)
352 return NULL;
353 p->reversed = 1;
354 fs_path_reset(p);
355 return p;
356}
357
358static void fs_path_free(struct fs_path *p)
359{
360 if (!p)
361 return;
362 if (p->buf != p->inline_buf)
363 kfree(p->buf);
364 kfree(p);
365}
366
367static int fs_path_len(struct fs_path *p)
368{
369 return p->end - p->start;
370}
371
372static int fs_path_ensure_buf(struct fs_path *p, int len)
373{
374 char *tmp_buf;
375 int path_len;
376 int old_buf_len;
377
378 len++;
379
380 if (p->buf_len >= len)
381 return 0;
382
383 if (len > PATH_MAX) {
384 WARN_ON(1);
385 return -ENOMEM;
386 }
387
388 path_len = p->end - p->start;
389 old_buf_len = p->buf_len;
390
391 /*
392 * First time the inline_buf does not suffice
393 */
394 if (p->buf == p->inline_buf) {
395 tmp_buf = kmalloc(len, GFP_KERNEL);
396 if (tmp_buf)
397 memcpy(tmp_buf, p->buf, old_buf_len);
398 } else {
399 tmp_buf = krealloc(p->buf, len, GFP_KERNEL);
400 }
401 if (!tmp_buf)
402 return -ENOMEM;
403 p->buf = tmp_buf;
404 /*
405 * The real size of the buffer is bigger, this will let the fast path
406 * happen most of the time
407 */
408 p->buf_len = ksize(p->buf);
409
410 if (p->reversed) {
411 tmp_buf = p->buf + old_buf_len - path_len - 1;
412 p->end = p->buf + p->buf_len - 1;
413 p->start = p->end - path_len;
414 memmove(p->start, tmp_buf, path_len + 1);
415 } else {
416 p->start = p->buf;
417 p->end = p->start + path_len;
418 }
419 return 0;
420}
421
422static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
423 char **prepared)
424{
425 int ret;
426 int new_len;
427
428 new_len = p->end - p->start + name_len;
429 if (p->start != p->end)
430 new_len++;
431 ret = fs_path_ensure_buf(p, new_len);
432 if (ret < 0)
433 goto out;
434
435 if (p->reversed) {
436 if (p->start != p->end)
437 *--p->start = '/';
438 p->start -= name_len;
439 *prepared = p->start;
440 } else {
441 if (p->start != p->end)
442 *p->end++ = '/';
443 *prepared = p->end;
444 p->end += name_len;
445 *p->end = 0;
446 }
447
448out:
449 return ret;
450}
451
/*
 * Add the element @name of @name_len bytes to @p.
 *
 * Returns 0 on success or a negative errno if no room could be reserved.
 */
static int fs_path_add(struct fs_path *p, const char *name, int name_len)
{
	char *dest;
	int ret = fs_path_prepare_for_add(p, name_len, &dest);

	if (ret < 0)
		return ret;

	memcpy(dest, name, name_len);
	return ret;
}
465
466static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
467{
468 int ret;
469 char *prepared;
470
471 ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
472 if (ret < 0)
473 goto out;
474 memcpy(prepared, p2->start, p2->end - p2->start);
475
476out:
477 return ret;
478}
479
/*
 * Add an element to @p whose @len bytes are read from the extent buffer @eb
 * at offset @off.
 *
 * Returns 0 on success or a negative errno if no room could be reserved.
 */
static int fs_path_add_from_extent_buffer(struct fs_path *p,
					  struct extent_buffer *eb,
					  unsigned long off, int len)
{
	char *dest;
	int ret = fs_path_prepare_for_add(p, len, &dest);

	if (ret < 0)
		return ret;

	read_extent_buffer(eb, dest, off, len);
	return ret;
}
496
497static int fs_path_copy(struct fs_path *p, struct fs_path *from)
498{
499 int ret;
500
501 p->reversed = from->reversed;
502 fs_path_reset(p);
503
504 ret = fs_path_add_path(p, from);
505
506 return ret;
507}
508
509
510static void fs_path_unreverse(struct fs_path *p)
511{
512 char *tmp;
513 int len;
514
515 if (!p->reversed)
516 return;
517
518 tmp = p->start;
519 len = p->end - p->start;
520 p->start = p->buf;
521 p->end = p->start + len;
522 memmove(p->start, tmp, len + 1);
523 p->reversed = 0;
524}
525
526static struct btrfs_path *alloc_path_for_send(void)
527{
528 struct btrfs_path *path;
529
530 path = btrfs_alloc_path();
531 if (!path)
532 return NULL;
533 path->search_commit_root = 1;
534 path->skip_locking = 1;
535 path->need_commit_sem = 1;
536 return path;
537}
538
539static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
540{
541 int ret;
542 u32 pos = 0;
543
544 while (pos < len) {
545 ret = kernel_write(filp, buf + pos, len - pos, off);
546 /* TODO handle that correctly */
547 /*if (ret == -ERESTARTSYS) {
548 continue;
549 }*/
550 if (ret < 0)
551 return ret;
552 if (ret == 0) {
553 return -EIO;
554 }
555 pos += ret;
556 }
557
558 return 0;
559}
560
561static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
562{
563 struct btrfs_tlv_header *hdr;
564 int total_len = sizeof(*hdr) + len;
565 int left = sctx->send_max_size - sctx->send_size;
566
567 if (unlikely(left < total_len))
568 return -EOVERFLOW;
569
570 hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
571 hdr->tlv_type = cpu_to_le16(attr);
572 hdr->tlv_len = cpu_to_le16(len);
573 memcpy(hdr + 1, data, len);
574 sctx->send_size += total_len;
575
576 return 0;
577}
578
579#define TLV_PUT_DEFINE_INT(bits) \
580 static int tlv_put_u##bits(struct send_ctx *sctx, \
581 u##bits attr, u##bits value) \
582 { \
583 __le##bits __tmp = cpu_to_le##bits(value); \
584 return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \
585 }
586
587TLV_PUT_DEFINE_INT(64)
588
589static int tlv_put_string(struct send_ctx *sctx, u16 attr,
590 const char *str, int len)
591{
592 if (len == -1)
593 len = strlen(str);
594 return tlv_put(sctx, attr, str, len);
595}
596
597static int tlv_put_uuid(struct send_ctx *sctx, u16 attr,
598 const u8 *uuid)
599{
600 return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE);
601}
602
603static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
604 struct extent_buffer *eb,
605 struct btrfs_timespec *ts)
606{
607 struct btrfs_timespec bts;
608 read_extent_buffer(eb, &bts, (unsigned long)ts, sizeof(bts));
609 return tlv_put(sctx, attr, &bts, sizeof(bts));
610}
611
612
/*
 * Convenience wrappers around the tlv_put*() helpers.
 *
 * Each macro stores the helper's return value in a variable named 'ret' and
 * jumps to a label named 'tlv_put_failure' when it is negative, so both have
 * to exist in the calling function.
 */
#define TLV_PUT(sctx, attrtype, data, attrlen) \
	do { \
		ret = tlv_put(sctx, attrtype, data, attrlen); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)

#define TLV_PUT_INT(sctx, attrtype, bits, value) \
	do { \
		ret = tlv_put_u##bits(sctx, attrtype, value); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)

#define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data)
#define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data)
#define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data)
#define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data)
#define TLV_PUT_STRING(sctx, attrtype, str, len) \
	do { \
		ret = tlv_put_string(sctx, attrtype, str, len); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
/* 'p' is a struct fs_path pointer; it is parenthesized so any expression works. */
#define TLV_PUT_PATH(sctx, attrtype, p) \
	do { \
		ret = tlv_put_string(sctx, attrtype, (p)->start, \
			(p)->end - (p)->start); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
#define TLV_PUT_UUID(sctx, attrtype, uuid) \
	do { \
		ret = tlv_put_uuid(sctx, attrtype, uuid); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \
	do { \
		ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
656
657static int send_header(struct send_ctx *sctx)
658{
659 struct btrfs_stream_header hdr;
660
661 strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
662 hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION);
663
664 return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
665 &sctx->send_off);
666}
667
668/*
669 * For each command/item we want to send to userspace, we call this function.
670 */
671static int begin_cmd(struct send_ctx *sctx, int cmd)
672{
673 struct btrfs_cmd_header *hdr;
674
675 if (WARN_ON(!sctx->send_buf))
676 return -EINVAL;
677
678 BUG_ON(sctx->send_size);
679
680 sctx->send_size += sizeof(*hdr);
681 hdr = (struct btrfs_cmd_header *)sctx->send_buf;
682 hdr->cmd = cpu_to_le16(cmd);
683
684 return 0;
685}
686
687static int send_cmd(struct send_ctx *sctx)
688{
689 int ret;
690 struct btrfs_cmd_header *hdr;
691 u32 crc;
692
693 hdr = (struct btrfs_cmd_header *)sctx->send_buf;
694 hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr));
695 hdr->crc = 0;
696
697 crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
698 hdr->crc = cpu_to_le32(crc);
699
700 ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
701 &sctx->send_off);
702
703 sctx->total_send_size += sctx->send_size;
704 sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size;
705 sctx->send_size = 0;
706
707 return ret;
708}
709
710/*
711 * Sends a move instruction to user space
712 */
713static int send_rename(struct send_ctx *sctx,
714 struct fs_path *from, struct fs_path *to)
715{
716 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
717 int ret;
718
719 btrfs_debug(fs_info, "send_rename %s -> %s", from->start, to->start);
720
721 ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME);
722 if (ret < 0)
723 goto out;
724
725 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from);
726 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to);
727
728 ret = send_cmd(sctx);
729
730tlv_put_failure:
731out:
732 return ret;
733}
734
735/*
736 * Sends a link instruction to user space
737 */
738static int send_link(struct send_ctx *sctx,
739 struct fs_path *path, struct fs_path *lnk)
740{
741 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
742 int ret;
743
744 btrfs_debug(fs_info, "send_link %s -> %s", path->start, lnk->start);
745
746 ret = begin_cmd(sctx, BTRFS_SEND_C_LINK);
747 if (ret < 0)
748 goto out;
749
750 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
751 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk);
752
753 ret = send_cmd(sctx);
754
755tlv_put_failure:
756out:
757 return ret;
758}
759
760/*
761 * Sends an unlink instruction to user space
762 */
763static int send_unlink(struct send_ctx *sctx, struct fs_path *path)
764{
765 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
766 int ret;
767
768 btrfs_debug(fs_info, "send_unlink %s", path->start);
769
770 ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK);
771 if (ret < 0)
772 goto out;
773
774 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
775
776 ret = send_cmd(sctx);
777
778tlv_put_failure:
779out:
780 return ret;
781}
782
783/*
784 * Sends a rmdir instruction to user space
785 */
786static int send_rmdir(struct send_ctx *sctx, struct fs_path *path)
787{
788 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
789 int ret;
790
791 btrfs_debug(fs_info, "send_rmdir %s", path->start);
792
793 ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR);
794 if (ret < 0)
795 goto out;
796
797 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
798
799 ret = send_cmd(sctx);
800
801tlv_put_failure:
802out:
803 return ret;
804}
805
806/*
807 * Helper function to retrieve some fields from an inode item.
808 */
809static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path,
810 u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid,
811 u64 *gid, u64 *rdev)
812{
813 int ret;
814 struct btrfs_inode_item *ii;
815 struct btrfs_key key;
816
817 key.objectid = ino;
818 key.type = BTRFS_INODE_ITEM_KEY;
819 key.offset = 0;
820 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
821 if (ret) {
822 if (ret > 0)
823 ret = -ENOENT;
824 return ret;
825 }
826
827 ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
828 struct btrfs_inode_item);
829 if (size)
830 *size = btrfs_inode_size(path->nodes[0], ii);
831 if (gen)
832 *gen = btrfs_inode_generation(path->nodes[0], ii);
833 if (mode)
834 *mode = btrfs_inode_mode(path->nodes[0], ii);
835 if (uid)
836 *uid = btrfs_inode_uid(path->nodes[0], ii);
837 if (gid)
838 *gid = btrfs_inode_gid(path->nodes[0], ii);
839 if (rdev)
840 *rdev = btrfs_inode_rdev(path->nodes[0], ii);
841
842 return ret;
843}
844
845static int get_inode_info(struct btrfs_root *root,
846 u64 ino, u64 *size, u64 *gen,
847 u64 *mode, u64 *uid, u64 *gid,
848 u64 *rdev)
849{
850 struct btrfs_path *path;
851 int ret;
852
853 path = alloc_path_for_send();
854 if (!path)
855 return -ENOMEM;
856 ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid,
857 rdev);
858 btrfs_free_path(path);
859 return ret;
860}
861
862typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
863 struct fs_path *p,
864 void *ctx);
865
866/*
867 * Helper function to iterate the entries in ONE btrfs_inode_ref or
868 * btrfs_inode_extref.
869 * The iterate callback may return a non zero value to stop iteration. This can
870 * be a negative value for error codes or 1 to simply stop it.
871 *
872 * path must point to the INODE_REF or INODE_EXTREF when called.
873 */
874static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
875 struct btrfs_key *found_key, int resolve,
876 iterate_inode_ref_t iterate, void *ctx)
877{
878 struct extent_buffer *eb = path->nodes[0];
879 struct btrfs_item *item;
880 struct btrfs_inode_ref *iref;
881 struct btrfs_inode_extref *extref;
882 struct btrfs_path *tmp_path;
883 struct fs_path *p;
884 u32 cur = 0;
885 u32 total;
886 int slot = path->slots[0];
887 u32 name_len;
888 char *start;
889 int ret = 0;
890 int num = 0;
891 int index;
892 u64 dir;
893 unsigned long name_off;
894 unsigned long elem_size;
895 unsigned long ptr;
896
897 p = fs_path_alloc_reversed();
898 if (!p)
899 return -ENOMEM;
900
901 tmp_path = alloc_path_for_send();
902 if (!tmp_path) {
903 fs_path_free(p);
904 return -ENOMEM;
905 }
906
907
908 if (found_key->type == BTRFS_INODE_REF_KEY) {
909 ptr = (unsigned long)btrfs_item_ptr(eb, slot,
910 struct btrfs_inode_ref);
911 item = btrfs_item_nr(slot);
912 total = btrfs_item_size(eb, item);
913 elem_size = sizeof(*iref);
914 } else {
915 ptr = btrfs_item_ptr_offset(eb, slot);
916 total = btrfs_item_size_nr(eb, slot);
917 elem_size = sizeof(*extref);
918 }
919
920 while (cur < total) {
921 fs_path_reset(p);
922
923 if (found_key->type == BTRFS_INODE_REF_KEY) {
924 iref = (struct btrfs_inode_ref *)(ptr + cur);
925 name_len = btrfs_inode_ref_name_len(eb, iref);
926 name_off = (unsigned long)(iref + 1);
927 index = btrfs_inode_ref_index(eb, iref);
928 dir = found_key->offset;
929 } else {
930 extref = (struct btrfs_inode_extref *)(ptr + cur);
931 name_len = btrfs_inode_extref_name_len(eb, extref);
932 name_off = (unsigned long)&extref->name;
933 index = btrfs_inode_extref_index(eb, extref);
934 dir = btrfs_inode_extref_parent(eb, extref);
935 }
936
937 if (resolve) {
938 start = btrfs_ref_to_path(root, tmp_path, name_len,
939 name_off, eb, dir,
940 p->buf, p->buf_len);
941 if (IS_ERR(start)) {
942 ret = PTR_ERR(start);
943 goto out;
944 }
945 if (start < p->buf) {
946 /* overflow , try again with larger buffer */
947 ret = fs_path_ensure_buf(p,
948 p->buf_len + p->buf - start);
949 if (ret < 0)
950 goto out;
951 start = btrfs_ref_to_path(root, tmp_path,
952 name_len, name_off,
953 eb, dir,
954 p->buf, p->buf_len);
955 if (IS_ERR(start)) {
956 ret = PTR_ERR(start);
957 goto out;
958 }
959 BUG_ON(start < p->buf);
960 }
961 p->start = start;
962 } else {
963 ret = fs_path_add_from_extent_buffer(p, eb, name_off,
964 name_len);
965 if (ret < 0)
966 goto out;
967 }
968
969 cur += elem_size + name_len;
970 ret = iterate(num, dir, index, p, ctx);
971 if (ret)
972 goto out;
973 num++;
974 }
975
976out:
977 btrfs_free_path(tmp_path);
978 fs_path_free(p);
979 return ret;
980}
981
982typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
983 const char *name, int name_len,
984 const char *data, int data_len,
985 u8 type, void *ctx);
986
987/*
988 * Helper function to iterate the entries in ONE btrfs_dir_item.
989 * The iterate callback may return a non zero value to stop iteration. This can
990 * be a negative value for error codes or 1 to simply stop it.
991 *
992 * path must point to the dir item when called.
993 */
994static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
995 iterate_dir_item_t iterate, void *ctx)
996{
997 int ret = 0;
998 struct extent_buffer *eb;
999 struct btrfs_item *item;
1000 struct btrfs_dir_item *di;
1001 struct btrfs_key di_key;
1002 char *buf = NULL;
1003 int buf_len;
1004 u32 name_len;
1005 u32 data_len;
1006 u32 cur;
1007 u32 len;
1008 u32 total;
1009 int slot;
1010 int num;
1011 u8 type;
1012
1013 /*
1014 * Start with a small buffer (1 page). If later we end up needing more
1015 * space, which can happen for xattrs on a fs with a leaf size greater
1016 * then the page size, attempt to increase the buffer. Typically xattr
1017 * values are small.
1018 */
1019 buf_len = PATH_MAX;
1020 buf = kmalloc(buf_len, GFP_KERNEL);
1021 if (!buf) {
1022 ret = -ENOMEM;
1023 goto out;
1024 }
1025
1026 eb = path->nodes[0];
1027 slot = path->slots[0];
1028 item = btrfs_item_nr(slot);
1029 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
1030 cur = 0;
1031 len = 0;
1032 total = btrfs_item_size(eb, item);
1033
1034 num = 0;
1035 while (cur < total) {
1036 name_len = btrfs_dir_name_len(eb, di);
1037 data_len = btrfs_dir_data_len(eb, di);
1038 type = btrfs_dir_type(eb, di);
1039 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
1040
1041 if (type == BTRFS_FT_XATTR) {
1042 if (name_len > XATTR_NAME_MAX) {
1043 ret = -ENAMETOOLONG;
1044 goto out;
1045 }
1046 if (name_len + data_len >
1047 BTRFS_MAX_XATTR_SIZE(root->fs_info)) {
1048 ret = -E2BIG;
1049 goto out;
1050 }
1051 } else {
1052 /*
1053 * Path too long
1054 */
1055 if (name_len + data_len > PATH_MAX) {
1056 ret = -ENAMETOOLONG;
1057 goto out;
1058 }
1059 }
1060
1061 if (name_len + data_len > buf_len) {
1062 buf_len = name_len + data_len;
1063 if (is_vmalloc_addr(buf)) {
1064 vfree(buf);
1065 buf = NULL;
1066 } else {
1067 char *tmp = krealloc(buf, buf_len,
1068 GFP_KERNEL | __GFP_NOWARN);
1069
1070 if (!tmp)
1071 kfree(buf);
1072 buf = tmp;
1073 }
1074 if (!buf) {
1075 buf = kvmalloc(buf_len, GFP_KERNEL);
1076 if (!buf) {
1077 ret = -ENOMEM;
1078 goto out;
1079 }
1080 }
1081 }
1082
1083 read_extent_buffer(eb, buf, (unsigned long)(di + 1),
1084 name_len + data_len);
1085
1086 len = sizeof(*di) + name_len + data_len;
1087 di = (struct btrfs_dir_item *)((char *)di + len);
1088 cur += len;
1089
1090 ret = iterate(num, &di_key, buf, name_len, buf + name_len,
1091 data_len, type, ctx);
1092 if (ret < 0)
1093 goto out;
1094 if (ret) {
1095 ret = 0;
1096 goto out;
1097 }
1098
1099 num++;
1100 }
1101
1102out:
1103 kvfree(buf);
1104 return ret;
1105}
1106
1107static int __copy_first_ref(int num, u64 dir, int index,
1108 struct fs_path *p, void *ctx)
1109{
1110 int ret;
1111 struct fs_path *pt = ctx;
1112
1113 ret = fs_path_copy(pt, p);
1114 if (ret < 0)
1115 return ret;
1116
1117 /* we want the first only */
1118 return 1;
1119}
1120
1121/*
1122 * Retrieve the first path of an inode. If an inode has more then one
1123 * ref/hardlink, this is ignored.
1124 */
1125static int get_inode_path(struct btrfs_root *root,
1126 u64 ino, struct fs_path *path)
1127{
1128 int ret;
1129 struct btrfs_key key, found_key;
1130 struct btrfs_path *p;
1131
1132 p = alloc_path_for_send();
1133 if (!p)
1134 return -ENOMEM;
1135
1136 fs_path_reset(path);
1137
1138 key.objectid = ino;
1139 key.type = BTRFS_INODE_REF_KEY;
1140 key.offset = 0;
1141
1142 ret = btrfs_search_slot_for_read(root, &key, p, 1, 0);
1143 if (ret < 0)
1144 goto out;
1145 if (ret) {
1146 ret = 1;
1147 goto out;
1148 }
1149 btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]);
1150 if (found_key.objectid != ino ||
1151 (found_key.type != BTRFS_INODE_REF_KEY &&
1152 found_key.type != BTRFS_INODE_EXTREF_KEY)) {
1153 ret = -ENOENT;
1154 goto out;
1155 }
1156
1157 ret = iterate_inode_ref(root, p, &found_key, 1,
1158 __copy_first_ref, path);
1159 if (ret < 0)
1160 goto out;
1161 ret = 0;
1162
1163out:
1164 btrfs_free_path(p);
1165 return ret;
1166}
1167
1168struct backref_ctx {
1169 struct send_ctx *sctx;
1170
1171 struct btrfs_path *path;
1172 /* number of total found references */
1173 u64 found;
1174
1175 /*
1176 * used for clones found in send_root. clones found behind cur_objectid
1177 * and cur_offset are not considered as allowed clones.
1178 */
1179 u64 cur_objectid;
1180 u64 cur_offset;
1181
1182 /* may be truncated in case it's the last extent in a file */
1183 u64 extent_len;
1184
1185 /* data offset in the file extent item */
1186 u64 data_offset;
1187
1188 /* Just to check for bugs in backref resolving */
1189 int found_itself;
1190};
1191
1192static int __clone_root_cmp_bsearch(const void *key, const void *elt)
1193{
1194 u64 root = (u64)(uintptr_t)key;
1195 struct clone_root *cr = (struct clone_root *)elt;
1196
1197 if (root < cr->root->objectid)
1198 return -1;
1199 if (root > cr->root->objectid)
1200 return 1;
1201 return 0;
1202}
1203
1204static int __clone_root_cmp_sort(const void *e1, const void *e2)
1205{
1206 struct clone_root *cr1 = (struct clone_root *)e1;
1207 struct clone_root *cr2 = (struct clone_root *)e2;
1208
1209 if (cr1->root->objectid < cr2->root->objectid)
1210 return -1;
1211 if (cr1->root->objectid > cr2->root->objectid)
1212 return 1;
1213 return 0;
1214}
1215
1216/*
1217 * Called for every backref that is found for the current extent.
1218 * Results are collected in sctx->clone_roots->ino/offset/found_refs
1219 */
1220static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
1221{
1222 struct backref_ctx *bctx = ctx_;
1223 struct clone_root *found;
1224 int ret;
1225 u64 i_size;
1226
1227 /* First check if the root is in the list of accepted clone sources */
1228 found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
1229 bctx->sctx->clone_roots_cnt,
1230 sizeof(struct clone_root),
1231 __clone_root_cmp_bsearch);
1232 if (!found)
1233 return 0;
1234
1235 if (found->root == bctx->sctx->send_root &&
1236 ino == bctx->cur_objectid &&
1237 offset == bctx->cur_offset) {
1238 bctx->found_itself = 1;
1239 }
1240
1241 /*
1242 * There are inodes that have extents that lie behind its i_size. Don't
1243 * accept clones from these extents.
1244 */
1245 ret = __get_inode_info(found->root, bctx->path, ino, &i_size, NULL, NULL,
1246 NULL, NULL, NULL);
1247 btrfs_release_path(bctx->path);
1248 if (ret < 0)
1249 return ret;
1250
1251 if (offset + bctx->data_offset + bctx->extent_len > i_size)
1252 return 0;
1253
1254 /*
1255 * Make sure we don't consider clones from send_root that are
1256 * behind the current inode/offset.
1257 */
1258 if (found->root == bctx->sctx->send_root) {
1259 /*
1260 * TODO for the moment we don't accept clones from the inode
1261 * that is currently send. We may change this when
1262 * BTRFS_IOC_CLONE_RANGE supports cloning from and to the same
1263 * file.
1264 */
1265 if (ino >= bctx->cur_objectid)
1266 return 0;
1267 }
1268
1269 bctx->found++;
1270 found->found_refs++;
1271 if (ino < found->ino) {
1272 found->ino = ino;
1273 found->offset = offset;
1274 } else if (found->ino == ino) {
1275 /*
1276 * same extent found more then once in the same file.
1277 */
1278 if (found->offset > offset + bctx->extent_len)
1279 found->offset = offset;
1280 }
1281
1282 return 0;
1283}
1284
1285/*
1286 * Given an inode, offset and extent item, it finds a good clone for a clone
1287 * instruction. Returns -ENOENT when none could be found. The function makes
1288 * sure that the returned clone is usable at the point where sending is at the
1289 * moment. This means, that no clones are accepted which lie behind the current
1290 * inode+offset.
1291 *
1292 * path must point to the extent item when called.
1293 */
1294static int find_extent_clone(struct send_ctx *sctx,
1295 struct btrfs_path *path,
1296 u64 ino, u64 data_offset,
1297 u64 ino_size,
1298 struct clone_root **found)
1299{
1300 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
1301 int ret;
1302 int extent_type;
1303 u64 logical;
1304 u64 disk_byte;
1305 u64 num_bytes;
1306 u64 extent_item_pos;
1307 u64 flags = 0;
1308 struct btrfs_file_extent_item *fi;
1309 struct extent_buffer *eb = path->nodes[0];
1310 struct backref_ctx *backref_ctx = NULL;
1311 struct clone_root *cur_clone_root;
1312 struct btrfs_key found_key;
1313 struct btrfs_path *tmp_path;
1314 struct btrfs_extent_item *ei;
1315 int compressed;
1316 u32 i;
1317
1318 tmp_path = alloc_path_for_send();
1319 if (!tmp_path)
1320 return -ENOMEM;
1321
1322 /* We only use this path under the commit sem */
1323 tmp_path->need_commit_sem = 0;
1324
1325 backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_KERNEL);
1326 if (!backref_ctx) {
1327 ret = -ENOMEM;
1328 goto out;
1329 }
1330
1331 backref_ctx->path = tmp_path;
1332
1333 if (data_offset >= ino_size) {
1334 /*
1335 * There may be extents that lie behind the file's size.
1336 * I at least had this in combination with snapshotting while
1337 * writing large files.
1338 */
1339 ret = 0;
1340 goto out;
1341 }
1342
1343 fi = btrfs_item_ptr(eb, path->slots[0],
1344 struct btrfs_file_extent_item);
1345 extent_type = btrfs_file_extent_type(eb, fi);
1346 if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1347 ret = -ENOENT;
1348 goto out;
1349 }
1350 compressed = btrfs_file_extent_compression(eb, fi);
1351
1352 num_bytes = btrfs_file_extent_num_bytes(eb, fi);
1353 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
1354 if (disk_byte == 0) {
1355 ret = -ENOENT;
1356 goto out;
1357 }
1358 logical = disk_byte + btrfs_file_extent_offset(eb, fi);
1359
1360 down_read(&fs_info->commit_root_sem);
1361 ret = extent_from_logical(fs_info, disk_byte, tmp_path,
1362 &found_key, &flags);
1363 up_read(&fs_info->commit_root_sem);
1364
1365 if (ret < 0)
1366 goto out;
1367 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1368 ret = -EIO;
1369 goto out;
1370 }
1371
1372 ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0],
1373 struct btrfs_extent_item);
1374 /*
1375 * Backreference walking (iterate_extent_inodes() below) is currently
1376 * too expensive when an extent has a large number of references, both
1377 * in time spent and used memory. So for now just fallback to write
1378 * operations instead of clone operations when an extent has more than
1379 * a certain amount of references.
1380 */
1381 if (btrfs_extent_refs(tmp_path->nodes[0], ei) > SEND_MAX_EXTENT_REFS) {
1382 ret = -ENOENT;
1383 goto out;
1384 }
1385 btrfs_release_path(tmp_path);
1386
1387 /*
1388 * Setup the clone roots.
1389 */
1390 for (i = 0; i < sctx->clone_roots_cnt; i++) {
1391 cur_clone_root = sctx->clone_roots + i;
1392 cur_clone_root->ino = (u64)-1;
1393 cur_clone_root->offset = 0;
1394 cur_clone_root->found_refs = 0;
1395 }
1396
1397 backref_ctx->sctx = sctx;
1398 backref_ctx->found = 0;
1399 backref_ctx->cur_objectid = ino;
1400 backref_ctx->cur_offset = data_offset;
1401 backref_ctx->found_itself = 0;
1402 backref_ctx->extent_len = num_bytes;
1403 /*
1404 * For non-compressed extents iterate_extent_inodes() gives us extent
1405 * offsets that already take into account the data offset, but not for
1406 * compressed extents, since the offset is logical and not relative to
1407 * the physical extent locations. We must take this into account to
1408 * avoid sending clone offsets that go beyond the source file's size,
1409 * which would result in the clone ioctl failing with -EINVAL on the
1410 * receiving end.
1411 */
1412 if (compressed == BTRFS_COMPRESS_NONE)
1413 backref_ctx->data_offset = 0;
1414 else
1415 backref_ctx->data_offset = btrfs_file_extent_offset(eb, fi);
1416
1417 /*
1418 * The last extent of a file may be too large due to page alignment.
1419 * We need to adjust extent_len in this case so that the checks in
1420 * __iterate_backrefs work.
1421 */
1422 if (data_offset + num_bytes >= ino_size)
1423 backref_ctx->extent_len = ino_size - data_offset;
1424
1425 /*
1426 * Now collect all backrefs.
1427 */
1428 if (compressed == BTRFS_COMPRESS_NONE)
1429 extent_item_pos = logical - found_key.objectid;
1430 else
1431 extent_item_pos = 0;
1432 ret = iterate_extent_inodes(fs_info, found_key.objectid,
1433 extent_item_pos, 1, __iterate_backrefs,
1434 backref_ctx, false);
1435
1436 if (ret < 0)
1437 goto out;
1438
1439 if (!backref_ctx->found_itself) {
1440 /* found a bug in backref code? */
1441 ret = -EIO;
1442 btrfs_err(fs_info,
1443 "did not find backref in send_root. inode=%llu, offset=%llu, disk_byte=%llu found extent=%llu",
1444 ino, data_offset, disk_byte, found_key.objectid);
1445 goto out;
1446 }
1447
1448 btrfs_debug(fs_info,
1449 "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu",
1450 data_offset, ino, num_bytes, logical);
1451
1452 if (!backref_ctx->found)
1453 btrfs_debug(fs_info, "no clones found");
1454
1455 cur_clone_root = NULL;
1456 for (i = 0; i < sctx->clone_roots_cnt; i++) {
1457 if (sctx->clone_roots[i].found_refs) {
1458 if (!cur_clone_root)
1459 cur_clone_root = sctx->clone_roots + i;
1460 else if (sctx->clone_roots[i].root == sctx->send_root)
1461 /* prefer clones from send_root over others */
1462 cur_clone_root = sctx->clone_roots + i;
1463 }
1464
1465 }
1466
1467 if (cur_clone_root) {
1468 *found = cur_clone_root;
1469 ret = 0;
1470 } else {
1471 ret = -ENOENT;
1472 }
1473
1474out:
1475 btrfs_free_path(tmp_path);
1476 kfree(backref_ctx);
1477 return ret;
1478}
1479
1480static int read_symlink(struct btrfs_root *root,
1481 u64 ino,
1482 struct fs_path *dest)
1483{
1484 int ret;
1485 struct btrfs_path *path;
1486 struct btrfs_key key;
1487 struct btrfs_file_extent_item *ei;
1488 u8 type;
1489 u8 compression;
1490 unsigned long off;
1491 int len;
1492
1493 path = alloc_path_for_send();
1494 if (!path)
1495 return -ENOMEM;
1496
1497 key.objectid = ino;
1498 key.type = BTRFS_EXTENT_DATA_KEY;
1499 key.offset = 0;
1500 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1501 if (ret < 0)
1502 goto out;
1503 if (ret) {
1504 /*
1505 * An empty symlink inode. Can happen in rare error paths when
1506 * creating a symlink (transaction committed before the inode
1507 * eviction handler removed the symlink inode items and a crash
1508 * happened in between or the subvol was snapshoted in between).
1509 * Print an informative message to dmesg/syslog so that the user
1510 * can delete the symlink.
1511 */
1512 btrfs_err(root->fs_info,
1513 "Found empty symlink inode %llu at root %llu",
1514 ino, root->root_key.objectid);
1515 ret = -EIO;
1516 goto out;
1517 }
1518
1519 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
1520 struct btrfs_file_extent_item);
1521 type = btrfs_file_extent_type(path->nodes[0], ei);
1522 compression = btrfs_file_extent_compression(path->nodes[0], ei);
1523 BUG_ON(type != BTRFS_FILE_EXTENT_INLINE);
1524 BUG_ON(compression);
1525
1526 off = btrfs_file_extent_inline_start(ei);
1527 len = btrfs_file_extent_ram_bytes(path->nodes[0], ei);
1528
1529 ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);
1530
1531out:
1532 btrfs_free_path(path);
1533 return ret;
1534}
1535
1536/*
1537 * Helper function to generate a file name that is unique in the root of
1538 * send_root and parent_root. This is used to generate names for orphan inodes.
1539 */
1540static int gen_unique_name(struct send_ctx *sctx,
1541 u64 ino, u64 gen,
1542 struct fs_path *dest)
1543{
1544 int ret = 0;
1545 struct btrfs_path *path;
1546 struct btrfs_dir_item *di;
1547 char tmp[64];
1548 int len;
1549 u64 idx = 0;
1550
1551 path = alloc_path_for_send();
1552 if (!path)
1553 return -ENOMEM;
1554
1555 while (1) {
1556 len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
1557 ino, gen, idx);
1558 ASSERT(len < sizeof(tmp));
1559
1560 di = btrfs_lookup_dir_item(NULL, sctx->send_root,
1561 path, BTRFS_FIRST_FREE_OBJECTID,
1562 tmp, strlen(tmp), 0);
1563 btrfs_release_path(path);
1564 if (IS_ERR(di)) {
1565 ret = PTR_ERR(di);
1566 goto out;
1567 }
1568 if (di) {
1569 /* not unique, try again */
1570 idx++;
1571 continue;
1572 }
1573
1574 if (!sctx->parent_root) {
1575 /* unique */
1576 ret = 0;
1577 break;
1578 }
1579
1580 di = btrfs_lookup_dir_item(NULL, sctx->parent_root,
1581 path, BTRFS_FIRST_FREE_OBJECTID,
1582 tmp, strlen(tmp), 0);
1583 btrfs_release_path(path);
1584 if (IS_ERR(di)) {
1585 ret = PTR_ERR(di);
1586 goto out;
1587 }
1588 if (di) {
1589 /* not unique, try again */
1590 idx++;
1591 continue;
1592 }
1593 /* unique */
1594 break;
1595 }
1596
1597 ret = fs_path_add(dest, tmp, strlen(tmp));
1598
1599out:
1600 btrfs_free_path(path);
1601 return ret;
1602}
1603
/*
 * State of an inode (ino + generation) relative to the send progress, as
 * computed by get_cur_inode_state().
 */
enum inode_state {
	/* Same generation in both send_root and parent_root. */
	inode_state_no_change = 0,
	/* Only matches in send_root and ino >= send_progress. */
	inode_state_will_create,
	/* Only matches in send_root and ino < send_progress. */
	inode_state_did_create,
	/* Only matches in parent_root and ino >= send_progress. */
	inode_state_will_delete,
	/* Only matches in parent_root and ino < send_progress. */
	inode_state_did_delete,
};
1611
1612static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen)
1613{
1614 int ret;
1615 int left_ret;
1616 int right_ret;
1617 u64 left_gen;
1618 u64 right_gen;
1619
1620 ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL,
1621 NULL, NULL);
1622 if (ret < 0 && ret != -ENOENT)
1623 goto out;
1624 left_ret = ret;
1625
1626 if (!sctx->parent_root) {
1627 right_ret = -ENOENT;
1628 } else {
1629 ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen,
1630 NULL, NULL, NULL, NULL);
1631 if (ret < 0 && ret != -ENOENT)
1632 goto out;
1633 right_ret = ret;
1634 }
1635
1636 if (!left_ret && !right_ret) {
1637 if (left_gen == gen && right_gen == gen) {
1638 ret = inode_state_no_change;
1639 } else if (left_gen == gen) {
1640 if (ino < sctx->send_progress)
1641 ret = inode_state_did_create;
1642 else
1643 ret = inode_state_will_create;
1644 } else if (right_gen == gen) {
1645 if (ino < sctx->send_progress)
1646 ret = inode_state_did_delete;
1647 else
1648 ret = inode_state_will_delete;
1649 } else {
1650 ret = -ENOENT;
1651 }
1652 } else if (!left_ret) {
1653 if (left_gen == gen) {
1654 if (ino < sctx->send_progress)
1655 ret = inode_state_did_create;
1656 else
1657 ret = inode_state_will_create;
1658 } else {
1659 ret = -ENOENT;
1660 }
1661 } else if (!right_ret) {
1662 if (right_gen == gen) {
1663 if (ino < sctx->send_progress)
1664 ret = inode_state_did_delete;
1665 else
1666 ret = inode_state_will_delete;
1667 } else {
1668 ret = -ENOENT;
1669 }
1670 } else {
1671 ret = -ENOENT;
1672 }
1673
1674out:
1675 return ret;
1676}
1677
1678static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen)
1679{
1680 int ret;
1681
1682 if (ino == BTRFS_FIRST_FREE_OBJECTID)
1683 return 1;
1684
1685 ret = get_cur_inode_state(sctx, ino, gen);
1686 if (ret < 0)
1687 goto out;
1688
1689 if (ret == inode_state_no_change ||
1690 ret == inode_state_did_create ||
1691 ret == inode_state_will_delete)
1692 ret = 1;
1693 else
1694 ret = 0;
1695
1696out:
1697 return ret;
1698}
1699
1700/*
1701 * Helper function to lookup a dir item in a dir.
1702 */
1703static int lookup_dir_item_inode(struct btrfs_root *root,
1704 u64 dir, const char *name, int name_len,
1705 u64 *found_inode,
1706 u8 *found_type)
1707{
1708 int ret = 0;
1709 struct btrfs_dir_item *di;
1710 struct btrfs_key key;
1711 struct btrfs_path *path;
1712
1713 path = alloc_path_for_send();
1714 if (!path)
1715 return -ENOMEM;
1716
1717 di = btrfs_lookup_dir_item(NULL, root, path,
1718 dir, name, name_len, 0);
1719 if (!di) {
1720 ret = -ENOENT;
1721 goto out;
1722 }
1723 if (IS_ERR(di)) {
1724 ret = PTR_ERR(di);
1725 goto out;
1726 }
1727 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
1728 if (key.type == BTRFS_ROOT_ITEM_KEY) {
1729 ret = -ENOENT;
1730 goto out;
1731 }
1732 *found_inode = key.objectid;
1733 *found_type = btrfs_dir_type(path->nodes[0], di);
1734
1735out:
1736 btrfs_free_path(path);
1737 return ret;
1738}
1739
1740/*
1741 * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
1742 * generation of the parent dir and the name of the dir entry.
1743 */
1744static int get_first_ref(struct btrfs_root *root, u64 ino,
1745 u64 *dir, u64 *dir_gen, struct fs_path *name)
1746{
1747 int ret;
1748 struct btrfs_key key;
1749 struct btrfs_key found_key;
1750 struct btrfs_path *path;
1751 int len;
1752 u64 parent_dir;
1753
1754 path = alloc_path_for_send();
1755 if (!path)
1756 return -ENOMEM;
1757
1758 key.objectid = ino;
1759 key.type = BTRFS_INODE_REF_KEY;
1760 key.offset = 0;
1761
1762 ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
1763 if (ret < 0)
1764 goto out;
1765 if (!ret)
1766 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1767 path->slots[0]);
1768 if (ret || found_key.objectid != ino ||
1769 (found_key.type != BTRFS_INODE_REF_KEY &&
1770 found_key.type != BTRFS_INODE_EXTREF_KEY)) {
1771 ret = -ENOENT;
1772 goto out;
1773 }
1774
1775 if (found_key.type == BTRFS_INODE_REF_KEY) {
1776 struct btrfs_inode_ref *iref;
1777 iref = btrfs_item_ptr(path->nodes[0], path->slots[0],
1778 struct btrfs_inode_ref);
1779 len = btrfs_inode_ref_name_len(path->nodes[0], iref);
1780 ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
1781 (unsigned long)(iref + 1),
1782 len);
1783 parent_dir = found_key.offset;
1784 } else {
1785 struct btrfs_inode_extref *extref;
1786 extref = btrfs_item_ptr(path->nodes[0], path->slots[0],
1787 struct btrfs_inode_extref);
1788 len = btrfs_inode_extref_name_len(path->nodes[0], extref);
1789 ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
1790 (unsigned long)&extref->name, len);
1791 parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref);
1792 }
1793 if (ret < 0)
1794 goto out;
1795 btrfs_release_path(path);
1796
1797 if (dir_gen) {
1798 ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL,
1799 NULL, NULL, NULL);
1800 if (ret < 0)
1801 goto out;
1802 }
1803
1804 *dir = parent_dir;
1805
1806out:
1807 btrfs_free_path(path);
1808 return ret;
1809}
1810
1811static int is_first_ref(struct btrfs_root *root,
1812 u64 ino, u64 dir,
1813 const char *name, int name_len)
1814{
1815 int ret;
1816 struct fs_path *tmp_name;
1817 u64 tmp_dir;
1818
1819 tmp_name = fs_path_alloc();
1820 if (!tmp_name)
1821 return -ENOMEM;
1822
1823 ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name);
1824 if (ret < 0)
1825 goto out;
1826
1827 if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) {
1828 ret = 0;
1829 goto out;
1830 }
1831
1832 ret = !memcmp(tmp_name->start, name, name_len);
1833
1834out:
1835 fs_path_free(tmp_name);
1836 return ret;
1837}
1838
1839/*
1840 * Used by process_recorded_refs to determine if a new ref would overwrite an
1841 * already existing ref. In case it detects an overwrite, it returns the
1842 * inode/gen in who_ino/who_gen.
1843 * When an overwrite is detected, process_recorded_refs does proper orphanizing
1844 * to make sure later references to the overwritten inode are possible.
1845 * Orphanizing is however only required for the first ref of an inode.
1846 * process_recorded_refs does an additional is_first_ref check to see if
1847 * orphanizing is really required.
1848 */
1849static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
1850 const char *name, int name_len,
1851 u64 *who_ino, u64 *who_gen, u64 *who_mode)
1852{
1853 int ret = 0;
1854 u64 gen;
1855 u64 other_inode = 0;
1856 u8 other_type = 0;
1857
1858 if (!sctx->parent_root)
1859 goto out;
1860
1861 ret = is_inode_existent(sctx, dir, dir_gen);
1862 if (ret <= 0)
1863 goto out;
1864
1865 /*
1866 * If we have a parent root we need to verify that the parent dir was
1867 * not deleted and then re-created, if it was then we have no overwrite
1868 * and we can just unlink this entry.
1869 */
1870 if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) {
1871 ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL,
1872 NULL, NULL, NULL);
1873 if (ret < 0 && ret != -ENOENT)
1874 goto out;
1875 if (ret) {
1876 ret = 0;
1877 goto out;
1878 }
1879 if (gen != dir_gen)
1880 goto out;
1881 }
1882
1883 ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len,
1884 &other_inode, &other_type);
1885 if (ret < 0 && ret != -ENOENT)
1886 goto out;
1887 if (ret) {
1888 ret = 0;
1889 goto out;
1890 }
1891
1892 /*
1893 * Check if the overwritten ref was already processed. If yes, the ref
1894 * was already unlinked/moved, so we can safely assume that we will not
1895 * overwrite anything at this point in time.
1896 */
1897 if (other_inode > sctx->send_progress ||
1898 is_waiting_for_move(sctx, other_inode)) {
1899 ret = get_inode_info(sctx->parent_root, other_inode, NULL,
1900 who_gen, who_mode, NULL, NULL, NULL);
1901 if (ret < 0)
1902 goto out;
1903
1904 ret = 1;
1905 *who_ino = other_inode;
1906 } else {
1907 ret = 0;
1908 }
1909
1910out:
1911 return ret;
1912}
1913
1914/*
1915 * Checks if the ref was overwritten by an already processed inode. This is
1916 * used by __get_cur_name_and_parent to find out if the ref was orphanized and
1917 * thus the orphan name needs be used.
1918 * process_recorded_refs also uses it to avoid unlinking of refs that were
1919 * overwritten.
1920 */
1921static int did_overwrite_ref(struct send_ctx *sctx,
1922 u64 dir, u64 dir_gen,
1923 u64 ino, u64 ino_gen,
1924 const char *name, int name_len)
1925{
1926 int ret = 0;
1927 u64 gen;
1928 u64 ow_inode;
1929 u8 other_type;
1930
1931 if (!sctx->parent_root)
1932 goto out;
1933
1934 ret = is_inode_existent(sctx, dir, dir_gen);
1935 if (ret <= 0)
1936 goto out;
1937
1938 if (dir != BTRFS_FIRST_FREE_OBJECTID) {
1939 ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL,
1940 NULL, NULL, NULL);
1941 if (ret < 0 && ret != -ENOENT)
1942 goto out;
1943 if (ret) {
1944 ret = 0;
1945 goto out;
1946 }
1947 if (gen != dir_gen)
1948 goto out;
1949 }
1950
1951 /* check if the ref was overwritten by another ref */
1952 ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len,
1953 &ow_inode, &other_type);
1954 if (ret < 0 && ret != -ENOENT)
1955 goto out;
1956 if (ret) {
1957 /* was never and will never be overwritten */
1958 ret = 0;
1959 goto out;
1960 }
1961
1962 ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL,
1963 NULL, NULL);
1964 if (ret < 0)
1965 goto out;
1966
1967 if (ow_inode == ino && gen == ino_gen) {
1968 ret = 0;
1969 goto out;
1970 }
1971
1972 /*
1973 * We know that it is or will be overwritten. Check this now.
1974 * The current inode being processed might have been the one that caused
1975 * inode 'ino' to be orphanized, therefore check if ow_inode matches
1976 * the current inode being processed.
1977 */
1978 if ((ow_inode < sctx->send_progress) ||
1979 (ino != sctx->cur_ino && ow_inode == sctx->cur_ino &&
1980 gen == sctx->cur_inode_gen))
1981 ret = 1;
1982 else
1983 ret = 0;
1984
1985out:
1986 return ret;
1987}
1988
1989/*
1990 * Same as did_overwrite_ref, but also checks if it is the first ref of an inode
1991 * that got overwritten. This is used by process_recorded_refs to determine
1992 * if it has to use the path as returned by get_cur_path or the orphan name.
1993 */
1994static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
1995{
1996 int ret = 0;
1997 struct fs_path *name = NULL;
1998 u64 dir;
1999 u64 dir_gen;
2000
2001 if (!sctx->parent_root)
2002 goto out;
2003
2004 name = fs_path_alloc();
2005 if (!name)
2006 return -ENOMEM;
2007
2008 ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name);
2009 if (ret < 0)
2010 goto out;
2011
2012 ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
2013 name->start, fs_path_len(name));
2014
2015out:
2016 fs_path_free(name);
2017 return ret;
2018}
2019
2020/*
2021 * Insert a name cache entry. On 32bit kernels the radix tree index is 32bit,
2022 * so we need to do some special handling in case we have clashes. This function
2023 * takes care of this with the help of name_cache_entry::radix_list.
2024 * In case of error, nce is kfreed.
2025 */
2026static int name_cache_insert(struct send_ctx *sctx,
2027 struct name_cache_entry *nce)
2028{
2029 int ret = 0;
2030 struct list_head *nce_head;
2031
2032 nce_head = radix_tree_lookup(&sctx->name_cache,
2033 (unsigned long)nce->ino);
2034 if (!nce_head) {
2035 nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL);
2036 if (!nce_head) {
2037 kfree(nce);
2038 return -ENOMEM;
2039 }
2040 INIT_LIST_HEAD(nce_head);
2041
2042 ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
2043 if (ret < 0) {
2044 kfree(nce_head);
2045 kfree(nce);
2046 return ret;
2047 }
2048 }
2049 list_add_tail(&nce->radix_list, nce_head);
2050 list_add_tail(&nce->list, &sctx->name_cache_list);
2051 sctx->name_cache_size++;
2052
2053 return ret;
2054}
2055
2056static void name_cache_delete(struct send_ctx *sctx,
2057 struct name_cache_entry *nce)
2058{
2059 struct list_head *nce_head;
2060
2061 nce_head = radix_tree_lookup(&sctx->name_cache,
2062 (unsigned long)nce->ino);
2063 if (!nce_head) {
2064 btrfs_err(sctx->send_root->fs_info,
2065 "name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
2066 nce->ino, sctx->name_cache_size);
2067 }
2068
2069 list_del(&nce->radix_list);
2070 list_del(&nce->list);
2071 sctx->name_cache_size--;
2072
2073 /*
2074 * We may not get to the final release of nce_head if the lookup fails
2075 */
2076 if (nce_head && list_empty(nce_head)) {
2077 radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
2078 kfree(nce_head);
2079 }
2080}
2081
2082static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
2083 u64 ino, u64 gen)
2084{
2085 struct list_head *nce_head;
2086 struct name_cache_entry *cur;
2087
2088 nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino);
2089 if (!nce_head)
2090 return NULL;
2091
2092 list_for_each_entry(cur, nce_head, radix_list) {
2093 if (cur->ino == ino && cur->gen == gen)
2094 return cur;
2095 }
2096 return NULL;
2097}
2098
2099/*
2100 * Removes the entry from the list and adds it back to the end. This marks the
2101 * entry as recently used so that name_cache_clean_unused does not remove it.
2102 */
2103static void name_cache_used(struct send_ctx *sctx, struct name_cache_entry *nce)
2104{
2105 list_del(&nce->list);
2106 list_add_tail(&nce->list, &sctx->name_cache_list);
2107}
2108
2109/*
2110 * Remove some entries from the beginning of name_cache_list.
2111 */
2112static void name_cache_clean_unused(struct send_ctx *sctx)
2113{
2114 struct name_cache_entry *nce;
2115
2116 if (sctx->name_cache_size < SEND_CTX_NAME_CACHE_CLEAN_SIZE)
2117 return;
2118
2119 while (sctx->name_cache_size > SEND_CTX_MAX_NAME_CACHE_SIZE) {
2120 nce = list_entry(sctx->name_cache_list.next,
2121 struct name_cache_entry, list);
2122 name_cache_delete(sctx, nce);
2123 kfree(nce);
2124 }
2125}
2126
2127static void name_cache_free(struct send_ctx *sctx)
2128{
2129 struct name_cache_entry *nce;
2130
2131 while (!list_empty(&sctx->name_cache_list)) {
2132 nce = list_entry(sctx->name_cache_list.next,
2133 struct name_cache_entry, list);
2134 name_cache_delete(sctx, nce);
2135 kfree(nce);
2136 }
2137}
2138
2139/*
2140 * Used by get_cur_path for each ref up to the root.
2141 * Returns 0 if it succeeded.
2142 * Returns 1 if the inode is not existent or got overwritten. In that case, the
2143 * name is an orphan name. This instructs get_cur_path to stop iterating. If 1
2144 * is returned, parent_ino/parent_gen are not guaranteed to be valid.
2145 * Returns <0 in case of error.
2146 */
2147static int __get_cur_name_and_parent(struct send_ctx *sctx,
2148 u64 ino, u64 gen,
2149 u64 *parent_ino,
2150 u64 *parent_gen,
2151 struct fs_path *dest)
2152{
2153 int ret;
2154 int nce_ret;
2155 struct name_cache_entry *nce = NULL;
2156
2157 /*
2158 * First check if we already did a call to this function with the same
2159 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
2160 * return the cached result.
2161 */
2162 nce = name_cache_search(sctx, ino, gen);
2163 if (nce) {
2164 if (ino < sctx->send_progress && nce->need_later_update) {
2165 name_cache_delete(sctx, nce);
2166 kfree(nce);
2167 nce = NULL;
2168 } else {
2169 name_cache_used(sctx, nce);
2170 *parent_ino = nce->parent_ino;
2171 *parent_gen = nce->parent_gen;
2172 ret = fs_path_add(dest, nce->name, nce->name_len);
2173 if (ret < 0)
2174 goto out;
2175 ret = nce->ret;
2176 goto out;
2177 }
2178 }
2179
2180 /*
2181 * If the inode is not existent yet, add the orphan name and return 1.
2182 * This should only happen for the parent dir that we determine in
2183 * __record_new_ref
2184 */
2185 ret = is_inode_existent(sctx, ino, gen);
2186 if (ret < 0)
2187 goto out;
2188
2189 if (!ret) {
2190 ret = gen_unique_name(sctx, ino, gen, dest);
2191 if (ret < 0)
2192 goto out;
2193 ret = 1;
2194 goto out_cache;
2195 }
2196
2197 /*
2198 * Depending on whether the inode was already processed or not, use
2199 * send_root or parent_root for ref lookup.
2200 */
2201 if (ino < sctx->send_progress)
2202 ret = get_first_ref(sctx->send_root, ino,
2203 parent_ino, parent_gen, dest);
2204 else
2205 ret = get_first_ref(sctx->parent_root, ino,
2206 parent_ino, parent_gen, dest);
2207 if (ret < 0)
2208 goto out;
2209
2210 /*
2211 * Check if the ref was overwritten by an inode's ref that was processed
2212 * earlier. If yes, treat as orphan and return 1.
2213 */
2214 ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
2215 dest->start, dest->end - dest->start);
2216 if (ret < 0)
2217 goto out;
2218 if (ret) {
2219 fs_path_reset(dest);
2220 ret = gen_unique_name(sctx, ino, gen, dest);
2221 if (ret < 0)
2222 goto out;
2223 ret = 1;
2224 }
2225
2226out_cache:
2227 /*
2228 * Store the result of the lookup in the name cache.
2229 */
2230 nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_KERNEL);
2231 if (!nce) {
2232 ret = -ENOMEM;
2233 goto out;
2234 }
2235
2236 nce->ino = ino;
2237 nce->gen = gen;
2238 nce->parent_ino = *parent_ino;
2239 nce->parent_gen = *parent_gen;
2240 nce->name_len = fs_path_len(dest);
2241 nce->ret = ret;
2242 strcpy(nce->name, dest->start);
2243
2244 if (ino < sctx->send_progress)
2245 nce->need_later_update = 0;
2246 else
2247 nce->need_later_update = 1;
2248
2249 nce_ret = name_cache_insert(sctx, nce);
2250 if (nce_ret < 0)
2251 ret = nce_ret;
2252 name_cache_clean_unused(sctx);
2253
2254out:
2255 return ret;
2256}
2257
2258/*
2259 * Magic happens here. This function returns the first ref to an inode as it
2260 * would look like while receiving the stream at this point in time.
2261 * We walk the path up to the root. For every inode in between, we check if it
2262 * was already processed/sent. If yes, we continue with the parent as found
2263 * in send_root. If not, we continue with the parent as found in parent_root.
2264 * If we encounter an inode that was deleted at this point in time, we use the
2265 * inodes "orphan" name instead of the real name and stop. Same with new inodes
2266 * that were not created yet and overwritten inodes/refs.
2267 *
2268 * When do we have have orphan inodes:
2269 * 1. When an inode is freshly created and thus no valid refs are available yet
2270 * 2. When a directory lost all it's refs (deleted) but still has dir items
2271 * inside which were not processed yet (pending for move/delete). If anyone
2272 * tried to get the path to the dir items, it would get a path inside that
2273 * orphan directory.
2274 * 3. When an inode is moved around or gets new links, it may overwrite the ref
2275 * of an unprocessed inode. If in that case the first ref would be
2276 * overwritten, the overwritten inode gets "orphanized". Later when we
2277 * process this overwritten inode, it is restored at a new place by moving
2278 * the orphan inode.
2279 *
2280 * sctx->send_progress tells this function at which point in time receiving
2281 * would be.
2282 */
2283static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
2284 struct fs_path *dest)
2285{
2286 int ret = 0;
2287 struct fs_path *name = NULL;
2288 u64 parent_inode = 0;
2289 u64 parent_gen = 0;
2290 int stop = 0;
2291
2292 name = fs_path_alloc();
2293 if (!name) {
2294 ret = -ENOMEM;
2295 goto out;
2296 }
2297
2298 dest->reversed = 1;
2299 fs_path_reset(dest);
2300
2301 while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
2302 struct waiting_dir_move *wdm;
2303
2304 fs_path_reset(name);
2305
2306 if (is_waiting_for_rm(sctx, ino)) {
2307 ret = gen_unique_name(sctx, ino, gen, name);
2308 if (ret < 0)
2309 goto out;
2310 ret = fs_path_add_path(dest, name);
2311 break;
2312 }
2313
2314 wdm = get_waiting_dir_move(sctx, ino);
2315 if (wdm && wdm->orphanized) {
2316 ret = gen_unique_name(sctx, ino, gen, name);
2317 stop = 1;
2318 } else if (wdm) {
2319 ret = get_first_ref(sctx->parent_root, ino,
2320 &parent_inode, &parent_gen, name);
2321 } else {
2322 ret = __get_cur_name_and_parent(sctx, ino, gen,
2323 &parent_inode,
2324 &parent_gen, name);
2325 if (ret)
2326 stop = 1;
2327 }
2328
2329 if (ret < 0)
2330 goto out;
2331
2332 ret = fs_path_add_path(dest, name);
2333 if (ret < 0)
2334 goto out;
2335
2336 ino = parent_inode;
2337 gen = parent_gen;
2338 }
2339
2340out:
2341 fs_path_free(name);
2342 if (!ret)
2343 fs_path_unreverse(dest);
2344 return ret;
2345}
2346
2347/*
2348 * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace
2349 */
2350static int send_subvol_begin(struct send_ctx *sctx)
2351{
2352 int ret;
2353 struct btrfs_root *send_root = sctx->send_root;
2354 struct btrfs_root *parent_root = sctx->parent_root;
2355 struct btrfs_path *path;
2356 struct btrfs_key key;
2357 struct btrfs_root_ref *ref;
2358 struct extent_buffer *leaf;
2359 char *name = NULL;
2360 int namelen;
2361
2362 path = btrfs_alloc_path();
2363 if (!path)
2364 return -ENOMEM;
2365
2366 name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
2367 if (!name) {
2368 btrfs_free_path(path);
2369 return -ENOMEM;
2370 }
2371
2372 key.objectid = send_root->objectid;
2373 key.type = BTRFS_ROOT_BACKREF_KEY;
2374 key.offset = 0;
2375
2376 ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root,
2377 &key, path, 1, 0);
2378 if (ret < 0)
2379 goto out;
2380 if (ret) {
2381 ret = -ENOENT;
2382 goto out;
2383 }
2384
2385 leaf = path->nodes[0];
2386 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2387 if (key.type != BTRFS_ROOT_BACKREF_KEY ||
2388 key.objectid != send_root->objectid) {
2389 ret = -ENOENT;
2390 goto out;
2391 }
2392 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
2393 namelen = btrfs_root_ref_name_len(leaf, ref);
2394 read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
2395 btrfs_release_path(path);
2396
2397 if (parent_root) {
2398 ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
2399 if (ret < 0)
2400 goto out;
2401 } else {
2402 ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL);
2403 if (ret < 0)
2404 goto out;
2405 }
2406
2407 TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
2408
2409 if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid))
2410 TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
2411 sctx->send_root->root_item.received_uuid);
2412 else
2413 TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
2414 sctx->send_root->root_item.uuid);
2415
2416 TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
2417 le64_to_cpu(sctx->send_root->root_item.ctransid));
2418 if (parent_root) {
2419 if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid))
2420 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
2421 parent_root->root_item.received_uuid);
2422 else
2423 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
2424 parent_root->root_item.uuid);
2425 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
2426 le64_to_cpu(sctx->parent_root->root_item.ctransid));
2427 }
2428
2429 ret = send_cmd(sctx);
2430
2431tlv_put_failure:
2432out:
2433 btrfs_free_path(path);
2434 kfree(name);
2435 return ret;
2436}
2437
2438static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
2439{
2440 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2441 int ret = 0;
2442 struct fs_path *p;
2443
2444 btrfs_debug(fs_info, "send_truncate %llu size=%llu", ino, size);
2445
2446 p = fs_path_alloc();
2447 if (!p)
2448 return -ENOMEM;
2449
2450 ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE);
2451 if (ret < 0)
2452 goto out;
2453
2454 ret = get_cur_path(sctx, ino, gen, p);
2455 if (ret < 0)
2456 goto out;
2457 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2458 TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);
2459
2460 ret = send_cmd(sctx);
2461
2462tlv_put_failure:
2463out:
2464 fs_path_free(p);
2465 return ret;
2466}
2467
2468static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
2469{
2470 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2471 int ret = 0;
2472 struct fs_path *p;
2473
2474 btrfs_debug(fs_info, "send_chmod %llu mode=%llu", ino, mode);
2475
2476 p = fs_path_alloc();
2477 if (!p)
2478 return -ENOMEM;
2479
2480 ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD);
2481 if (ret < 0)
2482 goto out;
2483
2484 ret = get_cur_path(sctx, ino, gen, p);
2485 if (ret < 0)
2486 goto out;
2487 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2488 TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777);
2489
2490 ret = send_cmd(sctx);
2491
2492tlv_put_failure:
2493out:
2494 fs_path_free(p);
2495 return ret;
2496}
2497
2498static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
2499{
2500 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2501 int ret = 0;
2502 struct fs_path *p;
2503
2504 btrfs_debug(fs_info, "send_chown %llu uid=%llu, gid=%llu",
2505 ino, uid, gid);
2506
2507 p = fs_path_alloc();
2508 if (!p)
2509 return -ENOMEM;
2510
2511 ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN);
2512 if (ret < 0)
2513 goto out;
2514
2515 ret = get_cur_path(sctx, ino, gen, p);
2516 if (ret < 0)
2517 goto out;
2518 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2519 TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid);
2520 TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid);
2521
2522 ret = send_cmd(sctx);
2523
2524tlv_put_failure:
2525out:
2526 fs_path_free(p);
2527 return ret;
2528}
2529
2530static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
2531{
2532 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2533 int ret = 0;
2534 struct fs_path *p = NULL;
2535 struct btrfs_inode_item *ii;
2536 struct btrfs_path *path = NULL;
2537 struct extent_buffer *eb;
2538 struct btrfs_key key;
2539 int slot;
2540
2541 btrfs_debug(fs_info, "send_utimes %llu", ino);
2542
2543 p = fs_path_alloc();
2544 if (!p)
2545 return -ENOMEM;
2546
2547 path = alloc_path_for_send();
2548 if (!path) {
2549 ret = -ENOMEM;
2550 goto out;
2551 }
2552
2553 key.objectid = ino;
2554 key.type = BTRFS_INODE_ITEM_KEY;
2555 key.offset = 0;
2556 ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
2557 if (ret > 0)
2558 ret = -ENOENT;
2559 if (ret < 0)
2560 goto out;
2561
2562 eb = path->nodes[0];
2563 slot = path->slots[0];
2564 ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
2565
2566 ret = begin_cmd(sctx, BTRFS_SEND_C_UTIMES);
2567 if (ret < 0)
2568 goto out;
2569
2570 ret = get_cur_path(sctx, ino, gen, p);
2571 if (ret < 0)
2572 goto out;
2573 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2574 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
2575 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
2576 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime);
2577 /* TODO Add otime support when the otime patches get into upstream */
2578
2579 ret = send_cmd(sctx);
2580
2581tlv_put_failure:
2582out:
2583 fs_path_free(p);
2584 btrfs_free_path(path);
2585 return ret;
2586}
2587
2588/*
2589 * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have
2590 * a valid path yet because we did not process the refs yet. So, the inode
2591 * is created as orphan.
2592 */
2593static int send_create_inode(struct send_ctx *sctx, u64 ino)
2594{
2595 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
2596 int ret = 0;
2597 struct fs_path *p;
2598 int cmd;
2599 u64 gen;
2600 u64 mode;
2601 u64 rdev;
2602
2603 btrfs_debug(fs_info, "send_create_inode %llu", ino);
2604
2605 p = fs_path_alloc();
2606 if (!p)
2607 return -ENOMEM;
2608
2609 if (ino != sctx->cur_ino) {
2610 ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode,
2611 NULL, NULL, &rdev);
2612 if (ret < 0)
2613 goto out;
2614 } else {
2615 gen = sctx->cur_inode_gen;
2616 mode = sctx->cur_inode_mode;
2617 rdev = sctx->cur_inode_rdev;
2618 }
2619
2620 if (S_ISREG(mode)) {
2621 cmd = BTRFS_SEND_C_MKFILE;
2622 } else if (S_ISDIR(mode)) {
2623 cmd = BTRFS_SEND_C_MKDIR;
2624 } else if (S_ISLNK(mode)) {
2625 cmd = BTRFS_SEND_C_SYMLINK;
2626 } else if (S_ISCHR(mode) || S_ISBLK(mode)) {
2627 cmd = BTRFS_SEND_C_MKNOD;
2628 } else if (S_ISFIFO(mode)) {
2629 cmd = BTRFS_SEND_C_MKFIFO;
2630 } else if (S_ISSOCK(mode)) {
2631 cmd = BTRFS_SEND_C_MKSOCK;
2632 } else {
2633 btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o",
2634 (int)(mode & S_IFMT));
2635 ret = -EOPNOTSUPP;
2636 goto out;
2637 }
2638
2639 ret = begin_cmd(sctx, cmd);
2640 if (ret < 0)
2641 goto out;
2642
2643 ret = gen_unique_name(sctx, ino, gen, p);
2644 if (ret < 0)
2645 goto out;
2646
2647 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2648 TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);
2649
2650 if (S_ISLNK(mode)) {
2651 fs_path_reset(p);
2652 ret = read_symlink(sctx->send_root, ino, p);
2653 if (ret < 0)
2654 goto out;
2655 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
2656 } else if (S_ISCHR(mode) || S_ISBLK(mode) ||
2657 S_ISFIFO(mode) || S_ISSOCK(mode)) {
2658 TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, new_encode_dev(rdev));
2659 TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode);
2660 }
2661
2662 ret = send_cmd(sctx);
2663 if (ret < 0)
2664 goto out;
2665
2666
2667tlv_put_failure:
2668out:
2669 fs_path_free(p);
2670 return ret;
2671}
2672
2673/*
2674 * We need some special handling for inodes that get processed before the parent
2675 * directory got created. See process_recorded_refs for details.
2676 * This function does the check if we already created the dir out of order.
2677 */
2678static int did_create_dir(struct send_ctx *sctx, u64 dir)
2679{
2680 int ret = 0;
2681 struct btrfs_path *path = NULL;
2682 struct btrfs_key key;
2683 struct btrfs_key found_key;
2684 struct btrfs_key di_key;
2685 struct extent_buffer *eb;
2686 struct btrfs_dir_item *di;
2687 int slot;
2688
2689 path = alloc_path_for_send();
2690 if (!path) {
2691 ret = -ENOMEM;
2692 goto out;
2693 }
2694
2695 key.objectid = dir;
2696 key.type = BTRFS_DIR_INDEX_KEY;
2697 key.offset = 0;
2698 ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
2699 if (ret < 0)
2700 goto out;
2701
2702 while (1) {
2703 eb = path->nodes[0];
2704 slot = path->slots[0];
2705 if (slot >= btrfs_header_nritems(eb)) {
2706 ret = btrfs_next_leaf(sctx->send_root, path);
2707 if (ret < 0) {
2708 goto out;
2709 } else if (ret > 0) {
2710 ret = 0;
2711 break;
2712 }
2713 continue;
2714 }
2715
2716 btrfs_item_key_to_cpu(eb, &found_key, slot);
2717 if (found_key.objectid != key.objectid ||
2718 found_key.type != key.type) {
2719 ret = 0;
2720 goto out;
2721 }
2722
2723 di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
2724 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2725
2726 if (di_key.type != BTRFS_ROOT_ITEM_KEY &&
2727 di_key.objectid < sctx->send_progress) {
2728 ret = 1;
2729 goto out;
2730 }
2731
2732 path->slots[0]++;
2733 }
2734
2735out:
2736 btrfs_free_path(path);
2737 return ret;
2738}
2739
2740/*
2741 * Only creates the inode if it is:
2742 * 1. Not a directory
2743 * 2. Or a directory which was not created already due to out of order
2744 * directories. See did_create_dir and process_recorded_refs for details.
2745 */
2746static int send_create_inode_if_needed(struct send_ctx *sctx)
2747{
2748 int ret;
2749
2750 if (S_ISDIR(sctx->cur_inode_mode)) {
2751 ret = did_create_dir(sctx, sctx->cur_ino);
2752 if (ret < 0)
2753 goto out;
2754 if (ret) {
2755 ret = 0;
2756 goto out;
2757 }
2758 }
2759
2760 ret = send_create_inode(sctx, sctx->cur_ino);
2761 if (ret < 0)
2762 goto out;
2763
2764out:
2765 return ret;
2766}
2767
2768struct recorded_ref {
2769 struct list_head list;
2770 char *name;
2771 struct fs_path *full_path;
2772 u64 dir;
2773 u64 dir_gen;
2774 int name_len;
2775};
2776
2777static void set_ref_path(struct recorded_ref *ref, struct fs_path *path)
2778{
2779 ref->full_path = path;
2780 ref->name = (char *)kbasename(ref->full_path->start);
2781 ref->name_len = ref->full_path->end - ref->name;
2782}
2783
2784/*
2785 * We need to process new refs before deleted refs, but compare_tree gives us
2786 * everything mixed. So we first record all refs and later process them.
2787 * This function is a helper to record one ref.
2788 */
2789static int __record_ref(struct list_head *head, u64 dir,
2790 u64 dir_gen, struct fs_path *path)
2791{
2792 struct recorded_ref *ref;
2793
2794 ref = kmalloc(sizeof(*ref), GFP_KERNEL);
2795 if (!ref)
2796 return -ENOMEM;
2797
2798 ref->dir = dir;
2799 ref->dir_gen = dir_gen;
2800 set_ref_path(ref, path);
2801 list_add_tail(&ref->list, head);
2802 return 0;
2803}
2804
2805static int dup_ref(struct recorded_ref *ref, struct list_head *list)
2806{
2807 struct recorded_ref *new;
2808
2809 new = kmalloc(sizeof(*ref), GFP_KERNEL);
2810 if (!new)
2811 return -ENOMEM;
2812
2813 new->dir = ref->dir;
2814 new->dir_gen = ref->dir_gen;
2815 new->full_path = NULL;
2816 INIT_LIST_HEAD(&new->list);
2817 list_add_tail(&new->list, list);
2818 return 0;
2819}
2820
2821static void __free_recorded_refs(struct list_head *head)
2822{
2823 struct recorded_ref *cur;
2824
2825 while (!list_empty(head)) {
2826 cur = list_entry(head->next, struct recorded_ref, list);
2827 fs_path_free(cur->full_path);
2828 list_del(&cur->list);
2829 kfree(cur);
2830 }
2831}
2832
2833static void free_recorded_refs(struct send_ctx *sctx)
2834{
2835 __free_recorded_refs(&sctx->new_refs);
2836 __free_recorded_refs(&sctx->deleted_refs);
2837}
2838
2839/*
2840 * Renames/moves a file/dir to its orphan name. Used when the first
2841 * ref of an unprocessed inode gets overwritten and for all non empty
2842 * directories.
2843 */
2844static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
2845 struct fs_path *path)
2846{
2847 int ret;
2848 struct fs_path *orphan;
2849
2850 orphan = fs_path_alloc();
2851 if (!orphan)
2852 return -ENOMEM;
2853
2854 ret = gen_unique_name(sctx, ino, gen, orphan);
2855 if (ret < 0)
2856 goto out;
2857
2858 ret = send_rename(sctx, path, orphan);
2859
2860out:
2861 fs_path_free(orphan);
2862 return ret;
2863}
2864
2865static struct orphan_dir_info *
2866add_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
2867{
2868 struct rb_node **p = &sctx->orphan_dirs.rb_node;
2869 struct rb_node *parent = NULL;
2870 struct orphan_dir_info *entry, *odi;
2871
2872 while (*p) {
2873 parent = *p;
2874 entry = rb_entry(parent, struct orphan_dir_info, node);
2875 if (dir_ino < entry->ino) {
2876 p = &(*p)->rb_left;
2877 } else if (dir_ino > entry->ino) {
2878 p = &(*p)->rb_right;
2879 } else {
2880 return entry;
2881 }
2882 }
2883
2884 odi = kmalloc(sizeof(*odi), GFP_KERNEL);
2885 if (!odi)
2886 return ERR_PTR(-ENOMEM);
2887 odi->ino = dir_ino;
2888 odi->gen = 0;
2889 odi->last_dir_index_offset = 0;
2890
2891 rb_link_node(&odi->node, parent, p);
2892 rb_insert_color(&odi->node, &sctx->orphan_dirs);
2893 return odi;
2894}
2895
2896static struct orphan_dir_info *
2897get_orphan_dir_info(struct send_ctx *sctx, u64 dir_ino)
2898{
2899 struct rb_node *n = sctx->orphan_dirs.rb_node;
2900 struct orphan_dir_info *entry;
2901
2902 while (n) {
2903 entry = rb_entry(n, struct orphan_dir_info, node);
2904 if (dir_ino < entry->ino)
2905 n = n->rb_left;
2906 else if (dir_ino > entry->ino)
2907 n = n->rb_right;
2908 else
2909 return entry;
2910 }
2911 return NULL;
2912}
2913
2914static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino)
2915{
2916 struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino);
2917
2918 return odi != NULL;
2919}
2920
2921static void free_orphan_dir_info(struct send_ctx *sctx,
2922 struct orphan_dir_info *odi)
2923{
2924 if (!odi)
2925 return;
2926 rb_erase(&odi->node, &sctx->orphan_dirs);
2927 kfree(odi);
2928}
2929
2930/*
2931 * Returns 1 if a directory can be removed at this point in time.
2932 * We check this by iterating all dir items and checking if the inode behind
2933 * the dir item was already processed.
2934 */
2935static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
2936 u64 send_progress)
2937{
2938 int ret = 0;
2939 struct btrfs_root *root = sctx->parent_root;
2940 struct btrfs_path *path;
2941 struct btrfs_key key;
2942 struct btrfs_key found_key;
2943 struct btrfs_key loc;
2944 struct btrfs_dir_item *di;
2945 struct orphan_dir_info *odi = NULL;
2946
2947 /*
2948 * Don't try to rmdir the top/root subvolume dir.
2949 */
2950 if (dir == BTRFS_FIRST_FREE_OBJECTID)
2951 return 0;
2952
2953 path = alloc_path_for_send();
2954 if (!path)
2955 return -ENOMEM;
2956
2957 key.objectid = dir;
2958 key.type = BTRFS_DIR_INDEX_KEY;
2959 key.offset = 0;
2960
2961 odi = get_orphan_dir_info(sctx, dir);
2962 if (odi)
2963 key.offset = odi->last_dir_index_offset;
2964
2965 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2966 if (ret < 0)
2967 goto out;
2968
2969 while (1) {
2970 struct waiting_dir_move *dm;
2971
2972 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2973 ret = btrfs_next_leaf(root, path);
2974 if (ret < 0)
2975 goto out;
2976 else if (ret > 0)
2977 break;
2978 continue;
2979 }
2980 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2981 path->slots[0]);
2982 if (found_key.objectid != key.objectid ||
2983 found_key.type != key.type)
2984 break;
2985
2986 di = btrfs_item_ptr(path->nodes[0], path->slots[0],
2987 struct btrfs_dir_item);
2988 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
2989
2990 dm = get_waiting_dir_move(sctx, loc.objectid);
2991 if (dm) {
2992 odi = add_orphan_dir_info(sctx, dir);
2993 if (IS_ERR(odi)) {
2994 ret = PTR_ERR(odi);
2995 goto out;
2996 }
2997 odi->gen = dir_gen;
2998 odi->last_dir_index_offset = found_key.offset;
2999 dm->rmdir_ino = dir;
3000 ret = 0;
3001 goto out;
3002 }
3003
3004 if (loc.objectid > send_progress) {
3005 odi = add_orphan_dir_info(sctx, dir);
3006 if (IS_ERR(odi)) {
3007 ret = PTR_ERR(odi);
3008 goto out;
3009 }
3010 odi->gen = dir_gen;
3011 odi->last_dir_index_offset = found_key.offset;
3012 ret = 0;
3013 goto out;
3014 }
3015
3016 path->slots[0]++;
3017 }
3018 free_orphan_dir_info(sctx, odi);
3019
3020 ret = 1;
3021
3022out:
3023 btrfs_free_path(path);
3024 return ret;
3025}
3026
3027static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
3028{
3029 struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);
3030
3031 return entry != NULL;
3032}
3033
3034static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
3035{
3036 struct rb_node **p = &sctx->waiting_dir_moves.rb_node;
3037 struct rb_node *parent = NULL;
3038 struct waiting_dir_move *entry, *dm;
3039
3040 dm = kmalloc(sizeof(*dm), GFP_KERNEL);
3041 if (!dm)
3042 return -ENOMEM;
3043 dm->ino = ino;
3044 dm->rmdir_ino = 0;
3045 dm->orphanized = orphanized;
3046
3047 while (*p) {
3048 parent = *p;
3049 entry = rb_entry(parent, struct waiting_dir_move, node);
3050 if (ino < entry->ino) {
3051 p = &(*p)->rb_left;
3052 } else if (ino > entry->ino) {
3053 p = &(*p)->rb_right;
3054 } else {
3055 kfree(dm);
3056 return -EEXIST;
3057 }
3058 }
3059
3060 rb_link_node(&dm->node, parent, p);
3061 rb_insert_color(&dm->node, &sctx->waiting_dir_moves);
3062 return 0;
3063}
3064
3065static struct waiting_dir_move *
3066get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
3067{
3068 struct rb_node *n = sctx->waiting_dir_moves.rb_node;
3069 struct waiting_dir_move *entry;
3070
3071 while (n) {
3072 entry = rb_entry(n, struct waiting_dir_move, node);
3073 if (ino < entry->ino)
3074 n = n->rb_left;
3075 else if (ino > entry->ino)
3076 n = n->rb_right;
3077 else
3078 return entry;
3079 }
3080 return NULL;
3081}
3082
3083static void free_waiting_dir_move(struct send_ctx *sctx,
3084 struct waiting_dir_move *dm)
3085{
3086 if (!dm)
3087 return;
3088 rb_erase(&dm->node, &sctx->waiting_dir_moves);
3089 kfree(dm);
3090}
3091
3092static int add_pending_dir_move(struct send_ctx *sctx,
3093 u64 ino,
3094 u64 ino_gen,
3095 u64 parent_ino,
3096 struct list_head *new_refs,
3097 struct list_head *deleted_refs,
3098 const bool is_orphan)
3099{
3100 struct rb_node **p = &sctx->pending_dir_moves.rb_node;
3101 struct rb_node *parent = NULL;
3102 struct pending_dir_move *entry = NULL, *pm;
3103 struct recorded_ref *cur;
3104 int exists = 0;
3105 int ret;
3106
3107 pm = kmalloc(sizeof(*pm), GFP_KERNEL);
3108 if (!pm)
3109 return -ENOMEM;
3110 pm->parent_ino = parent_ino;
3111 pm->ino = ino;
3112 pm->gen = ino_gen;
3113 INIT_LIST_HEAD(&pm->list);
3114 INIT_LIST_HEAD(&pm->update_refs);
3115 RB_CLEAR_NODE(&pm->node);
3116
3117 while (*p) {
3118 parent = *p;
3119 entry = rb_entry(parent, struct pending_dir_move, node);
3120 if (parent_ino < entry->parent_ino) {
3121 p = &(*p)->rb_left;
3122 } else if (parent_ino > entry->parent_ino) {
3123 p = &(*p)->rb_right;
3124 } else {
3125 exists = 1;
3126 break;
3127 }
3128 }
3129
3130 list_for_each_entry(cur, deleted_refs, list) {
3131 ret = dup_ref(cur, &pm->update_refs);
3132 if (ret < 0)
3133 goto out;
3134 }
3135 list_for_each_entry(cur, new_refs, list) {
3136 ret = dup_ref(cur, &pm->update_refs);
3137 if (ret < 0)
3138 goto out;
3139 }
3140
3141 ret = add_waiting_dir_move(sctx, pm->ino, is_orphan);
3142 if (ret)
3143 goto out;
3144
3145 if (exists) {
3146 list_add_tail(&pm->list, &entry->list);
3147 } else {
3148 rb_link_node(&pm->node, parent, p);
3149 rb_insert_color(&pm->node, &sctx->pending_dir_moves);
3150 }
3151 ret = 0;
3152out:
3153 if (ret) {
3154 __free_recorded_refs(&pm->update_refs);
3155 kfree(pm);
3156 }
3157 return ret;
3158}
3159
3160static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
3161 u64 parent_ino)
3162{
3163 struct rb_node *n = sctx->pending_dir_moves.rb_node;
3164 struct pending_dir_move *entry;
3165
3166 while (n) {
3167 entry = rb_entry(n, struct pending_dir_move, node);
3168 if (parent_ino < entry->parent_ino)
3169 n = n->rb_left;
3170 else if (parent_ino > entry->parent_ino)
3171 n = n->rb_right;
3172 else
3173 return entry;
3174 }
3175 return NULL;
3176}
3177
3178static int path_loop(struct send_ctx *sctx, struct fs_path *name,
3179 u64 ino, u64 gen, u64 *ancestor_ino)
3180{
3181 int ret = 0;
3182 u64 parent_inode = 0;
3183 u64 parent_gen = 0;
3184 u64 start_ino = ino;
3185
3186 *ancestor_ino = 0;
3187 while (ino != BTRFS_FIRST_FREE_OBJECTID) {
3188 fs_path_reset(name);
3189
3190 if (is_waiting_for_rm(sctx, ino))
3191 break;
3192 if (is_waiting_for_move(sctx, ino)) {
3193 if (*ancestor_ino == 0)
3194 *ancestor_ino = ino;
3195 ret = get_first_ref(sctx->parent_root, ino,
3196 &parent_inode, &parent_gen, name);
3197 } else {
3198 ret = __get_cur_name_and_parent(sctx, ino, gen,
3199 &parent_inode,
3200 &parent_gen, name);
3201 if (ret > 0) {
3202 ret = 0;
3203 break;
3204 }
3205 }
3206 if (ret < 0)
3207 break;
3208 if (parent_inode == start_ino) {
3209 ret = 1;
3210 if (*ancestor_ino == 0)
3211 *ancestor_ino = ino;
3212 break;
3213 }
3214 ino = parent_inode;
3215 gen = parent_gen;
3216 }
3217 return ret;
3218}
3219
3220static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
3221{
3222 struct fs_path *from_path = NULL;
3223 struct fs_path *to_path = NULL;
3224 struct fs_path *name = NULL;
3225 u64 orig_progress = sctx->send_progress;
3226 struct recorded_ref *cur;
3227 u64 parent_ino, parent_gen;
3228 struct waiting_dir_move *dm = NULL;
3229 u64 rmdir_ino = 0;
3230 u64 ancestor;
3231 bool is_orphan;
3232 int ret;
3233
3234 name = fs_path_alloc();
3235 from_path = fs_path_alloc();
3236 if (!name || !from_path) {
3237 ret = -ENOMEM;
3238 goto out;
3239 }
3240
3241 dm = get_waiting_dir_move(sctx, pm->ino);
3242 ASSERT(dm);
3243 rmdir_ino = dm->rmdir_ino;
3244 is_orphan = dm->orphanized;
3245 free_waiting_dir_move(sctx, dm);
3246
3247 if (is_orphan) {
3248 ret = gen_unique_name(sctx, pm->ino,
3249 pm->gen, from_path);
3250 } else {
3251 ret = get_first_ref(sctx->parent_root, pm->ino,
3252 &parent_ino, &parent_gen, name);
3253 if (ret < 0)
3254 goto out;
3255 ret = get_cur_path(sctx, parent_ino, parent_gen,
3256 from_path);
3257 if (ret < 0)
3258 goto out;
3259 ret = fs_path_add_path(from_path, name);
3260 }
3261 if (ret < 0)
3262 goto out;
3263
3264 sctx->send_progress = sctx->cur_ino + 1;
3265 ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
3266 if (ret < 0)
3267 goto out;
3268 if (ret) {
3269 LIST_HEAD(deleted_refs);
3270 ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
3271 ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
3272 &pm->update_refs, &deleted_refs,
3273 is_orphan);
3274 if (ret < 0)
3275 goto out;
3276 if (rmdir_ino) {
3277 dm = get_waiting_dir_move(sctx, pm->ino);
3278 ASSERT(dm);
3279 dm->rmdir_ino = rmdir_ino;
3280 }
3281 goto out;
3282 }
3283 fs_path_reset(name);
3284 to_path = name;
3285 name = NULL;
3286 ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
3287 if (ret < 0)
3288 goto out;
3289
3290 ret = send_rename(sctx, from_path, to_path);
3291 if (ret < 0)
3292 goto out;
3293
3294 if (rmdir_ino) {
3295 struct orphan_dir_info *odi;
3296 u64 gen;
3297
3298 odi = get_orphan_dir_info(sctx, rmdir_ino);
3299 if (!odi) {
3300 /* already deleted */
3301 goto finish;
3302 }
3303 gen = odi->gen;
3304
3305 ret = can_rmdir(sctx, rmdir_ino, gen, sctx->cur_ino);
3306 if (ret < 0)
3307 goto out;
3308 if (!ret)
3309 goto finish;
3310
3311 name = fs_path_alloc();
3312 if (!name) {
3313 ret = -ENOMEM;
3314 goto out;
3315 }
3316 ret = get_cur_path(sctx, rmdir_ino, gen, name);
3317 if (ret < 0)
3318 goto out;
3319 ret = send_rmdir(sctx, name);
3320 if (ret < 0)
3321 goto out;
3322 }
3323
3324finish:
3325 ret = send_utimes(sctx, pm->ino, pm->gen);
3326 if (ret < 0)
3327 goto out;
3328
3329 /*
3330 * After rename/move, need to update the utimes of both new parent(s)
3331 * and old parent(s).
3332 */
3333 list_for_each_entry(cur, &pm->update_refs, list) {
3334 /*
3335 * The parent inode might have been deleted in the send snapshot
3336 */
3337 ret = get_inode_info(sctx->send_root, cur->dir, NULL,
3338 NULL, NULL, NULL, NULL, NULL);
3339 if (ret == -ENOENT) {
3340 ret = 0;
3341 continue;
3342 }
3343 if (ret < 0)
3344 goto out;
3345
3346 ret = send_utimes(sctx, cur->dir, cur->dir_gen);
3347 if (ret < 0)
3348 goto out;
3349 }
3350
3351out:
3352 fs_path_free(name);
3353 fs_path_free(from_path);
3354 fs_path_free(to_path);
3355 sctx->send_progress = orig_progress;
3356
3357 return ret;
3358}
3359
3360static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m)
3361{
3362 if (!list_empty(&m->list))
3363 list_del(&m->list);
3364 if (!RB_EMPTY_NODE(&m->node))
3365 rb_erase(&m->node, &sctx->pending_dir_moves);
3366 __free_recorded_refs(&m->update_refs);
3367 kfree(m);
3368}
3369
3370static void tail_append_pending_moves(struct send_ctx *sctx,
3371 struct pending_dir_move *moves,
3372 struct list_head *stack)
3373{
3374 if (list_empty(&moves->list)) {
3375 list_add_tail(&moves->list, stack);
3376 } else {
3377 LIST_HEAD(list);
3378 list_splice_init(&moves->list, &list);
3379 list_add_tail(&moves->list, stack);
3380 list_splice_tail(&list, stack);
3381 }
3382 if (!RB_EMPTY_NODE(&moves->node)) {
3383 rb_erase(&moves->node, &sctx->pending_dir_moves);
3384 RB_CLEAR_NODE(&moves->node);
3385 }
3386}
3387
3388static int apply_children_dir_moves(struct send_ctx *sctx)
3389{
3390 struct pending_dir_move *pm;
3391 struct list_head stack;
3392 u64 parent_ino = sctx->cur_ino;
3393 int ret = 0;
3394
3395 pm = get_pending_dir_moves(sctx, parent_ino);
3396 if (!pm)
3397 return 0;
3398
3399 INIT_LIST_HEAD(&stack);
3400 tail_append_pending_moves(sctx, pm, &stack);
3401
3402 while (!list_empty(&stack)) {
3403 pm = list_first_entry(&stack, struct pending_dir_move, list);
3404 parent_ino = pm->ino;
3405 ret = apply_dir_move(sctx, pm);
3406 free_pending_move(sctx, pm);
3407 if (ret)
3408 goto out;
3409 pm = get_pending_dir_moves(sctx, parent_ino);
3410 if (pm)
3411 tail_append_pending_moves(sctx, pm, &stack);
3412 }
3413 return 0;
3414
3415out:
3416 while (!list_empty(&stack)) {
3417 pm = list_first_entry(&stack, struct pending_dir_move, list);
3418 free_pending_move(sctx, pm);
3419 }
3420 return ret;
3421}
3422
3423/*
3424 * We might need to delay a directory rename even when no ancestor directory
3425 * (in the send root) with a higher inode number than ours (sctx->cur_ino) was
3426 * renamed. This happens when we rename a directory to the old name (the name
3427 * in the parent root) of some other unrelated directory that got its rename
3428 * delayed due to some ancestor with higher number that got renamed.
3429 *
3430 * Example:
3431 *
3432 * Parent snapshot:
3433 * . (ino 256)
3434 * |---- a/ (ino 257)
3435 * | |---- file (ino 260)
3436 * |
3437 * |---- b/ (ino 258)
3438 * |---- c/ (ino 259)
3439 *
3440 * Send snapshot:
3441 * . (ino 256)
3442 * |---- a/ (ino 258)
3443 * |---- x/ (ino 259)
3444 * |---- y/ (ino 257)
3445 * |----- file (ino 260)
3446 *
3447 * Here we can not rename 258 from 'b' to 'a' without the rename of inode 257
3448 * from 'a' to 'x/y' happening first, which in turn depends on the rename of
3449 * inode 259 from 'c' to 'x'. So the order of rename commands the send stream
3450 * must issue is:
3451 *
3452 * 1 - rename 259 from 'c' to 'x'
3453 * 2 - rename 257 from 'a' to 'x/y'
3454 * 3 - rename 258 from 'b' to 'a'
3455 *
3456 * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can
3457 * be done right away and < 0 on error.
3458 */
3459static int wait_for_dest_dir_move(struct send_ctx *sctx,
3460 struct recorded_ref *parent_ref,
3461 const bool is_orphan)
3462{
3463 struct btrfs_fs_info *fs_info = sctx->parent_root->fs_info;
3464 struct btrfs_path *path;
3465 struct btrfs_key key;
3466 struct btrfs_key di_key;
3467 struct btrfs_dir_item *di;
3468 u64 left_gen;
3469 u64 right_gen;
3470 int ret = 0;
3471 struct waiting_dir_move *wdm;
3472
3473 if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves))
3474 return 0;
3475
3476 path = alloc_path_for_send();
3477 if (!path)
3478 return -ENOMEM;
3479
3480 key.objectid = parent_ref->dir;
3481 key.type = BTRFS_DIR_ITEM_KEY;
3482 key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len);
3483
3484 ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
3485 if (ret < 0) {
3486 goto out;
3487 } else if (ret > 0) {
3488 ret = 0;
3489 goto out;
3490 }
3491
3492 di = btrfs_match_dir_item_name(fs_info, path, parent_ref->name,
3493 parent_ref->name_len);
3494 if (!di) {
3495 ret = 0;
3496 goto out;
3497 }
3498 /*
3499 * di_key.objectid has the number of the inode that has a dentry in the
3500 * parent directory with the same name that sctx->cur_ino is being
3501 * renamed to. We need to check if that inode is in the send root as
3502 * well and if it is currently marked as an inode with a pending rename,
3503 * if it is, we need to delay the rename of sctx->cur_ino as well, so
3504 * that it happens after that other inode is renamed.
3505 */
3506 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key);
3507 if (di_key.type != BTRFS_INODE_ITEM_KEY) {
3508 ret = 0;
3509 goto out;
3510 }
3511
3512 ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL,
3513 &left_gen, NULL, NULL, NULL, NULL);
3514 if (ret < 0)
3515 goto out;
3516 ret = get_inode_info(sctx->send_root, di_key.objectid, NULL,
3517 &right_gen, NULL, NULL, NULL, NULL);
3518 if (ret < 0) {
3519 if (ret == -ENOENT)
3520 ret = 0;
3521 goto out;
3522 }
3523
3524 /* Different inode, no need to delay the rename of sctx->cur_ino */
3525 if (right_gen != left_gen) {
3526 ret = 0;
3527 goto out;
3528 }
3529
3530 wdm = get_waiting_dir_move(sctx, di_key.objectid);
3531 if (wdm && !wdm->orphanized) {
3532 ret = add_pending_dir_move(sctx,
3533 sctx->cur_ino,
3534 sctx->cur_inode_gen,
3535 di_key.objectid,
3536 &sctx->new_refs,
3537 &sctx->deleted_refs,
3538 is_orphan);
3539 if (!ret)
3540 ret = 1;
3541 }
3542out:
3543 btrfs_free_path(path);
3544 return ret;
3545}
3546
3547/*
3548 * Check if inode ino2, or any of its ancestors, is inode ino1.
3549 * Return 1 if true, 0 if false and < 0 on error.
3550 */
3551static int check_ino_in_path(struct btrfs_root *root,
3552 const u64 ino1,
3553 const u64 ino1_gen,
3554 const u64 ino2,
3555 const u64 ino2_gen,
3556 struct fs_path *fs_path)
3557{
3558 u64 ino = ino2;
3559
3560 if (ino1 == ino2)
3561 return ino1_gen == ino2_gen;
3562
3563 while (ino > BTRFS_FIRST_FREE_OBJECTID) {
3564 u64 parent;
3565 u64 parent_gen;
3566 int ret;
3567
3568 fs_path_reset(fs_path);
3569 ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path);
3570 if (ret < 0)
3571 return ret;
3572 if (parent == ino1)
3573 return parent_gen == ino1_gen;
3574 ino = parent;
3575 }
3576 return 0;
3577}
3578
3579/*
3580 * Check if ino ino1 is an ancestor of inode ino2 in the given root for any
3581 * possible path (in case ino2 is not a directory and has multiple hard links).
3582 * Return 1 if true, 0 if false and < 0 on error.
3583 */
3584static int is_ancestor(struct btrfs_root *root,
3585 const u64 ino1,
3586 const u64 ino1_gen,
3587 const u64 ino2,
3588 struct fs_path *fs_path)
3589{
3590 bool free_fs_path = false;
3591 int ret = 0;
3592 struct btrfs_path *path = NULL;
3593 struct btrfs_key key;
3594
3595 if (!fs_path) {
3596 fs_path = fs_path_alloc();
3597 if (!fs_path)
3598 return -ENOMEM;
3599 free_fs_path = true;
3600 }
3601
3602 path = alloc_path_for_send();
3603 if (!path) {
3604 ret = -ENOMEM;
3605 goto out;
3606 }
3607
3608 key.objectid = ino2;
3609 key.type = BTRFS_INODE_REF_KEY;
3610 key.offset = 0;
3611
3612 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3613 if (ret < 0)
3614 goto out;
3615
3616 while (true) {
3617 struct extent_buffer *leaf = path->nodes[0];
3618 int slot = path->slots[0];
3619 u32 cur_offset = 0;
3620 u32 item_size;
3621
3622 if (slot >= btrfs_header_nritems(leaf)) {
3623 ret = btrfs_next_leaf(root, path);
3624 if (ret < 0)
3625 goto out;
3626 if (ret > 0)
3627 break;
3628 continue;
3629 }
3630
3631 btrfs_item_key_to_cpu(leaf, &key, slot);
3632 if (key.objectid != ino2)
3633 break;
3634 if (key.type != BTRFS_INODE_REF_KEY &&
3635 key.type != BTRFS_INODE_EXTREF_KEY)
3636 break;
3637
3638 item_size = btrfs_item_size_nr(leaf, slot);
3639 while (cur_offset < item_size) {
3640 u64 parent;
3641 u64 parent_gen;
3642
3643 if (key.type == BTRFS_INODE_EXTREF_KEY) {
3644 unsigned long ptr;
3645 struct btrfs_inode_extref *extref;
3646
3647 ptr = btrfs_item_ptr_offset(leaf, slot);
3648 extref = (struct btrfs_inode_extref *)
3649 (ptr + cur_offset);
3650 parent = btrfs_inode_extref_parent(leaf,
3651 extref);
3652 cur_offset += sizeof(*extref);
3653 cur_offset += btrfs_inode_extref_name_len(leaf,
3654 extref);
3655 } else {
3656 parent = key.offset;
3657 cur_offset = item_size;
3658 }
3659
3660 ret = get_inode_info(root, parent, NULL, &parent_gen,
3661 NULL, NULL, NULL, NULL);
3662 if (ret < 0)
3663 goto out;
3664 ret = check_ino_in_path(root, ino1, ino1_gen,
3665 parent, parent_gen, fs_path);
3666 if (ret)
3667 goto out;
3668 }
3669 path->slots[0]++;
3670 }
3671 ret = 0;
3672 out:
3673 btrfs_free_path(path);
3674 if (free_fs_path)
3675 fs_path_free(fs_path);
3676 return ret;
3677}
3678
3679static int wait_for_parent_move(struct send_ctx *sctx,
3680 struct recorded_ref *parent_ref,
3681 const bool is_orphan)
3682{
3683 int ret = 0;
3684 u64 ino = parent_ref->dir;
3685 u64 ino_gen = parent_ref->dir_gen;
3686 u64 parent_ino_before, parent_ino_after;
3687 struct fs_path *path_before = NULL;
3688 struct fs_path *path_after = NULL;
3689 int len1, len2;
3690
3691 path_after = fs_path_alloc();
3692 path_before = fs_path_alloc();
3693 if (!path_after || !path_before) {
3694 ret = -ENOMEM;
3695 goto out;
3696 }
3697
3698 /*
3699 * Our current directory inode may not yet be renamed/moved because some
3700 * ancestor (immediate or not) has to be renamed/moved first. So find if
3701 * such ancestor exists and make sure our own rename/move happens after
3702 * that ancestor is processed to avoid path build infinite loops (done
3703 * at get_cur_path()).
3704 */
3705 while (ino > BTRFS_FIRST_FREE_OBJECTID) {
3706 u64 parent_ino_after_gen;
3707
3708 if (is_waiting_for_move(sctx, ino)) {
3709 /*
3710 * If the current inode is an ancestor of ino in the
3711 * parent root, we need to delay the rename of the
3712 * current inode, otherwise don't delayed the rename
3713 * because we can end up with a circular dependency
3714 * of renames, resulting in some directories never
3715 * getting the respective rename operations issued in
3716 * the send stream or getting into infinite path build
3717 * loops.
3718 */
3719 ret = is_ancestor(sctx->parent_root,
3720 sctx->cur_ino, sctx->cur_inode_gen,
3721 ino, path_before);
3722 if (ret)
3723 break;
3724 }
3725
3726 fs_path_reset(path_before);
3727 fs_path_reset(path_after);
3728
3729 ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
3730 &parent_ino_after_gen, path_after);
3731 if (ret < 0)
3732 goto out;
3733 ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
3734 NULL, path_before);
3735 if (ret < 0 && ret != -ENOENT) {
3736 goto out;
3737 } else if (ret == -ENOENT) {
3738 ret = 0;
3739 break;
3740 }
3741
3742 len1 = fs_path_len(path_before);
3743 len2 = fs_path_len(path_after);
3744 if (ino > sctx->cur_ino &&
3745 (parent_ino_before != parent_ino_after || len1 != len2 ||
3746 memcmp(path_before->start, path_after->start, len1))) {
3747 u64 parent_ino_gen;
3748
3749 ret = get_inode_info(sctx->parent_root, ino, NULL,
3750 &parent_ino_gen, NULL, NULL, NULL,
3751 NULL);
3752 if (ret < 0)
3753 goto out;
3754 if (ino_gen == parent_ino_gen) {
3755 ret = 1;
3756 break;
3757 }
3758 }
3759 ino = parent_ino_after;
3760 ino_gen = parent_ino_after_gen;
3761 }
3762
3763out:
3764 fs_path_free(path_before);
3765 fs_path_free(path_after);
3766
3767 if (ret == 1) {
3768 ret = add_pending_dir_move(sctx,
3769 sctx->cur_ino,
3770 sctx->cur_inode_gen,
3771 ino,
3772 &sctx->new_refs,
3773 &sctx->deleted_refs,
3774 is_orphan);
3775 if (!ret)
3776 ret = 1;
3777 }
3778
3779 return ret;
3780}
3781
3782static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
3783{
3784 int ret;
3785 struct fs_path *new_path;
3786
3787 /*
3788 * Our reference's name member points to its full_path member string, so
3789 * we use here a new path.
3790 */
3791 new_path = fs_path_alloc();
3792 if (!new_path)
3793 return -ENOMEM;
3794
3795 ret = get_cur_path(sctx, ref->dir, ref->dir_gen, new_path);
3796 if (ret < 0) {
3797 fs_path_free(new_path);
3798 return ret;
3799 }
3800 ret = fs_path_add(new_path, ref->name, ref->name_len);
3801 if (ret < 0) {
3802 fs_path_free(new_path);
3803 return ret;
3804 }
3805
3806 fs_path_free(ref->full_path);
3807 set_ref_path(ref, new_path);
3808
3809 return 0;
3810}
3811
3812/*
3813 * This does all the move/link/unlink/rmdir magic.
3814 */
3815static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
3816{
3817 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
3818 int ret = 0;
3819 struct recorded_ref *cur;
3820 struct recorded_ref *cur2;
3821 struct list_head check_dirs;
3822 struct fs_path *valid_path = NULL;
3823 u64 ow_inode = 0;
3824 u64 ow_gen;
3825 u64 ow_mode;
3826 int did_overwrite = 0;
3827 int is_orphan = 0;
3828 u64 last_dir_ino_rm = 0;
3829 bool can_rename = true;
3830 bool orphanized_dir = false;
3831 bool orphanized_ancestor = false;
3832
3833 btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino);
3834
3835 /*
3836 * This should never happen as the root dir always has the same ref
3837 * which is always '..'
3838 */
3839 BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
3840 INIT_LIST_HEAD(&check_dirs);
3841
3842 valid_path = fs_path_alloc();
3843 if (!valid_path) {
3844 ret = -ENOMEM;
3845 goto out;
3846 }
3847
3848 /*
3849 * First, check if the first ref of the current inode was overwritten
3850 * before. If yes, we know that the current inode was already orphanized
3851 * and thus use the orphan name. If not, we can use get_cur_path to
3852 * get the path of the first ref as it would like while receiving at
3853 * this point in time.
3854 * New inodes are always orphan at the beginning, so force to use the
3855 * orphan name in this case.
3856 * The first ref is stored in valid_path and will be updated if it
3857 * gets moved around.
3858 */
3859 if (!sctx->cur_inode_new) {
3860 ret = did_overwrite_first_ref(sctx, sctx->cur_ino,
3861 sctx->cur_inode_gen);
3862 if (ret < 0)
3863 goto out;
3864 if (ret)
3865 did_overwrite = 1;
3866 }
3867 if (sctx->cur_inode_new || did_overwrite) {
3868 ret = gen_unique_name(sctx, sctx->cur_ino,
3869 sctx->cur_inode_gen, valid_path);
3870 if (ret < 0)
3871 goto out;
3872 is_orphan = 1;
3873 } else {
3874 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
3875 valid_path);
3876 if (ret < 0)
3877 goto out;
3878 }
3879
3880 list_for_each_entry(cur, &sctx->new_refs, list) {
3881 /*
3882 * We may have refs where the parent directory does not exist
3883 * yet. This happens if the parent directories inum is higher
3884 * the the current inum. To handle this case, we create the
3885 * parent directory out of order. But we need to check if this
3886 * did already happen before due to other refs in the same dir.
3887 */
3888 ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
3889 if (ret < 0)
3890 goto out;
3891 if (ret == inode_state_will_create) {
3892 ret = 0;
3893 /*
3894 * First check if any of the current inodes refs did
3895 * already create the dir.
3896 */
3897 list_for_each_entry(cur2, &sctx->new_refs, list) {
3898 if (cur == cur2)
3899 break;
3900 if (cur2->dir == cur->dir) {
3901 ret = 1;
3902 break;
3903 }
3904 }
3905
3906 /*
3907 * If that did not happen, check if a previous inode
3908 * did already create the dir.
3909 */
3910 if (!ret)
3911 ret = did_create_dir(sctx, cur->dir);
3912 if (ret < 0)
3913 goto out;
3914 if (!ret) {
3915 ret = send_create_inode(sctx, cur->dir);
3916 if (ret < 0)
3917 goto out;
3918 }
3919 }
3920
3921 /*
3922 * Check if this new ref would overwrite the first ref of
3923 * another unprocessed inode. If yes, orphanize the
3924 * overwritten inode. If we find an overwritten ref that is
3925 * not the first ref, simply unlink it.
3926 */
3927 ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
3928 cur->name, cur->name_len,
3929 &ow_inode, &ow_gen, &ow_mode);
3930 if (ret < 0)
3931 goto out;
3932 if (ret) {
3933 ret = is_first_ref(sctx->parent_root,
3934 ow_inode, cur->dir, cur->name,
3935 cur->name_len);
3936 if (ret < 0)
3937 goto out;
3938 if (ret) {
3939 struct name_cache_entry *nce;
3940 struct waiting_dir_move *wdm;
3941
3942 ret = orphanize_inode(sctx, ow_inode, ow_gen,
3943 cur->full_path);
3944 if (ret < 0)
3945 goto out;
3946 if (S_ISDIR(ow_mode))
3947 orphanized_dir = true;
3948
3949 /*
3950 * If ow_inode has its rename operation delayed
3951 * make sure that its orphanized name is used in
3952 * the source path when performing its rename
3953 * operation.
3954 */
3955 if (is_waiting_for_move(sctx, ow_inode)) {
3956 wdm = get_waiting_dir_move(sctx,
3957 ow_inode);
3958 ASSERT(wdm);
3959 wdm->orphanized = true;
3960 }
3961
3962 /*
3963 * Make sure we clear our orphanized inode's
3964 * name from the name cache. This is because the
3965 * inode ow_inode might be an ancestor of some
3966 * other inode that will be orphanized as well
3967 * later and has an inode number greater than
3968 * sctx->send_progress. We need to prevent
3969 * future name lookups from using the old name
3970 * and get instead the orphan name.
3971 */
3972 nce = name_cache_search(sctx, ow_inode, ow_gen);
3973 if (nce) {
3974 name_cache_delete(sctx, nce);
3975 kfree(nce);
3976 }
3977
3978 /*
3979 * ow_inode might currently be an ancestor of
3980 * cur_ino, therefore compute valid_path (the
3981 * current path of cur_ino) again because it
3982 * might contain the pre-orphanization name of
3983 * ow_inode, which is no longer valid.
3984 */
3985 ret = is_ancestor(sctx->parent_root,
3986 ow_inode, ow_gen,
3987 sctx->cur_ino, NULL);
3988 if (ret > 0) {
3989 orphanized_ancestor = true;
3990 fs_path_reset(valid_path);
3991 ret = get_cur_path(sctx, sctx->cur_ino,
3992 sctx->cur_inode_gen,
3993 valid_path);
3994 }
3995 if (ret < 0)
3996 goto out;
3997 } else {
3998 ret = send_unlink(sctx, cur->full_path);
3999 if (ret < 0)
4000 goto out;
4001 }
4002 }
4003
4004 if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) {
4005 ret = wait_for_dest_dir_move(sctx, cur, is_orphan);
4006 if (ret < 0)
4007 goto out;
4008 if (ret == 1) {
4009 can_rename = false;
4010 *pending_move = 1;
4011 }
4012 }
4013
4014 if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root &&
4015 can_rename) {
4016 ret = wait_for_parent_move(sctx, cur, is_orphan);
4017 if (ret < 0)
4018 goto out;
4019 if (ret == 1) {
4020 can_rename = false;
4021 *pending_move = 1;
4022 }
4023 }
4024
4025 /*
4026 * link/move the ref to the new place. If we have an orphan
4027 * inode, move it and update valid_path. If not, link or move
4028 * it depending on the inode mode.
4029 */
4030 if (is_orphan && can_rename) {
4031 ret = send_rename(sctx, valid_path, cur->full_path);
4032 if (ret < 0)
4033 goto out;
4034 is_orphan = 0;
4035 ret = fs_path_copy(valid_path, cur->full_path);
4036 if (ret < 0)
4037 goto out;
4038 } else if (can_rename) {
4039 if (S_ISDIR(sctx->cur_inode_mode)) {
4040 /*
4041 * Dirs can't be linked, so move it. For moved
4042 * dirs, we always have one new and one deleted
4043 * ref. The deleted ref is ignored later.
4044 */
4045 ret = send_rename(sctx, valid_path,
4046 cur->full_path);
4047 if (!ret)
4048 ret = fs_path_copy(valid_path,
4049 cur->full_path);
4050 if (ret < 0)
4051 goto out;
4052 } else {
4053 /*
4054 * We might have previously orphanized an inode
4055 * which is an ancestor of our current inode,
4056 * so our reference's full path, which was
4057 * computed before any such orphanizations, must
4058 * be updated.
4059 */
4060 if (orphanized_dir) {
4061 ret = update_ref_path(sctx, cur);
4062 if (ret < 0)
4063 goto out;
4064 }
4065 ret = send_link(sctx, cur->full_path,
4066 valid_path);
4067 if (ret < 0)
4068 goto out;
4069 }
4070 }
4071 ret = dup_ref(cur, &check_dirs);
4072 if (ret < 0)
4073 goto out;
4074 }
4075
4076 if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) {
4077 /*
4078 * Check if we can already rmdir the directory. If not,
4079 * orphanize it. For every dir item inside that gets deleted
4080 * later, we do this check again and rmdir it then if possible.
4081 * See the use of check_dirs for more details.
4082 */
4083 ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen,
4084 sctx->cur_ino);
4085 if (ret < 0)
4086 goto out;
4087 if (ret) {
4088 ret = send_rmdir(sctx, valid_path);
4089 if (ret < 0)
4090 goto out;
4091 } else if (!is_orphan) {
4092 ret = orphanize_inode(sctx, sctx->cur_ino,
4093 sctx->cur_inode_gen, valid_path);
4094 if (ret < 0)
4095 goto out;
4096 is_orphan = 1;
4097 }
4098
4099 list_for_each_entry(cur, &sctx->deleted_refs, list) {
4100 ret = dup_ref(cur, &check_dirs);
4101 if (ret < 0)
4102 goto out;
4103 }
4104 } else if (S_ISDIR(sctx->cur_inode_mode) &&
4105 !list_empty(&sctx->deleted_refs)) {
4106 /*
4107 * We have a moved dir. Add the old parent to check_dirs
4108 */
4109 cur = list_entry(sctx->deleted_refs.next, struct recorded_ref,
4110 list);
4111 ret = dup_ref(cur, &check_dirs);
4112 if (ret < 0)
4113 goto out;
4114 } else if (!S_ISDIR(sctx->cur_inode_mode)) {
4115 /*
4116 * We have a non dir inode. Go through all deleted refs and
4117 * unlink them if they were not already overwritten by other
4118 * inodes.
4119 */
4120 list_for_each_entry(cur, &sctx->deleted_refs, list) {
4121 ret = did_overwrite_ref(sctx, cur->dir, cur->dir_gen,
4122 sctx->cur_ino, sctx->cur_inode_gen,
4123 cur->name, cur->name_len);
4124 if (ret < 0)
4125 goto out;
4126 if (!ret) {
4127 /*
4128 * If we orphanized any ancestor before, we need
4129 * to recompute the full path for deleted names,
4130 * since any such path was computed before we
4131 * processed any references and orphanized any
4132 * ancestor inode.
4133 */
4134 if (orphanized_ancestor) {
4135 ret = update_ref_path(sctx, cur);
4136 if (ret < 0)
4137 goto out;
4138 }
4139 ret = send_unlink(sctx, cur->full_path);
4140 if (ret < 0)
4141 goto out;
4142 }
4143 ret = dup_ref(cur, &check_dirs);
4144 if (ret < 0)
4145 goto out;
4146 }
4147 /*
4148 * If the inode is still orphan, unlink the orphan. This may
4149 * happen when a previous inode did overwrite the first ref
4150 * of this inode and no new refs were added for the current
4151 * inode. Unlinking does not mean that the inode is deleted in
4152 * all cases. There may still be links to this inode in other
4153 * places.
4154 */
4155 if (is_orphan) {
4156 ret = send_unlink(sctx, valid_path);
4157 if (ret < 0)
4158 goto out;
4159 }
4160 }
4161
4162 /*
4163 * We did collect all parent dirs where cur_inode was once located. We
4164 * now go through all these dirs and check if they are pending for
4165 * deletion and if it's finally possible to perform the rmdir now.
4166 * We also update the inode stats of the parent dirs here.
4167 */
4168 list_for_each_entry(cur, &check_dirs, list) {
4169 /*
4170 * In case we had refs into dirs that were not processed yet,
4171 * we don't need to do the utime and rmdir logic for these dirs.
4172 * The dir will be processed later.
4173 */
4174 if (cur->dir > sctx->cur_ino)
4175 continue;
4176
4177 ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
4178 if (ret < 0)
4179 goto out;
4180
4181 if (ret == inode_state_did_create ||
4182 ret == inode_state_no_change) {
4183 /* TODO delayed utimes */
4184 ret = send_utimes(sctx, cur->dir, cur->dir_gen);
4185 if (ret < 0)
4186 goto out;
4187 } else if (ret == inode_state_did_delete &&
4188 cur->dir != last_dir_ino_rm) {
4189 ret = can_rmdir(sctx, cur->dir, cur->dir_gen,
4190 sctx->cur_ino);
4191 if (ret < 0)
4192 goto out;
4193 if (ret) {
4194 ret = get_cur_path(sctx, cur->dir,
4195 cur->dir_gen, valid_path);
4196 if (ret < 0)
4197 goto out;
4198 ret = send_rmdir(sctx, valid_path);
4199 if (ret < 0)
4200 goto out;
4201 last_dir_ino_rm = cur->dir;
4202 }
4203 }
4204 }
4205
4206 ret = 0;
4207
4208out:
4209 __free_recorded_refs(&check_dirs);
4210 free_recorded_refs(sctx);
4211 fs_path_free(valid_path);
4212 return ret;
4213}
4214
4215static int record_ref(struct btrfs_root *root, u64 dir, struct fs_path *name,
4216 void *ctx, struct list_head *refs)
4217{
4218 int ret = 0;
4219 struct send_ctx *sctx = ctx;
4220 struct fs_path *p;
4221 u64 gen;
4222
4223 p = fs_path_alloc();
4224 if (!p)
4225 return -ENOMEM;
4226
4227 ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL,
4228 NULL, NULL);
4229 if (ret < 0)
4230 goto out;
4231
4232 ret = get_cur_path(sctx, dir, gen, p);
4233 if (ret < 0)
4234 goto out;
4235 ret = fs_path_add_path(p, name);
4236 if (ret < 0)
4237 goto out;
4238
4239 ret = __record_ref(refs, dir, gen, p);
4240
4241out:
4242 if (ret)
4243 fs_path_free(p);
4244 return ret;
4245}
4246
4247static int __record_new_ref(int num, u64 dir, int index,
4248 struct fs_path *name,
4249 void *ctx)
4250{
4251 struct send_ctx *sctx = ctx;
4252 return record_ref(sctx->send_root, dir, name, ctx, &sctx->new_refs);
4253}
4254
4255
4256static int __record_deleted_ref(int num, u64 dir, int index,
4257 struct fs_path *name,
4258 void *ctx)
4259{
4260 struct send_ctx *sctx = ctx;
4261 return record_ref(sctx->parent_root, dir, name, ctx,
4262 &sctx->deleted_refs);
4263}
4264
4265static int record_new_ref(struct send_ctx *sctx)
4266{
4267 int ret;
4268
4269 ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
4270 sctx->cmp_key, 0, __record_new_ref, sctx);
4271 if (ret < 0)
4272 goto out;
4273 ret = 0;
4274
4275out:
4276 return ret;
4277}
4278
4279static int record_deleted_ref(struct send_ctx *sctx)
4280{
4281 int ret;
4282
4283 ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
4284 sctx->cmp_key, 0, __record_deleted_ref, sctx);
4285 if (ret < 0)
4286 goto out;
4287 ret = 0;
4288
4289out:
4290 return ret;
4291}
4292
4293struct find_ref_ctx {
4294 u64 dir;
4295 u64 dir_gen;
4296 struct btrfs_root *root;
4297 struct fs_path *name;
4298 int found_idx;
4299};
4300
4301static int __find_iref(int num, u64 dir, int index,
4302 struct fs_path *name,
4303 void *ctx_)
4304{
4305 struct find_ref_ctx *ctx = ctx_;
4306 u64 dir_gen;
4307 int ret;
4308
4309 if (dir == ctx->dir && fs_path_len(name) == fs_path_len(ctx->name) &&
4310 strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) {
4311 /*
4312 * To avoid doing extra lookups we'll only do this if everything
4313 * else matches.
4314 */
4315 ret = get_inode_info(ctx->root, dir, NULL, &dir_gen, NULL,
4316 NULL, NULL, NULL);
4317 if (ret)
4318 return ret;
4319 if (dir_gen != ctx->dir_gen)
4320 return 0;
4321 ctx->found_idx = num;
4322 return 1;
4323 }
4324 return 0;
4325}
4326
4327static int find_iref(struct btrfs_root *root,
4328 struct btrfs_path *path,
4329 struct btrfs_key *key,
4330 u64 dir, u64 dir_gen, struct fs_path *name)
4331{
4332 int ret;
4333 struct find_ref_ctx ctx;
4334
4335 ctx.dir = dir;
4336 ctx.name = name;
4337 ctx.dir_gen = dir_gen;
4338 ctx.found_idx = -1;
4339 ctx.root = root;
4340
4341 ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx);
4342 if (ret < 0)
4343 return ret;
4344
4345 if (ctx.found_idx == -1)
4346 return -ENOENT;
4347
4348 return ctx.found_idx;
4349}
4350
4351static int __record_changed_new_ref(int num, u64 dir, int index,
4352 struct fs_path *name,
4353 void *ctx)
4354{
4355 u64 dir_gen;
4356 int ret;
4357 struct send_ctx *sctx = ctx;
4358
4359 ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL,
4360 NULL, NULL, NULL);
4361 if (ret)
4362 return ret;
4363
4364 ret = find_iref(sctx->parent_root, sctx->right_path,
4365 sctx->cmp_key, dir, dir_gen, name);
4366 if (ret == -ENOENT)
4367 ret = __record_new_ref(num, dir, index, name, sctx);
4368 else if (ret > 0)
4369 ret = 0;
4370
4371 return ret;
4372}
4373
4374static int __record_changed_deleted_ref(int num, u64 dir, int index,
4375 struct fs_path *name,
4376 void *ctx)
4377{
4378 u64 dir_gen;
4379 int ret;
4380 struct send_ctx *sctx = ctx;
4381
4382 ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL,
4383 NULL, NULL, NULL);
4384 if (ret)
4385 return ret;
4386
4387 ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key,
4388 dir, dir_gen, name);
4389 if (ret == -ENOENT)
4390 ret = __record_deleted_ref(num, dir, index, name, sctx);
4391 else if (ret > 0)
4392 ret = 0;
4393
4394 return ret;
4395}
4396
4397static int record_changed_ref(struct send_ctx *sctx)
4398{
4399 int ret = 0;
4400
4401 ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
4402 sctx->cmp_key, 0, __record_changed_new_ref, sctx);
4403 if (ret < 0)
4404 goto out;
4405 ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
4406 sctx->cmp_key, 0, __record_changed_deleted_ref, sctx);
4407 if (ret < 0)
4408 goto out;
4409 ret = 0;
4410
4411out:
4412 return ret;
4413}
4414
4415/*
4416 * Record and process all refs at once. Needed when an inode changes the
4417 * generation number, which means that it was deleted and recreated.
4418 */
4419static int process_all_refs(struct send_ctx *sctx,
4420 enum btrfs_compare_tree_result cmd)
4421{
4422 int ret;
4423 struct btrfs_root *root;
4424 struct btrfs_path *path;
4425 struct btrfs_key key;
4426 struct btrfs_key found_key;
4427 struct extent_buffer *eb;
4428 int slot;
4429 iterate_inode_ref_t cb;
4430 int pending_move = 0;
4431
4432 path = alloc_path_for_send();
4433 if (!path)
4434 return -ENOMEM;
4435
4436 if (cmd == BTRFS_COMPARE_TREE_NEW) {
4437 root = sctx->send_root;
4438 cb = __record_new_ref;
4439 } else if (cmd == BTRFS_COMPARE_TREE_DELETED) {
4440 root = sctx->parent_root;
4441 cb = __record_deleted_ref;
4442 } else {
4443 btrfs_err(sctx->send_root->fs_info,
4444 "Wrong command %d in process_all_refs", cmd);
4445 ret = -EINVAL;
4446 goto out;
4447 }
4448
4449 key.objectid = sctx->cmp_key->objectid;
4450 key.type = BTRFS_INODE_REF_KEY;
4451 key.offset = 0;
4452 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4453 if (ret < 0)
4454 goto out;
4455
4456 while (1) {
4457 eb = path->nodes[0];
4458 slot = path->slots[0];
4459 if (slot >= btrfs_header_nritems(eb)) {
4460 ret = btrfs_next_leaf(root, path);
4461 if (ret < 0)
4462 goto out;
4463 else if (ret > 0)
4464 break;
4465 continue;
4466 }
4467
4468 btrfs_item_key_to_cpu(eb, &found_key, slot);
4469
4470 if (found_key.objectid != key.objectid ||
4471 (found_key.type != BTRFS_INODE_REF_KEY &&
4472 found_key.type != BTRFS_INODE_EXTREF_KEY))
4473 break;
4474
4475 ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
4476 if (ret < 0)
4477 goto out;
4478
4479 path->slots[0]++;
4480 }
4481 btrfs_release_path(path);
4482
4483 /*
4484 * We don't actually care about pending_move as we are simply
4485 * re-creating this inode and will be rename'ing it into place once we
4486 * rename the parent directory.
4487 */
4488 ret = process_recorded_refs(sctx, &pending_move);
4489out:
4490 btrfs_free_path(path);
4491 return ret;
4492}
4493
4494static int send_set_xattr(struct send_ctx *sctx,
4495 struct fs_path *path,
4496 const char *name, int name_len,
4497 const char *data, int data_len)
4498{
4499 int ret = 0;
4500
4501 ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR);
4502 if (ret < 0)
4503 goto out;
4504
4505 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
4506 TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
4507 TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len);
4508
4509 ret = send_cmd(sctx);
4510
4511tlv_put_failure:
4512out:
4513 return ret;
4514}
4515
4516static int send_remove_xattr(struct send_ctx *sctx,
4517 struct fs_path *path,
4518 const char *name, int name_len)
4519{
4520 int ret = 0;
4521
4522 ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR);
4523 if (ret < 0)
4524 goto out;
4525
4526 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
4527 TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
4528
4529 ret = send_cmd(sctx);
4530
4531tlv_put_failure:
4532out:
4533 return ret;
4534}
4535
4536static int __process_new_xattr(int num, struct btrfs_key *di_key,
4537 const char *name, int name_len,
4538 const char *data, int data_len,
4539 u8 type, void *ctx)
4540{
4541 int ret;
4542 struct send_ctx *sctx = ctx;
4543 struct fs_path *p;
4544 struct posix_acl_xattr_header dummy_acl;
4545
4546 p = fs_path_alloc();
4547 if (!p)
4548 return -ENOMEM;
4549
4550 /*
4551 * This hack is needed because empty acls are stored as zero byte
4552 * data in xattrs. Problem with that is, that receiving these zero byte
4553 * acls will fail later. To fix this, we send a dummy acl list that
4554 * only contains the version number and no entries.
4555 */
4556 if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) ||
4557 !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) {
4558 if (data_len == 0) {
4559 dummy_acl.a_version =
4560 cpu_to_le32(POSIX_ACL_XATTR_VERSION);
4561 data = (char *)&dummy_acl;
4562 data_len = sizeof(dummy_acl);
4563 }
4564 }
4565
4566 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
4567 if (ret < 0)
4568 goto out;
4569
4570 ret = send_set_xattr(sctx, p, name, name_len, data, data_len);
4571
4572out:
4573 fs_path_free(p);
4574 return ret;
4575}
4576
4577static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
4578 const char *name, int name_len,
4579 const char *data, int data_len,
4580 u8 type, void *ctx)
4581{
4582 int ret;
4583 struct send_ctx *sctx = ctx;
4584 struct fs_path *p;
4585
4586 p = fs_path_alloc();
4587 if (!p)
4588 return -ENOMEM;
4589
4590 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
4591 if (ret < 0)
4592 goto out;
4593
4594 ret = send_remove_xattr(sctx, p, name, name_len);
4595
4596out:
4597 fs_path_free(p);
4598 return ret;
4599}
4600
4601static int process_new_xattr(struct send_ctx *sctx)
4602{
4603 int ret = 0;
4604
4605 ret = iterate_dir_item(sctx->send_root, sctx->left_path,
4606 __process_new_xattr, sctx);
4607
4608 return ret;
4609}
4610
4611static int process_deleted_xattr(struct send_ctx *sctx)
4612{
4613 return iterate_dir_item(sctx->parent_root, sctx->right_path,
4614 __process_deleted_xattr, sctx);
4615}
4616
/* Search state shared between find_xattr() and its callback __find_xattr(). */
struct find_xattr_ctx {
	/* Name of the wanted xattr (not necessarily NUL terminated). */
	const char *name;
	/* Length of the name; index of the match, or -1 while not found. */
	int name_len, found_idx;
	/* kmemdup()'ed copy of the matching xattr's value. */
	char *found_data;
	/* Length of found_data in bytes. */
	int found_data_len;
};
4624
4625static int __find_xattr(int num, struct btrfs_key *di_key,
4626 const char *name, int name_len,
4627 const char *data, int data_len,
4628 u8 type, void *vctx)
4629{
4630 struct find_xattr_ctx *ctx = vctx;
4631
4632 if (name_len == ctx->name_len &&
4633 strncmp(name, ctx->name, name_len) == 0) {
4634 ctx->found_idx = num;
4635 ctx->found_data_len = data_len;
4636 ctx->found_data = kmemdup(data, data_len, GFP_KERNEL);
4637 if (!ctx->found_data)
4638 return -ENOMEM;
4639 return 1;
4640 }
4641 return 0;
4642}
4643
4644static int find_xattr(struct btrfs_root *root,
4645 struct btrfs_path *path,
4646 struct btrfs_key *key,
4647 const char *name, int name_len,
4648 char **data, int *data_len)
4649{
4650 int ret;
4651 struct find_xattr_ctx ctx;
4652
4653 ctx.name = name;
4654 ctx.name_len = name_len;
4655 ctx.found_idx = -1;
4656 ctx.found_data = NULL;
4657 ctx.found_data_len = 0;
4658
4659 ret = iterate_dir_item(root, path, __find_xattr, &ctx);
4660 if (ret < 0)
4661 return ret;
4662
4663 if (ctx.found_idx == -1)
4664 return -ENOENT;
4665 if (data) {
4666 *data = ctx.found_data;
4667 *data_len = ctx.found_data_len;
4668 } else {
4669 kfree(ctx.found_data);
4670 }
4671 return ctx.found_idx;
4672}
4673
4674
4675static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
4676 const char *name, int name_len,
4677 const char *data, int data_len,
4678 u8 type, void *ctx)
4679{
4680 int ret;
4681 struct send_ctx *sctx = ctx;
4682 char *found_data = NULL;
4683 int found_data_len = 0;
4684
4685 ret = find_xattr(sctx->parent_root, sctx->right_path,
4686 sctx->cmp_key, name, name_len, &found_data,
4687 &found_data_len);
4688 if (ret == -ENOENT) {
4689 ret = __process_new_xattr(num, di_key, name, name_len, data,
4690 data_len, type, ctx);
4691 } else if (ret >= 0) {
4692 if (data_len != found_data_len ||
4693 memcmp(data, found_data, data_len)) {
4694 ret = __process_new_xattr(num, di_key, name, name_len,
4695 data, data_len, type, ctx);
4696 } else {
4697 ret = 0;
4698 }
4699 }
4700
4701 kfree(found_data);
4702 return ret;
4703}
4704
4705static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
4706 const char *name, int name_len,
4707 const char *data, int data_len,
4708 u8 type, void *ctx)
4709{
4710 int ret;
4711 struct send_ctx *sctx = ctx;
4712
4713 ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key,
4714 name, name_len, NULL, NULL);
4715 if (ret == -ENOENT)
4716 ret = __process_deleted_xattr(num, di_key, name, name_len, data,
4717 data_len, type, ctx);
4718 else if (ret >= 0)
4719 ret = 0;
4720
4721 return ret;
4722}
4723
4724static int process_changed_xattr(struct send_ctx *sctx)
4725{
4726 int ret = 0;
4727
4728 ret = iterate_dir_item(sctx->send_root, sctx->left_path,
4729 __process_changed_new_xattr, sctx);
4730 if (ret < 0)
4731 goto out;
4732 ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
4733 __process_changed_deleted_xattr, sctx);
4734
4735out:
4736 return ret;
4737}
4738
4739static int process_all_new_xattrs(struct send_ctx *sctx)
4740{
4741 int ret;
4742 struct btrfs_root *root;
4743 struct btrfs_path *path;
4744 struct btrfs_key key;
4745 struct btrfs_key found_key;
4746 struct extent_buffer *eb;
4747 int slot;
4748
4749 path = alloc_path_for_send();
4750 if (!path)
4751 return -ENOMEM;
4752
4753 root = sctx->send_root;
4754
4755 key.objectid = sctx->cmp_key->objectid;
4756 key.type = BTRFS_XATTR_ITEM_KEY;
4757 key.offset = 0;
4758 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4759 if (ret < 0)
4760 goto out;
4761
4762 while (1) {
4763 eb = path->nodes[0];
4764 slot = path->slots[0];
4765 if (slot >= btrfs_header_nritems(eb)) {
4766 ret = btrfs_next_leaf(root, path);
4767 if (ret < 0) {
4768 goto out;
4769 } else if (ret > 0) {
4770 ret = 0;
4771 break;
4772 }
4773 continue;
4774 }
4775
4776 btrfs_item_key_to_cpu(eb, &found_key, slot);
4777 if (found_key.objectid != key.objectid ||
4778 found_key.type != key.type) {
4779 ret = 0;
4780 goto out;
4781 }
4782
4783 ret = iterate_dir_item(root, path, __process_new_xattr, sctx);
4784 if (ret < 0)
4785 goto out;
4786
4787 path->slots[0]++;
4788 }
4789
4790out:
4791 btrfs_free_path(path);
4792 return ret;
4793}
4794
4795static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
4796{
4797 struct btrfs_root *root = sctx->send_root;
4798 struct btrfs_fs_info *fs_info = root->fs_info;
4799 struct inode *inode;
4800 struct page *page;
4801 char *addr;
4802 struct btrfs_key key;
4803 pgoff_t index = offset >> PAGE_SHIFT;
4804 pgoff_t last_index;
4805 unsigned pg_offset = offset & ~PAGE_MASK;
4806 ssize_t ret = 0;
4807
4808 key.objectid = sctx->cur_ino;
4809 key.type = BTRFS_INODE_ITEM_KEY;
4810 key.offset = 0;
4811
4812 inode = btrfs_iget(fs_info->sb, &key, root, NULL);
4813 if (IS_ERR(inode))
4814 return PTR_ERR(inode);
4815
4816 if (offset + len > i_size_read(inode)) {
4817 if (offset > i_size_read(inode))
4818 len = 0;
4819 else
4820 len = offset - i_size_read(inode);
4821 }
4822 if (len == 0)
4823 goto out;
4824
4825 last_index = (offset + len - 1) >> PAGE_SHIFT;
4826
4827 /* initial readahead */
4828 memset(&sctx->ra, 0, sizeof(struct file_ra_state));
4829 file_ra_state_init(&sctx->ra, inode->i_mapping);
4830
4831 while (index <= last_index) {
4832 unsigned cur_len = min_t(unsigned, len,
4833 PAGE_SIZE - pg_offset);
4834
4835 page = find_lock_page(inode->i_mapping, index);
4836 if (!page) {
4837 page_cache_sync_readahead(inode->i_mapping, &sctx->ra,
4838 NULL, index, last_index + 1 - index);
4839
4840 page = find_or_create_page(inode->i_mapping, index,
4841 GFP_KERNEL);
4842 if (!page) {
4843 ret = -ENOMEM;
4844 break;
4845 }
4846 }
4847
4848 if (PageReadahead(page)) {
4849 page_cache_async_readahead(inode->i_mapping, &sctx->ra,
4850 NULL, page, index, last_index + 1 - index);
4851 }
4852
4853 if (!PageUptodate(page)) {
4854 btrfs_readpage(NULL, page);
4855 lock_page(page);
4856 if (!PageUptodate(page)) {
4857 unlock_page(page);
4858 put_page(page);
4859 ret = -EIO;
4860 break;
4861 }
4862 }
4863
4864 addr = kmap(page);
4865 memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len);
4866 kunmap(page);
4867 unlock_page(page);
4868 put_page(page);
4869 index++;
4870 pg_offset = 0;
4871 len -= cur_len;
4872 ret += cur_len;
4873 }
4874out:
4875 iput(inode);
4876 return ret;
4877}
4878
4879/*
4880 * Read some bytes from the current inode/file and send a write command to
4881 * user space.
4882 */
4883static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
4884{
4885 struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
4886 int ret = 0;
4887 struct fs_path *p;
4888 ssize_t num_read = 0;
4889
4890 p = fs_path_alloc();
4891 if (!p)
4892 return -ENOMEM;
4893
4894 btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len);
4895
4896 num_read = fill_read_buf(sctx, offset, len);
4897 if (num_read <= 0) {
4898 if (num_read < 0)
4899 ret = num_read;
4900 goto out;
4901 }
4902
4903 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
4904 if (ret < 0)
4905 goto out;
4906
4907 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
4908 if (ret < 0)
4909 goto out;
4910
4911 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
4912 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
4913 TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read);
4914
4915 ret = send_cmd(sctx);
4916
4917tlv_put_failure:
4918out:
4919 fs_path_free(p);
4920 if (ret < 0)
4921 return ret;
4922 return num_read;
4923}
4924
4925/*
4926 * Send a clone command to user space.
4927 */
4928static int send_clone(struct send_ctx *sctx,
4929 u64 offset, u32 len,
4930 struct clone_root *clone_root)
4931{
4932 int ret = 0;
4933 struct fs_path *p;
4934 u64 gen;
4935
4936 btrfs_debug(sctx->send_root->fs_info,
4937 "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu",
4938 offset, len, clone_root->root->objectid, clone_root->ino,
4939 clone_root->offset);
4940
4941 p = fs_path_alloc();
4942 if (!p)
4943 return -ENOMEM;
4944
4945 ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE);
4946 if (ret < 0)
4947 goto out;
4948
4949 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
4950 if (ret < 0)
4951 goto out;
4952
4953 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
4954 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
4955 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
4956
4957 if (clone_root->root == sctx->send_root) {
4958 ret = get_inode_info(sctx->send_root, clone_root->ino, NULL,
4959 &gen, NULL, NULL, NULL, NULL);
4960 if (ret < 0)
4961 goto out;
4962 ret = get_cur_path(sctx, clone_root->ino, gen, p);
4963 } else {
4964 ret = get_inode_path(clone_root->root, clone_root->ino, p);
4965 }
4966 if (ret < 0)
4967 goto out;
4968
4969 /*
4970 * If the parent we're using has a received_uuid set then use that as
4971 * our clone source as that is what we will look for when doing a
4972 * receive.
4973 *
4974 * This covers the case that we create a snapshot off of a received
4975 * subvolume and then use that as the parent and try to receive on a
4976 * different host.
4977 */
4978 if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid))
4979 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
4980 clone_root->root->root_item.received_uuid);
4981 else
4982 TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
4983 clone_root->root->root_item.uuid);
4984 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
4985 le64_to_cpu(clone_root->root->root_item.ctransid));
4986 TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
4987 TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
4988 clone_root->offset);
4989
4990 ret = send_cmd(sctx);
4991
4992tlv_put_failure:
4993out:
4994 fs_path_free(p);
4995 return ret;
4996}
4997
4998/*
4999 * Send an update extent command to user space.
5000 */
5001static int send_update_extent(struct send_ctx *sctx,
5002 u64 offset, u32 len)
5003{
5004 int ret = 0;
5005 struct fs_path *p;
5006
5007 p = fs_path_alloc();
5008 if (!p)
5009 return -ENOMEM;
5010
5011 ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT);
5012 if (ret < 0)
5013 goto out;
5014
5015 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
5016 if (ret < 0)
5017 goto out;
5018
5019 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
5020 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5021 TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
5022
5023 ret = send_cmd(sctx);
5024
5025tlv_put_failure:
5026out:
5027 fs_path_free(p);
5028 return ret;
5029}
5030
5031static int send_hole(struct send_ctx *sctx, u64 end)
5032{
5033 struct fs_path *p = NULL;
5034 u64 offset = sctx->cur_inode_last_extent;
5035 u64 len;
5036 int ret = 0;
5037
5038 /*
5039 * A hole that starts at EOF or beyond it. Since we do not yet support
5040 * fallocate (for extent preallocation and hole punching), sending a
5041 * write of zeroes starting at EOF or beyond would later require issuing
5042 * a truncate operation which would undo the write and achieve nothing.
5043 */
5044 if (offset >= sctx->cur_inode_size)
5045 return 0;
5046
5047 /*
5048 * Don't go beyond the inode's i_size due to prealloc extents that start
5049 * after the i_size.
5050 */
5051 end = min_t(u64, end, sctx->cur_inode_size);
5052
5053 if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
5054 return send_update_extent(sctx, offset, end - offset);
5055
5056 p = fs_path_alloc();
5057 if (!p)
5058 return -ENOMEM;
5059 ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
5060 if (ret < 0)
5061 goto tlv_put_failure;
5062 memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE);
5063 while (offset < end) {
5064 len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE);
5065
5066 ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
5067 if (ret < 0)
5068 break;
5069 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
5070 TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
5071 TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len);
5072 ret = send_cmd(sctx);
5073 if (ret < 0)
5074 break;
5075 offset += len;
5076 }
5077 sctx->cur_inode_next_write_offset = offset;
5078tlv_put_failure:
5079 fs_path_free(p);
5080 return ret;
5081}
5082
5083static int send_extent_data(struct send_ctx *sctx,
5084 const u64 offset,
5085 const u64 len)
5086{
5087 u64 sent = 0;
5088
5089 if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
5090 return send_update_extent(sctx, offset, len);
5091
5092 while (sent < len) {
5093 u64 size = len - sent;
5094 int ret;
5095
5096 if (size > BTRFS_SEND_READ_SIZE)
5097 size = BTRFS_SEND_READ_SIZE;
5098 ret = send_write(sctx, offset + sent, size);
5099 if (ret < 0)
5100 return ret;
5101 if (!ret)
5102 break;
5103 sent += ret;
5104 }
5105 return 0;
5106}
5107
/*
 * Send the file range [@offset, @offset + @len) of the current inode, using
 * clone operations from @clone_root where possible and regular data sends
 * (send_extent_data()) everywhere else.
 *
 * @disk_byte and @data_offset identify the extent (disk bytenr and offset
 * into it) that the range in the current inode refers to. The file extent
 * items of the clone source inode are walked starting at clone_root->offset;
 * a piece is cloned only when the source item has the same disk bytenr and
 * extent offset.
 *
 * Note that clone_root->offset is advanced in place while the range is
 * consumed, so the caller's clone_root is modified.
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, or the error
 * of the first helper that failed.
 */
static int clone_range(struct send_ctx *sctx,
		       struct clone_root *clone_root,
		       const u64 disk_byte,
		       u64 data_offset,
		       u64 offset,
		       u64 len)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	/*
	 * Prevent cloning from a zero offset with a length matching the sector
	 * size because in some scenarios this will make the receiver fail.
	 *
	 * For example, if in the source filesystem the extent at offset 0
	 * has a length of sectorsize and it was written using direct IO, then
	 * it can never be an inline extent (even if compression is enabled).
	 * Then this extent can be cloned in the original filesystem to a non
	 * zero file offset, but it may not be possible to clone in the
	 * destination filesystem because it can be inlined due to compression
	 * on the destination filesystem (as the receiver's write operations are
	 * always done using buffered IO). The same happens when the original
	 * filesystem does not have compression enabled but the destination
	 * filesystem has.
	 */
	if (clone_root->offset == 0 &&
	    len == sctx->send_root->fs_info->sectorsize)
		return send_extent_data(sctx, offset, len);

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	/*
	 * We can't send a clone operation for the entire range if we find
	 * extent items in the respective range in the source file that
	 * refer to different extents or if we find holes.
	 * So check for that and do a mix of clone and regular write/copy
	 * operations if needed.
	 *
	 * Example:
	 *
	 * mkfs.btrfs -f /dev/sda
	 * mount /dev/sda /mnt
	 * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo
	 * cp --reflink=always /mnt/foo /mnt/bar
	 * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo
	 * btrfs subvolume snapshot -r /mnt /mnt/snap
	 *
	 * If when we send the snapshot and we are processing file bar (which
	 * has a higher inode number than foo) we blindly send a clone operation
	 * for the [0, 100K[ range from foo to bar, the receiver ends up getting
	 * a file bar that matches the content of file foo - iow, doesn't match
	 * the content from bar in the original filesystem.
	 */
	key.objectid = clone_root->ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = clone_root->offset;
	ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	/*
	 * No exact match: step back one slot if the previous item is a file
	 * extent item of the source inode, as it may cover clone_root->offset.
	 */
	if (ret > 0 && path->slots[0] > 0) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
		if (key.objectid == clone_root->ino &&
		    key.type == BTRFS_EXTENT_DATA_KEY)
			path->slots[0]--;
	}

	while (true) {
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		struct btrfs_file_extent_item *ei;
		u8 type;
		u64 ext_len;
		u64 clone_len;

		/* Ran off the end of this leaf, move on to the next one. */
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(clone_root->root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, slot);

		/*
		 * We might have an implicit trailing hole (NO_HOLES feature
		 * enabled). We deal with it after leaving this loop.
		 */
		if (key.objectid != clone_root->ino ||
		    key.type != BTRFS_EXTENT_DATA_KEY)
			break;

		ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
		type = btrfs_file_extent_type(leaf, ei);
		if (type == BTRFS_FILE_EXTENT_INLINE) {
			ext_len = btrfs_file_extent_ram_bytes(leaf, ei);
			ext_len = PAGE_ALIGN(ext_len);
		} else {
			ext_len = btrfs_file_extent_num_bytes(leaf, ei);
		}

		/* Source extent ends before the range we still have to send. */
		if (key.offset + ext_len <= clone_root->offset)
			goto next;

		if (key.offset > clone_root->offset) {
			/* Implicit hole, NO_HOLES feature enabled. */
			u64 hole_len = key.offset - clone_root->offset;

			if (hole_len > len)
				hole_len = len;
			ret = send_extent_data(sctx, offset, hole_len);
			if (ret < 0)
				goto out;

			len -= hole_len;
			if (len == 0)
				break;
			offset += hole_len;
			clone_root->offset += hole_len;
			data_offset += hole_len;
		}

		/* Source extent starts at or after the end of our range. */
		if (key.offset >= clone_root->offset + len)
			break;

		clone_len = min_t(u64, ext_len, len);

		/*
		 * Clone only if the source item refers to the same extent at
		 * the same extent offset, otherwise send the data itself.
		 */
		if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte &&
		    btrfs_file_extent_offset(leaf, ei) == data_offset)
			ret = send_clone(sctx, offset, clone_len, clone_root);
		else
			ret = send_extent_data(sctx, offset, clone_len);

		if (ret < 0)
			goto out;

		len -= clone_len;
		if (len == 0)
			break;
		offset += clone_len;
		clone_root->offset += clone_len;
		data_offset += clone_len;
next:
		path->slots[0]++;
	}

	/* Whatever is left was not covered by any source extent item. */
	if (len > 0)
		ret = send_extent_data(sctx, offset, len);
	else
		ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}
5266
5267static int send_write_or_clone(struct send_ctx *sctx,
5268 struct btrfs_path *path,
5269 struct btrfs_key *key,
5270 struct clone_root *clone_root)
5271{
5272 int ret = 0;
5273 struct btrfs_file_extent_item *ei;
5274 u64 offset = key->offset;
5275 u64 len;
5276 u8 type;
5277 u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
5278
5279 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
5280 struct btrfs_file_extent_item);
5281 type = btrfs_file_extent_type(path->nodes[0], ei);
5282 if (type == BTRFS_FILE_EXTENT_INLINE) {
5283 len = btrfs_file_extent_ram_bytes(path->nodes[0], ei);
5284 /*
5285 * it is possible the inline item won't cover the whole page,
5286 * but there may be items after this page. Make
5287 * sure to send the whole thing
5288 */
5289 len = PAGE_ALIGN(len);
5290 } else {
5291 len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
5292 }
5293
5294 if (offset >= sctx->cur_inode_size) {
5295 ret = 0;
5296 goto out;
5297 }
5298 if (offset + len > sctx->cur_inode_size)
5299 len = sctx->cur_inode_size - offset;
5300 if (len == 0) {
5301 ret = 0;
5302 goto out;
5303 }
5304
5305 if (clone_root && IS_ALIGNED(offset + len, bs)) {
5306 u64 disk_byte;
5307 u64 data_offset;
5308
5309 disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
5310 data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
5311 ret = clone_range(sctx, clone_root, disk_byte, data_offset,
5312 offset, len);
5313 } else {
5314 ret = send_extent_data(sctx, offset, len);
5315 }
5316 sctx->cur_inode_next_write_offset = offset + len;
5317out:
5318 return ret;
5319}
5320
/*
 * Check whether the file extent item that @left_path points to (key @ekey,
 * from the send root side) is covered by identical extents in the parent
 * root (sctx->parent_root).
 *
 * "Identical" means that every right side extent overlapping the left extent
 * has the same disk bytenr, the same (adjusted) extent offset and the same
 * generation, and that the right side extents are contiguous.
 *
 * Returns 1 if the extent is considered unchanged, 0 if it is considered
 * changed, and a negative errno on failure.
 */
static int is_extent_unchanged(struct send_ctx *sctx,
			       struct btrfs_path *left_path,
			       struct btrfs_key *ekey)
{
	int ret = 0;
	struct btrfs_key key;
	struct btrfs_path *path = NULL;
	struct extent_buffer *eb;
	int slot;
	struct btrfs_key found_key;
	struct btrfs_file_extent_item *ei;
	u64 left_disknr;
	u64 right_disknr;
	u64 left_offset;
	u64 right_offset;
	u64 left_offset_fixed;
	u64 left_len;
	u64 right_len;
	u64 left_gen;
	u64 right_gen;
	u8 left_type;
	u8 right_type;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	eb = left_path->nodes[0];
	slot = left_path->slots[0];
	ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	left_type = btrfs_file_extent_type(eb, ei);

	/* Only regular extents are compared; anything else counts as changed. */
	if (left_type != BTRFS_FILE_EXTENT_REG) {
		ret = 0;
		goto out;
	}
	left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
	left_len = btrfs_file_extent_num_bytes(eb, ei);
	left_offset = btrfs_file_extent_offset(eb, ei);
	left_gen = btrfs_file_extent_generation(eb, ei);

	/*
	 * Following comments will refer to these graphics. L is the left
	 * extents which we are checking at the moment. 1-8 are the right
	 * extents that we iterate.
	 *
	 *       |-----L-----|
	 * |-1-|-2a-|-3-|-4-|-5-|-6-|
	 *
	 *       |-----L-----|
	 * |--1--|-2b-|...(same as above)
	 *
	 * Alternative situation. Happens on files where extents got split.
	 *       |-----L-----|
	 * |-----------7-----------|-6-|
	 *
	 * Alternative situation. Happens on files which got larger.
	 *       |-----L-----|
	 * |-8-|
	 * Nothing follows after 8.
	 */

	key.objectid = ekey->objectid;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = ekey->offset;
	ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	/* A non-zero (positive) search result is treated as "changed". */
	if (ret) {
		ret = 0;
		goto out;
	}

	/*
	 * Handle special case where the right side has no extents at all.
	 */
	eb = path->nodes[0];
	slot = path->slots[0];
	btrfs_item_key_to_cpu(eb, &found_key, slot);
	if (found_key.objectid != key.objectid ||
	    found_key.type != key.type) {
		/* If we're a hole then just pretend nothing changed */
		ret = (left_disknr) ? 0 : 1;
		goto out;
	}

	/*
	 * We're now on 2a, 2b or 7.
	 */
	key = found_key;
	while (key.offset < ekey->offset + left_len) {
		ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
		right_type = btrfs_file_extent_type(eb, ei);
		if (right_type != BTRFS_FILE_EXTENT_REG &&
		    right_type != BTRFS_FILE_EXTENT_INLINE) {
			ret = 0;
			goto out;
		}

		if (right_type == BTRFS_FILE_EXTENT_INLINE) {
			right_len = btrfs_file_extent_ram_bytes(eb, ei);
			right_len = PAGE_ALIGN(right_len);
		} else {
			right_len = btrfs_file_extent_num_bytes(eb, ei);
		}

		/*
		 * Are we at extent 8? If yes, we know the extent is changed.
		 * This may only happen on the first iteration.
		 */
		if (found_key.offset + right_len <= ekey->offset) {
			/* If we're a hole just pretend nothing changed */
			ret = (left_disknr) ? 0 : 1;
			goto out;
		}

		/*
		 * We just wanted to see if when we have an inline extent, what
		 * follows it is a regular extent (wanted to check the above
		 * condition for inline extents too). This should normally not
		 * happen but it's possible for example when we have an inline
		 * compressed extent representing data with a size matching
		 * the page size (currently the same as sector size).
		 */
		if (right_type == BTRFS_FILE_EXTENT_INLINE) {
			ret = 0;
			goto out;
		}

		right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
		right_offset = btrfs_file_extent_offset(eb, ei);
		right_gen = btrfs_file_extent_generation(eb, ei);

		left_offset_fixed = left_offset;
		if (key.offset < ekey->offset) {
			/* Fix the right offset for 2a and 7. */
			right_offset += ekey->offset - key.offset;
		} else {
			/* Fix the left offset for all behind 2a and 2b */
			left_offset_fixed += key.offset - ekey->offset;
		}

		/*
		 * Check if we have the same extent.
		 */
		if (left_disknr != right_disknr ||
		    left_offset_fixed != right_offset ||
		    left_gen != right_gen) {
			ret = 0;
			goto out;
		}

		/*
		 * Go to the next extent.
		 */
		ret = btrfs_next_item(sctx->parent_root, path);
		if (ret < 0)
			goto out;
		if (!ret) {
			eb = path->nodes[0];
			slot = path->slots[0];
			btrfs_item_key_to_cpu(eb, &found_key, slot);
		}
		/*
		 * No further file extent item for this inode on the right
		 * side: advance key.offset to the end of the last right
		 * extent so the check after the loop can tell whether it
		 * reached the end of the left extent.
		 */
		if (ret || found_key.objectid != key.objectid ||
		    found_key.type != key.type) {
			key.offset += right_len;
			break;
		}
		/* A gap between consecutive right extents means "changed". */
		if (found_key.offset != key.offset + right_len) {
			ret = 0;
			goto out;
		}
		key = found_key;
	}

	/*
	 * We're now behind the left extent (treat as unchanged) or at the end
	 * of the right side (treat as changed).
	 */
	if (key.offset >= ekey->offset + left_len)
		ret = 1;
	else
		ret = 0;


out:
	btrfs_free_path(path);
	return ret;
}
5510
5511static int get_last_extent(struct send_ctx *sctx, u64 offset)
5512{
5513 struct btrfs_path *path;
5514 struct btrfs_root *root = sctx->send_root;
5515 struct btrfs_file_extent_item *fi;
5516 struct btrfs_key key;
5517 u64 extent_end;
5518 u8 type;
5519 int ret;
5520
5521 path = alloc_path_for_send();
5522 if (!path)
5523 return -ENOMEM;
5524
5525 sctx->cur_inode_last_extent = 0;
5526
5527 key.objectid = sctx->cur_ino;
5528 key.type = BTRFS_EXTENT_DATA_KEY;
5529 key.offset = offset;
5530 ret = btrfs_search_slot_for_read(root, &key, path, 0, 1);
5531 if (ret < 0)
5532 goto out;
5533 ret = 0;
5534 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
5535 if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)
5536 goto out;
5537
5538 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
5539 struct btrfs_file_extent_item);
5540 type = btrfs_file_extent_type(path->nodes[0], fi);
5541 if (type == BTRFS_FILE_EXTENT_INLINE) {
5542 u64 size = btrfs_file_extent_ram_bytes(path->nodes[0], fi);
5543 extent_end = ALIGN(key.offset + size,
5544 sctx->send_root->fs_info->sectorsize);
5545 } else {
5546 extent_end = key.offset +
5547 btrfs_file_extent_num_bytes(path->nodes[0], fi);
5548 }
5549 sctx->cur_inode_last_extent = extent_end;
5550out:
5551 btrfs_free_path(path);
5552 return ret;
5553}
5554
5555static int range_is_hole_in_parent(struct send_ctx *sctx,
5556 const u64 start,
5557 const u64 end)
5558{
5559 struct btrfs_path *path;
5560 struct btrfs_key key;
5561 struct btrfs_root *root = sctx->parent_root;
5562 u64 search_start = start;
5563 int ret;
5564
5565 path = alloc_path_for_send();
5566 if (!path)
5567 return -ENOMEM;
5568
5569 key.objectid = sctx->cur_ino;
5570 key.type = BTRFS_EXTENT_DATA_KEY;
5571 key.offset = search_start;
5572 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5573 if (ret < 0)
5574 goto out;
5575 if (ret > 0 && path->slots[0] > 0)
5576 path->slots[0]--;
5577
5578 while (search_start < end) {
5579 struct extent_buffer *leaf = path->nodes[0];
5580 int slot = path->slots[0];
5581 struct btrfs_file_extent_item *fi;
5582 u64 extent_end;
5583
5584 if (slot >= btrfs_header_nritems(leaf)) {
5585 ret = btrfs_next_leaf(root, path);
5586 if (ret < 0)
5587 goto out;
5588 else if (ret > 0)
5589 break;
5590 continue;
5591 }
5592
5593 btrfs_item_key_to_cpu(leaf, &key, slot);
5594 if (key.objectid < sctx->cur_ino ||
5595 key.type < BTRFS_EXTENT_DATA_KEY)
5596 goto next;
5597 if (key.objectid > sctx->cur_ino ||
5598 key.type > BTRFS_EXTENT_DATA_KEY ||
5599 key.offset >= end)
5600 break;
5601
5602 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
5603 if (btrfs_file_extent_type(leaf, fi) ==
5604 BTRFS_FILE_EXTENT_INLINE) {
5605 u64 size = btrfs_file_extent_ram_bytes(leaf, fi);
5606
5607 extent_end = ALIGN(key.offset + size,
5608 root->fs_info->sectorsize);
5609 } else {
5610 extent_end = key.offset +
5611 btrfs_file_extent_num_bytes(leaf, fi);
5612 }
5613 if (extent_end <= start)
5614 goto next;
5615 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) {
5616 search_start = extent_end;
5617 goto next;
5618 }
5619 ret = 0;
5620 goto out;
5621next:
5622 path->slots[0]++;
5623 }
5624 ret = 1;
5625out:
5626 btrfs_free_path(path);
5627 return ret;
5628}
5629
5630static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
5631 struct btrfs_key *key)
5632{
5633 struct btrfs_file_extent_item *fi;
5634 u64 extent_end;
5635 u8 type;
5636 int ret = 0;
5637
5638 if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
5639 return 0;
5640
5641 if (sctx->cur_inode_last_extent == (u64)-1) {
5642 ret = get_last_extent(sctx, key->offset - 1);
5643 if (ret)
5644 return ret;
5645 }
5646
5647 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
5648 struct btrfs_file_extent_item);
5649 type = btrfs_file_extent_type(path->nodes[0], fi);
5650 if (type == BTRFS_FILE_EXTENT_INLINE) {
5651 u64 size = btrfs_file_extent_ram_bytes(path->nodes[0], fi);
5652 extent_end = ALIGN(key->offset + size,
5653 sctx->send_root->fs_info->sectorsize);
5654 } else {
5655 extent_end = key->offset +
5656 btrfs_file_extent_num_bytes(path->nodes[0], fi);
5657 }
5658
5659 if (path->slots[0] == 0 &&
5660 sctx->cur_inode_last_extent < key->offset) {
5661 /*
5662 * We might have skipped entire leafs that contained only
5663 * file extent items for our current inode. These leafs have
5664 * a generation number smaller (older) than the one in the
5665 * current leaf and the leaf our last extent came from, and
5666 * are located between these 2 leafs.
5667 */
5668 ret = get_last_extent(sctx, key->offset - 1);
5669 if (ret)
5670 return ret;
5671 }
5672
5673 if (sctx->cur_inode_last_extent < key->offset) {
5674 ret = range_is_hole_in_parent(sctx,
5675 sctx->cur_inode_last_extent,
5676 key->offset);
5677 if (ret < 0)
5678 return ret;
5679 else if (ret == 0)
5680 ret = send_hole(sctx, key->offset);
5681 else
5682 ret = 0;
5683 }
5684 sctx->cur_inode_last_extent = extent_end;
5685 return ret;
5686}
5687
5688static int process_extent(struct send_ctx *sctx,
5689 struct btrfs_path *path,
5690 struct btrfs_key *key)
5691{
5692 struct clone_root *found_clone = NULL;
5693 int ret = 0;
5694
5695 if (S_ISLNK(sctx->cur_inode_mode))
5696 return 0;
5697
5698 if (sctx->parent_root && !sctx->cur_inode_new) {
5699 ret = is_extent_unchanged(sctx, path, key);
5700 if (ret < 0)
5701 goto out;
5702 if (ret) {
5703 ret = 0;
5704 goto out_hole;
5705 }
5706 } else {
5707 struct btrfs_file_extent_item *ei;
5708 u8 type;
5709
5710 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
5711 struct btrfs_file_extent_item);
5712 type = btrfs_file_extent_type(path->nodes[0], ei);
5713 if (type == BTRFS_FILE_EXTENT_PREALLOC ||
5714 type == BTRFS_FILE_EXTENT_REG) {
5715 /*
5716 * The send spec does not have a prealloc command yet,
5717 * so just leave a hole for prealloc'ed extents until
5718 * we have enough commands queued up to justify rev'ing
5719 * the send spec.
5720 */
5721 if (type == BTRFS_FILE_EXTENT_PREALLOC) {
5722 ret = 0;
5723 goto out;
5724 }
5725
5726 /* Have a hole, just skip it. */
5727 if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) {
5728 ret = 0;
5729 goto out;
5730 }
5731 }
5732 }
5733
5734 ret = find_extent_clone(sctx, path, key->objectid, key->offset,
5735 sctx->cur_inode_size, &found_clone);
5736 if (ret != -ENOENT && ret < 0)
5737 goto out;
5738
5739 ret = send_write_or_clone(sctx, path, key, found_clone);
5740 if (ret)
5741 goto out;
5742out_hole:
5743 ret = maybe_send_hole(sctx, path, key);
5744out:
5745 return ret;
5746}
5747
5748static int process_all_extents(struct send_ctx *sctx)
5749{
5750 int ret;
5751 struct btrfs_root *root;
5752 struct btrfs_path *path;
5753 struct btrfs_key key;
5754 struct btrfs_key found_key;
5755 struct extent_buffer *eb;
5756 int slot;
5757
5758 root = sctx->send_root;
5759 path = alloc_path_for_send();
5760 if (!path)
5761 return -ENOMEM;
5762
5763 key.objectid = sctx->cmp_key->objectid;
5764 key.type = BTRFS_EXTENT_DATA_KEY;
5765 key.offset = 0;
5766 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5767 if (ret < 0)
5768 goto out;
5769
5770 while (1) {
5771 eb = path->nodes[0];
5772 slot = path->slots[0];
5773
5774 if (slot >= btrfs_header_nritems(eb)) {
5775 ret = btrfs_next_leaf(root, path);
5776 if (ret < 0) {
5777 goto out;
5778 } else if (ret > 0) {
5779 ret = 0;
5780 break;
5781 }
5782 continue;
5783 }
5784
5785 btrfs_item_key_to_cpu(eb, &found_key, slot);
5786
5787 if (found_key.objectid != key.objectid ||
5788 found_key.type != key.type) {
5789 ret = 0;
5790 goto out;
5791 }
5792
5793 ret = process_extent(sctx, path, &found_key);
5794 if (ret < 0)
5795 goto out;
5796
5797 path->slots[0]++;
5798 }
5799
5800out:
5801 btrfs_free_path(path);
5802 return ret;
5803}
5804
5805static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end,
5806 int *pending_move,
5807 int *refs_processed)
5808{
5809 int ret = 0;
5810
5811 if (sctx->cur_ino == 0)
5812 goto out;
5813 if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid &&
5814 sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY)
5815 goto out;
5816 if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))
5817 goto out;
5818
5819 ret = process_recorded_refs(sctx, pending_move);
5820 if (ret < 0)
5821 goto out;
5822
5823 *refs_processed = 1;
5824out:
5825 return ret;
5826}
5827
/*
 * Finish processing of the current inode, if it is time to do so.
 *
 * First the recorded refs are processed (if needed). Then, unless the inode
 * is deleted, there is no current inode, or (not @at_end) the compare key
 * still belongs to the current inode, the remaining per-inode commands are
 * sent in this order: trailing hole and truncate (regular files only),
 * chown, chmod, pending child directory moves and finally utimes.
 *
 * Returns 0 on success or the error of the first helper that failed.
 */
static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
{
	int ret = 0;
	u64 left_mode;
	u64 left_uid;
	u64 left_gid;
	u64 right_mode;
	u64 right_uid;
	u64 right_gid;
	int need_chmod = 0;
	int need_chown = 0;
	int need_truncate = 1;
	int pending_move = 0;
	int refs_processed = 0;

	if (sctx->ignore_cur_inode)
		return 0;

	ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move,
					      &refs_processed);
	if (ret < 0)
		goto out;

	/*
	 * We have processed the refs and thus need to advance send_progress.
	 * Now, calls to get_cur_xxx will take the updated refs of the current
	 * inode into account.
	 *
	 * On the other hand, if our current inode is a directory and couldn't
	 * be moved/renamed because its parent was renamed/moved too and it has
	 * a higher inode number, we can only move/rename our current inode
	 * after we moved/renamed its parent. Therefore in this case operate on
	 * the old path (pre move/rename) of our current inode, and the
	 * move/rename will be performed later.
	 */
	if (refs_processed && !pending_move)
		sctx->send_progress = sctx->cur_ino + 1;

	if (sctx->cur_ino == 0 || sctx->cur_inode_deleted)
		goto out;
	if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
		goto out;

	/* Mode and ownership of the inode in the send root ("left" side). */
	ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL,
			&left_mode, &left_uid, &left_gid, NULL);
	if (ret < 0)
		goto out;

	if (!sctx->parent_root || sctx->cur_inode_new) {
		/*
		 * Full send or new inode: always chown, chmod everything but
		 * symlinks, and skip the truncate only if the writes already
		 * reached exactly the inode size.
		 */
		need_chown = 1;
		if (!S_ISLNK(sctx->cur_inode_mode))
			need_chmod = 1;
		if (sctx->cur_inode_next_write_offset == sctx->cur_inode_size)
			need_truncate = 0;
	} else {
		u64 old_size;

		/* Incremental send: compare against the parent ("right"). */
		ret = get_inode_info(sctx->parent_root, sctx->cur_ino,
				&old_size, NULL, &right_mode, &right_uid,
				&right_gid, NULL);
		if (ret < 0)
			goto out;

		if (left_uid != right_uid || left_gid != right_gid)
			need_chown = 1;
		if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode)
			need_chmod = 1;
		/*
		 * No truncate if the size did not change, or if the file grew
		 * and the writes already reached exactly the new size.
		 */
		if ((old_size == sctx->cur_inode_size) ||
		    (sctx->cur_inode_size > old_size &&
		     sctx->cur_inode_next_write_offset == sctx->cur_inode_size))
			need_truncate = 0;
	}

	if (S_ISREG(sctx->cur_inode_mode)) {
		if (need_send_hole(sctx)) {
			/*
			 * Refresh the end of the last extent if it is unknown
			 * or before i_size, then send a trailing hole up to
			 * i_size if there still is a gap.
			 */
			if (sctx->cur_inode_last_extent == (u64)-1 ||
			    sctx->cur_inode_last_extent <
			    sctx->cur_inode_size) {
				ret = get_last_extent(sctx, (u64)-1);
				if (ret)
					goto out;
			}
			if (sctx->cur_inode_last_extent <
			    sctx->cur_inode_size) {
				ret = send_hole(sctx, sctx->cur_inode_size);
				if (ret)
					goto out;
			}
		}
		if (need_truncate) {
			ret = send_truncate(sctx, sctx->cur_ino,
					    sctx->cur_inode_gen,
					    sctx->cur_inode_size);
			if (ret < 0)
				goto out;
		}
	}

	if (need_chown) {
		ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen,
				left_uid, left_gid);
		if (ret < 0)
			goto out;
	}
	if (need_chmod) {
		ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen,
				left_mode);
		if (ret < 0)
			goto out;
	}

	/*
	 * If other directory inodes depended on our current directory
	 * inode's move/rename, now do their move/rename operations.
	 */
	if (!is_waiting_for_move(sctx, sctx->cur_ino)) {
		ret = apply_children_dir_moves(sctx);
		if (ret)
			goto out;
		/*
		 * Need to send that every time, no matter if it actually
		 * changed between the two trees as we have done changes to
		 * the inode before. If our inode is a directory and it's
		 * waiting to be moved/renamed, we will send its utimes when
		 * it's moved/renamed, therefore we don't need to do it here.
		 */
		sctx->send_progress = sctx->cur_ino + 1;
		ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
		if (ret < 0)
			goto out;
	}

out:
	return ret;
}
5963
/*
 * Context passed as the opaque ctx pointer to record_parent_ref(), which
 * forwards both fields to record_ref().
 */
struct parent_paths_ctx {
	/* List handed to record_ref() as the list to record into. */
	struct list_head *refs;
	/* Send context; its parent_root is the root handed to record_ref(). */
	struct send_ctx *sctx;
};
5968
5969static int record_parent_ref(int num, u64 dir, int index, struct fs_path *name,
5970 void *ctx)
5971{
5972 struct parent_paths_ctx *ppctx = ctx;
5973
5974 return record_ref(ppctx->sctx->parent_root, dir, name, ppctx->sctx,
5975 ppctx->refs);
5976}
5977
5978/*
5979 * Issue unlink operations for all paths of the current inode found in the
5980 * parent snapshot.
5981 */
5982static int btrfs_unlink_all_paths(struct send_ctx *sctx)
5983{
5984 LIST_HEAD(deleted_refs);
5985 struct btrfs_path *path;
5986 struct btrfs_key key;
5987 struct parent_paths_ctx ctx;
5988 int ret;
5989
5990 path = alloc_path_for_send();
5991 if (!path)
5992 return -ENOMEM;
5993
5994 key.objectid = sctx->cur_ino;
5995 key.type = BTRFS_INODE_REF_KEY;
5996 key.offset = 0;
5997 ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
5998 if (ret < 0)
5999 goto out;
6000
6001 ctx.refs = &deleted_refs;
6002 ctx.sctx = sctx;
6003
6004 while (true) {
6005 struct extent_buffer *eb = path->nodes[0];
6006 int slot = path->slots[0];
6007
6008 if (slot >= btrfs_header_nritems(eb)) {
6009 ret = btrfs_next_leaf(sctx->parent_root, path);
6010 if (ret < 0)
6011 goto out;
6012 else if (ret > 0)
6013 break;
6014 continue;
6015 }
6016
6017 btrfs_item_key_to_cpu(eb, &key, slot);
6018 if (key.objectid != sctx->cur_ino)
6019 break;
6020 if (key.type != BTRFS_INODE_REF_KEY &&
6021 key.type != BTRFS_INODE_EXTREF_KEY)
6022 break;
6023
6024 ret = iterate_inode_ref(sctx->parent_root, path, &key, 1,
6025 record_parent_ref, &ctx);
6026 if (ret < 0)
6027 goto out;
6028
6029 path->slots[0]++;
6030 }
6031
6032 while (!list_empty(&deleted_refs)) {
6033 struct recorded_ref *ref;
6034
6035 ref = list_first_entry(&deleted_refs, struct recorded_ref, list);
6036 ret = send_unlink(sctx, ref->full_path);
6037 if (ret < 0)
6038 goto out;
6039 fs_path_free(ref->full_path);
6040 list_del(&ref->list);
6041 kfree(ref);
6042 }
6043 ret = 0;
6044out:
6045 btrfs_free_path(path);
6046 if (ret)
6047 __free_recorded_refs(&deleted_refs);
6048 return ret;
6049}
6050
6051static int changed_inode(struct send_ctx *sctx,
6052 enum btrfs_compare_tree_result result)
6053{
6054 int ret = 0;
6055 struct btrfs_key *key = sctx->cmp_key;
6056 struct btrfs_inode_item *left_ii = NULL;
6057 struct btrfs_inode_item *right_ii = NULL;
6058 u64 left_gen = 0;
6059 u64 right_gen = 0;
6060
6061 sctx->cur_ino = key->objectid;
6062 sctx->cur_inode_new_gen = 0;
6063 sctx->cur_inode_last_extent = (u64)-1;
6064 sctx->cur_inode_next_write_offset = 0;
6065 sctx->ignore_cur_inode = false;
6066
6067 /*
6068 * Set send_progress to current inode. This will tell all get_cur_xxx
6069 * functions that the current inode's refs are not updated yet. Later,
6070 * when process_recorded_refs is finished, it is set to cur_ino + 1.
6071 */
6072 sctx->send_progress = sctx->cur_ino;
6073
6074 if (result == BTRFS_COMPARE_TREE_NEW ||
6075 result == BTRFS_COMPARE_TREE_CHANGED) {
6076 left_ii = btrfs_item_ptr(sctx->left_path->nodes[0],
6077 sctx->left_path->slots[0],
6078 struct btrfs_inode_item);
6079 left_gen = btrfs_inode_generation(sctx->left_path->nodes[0],
6080 left_ii);
6081 } else {
6082 right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
6083 sctx->right_path->slots[0],
6084 struct btrfs_inode_item);
6085 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
6086 right_ii);
6087 }
6088 if (result == BTRFS_COMPARE_TREE_CHANGED) {
6089 right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
6090 sctx->right_path->slots[0],
6091 struct btrfs_inode_item);
6092
6093 right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
6094 right_ii);
6095
6096 /*
6097 * The cur_ino = root dir case is special here. We can't treat
6098 * the inode as deleted+reused because it would generate a
6099 * stream that tries to delete/mkdir the root dir.
6100 */
6101 if (left_gen != right_gen &&
6102 sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
6103 sctx->cur_inode_new_gen = 1;
6104 }
6105
6106 /*
6107 * Normally we do not find inodes with a link count of zero (orphans)
6108 * because the most common case is to create a snapshot and use it
6109 * for a send operation. However other less common use cases involve
6110 * using a subvolume and send it after turning it to RO mode just
6111 * after deleting all hard links of a file while holding an open
6112 * file descriptor against it or turning a RO snapshot into RW mode,
6113 * keep an open file descriptor against a file, delete it and then
6114 * turn the snapshot back to RO mode before using it for a send
6115 * operation. So if we find such cases, ignore the inode and all its
6116 * items completely if it's a new inode, or if it's a changed inode
6117 * make sure all its previous paths (from the parent snapshot) are all
6118 * unlinked and all other the inode items are ignored.
6119 */
6120 if (result == BTRFS_COMPARE_TREE_NEW ||
6121 result == BTRFS_COMPARE_TREE_CHANGED) {
6122 u32 nlinks;
6123
6124 nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii);
6125 if (nlinks == 0) {
6126 sctx->ignore_cur_inode = true;
6127 if (result == BTRFS_COMPARE_TREE_CHANGED)
6128 ret = btrfs_unlink_all_paths(sctx);
6129 goto out;
6130 }
6131 }
6132
6133 if (result == BTRFS_COMPARE_TREE_NEW) {
6134 sctx->cur_inode_gen = left_gen;
6135 sctx->cur_inode_new = 1;
6136 sctx->cur_inode_deleted = 0;
6137 sctx->cur_inode_size = btrfs_inode_size(
6138 sctx->left_path->nodes[0], left_ii);
6139 sctx->cur_inode_mode = btrfs_inode_mode(
6140 sctx->left_path->nodes[0], left_ii);
6141 sctx->cur_inode_rdev = btrfs_inode_rdev(
6142 sctx->left_path->nodes[0], left_ii);
6143 if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
6144 ret = send_create_inode_if_needed(sctx);
6145 } else if (result == BTRFS_COMPARE_TREE_DELETED) {
6146 sctx->cur_inode_gen = right_gen;
6147 sctx->cur_inode_new = 0;
6148 sctx->cur_inode_deleted = 1;
6149 sctx->cur_inode_size = btrfs_inode_size(
6150 sctx->right_path->nodes[0], right_ii);
6151 sctx->cur_inode_mode = btrfs_inode_mode(
6152 sctx->right_path->nodes[0], right_ii);
6153 } else if (result == BTRFS_COMPARE_TREE_CHANGED) {
6154 /*
6155 * We need to do some special handling in case the inode was
6156 * reported as changed with a changed generation number. This
6157 * means that the original inode was deleted and new inode
6158 * reused the same inum. So we have to treat the old inode as
6159 * deleted and the new one as new.
6160 */
6161 if (sctx->cur_inode_new_gen) {
6162 /*
6163 * First, process the inode as if it was deleted.
6164 */
6165 sctx->cur_inode_gen = right_gen;
6166 sctx->cur_inode_new = 0;
6167 sctx->cur_inode_deleted = 1;
6168 sctx->cur_inode_size = btrfs_inode_size(
6169 sctx->right_path->nodes[0], right_ii);
6170 sctx->cur_inode_mode = btrfs_inode_mode(
6171 sctx->right_path->nodes[0], right_ii);
6172 ret = process_all_refs(sctx,
6173 BTRFS_COMPARE_TREE_DELETED);
6174 if (ret < 0)
6175 goto out;
6176
6177 /*
6178 * Now process the inode as if it was new.
6179 */
6180 sctx->cur_inode_gen = left_gen;
6181 sctx->cur_inode_new = 1;
6182 sctx->cur_inode_deleted = 0;
6183 sctx->cur_inode_size = btrfs_inode_size(
6184 sctx->left_path->nodes[0], left_ii);
6185 sctx->cur_inode_mode = btrfs_inode_mode(
6186 sctx->left_path->nodes[0], left_ii);
6187 sctx->cur_inode_rdev = btrfs_inode_rdev(
6188 sctx->left_path->nodes[0], left_ii);
6189 ret = send_create_inode_if_needed(sctx);
6190 if (ret < 0)
6191 goto out;
6192
6193 ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
6194 if (ret < 0)
6195 goto out;
6196 /*
6197 * Advance send_progress now as we did not get into
6198 * process_recorded_refs_if_needed in the new_gen case.
6199 */
6200 sctx->send_progress = sctx->cur_ino + 1;
6201
6202 /*
6203 * Now process all extents and xattrs of the inode as if
6204 * they were all new.
6205 */
6206 ret = process_all_extents(sctx);
6207 if (ret < 0)
6208 goto out;
6209 ret = process_all_new_xattrs(sctx);
6210 if (ret < 0)
6211 goto out;
6212 } else {
6213 sctx->cur_inode_gen = left_gen;
6214 sctx->cur_inode_new = 0;
6215 sctx->cur_inode_new_gen = 0;
6216 sctx->cur_inode_deleted = 0;
6217 sctx->cur_inode_size = btrfs_inode_size(
6218 sctx->left_path->nodes[0], left_ii);
6219 sctx->cur_inode_mode = btrfs_inode_mode(
6220 sctx->left_path->nodes[0], left_ii);
6221 }
6222 }
6223
6224out:
6225 return ret;
6226}
6227
6228/*
6229 * We have to process new refs before deleted refs, but compare_trees gives us
6230 * the new and deleted refs mixed. To fix this, we record the new/deleted refs
6231 * first and later process them in process_recorded_refs.
6232 * For the cur_inode_new_gen case, we skip recording completely because
6233 * changed_inode did already initiate processing of refs. The reason for this is
6234 * that in this case, compare_tree actually compares the refs of 2 different
6235 * inodes. To fix this, process_all_refs is used in changed_inode to handle all
6236 * refs of the right tree as deleted and all refs of the left tree as new.
6237 */
6238static int changed_ref(struct send_ctx *sctx,
6239 enum btrfs_compare_tree_result result)
6240{
6241 int ret = 0;
6242
6243 if (sctx->cur_ino != sctx->cmp_key->objectid) {
6244 inconsistent_snapshot_error(sctx, result, "reference");
6245 return -EIO;
6246 }
6247
6248 if (!sctx->cur_inode_new_gen &&
6249 sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
6250 if (result == BTRFS_COMPARE_TREE_NEW)
6251 ret = record_new_ref(sctx);
6252 else if (result == BTRFS_COMPARE_TREE_DELETED)
6253 ret = record_deleted_ref(sctx);
6254 else if (result == BTRFS_COMPARE_TREE_CHANGED)
6255 ret = record_changed_ref(sctx);
6256 }
6257
6258 return ret;
6259}
6260
6261/*
6262 * Process new/deleted/changed xattrs. We skip processing in the
6263 * cur_inode_new_gen case because changed_inode did already initiate processing
6264 * of xattrs. The reason is the same as in changed_ref
6265 */
6266static int changed_xattr(struct send_ctx *sctx,
6267 enum btrfs_compare_tree_result result)
6268{
6269 int ret = 0;
6270
6271 if (sctx->cur_ino != sctx->cmp_key->objectid) {
6272 inconsistent_snapshot_error(sctx, result, "xattr");
6273 return -EIO;
6274 }
6275
6276 if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
6277 if (result == BTRFS_COMPARE_TREE_NEW)
6278 ret = process_new_xattr(sctx);
6279 else if (result == BTRFS_COMPARE_TREE_DELETED)
6280 ret = process_deleted_xattr(sctx);
6281 else if (result == BTRFS_COMPARE_TREE_CHANGED)
6282 ret = process_changed_xattr(sctx);
6283 }
6284
6285 return ret;
6286}
6287
6288/*
6289 * Process new/deleted/changed extents. We skip processing in the
6290 * cur_inode_new_gen case because changed_inode did already initiate processing
6291 * of extents. The reason is the same as in changed_ref
6292 */
6293static int changed_extent(struct send_ctx *sctx,
6294 enum btrfs_compare_tree_result result)
6295{
6296 int ret = 0;
6297
6298 /*
6299 * We have found an extent item that changed without the inode item
6300 * having changed. This can happen either after relocation (where the
6301 * disk_bytenr of an extent item is replaced at
6302 * relocation.c:replace_file_extents()) or after deduplication into a
6303 * file in both the parent and send snapshots (where an extent item can
6304 * get modified or replaced with a new one). Note that deduplication
6305 * updates the inode item, but it only changes the iversion (sequence
6306 * field in the inode item) of the inode, so if a file is deduplicated
6307 * the same amount of times in both the parent and send snapshots, its
6308 * iversion becames the same in both snapshots, whence the inode item is
6309 * the same on both snapshots.
6310 */
6311 if (sctx->cur_ino != sctx->cmp_key->objectid)
6312 return 0;
6313
6314 if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
6315 if (result != BTRFS_COMPARE_TREE_DELETED)
6316 ret = process_extent(sctx, sctx->left_path,
6317 sctx->cmp_key);
6318 }
6319
6320 return ret;
6321}
6322
6323static int dir_changed(struct send_ctx *sctx, u64 dir)
6324{
6325 u64 orig_gen, new_gen;
6326 int ret;
6327
6328 ret = get_inode_info(sctx->send_root, dir, NULL, &new_gen, NULL, NULL,
6329 NULL, NULL);
6330 if (ret)
6331 return ret;
6332
6333 ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL,
6334 NULL, NULL, NULL);
6335 if (ret)
6336 return ret;
6337
6338 return (orig_gen != new_gen) ? 1 : 0;
6339}
6340
6341static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path,
6342 struct btrfs_key *key)
6343{
6344 struct btrfs_inode_extref *extref;
6345 struct extent_buffer *leaf;
6346 u64 dirid = 0, last_dirid = 0;
6347 unsigned long ptr;
6348 u32 item_size;
6349 u32 cur_offset = 0;
6350 int ref_name_len;
6351 int ret = 0;
6352
6353 /* Easy case, just check this one dirid */
6354 if (key->type == BTRFS_INODE_REF_KEY) {
6355 dirid = key->offset;
6356
6357 ret = dir_changed(sctx, dirid);
6358 goto out;
6359 }
6360
6361 leaf = path->nodes[0];
6362 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
6363 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
6364 while (cur_offset < item_size) {
6365 extref = (struct btrfs_inode_extref *)(ptr +
6366 cur_offset);
6367 dirid = btrfs_inode_extref_parent(leaf, extref);
6368 ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
6369 cur_offset += ref_name_len + sizeof(*extref);
6370 if (dirid == last_dirid)
6371 continue;
6372 ret = dir_changed(sctx, dirid);
6373 if (ret)
6374 break;
6375 last_dirid = dirid;
6376 }
6377out:
6378 return ret;
6379}
6380
6381/*
6382 * Updates compare related fields in sctx and simply forwards to the actual
6383 * changed_xxx functions.
6384 */
6385static int changed_cb(struct btrfs_path *left_path,
6386 struct btrfs_path *right_path,
6387 struct btrfs_key *key,
6388 enum btrfs_compare_tree_result result,
6389 void *ctx)
6390{
6391 int ret = 0;
6392 struct send_ctx *sctx = ctx;
6393
6394 if (result == BTRFS_COMPARE_TREE_SAME) {
6395 if (key->type == BTRFS_INODE_REF_KEY ||
6396 key->type == BTRFS_INODE_EXTREF_KEY) {
6397 ret = compare_refs(sctx, left_path, key);
6398 if (!ret)
6399 return 0;
6400 if (ret < 0)
6401 return ret;
6402 } else if (key->type == BTRFS_EXTENT_DATA_KEY) {
6403 return maybe_send_hole(sctx, left_path, key);
6404 } else {
6405 return 0;
6406 }
6407 result = BTRFS_COMPARE_TREE_CHANGED;
6408 ret = 0;
6409 }
6410
6411 sctx->left_path = left_path;
6412 sctx->right_path = right_path;
6413 sctx->cmp_key = key;
6414
6415 ret = finish_inode_if_needed(sctx, 0);
6416 if (ret < 0)
6417 goto out;
6418
6419 /* Ignore non-FS objects */
6420 if (key->objectid == BTRFS_FREE_INO_OBJECTID ||
6421 key->objectid == BTRFS_FREE_SPACE_OBJECTID)
6422 goto out;
6423
6424 if (key->type == BTRFS_INODE_ITEM_KEY) {
6425 ret = changed_inode(sctx, result);
6426 } else if (!sctx->ignore_cur_inode) {
6427 if (key->type == BTRFS_INODE_REF_KEY ||
6428 key->type == BTRFS_INODE_EXTREF_KEY)
6429 ret = changed_ref(sctx, result);
6430 else if (key->type == BTRFS_XATTR_ITEM_KEY)
6431 ret = changed_xattr(sctx, result);
6432 else if (key->type == BTRFS_EXTENT_DATA_KEY)
6433 ret = changed_extent(sctx, result);
6434 }
6435
6436out:
6437 return ret;
6438}
6439
6440static int full_send_tree(struct send_ctx *sctx)
6441{
6442 int ret;
6443 struct btrfs_root *send_root = sctx->send_root;
6444 struct btrfs_key key;
6445 struct btrfs_path *path;
6446 struct extent_buffer *eb;
6447 int slot;
6448
6449 path = alloc_path_for_send();
6450 if (!path)
6451 return -ENOMEM;
6452
6453 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
6454 key.type = BTRFS_INODE_ITEM_KEY;
6455 key.offset = 0;
6456
6457 ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
6458 if (ret < 0)
6459 goto out;
6460 if (ret)
6461 goto out_finish;
6462
6463 while (1) {
6464 eb = path->nodes[0];
6465 slot = path->slots[0];
6466 btrfs_item_key_to_cpu(eb, &key, slot);
6467
6468 ret = changed_cb(path, NULL, &key,
6469 BTRFS_COMPARE_TREE_NEW, sctx);
6470 if (ret < 0)
6471 goto out;
6472
6473 ret = btrfs_next_item(send_root, path);
6474 if (ret < 0)
6475 goto out;
6476 if (ret) {
6477 ret = 0;
6478 break;
6479 }
6480 }
6481
6482out_finish:
6483 ret = finish_inode_if_needed(sctx, 1);
6484
6485out:
6486 btrfs_free_path(path);
6487 return ret;
6488}
6489
6490static int send_subvol(struct send_ctx *sctx)
6491{
6492 int ret;
6493
6494 if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) {
6495 ret = send_header(sctx);
6496 if (ret < 0)
6497 goto out;
6498 }
6499
6500 ret = send_subvol_begin(sctx);
6501 if (ret < 0)
6502 goto out;
6503
6504 if (sctx->parent_root) {
6505 ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root,
6506 changed_cb, sctx);
6507 if (ret < 0)
6508 goto out;
6509 ret = finish_inode_if_needed(sctx, 1);
6510 if (ret < 0)
6511 goto out;
6512 } else {
6513 ret = full_send_tree(sctx);
6514 if (ret < 0)
6515 goto out;
6516 }
6517
6518out:
6519 free_recorded_refs(sctx);
6520 return ret;
6521}
6522
6523/*
6524 * If orphan cleanup did remove any orphans from a root, it means the tree
6525 * was modified and therefore the commit root is not the same as the current
6526 * root anymore. This is a problem, because send uses the commit root and
6527 * therefore can see inode items that don't exist in the current root anymore,
6528 * and for example make calls to btrfs_iget, which will do tree lookups based
6529 * on the current root and not on the commit root. Those lookups will fail,
6530 * returning a -ESTALE error, and making send fail with that error. So make
6531 * sure a send does not see any orphans we have just removed, and that it will
6532 * see the same inodes regardless of whether a transaction commit happened
6533 * before it started (meaning that the commit root will be the same as the
6534 * current root) or not.
6535 */
6536static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
6537{
6538 int i;
6539 struct btrfs_trans_handle *trans = NULL;
6540
6541again:
6542 if (sctx->parent_root &&
6543 sctx->parent_root->node != sctx->parent_root->commit_root)
6544 goto commit_trans;
6545
6546 for (i = 0; i < sctx->clone_roots_cnt; i++)
6547 if (sctx->clone_roots[i].root->node !=
6548 sctx->clone_roots[i].root->commit_root)
6549 goto commit_trans;
6550
6551 if (trans)
6552 return btrfs_end_transaction(trans);
6553
6554 return 0;
6555
6556commit_trans:
6557 /* Use any root, all fs roots will get their commit roots updated. */
6558 if (!trans) {
6559 trans = btrfs_join_transaction(sctx->send_root);
6560 if (IS_ERR(trans))
6561 return PTR_ERR(trans);
6562 goto again;
6563 }
6564
6565 return btrfs_commit_transaction(trans);
6566}
6567
6568/*
6569 * Make sure any existing dellaloc is flushed for any root used by a send
6570 * operation so that we do not miss any data and we do not race with writeback
6571 * finishing and changing a tree while send is using the tree. This could
6572 * happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and
6573 * a send operation then uses the subvolume.
6574 * After flushing delalloc ensure_commit_roots_uptodate() must be called.
6575 */
6576static int flush_delalloc_roots(struct send_ctx *sctx)
6577{
6578 struct btrfs_root *root = sctx->parent_root;
6579 int ret;
6580 int i;
6581
6582 if (root) {
6583 ret = btrfs_start_delalloc_snapshot(root);
6584 if (ret)
6585 return ret;
6586 btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
6587 }
6588
6589 for (i = 0; i < sctx->clone_roots_cnt; i++) {
6590 root = sctx->clone_roots[i].root;
6591 ret = btrfs_start_delalloc_snapshot(root);
6592 if (ret)
6593 return ret;
6594 btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
6595 }
6596
6597 return 0;
6598}
6599
6600static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
6601{
6602 spin_lock(&root->root_item_lock);
6603 root->send_in_progress--;
6604 /*
6605 * Not much left to do, we don't know why it's unbalanced and
6606 * can't blindly reset it to 0.
6607 */
6608 if (root->send_in_progress < 0)
6609 btrfs_err(root->fs_info,
6610 "send_in_progress unbalanced %d root %llu",
6611 root->send_in_progress, root->root_key.objectid);
6612 spin_unlock(&root->root_item_lock);
6613}
6614
6615long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
6616{
6617 int ret = 0;
6618 struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root;
6619 struct btrfs_fs_info *fs_info = send_root->fs_info;
6620 struct btrfs_root *clone_root;
6621 struct btrfs_key key;
6622 struct send_ctx *sctx = NULL;
6623 u32 i;
6624 u64 *clone_sources_tmp = NULL;
6625 int clone_sources_to_rollback = 0;
6626 unsigned alloc_size;
6627 int sort_clone_roots = 0;
6628 int index;
6629
6630 if (!capable(CAP_SYS_ADMIN))
6631 return -EPERM;
6632
6633 /*
6634 * The subvolume must remain read-only during send, protect against
6635 * making it RW. This also protects against deletion.
6636 */
6637 spin_lock(&send_root->root_item_lock);
6638 send_root->send_in_progress++;
6639 spin_unlock(&send_root->root_item_lock);
6640
6641 /*
6642 * Userspace tools do the checks and warn the user if it's
6643 * not RO.
6644 */
6645 if (!btrfs_root_readonly(send_root)) {
6646 ret = -EPERM;
6647 goto out;
6648 }
6649
6650 /*
6651 * Check that we don't overflow at later allocations, we request
6652 * clone_sources_count + 1 items, and compare to unsigned long inside
6653 * access_ok.
6654 */
6655 if (arg->clone_sources_count >
6656 ULONG_MAX / sizeof(struct clone_root) - 1) {
6657 ret = -EINVAL;
6658 goto out;
6659 }
6660
6661 if (!access_ok(VERIFY_READ, arg->clone_sources,
6662 sizeof(*arg->clone_sources) *
6663 arg->clone_sources_count)) {
6664 ret = -EFAULT;
6665 goto out;
6666 }
6667
6668 if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
6669 ret = -EINVAL;
6670 goto out;
6671 }
6672
6673 sctx = kzalloc(sizeof(struct send_ctx), GFP_KERNEL);
6674 if (!sctx) {
6675 ret = -ENOMEM;
6676 goto out;
6677 }
6678
6679 INIT_LIST_HEAD(&sctx->new_refs);
6680 INIT_LIST_HEAD(&sctx->deleted_refs);
6681 INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL);
6682 INIT_LIST_HEAD(&sctx->name_cache_list);
6683
6684 sctx->flags = arg->flags;
6685
6686 sctx->send_filp = fget(arg->send_fd);
6687 if (!sctx->send_filp) {
6688 ret = -EBADF;
6689 goto out;
6690 }
6691
6692 sctx->send_root = send_root;
6693 /*
6694 * Unlikely but possible, if the subvolume is marked for deletion but
6695 * is slow to remove the directory entry, send can still be started
6696 */
6697 if (btrfs_root_dead(sctx->send_root)) {
6698 ret = -EPERM;
6699 goto out;
6700 }
6701
6702 sctx->clone_roots_cnt = arg->clone_sources_count;
6703
6704 sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
6705 sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL);
6706 if (!sctx->send_buf) {
6707 ret = -ENOMEM;
6708 goto out;
6709 }
6710
6711 sctx->read_buf = kvmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL);
6712 if (!sctx->read_buf) {
6713 ret = -ENOMEM;
6714 goto out;
6715 }
6716
6717 sctx->pending_dir_moves = RB_ROOT;
6718 sctx->waiting_dir_moves = RB_ROOT;
6719 sctx->orphan_dirs = RB_ROOT;
6720
6721 alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1);
6722
6723 sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL);
6724 if (!sctx->clone_roots) {
6725 ret = -ENOMEM;
6726 goto out;
6727 }
6728
6729 alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources);
6730
6731 if (arg->clone_sources_count) {
6732 clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL);
6733 if (!clone_sources_tmp) {
6734 ret = -ENOMEM;
6735 goto out;
6736 }
6737
6738 ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
6739 alloc_size);
6740 if (ret) {
6741 ret = -EFAULT;
6742 goto out;
6743 }
6744
6745 for (i = 0; i < arg->clone_sources_count; i++) {
6746 key.objectid = clone_sources_tmp[i];
6747 key.type = BTRFS_ROOT_ITEM_KEY;
6748 key.offset = (u64)-1;
6749
6750 index = srcu_read_lock(&fs_info->subvol_srcu);
6751
6752 clone_root = btrfs_read_fs_root_no_name(fs_info, &key);
6753 if (IS_ERR(clone_root)) {
6754 srcu_read_unlock(&fs_info->subvol_srcu, index);
6755 ret = PTR_ERR(clone_root);
6756 goto out;
6757 }
6758 spin_lock(&clone_root->root_item_lock);
6759 if (!btrfs_root_readonly(clone_root) ||
6760 btrfs_root_dead(clone_root)) {
6761 spin_unlock(&clone_root->root_item_lock);
6762 srcu_read_unlock(&fs_info->subvol_srcu, index);
6763 ret = -EPERM;
6764 goto out;
6765 }
6766 clone_root->send_in_progress++;
6767 spin_unlock(&clone_root->root_item_lock);
6768 srcu_read_unlock(&fs_info->subvol_srcu, index);
6769
6770 sctx->clone_roots[i].root = clone_root;
6771 clone_sources_to_rollback = i + 1;
6772 }
6773 kvfree(clone_sources_tmp);
6774 clone_sources_tmp = NULL;
6775 }
6776
6777 if (arg->parent_root) {
6778 key.objectid = arg->parent_root;
6779 key.type = BTRFS_ROOT_ITEM_KEY;
6780 key.offset = (u64)-1;
6781
6782 index = srcu_read_lock(&fs_info->subvol_srcu);
6783
6784 sctx->parent_root = btrfs_read_fs_root_no_name(fs_info, &key);
6785 if (IS_ERR(sctx->parent_root)) {
6786 srcu_read_unlock(&fs_info->subvol_srcu, index);
6787 ret = PTR_ERR(sctx->parent_root);
6788 goto out;
6789 }
6790
6791 spin_lock(&sctx->parent_root->root_item_lock);
6792 sctx->parent_root->send_in_progress++;
6793 if (!btrfs_root_readonly(sctx->parent_root) ||
6794 btrfs_root_dead(sctx->parent_root)) {
6795 spin_unlock(&sctx->parent_root->root_item_lock);
6796 srcu_read_unlock(&fs_info->subvol_srcu, index);
6797 ret = -EPERM;
6798 goto out;
6799 }
6800 spin_unlock(&sctx->parent_root->root_item_lock);
6801
6802 srcu_read_unlock(&fs_info->subvol_srcu, index);
6803 }
6804
6805 /*
6806 * Clones from send_root are allowed, but only if the clone source
6807 * is behind the current send position. This is checked while searching
6808 * for possible clone sources.
6809 */
6810 sctx->clone_roots[sctx->clone_roots_cnt++].root = sctx->send_root;
6811
6812 /* We do a bsearch later */
6813 sort(sctx->clone_roots, sctx->clone_roots_cnt,
6814 sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
6815 NULL);
6816 sort_clone_roots = 1;
6817
6818 ret = flush_delalloc_roots(sctx);
6819 if (ret)
6820 goto out;
6821
6822 ret = ensure_commit_roots_uptodate(sctx);
6823 if (ret)
6824 goto out;
6825
6826 current->journal_info = BTRFS_SEND_TRANS_STUB;
6827 ret = send_subvol(sctx);
6828 current->journal_info = NULL;
6829 if (ret < 0)
6830 goto out;
6831
6832 if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) {
6833 ret = begin_cmd(sctx, BTRFS_SEND_C_END);
6834 if (ret < 0)
6835 goto out;
6836 ret = send_cmd(sctx);
6837 if (ret < 0)
6838 goto out;
6839 }
6840
6841out:
6842 WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves));
6843 while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) {
6844 struct rb_node *n;
6845 struct pending_dir_move *pm;
6846
6847 n = rb_first(&sctx->pending_dir_moves);
6848 pm = rb_entry(n, struct pending_dir_move, node);
6849 while (!list_empty(&pm->list)) {
6850 struct pending_dir_move *pm2;
6851
6852 pm2 = list_first_entry(&pm->list,
6853 struct pending_dir_move, list);
6854 free_pending_move(sctx, pm2);
6855 }
6856 free_pending_move(sctx, pm);
6857 }
6858
6859 WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves));
6860 while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) {
6861 struct rb_node *n;
6862 struct waiting_dir_move *dm;
6863
6864 n = rb_first(&sctx->waiting_dir_moves);
6865 dm = rb_entry(n, struct waiting_dir_move, node);
6866 rb_erase(&dm->node, &sctx->waiting_dir_moves);
6867 kfree(dm);
6868 }
6869
6870 WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
6871 while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
6872 struct rb_node *n;
6873 struct orphan_dir_info *odi;
6874
6875 n = rb_first(&sctx->orphan_dirs);
6876 odi = rb_entry(n, struct orphan_dir_info, node);
6877 free_orphan_dir_info(sctx, odi);
6878 }
6879
6880 if (sort_clone_roots) {
6881 for (i = 0; i < sctx->clone_roots_cnt; i++)
6882 btrfs_root_dec_send_in_progress(
6883 sctx->clone_roots[i].root);
6884 } else {
6885 for (i = 0; sctx && i < clone_sources_to_rollback; i++)
6886 btrfs_root_dec_send_in_progress(
6887 sctx->clone_roots[i].root);
6888
6889 btrfs_root_dec_send_in_progress(send_root);
6890 }
6891 if (sctx && !IS_ERR_OR_NULL(sctx->parent_root))
6892 btrfs_root_dec_send_in_progress(sctx->parent_root);
6893
6894 kvfree(clone_sources_tmp);
6895
6896 if (sctx) {
6897 if (sctx->send_filp)
6898 fput(sctx->send_filp);
6899
6900 kvfree(sctx->clone_roots);
6901 kvfree(sctx->send_buf);
6902 kvfree(sctx->read_buf);
6903
6904 name_cache_free(sctx);
6905
6906 kfree(sctx);
6907 }
6908
6909 return ret;
6910}